path: root/drivers/md
author     Takashi Iwai <tiwai@suse.de>    2012-04-07 06:28:00 -0400
committer  Takashi Iwai <tiwai@suse.de>    2012-04-07 06:28:00 -0400
commit     c38f62b08d800104fa9b0e9d6e9141459986c06d (patch)
tree       1d04d768c8aa0c1a544d1f068317c7beb0101be2 /drivers/md
parent     250f32747e62cb415b85083e247184188f24e566 (diff)
parent     8abe05c6eb358967f16bce8a02c88d57c82cfbd6 (diff)
Merge tag 'asoc-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: fixes for 3.4

A bunch of driver-specific fixes and one generic fix for the new support for platform DAPM contexts: we were picking the wrong default for the idle_bias_off setting, which meant we weren't actually achieving any useful runtime PM on platform devices.
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 28
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bitmap.c | 194
-rw-r--r--  drivers/md/bitmap.h | 22
-rw-r--r--  drivers/md/dm-bufio.c | 109
-rw-r--r--  drivers/md/dm-bufio.h | 8
-rw-r--r--  drivers/md/dm-crypt.c | 54
-rw-r--r--  drivers/md/dm-delay.c | 9
-rw-r--r--  drivers/md/dm-exception-store.c | 2
-rw-r--r--  drivers/md/dm-flakey.c | 5
-rw-r--r--  drivers/md/dm-io.c | 23
-rw-r--r--  drivers/md/dm-ioctl.c | 7
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log.c | 3
-rw-r--r--  drivers/md/dm-mpath.c | 52
-rw-r--r--  drivers/md/dm-queue-length.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 86
-rw-r--r--  drivers/md/dm-raid1.c | 12
-rw-r--r--  drivers/md/dm-round-robin.c | 3
-rw-r--r--  drivers/md/dm-service-time.c | 5
-rw-r--r--  drivers/md/dm-stripe.c | 3
-rw-r--r--  drivers/md/dm-table.c | 9
-rw-r--r--  drivers/md/dm-thin-metadata.c | 30
-rw-r--r--  drivers/md/dm-thin-metadata.h | 13
-rw-r--r--  drivers/md/dm-thin.c | 680
-rw-r--r--  drivers/md/dm-verity.c | 913
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/md/faulty.c | 2
-rw-r--r--  drivers/md/linear.c | 32
-rw-r--r--  drivers/md/md.c | 140
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/multipath.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 7
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 202
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 27
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 3
-rw-r--r--  drivers/md/raid0.c | 164
-rw-r--r--  drivers/md/raid0.h | 11
-rw-r--r--  drivers/md/raid1.c | 100
-rw-r--r--  drivers/md/raid10.c | 225
-rw-r--r--  drivers/md/raid5.c | 25
41 files changed, 2424 insertions, 807 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
277 needed for live data migration tools such as 'pvmove'. 277 needed for live data migration tools such as 'pvmove'.
278 278
279config DM_RAID 279config DM_RAID
280 tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" 280 tristate "RAID 1/4/5/6 target"
281 depends on BLK_DEV_DM && EXPERIMENTAL 281 depends on BLK_DEV_DM
282 select MD_RAID1 282 select MD_RAID1
283 select MD_RAID456 283 select MD_RAID456
284 select BLK_DEV_MD 284 select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
359 If unsure, say N. 359 If unsure, say N.
360 360
361config DM_UEVENT 361config DM_UEVENT
362 bool "DM uevents (EXPERIMENTAL)" 362 bool "DM uevents"
363 depends on BLK_DEV_DM && EXPERIMENTAL 363 depends on BLK_DEV_DM
364 ---help--- 364 ---help---
365 Generate udev events for DM events. 365 Generate udev events for DM events.
366 366
@@ -370,4 +370,24 @@ config DM_FLAKEY
370 ---help--- 370 ---help---
371 A target that intermittently fails I/O for debugging purposes. 371 A target that intermittently fails I/O for debugging purposes.
372 372
373config DM_VERITY
374 tristate "Verity target support (EXPERIMENTAL)"
375 depends on BLK_DEV_DM && EXPERIMENTAL
376 select CRYPTO
377 select CRYPTO_HASH
378 select DM_BUFIO
379 ---help---
380 This device-mapper target creates a read-only device that
381 transparently validates the data on one underlying device against
382 a pre-generated tree of cryptographic checksums stored on a second
383 device.
384
385 You'll need to activate the digests you're going to use in the
386 cryptoapi configuration.
387
388 To compile this code as a module, choose M here: the module will
389 be called dm-verity.
390
391 If unsure, say N.
392
373endif # MD 393endif # MD
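The DM_VERITY help text above describes the model: every data block on the read-only device is checked against a digest that was generated ahead of time and stored, as part of a hash tree, on a second device. A minimal conceptual sketch of that per-block check in plain C; compute_digest() is a hypothetical stand-in for whichever cryptoapi hash the user enables, and this is not the dm-verity implementation itself:

    /*
     * Conceptual sketch only, not dm-verity code: verify one data block
     * against the digest that was pre-generated for it on the hash device.
     */
    #include <string.h>

    #define DATA_BLOCK_SIZE 4096
    #define DIGEST_SIZE     32      /* e.g. a 256-bit hash */

    /* hypothetical helper standing in for the configured cryptoapi hash */
    void compute_digest(const unsigned char *data, unsigned long len,
                        unsigned char digest[DIGEST_SIZE]);

    /* 0 if the block matches its stored digest, -1 if it has been altered */
    int verify_block(const unsigned char data[DATA_BLOCK_SIZE],
                     const unsigned char stored[DIGEST_SIZE])
    {
            unsigned char actual[DIGEST_SIZE];

            compute_digest(data, DATA_BLOCK_SIZE, actual);
            return memcmp(actual, stored, DIGEST_SIZE) ? -1 : 0;
    }

Interior hash blocks are validated the same way on the path up to a trusted root digest supplied when the target is set up, so a single mismatch anywhere is detectable.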
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
42obj-$(CONFIG_DM_ZERO) += dm-zero.o 42obj-$(CONFIG_DM_ZERO) += dm-zero.o
43obj-$(CONFIG_DM_RAID) += dm-raid.o 43obj-$(CONFIG_DM_RAID) += dm-raid.o
44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o 44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
45obj-$(CONFIG_DM_VERITY) += dm-verity.o
45 46
46ifeq ($(CONFIG_DM_UEVENT),y) 47ifeq ($(CONFIG_DM_UEVENT),y)
47dm-mod-objs += dm-uevent.o 48dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cdf36b1e9aa6..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/seq_file.h>
29#include "md.h" 30#include "md.h"
30#include "bitmap.h" 31#include "bitmap.h"
31 32
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
35} 36}
36 37
37/* 38/*
38 * just a placeholder - calls kmalloc for bitmap pages
39 */
40static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
41{
42 unsigned char *page;
43
44 page = kzalloc(PAGE_SIZE, GFP_NOIO);
45 if (!page)
46 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
47 else
48 pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
49 bmname(bitmap), page);
50 return page;
51}
52
53/*
54 * for now just a placeholder -- just calls kfree for bitmap pages
55 */
56static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
57{
58 pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
59 kfree(page);
60}
61
62/*
63 * check a page and, if necessary, allocate it (or hijack it if the alloc fails) 39 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
64 * 40 *
65 * 1) check to see if this page is allocated, if it's not then try to alloc 41 * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
96 /* this page has not been allocated yet */ 72 /* this page has not been allocated yet */
97 73
98 spin_unlock_irq(&bitmap->lock); 74 spin_unlock_irq(&bitmap->lock);
99 mappage = bitmap_alloc_page(bitmap); 75 mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
100 spin_lock_irq(&bitmap->lock); 76 spin_lock_irq(&bitmap->lock);
101 77
102 if (mappage == NULL) { 78 if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
109 } else if (bitmap->bp[page].map || 85 } else if (bitmap->bp[page].map ||
110 bitmap->bp[page].hijacked) { 86 bitmap->bp[page].hijacked) {
111 /* somebody beat us to getting the page */ 87 /* somebody beat us to getting the page */
112 bitmap_free_page(bitmap, mappage); 88 kfree(mappage);
113 return 0; 89 return 0;
114 } else { 90 } else {
115 91
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
141 ptr = bitmap->bp[page].map; 117 ptr = bitmap->bp[page].map;
142 bitmap->bp[page].map = NULL; 118 bitmap->bp[page].map = NULL;
143 bitmap->missing_pages++; 119 bitmap->missing_pages++;
144 bitmap_free_page(bitmap, ptr); 120 kfree(ptr);
145 } 121 }
146} 122}
147 123
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
171 did_alloc = 1; 147 did_alloc = 1;
172 } 148 }
173 149
174 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
175 if (! test_bit(In_sync, &rdev->flags) 151 if (! test_bit(In_sync, &rdev->flags)
176 || test_bit(Faulty, &rdev->flags)) 152 || test_bit(Faulty, &rdev->flags))
177 continue; 153 continue;
@@ -445,19 +421,14 @@ out:
445void bitmap_update_sb(struct bitmap *bitmap) 421void bitmap_update_sb(struct bitmap *bitmap)
446{ 422{
447 bitmap_super_t *sb; 423 bitmap_super_t *sb;
448 unsigned long flags;
449 424
450 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 425 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
451 return; 426 return;
452 if (bitmap->mddev->bitmap_info.external) 427 if (bitmap->mddev->bitmap_info.external)
453 return; 428 return;
454 spin_lock_irqsave(&bitmap->lock, flags); 429 if (!bitmap->sb_page) /* no superblock */
455 if (!bitmap->sb_page) { /* no superblock */
456 spin_unlock_irqrestore(&bitmap->lock, flags);
457 return; 430 return;
458 } 431 sb = kmap_atomic(bitmap->sb_page);
459 spin_unlock_irqrestore(&bitmap->lock, flags);
460 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
461 sb->events = cpu_to_le64(bitmap->mddev->events); 432 sb->events = cpu_to_le64(bitmap->mddev->events);
462 if (bitmap->mddev->events < bitmap->events_cleared) 433 if (bitmap->mddev->events < bitmap->events_cleared)
463 /* rocking back to read-only */ 434 /* rocking back to read-only */
@@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
467 /* Just in case these have been changed via sysfs: */ 438 /* Just in case these have been changed via sysfs: */
468 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); 439 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
469 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); 440 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
470 kunmap_atomic(sb, KM_USER0); 441 kunmap_atomic(sb);
471 write_page(bitmap, bitmap->sb_page, 1); 442 write_page(bitmap, bitmap->sb_page, 1);
472} 443}
473 444
@@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
478 449
479 if (!bitmap || !bitmap->sb_page) 450 if (!bitmap || !bitmap->sb_page)
480 return; 451 return;
481 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 452 sb = kmap_atomic(bitmap->sb_page);
482 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 453 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
483 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 454 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
484 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 455 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
497 printk(KERN_DEBUG " sync size: %llu KB\n", 468 printk(KERN_DEBUG " sync size: %llu KB\n",
498 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 469 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
499 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); 470 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
500 kunmap_atomic(sb, KM_USER0); 471 kunmap_atomic(sb);
501} 472}
502 473
503/* 474/*
@@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
525 } 496 }
526 bitmap->sb_page->index = 0; 497 bitmap->sb_page->index = 0;
527 498
528 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 499 sb = kmap_atomic(bitmap->sb_page);
529 500
530 sb->magic = cpu_to_le32(BITMAP_MAGIC); 501 sb->magic = cpu_to_le32(BITMAP_MAGIC);
531 sb->version = cpu_to_le32(BITMAP_MAJOR_HI); 502 sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
533 chunksize = bitmap->mddev->bitmap_info.chunksize; 504 chunksize = bitmap->mddev->bitmap_info.chunksize;
534 BUG_ON(!chunksize); 505 BUG_ON(!chunksize);
535 if (!is_power_of_2(chunksize)) { 506 if (!is_power_of_2(chunksize)) {
536 kunmap_atomic(sb, KM_USER0); 507 kunmap_atomic(sb);
537 printk(KERN_ERR "bitmap chunksize not a power of 2\n"); 508 printk(KERN_ERR "bitmap chunksize not a power of 2\n");
538 return -EINVAL; 509 return -EINVAL;
539 } 510 }
@@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
571 bitmap->flags |= BITMAP_HOSTENDIAN; 542 bitmap->flags |= BITMAP_HOSTENDIAN;
572 sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); 543 sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
573 544
574 kunmap_atomic(sb, KM_USER0); 545 kunmap_atomic(sb);
575 546
576 return 0; 547 return 0;
577} 548}
@@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
603 return err; 574 return err;
604 } 575 }
605 576
606 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 577 sb = kmap_atomic(bitmap->sb_page);
607 578
608 chunksize = le32_to_cpu(sb->chunksize); 579 chunksize = le32_to_cpu(sb->chunksize);
609 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 580 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
632 /* keep the array size field of the bitmap superblock up to date */ 603 /* keep the array size field of the bitmap superblock up to date */
633 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); 604 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
634 605
635 if (!bitmap->mddev->persistent) 606 if (bitmap->mddev->persistent) {
636 goto success; 607 /*
637 608 * We have a persistent array superblock, so compare the
638 /* 609 * bitmap's UUID and event counter to the mddev's
639 * if we have a persistent array superblock, compare the 610 */
640 * bitmap's UUID and event counter to the mddev's 611 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
641 */ 612 printk(KERN_INFO
642 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { 613 "%s: bitmap superblock UUID mismatch\n",
643 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", 614 bmname(bitmap));
644 bmname(bitmap)); 615 goto out;
645 goto out; 616 }
646 } 617 events = le64_to_cpu(sb->events);
647 events = le64_to_cpu(sb->events); 618 if (events < bitmap->mddev->events) {
648 if (events < bitmap->mddev->events) { 619 printk(KERN_INFO
649 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " 620 "%s: bitmap file is out of date (%llu < %llu) "
650 "-- forcing full recovery\n", bmname(bitmap), events, 621 "-- forcing full recovery\n",
651 (unsigned long long) bitmap->mddev->events); 622 bmname(bitmap), events,
652 sb->state |= cpu_to_le32(BITMAP_STALE); 623 (unsigned long long) bitmap->mddev->events);
624 sb->state |= cpu_to_le32(BITMAP_STALE);
625 }
653 } 626 }
654success: 627
655 /* assign fields using values from superblock */ 628 /* assign fields using values from superblock */
656 bitmap->mddev->bitmap_info.chunksize = chunksize; 629 bitmap->mddev->bitmap_info.chunksize = chunksize;
657 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; 630 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -664,7 +637,7 @@ success:
664 bitmap->events_cleared = bitmap->mddev->events; 637 bitmap->events_cleared = bitmap->mddev->events;
665 err = 0; 638 err = 0;
666out: 639out:
667 kunmap_atomic(sb, KM_USER0); 640 kunmap_atomic(sb);
668 if (err) 641 if (err)
669 bitmap_print_sb(bitmap); 642 bitmap_print_sb(bitmap);
670 return err; 643 return err;
@@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
680 enum bitmap_mask_op op) 653 enum bitmap_mask_op op)
681{ 654{
682 bitmap_super_t *sb; 655 bitmap_super_t *sb;
683 unsigned long flags;
684 int old; 656 int old;
685 657
686 spin_lock_irqsave(&bitmap->lock, flags); 658 if (!bitmap->sb_page) /* can't set the state */
687 if (!bitmap->sb_page) { /* can't set the state */
688 spin_unlock_irqrestore(&bitmap->lock, flags);
689 return 0; 659 return 0;
690 } 660 sb = kmap_atomic(bitmap->sb_page);
691 spin_unlock_irqrestore(&bitmap->lock, flags);
692 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
693 old = le32_to_cpu(sb->state) & bits; 661 old = le32_to_cpu(sb->state) & bits;
694 switch (op) { 662 switch (op) {
695 case MASK_SET: 663 case MASK_SET:
@@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
703 default: 671 default:
704 BUG(); 672 BUG();
705 } 673 }
706 kunmap_atomic(sb, KM_USER0); 674 kunmap_atomic(sb);
707 return old; 675 return old;
708} 676}
709 677
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
870 unsigned long bit; 838 unsigned long bit;
871 struct page *page; 839 struct page *page;
872 void *kaddr; 840 void *kaddr;
873 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 841 unsigned long chunk = block >> bitmap->chunkshift;
874 842
875 if (!bitmap->filemap) 843 if (!bitmap->filemap)
876 return; 844 return;
@@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
881 bit = file_page_offset(bitmap, chunk); 849 bit = file_page_offset(bitmap, chunk);
882 850
883 /* set the bit */ 851 /* set the bit */
884 kaddr = kmap_atomic(page, KM_USER0); 852 kaddr = kmap_atomic(page);
885 if (bitmap->flags & BITMAP_HOSTENDIAN) 853 if (bitmap->flags & BITMAP_HOSTENDIAN)
886 set_bit(bit, kaddr); 854 set_bit(bit, kaddr);
887 else 855 else
888 __set_bit_le(bit, kaddr); 856 __set_bit_le(bit, kaddr);
889 kunmap_atomic(kaddr, KM_USER0); 857 kunmap_atomic(kaddr);
890 pr_debug("set file bit %lu page %lu\n", bit, page->index); 858 pr_debug("set file bit %lu page %lu\n", bit, page->index);
891 /* record page number so it gets flushed to disk when unplug occurs */ 859 /* record page number so it gets flushed to disk when unplug occurs */
892 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 860 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
@@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1050 * if bitmap is out of date, dirty the 1018 * if bitmap is out of date, dirty the
1051 * whole page and write it out 1019 * whole page and write it out
1052 */ 1020 */
1053 paddr = kmap_atomic(page, KM_USER0); 1021 paddr = kmap_atomic(page);
1054 memset(paddr + offset, 0xff, 1022 memset(paddr + offset, 0xff,
1055 PAGE_SIZE - offset); 1023 PAGE_SIZE - offset);
1056 kunmap_atomic(paddr, KM_USER0); 1024 kunmap_atomic(paddr);
1057 write_page(bitmap, page, 1); 1025 write_page(bitmap, page, 1);
1058 1026
1059 ret = -EIO; 1027 ret = -EIO;
@@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1061 goto err; 1029 goto err;
1062 } 1030 }
1063 } 1031 }
1064 paddr = kmap_atomic(page, KM_USER0); 1032 paddr = kmap_atomic(page);
1065 if (bitmap->flags & BITMAP_HOSTENDIAN) 1033 if (bitmap->flags & BITMAP_HOSTENDIAN)
1066 b = test_bit(bit, paddr); 1034 b = test_bit(bit, paddr);
1067 else 1035 else
1068 b = test_bit_le(bit, paddr); 1036 b = test_bit_le(bit, paddr);
1069 kunmap_atomic(paddr, KM_USER0); 1037 kunmap_atomic(paddr);
1070 if (b) { 1038 if (b) {
1071 /* if the disk bit is set, set the memory bit */ 1039 /* if the disk bit is set, set the memory bit */
1072 int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) 1040 int needed = ((sector_t)(i+1) << bitmap->chunkshift
1073 >= start); 1041 >= start);
1074 bitmap_set_memory_bits(bitmap, 1042 bitmap_set_memory_bits(bitmap,
1075 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1043 (sector_t)i << bitmap->chunkshift,
1076 needed); 1044 needed);
1077 bit_cnt++; 1045 bit_cnt++;
1078 } 1046 }
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
1116 1084
1117static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1085static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1118{ 1086{
1119 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1087 sector_t chunk = offset >> bitmap->chunkshift;
1120 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1088 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1121 bitmap->bp[page].count += inc; 1089 bitmap->bp[page].count += inc;
1122 bitmap_checkfree(bitmap, page); 1090 bitmap_checkfree(bitmap, page);
@@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev)
1209 mddev->bitmap_info.external == 0) { 1177 mddev->bitmap_info.external == 0) {
1210 bitmap_super_t *sb; 1178 bitmap_super_t *sb;
1211 bitmap->need_sync = 0; 1179 bitmap->need_sync = 0;
1212 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 1180 sb = kmap_atomic(bitmap->sb_page);
1213 sb->events_cleared = 1181 sb->events_cleared =
1214 cpu_to_le64(bitmap->events_cleared); 1182 cpu_to_le64(bitmap->events_cleared);
1215 kunmap_atomic(sb, KM_USER0); 1183 kunmap_atomic(sb);
1216 write_page(bitmap, bitmap->sb_page, 1); 1184 write_page(bitmap, bitmap->sb_page, 1);
1217 } 1185 }
1218 spin_lock_irqsave(&bitmap->lock, flags); 1186 spin_lock_irqsave(&bitmap->lock, flags);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1222 bitmap->allclean = 0; 1190 bitmap->allclean = 0;
1223 } 1191 }
1224 bmc = bitmap_get_counter(bitmap, 1192 bmc = bitmap_get_counter(bitmap,
1225 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1193 (sector_t)j << bitmap->chunkshift,
1226 &blocks, 0); 1194 &blocks, 0);
1227 if (!bmc) 1195 if (!bmc)
1228 j |= PAGE_COUNTER_MASK; 1196 j |= PAGE_COUNTER_MASK;
@@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev)
1231 /* we can clear the bit */ 1199 /* we can clear the bit */
1232 *bmc = 0; 1200 *bmc = 0;
1233 bitmap_count_page(bitmap, 1201 bitmap_count_page(bitmap,
1234 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1202 (sector_t)j << bitmap->chunkshift,
1235 -1); 1203 -1);
1236 1204
1237 /* clear the bit */ 1205 /* clear the bit */
1238 paddr = kmap_atomic(page, KM_USER0); 1206 paddr = kmap_atomic(page);
1239 if (bitmap->flags & BITMAP_HOSTENDIAN) 1207 if (bitmap->flags & BITMAP_HOSTENDIAN)
1240 clear_bit(file_page_offset(bitmap, j), 1208 clear_bit(file_page_offset(bitmap, j),
1241 paddr); 1209 paddr);
@@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1244 file_page_offset(bitmap, 1212 file_page_offset(bitmap,
1245 j), 1213 j),
1246 paddr); 1214 paddr);
1247 kunmap_atomic(paddr, KM_USER0); 1215 kunmap_atomic(paddr);
1248 } else if (*bmc <= 2) { 1216 } else if (*bmc <= 2) {
1249 *bmc = 1; /* maybe clear the bit next time */ 1217 *bmc = 1; /* maybe clear the bit next time */
1250 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1218 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
1285 * The lock must have been taken with interrupts enabled. 1253 * The lock must have been taken with interrupts enabled.
1286 * If !create, we don't release the lock. 1254 * If !create, we don't release the lock.
1287 */ 1255 */
1288 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1256 sector_t chunk = offset >> bitmap->chunkshift;
1289 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1257 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1290 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1258 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1291 sector_t csize; 1259 sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
1295 1263
1296 if (bitmap->bp[page].hijacked || 1264 if (bitmap->bp[page].hijacked ||
1297 bitmap->bp[page].map == NULL) 1265 bitmap->bp[page].map == NULL)
1298 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + 1266 csize = ((sector_t)1) << (bitmap->chunkshift +
1299 PAGE_COUNTER_SHIFT - 1); 1267 PAGE_COUNTER_SHIFT - 1);
1300 else 1268 else
1301 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1269 csize = ((sector_t)1) << bitmap->chunkshift;
1302 *blocks = csize - (offset & (csize - 1)); 1270 *blocks = csize - (offset & (csize - 1));
1303 1271
1304 if (err < 0) 1272 if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1424 set_page_attr(bitmap, 1392 set_page_attr(bitmap,
1425 filemap_get_page( 1393 filemap_get_page(
1426 bitmap, 1394 bitmap,
1427 offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1395 offset >> bitmap->chunkshift),
1428 BITMAP_PAGE_PENDING); 1396 BITMAP_PAGE_PENDING);
1429 bitmap->allclean = 0; 1397 bitmap->allclean = 0;
1430 } 1398 }
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1512 else { 1480 else {
1513 if (*bmc <= 2) { 1481 if (*bmc <= 2) {
1514 set_page_attr(bitmap, 1482 set_page_attr(bitmap,
1515 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1483 filemap_get_page(bitmap, offset >> bitmap->chunkshift),
1516 BITMAP_PAGE_PENDING); 1484 BITMAP_PAGE_PENDING);
1517 bitmap->allclean = 0; 1485 bitmap->allclean = 0;
1518 } 1486 }
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1559 1527
1560 bitmap->mddev->curr_resync_completed = sector; 1528 bitmap->mddev->curr_resync_completed = sector;
1561 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1529 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1562 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1530 sector &= ~((1ULL << bitmap->chunkshift) - 1);
1563 s = 0; 1531 s = 0;
1564 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1532 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1565 bitmap_end_sync(bitmap, s, &blocks, 0); 1533 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
1589 struct page *page; 1557 struct page *page;
1590 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1558 *bmc = 2 | (needed ? NEEDED_MASK : 0);
1591 bitmap_count_page(bitmap, offset, 1); 1559 bitmap_count_page(bitmap, offset, 1);
1592 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1560 page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
1593 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1561 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1594 bitmap->allclean = 0; 1562 bitmap->allclean = 0;
1595 } 1563 }
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1602 unsigned long chunk; 1570 unsigned long chunk;
1603 1571
1604 for (chunk = s; chunk <= e; chunk++) { 1572 for (chunk = s; chunk <= e; chunk++) {
1605 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1573 sector_t sec = (sector_t)chunk << bitmap->chunkshift;
1606 bitmap_set_memory_bits(bitmap, sec, 1); 1574 bitmap_set_memory_bits(bitmap, sec, 1);
1607 spin_lock_irq(&bitmap->lock); 1575 spin_lock_irq(&bitmap->lock);
1608 bitmap_file_set_bit(bitmap, sec); 1576 bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
1759 goto error; 1727 goto error;
1760 1728
1761 bitmap->daemon_lastrun = jiffies; 1729 bitmap->daemon_lastrun = jiffies;
1762 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); 1730 bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
1731 - BITMAP_BLOCK_SHIFT);
1763 1732
1764 /* now that chunksize and chunkshift are set, we can use these macros */ 1733 /* now that chunksize and chunkshift are set, we can use these macros */
1765 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1734 chunks = (blocks + bitmap->chunkshift - 1) >>
1766 CHUNK_BLOCK_SHIFT(bitmap); 1735 bitmap->chunkshift;
1767 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1736 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1768 1737
1769 BUG_ON(!pages); 1738 BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
1836} 1805}
1837EXPORT_SYMBOL_GPL(bitmap_load); 1806EXPORT_SYMBOL_GPL(bitmap_load);
1838 1807
1808void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1809{
1810 unsigned long chunk_kb;
1811 unsigned long flags;
1812
1813 if (!bitmap)
1814 return;
1815
1816 spin_lock_irqsave(&bitmap->lock, flags);
1817 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1818 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1819 "%lu%s chunk",
1820 bitmap->pages - bitmap->missing_pages,
1821 bitmap->pages,
1822 (bitmap->pages - bitmap->missing_pages)
1823 << (PAGE_SHIFT - 10),
1824 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1825 chunk_kb ? "KB" : "B");
1826 if (bitmap->file) {
1827 seq_printf(seq, ", file: ");
1828 seq_path(seq, &bitmap->file->f_path, " \t\n");
1829 }
1830
1831 seq_printf(seq, "\n");
1832 spin_unlock_irqrestore(&bitmap->lock, flags);
1833}
1834
1839static ssize_t 1835static ssize_t
1840location_show(struct mddev *mddev, char *page) 1836location_show(struct mddev *mddev, char *page)
1841{ 1837{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
1904 if (mddev->pers) { 1900 if (mddev->pers) {
1905 mddev->pers->quiesce(mddev, 1); 1901 mddev->pers->quiesce(mddev, 1);
1906 rv = bitmap_create(mddev); 1902 rv = bitmap_create(mddev);
1903 if (!rv)
1904 rv = bitmap_load(mddev);
1907 if (rv) { 1905 if (rv) {
1908 bitmap_destroy(mddev); 1906 bitmap_destroy(mddev);
1909 mddev->bitmap_info.offset = 0; 1907 mddev->bitmap_info.offset = 0;
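A side note on the kmap_atomic() conversions that run through bitmap.c above (and dm-crypt.c further down): the atomic-kmap API in this kernel generation stops taking an explicit KM_USER0 slot, so callers now pass only the page and hand the mapped address back to kunmap_atomic(). A minimal sketch of the new calling convention; zero_one_page() is a hypothetical helper, and the page is assumed to be touched only briefly without sleeping:

    #include <linux/highmem.h>
    #include <linux/string.h>

    /* old form (removed):  addr = kmap_atomic(page, KM_USER0);
     *                      ...
     *                      kunmap_atomic(addr, KM_USER0);
     */
    static void zero_one_page(struct page *page)
    {
            void *addr = kmap_atomic(page);     /* slot handled implicitly */

            memset(addr, 0, PAGE_SIZE);         /* no sleeping while mapped */
            kunmap_atomic(addr);                /* takes the address, not the page */
    }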
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
13#define BITMAP_MAJOR_HI 4 13#define BITMAP_MAJOR_HI 4
14#define BITMAP_MAJOR_HOSTENDIAN 3 14#define BITMAP_MAJOR_HOSTENDIAN 3
15 15
16#define BITMAP_MINOR 39
17
18/* 16/*
19 * in-memory bitmap: 17 * in-memory bitmap:
20 * 18 *
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
101/* same, except a mask value for more efficient bitops */ 99/* same, except a mask value for more efficient bitops */
102#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 100#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
103 101
104#define BITMAP_BLOCK_SIZE 512
105#define BITMAP_BLOCK_SHIFT 9 102#define BITMAP_BLOCK_SHIFT 9
106 103
107/* how many blocks per chunk? (this is variable) */ 104/* how many blocks per chunk? (this is variable) */
108#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) 105#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
109#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
111
112/* when hijacked, the counters and bits represent even larger "chunks" */
113/* there will be 1024 chunks represented by each counter in the page pointers */
114#define PAGEPTR_BLOCK_RATIO(bitmap) \
115 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
116#define PAGEPTR_BLOCK_SHIFT(bitmap) \
117 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
118#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
119 106
120#endif 107#endif
121 108
@@ -181,12 +168,6 @@ struct bitmap_page {
181 unsigned int count:31; 168 unsigned int count:31;
182}; 169};
183 170
184/* keep track of bitmap file pages that have pending writes on them */
185struct page_list {
186 struct list_head list;
187 struct page *page;
188};
189
190/* the main bitmap structure - one per mddev */ 171/* the main bitmap structure - one per mddev */
191struct bitmap { 172struct bitmap {
192 struct bitmap_page *bp; 173 struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
196 struct mddev *mddev; /* the md device that the bitmap is for */ 177 struct mddev *mddev; /* the md device that the bitmap is for */
197 178
198 /* bitmap chunksize -- how much data does each bit represent? */ 179 /* bitmap chunksize -- how much data does each bit represent? */
199 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 180 unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
200 unsigned long chunks; /* total number of data chunks for the array */ 181 unsigned long chunks; /* total number of data chunks for the array */
201 182
202 __u64 events_cleared; 183 __u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
245 226
246void bitmap_print_sb(struct bitmap *bitmap); 227void bitmap_print_sb(struct bitmap *bitmap);
247void bitmap_update_sb(struct bitmap *bitmap); 228void bitmap_update_sb(struct bitmap *bitmap);
229void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
248 230
249int bitmap_setallbits(struct bitmap *bitmap); 231int bitmap_setallbits(struct bitmap *bitmap);
250void bitmap_write_all(struct bitmap *bitmap); 232void bitmap_write_all(struct bitmap *bitmap);
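The bitmap.h hunk above redefines chunkshift so that the chunk size in bytes is 2^(chunkshift + 9); in other words chunkshift now counts 512-byte blocks directly, which is why bitmap_create() subtracts BITMAP_BLOCK_SHIFT when deriving it and why the CHUNK_BLOCK_SHIFT() macro could be dropped in favour of plain bitmap->chunkshift. A small sketch of the arithmetic, assuming a power-of-two chunksize in bytes; the helpers are illustrative, not kernel functions:

    #include <linux/bitops.h>
    #include <linux/types.h>

    #define BITMAP_BLOCK_SHIFT 9                    /* 512-byte blocks */

    /* chunkshift as stored after this series: log2(chunksize in blocks) */
    static unsigned long chunkshift_from_bytes(unsigned long chunksize)
    {
            /* ffz(~x) == log2(x) when x is a power of two */
            return ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
    }

    /* map a sector offset to its bitmap chunk, as the patched code now does */
    static unsigned long chunk_of(sector_t offset, unsigned long chunkshift)
    {
            return offset >> chunkshift;
    }

For a 512 KiB chunk, for instance, chunkshift comes out as 19 - 9 = 10, so a sector offset is shifted right by 10 bits to find its chunk.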
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0a6806f80ab5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -12,7 +12,6 @@
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/version.h>
16#include <linux/shrinker.h> 15#include <linux/shrinker.h>
17#include <linux/module.h> 16#include <linux/module.h>
18 17
@@ -579,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
579 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); 578 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
580 579
581 b->write_error = error; 580 b->write_error = error;
582 if (error) { 581 if (unlikely(error)) {
583 struct dm_bufio_client *c = b->c; 582 struct dm_bufio_client *c = b->c;
584 (void)cmpxchg(&c->async_write_error, 0, error); 583 (void)cmpxchg(&c->async_write_error, 0, error);
585 } 584 }
@@ -698,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
698 dm_bufio_lock(c); 697 dm_bufio_lock(c);
699} 698}
700 699
700enum new_flag {
701 NF_FRESH = 0,
702 NF_READ = 1,
703 NF_GET = 2,
704 NF_PREFETCH = 3
705};
706
701/* 707/*
702 * Allocate a new buffer. If the allocation is not possible, wait until 708 * Allocate a new buffer. If the allocation is not possible, wait until
703 * some other thread frees a buffer. 709 * some other thread frees a buffer.
704 * 710 *
705 * May drop the lock and regain it. 711 * May drop the lock and regain it.
706 */ 712 */
707static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) 713static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
708{ 714{
709 struct dm_buffer *b; 715 struct dm_buffer *b;
710 716
@@ -727,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
727 return b; 733 return b;
728 } 734 }
729 735
736 if (nf == NF_PREFETCH)
737 return NULL;
738
730 if (!list_empty(&c->reserved_buffers)) { 739 if (!list_empty(&c->reserved_buffers)) {
731 b = list_entry(c->reserved_buffers.next, 740 b = list_entry(c->reserved_buffers.next,
732 struct dm_buffer, lru_list); 741 struct dm_buffer, lru_list);
@@ -744,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
744 } 753 }
745} 754}
746 755
747static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) 756static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
748{ 757{
749 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); 758 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
759
760 if (!b)
761 return NULL;
750 762
751 if (c->alloc_callback) 763 if (c->alloc_callback)
752 c->alloc_callback(b); 764 c->alloc_callback(b);
@@ -866,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
866 * Getting a buffer 878 * Getting a buffer
867 *--------------------------------------------------------------*/ 879 *--------------------------------------------------------------*/
868 880
869enum new_flag {
870 NF_FRESH = 0,
871 NF_READ = 1,
872 NF_GET = 2
873};
874
875static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, 881static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
876 enum new_flag nf, struct dm_buffer **bp, 882 enum new_flag nf, int *need_submit)
877 int *need_submit)
878{ 883{
879 struct dm_buffer *b, *new_b = NULL; 884 struct dm_buffer *b, *new_b = NULL;
880 885
881 *need_submit = 0; 886 *need_submit = 0;
882 887
883 b = __find(c, block); 888 b = __find(c, block);
884 if (b) { 889 if (b)
885 b->hold_count++; 890 goto found_buffer;
886 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
887 test_bit(B_WRITING, &b->state));
888 return b;
889 }
890 891
891 if (nf == NF_GET) 892 if (nf == NF_GET)
892 return NULL; 893 return NULL;
893 894
894 new_b = __alloc_buffer_wait(c); 895 new_b = __alloc_buffer_wait(c, nf);
896 if (!new_b)
897 return NULL;
895 898
896 /* 899 /*
897 * We've had a period where the mutex was unlocked, so need to 900 * We've had a period where the mutex was unlocked, so need to
@@ -900,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
900 b = __find(c, block); 903 b = __find(c, block);
901 if (b) { 904 if (b) {
902 __free_buffer_wake(new_b); 905 __free_buffer_wake(new_b);
903 b->hold_count++; 906 goto found_buffer;
904 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
905 test_bit(B_WRITING, &b->state));
906 return b;
907 } 907 }
908 908
909 __check_watermark(c); 909 __check_watermark(c);
@@ -923,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
923 *need_submit = 1; 923 *need_submit = 1;
924 924
925 return b; 925 return b;
926
927found_buffer:
928 if (nf == NF_PREFETCH)
929 return NULL;
930 /*
931 * Note: it is essential that we don't wait for the buffer to be
932 * read if dm_bufio_get function is used. Both dm_bufio_get and
933 * dm_bufio_prefetch can be used in the driver request routine.
934 * If the user called both dm_bufio_prefetch and dm_bufio_get on
935 * the same buffer, it would deadlock if we waited.
936 */
937 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
938 return NULL;
939
940 b->hold_count++;
941 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
942 test_bit(B_WRITING, &b->state));
943 return b;
926} 944}
927 945
928/* 946/*
@@ -957,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
957 struct dm_buffer *b; 975 struct dm_buffer *b;
958 976
959 dm_bufio_lock(c); 977 dm_bufio_lock(c);
960 b = __bufio_new(c, block, nf, bp, &need_submit); 978 b = __bufio_new(c, block, nf, &need_submit);
961 dm_bufio_unlock(c); 979 dm_bufio_unlock(c);
962 980
963 if (!b || IS_ERR(b)) 981 if (!b)
964 return b; 982 return b;
965 983
966 if (need_submit) 984 if (need_submit)
@@ -1006,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1006} 1024}
1007EXPORT_SYMBOL_GPL(dm_bufio_new); 1025EXPORT_SYMBOL_GPL(dm_bufio_new);
1008 1026
1027void dm_bufio_prefetch(struct dm_bufio_client *c,
1028 sector_t block, unsigned n_blocks)
1029{
1030 struct blk_plug plug;
1031
1032 blk_start_plug(&plug);
1033 dm_bufio_lock(c);
1034
1035 for (; n_blocks--; block++) {
1036 int need_submit;
1037 struct dm_buffer *b;
1038 b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
1039 if (unlikely(b != NULL)) {
1040 dm_bufio_unlock(c);
1041
1042 if (need_submit)
1043 submit_io(b, READ, b->block, read_endio);
1044 dm_bufio_release(b);
1045
1046 dm_bufio_cond_resched();
1047
1048 if (!n_blocks)
1049 goto flush_plug;
1050 dm_bufio_lock(c);
1051 }
1052
1053 }
1054
1055 dm_bufio_unlock(c);
1056
1057flush_plug:
1058 blk_finish_plug(&plug);
1059}
1060EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1061
1009void dm_bufio_release(struct dm_buffer *b) 1062void dm_bufio_release(struct dm_buffer *b)
1010{ 1063{
1011 struct dm_bufio_client *c = b->c; 1064 struct dm_bufio_client *c = b->c;
1012 1065
1013 dm_bufio_lock(c); 1066 dm_bufio_lock(c);
1014 1067
1015 BUG_ON(test_bit(B_READING, &b->state));
1016 BUG_ON(!b->hold_count); 1068 BUG_ON(!b->hold_count);
1017 1069
1018 b->hold_count--; 1070 b->hold_count--;
@@ -1025,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
1025 * invalid buffer. 1077 * invalid buffer.
1026 */ 1078 */
1027 if ((b->read_error || b->write_error) && 1079 if ((b->read_error || b->write_error) &&
1080 !test_bit(B_READING, &b->state) &&
1028 !test_bit(B_WRITING, &b->state) && 1081 !test_bit(B_WRITING, &b->state) &&
1029 !test_bit(B_DIRTY, &b->state)) { 1082 !test_bit(B_DIRTY, &b->state)) {
1030 __unlink_buffer(b); 1083 __unlink_buffer(b);
@@ -1042,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1042 1095
1043 dm_bufio_lock(c); 1096 dm_bufio_lock(c);
1044 1097
1098 BUG_ON(test_bit(B_READING, &b->state));
1099
1045 if (!test_and_set_bit(B_DIRTY, &b->state)) 1100 if (!test_and_set_bit(B_DIRTY, &b->state))
1046 __relink_lru(b, LIST_DIRTY); 1101 __relink_lru(b, LIST_DIRTY);
1047 1102
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
63 struct dm_buffer **bp); 63 struct dm_buffer **bp);
64 64
65/* 65/*
66 * Prefetch the specified blocks to the cache.
67 * The function starts to read the blocks and returns without waiting for
68 * I/O to finish.
69 */
70void dm_bufio_prefetch(struct dm_bufio_client *c,
71 sector_t block, unsigned n_blocks);
72
73/*
66 * Release a reference obtained with dm_bufio_{read,get,new}. The data 74 * Release a reference obtained with dm_bufio_{read,get,new}. The data
67 * pointer and dm_buffer pointer is no longer valid after this call. 75 * pointer and dm_buffer pointer is no longer valid after this call.
68 */ 76 */
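The new dm_bufio_prefetch() above starts reads for a run of blocks and returns without waiting for the I/O, and the comment added to __bufio_new() explains why the non-blocking dm_bufio_get() must never wait on a buffer that is still being read. A hedged sketch of how a caller might combine prefetch with the ordinary blocking dm_bufio_read(); the client, block range and process_block() helper are hypothetical, and the error handling assumes dm_bufio_read() reports a failed read as an ERR_PTR:

    #include <linux/err.h>
    #include "dm-bufio.h"

    static void process_block(void *data);          /* hypothetical consumer */

    static int process_range(struct dm_bufio_client *c,
                             sector_t first, unsigned count)
    {
            unsigned i;

            /* kick off I/O for the whole range; returns immediately */
            dm_bufio_prefetch(c, first, count);

            for (i = 0; i < count; i++) {
                    struct dm_buffer *buf;
                    void *data = dm_bufio_read(c, first + i, &buf);

                    if (IS_ERR(data))
                            return PTR_ERR(data);   /* read error */

                    process_block(data);
                    dm_bufio_release(buf);          /* data/buf invalid after this */
            }
            return 0;
    }

The dm-verity target added in the same series selects DM_BUFIO and is the sort of consumer this interface is aimed at.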
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c2a000cf3f5..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
176 176
177#define MIN_IOS 16 177#define MIN_IOS 16
178#define MIN_POOL_PAGES 32 178#define MIN_POOL_PAGES 32
179#define MIN_BIO_PAGES 8
180 179
181static struct kmem_cache *_crypt_io_pool; 180static struct kmem_cache *_crypt_io_pool;
182 181
@@ -590,9 +589,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
590 int r = 0; 589 int r = 0;
591 590
592 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { 591 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
593 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); 592 src = kmap_atomic(sg_page(&dmreq->sg_in));
594 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); 593 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
595 kunmap_atomic(src, KM_USER0); 594 kunmap_atomic(src);
596 } else 595 } else
597 memset(iv, 0, cc->iv_size); 596 memset(iv, 0, cc->iv_size);
598 597
@@ -608,14 +607,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
608 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) 607 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
609 return 0; 608 return 0;
610 609
611 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); 610 dst = kmap_atomic(sg_page(&dmreq->sg_out));
612 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); 611 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
613 612
614 /* Tweak the first block of plaintext sector */ 613 /* Tweak the first block of plaintext sector */
615 if (!r) 614 if (!r)
616 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); 615 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
617 616
618 kunmap_atomic(dst, KM_USER0); 617 kunmap_atomic(dst);
619 return r; 618 return r;
620} 619}
621 620
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
848 } 847 }
849 848
850 /* 849 /*
851 * if additional pages cannot be allocated without waiting, 850 * If additional pages cannot be allocated without waiting,
852 * return a partially allocated bio, the caller will then try 851 * return a partially-allocated bio. The caller will then try
853 * to allocate additional bios while submitting this partial bio 852 * to allocate more bios while submitting this partial bio.
854 */ 853 */
855 if (i == (MIN_BIO_PAGES - 1)) 854 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
856 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
857 855
858 len = (size > PAGE_SIZE) ? PAGE_SIZE : size; 856 len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
859 857
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
1046 queue_work(cc->io_queue, &io->work); 1044 queue_work(cc->io_queue, &io->work);
1047} 1045}
1048 1046
1049static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, 1047static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1050 int error, int async)
1051{ 1048{
1052 struct bio *clone = io->ctx.bio_out; 1049 struct bio *clone = io->ctx.bio_out;
1053 struct crypt_config *cc = io->target->private; 1050 struct crypt_config *cc = io->target->private;
1054 1051
1055 if (unlikely(error < 0)) { 1052 if (unlikely(io->error < 0)) {
1056 crypt_free_buffer_pages(cc, clone); 1053 crypt_free_buffer_pages(cc, clone);
1057 bio_put(clone); 1054 bio_put(clone);
1058 io->error = -EIO;
1059 crypt_dec_pending(io); 1055 crypt_dec_pending(io);
1060 return; 1056 return;
1061 } 1057 }
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1106 sector += bio_sectors(clone); 1102 sector += bio_sectors(clone);
1107 1103
1108 crypt_inc_pending(io); 1104 crypt_inc_pending(io);
1105
1109 r = crypt_convert(cc, &io->ctx); 1106 r = crypt_convert(cc, &io->ctx);
1107 if (r < 0)
1108 io->error = -EIO;
1109
1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending); 1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending);
1111 1111
1112 /* Encryption was already finished, submit io now */ 1112 /* Encryption was already finished, submit io now */
1113 if (crypt_finished) { 1113 if (crypt_finished) {
1114 kcryptd_crypt_write_io_submit(io, r, 0); 1114 kcryptd_crypt_write_io_submit(io, 0);
1115 1115
1116 /* 1116 /*
1117 * If there was an error, do not try next fragments. 1117 * If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1162 crypt_dec_pending(io); 1162 crypt_dec_pending(io);
1163} 1163}
1164 1164
1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) 1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
1166{ 1166{
1167 if (unlikely(error < 0))
1168 io->error = -EIO;
1169
1170 crypt_dec_pending(io); 1167 crypt_dec_pending(io);
1171} 1168}
1172 1169
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1181 io->sector); 1178 io->sector);
1182 1179
1183 r = crypt_convert(cc, &io->ctx); 1180 r = crypt_convert(cc, &io->ctx);
1181 if (r < 0)
1182 io->error = -EIO;
1184 1183
1185 if (atomic_dec_and_test(&io->ctx.pending)) 1184 if (atomic_dec_and_test(&io->ctx.pending))
1186 kcryptd_crypt_read_done(io, r); 1185 kcryptd_crypt_read_done(io);
1187 1186
1188 crypt_dec_pending(io); 1187 crypt_dec_pending(io);
1189} 1188}
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1204 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1203 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1205 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1204 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1206 1205
1206 if (error < 0)
1207 io->error = -EIO;
1208
1207 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1209 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1208 1210
1209 if (!atomic_dec_and_test(&ctx->pending)) 1211 if (!atomic_dec_and_test(&ctx->pending))
1210 return; 1212 return;
1211 1213
1212 if (bio_data_dir(io->base_bio) == READ) 1214 if (bio_data_dir(io->base_bio) == READ)
1213 kcryptd_crypt_read_done(io, error); 1215 kcryptd_crypt_read_done(io);
1214 else 1216 else
1215 kcryptd_crypt_write_io_submit(io, error, 1); 1217 kcryptd_crypt_write_io_submit(io, 1);
1216} 1218}
1217 1219
1218static void kcryptd_crypt(struct work_struct *work) 1220static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1413 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1415 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1414 char *cipher_api = NULL; 1416 char *cipher_api = NULL;
1415 int cpu, ret = -EINVAL; 1417 int cpu, ret = -EINVAL;
1418 char dummy;
1416 1419
1417 /* Convert to crypto api definition? */ 1420 /* Convert to crypto api definition? */
1418 if (strchr(cipher_in, '(')) { 1421 if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1434 1437
1435 if (!keycount) 1438 if (!keycount)
1436 cc->tfms_count = 1; 1439 cc->tfms_count = 1;
1437 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || 1440 else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
1438 !is_power_of_2(cc->tfms_count)) { 1441 !is_power_of_2(cc->tfms_count)) {
1439 ti->error = "Bad cipher key count specification"; 1442 ti->error = "Bad cipher key count specification";
1440 return -EINVAL; 1443 return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1579 int ret; 1582 int ret;
1580 struct dm_arg_set as; 1583 struct dm_arg_set as;
1581 const char *opt_string; 1584 const char *opt_string;
1585 char dummy;
1582 1586
1583 static struct dm_arg _args[] = { 1587 static struct dm_arg _args[] = {
1584 {0, 1, "Invalid number of feature args"}, 1588 {0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1636 } 1640 }
1637 1641
1638 ret = -EINVAL; 1642 ret = -EINVAL;
1639 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1643 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
1640 ti->error = "Invalid iv_offset sector"; 1644 ti->error = "Invalid iv_offset sector";
1641 goto bad; 1645 goto bad;
1642 } 1646 }
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1647 goto bad; 1651 goto bad;
1648 } 1652 }
1649 1653
1650 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 1654 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
1651 ti->error = "Invalid device sector"; 1655 ti->error = "Invalid device sector";
1652 goto bad; 1656 goto bad;
1653 } 1657 }
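The reworked comment in crypt_alloc_buffer() above describes the allocation strategy that now applies from the second page onward: once one page has been obtained, further allocations must not wait, and any shortfall simply produces a partially-allocated bio that the caller submits before trying again. A generic sketch of that opportunistic pattern outside dm-crypt, assuming a caller that can live with fewer pages than requested and using the gfp flag names of this kernel generation:

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /*
     * Allocate up to nr_pages, blocking (and reclaiming) only for the
     * first one; later failures just shorten the result, mirroring the
     * partial-bio behaviour of crypt_alloc_buffer().
     */
    static unsigned alloc_pages_opportunistic(struct page **pages,
                                              unsigned nr_pages)
    {
            gfp_t gfp = GFP_NOIO;
            unsigned i;

            for (i = 0; i < nr_pages; i++) {
                    pages[i] = alloc_page(gfp);
                    if (!pages[i])
                            break;                  /* caller copes with i pages */

                    /* after the first success, never wait for reclaim */
                    gfp = (gfp | __GFP_NOWARN) & ~__GFP_WAIT;
            }
            return i;
    }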
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
131{ 131{
132 struct delay_c *dc; 132 struct delay_c *dc;
133 unsigned long long tmpll; 133 unsigned long long tmpll;
134 char dummy;
134 135
135 if (argc != 3 && argc != 6) { 136 if (argc != 3 && argc != 6) {
136 ti->error = "requires exactly 3 or 6 arguments"; 137 ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
145 146
146 dc->reads = dc->writes = 0; 147 dc->reads = dc->writes = 0;
147 148
148 if (sscanf(argv[1], "%llu", &tmpll) != 1) { 149 if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
149 ti->error = "Invalid device sector"; 150 ti->error = "Invalid device sector";
150 goto bad; 151 goto bad;
151 } 152 }
152 dc->start_read = tmpll; 153 dc->start_read = tmpll;
153 154
154 if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { 155 if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
155 ti->error = "Invalid delay"; 156 ti->error = "Invalid delay";
156 goto bad; 157 goto bad;
157 } 158 }
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
166 if (argc == 3) 167 if (argc == 3)
167 goto out; 168 goto out;
168 169
169 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 170 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
170 ti->error = "Invalid write device sector"; 171 ti->error = "Invalid write device sector";
171 goto bad_dev_read; 172 goto bad_dev_read;
172 } 173 }
173 dc->start_write = tmpll; 174 dc->start_write = tmpll;
174 175
175 if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { 176 if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
176 ti->error = "Invalid write delay"; 177 ti->error = "Invalid write delay";
177 goto bad_dev_read; 178 goto bad_dev_read;
178 } 179 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
283 return 0; 283 return 0;
284 284
285persistent_fail: 285persistent_fail:
286 dm_persistent_snapshot_exit(); 286 dm_transient_snapshot_exit();
287transient_fail: 287transient_fail:
288 return r; 288 return r;
289} 289}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9fb18c147825..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 unsigned long long tmpll; 160 unsigned long long tmpll;
161 struct dm_arg_set as; 161 struct dm_arg_set as;
162 const char *devname; 162 const char *devname;
163 char dummy;
163 164
164 as.argc = argc; 165 as.argc = argc;
165 as.argv = argv; 166 as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
178 179
179 devname = dm_shift_arg(&as); 180 devname = dm_shift_arg(&as);
180 181
181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { 182 if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
182 ti->error = "Invalid device sector"; 183 ti->error = "Invalid device sector";
183 goto bad; 184 goto bad;
184 } 185 }
@@ -323,7 +324,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
323 * Corrupt successful READs while in down state. 324 * Corrupt successful READs while in down state.
324 * If flags were specified, only corrupt those that match. 325 * If flags were specified, only corrupt those that match.
325 */ 326 */
326 if (!error && bio_submitted_while_down && 327 if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
327 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && 328 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
328 all_corrupt_bio_flags_match(bio, fc)) 329 all_corrupt_bio_flags_match(bio, fc))
329 corrupt_bio_data(bio, fc); 330 corrupt_bio_data(bio, fc);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ad2eba40e319..ea5dd289fe2a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
296 unsigned offset; 296 unsigned offset;
297 unsigned num_bvecs; 297 unsigned num_bvecs;
298 sector_t remaining = where->count; 298 sector_t remaining = where->count;
299 struct request_queue *q = bdev_get_queue(where->bdev);
300 sector_t discard_sectors;
299 301
300 /* 302 /*
301 * where->count may be zero if rw holds a flush and we need to 303 * where->count may be zero if rw holds a flush and we need to
@@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
305 /* 307 /*
306 * Allocate a suitably sized-bio. 308 * Allocate a suitably sized-bio.
307 */ 309 */
308 num_bvecs = dm_sector_div_up(remaining, 310 if (rw & REQ_DISCARD)
309 (PAGE_SIZE >> SECTOR_SHIFT)); 311 num_bvecs = 1;
310 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); 312 else
313 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
314 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
315
311 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 316 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
312 bio->bi_sector = where->sector + (where->count - remaining); 317 bio->bi_sector = where->sector + (where->count - remaining);
313 bio->bi_bdev = where->bdev; 318 bio->bi_bdev = where->bdev;
@@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
315 bio->bi_destructor = dm_bio_destructor; 320 bio->bi_destructor = dm_bio_destructor;
316 store_io_and_region_in_bio(bio, io, region); 321 store_io_and_region_in_bio(bio, io, region);
317 322
318 /* 323 if (rw & REQ_DISCARD) {
319 * Try and add as many pages as possible. 324 discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
320 */ 325 bio->bi_size = discard_sectors << SECTOR_SHIFT;
321 while (remaining) { 326 remaining -= discard_sectors;
327 } else while (remaining) {
328 /*
329 * Try and add as many pages as possible.
330 */
322 dp->get_page(dp, &page, &len, &offset); 331 dp->get_page(dp, &page, &len, &offset);
323 len = min(len, to_bytes(remaining)); 332 len = min(len, to_bytes(remaining));
324 if (!bio_add_page(bio, page, len, offset)) 333 if (!bio_add_page(bio, page, len, offset))
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 31c2dc25886d..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
880 struct hd_geometry geometry; 880 struct hd_geometry geometry;
881 unsigned long indata[4]; 881 unsigned long indata[4];
882 char *geostr = (char *) param + param->data_start; 882 char *geostr = (char *) param + param->data_start;
883 char dummy;
883 884
884 md = find_device(param); 885 md = find_device(param);
885 if (!md) 886 if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
891 goto out; 892 goto out;
892 } 893 }
893 894
894 x = sscanf(geostr, "%lu %lu %lu %lu", indata, 895 x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
895 indata + 1, indata + 2, indata + 3); 896 indata + 1, indata + 2, indata + 3, &dummy);
896 897
897 if (x != 4) { 898 if (x != 4) {
898 DMWARN("Unable to interpret geometry settings."); 899 DMWARN("Unable to interpret geometry settings.");
@@ -1437,7 +1438,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1437 1438
1438 if (!argc) { 1439 if (!argc) {
1439 DMWARN("Empty message received."); 1440 DMWARN("Empty message received.");
1440 goto out; 1441 goto out_argv;
1441 } 1442 }
1442 1443
1443 table = dm_get_live_table(md); 1444 table = dm_get_live_table(md);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29{ 29{
30 struct linear_c *lc; 30 struct linear_c *lc;
31 unsigned long long tmp; 31 unsigned long long tmp;
32 char dummy;
32 33
33 if (argc != 2) { 34 if (argc != 2) {
34 ti->error = "Invalid argument count"; 35 ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
41 return -ENOMEM; 42 return -ENOMEM;
42 } 43 }
43 44
44 if (sscanf(argv[1], "%llu", &tmp) != 1) { 45 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
45 ti->error = "dm-linear: Invalid device sector"; 46 ti->error = "dm-linear: Invalid device sector";
46 goto bad; 47 goto bad;
47 } 48 }
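This is the first of many conversions in this series to the sscanf() "%<fmt>%c" idiom: the trailing %c only matches if characters follow the number, so any return value other than the expected count rejects input such as "1024x" that a bare "%llu" would silently accept. A small standalone (userspace) demonstration of the idiom; parse_sector() is a made-up helper, not a dm interface.

#include <stdio.h>

static int parse_sector(const char *str, unsigned long long *out)
{
	unsigned long long tmp;
	char dummy;

	/* A return of 1 means the number matched and nothing trailed it. */
	if (sscanf(str, "%llu%c", &tmp, &dummy) != 1)
		return -1;
	*out = tmp;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("\"1024\"  -> %d\n", parse_sector("1024", &v));	/* 0: accepted */
	printf("\"1024x\" -> %d\n", parse_sector("1024x", &v));	/* -1: trailing junk */
	printf("\"\"      -> %d\n", parse_sector("", &v));	/* -1: no number */
	return 0;
}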
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
369 unsigned int region_count; 369 unsigned int region_count;
370 size_t bitset_size, buf_size; 370 size_t bitset_size, buf_size;
371 int r; 371 int r;
372 char dummy;
372 373
373 if (argc < 1 || argc > 2) { 374 if (argc < 1 || argc > 2) {
374 DMWARN("wrong number of arguments to dirty region log"); 375 DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
387 } 388 }
388 } 389 }
389 390
390 if (sscanf(argv[0], "%u", &region_size) != 1 || 391 if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
391 !_check_region_size(ti, region_size)) { 392 !_check_region_size(ti, region_size)) {
392 DMWARN("invalid region size %s", argv[0]); 393 DMWARN("invalid region size %s", argv[0]);
393 return -EINVAL; 394 return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
226 kfree(m); 226 kfree(m);
227} 227}
228 228
229static int set_mapinfo(struct multipath *m, union map_info *info)
230{
231 struct dm_mpath_io *mpio;
232
233 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
234 if (!mpio)
235 return -ENOMEM;
236
237 memset(mpio, 0, sizeof(*mpio));
238 info->ptr = mpio;
239
240 return 0;
241}
242
243static void clear_mapinfo(struct multipath *m, union map_info *info)
244{
245 struct dm_mpath_io *mpio = info->ptr;
246
247 info->ptr = NULL;
248 mempool_free(mpio, m->mpio_pool);
249}
229 250
230/*----------------------------------------------- 251/*-----------------------------------------------
231 * Path selection 252 * Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
341} 362}
342 363
343static int map_io(struct multipath *m, struct request *clone, 364static int map_io(struct multipath *m, struct request *clone,
344 struct dm_mpath_io *mpio, unsigned was_queued) 365 union map_info *map_context, unsigned was_queued)
345{ 366{
346 int r = DM_MAPIO_REMAPPED; 367 int r = DM_MAPIO_REMAPPED;
347 size_t nr_bytes = blk_rq_bytes(clone); 368 size_t nr_bytes = blk_rq_bytes(clone);
348 unsigned long flags; 369 unsigned long flags;
349 struct pgpath *pgpath; 370 struct pgpath *pgpath;
350 struct block_device *bdev; 371 struct block_device *bdev;
372 struct dm_mpath_io *mpio = map_context->ptr;
351 373
352 spin_lock_irqsave(&m->lock, flags); 374 spin_lock_irqsave(&m->lock, flags);
353 375
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
423{ 445{
424 int r; 446 int r;
425 unsigned long flags; 447 unsigned long flags;
426 struct dm_mpath_io *mpio;
427 union map_info *info; 448 union map_info *info;
428 struct request *clone, *n; 449 struct request *clone, *n;
429 LIST_HEAD(cl); 450 LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
436 list_del_init(&clone->queuelist); 457 list_del_init(&clone->queuelist);
437 458
438 info = dm_get_rq_mapinfo(clone); 459 info = dm_get_rq_mapinfo(clone);
439 mpio = info->ptr;
440 460
441 r = map_io(m, clone, mpio, 1); 461 r = map_io(m, clone, info, 1);
442 if (r < 0) { 462 if (r < 0) {
443 mempool_free(mpio, m->mpio_pool); 463 clear_mapinfo(m, info);
444 dm_kill_unmapped_request(clone, r); 464 dm_kill_unmapped_request(clone, r);
445 } else if (r == DM_MAPIO_REMAPPED) 465 } else if (r == DM_MAPIO_REMAPPED)
446 dm_dispatch_request(clone); 466 dm_dispatch_request(clone);
447 else if (r == DM_MAPIO_REQUEUE) { 467 else if (r == DM_MAPIO_REQUEUE) {
448 mempool_free(mpio, m->mpio_pool); 468 clear_mapinfo(m, info);
449 dm_requeue_unmapped_request(clone); 469 dm_requeue_unmapped_request(clone);
450 } 470 }
451 } 471 }
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
908 union map_info *map_context) 928 union map_info *map_context)
909{ 929{
910 int r; 930 int r;
911 struct dm_mpath_io *mpio;
912 struct multipath *m = (struct multipath *) ti->private; 931 struct multipath *m = (struct multipath *) ti->private;
913 932
914 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); 933 if (set_mapinfo(m, map_context) < 0)
915 if (!mpio)
916 /* ENOMEM, requeue */ 934 /* ENOMEM, requeue */
917 return DM_MAPIO_REQUEUE; 935 return DM_MAPIO_REQUEUE;
918 memset(mpio, 0, sizeof(*mpio));
919 936
920 map_context->ptr = mpio;
921 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 937 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
922 r = map_io(m, clone, mpio, 0); 938 r = map_io(m, clone, map_context, 0);
923 if (r < 0 || r == DM_MAPIO_REQUEUE) 939 if (r < 0 || r == DM_MAPIO_REQUEUE)
924 mempool_free(mpio, m->mpio_pool); 940 clear_mapinfo(m, map_context);
925 941
926 return r; 942 return r;
927} 943}
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
1054 struct priority_group *pg; 1070 struct priority_group *pg;
1055 unsigned pgnum; 1071 unsigned pgnum;
1056 unsigned long flags; 1072 unsigned long flags;
1073 char dummy;
1057 1074
1058 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1075 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1059 (pgnum > m->nr_priority_groups)) { 1076 (pgnum > m->nr_priority_groups)) {
1060 DMWARN("invalid PG number supplied to switch_pg_num"); 1077 DMWARN("invalid PG number supplied to switch_pg_num");
1061 return -EINVAL; 1078 return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1085{ 1102{
1086 struct priority_group *pg; 1103 struct priority_group *pg;
1087 unsigned pgnum; 1104 unsigned pgnum;
1105 char dummy;
1088 1106
1089 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1107 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1090 (pgnum > m->nr_priority_groups)) { 1108 (pgnum > m->nr_priority_groups)) {
1091 DMWARN("invalid PG number supplied to bypass_pg"); 1109 DMWARN("invalid PG number supplied to bypass_pg");
1092 return -EINVAL; 1110 return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1261 struct path_selector *ps; 1279 struct path_selector *ps;
1262 int r; 1280 int r;
1263 1281
1282 BUG_ON(!mpio);
1283
1264 r = do_end_io(m, clone, error, mpio); 1284 r = do_end_io(m, clone, error, mpio);
1265 if (pgpath) { 1285 if (pgpath) {
1266 ps = &pgpath->pg->ps; 1286 ps = &pgpath->pg->ps;
1267 if (ps->type->end_io) 1287 if (ps->type->end_io)
1268 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1288 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1269 } 1289 }
1270 mempool_free(mpio, m->mpio_pool); 1290 clear_mapinfo(m, map_context);
1271 1291
1272 return r; 1292 return r;
1273} 1293}
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
112 struct selector *s = ps->context; 112 struct selector *s = ps->context;
113 struct path_info *pi; 113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO; 114 unsigned repeat_count = QL_MIN_IO;
115 char dummy;
115 116
116 /* 117 /*
117 * Arguments: [<repeat_count>] 118 * Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
125 126
126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 127 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
127 *error = "queue-length ps: invalid repeat count"; 128 *error = "queue-length ps: invalid repeat count";
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 86cb7e5d83d5..b0ba52459ed7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
604 return 0; 604 return 0;
605 605
606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
607 DMERR("Failed to read device superblock"); 607 DMERR("Failed to read superblock of device at position %d",
608 rdev->raid_disk);
609 set_bit(Faulty, &rdev->flags);
608 return -EINVAL; 610 return -EINVAL;
609 } 611 }
610 612
@@ -615,14 +617,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
615 617
616static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 618static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
617{ 619{
618 struct md_rdev *r, *t; 620 struct md_rdev *r;
619 uint64_t failed_devices; 621 uint64_t failed_devices;
620 struct dm_raid_superblock *sb; 622 struct dm_raid_superblock *sb;
621 623
622 sb = page_address(rdev->sb_page); 624 sb = page_address(rdev->sb_page);
623 failed_devices = le64_to_cpu(sb->failed_devices); 625 failed_devices = le64_to_cpu(sb->failed_devices);
624 626
625 rdev_for_each(r, t, mddev) 627 rdev_for_each(r, mddev)
626 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 628 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
627 failed_devices |= (1ULL << r->raid_disk); 629 failed_devices |= (1ULL << r->raid_disk);
628 630
@@ -668,7 +670,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
668 return ret; 670 return ret;
669 671
670 sb = page_address(rdev->sb_page); 672 sb = page_address(rdev->sb_page);
671 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { 673
674 /*
675 * Two cases that we want to write new superblocks and rebuild:
676 * 1) New device (no matching magic number)
677 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
678 */
679 if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
680 (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
672 super_sync(rdev->mddev, rdev); 681 super_sync(rdev->mddev, rdev);
673 682
674 set_bit(FirstUse, &rdev->flags); 683 set_bit(FirstUse, &rdev->flags);
@@ -700,7 +709,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
700 struct dm_raid_superblock *sb; 709 struct dm_raid_superblock *sb;
701 uint32_t new_devs = 0; 710 uint32_t new_devs = 0;
702 uint32_t rebuilds = 0; 711 uint32_t rebuilds = 0;
703 struct md_rdev *r, *t; 712 struct md_rdev *r;
704 struct dm_raid_superblock *sb2; 713 struct dm_raid_superblock *sb2;
705 714
706 sb = page_address(rdev->sb_page); 715 sb = page_address(rdev->sb_page);
@@ -743,13 +752,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
743 * case the In_sync bit will /not/ be set and 752 * case the In_sync bit will /not/ be set and
744 * recovery_cp must be MaxSector. 753 * recovery_cp must be MaxSector.
745 */ 754 */
746 rdev_for_each(r, t, mddev) { 755 rdev_for_each(r, mddev) {
747 if (!test_bit(In_sync, &r->flags)) { 756 if (!test_bit(In_sync, &r->flags)) {
748 if (!test_bit(FirstUse, &r->flags)) 757 DMINFO("Device %d specified for rebuild: "
749 DMERR("Superblock area of " 758 "Clearing superblock", r->raid_disk);
750 "rebuild device %d should have been "
751 "cleared.", r->raid_disk);
752 set_bit(FirstUse, &r->flags);
753 rebuilds++; 759 rebuilds++;
754 } else if (test_bit(FirstUse, &r->flags)) 760 } else if (test_bit(FirstUse, &r->flags))
755 new_devs++; 761 new_devs++;
@@ -778,7 +784,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
778 * Now we set the Faulty bit for those devices that are 784 * Now we set the Faulty bit for those devices that are
779 * recorded in the superblock as failed. 785 * recorded in the superblock as failed.
780 */ 786 */
781 rdev_for_each(r, t, mddev) { 787 rdev_for_each(r, mddev) {
782 if (!r->sb_page) 788 if (!r->sb_page)
783 continue; 789 continue;
784 sb2 = page_address(r->sb_page); 790 sb2 = page_address(r->sb_page);
@@ -851,11 +857,27 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
851static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 857static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
852{ 858{
853 int ret; 859 int ret;
854 struct md_rdev *rdev, *freshest, *tmp; 860 unsigned redundancy = 0;
861 struct raid_dev *dev;
862 struct md_rdev *rdev, *freshest;
855 struct mddev *mddev = &rs->md; 863 struct mddev *mddev = &rs->md;
856 864
865 switch (rs->raid_type->level) {
866 case 1:
867 redundancy = rs->md.raid_disks - 1;
868 break;
869 case 4:
870 case 5:
871 case 6:
872 redundancy = rs->raid_type->parity_devs;
873 break;
874 default:
875 ti->error = "Unknown RAID type";
876 return -EINVAL;
877 }
878
857 freshest = NULL; 879 freshest = NULL;
858 rdev_for_each(rdev, tmp, mddev) { 880 rdev_for_each(rdev, mddev) {
859 if (!rdev->meta_bdev) 881 if (!rdev->meta_bdev)
860 continue; 882 continue;
861 883
@@ -868,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
868 case 0: 890 case 0:
869 break; 891 break;
870 default: 892 default:
893 dev = container_of(rdev, struct raid_dev, rdev);
894 if (redundancy--) {
895 if (dev->meta_dev)
896 dm_put_device(ti, dev->meta_dev);
897
898 dev->meta_dev = NULL;
899 rdev->meta_bdev = NULL;
900
901 if (rdev->sb_page)
902 put_page(rdev->sb_page);
903
904 rdev->sb_page = NULL;
905
906 rdev->sb_loaded = 0;
907
908 /*
909 * We might be able to salvage the data device
910 * even though the meta device has failed. For
911 * now, we behave as though '- -' had been
912 * set for this device in the table.
913 */
914 if (dev->data_dev)
915 dm_put_device(ti, dev->data_dev);
916
917 dev->data_dev = NULL;
918 rdev->bdev = NULL;
919
920 list_del(&rdev->same_set);
921
922 continue;
923 }
871 ti->error = "Failed to load superblock"; 924 ti->error = "Failed to load superblock";
872 return ret; 925 return ret;
873 } 926 }
@@ -884,7 +937,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
884 if (super_validate(mddev, freshest)) 937 if (super_validate(mddev, freshest))
885 return -EINVAL; 938 return -EINVAL;
886 939
887 rdev_for_each(rdev, tmp, mddev) 940 rdev_for_each(rdev, mddev)
888 if ((rdev != freshest) && super_validate(mddev, rdev)) 941 if ((rdev != freshest) && super_validate(mddev, rdev))
889 return -EINVAL; 942 return -EINVAL;
890 943
@@ -971,6 +1024,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
971 1024
972 INIT_WORK(&rs->md.event_work, do_table_event); 1025 INIT_WORK(&rs->md.event_work, do_table_event);
973 ti->private = rs; 1026 ti->private = rs;
1027 ti->num_flush_requests = 1;
974 1028
975 mutex_lock(&rs->md.reconfig_mutex); 1029 mutex_lock(&rs->md.reconfig_mutex);
976 ret = md_run(&rs->md); 1030 ret = md_run(&rs->md);
@@ -1209,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
1209 1263
1210static struct target_type raid_target = { 1264static struct target_type raid_target = {
1211 .name = "raid", 1265 .name = "raid",
1212 .version = {1, 1, 0}, 1266 .version = {1, 2, 0},
1213 .module = THIS_MODULE, 1267 .module = THIS_MODULE,
1214 .ctr = raid_ctr, 1268 .ctr = raid_ctr,
1215 .dtr = raid_dtr, 1269 .dtr = raid_dtr,
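The analyse_superblocks() rework above lets a raid set tolerate metadata-device failures up to the array's redundancy: raid_disks - 1 for raid1, and parity_devs (1 for raid4/5, 2 for raid6) otherwise; a device whose superblock cannot be read is degraded to the equivalent of a '- -' table entry instead of failing the constructor. A standalone illustration of that accounting with assumed values; redundancy() simply mirrors the switch in the patch and is not dm-raid code.

#include <stdio.h>

static int redundancy(int level, int raid_disks, int parity_devs)
{
	switch (level) {
	case 1:
		return raid_disks - 1;	/* all mirrors but one may lose metadata */
	case 4:
	case 5:
	case 6:
		return parity_devs;	/* 1 for raid4/5, 2 for raid6 */
	default:
		return -1;		/* unknown RAID type */
	}
}

int main(void)
{
	printf("raid1, 3 devices: %d tolerated\n", redundancy(1, 3, 0));	/* 2 */
	printf("raid5, 4 devices: %d tolerated\n", redundancy(5, 4, 1));	/* 1 */
	printf("raid6, 5 devices: %d tolerated\n", redundancy(6, 5, 2));	/* 2 */
	return 0;
}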
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be686..d039de8322f0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
924 unsigned int mirror, char **argv) 924 unsigned int mirror, char **argv)
925{ 925{
926 unsigned long long offset; 926 unsigned long long offset;
927 char dummy;
927 928
928 if (sscanf(argv[1], "%llu", &offset) != 1) { 929 if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
929 ti->error = "Invalid offset"; 930 ti->error = "Invalid offset";
930 return -EINVAL; 931 return -EINVAL;
931 } 932 }
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
953{ 954{
954 unsigned param_count; 955 unsigned param_count;
955 struct dm_dirty_log *dl; 956 struct dm_dirty_log *dl;
957 char dummy;
956 958
957 if (argc < 2) { 959 if (argc < 2) {
958 ti->error = "Insufficient mirror log arguments"; 960 ti->error = "Insufficient mirror log arguments";
959 return NULL; 961 return NULL;
960 } 962 }
961 963
962 if (sscanf(argv[1], "%u", &param_count) != 1) { 964 if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
963 ti->error = "Invalid mirror log argument count"; 965 ti->error = "Invalid mirror log argument count";
964 return NULL; 966 return NULL;
965 } 967 }
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
986{ 988{
987 unsigned num_features; 989 unsigned num_features;
988 struct dm_target *ti = ms->ti; 990 struct dm_target *ti = ms->ti;
991 char dummy;
989 992
990 *args_used = 0; 993 *args_used = 0;
991 994
992 if (!argc) 995 if (!argc)
993 return 0; 996 return 0;
994 997
995 if (sscanf(argv[0], "%u", &num_features) != 1) { 998 if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
996 ti->error = "Invalid number of features"; 999 ti->error = "Invalid number of features";
997 return -EINVAL; 1000 return -EINVAL;
998 } 1001 }
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1036 unsigned int nr_mirrors, m, args_used; 1039 unsigned int nr_mirrors, m, args_used;
1037 struct mirror_set *ms; 1040 struct mirror_set *ms;
1038 struct dm_dirty_log *dl; 1041 struct dm_dirty_log *dl;
1042 char dummy;
1039 1043
1040 dl = create_dirty_log(ti, argc, argv, &args_used); 1044 dl = create_dirty_log(ti, argc, argv, &args_used);
1041 if (!dl) 1045 if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1044 argv += args_used; 1048 argv += args_used;
1045 argc -= args_used; 1049 argc -= args_used;
1046 1050
1047 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1051 if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
1048 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 1052 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1049 ti->error = "Invalid number of mirrors"; 1053 ti->error = "Invalid number of mirrors";
1050 dm_dirty_log_destroy(dl); 1054 dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76c..6ab1192cdd5f 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
114 struct selector *s = (struct selector *) ps->context; 114 struct selector *s = (struct selector *) ps->context;
115 struct path_info *pi; 115 struct path_info *pi;
116 unsigned repeat_count = RR_MIN_IO; 116 unsigned repeat_count = RR_MIN_IO;
117 char dummy;
117 118
118 if (argc > 1) { 119 if (argc > 1) {
119 *error = "round-robin ps: incorrect number of arguments"; 120 *error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
121 } 122 }
122 123
123 /* First path argument is number of I/Os before switching path */ 124 /* First path argument is number of I/Os before switching path */
124 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 125 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
125 *error = "round-robin ps: invalid repeat count"; 126 *error = "round-robin ps: invalid repeat count";
126 return -EINVAL; 127 return -EINVAL;
127 } 128 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd78214..9df8f6bd6418 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
110 struct path_info *pi; 110 struct path_info *pi;
111 unsigned repeat_count = ST_MIN_IO; 111 unsigned repeat_count = ST_MIN_IO;
112 unsigned relative_throughput = 1; 112 unsigned relative_throughput = 1;
113 char dummy;
113 114
114 /* 115 /*
115 * Arguments: [<repeat_count> [<relative_throughput>]] 116 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
130 131
131 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 132 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
132 *error = "service-time ps: invalid repeat count"; 133 *error = "service-time ps: invalid repeat count";
133 return -EINVAL; 134 return -EINVAL;
134 } 135 }
135 136
136 if ((argc == 2) && 137 if ((argc == 2) &&
137 (sscanf(argv[1], "%u", &relative_throughput) != 1 || 138 (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
138 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 139 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
139 *error = "service-time ps: invalid relative_throughput value"; 140 *error = "service-time ps: invalid relative_throughput value";
140 return -EINVAL; 141 return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152d..35c94ff24ad5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
75 unsigned int stripe, char **argv) 75 unsigned int stripe, char **argv)
76{ 76{
77 unsigned long long start; 77 unsigned long long start;
78 char dummy;
78 79
79 if (sscanf(argv[1], "%llu", &start) != 1) 80 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
80 return -EINVAL; 81 return -EINVAL;
81 82
82 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 83 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289aff..2e227fbf1622 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
268 vfree(t->highs); 268 vfree(t->highs);
269 269
270 /* free the device list */ 270 /* free the device list */
271 if (t->devices.next != &t->devices) 271 free_devices(&t->devices);
272 free_devices(&t->devices);
273 272
274 dm_free_md_mempools(t->mempools); 273 dm_free_md_mempools(t->mempools);
275 274
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
464 struct dm_dev_internal *dd; 463 struct dm_dev_internal *dd;
465 unsigned int major, minor; 464 unsigned int major, minor;
466 struct dm_table *t = ti->table; 465 struct dm_table *t = ti->table;
466 char dummy;
467 467
468 BUG_ON(!t); 468 BUG_ON(!t);
469 469
470 if (sscanf(path, "%u:%u", &major, &minor) == 2) { 470 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
471 /* Extract the major/minor numbers */ 471 /* Extract the major/minor numbers */
472 dev = MKDEV(major, minor); 472 dev = MKDEV(major, minor);
473 if (MAJOR(dev) != major || MINOR(dev) != minor) 473 if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
842 unsigned *value, char **error, unsigned grouped) 842 unsigned *value, char **error, unsigned grouped)
843{ 843{
844 const char *arg_str = dm_shift_arg(arg_set); 844 const char *arg_str = dm_shift_arg(arg_set);
845 char dummy;
845 846
846 if (!arg_str || 847 if (!arg_str ||
847 (sscanf(arg_str, "%u", value) != 1) || 848 (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
848 (*value < arg->min) || 849 (*value < arg->min) ||
849 (*value > arg->max) || 850 (*value > arg->max) ||
850 (grouped && arg_set->argc < *value)) { 851 (grouped && arg_set->argc < *value)) {
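dm_get_device() uses a variant of the same idiom: demanding exactly two conversions from "%u:%u%c" means a path is treated as a <major>:<minor> pair only when nothing trails the minor number; anything else falls through to the normal device path lookup. A small userspace check of the return values; looks_like_major_minor() is a made-up name for illustration.

#include <stdio.h>

static int looks_like_major_minor(const char *path,
				  unsigned *major, unsigned *minor)
{
	char dummy;

	/* Exactly two conversions: both numbers matched, no trailing junk. */
	return sscanf(path, "%u:%u%c", major, minor, &dummy) == 2;
}

int main(void)
{
	unsigned ma, mi;

	printf("\"8:16\"     -> %d\n", looks_like_major_minor("8:16", &ma, &mi));	/* 1 */
	printf("\"8:16x\"    -> %d\n", looks_like_major_minor("8:16x", &ma, &mi));	/* 0 */
	printf("\"/dev/sdb\" -> %d\n", looks_like_major_minor("/dev/sdb", &ma, &mi));	/* 0 */
	return 0;
}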
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 59c4f0446ffa..737d38865b69 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
385 data_sm = dm_sm_disk_create(tm, nr_blocks); 385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) { 386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed"); 387 DMERR("sm_disk_create failed");
388 dm_tm_unlock(tm, sblock);
388 r = PTR_ERR(data_sm); 389 r = PTR_ERR(data_sm);
389 goto bad; 390 goto bad;
390 } 391 }
@@ -613,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
613 if (r < 0) 614 if (r < 0)
614 goto out; 615 goto out;
615 616
616 r = dm_sm_root_size(pmd->metadata_sm, &data_len); 617 r = dm_sm_root_size(pmd->data_sm, &data_len);
617 if (r < 0) 618 if (r < 0)
618 goto out; 619 goto out;
619 620
@@ -712,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
712 if (r) 713 if (r)
713 goto bad; 714 goto bad;
714 715
716 if (bdev_size > THIN_METADATA_MAX_SECTORS)
717 bdev_size = THIN_METADATA_MAX_SECTORS;
718
715 disk_super = dm_block_data(sblock); 719 disk_super = dm_block_data(sblock);
716 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
717 disk_super->version = cpu_to_le32(THIN_VERSION); 721 disk_super->version = cpu_to_le32(THIN_VERSION);
@@ -789,6 +793,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
789 return 0; 793 return 0;
790} 794}
791 795
796/*
797 * __open_device: Returns @td corresponding to device with id @dev,
798 * creating it if @create is set and incrementing @td->open_count.
799 * On failure, @td is undefined.
800 */
792static int __open_device(struct dm_pool_metadata *pmd, 801static int __open_device(struct dm_pool_metadata *pmd,
793 dm_thin_id dev, int create, 802 dm_thin_id dev, int create,
794 struct dm_thin_device **td) 803 struct dm_thin_device **td)
@@ -799,10 +808,16 @@ static int __open_device(struct dm_pool_metadata *pmd,
799 struct disk_device_details details_le; 808 struct disk_device_details details_le;
800 809
801 /* 810 /*
802 * Check the device isn't already open. 811 * If the device is already open, return it.
803 */ 812 */
804 list_for_each_entry(td2, &pmd->thin_devices, list) 813 list_for_each_entry(td2, &pmd->thin_devices, list)
805 if (td2->id == dev) { 814 if (td2->id == dev) {
815 /*
816 * May not create an already-open device.
817 */
818 if (create)
819 return -EEXIST;
820
806 td2->open_count++; 821 td2->open_count++;
807 *td = td2; 822 *td = td2;
808 return 0; 823 return 0;
@@ -817,6 +832,9 @@ static int __open_device(struct dm_pool_metadata *pmd,
817 if (r != -ENODATA || !create) 832 if (r != -ENODATA || !create)
818 return r; 833 return r;
819 834
835 /*
836 * Create new device.
837 */
820 changed = 1; 838 changed = 1;
821 details_le.mapped_blocks = 0; 839 details_le.mapped_blocks = 0;
822 details_le.transaction_id = cpu_to_le64(pmd->trans_id); 840 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -882,12 +900,10 @@ static int __create_thin(struct dm_pool_metadata *pmd,
882 900
883 r = __open_device(pmd, dev, 1, &td); 901 r = __open_device(pmd, dev, 1, &td);
884 if (r) { 902 if (r) {
885 __close_device(td);
886 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 903 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
887 dm_btree_del(&pmd->bl_info, dev_root); 904 dm_btree_del(&pmd->bl_info, dev_root);
888 return r; 905 return r;
889 } 906 }
890 td->changed = 1;
891 __close_device(td); 907 __close_device(td);
892 908
893 return r; 909 return r;
@@ -967,14 +983,14 @@ static int __create_snap(struct dm_pool_metadata *pmd,
967 goto bad; 983 goto bad;
968 984
969 r = __set_snapshot_details(pmd, td, origin, pmd->time); 985 r = __set_snapshot_details(pmd, td, origin, pmd->time);
986 __close_device(td);
987
970 if (r) 988 if (r)
971 goto bad; 989 goto bad;
972 990
973 __close_device(td);
974 return 0; 991 return 0;
975 992
976bad: 993bad:
977 __close_device(td);
978 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 994 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
979 dm_btree_remove(&pmd->details_info, pmd->details_root, 995 dm_btree_remove(&pmd->details_info, pmd->details_root,
980 &key, &pmd->details_root); 996 &key, &pmd->details_root);
@@ -1211,6 +1227,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
1211 if (r) 1227 if (r)
1212 return r; 1228 return r;
1213 1229
1230 td->mapped_blocks--;
1231 td->changed = 1;
1214 pmd->need_commit = 1; 1232 pmd->need_commit = 1;
1215 1233
1216 return 0; 1234 return 0;
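The new comment above __open_device() pins down its contract: opening an already-open device just bumps open_count, asking to create a device that is already open fails with -EEXIST, and a genuinely new device is created only when create is set. A toy userspace analogue of that contract, using an assumed fixed-size table instead of the pool metadata's btrees; open_device() and struct device_entry are invented for the illustration.

#include <errno.h>
#include <stdio.h>

struct device_entry {
	unsigned id;
	int open_count;
	int in_use;
};

static int open_device(struct device_entry *table, int n,
		       unsigned id, int create, struct device_entry **out)
{
	int i, free_slot = -1;

	for (i = 0; i < n; i++) {
		if (table[i].in_use && table[i].id == id) {
			if (create)
				return -EEXIST;	/* may not create an open device */
			table[i].open_count++;
			*out = &table[i];
			return 0;
		}
		if (!table[i].in_use && free_slot < 0)
			free_slot = i;
	}

	if (!create)
		return -ENODATA;	/* unknown device */
	if (free_slot < 0)
		return -ENOSPC;

	/* Create a new entry. */
	table[free_slot].in_use = 1;
	table[free_slot].id = id;
	table[free_slot].open_count = 1;
	*out = &table[free_slot];
	return 0;
}

int main(void)
{
	struct device_entry table[4] = { { 0 } };
	struct device_entry *td;

	printf("create 7 : %d\n", open_device(table, 4, 7, 1, &td));	/* 0 */
	printf("create 7 : %d\n", open_device(table, 4, 7, 1, &td));	/* -EEXIST */
	printf("open 7   : %d\n", open_device(table, 4, 7, 0, &td));	/* 0, open_count now 2 */
	printf("open 9   : %d\n", open_device(table, 4, 9, 0, &td));	/* -ENODATA */
	return 0;
}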
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c16896877..ed4725e67c96 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
11 11
12#define THIN_METADATA_BLOCK_SIZE 4096 12#define THIN_METADATA_BLOCK_SIZE 4096
13 13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about 16k metadata blocks.
19 */
20#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
21
22/*
23 * A metadata device larger than 16GB triggers a warning.
24 */
25#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
26
14/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
15 28
16struct dm_pool_metadata; 29struct dm_pool_metadata;
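The limit moved into this header works out as follows, assuming 512-byte sectors: one index block holds 255 entries, each entry covers 1 << 14 = 16384 metadata blocks, and each 4096-byte metadata block is 8 sectors, so 255 * 16384 * 8 = 33,423,360 sectors, a little under 16 GiB. The separate THIN_METADATA_MAX_SECTORS_WARNING constant (exactly 16 GiB) flags metadata devices whose extra space can never be used. A trivial program reproducing the arithmetic:

#include <stdio.h>

int main(void)
{
	const unsigned long long block_size = 4096;	/* THIN_METADATA_BLOCK_SIZE */
	const unsigned long long sectors_per_block = block_size / 512;
	const unsigned long long max_sectors =
		255ULL * (1 << 14) * sectors_per_block;

	printf("max metadata sectors: %llu (%.2f GiB)\n",
	       max_sectors, max_sectors * 512.0 / (1ULL << 30));
	return 0;
}

Output: "max metadata sectors: 33423360 (15.94 GiB)".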
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
23#define DEFERRED_SET_SIZE 64 23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 25#define PRISON_CELLS 1024
26#define COMMIT_PERIOD HZ
26 27
27/* 28/*
28 * The block size of the device holding pool data must be 29 * The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 33#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33 34
34/* 35/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
43
44/*
45 * Device id is restricted to 24 bits. 36 * Device id is restricted to 24 bits.
46 */ 37 */
47#define MAX_DEV_ID ((1 << 24) - 1) 38#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
72 * missed out if the io covers the block. (schedule_copy). 63 * missed out if the io covers the block. (schedule_copy).
73 * 64 *
74 * iv) insert the new mapping into the origin's btree 65 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some 66 * (process_prepared_mapping). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only 67 * sharing of btree nodes between the two devices. Breaking sharing only
77 * effects the btree of that specific device. Btrees for the other 68 * effects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin 69 * devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
124 struct hlist_node list; 115 struct hlist_node list;
125 struct bio_prison *prison; 116 struct bio_prison *prison;
126 struct cell_key key; 117 struct cell_key key;
127 unsigned count; 118 struct bio *holder;
128 struct bio_list bios; 119 struct bio_list bios;
129}; 120};
130 121
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
220 * This may block if a new cell needs allocating. You must ensure that 211 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked. 212 * cells will be unlocked even if the calling thread is blocked.
222 * 213 *
223 * Returns the number of entries in the cell prior to the new addition 214 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
224 * or < 0 on failure.
225 */ 215 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key, 216static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref) 217 struct bio *inmate, struct cell **ref)
228{ 218{
229 int r; 219 int r = 1;
230 unsigned long flags; 220 unsigned long flags;
231 uint32_t hash = hash_key(prison, key); 221 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL; 222 struct cell *cell, *cell2;
233 223
234 BUG_ON(hash > prison->nr_buckets); 224 BUG_ON(hash > prison->nr_buckets);
235 225
236 spin_lock_irqsave(&prison->lock, flags); 226 spin_lock_irqsave(&prison->lock, flags);
227
237 cell = __search_bucket(prison->cells + hash, key); 228 cell = __search_bucket(prison->cells + hash, key);
229 if (cell) {
230 bio_list_add(&cell->bios, inmate);
231 goto out;
232 }
238 233
239 if (!cell) { 234 /*
240 /* 235 * Allocate a new cell
241 * Allocate a new cell 236 */
242 */ 237 spin_unlock_irqrestore(&prison->lock, flags);
243 spin_unlock_irqrestore(&prison->lock, flags); 238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 239 spin_lock_irqsave(&prison->lock, flags);
245 spin_lock_irqsave(&prison->lock, flags);
246 240
247 /* 241 /*
248 * We've been unlocked, so we have to double check that 242 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime. 243 * nobody else has inserted this cell in the meantime.
250 */ 244 */
251 cell = __search_bucket(prison->cells + hash, key); 245 cell = __search_bucket(prison->cells + hash, key);
246 if (cell) {
247 mempool_free(cell2, prison->cell_pool);
248 bio_list_add(&cell->bios, inmate);
249 goto out;
250 }
252 251
253 if (!cell) { 252 /*
254 cell = cell2; 253 * Use new cell.
255 cell2 = NULL; 254 */
255 cell = cell2;
256 256
257 cell->prison = prison; 257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key)); 258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0; 259 cell->holder = inmate;
260 bio_list_init(&cell->bios); 260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash); 261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264 262
265 r = cell->count++; 263 r = 0;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268 264
269 if (cell2) 265out:
270 mempool_free(cell2, prison->cell_pool); 266 spin_unlock_irqrestore(&prison->lock, flags);
271 267
272 *ref = cell; 268 *ref = cell;
273 269
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
283 279
284 hlist_del(&cell->list); 280 hlist_del(&cell->list);
285 281
286 if (inmates) 282 bio_list_add(inmates, cell->holder);
287 bio_list_merge(inmates, &cell->bios); 283 bio_list_merge(inmates, &cell->bios);
288 284
289 mempool_free(cell, prison->cell_pool); 285 mempool_free(cell, prison->cell_pool);
290} 286}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
305 * bio may be in the cell. This function releases the cell, and also does 301 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check. 302 * a sanity check.
307 */ 303 */
304static void __cell_release_singleton(struct cell *cell, struct bio *bio)
305{
306 hlist_del(&cell->list);
307 BUG_ON(cell->holder != bio);
308 BUG_ON(!bio_list_empty(&cell->bios));
309}
310
308static void cell_release_singleton(struct cell *cell, struct bio *bio) 311static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{ 312{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags; 313 unsigned long flags;
314 314 struct bio_prison *prison = cell->prison;
315 bio_list_init(&bios);
316 315
317 spin_lock_irqsave(&prison->lock, flags); 316 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios); 317 __cell_release_singleton(cell, bio);
319 spin_unlock_irqrestore(&prison->lock, flags); 318 spin_unlock_irqrestore(&prison->lock, flags);
319}
320
321/*
322 * Sometimes we don't want the holder, just the additional bios.
323 */
324static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
325{
326 struct bio_prison *prison = cell->prison;
327
328 hlist_del(&cell->list);
329 bio_list_merge(inmates, &cell->bios);
320 330
321 b = bio_list_pop(&bios); 331 mempool_free(cell, prison->cell_pool);
322 BUG_ON(b != bio); 332}
323 BUG_ON(!bio_list_empty(&bios)); 333
334static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
335{
336 unsigned long flags;
337 struct bio_prison *prison = cell->prison;
338
339 spin_lock_irqsave(&prison->lock, flags);
340 __cell_release_no_holder(cell, inmates);
341 spin_unlock_irqrestore(&prison->lock, flags);
324} 342}
325 343
326static void cell_error(struct cell *cell) 344static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
471 * devices. 489 * devices.
472 */ 490 */
473struct new_mapping; 491struct new_mapping;
492
493struct pool_features {
494 unsigned zero_new_blocks:1;
495 unsigned discard_enabled:1;
496 unsigned discard_passdown:1;
497};
498
474struct pool { 499struct pool {
475 struct list_head list; 500 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */ 501 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
484 dm_block_t offset_mask; 509 dm_block_t offset_mask;
485 dm_block_t low_water_blocks; 510 dm_block_t low_water_blocks;
486 511
487 unsigned zero_new_blocks:1; 512 struct pool_features pf;
488 unsigned low_water_triggered:1; /* A dm event has been sent */ 513 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 514 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490 515
@@ -493,17 +518,21 @@ struct pool {
493 518
494 struct workqueue_struct *wq; 519 struct workqueue_struct *wq;
495 struct work_struct worker; 520 struct work_struct worker;
521 struct delayed_work waker;
496 522
497 unsigned ref_count; 523 unsigned ref_count;
524 unsigned long last_commit_jiffies;
498 525
499 spinlock_t lock; 526 spinlock_t lock;
500 struct bio_list deferred_bios; 527 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios; 528 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings; 529 struct list_head prepared_mappings;
530 struct list_head prepared_discards;
503 531
504 struct bio_list retry_on_resume_list; 532 struct bio_list retry_on_resume_list;
505 533
506 struct deferred_set ds; /* FIXME: move to thin_c */ 534 struct deferred_set shared_read_ds;
535 struct deferred_set all_io_ds;
507 536
508 struct new_mapping *next_mapping; 537 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool; 538 mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
521 struct dm_target_callbacks callbacks; 550 struct dm_target_callbacks callbacks;
522 551
523 dm_block_t low_water_blocks; 552 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1; 553 struct pool_features pf;
525}; 554};
526 555
527/* 556/*
@@ -529,6 +558,7 @@ struct pool_c {
529 */ 558 */
530struct thin_c { 559struct thin_c {
531 struct dm_dev *pool_dev; 560 struct dm_dev *pool_dev;
561 struct dm_dev *origin_dev;
532 dm_thin_id dev_id; 562 dm_thin_id dev_id;
533 563
534 struct pool *pool; 564 struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
597 627
598/*----------------------------------------------------------------*/ 628/*----------------------------------------------------------------*/
599 629
630struct endio_hook {
631 struct thin_c *tc;
632 struct deferred_entry *shared_read_entry;
633 struct deferred_entry *all_io_entry;
634 struct new_mapping *overwrite_mapping;
635};
636
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 637static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{ 638{
602 struct bio *bio; 639 struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
607 bio_list_init(master); 644 bio_list_init(master);
608 645
609 while ((bio = bio_list_pop(&bios))) { 646 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc) 647 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
648 if (h->tc == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE); 649 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else 650 else
613 bio_list_add(master, bio); 651 bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
646 (bio->bi_sector & pool->offset_mask); 684 (bio->bi_sector & pool->offset_mask);
647} 685}
648 686
649static void remap_and_issue(struct thin_c *tc, struct bio *bio, 687static void remap_to_origin(struct thin_c *tc, struct bio *bio)
650 dm_block_t block) 688{
689 bio->bi_bdev = tc->origin_dev->bdev;
690}
691
692static void issue(struct thin_c *tc, struct bio *bio)
651{ 693{
652 struct pool *pool = tc->pool; 694 struct pool *pool = tc->pool;
653 unsigned long flags; 695 unsigned long flags;
654 696
655 remap(tc, bio, block);
656
657 /* 697 /*
658 * Batch together any FUA/FLUSH bios we find and then issue 698 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios(). 699 * a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
666 generic_make_request(bio); 706 generic_make_request(bio);
667} 707}
668 708
709static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
710{
711 remap_to_origin(tc, bio);
712 issue(tc, bio);
713}
714
715static void remap_and_issue(struct thin_c *tc, struct bio *bio,
716 dm_block_t block)
717{
718 remap(tc, bio, block);
719 issue(tc, bio);
720}
721
669/* 722/*
670 * wake_worker() is used when new work is queued and when pool_resume is 723 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing. 724 * ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
680/* 733/*
681 * Bio endio functions. 734 * Bio endio functions.
682 */ 735 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping { 736struct new_mapping {
690 struct list_head list; 737 struct list_head list;
691 738
692 int prepared; 739 unsigned quiesced:1;
740 unsigned prepared:1;
741 unsigned pass_discard:1;
693 742
694 struct thin_c *tc; 743 struct thin_c *tc;
695 dm_block_t virt_block; 744 dm_block_t virt_block;
696 dm_block_t data_block; 745 dm_block_t data_block;
697 struct cell *cell; 746 struct cell *cell, *cell2;
698 int err; 747 int err;
699 748
700 /* 749 /*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
711{ 760{
712 struct pool *pool = m->tc->pool; 761 struct pool *pool = m->tc->pool;
713 762
714 if (list_empty(&m->list) && m->prepared) { 763 if (m->quiesced && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings); 764 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool); 765 wake_worker(pool);
717 } 766 }
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
734static void overwrite_endio(struct bio *bio, int err) 783static void overwrite_endio(struct bio *bio, int err)
735{ 784{
736 unsigned long flags; 785 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
787 struct new_mapping *m = h->overwrite_mapping;
738 struct pool *pool = m->tc->pool; 788 struct pool *pool = m->tc->pool;
739 789
740 m->err = err; 790 m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
745 spin_unlock_irqrestore(&pool->lock, flags); 795 spin_unlock_irqrestore(&pool->lock, flags);
746} 796}
747 797
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/ 798/*----------------------------------------------------------------*/
774 799
775/* 800/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
800 * Same as cell_defer above, except it omits one particular detainee, 825 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed. 826 * a write bio that covers the block and has already been processed.
802 */ 827 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell, 828static void cell_defer_except(struct thin_c *tc, struct cell *cell)
804 struct bio *exception)
805{ 829{
806 struct bio_list bios; 830 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool; 831 struct pool *pool = tc->pool;
809 unsigned long flags; 832 unsigned long flags;
810 833
811 bio_list_init(&bios); 834 bio_list_init(&bios);
812 cell_release(cell, &bios);
813 835
814 spin_lock_irqsave(&pool->lock, flags); 836 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios))) 837 cell_release_no_holder(cell, &pool->deferred_bios);
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags); 838 spin_unlock_irqrestore(&pool->lock, flags);
819 839
820 wake_worker(pool); 840 wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
854 * the bios in the cell. 874 * the bios in the cell.
855 */ 875 */
856 if (bio) { 876 if (bio) {
857 cell_defer_except(tc, m->cell, bio); 877 cell_defer_except(tc, m->cell);
858 bio_endio(bio, 0); 878 bio_endio(bio, 0);
859 } else 879 } else
860 cell_defer(tc, m->cell, m->data_block); 880 cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
863 mempool_free(m, tc->pool->mapping_pool); 883 mempool_free(m, tc->pool->mapping_pool);
864} 884}
865 885
866static void process_prepared_mappings(struct pool *pool) 886static void process_prepared_discard(struct new_mapping *m)
887{
888 int r;
889 struct thin_c *tc = m->tc;
890
891 r = dm_thin_remove_block(tc->td, m->virt_block);
892 if (r)
893 DMERR("dm_thin_remove_block() failed");
894
895 /*
896 * Pass the discard down to the underlying device?
897 */
898 if (m->pass_discard)
899 remap_and_issue(tc, m->bio, m->data_block);
900 else
901 bio_endio(m->bio, 0);
902
903 cell_defer_except(tc, m->cell);
904 cell_defer_except(tc, m->cell2);
905 mempool_free(m, tc->pool->mapping_pool);
906}
907
908static void process_prepared(struct pool *pool, struct list_head *head,
909 void (*fn)(struct new_mapping *))
867{ 910{
868 unsigned long flags; 911 unsigned long flags;
869 struct list_head maps; 912 struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
871 914
872 INIT_LIST_HEAD(&maps); 915 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags); 916 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps); 917 list_splice_init(head, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags); 918 spin_unlock_irqrestore(&pool->lock, flags);
876 919
877 list_for_each_entry_safe(m, tmp, &maps, list) 920 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m); 921 fn(m);
879} 922}
880 923
881/* 924/*
882 * Deferred bio jobs. 925 * Deferred bio jobs.
883 */ 926 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio) 927static int io_overlaps_block(struct pool *pool, struct bio *bio)
885{ 928{
886 return ((bio_data_dir(bio) == WRITE) && 929 return !(bio->bi_sector & pool->offset_mask) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
931
932}
933
934static int io_overwrites_block(struct pool *pool, struct bio *bio)
935{
936 return (bio_data_dir(bio) == WRITE) &&
937 io_overlaps_block(pool, bio);
889} 938}
890 939
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 940static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
917} 966}
918 967
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 968static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest, 969 struct dm_dev *origin, dm_block_t data_origin,
970 dm_block_t data_dest,
921 struct cell *cell, struct bio *bio) 971 struct cell *cell, struct bio *bio)
922{ 972{
923 int r; 973 int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
925 struct new_mapping *m = get_next_mapping(pool); 975 struct new_mapping *m = get_next_mapping(pool);
926 976
927 INIT_LIST_HEAD(&m->list); 977 INIT_LIST_HEAD(&m->list);
978 m->quiesced = 0;
928 m->prepared = 0; 979 m->prepared = 0;
929 m->tc = tc; 980 m->tc = tc;
930 m->virt_block = virt_block; 981 m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
933 m->err = 0; 984 m->err = 0;
934 m->bio = NULL; 985 m->bio = NULL;
935 986
936 ds_add_work(&pool->ds, &m->list); 987 if (!ds_add_work(&pool->shared_read_ds, &m->list))
988 m->quiesced = 1;
937 989
938 /* 990 /*
939 * IO to pool_dev remaps to the pool target's data_dev. 991 * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
942 * bio immediately. Otherwise we use kcopyd to clone the data first. 994 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */ 995 */
944 if (io_overwrites_block(pool, bio)) { 996 if (io_overwrites_block(pool, bio)) {
997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
998 h->overwrite_mapping = m;
945 m->bio = bio; 999 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest); 1001 remap_and_issue(tc, bio, data_dest);
949 } else { 1002 } else {
950 struct dm_io_region from, to; 1003 struct dm_io_region from, to;
951 1004
952 from.bdev = tc->pool_dev->bdev; 1005 from.bdev = origin->bdev;
953 from.sector = data_origin * pool->sectors_per_block; 1006 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block; 1007 from.count = pool->sectors_per_block;
955 1008
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
967 } 1020 }
968} 1021}
969 1022
1023static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1024 dm_block_t data_origin, dm_block_t data_dest,
1025 struct cell *cell, struct bio *bio)
1026{
1027 schedule_copy(tc, virt_block, tc->pool_dev,
1028 data_origin, data_dest, cell, bio);
1029}
1030
1031static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1032 dm_block_t data_dest,
1033 struct cell *cell, struct bio *bio)
1034{
1035 schedule_copy(tc, virt_block, tc->origin_dev,
1036 virt_block, data_dest, cell, bio);
1037}
1038
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1039static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell, 1040 dm_block_t data_block, struct cell *cell,
972 struct bio *bio) 1041 struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
975 struct new_mapping *m = get_next_mapping(pool); 1044 struct new_mapping *m = get_next_mapping(pool);
976 1045
977 INIT_LIST_HEAD(&m->list); 1046 INIT_LIST_HEAD(&m->list);
1047 m->quiesced = 1;
978 m->prepared = 0; 1048 m->prepared = 0;
979 m->tc = tc; 1049 m->tc = tc;
980 m->virt_block = virt_block; 1050 m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
988 * zeroing pre-existing data, we can issue the bio immediately. 1058 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first. 1059 * Otherwise we use kcopyd to zero the data first.
990 */ 1060 */
991 if (!pool->zero_new_blocks) 1061 if (!pool->pf.zero_new_blocks)
992 process_prepared_mapping(m); 1062 process_prepared_mapping(m);
993 1063
994 else if (io_overwrites_block(pool, bio)) { 1064 else if (io_overwrites_block(pool, bio)) {
1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1066 h->overwrite_mapping = m;
995 m->bio = bio; 1067 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block); 1069 remap_and_issue(tc, bio, data_block);
999 1070
1000 } else { 1071 } else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1081 */ 1152 */
1082static void retry_on_resume(struct bio *bio) 1153static void retry_on_resume(struct bio *bio)
1083{ 1154{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1156 struct thin_c *tc = h->tc;
1085 struct pool *pool = tc->pool; 1157 struct pool *pool = tc->pool;
1086 unsigned long flags; 1158 unsigned long flags;
1087 1159
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
1102 retry_on_resume(bio); 1174 retry_on_resume(bio);
1103} 1175}
1104 1176
1177static void process_discard(struct thin_c *tc, struct bio *bio)
1178{
1179 int r;
1180 struct pool *pool = tc->pool;
1181 struct cell *cell, *cell2;
1182 struct cell_key key, key2;
1183 dm_block_t block = get_bio_block(tc, bio);
1184 struct dm_thin_lookup_result lookup_result;
1185 struct new_mapping *m;
1186
1187 build_virtual_key(tc->td, block, &key);
1188 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1189 return;
1190
1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1192 switch (r) {
1193 case 0:
1194 /*
1195 * Check nobody is fiddling with this pool block. This can
1196 * happen if someone's in the process of breaking sharing
1197 * on this block.
1198 */
1199 build_data_key(tc->td, lookup_result.block, &key2);
1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1201 cell_release_singleton(cell, bio);
1202 break;
1203 }
1204
1205 if (io_overlaps_block(pool, bio)) {
1206 /*
1207 * IO may still be going to the destination block. We must
1208 * quiesce before we can do the removal.
1209 */
1210 m = get_next_mapping(pool);
1211 m->tc = tc;
1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
1213 m->virt_block = block;
1214 m->data_block = lookup_result.block;
1215 m->cell = cell;
1216 m->cell2 = cell2;
1217 m->err = 0;
1218 m->bio = bio;
1219
1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1221 list_add(&m->list, &pool->prepared_discards);
1222 wake_worker(pool);
1223 }
1224 } else {
1225 /*
1226 * This path is hit if people are ignoring
1227 * limits->discard_granularity. It ignores any
1228 * part of the discard that is in a subsequent
1229 * block.
1230 */
1231 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1232 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1233 bio->bi_size = min(bio->bi_size, remaining);
1234
1235 cell_release_singleton(cell, bio);
1236 cell_release_singleton(cell2, bio);
1237 remap_and_issue(tc, bio, lookup_result.block);
1238 }
1239 break;
1240
1241 case -ENODATA:
1242 /*
1243 * It isn't provisioned, just forget it.
1244 */
1245 cell_release_singleton(cell, bio);
1246 bio_endio(bio, 0);
1247 break;
1248
1249 default:
1250 DMERR("discard: find block unexpectedly returned %d", r);
1251 cell_release_singleton(cell, bio);
1252 bio_io_error(bio);
1253 break;
1254 }
1255}
1256
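The else branch above handles callers that ignore limits->discard_granularity: only the part of the discard that falls inside the current pool block is kept. A minimal userspace sketch of that trimming arithmetic, using invented sector counts rather than the driver's pool fields:

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical illustration of trimming a discard to one pool block.
 * block_shift and sectors_per_block mirror the pool fields; the values
 * are made up for the example.
 */
int main(void)
{
	uint64_t sectors_per_block = 128;	/* 64KiB blocks */
	unsigned block_shift = 7;		/* log2(128) */
	uint64_t bi_sector = 300;		/* discard start, in sectors */
	uint64_t bi_size = 96 * 512;		/* discard length, in bytes */

	uint64_t block = bi_sector >> block_shift;		/* pool block 2 */
	uint64_t offset = bi_sector - (block << block_shift);	/* 44 sectors in */
	uint64_t remaining = (sectors_per_block - offset) << 9;

	if (bi_size > remaining)
		bi_size = remaining;	/* trimmed to the 84 sectors left in block 2 */

	printf("block %llu, trimmed to %llu bytes\n",
	       (unsigned long long)block, (unsigned long long)bi_size);
	return 0;
}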
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1257static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key, 1258 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result, 1259 struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1113 r = alloc_data_block(tc, &data_block); 1265 r = alloc_data_block(tc, &data_block);
1114 switch (r) { 1266 switch (r) {
1115 case 0: 1267 case 0:
1116 schedule_copy(tc, block, lookup_result->block, 1268 schedule_internal_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio); 1269 data_block, cell, bio);
1118 break; 1270 break;
1119 1271
1120 case -ENOSPC: 1272 case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1147 if (bio_data_dir(bio) == WRITE) 1299 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell); 1300 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else { 1301 else {
1150 struct endio_hook *h; 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 1303
1153 h->tc = tc; 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157 1305
1158 cell_release_singleton(cell, bio); 1306 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block); 1307 remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1188 r = alloc_data_block(tc, &data_block); 1336 r = alloc_data_block(tc, &data_block);
1189 switch (r) { 1337 switch (r) {
1190 case 0: 1338 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio); 1339 if (tc->origin_dev)
1340 schedule_external_copy(tc, block, data_block, cell, bio);
1341 else
1342 schedule_zero(tc, block, data_block, cell, bio);
1192 break; 1343 break;
1193 1344
1194 case -ENOSPC: 1345 case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1239 break; 1390 break;
1240 1391
1241 case -ENODATA: 1392 case -ENODATA:
1242 provision_block(tc, bio, block, cell); 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1394 cell_release_singleton(cell, bio);
1395 remap_to_origin_and_issue(tc, bio);
1396 } else
1397 provision_block(tc, bio, block, cell);
1243 break; 1398 break;
1244 1399
1245 default: 1400 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r); 1401 DMERR("dm_thin_find_block() failed, error = %d", r);
1402 cell_release_singleton(cell, bio);
1247 bio_io_error(bio); 1403 bio_io_error(bio);
1248 break; 1404 break;
1249 } 1405 }
1250} 1406}
1251 1407
1408static int need_commit_due_to_time(struct pool *pool)
1409{
1410 return jiffies < pool->last_commit_jiffies ||
1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1412}
1413
1252static void process_deferred_bios(struct pool *pool) 1414static void process_deferred_bios(struct pool *pool)
1253{ 1415{
1254 unsigned long flags; 1416 unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
1264 spin_unlock_irqrestore(&pool->lock, flags); 1426 spin_unlock_irqrestore(&pool->lock, flags);
1265 1427
1266 while ((bio = bio_list_pop(&bios))) { 1428 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1430 struct thin_c *tc = h->tc;
1431
1268 /* 1432 /*
1269 * If we've got no free new_mapping structs, and processing 1433 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some 1434 * this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
1277 1441
1278 break; 1442 break;
1279 } 1443 }
1280 process_bio(tc, bio); 1444
1445 if (bio->bi_rw & REQ_DISCARD)
1446 process_discard(tc, bio);
1447 else
1448 process_bio(tc, bio);
1281 } 1449 }
1282 1450
1283 /* 1451 /*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
1290 bio_list_init(&pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags); 1459 spin_unlock_irqrestore(&pool->lock, flags);
1292 1460
1293 if (bio_list_empty(&bios)) 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1294 return; 1462 return;
1295 1463
1296 r = dm_pool_commit_metadata(pool->pmd); 1464 r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
1301 bio_io_error(bio); 1469 bio_io_error(bio);
1302 return; 1470 return;
1303 } 1471 }
1472 pool->last_commit_jiffies = jiffies;
1304 1473
1305 while ((bio = bio_list_pop(&bios))) 1474 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio); 1475 generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
1310{ 1479{
1311 struct pool *pool = container_of(ws, struct pool, worker); 1480 struct pool *pool = container_of(ws, struct pool, worker);
1312 1481
1313 process_prepared_mappings(pool); 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1314 process_deferred_bios(pool); 1484 process_deferred_bios(pool);
1315} 1485}
1316 1486
1487/*
1488 * We want to commit periodically so that not too much
1489 * unwritten data builds up.
1490 */
1491static void do_waker(struct work_struct *ws)
1492{
1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1494 wake_worker(pool);
1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1496}
1497
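need_commit_due_to_time() above also treats a tick count that appears to have moved backwards as "commit now", which covers jiffies wrap. A small userspace sketch of the combined decision made in process_deferred_bios(), with a made-up tick counter standing in for jiffies:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define COMMIT_PERIOD 100	/* ticks; hypothetical */

/* Commit if the period elapsed or the counter jumped backwards. */
static bool need_commit_due_to_time(uint64_t now, uint64_t last_commit)
{
	return now < last_commit || now > last_commit + COMMIT_PERIOD;
}

int main(void)
{
	uint64_t last_commit = 500, now = 650;
	int pending_flush_bios = 0;

	/* Skip the commit only if no flush bios are waiting and the period hasn't elapsed. */
	if (pending_flush_bios || need_commit_due_to_time(now, last_commit)) {
		last_commit = now;	/* the driver records last_commit_jiffies here */
		printf("committed at tick %llu\n", (unsigned long long)last_commit);
	}
	return 0;
}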
1317/*----------------------------------------------------------------*/ 1498/*----------------------------------------------------------------*/
1318 1499
1319/* 1500/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1335 wake_worker(pool); 1516 wake_worker(pool);
1336} 1517}
1337 1518
1519static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1520{
1521 struct pool *pool = tc->pool;
1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1523
1524 h->tc = tc;
1525 h->shared_read_entry = NULL;
1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1527 h->overwrite_mapping = NULL;
1528
1529 return h;
1530}
1531
1338/* 1532/*
1339 * Non-blocking function called from the thin target's map function. 1533 * Non-blocking function called from the thin target's map function.
1340 */ 1534 */
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1347 struct dm_thin_device *td = tc->td; 1541 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result; 1542 struct dm_thin_lookup_result result;
1349 1543
1350 /* 1544 map_context->ptr = thin_hook_bio(tc, bio);
1351 * Save the thin context for easy access from the deferred bio later. 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio); 1546 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED; 1547 return DM_MAPIO_SUBMITTED;
1358 } 1548 }
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1434 1624
1435 pool->ti = ti; 1625 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks; 1626 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks; 1627 pool->pf = pt->pf;
1438 1628
1439 return 0; 1629 return 0;
1440} 1630}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1448/*---------------------------------------------------------------- 1638/*----------------------------------------------------------------
1449 * Pool creation 1639 * Pool creation
1450 *--------------------------------------------------------------*/ 1640 *--------------------------------------------------------------*/
1641/* Initialize pool features. */
1642static void pool_features_init(struct pool_features *pf)
1643{
1644 pf->zero_new_blocks = 1;
1645 pf->discard_enabled = 1;
1646 pf->discard_passdown = 1;
1647}
1648
1451static void __pool_destroy(struct pool *pool) 1649static void __pool_destroy(struct pool *pool)
1452{ 1650{
1453 __pool_table_remove(pool); 1651 __pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1495 pool->block_shift = ffs(block_size) - 1; 1693 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1; 1694 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0; 1695 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1; 1696 pool_features_init(&pool->pf);
1499 pool->prison = prison_create(PRISON_CELLS); 1697 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) { 1698 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison"; 1699 *error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1523 } 1721 }
1524 1722
1525 INIT_WORK(&pool->worker, do_worker); 1723 INIT_WORK(&pool->worker, do_worker);
1724 INIT_DELAYED_WORK(&pool->waker, do_waker);
1526 spin_lock_init(&pool->lock); 1725 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios); 1726 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios); 1727 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings); 1728 INIT_LIST_HEAD(&pool->prepared_mappings);
1729 INIT_LIST_HEAD(&pool->prepared_discards);
1530 pool->low_water_triggered = 0; 1730 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0; 1731 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list); 1732 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds); 1733 ds_init(&pool->shared_read_ds);
1734 ds_init(&pool->all_io_ds);
1534 1735
1535 pool->next_mapping = NULL; 1736 pool->next_mapping = NULL;
1536 pool->mapping_pool = 1737 pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1549 goto bad_endio_hook_pool; 1750 goto bad_endio_hook_pool;
1550 } 1751 }
1551 pool->ref_count = 1; 1752 pool->ref_count = 1;
1753 pool->last_commit_jiffies = jiffies;
1552 pool->pool_md = pool_md; 1754 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev; 1755 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool); 1756 __pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
1588 1790
1589static struct pool *__pool_find(struct mapped_device *pool_md, 1791static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev, 1792 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error) 1793 unsigned long block_size, char **error,
1794 int *created)
1592{ 1795{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 1797
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
1604 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool); 1808 __pool_inc(pool);
1606 1809
1607 } else 1810 } else {
1608 pool = pool_create(pool_md, metadata_dev, block_size, error); 1811 pool = pool_create(pool_md, metadata_dev, block_size, error);
1812 *created = 1;
1813 }
1609 } 1814 }
1610 1815
1611 return pool; 1816 return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
1629 mutex_unlock(&dm_thin_pool_table.mutex); 1834 mutex_unlock(&dm_thin_pool_table.mutex);
1630} 1835}
1631 1836
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1837static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti) 1838 struct dm_target *ti)
1638{ 1839{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1641 const char *arg_name; 1842 const char *arg_name;
1642 1843
1643 static struct dm_arg _args[] = { 1844 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"}, 1845 {0, 3, "Invalid number of pool feature arguments"},
1645 }; 1846 };
1646 1847
1647 /* 1848 /*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0; 1863 pf->zero_new_blocks = 0;
1663 continue; 1864 continue;
1865 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1866 pf->discard_enabled = 0;
1867 continue;
1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1869 pf->discard_passdown = 0;
1870 continue;
1664 } 1871 }
1665 1872
1666 ti->error = "Unrecognised pool feature requested"; 1873 ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1678 * 1885 *
1679 * Optional feature arguments are: 1886 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1888 * ignore_discard: disable discard
1889 * no_discard_passdown: don't pass discards down to the data device
1681 */ 1890 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1891static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{ 1892{
1684 int r; 1893 int r, pool_created = 0;
1685 struct pool_c *pt; 1894 struct pool_c *pt;
1686 struct pool *pool; 1895 struct pool *pool;
1687 struct pool_features pf; 1896 struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1691 dm_block_t low_water_blocks; 1900 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev; 1901 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size; 1902 sector_t metadata_dev_size;
1903 char b[BDEVNAME_SIZE];
1694 1904
1695 /* 1905 /*
1696 * FIXME Remove validation from scope of lock. 1906 * FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1712 } 1922 }
1713 1923
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1716 ti->error = "Metadata device is too large"; 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1717 r = -EINVAL; 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1718 goto out_metadata;
1719 }
1720 1928
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) { 1930 if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1742 /* 1950 /*
1743 * Set default pool features. 1951 * Set default pool features.
1744 */ 1952 */
1745 memset(&pf, 0, sizeof(pf)); 1953 pool_features_init(&pf);
1746 pf.zero_new_blocks = 1;
1747 1954
1748 dm_consume_args(&as, 4); 1955 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti); 1956 r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1757 } 1964 }
1758 1965
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error); 1967 block_size, &ti->error, &pool_created);
1761 if (IS_ERR(pool)) { 1968 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool); 1969 r = PTR_ERR(pool);
1763 goto out_free_pt; 1970 goto out_free_pt;
1764 } 1971 }
1765 1972
1973 /*
1974 * 'pool_created' reflects whether this is the first table load.
1975 * Top level discard support is not allowed to be changed after
1976 * initial load. This would require a pool reload to trigger thin
1977 * device changes.
1978 */
1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1980 ti->error = "Discard support cannot be disabled once enabled";
1981 r = -EINVAL;
1982 goto out_flags_changed;
1983 }
1984
1985 /*
1986 * If discard_passdown was enabled verify that the data device
1987 * supports discards. Disable discard_passdown if not; otherwise
1988 * -EOPNOTSUPP will be returned.
1989 */
1990 if (pf.discard_passdown) {
1991 struct request_queue *q = bdev_get_queue(data_dev->bdev);
1992 if (!q || !blk_queue_discard(q)) {
1993 DMWARN("Discard unsupported by data device: Disabling discard passdown.");
1994 pf.discard_passdown = 0;
1995 }
1996 }
1997
1766 pt->pool = pool; 1998 pt->pool = pool;
1767 pt->ti = ti; 1999 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev; 2000 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev; 2001 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks; 2002 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks; 2003 pt->pf = pf;
1772 ti->num_flush_requests = 1; 2004 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0; 2005 /*
2006 * Only need to enable discards if the pool should pass
2007 * them down to the data device. The thin device's discard
2008 * processing will cause mappings to be removed from the btree.
2009 */
2010 if (pf.discard_enabled && pf.discard_passdown) {
2011 ti->num_discard_requests = 1;
2012 /*
2013 * Setting 'discards_supported' circumvents the normal
2014 * stacking of discard limits (this keeps the pool and
2015 * thin devices' discard limits consistent).
2016 */
2017 ti->discards_supported = 1;
2018 }
1774 ti->private = pt; 2019 ti->private = pt;
1775 2020
1776 pt->callbacks.congested_fn = pool_is_congested; 2021 pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1780 2025
1781 return 0; 2026 return 0;
1782 2027
2028out_flags_changed:
2029 __pool_dec(pool);
1783out_free_pt: 2030out_free_pt:
1784 kfree(pt); 2031 kfree(pt);
1785out: 2032out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
1878 __requeue_bios(pool); 2125 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags); 2126 spin_unlock_irqrestore(&pool->lock, flags);
1880 2127
1881 wake_worker(pool); 2128 do_waker(&pool->waker.work);
1882} 2129}
1883 2130
1884static void pool_postsuspend(struct dm_target *ti) 2131static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
1887 struct pool_c *pt = ti->private; 2134 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool; 2135 struct pool *pool = pt->pool;
1889 2136
2137 cancel_delayed_work(&pool->waker);
1890 flush_workqueue(pool->wq); 2138 flush_workqueue(pool->wq);
1891 2139
1892 r = dm_pool_commit_metadata(pool->pmd); 2140 r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2067static int pool_status(struct dm_target *ti, status_type_t type, 2315static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen) 2316 char *result, unsigned maxlen)
2069{ 2317{
2070 int r; 2318 int r, count;
2071 unsigned sz = 0; 2319 unsigned sz = 0;
2072 uint64_t transaction_id; 2320 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data; 2321 dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2130 (unsigned long)pool->sectors_per_block, 2378 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks); 2379 (unsigned long long)pt->low_water_blocks);
2132 2380
2133 DMEMIT("%u ", !pool->zero_new_blocks); 2381 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2382 !pool->pf.discard_passdown;
2383 DMEMIT("%u ", count);
2134 2384
2135 if (!pool->zero_new_blocks) 2385 if (!pool->pf.zero_new_blocks)
2136 DMEMIT("skip_block_zeroing "); 2386 DMEMIT("skip_block_zeroing ");
2387
2388 if (!pool->pf.discard_enabled)
2389 DMEMIT("ignore_discard ");
2390
2391 if (!pool->pf.discard_passdown)
2392 DMEMIT("no_discard_passdown ");
2393
2137 break; 2394 break;
2138 } 2395 }
2139 2396
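For reference, here is how the STATUSTYPE_TABLE branch above encodes the optional features: a count of non-default settings followed by their names. The snippet below is a userspace sketch of just that output format; the struct and sample values are invented:

#include <stdio.h>

/* Hypothetical mirror of struct pool_features. */
struct features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

static void emit_features(const struct features *pf)
{
	int count = !pf->zero_new_blocks + !pf->discard_enabled +
		    !pf->discard_passdown;

	printf("%d ", count);
	if (!pf->zero_new_blocks)
		printf("skip_block_zeroing ");
	if (!pf->discard_enabled)
		printf("ignore_discard ");
	if (!pf->discard_passdown)
		printf("no_discard_passdown ");
	printf("\n");
}

int main(void)
{
	struct features pf = { .zero_new_blocks = 0,
			       .discard_enabled = 1,
			       .discard_passdown = 0 };

	emit_features(&pf);	/* prints: 2 skip_block_zeroing no_discard_passdown */
	return 0;
}

The same three keywords are what parse_pool_features() accepts in the pool constructor, so status output and table arguments stay symmetrical.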
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2419 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163} 2420}
2164 2421
2422static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2423{
2424 /*
2425 * FIXME: these limits may be incompatible with the pool's data device
2426 */
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * This is just a hint, and not enforced. We have to cope with
2431 * bios that overlap 2 blocks.
2432 */
2433 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2434 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2435}
2436
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2437static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{ 2438{
2167 struct pool_c *pt = ti->private; 2439 struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2169 2441
2170 blk_limits_io_min(limits, 0); 2442 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2443 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2444 if (pool->pf.discard_enabled)
2445 set_discard_limits(pool, limits);
2172} 2446}
2173 2447
2174static struct target_type pool_target = { 2448static struct target_type pool_target = {
2175 .name = "thin-pool", 2449 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2450 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE, 2451 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0}, 2452 .version = {1, 1, 0},
2179 .module = THIS_MODULE, 2453 .module = THIS_MODULE,
2180 .ctr = pool_ctr, 2454 .ctr = pool_ctr,
2181 .dtr = pool_dtr, 2455 .dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
2202 __pool_dec(tc->pool); 2476 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td); 2477 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev); 2478 dm_put_device(ti, tc->pool_dev);
2479 if (tc->origin_dev)
2480 dm_put_device(ti, tc->origin_dev);
2205 kfree(tc); 2481 kfree(tc);
2206 2482
2207 mutex_unlock(&dm_thin_pool_table.mutex); 2483 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
2210/* 2486/*
2211 * Thin target parameters: 2487 * Thin target parameters:
2212 * 2488 *
2213 * <pool_dev> <dev_id> 2489 * <pool_dev> <dev_id> [origin_dev]
2214 * 2490 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2491 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier 2492 * dev_id: the internal device identifier
2493 * origin_dev: a device external to the pool that should act as the origin
2494 *
2495 * If the pool device has discards disabled, they get disabled for the thin
2496 * device as well.
2217 */ 2497 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2498static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{ 2499{
2220 int r; 2500 int r;
2221 struct thin_c *tc; 2501 struct thin_c *tc;
2222 struct dm_dev *pool_dev; 2502 struct dm_dev *pool_dev, *origin_dev;
2223 struct mapped_device *pool_md; 2503 struct mapped_device *pool_md;
2224 2504
2225 mutex_lock(&dm_thin_pool_table.mutex); 2505 mutex_lock(&dm_thin_pool_table.mutex);
2226 2506
2227 if (argc != 2) { 2507 if (argc != 2 && argc != 3) {
2228 ti->error = "Invalid argument count"; 2508 ti->error = "Invalid argument count";
2229 r = -EINVAL; 2509 r = -EINVAL;
2230 goto out_unlock; 2510 goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2237 goto out_unlock; 2517 goto out_unlock;
2238 } 2518 }
2239 2519
2520 if (argc == 3) {
2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2522 if (r) {
2523 ti->error = "Error opening origin device";
2524 goto bad_origin_dev;
2525 }
2526 tc->origin_dev = origin_dev;
2527 }
2528
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) { 2530 if (r) {
2242 ti->error = "Error opening pool device"; 2531 ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2273 2562
2274 ti->split_io = tc->pool->sectors_per_block; 2563 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1; 2564 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0; 2565
2277 ti->discards_supported = 0; 2566 /* In case the pool supports discards, pass them on. */
2567 if (tc->pool->pf.discard_enabled) {
2568 ti->discards_supported = 1;
2569 ti->num_discard_requests = 1;
2570 }
2278 2571
2279 dm_put(pool_md); 2572 dm_put(pool_md);
2280 2573
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
2289bad_common: 2582bad_common:
2290 dm_put_device(ti, tc->pool_dev); 2583 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev: 2584bad_pool_dev:
2585 if (tc->origin_dev)
2586 dm_put_device(ti, tc->origin_dev);
2587bad_origin_dev:
2292 kfree(tc); 2588 kfree(tc);
2293out_unlock: 2589out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex); 2590 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
2299static int thin_map(struct dm_target *ti, struct bio *bio, 2595static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context) 2596 union map_info *map_context)
2301{ 2597{
2302 bio->bi_sector -= ti->begin; 2598 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2303 2599
2304 return thin_bio_map(ti, bio, map_context); 2600 return thin_bio_map(ti, bio, map_context);
2305} 2601}
2306 2602
2603static int thin_endio(struct dm_target *ti,
2604 struct bio *bio, int err,
2605 union map_info *map_context)
2606{
2607 unsigned long flags;
2608 struct endio_hook *h = map_context->ptr;
2609 struct list_head work;
2610 struct new_mapping *m, *tmp;
2611 struct pool *pool = h->tc->pool;
2612
2613 if (h->shared_read_entry) {
2614 INIT_LIST_HEAD(&work);
2615 ds_dec(h->shared_read_entry, &work);
2616
2617 spin_lock_irqsave(&pool->lock, flags);
2618 list_for_each_entry_safe(m, tmp, &work, list) {
2619 list_del(&m->list);
2620 m->quiesced = 1;
2621 __maybe_add_mapping(m);
2622 }
2623 spin_unlock_irqrestore(&pool->lock, flags);
2624 }
2625
2626 if (h->all_io_entry) {
2627 INIT_LIST_HEAD(&work);
2628 ds_dec(h->all_io_entry, &work);
2629 list_for_each_entry_safe(m, tmp, &work, list)
2630 list_add(&m->list, &pool->prepared_discards);
2631 }
2632
2633 mempool_free(h, pool->endio_hook_pool);
2634
2635 return 0;
2636}
2637
2307static void thin_postsuspend(struct dm_target *ti) 2638static void thin_postsuspend(struct dm_target *ti)
2308{ 2639{
2309 if (dm_noflush_suspending(ti)) 2640 if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2347 DMEMIT("%s %lu", 2678 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2679 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id); 2680 (unsigned long) tc->dev_id);
2681 if (tc->origin_dev)
2682 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2350 break; 2683 break;
2351 } 2684 }
2352 } 2685 }
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2710static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{ 2711{
2379 struct thin_c *tc = ti->private; 2712 struct thin_c *tc = ti->private;
2713 struct pool *pool = tc->pool;
2380 2714
2381 blk_limits_io_min(limits, 0); 2715 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2717 set_discard_limits(pool, limits);
2383} 2718}
2384 2719
2385static struct target_type thin_target = { 2720static struct target_type thin_target = {
2386 .name = "thin", 2721 .name = "thin",
2387 .version = {1, 0, 0}, 2722 .version = {1, 1, 0},
2388 .module = THIS_MODULE, 2723 .module = THIS_MODULE,
2389 .ctr = thin_ctr, 2724 .ctr = thin_ctr,
2390 .dtr = thin_dtr, 2725 .dtr = thin_dtr,
2391 .map = thin_map, 2726 .map = thin_map,
2727 .end_io = thin_endio,
2392 .postsuspend = thin_postsuspend, 2728 .postsuspend = thin_postsuspend,
2393 .status = thin_status, 2729 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices, 2730 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000000..fa365d39b612
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * default prefetch value. Data are read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *io_mempool; /* mempool of struct dm_verity_io */
59 mempool_t *vec_mempool; /* mempool of bio vector */
60
61 struct workqueue_struct *verify_wq;
62
63 /* starting blocks for each tree level. 0 is the lowest level. */
64 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
65};
66
67struct dm_verity_io {
68 struct dm_verity *v;
69 struct bio *bio;
70
71 /* original values of bio->bi_end_io and bio->bi_private */
72 bio_end_io_t *orig_bi_end_io;
73 void *orig_bi_private;
74
75 sector_t block;
76 unsigned n_blocks;
77
78 /* saved bio vector */
79 struct bio_vec *io_vec;
80 unsigned io_vec_size;
81
82 struct work_struct work;
83
84 /* A space for short vectors; longer vectors are allocated separately. */
85 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
86
87 /*
88 * Three variably-size fields follow this struct:
89 *
90 * u8 hash_desc[v->shash_descsize];
91 * u8 real_digest[v->digest_size];
92 * u8 want_digest[v->digest_size];
93 *
94 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
95 */
96};
97
98static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
99{
100 return (struct shash_desc *)(io + 1);
101}
102
103static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
104{
105 return (u8 *)(io + 1) + v->shash_descsize;
106}
107
108static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
109{
110 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
111}
112
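The accessors above work because struct dm_verity_io is allocated with room for three variably-sized areas right behind it. A hedged userspace sketch of the same trailing-buffer layout, with invented sizes:

#include <stdlib.h>
#include <string.h>

struct io_example {
	int dummy;
	/* followed by: desc[descsize], real[digestsize], want[digestsize] */
};

static unsigned char *io_real(struct io_example *io, size_t descsize)
{
	return (unsigned char *)(io + 1) + descsize;
}

static unsigned char *io_want(struct io_example *io, size_t descsize,
			      size_t digestsize)
{
	return (unsigned char *)(io + 1) + descsize + digestsize;
}

int main(void)
{
	size_t descsize = 64, digestsize = 32;	/* made-up sizes */
	struct io_example *io = malloc(sizeof(*io) + descsize + 2 * digestsize);

	if (!io)
		return 1;
	memset(io_real(io, descsize), 0, digestsize);
	memset(io_want(io, descsize, digestsize), 0, digestsize);
	free(io);
	return 0;
}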
113/*
114 * Auxiliary structure appended to each dm-bufio buffer. If the value
115 * hash_verified is nonzero, hash of the block has been verified.
116 *
117 * The variable hash_verified is set to 0 when allocating the buffer, then
118 * it can be changed to 1 and it is never reset to 0 again.
119 *
 120 * There is no lock around this value; a race can at worst cause multiple
 121 * processes to verify the hash of the same buffer simultaneously and all
 122 * write 1 to hash_verified.
123 * This condition is harmless, so we don't need locking.
124 */
125struct buffer_aux {
126 int hash_verified;
127};
128
129/*
130 * Initialize struct buffer_aux for a freshly created buffer.
131 */
132static void dm_bufio_alloc_callback(struct dm_buffer *buf)
133{
134 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
135
136 aux->hash_verified = 0;
137}
138
139/*
140 * Translate input sector number to the sector number on the target device.
141 */
142static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
143{
144 return v->data_start + dm_target_offset(v->ti, bi_sector);
145}
146
147/*
148 * Return hash position of a specified block at a specified tree level
149 * (0 is the lowest level).
150 * The lowest "hash_per_block_bits"-bits of the result denote hash position
151 * inside a hash block. The remaining bits denote location of the hash block.
152 */
153static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
154 int level)
155{
156 return block >> (level * v->hash_per_block_bits);
157}
158
159static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
160 sector_t *hash_block, unsigned *offset)
161{
162 sector_t position = verity_position_at_level(v, block, level);
163 unsigned idx;
164
165 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
166
167 if (!offset)
168 return;
169
170 idx = position & ((1 << v->hash_per_block_bits) - 1);
171 if (!v->version)
172 *offset = idx * v->digest_size;
173 else
174 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
175}
176
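To make the addressing concrete, the standalone sketch below reproduces the position/offset math for a hypothetical tree with 4KiB hash blocks and 32-byte digests, i.e. 128 hashes per block and hash_per_block_bits = 7; the hash_level_block starting points are invented and the version-0 offset layout is used:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned hash_per_block_bits = 7;	/* 128 digests per 4KiB block */
	unsigned digest_size = 32;
	uint64_t hash_level_block[2] = { 10, 2 };	/* invented level starts */
	uint64_t block = 1000;				/* data block to verify */
	int level;

	for (level = 0; level < 2; level++) {
		uint64_t position = block >> (level * hash_per_block_bits);
		uint64_t hash_block = hash_level_block[level] +
				      (position >> hash_per_block_bits);
		unsigned idx = position & ((1u << hash_per_block_bits) - 1);
		unsigned offset = idx * digest_size;	/* version-0 layout */

		printf("level %d: hash block %llu, byte offset %u\n",
		       level, (unsigned long long)hash_block, offset);
	}
	return 0;
}

For data block 1000 this prints hash block 17 at offset 3328 for level 0 and hash block 2 at offset 224 for level 1.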
177/*
178 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument).
180 *
181 * On successful return, io_want_digest(v, io) contains the hash value for
 182 * a lower tree level or for the data block (if we're at the lowest level).
183 *
184 * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
185 * If "skip_unverified" is false, unverified buffer is hashed and verified
186 * against current value of io_want_digest(v, io).
187 */
188static int verity_verify_level(struct dm_verity_io *io, sector_t block,
189 int level, bool skip_unverified)
190{
191 struct dm_verity *v = io->v;
192 struct dm_buffer *buf;
193 struct buffer_aux *aux;
194 u8 *data;
195 int r;
196 sector_t hash_block;
197 unsigned offset;
198
199 verity_hash_at_level(v, block, level, &hash_block, &offset);
200
201 data = dm_bufio_read(v->bufio, hash_block, &buf);
202 if (unlikely(IS_ERR(data)))
203 return PTR_ERR(data);
204
205 aux = dm_bufio_get_aux_data(buf);
206
207 if (!aux->hash_verified) {
208 struct shash_desc *desc;
209 u8 *result;
210
211 if (skip_unverified) {
212 r = 1;
213 goto release_ret_r;
214 }
215
216 desc = io_hash_desc(v, io);
217 desc->tfm = v->tfm;
218 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
219 r = crypto_shash_init(desc);
220 if (r < 0) {
221 DMERR("crypto_shash_init failed: %d", r);
222 goto release_ret_r;
223 }
224
225 if (likely(v->version >= 1)) {
226 r = crypto_shash_update(desc, v->salt, v->salt_size);
227 if (r < 0) {
228 DMERR("crypto_shash_update failed: %d", r);
229 goto release_ret_r;
230 }
231 }
232
233 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
234 if (r < 0) {
235 DMERR("crypto_shash_update failed: %d", r);
236 goto release_ret_r;
237 }
238
239 if (!v->version) {
240 r = crypto_shash_update(desc, v->salt, v->salt_size);
241 if (r < 0) {
242 DMERR("crypto_shash_update failed: %d", r);
243 goto release_ret_r;
244 }
245 }
246
247 result = io_real_digest(v, io);
248 r = crypto_shash_final(desc, result);
249 if (r < 0) {
250 DMERR("crypto_shash_final failed: %d", r);
251 goto release_ret_r;
252 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted",
255 (unsigned long long)hash_block);
256 v->hash_failed = 1;
257 r = -EIO;
258 goto release_ret_r;
259 } else
260 aux->hash_verified = 1;
261 }
262
263 data += offset;
264
265 memcpy(io_want_digest(v, io), data, v->digest_size);
266
267 dm_bufio_release(buf);
268 return 0;
269
270release_ret_r:
271 dm_bufio_release(buf);
272
273 return r;
274}
275
276/*
277 * Verify one "dm_verity_io" structure.
278 */
279static int verity_verify_io(struct dm_verity_io *io)
280{
281 struct dm_verity *v = io->v;
282 unsigned b;
283 int i;
284 unsigned vector = 0, offset = 0;
285
286 for (b = 0; b < io->n_blocks; b++) {
287 struct shash_desc *desc;
288 u8 *result;
289 int r;
290 unsigned todo;
291
292 if (likely(v->levels)) {
293 /*
294 * First, we try to get the requested hash for
295 * the current block. If the hash block itself is
296 * verified, zero is returned. If it isn't, this
297 * function returns 0 and we fall back to whole
298 * chain verification.
299 */
300 int r = verity_verify_level(io, io->block + b, 0, true);
301 if (likely(!r))
302 goto test_block_hash;
303 if (r < 0)
304 return r;
305 }
306
307 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
308
309 for (i = v->levels - 1; i >= 0; i--) {
310 int r = verity_verify_level(io, io->block + b, i, false);
311 if (unlikely(r))
312 return r;
313 }
314
315test_block_hash:
316 desc = io_hash_desc(v, io);
317 desc->tfm = v->tfm;
318 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
319 r = crypto_shash_init(desc);
320 if (r < 0) {
321 DMERR("crypto_shash_init failed: %d", r);
322 return r;
323 }
324
325 if (likely(v->version >= 1)) {
326 r = crypto_shash_update(desc, v->salt, v->salt_size);
327 if (r < 0) {
328 DMERR("crypto_shash_update failed: %d", r);
329 return r;
330 }
331 }
332
333 todo = 1 << v->data_dev_block_bits;
334 do {
335 struct bio_vec *bv;
336 u8 *page;
337 unsigned len;
338
339 BUG_ON(vector >= io->io_vec_size);
340 bv = &io->io_vec[vector];
341 page = kmap_atomic(bv->bv_page);
342 len = bv->bv_len - offset;
343 if (likely(len >= todo))
344 len = todo;
345 r = crypto_shash_update(desc,
346 page + bv->bv_offset + offset, len);
347 kunmap_atomic(page);
348 if (r < 0) {
349 DMERR("crypto_shash_update failed: %d", r);
350 return r;
351 }
352 offset += len;
353 if (likely(offset == bv->bv_len)) {
354 offset = 0;
355 vector++;
356 }
357 todo -= len;
358 } while (todo);
359
360 if (!v->version) {
361 r = crypto_shash_update(desc, v->salt, v->salt_size);
362 if (r < 0) {
363 DMERR("crypto_shash_update failed: %d", r);
364 return r;
365 }
366 }
367
368 result = io_real_digest(v, io);
369 r = crypto_shash_final(desc, result);
370 if (r < 0) {
371 DMERR("crypto_shash_final failed: %d", r);
372 return r;
373 }
374 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
375 DMERR_LIMIT("data block %llu is corrupted",
376 (unsigned long long)(io->block + b));
377 v->hash_failed = 1;
378 return -EIO;
379 }
380 }
381 BUG_ON(vector != io->io_vec_size);
382 BUG_ON(offset);
383
384 return 0;
385}
386
387/*
388 * End one "io" structure with a given error.
389 */
390static void verity_finish_io(struct dm_verity_io *io, int error)
391{
392 struct bio *bio = io->bio;
393 struct dm_verity *v = io->v;
394
395 bio->bi_end_io = io->orig_bi_end_io;
396 bio->bi_private = io->orig_bi_private;
397
398 if (io->io_vec != io->io_vec_inline)
399 mempool_free(io->io_vec, v->vec_mempool);
400
401 mempool_free(io, v->io_mempool);
402
403 bio_endio(bio, error);
404}
405
406static void verity_work(struct work_struct *w)
407{
408 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
409
410 verity_finish_io(io, verity_verify_io(io));
411}
412
413static void verity_end_io(struct bio *bio, int error)
414{
415 struct dm_verity_io *io = bio->bi_private;
416
417 if (error) {
418 verity_finish_io(io, error);
419 return;
420 }
421
422 INIT_WORK(&io->work, verity_work);
423 queue_work(io->v->verify_wq, &io->work);
424}
425
426/*
427 * Prefetch buffers for the specified io.
428 * The root buffer is not prefetched, it is assumed that it will be cached
429 * all the time.
430 */
431static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
432{
433 int i;
434
435 for (i = v->levels - 2; i >= 0; i--) {
436 sector_t hash_block_start;
437 sector_t hash_block_end;
438 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
439 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
440 if (!i) {
441 unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
442
443 cluster >>= v->data_dev_block_bits;
444 if (unlikely(!cluster))
445 goto no_prefetch_cluster;
446
447 if (unlikely(cluster & (cluster - 1)))
448 cluster = 1 << (fls(cluster) - 1);
449
450 hash_block_start &= ~(sector_t)(cluster - 1);
451 hash_block_end |= cluster - 1;
452 if (unlikely(hash_block_end >= v->hash_blocks))
453 hash_block_end = v->hash_blocks - 1;
454 }
455no_prefetch_cluster:
456 dm_bufio_prefetch(v->bufio, hash_block_start,
457 hash_block_end - hash_block_start + 1);
458 }
459}
460
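The level-0 branch widens the prefetch to a power-of-two cluster before issuing it. A tiny sketch of that alignment with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned cluster = 48;			/* blocks; not a power of two */
	unsigned long long start = 1000, end = 1003;

	while (cluster & (cluster - 1))
		cluster &= cluster - 1;		/* round down: 48 -> 32 */

	start &= ~(unsigned long long)(cluster - 1);	/* 992 */
	end |= cluster - 1;				/* 1023 */

	printf("prefetch hash blocks %llu..%llu (cluster %u)\n",
	       start, end, cluster);
	return 0;
}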
461/*
462 * Bio map function. It allocates dm_verity_io structure and bio vector and
463 * fills them. Then it issues prefetches and the I/O.
464 */
465static int verity_map(struct dm_target *ti, struct bio *bio,
466 union map_info *map_context)
467{
468 struct dm_verity *v = ti->private;
469 struct dm_verity_io *io;
470
471 bio->bi_bdev = v->data_dev->bdev;
472 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
473
474 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
475 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
476 DMERR_LIMIT("unaligned io");
477 return -EIO;
478 }
479
480 if ((bio->bi_sector + bio_sectors(bio)) >>
481 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
482 DMERR_LIMIT("io out of range");
483 return -EIO;
484 }
485
486 if (bio_data_dir(bio) == WRITE)
487 return -EIO;
488
489 io = mempool_alloc(v->io_mempool, GFP_NOIO);
490 io->v = v;
491 io->bio = bio;
492 io->orig_bi_end_io = bio->bi_end_io;
493 io->orig_bi_private = bio->bi_private;
494 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
495 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
496
497 bio->bi_end_io = verity_end_io;
498 bio->bi_private = io;
499 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
500 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
501 io->io_vec = io->io_vec_inline;
502 else
503 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
504 memcpy(io->io_vec, bio_iovec(bio),
505 io->io_vec_size * sizeof(struct bio_vec));
506
507 verity_prefetch_io(v, io);
508
509 generic_make_request(bio);
510
511 return DM_MAPIO_SUBMITTED;
512}
513
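verity_map() converts the bio's sector range into whole data blocks after checking alignment. A minimal sketch of that conversion, assuming 4KiB data blocks (data_dev_block_bits = 12) and the usual 512-byte sectors:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned data_dev_block_bits = 12, sector_shift = 9;
	uint64_t bi_sector = 2048;	/* bio start, in sectors */
	unsigned bi_size = 32768;	/* bio length, in bytes */
	unsigned bio_sectors = bi_size >> sector_shift;

	/* Reject I/O that is not aligned to the data block size. */
	if ((bi_sector | bio_sectors) &
	    ((1u << (data_dev_block_bits - sector_shift)) - 1)) {
		fprintf(stderr, "unaligned io\n");
		return 1;
	}

	printf("first block %llu, %u blocks\n",
	       (unsigned long long)(bi_sector >> (data_dev_block_bits - sector_shift)),
	       bi_size >> data_dev_block_bits);	/* 256, 8 */
	return 0;
}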
514/*
515 * Status: V (valid) or C (corruption found)
516 */
517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen)
519{
520 struct dm_verity *v = ti->private;
521 unsigned sz = 0;
522 unsigned x;
523
524 switch (type) {
525 case STATUSTYPE_INFO:
526 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
527 break;
528 case STATUSTYPE_TABLE:
529 DMEMIT("%u %s %s %u %u %llu %llu %s ",
530 v->version,
531 v->data_dev->name,
532 v->hash_dev->name,
533 1 << v->data_dev_block_bits,
534 1 << v->hash_dev_block_bits,
535 (unsigned long long)v->data_blocks,
536 (unsigned long long)v->hash_start,
537 v->alg_name
538 );
539 for (x = 0; x < v->digest_size; x++)
540 DMEMIT("%02x", v->root_digest[x]);
541 DMEMIT(" ");
542 if (!v->salt_size)
543 DMEMIT("-");
544 else
545 for (x = 0; x < v->salt_size; x++)
546 DMEMIT("%02x", v->salt[x]);
547 break;
548 }
549
550 return 0;
551}
552
553static int verity_ioctl(struct dm_target *ti, unsigned cmd,
554 unsigned long arg)
555{
556 struct dm_verity *v = ti->private;
557 int r = 0;
558
559 if (v->data_start ||
560 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
561 r = scsi_verify_blk_ioctl(NULL, cmd);
562
563 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
564 cmd, arg);
565}
566
567static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
568 struct bio_vec *biovec, int max_size)
569{
570 struct dm_verity *v = ti->private;
571 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
572
573 if (!q->merge_bvec_fn)
574 return max_size;
575
576 bvm->bi_bdev = v->data_dev->bdev;
577 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
578
579 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
580}
581
582static int verity_iterate_devices(struct dm_target *ti,
583 iterate_devices_callout_fn fn, void *data)
584{
585 struct dm_verity *v = ti->private;
586
587 return fn(ti, v->data_dev, v->data_start, ti->len, data);
588}
589
590static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
591{
592 struct dm_verity *v = ti->private;
593
594 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
595 limits->logical_block_size = 1 << v->data_dev_block_bits;
596
597 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
598 limits->physical_block_size = 1 << v->data_dev_block_bits;
599
600 blk_limits_io_min(limits, limits->logical_block_size);
601}
602
603static void verity_dtr(struct dm_target *ti)
604{
605 struct dm_verity *v = ti->private;
606
607 if (v->verify_wq)
608 destroy_workqueue(v->verify_wq);
609
610 if (v->vec_mempool)
611 mempool_destroy(v->vec_mempool);
612
613 if (v->io_mempool)
614 mempool_destroy(v->io_mempool);
615
616 if (v->bufio)
617 dm_bufio_client_destroy(v->bufio);
618
619 kfree(v->salt);
620 kfree(v->root_digest);
621
622 if (v->tfm)
623 crypto_free_shash(v->tfm);
624
625 kfree(v->alg_name);
626
627 if (v->hash_dev)
628 dm_put_device(ti, v->hash_dev);
629
630 if (v->data_dev)
631 dm_put_device(ti, v->data_dev);
632
633 kfree(v);
634}
635
636/*
637 * Target parameters:
638 * <version> The current format is version 1.
639 * Vsn 0 is compatible with original Chromium OS releases.
640 * <data device>
641 * <hash device>
642 * <data block size>
643 * <hash block size>
644 * <the number of data blocks>
645 * <hash start block>
646 * <algorithm>
647 * <digest>
648 * <salt> Hex string or "-" if no salt.
649 */
650static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
651{
652 struct dm_verity *v;
653 unsigned num;
654 unsigned long long num_ll;
655 int r;
656 int i;
657 sector_t hash_position;
658 char dummy;
659
660 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
661 if (!v) {
662 ti->error = "Cannot allocate verity structure";
663 return -ENOMEM;
664 }
665 ti->private = v;
666 v->ti = ti;
667
668 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
669 ti->error = "Device must be readonly";
670 r = -EINVAL;
671 goto bad;
672 }
673
674 if (argc != 10) {
675 ti->error = "Invalid argument count: exactly 10 arguments required";
676 r = -EINVAL;
677 goto bad;
678 }
679
680 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
681 num < 0 || num > 1) {
682 ti->error = "Invalid version";
683 r = -EINVAL;
684 goto bad;
685 }
686 v->version = num;
687
688 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
689 if (r) {
690 ti->error = "Data device lookup failed";
691 goto bad;
692 }
693
694 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
695 if (r) {
 696 ti->error = "Hash device lookup failed";
697 goto bad;
698 }
699
700 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
701 !num || (num & (num - 1)) ||
702 num < bdev_logical_block_size(v->data_dev->bdev) ||
703 num > PAGE_SIZE) {
704 ti->error = "Invalid data device block size";
705 r = -EINVAL;
706 goto bad;
707 }
708 v->data_dev_block_bits = ffs(num) - 1;
709
710 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
711 !num || (num & (num - 1)) ||
712 num < bdev_logical_block_size(v->hash_dev->bdev) ||
713 num > INT_MAX) {
714 ti->error = "Invalid hash device block size";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->hash_dev_block_bits = ffs(num) - 1;
719
720 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
721 num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
722 (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
723 ti->error = "Invalid data blocks";
724 r = -EINVAL;
725 goto bad;
726 }
727 v->data_blocks = num_ll;
728
729 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
730 ti->error = "Data device is too small";
731 r = -EINVAL;
732 goto bad;
733 }
734
735 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
736 num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
737 (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
738 ti->error = "Invalid hash start";
739 r = -EINVAL;
740 goto bad;
741 }
742 v->hash_start = num_ll;
743
744 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
745 if (!v->alg_name) {
746 ti->error = "Cannot allocate algorithm name";
747 r = -ENOMEM;
748 goto bad;
749 }
750
751 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
752 if (IS_ERR(v->tfm)) {
753 ti->error = "Cannot initialize hash function";
754 r = PTR_ERR(v->tfm);
755 v->tfm = NULL;
756 goto bad;
757 }
758 v->digest_size = crypto_shash_digestsize(v->tfm);
759 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
760 ti->error = "Digest size too big";
761 r = -EINVAL;
762 goto bad;
763 }
764 v->shash_descsize =
765 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
766
767 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
768 if (!v->root_digest) {
769 ti->error = "Cannot allocate root digest";
770 r = -ENOMEM;
771 goto bad;
772 }
773 if (strlen(argv[8]) != v->digest_size * 2 ||
774 hex2bin(v->root_digest, argv[8], v->digest_size)) {
775 ti->error = "Invalid root digest";
776 r = -EINVAL;
777 goto bad;
778 }
779
780 if (strcmp(argv[9], "-")) {
781 v->salt_size = strlen(argv[9]) / 2;
782 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
783 if (!v->salt) {
784 ti->error = "Cannot allocate salt";
785 r = -ENOMEM;
786 goto bad;
787 }
788 if (strlen(argv[9]) != v->salt_size * 2 ||
789 hex2bin(v->salt, argv[9], v->salt_size)) {
790 ti->error = "Invalid salt";
791 r = -EINVAL;
792 goto bad;
793 }
794 }
795
796 v->hash_per_block_bits =
797 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
798
799 v->levels = 0;
800 if (v->data_blocks)
801 while (v->hash_per_block_bits * v->levels < 64 &&
802 (unsigned long long)(v->data_blocks - 1) >>
803 (v->hash_per_block_bits * v->levels))
804 v->levels++;
805
806 if (v->levels > DM_VERITY_MAX_LEVELS) {
807 ti->error = "Too many tree levels";
808 r = -E2BIG;
809 goto bad;
810 }
811
812 hash_position = v->hash_start;
813 for (i = v->levels - 1; i >= 0; i--) {
814 sector_t s;
815 v->hash_level_block[i] = hash_position;
816 s = verity_position_at_level(v, v->data_blocks, i);
817 s = (s >> v->hash_per_block_bits) +
818 !!(s & ((1 << v->hash_per_block_bits) - 1));
819 if (hash_position + s < hash_position) {
820 ti->error = "Hash device offset overflow";
821 r = -E2BIG;
822 goto bad;
823 }
824 hash_position += s;
825 }
826 v->hash_blocks = hash_position;
827
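A worked example of the sizing loop above, under assumed parameters (one million data blocks, 128 hashes per hash block, hash area starting at block 1); it sketches the same arithmetic rather than reusing the driver's structures:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data_blocks = 1000000;		/* invented */
	unsigned hash_per_block_bits = 7;	/* 128 digests per hash block */
	uint64_t hash_position = 1;		/* assumed hash_start */
	unsigned levels = 0;
	int i;

	while ((data_blocks - 1) >> (hash_per_block_bits * levels))
		levels++;			/* 3 levels for one million blocks */

	for (i = levels - 1; i >= 0; i--) {
		uint64_t s = data_blocks >> (i * hash_per_block_bits);

		/* Round up to whole hash blocks, as the constructor does. */
		s = (s >> hash_per_block_bits) +
		    ((s & ((1u << hash_per_block_bits) - 1)) != 0);
		printf("level %d starts at hash block %llu and needs %llu blocks\n",
		       i, (unsigned long long)hash_position, (unsigned long long)s);
		hash_position += s;
	}
	printf("hash_blocks = %llu\n", (unsigned long long)hash_position);
	return 0;
}

With these numbers the levels take 1, 62 and 7813 hash blocks, giving hash_blocks = 7877.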
828 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
829 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
830 dm_bufio_alloc_callback, NULL);
831 if (IS_ERR(v->bufio)) {
832 ti->error = "Cannot initialize dm-bufio";
833 r = PTR_ERR(v->bufio);
834 v->bufio = NULL;
835 goto bad;
836 }
837
838 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
839 ti->error = "Hash device is too small";
840 r = -E2BIG;
841 goto bad;
842 }
843
844 v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
845 sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
846 if (!v->io_mempool) {
847 ti->error = "Cannot allocate io mempool";
848 r = -ENOMEM;
849 goto bad;
850 }
851
852 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
853 BIO_MAX_PAGES * sizeof(struct bio_vec));
854 if (!v->vec_mempool) {
855 ti->error = "Cannot allocate vector mempool";
856 r = -ENOMEM;
857 goto bad;
858 }
859
860 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
861 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
862 if (!v->verify_wq) {
863 ti->error = "Cannot allocate workqueue";
864 r = -ENOMEM;
865 goto bad;
866 }
867
868 return 0;
869
870bad:
871 verity_dtr(ti);
872
873 return r;
874}
875
876static struct target_type verity_target = {
877 .name = "verity",
878 .version = {1, 0, 0},
879 .module = THIS_MODULE,
880 .ctr = verity_ctr,
881 .dtr = verity_dtr,
882 .map = verity_map,
883 .status = verity_status,
884 .ioctl = verity_ioctl,
885 .merge = verity_merge,
886 .iterate_devices = verity_iterate_devices,
887 .io_hints = verity_io_hints,
888};
889
890static int __init dm_verity_init(void)
891{
892 int r;
893
894 r = dm_register_target(&verity_target);
895 if (r < 0)
896 DMERR("register failed %d", r);
897
898 return r;
899}
900
901static void __exit dm_verity_exit(void)
902{
903 dm_unregister_target(&verity_target);
904}
905
906module_init(dm_verity_init);
907module_exit(dm_verity_exit);
908
909MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
910MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
911MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
912MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
913MODULE_LICENSE("GPL");
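For concreteness, here is a minimal user-space sketch of the hash-tree geometry that verity_ctr() computes above: it adds levels until one block of hashes covers the whole data range, then lays the levels out root-first starting at hash_start. The sketch assumes a 4096-byte hash block and a 32-byte digest (128 hashes per block, so hash_per_block_bits = 7) and that verity_position_at_level() shifts the block number by level * hash_per_block_bits, as the constructor loop implies; the numbers and the standalone framing are illustrative, not driver code.

/* Standalone sketch, not driver code: reproduce the level count and the
 * per-level starting blocks computed by verity_ctr(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t data_blocks = 1000000;   /* example data device size in blocks */
	unsigned hash_per_block_bits = 7; /* fls(4096 / 32) - 1 */
	uint64_t hash_position = 0;       /* the driver starts at v->hash_start */
	unsigned levels = 0;
	int i;

	/* Same loop as the constructor (which also guards data_blocks != 0):
	 * add levels until the top level covers block data_blocks - 1. */
	while (hash_per_block_bits * levels < 64 &&
	       ((data_blocks - 1) >> (hash_per_block_bits * levels)))
		levels++;
	printf("levels = %u\n", levels);

	/* Levels are laid out root-first; level i needs
	 * ceil(data_blocks / 128^(i+1)) hash blocks. */
	for (i = (int)levels - 1; i >= 0; i--) {
		uint64_t at_level = data_blocks >> (hash_per_block_bits * i);
		uint64_t blocks = (at_level >> hash_per_block_bits) +
			!!(at_level & ((1u << hash_per_block_bits) - 1));

		printf("level %d: starts at hash block %llu, %llu blocks\n",
		       i, (unsigned long long)hash_position,
		       (unsigned long long)blocks);
		hash_position += blocks;
	}
	printf("hash blocks total = %llu\n", (unsigned long long)hash_position);
	return 0;
}

For 1,000,000 data blocks this gives three levels of 1, 62 and 7813 hash blocks, 7876 in total, which is the value checked against the hash device size below.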
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f8..e24143cc2040 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
1016 /* 1016 /*
1017 * Store bio_set for cleanup. 1017 * Store bio_set for cleanup.
1018 */ 1018 */
1019 clone->bi_end_io = NULL;
1019 clone->bi_private = md->bs; 1020 clone->bi_private = md->bs;
1020 bio_put(clone); 1021 bio_put(clone);
1021 free_tio(md, tio); 1022 free_tio(md, tio);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb44..45135f69509c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
315 } 315 }
316 conf->nfaults = 0; 316 conf->nfaults = 0;
317 317
318 list_for_each_entry(rdev, &mddev->disks, same_set) 318 rdev_for_each(rdev, mddev)
319 conf->rdev = rdev; 319 conf->rdev = rdev;
320 320
321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb3..b0fcc7d02adb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
68 struct dev_info *dev0; 68 struct dev_info *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len;
72 struct request_queue *subq;
71 73
72 rcu_read_lock(); 74 rcu_read_lock();
73 dev0 = which_dev(mddev, sector); 75 dev0 = which_dev(mddev, sector);
74 maxsectors = dev0->end_sector - sector; 76 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev);
78 if (subq->merge_bvec_fn) {
79 bvm->bi_bdev = dev0->rdev->bdev;
80 bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec));
83 }
75 rcu_read_unlock(); 84 rcu_read_unlock();
76 85
77 if (maxsectors < bio_sectors) 86 if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
80 maxsectors -= bio_sectors; 89 maxsectors -= bio_sectors;
81 90
82 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) 91 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
83 return biovec->bv_len; 92 return maxbytes;
84 /* The bytes available at this offset could be really big, 93
85 * so we cap at 2^31 to avoid overflow */ 94 if (maxsectors > (maxbytes >> 9))
86 if (maxsectors > (1 << (31-9))) 95 return maxbytes;
87 return 1<<31; 96 else
88 return maxsectors << 9; 97 return maxsectors << 9;
89} 98}
90 99
91static int linear_congested(void *data, int bits) 100static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
138 cnt = 0; 147 cnt = 0;
139 conf->array_sectors = 0; 148 conf->array_sectors = 0;
140 149
141 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
142 int j = rdev->raid_disk; 151 int j = rdev->raid_disk;
143 struct dev_info *disk = conf->disks + j; 152 struct dev_info *disk = conf->disks + j;
144 sector_t sectors; 153 sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
158 167
159 disk_stack_limits(mddev->gendisk, rdev->bdev, 168 disk_stack_limits(mddev->gendisk, rdev->bdev,
160 rdev->data_offset << 9); 169 rdev->data_offset << 9);
161 /* as we don't honour merge_bvec_fn, we must never risk
162 * violating it, so limit max_segments to 1 lying within
163 * a single page.
164 */
165 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
166 blk_queue_max_segments(mddev->queue, 1);
167 blk_queue_segment_boundary(mddev->queue,
168 PAGE_CACHE_SIZE - 1);
169 }
170 170
171 conf->array_sectors += rdev->sectors; 171 conf->array_sectors += rdev->sectors;
172 cnt++; 172 cnt++;
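The net effect of the linear.c hunks above is that linear_mergeable_bvec() now translates the proposed bio into the member device's coordinates and defers to that device's own merge_bvec_fn, instead of pessimistically limiting the whole array to single-page segments. A standalone sketch of just the coordinate translation, with made-up numbers (end_sector and dev_sectors stand in for dev0->end_sector and dev0->rdev->sectors from the code above; the values are illustrative only):

/* Standalone sketch, not driver code: the translation performed before
 * calling the member queue's merge_bvec_fn.  Assume the member ends at
 * array sector 2000 and is 1000 sectors long, so it starts at 1000. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t end_sector  = 2000;   /* dev0->end_sector            */
	uint64_t dev_sectors = 1000;   /* dev0->rdev->sectors         */
	uint64_t bi_sector   = 1500;   /* array sector of the new bio */

	/* bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; */
	uint64_t member_sector = bi_sector - (end_sector - dev_sectors);

	printf("array sector %llu -> member sector %llu\n",
	       (unsigned long long)bi_sector,
	       (unsigned long long)member_sector);   /* 1500 -> 500 */
	return 0;
}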
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4a..b572e1e386ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
439 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 439 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
440 atomic_set(&mddev->flush_pending, 1); 440 atomic_set(&mddev->flush_pending, 1);
441 rcu_read_lock(); 441 rcu_read_lock();
442 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 442 rdev_for_each_rcu(rdev, mddev)
443 if (rdev->raid_disk >= 0 && 443 if (rdev->raid_disk >= 0 &&
444 !test_bit(Faulty, &rdev->flags)) { 444 !test_bit(Faulty, &rdev->flags)) {
445 /* Take two references, one is dropped 445 /* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
749{ 749{
750 struct md_rdev *rdev; 750 struct md_rdev *rdev;
751 751
752 list_for_each_entry(rdev, &mddev->disks, same_set) 752 rdev_for_each(rdev, mddev)
753 if (rdev->desc_nr == nr) 753 if (rdev->desc_nr == nr)
754 return rdev; 754 return rdev;
755 755
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
760{ 760{
761 struct md_rdev *rdev; 761 struct md_rdev *rdev;
762 762
763 list_for_each_entry(rdev, &mddev->disks, same_set) 763 rdev_for_each(rdev, mddev)
764 if (rdev->bdev->bd_dev == dev) 764 if (rdev->bdev->bd_dev == dev)
765 return rdev; 765 return rdev;
766 766
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1343 1343
1344 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1344 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1345 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1345 rdev_for_each(rdev2, mddev) {
1346 mdp_disk_t *d; 1346 mdp_disk_t *d;
1347 int desc_nr; 1347 int desc_nr;
1348 int is_active = test_bit(In_sync, &rdev2->flags); 1348 int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
1805 | BB_LEN(internal_bb)); 1805 | BB_LEN(internal_bb));
1806 *bbp++ = cpu_to_le64(store_bb); 1806 *bbp++ = cpu_to_le64(store_bb);
1807 } 1807 }
1808 bb->changed = 0;
1808 if (read_seqretry(&bb->lock, seq)) 1809 if (read_seqretry(&bb->lock, seq))
1809 goto retry; 1810 goto retry;
1810 1811
1811 bb->sector = (rdev->sb_start + 1812 bb->sector = (rdev->sb_start +
1812 (int)le32_to_cpu(sb->bblog_offset)); 1813 (int)le32_to_cpu(sb->bblog_offset));
1813 bb->size = le16_to_cpu(sb->bblog_size); 1814 bb->size = le16_to_cpu(sb->bblog_size);
1814 bb->changed = 0;
1815 } 1815 }
1816 } 1816 }
1817 1817
1818 max_dev = 0; 1818 max_dev = 0;
1819 list_for_each_entry(rdev2, &mddev->disks, same_set) 1819 rdev_for_each(rdev2, mddev)
1820 if (rdev2->desc_nr+1 > max_dev) 1820 if (rdev2->desc_nr+1 > max_dev)
1821 max_dev = rdev2->desc_nr+1; 1821 max_dev = rdev2->desc_nr+1;
1822 1822
@@ -1833,7 +1833,7 @@ retry:
1833 for (i=0; i<max_dev;i++) 1833 for (i=0; i<max_dev;i++)
1834 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1834 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1835 1835
1836 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1836 rdev_for_each(rdev2, mddev) {
1837 i = rdev2->desc_nr; 1837 i = rdev2->desc_nr;
1838 if (test_bit(Faulty, &rdev2->flags)) 1838 if (test_bit(Faulty, &rdev2->flags))
1839 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1839 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
1948 return 0; /* nothing to do */ 1948 return 0; /* nothing to do */
1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1950 return 0; /* shouldn't register, or already is */ 1950 return 0; /* shouldn't register, or already is */
1951 list_for_each_entry(rdev, &mddev->disks, same_set) { 1951 rdev_for_each(rdev, mddev) {
1952 /* skip spares and non-functional disks */ 1952 /* skip spares and non-functional disks */
1953 if (test_bit(Faulty, &rdev->flags)) 1953 if (test_bit(Faulty, &rdev->flags))
1954 continue; 1954 continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
2175{ 2175{
2176 struct md_rdev *rdev, *tmp; 2176 struct md_rdev *rdev, *tmp;
2177 2177
2178 rdev_for_each(rdev, tmp, mddev) { 2178 rdev_for_each_safe(rdev, tmp, mddev) {
2179 if (!rdev->mddev) { 2179 if (!rdev->mddev) {
2180 MD_BUG(); 2180 MD_BUG();
2181 continue; 2181 continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
2307 bitmap_print_sb(mddev->bitmap); 2307 bitmap_print_sb(mddev->bitmap);
2308 else 2308 else
2309 printk("%s: ", mdname(mddev)); 2309 printk("%s: ", mdname(mddev));
2310 list_for_each_entry(rdev, &mddev->disks, same_set) 2310 rdev_for_each(rdev, mddev)
2311 printk("<%s>", bdevname(rdev->bdev,b)); 2311 printk("<%s>", bdevname(rdev->bdev,b));
2312 printk("\n"); 2312 printk("\n");
2313 2313
2314 list_for_each_entry(rdev, &mddev->disks, same_set) 2314 rdev_for_each(rdev, mddev)
2315 print_rdev(rdev, mddev->major_version); 2315 print_rdev(rdev, mddev->major_version);
2316 } 2316 }
2317 printk("md: **********************************\n"); 2317 printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2328 * with the rest of the array) 2328 * with the rest of the array)
2329 */ 2329 */
2330 struct md_rdev *rdev; 2330 struct md_rdev *rdev;
2331 list_for_each_entry(rdev, &mddev->disks, same_set) { 2331 rdev_for_each(rdev, mddev) {
2332 if (rdev->sb_events == mddev->events || 2332 if (rdev->sb_events == mddev->events ||
2333 (nospares && 2333 (nospares &&
2334 rdev->raid_disk < 0 && 2334 rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
2351 2351
2352repeat: 2352repeat:
2353 /* First make sure individual recovery_offsets are correct */ 2353 /* First make sure individual recovery_offsets are correct */
2354 list_for_each_entry(rdev, &mddev->disks, same_set) { 2354 rdev_for_each(rdev, mddev) {
2355 if (rdev->raid_disk >= 0 && 2355 if (rdev->raid_disk >= 0 &&
2356 mddev->delta_disks >= 0 && 2356 mddev->delta_disks >= 0 &&
2357 !test_bit(In_sync, &rdev->flags) && 2357 !test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2365 if (!mddev->external) { 2365 if (!mddev->external) {
2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2367 list_for_each_entry(rdev, &mddev->disks, same_set) { 2367 rdev_for_each(rdev, mddev) {
2368 if (rdev->badblocks.changed) { 2368 if (rdev->badblocks.changed) {
2369 rdev->badblocks.changed = 0;
2369 md_ack_all_badblocks(&rdev->badblocks); 2370 md_ack_all_badblocks(&rdev->badblocks);
2370 md_error(mddev, rdev); 2371 md_error(mddev, rdev);
2371 } 2372 }
@@ -2430,7 +2431,7 @@ repeat:
2430 mddev->events --; 2431 mddev->events --;
2431 } 2432 }
2432 2433
2433 list_for_each_entry(rdev, &mddev->disks, same_set) { 2434 rdev_for_each(rdev, mddev) {
2434 if (rdev->badblocks.changed) 2435 if (rdev->badblocks.changed)
2435 any_badblocks_changed++; 2436 any_badblocks_changed++;
2436 if (test_bit(Faulty, &rdev->flags)) 2437 if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
2444 mdname(mddev), mddev->in_sync); 2445 mdname(mddev), mddev->in_sync);
2445 2446
2446 bitmap_update_sb(mddev->bitmap); 2447 bitmap_update_sb(mddev->bitmap);
2447 list_for_each_entry(rdev, &mddev->disks, same_set) { 2448 rdev_for_each(rdev, mddev) {
2448 char b[BDEVNAME_SIZE]; 2449 char b[BDEVNAME_SIZE];
2449 2450
2450 if (rdev->sb_loaded != 1) 2451 if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
2493 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2494 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2495 2496
2496 list_for_each_entry(rdev, &mddev->disks, same_set) { 2497 rdev_for_each(rdev, mddev) {
2497 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2498 clear_bit(Blocked, &rdev->flags); 2499 clear_bit(Blocked, &rdev->flags);
2499 2500
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2896 struct md_rdev *rdev2; 2897 struct md_rdev *rdev2;
2897 2898
2898 mddev_lock(mddev); 2899 mddev_lock(mddev);
2899 list_for_each_entry(rdev2, &mddev->disks, same_set) 2900 rdev_for_each(rdev2, mddev)
2900 if (rdev->bdev == rdev2->bdev && 2901 if (rdev->bdev == rdev2->bdev &&
2901 rdev != rdev2 && 2902 rdev != rdev2 &&
2902 overlaps(rdev->data_offset, rdev->sectors, 2903 overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
3193 char b[BDEVNAME_SIZE]; 3194 char b[BDEVNAME_SIZE];
3194 3195
3195 freshest = NULL; 3196 freshest = NULL;
3196 rdev_for_each(rdev, tmp, mddev) 3197 rdev_for_each_safe(rdev, tmp, mddev)
3197 switch (super_types[mddev->major_version]. 3198 switch (super_types[mddev->major_version].
3198 load_super(rdev, freshest, mddev->minor_version)) { 3199 load_super(rdev, freshest, mddev->minor_version)) {
3199 case 1: 3200 case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
3214 validate_super(mddev, freshest); 3215 validate_super(mddev, freshest);
3215 3216
3216 i = 0; 3217 i = 0;
3217 rdev_for_each(rdev, tmp, mddev) { 3218 rdev_for_each_safe(rdev, tmp, mddev) {
3218 if (mddev->max_disks && 3219 if (mddev->max_disks &&
3219 (rdev->desc_nr >= mddev->max_disks || 3220 (rdev->desc_nr >= mddev->max_disks ||
3220 i > mddev->max_disks)) { 3221 i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3403 return -EINVAL; 3404 return -EINVAL;
3404 } 3405 }
3405 3406
3406 list_for_each_entry(rdev, &mddev->disks, same_set) 3407 rdev_for_each(rdev, mddev)
3407 rdev->new_raid_disk = rdev->raid_disk; 3408 rdev->new_raid_disk = rdev->raid_disk;
3408 3409
3409 /* ->takeover must set new_* and/or delta_disks 3410 /* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3456 mddev->safemode = 0; 3457 mddev->safemode = 0;
3457 } 3458 }
3458 3459
3459 list_for_each_entry(rdev, &mddev->disks, same_set) { 3460 rdev_for_each(rdev, mddev) {
3460 if (rdev->raid_disk < 0) 3461 if (rdev->raid_disk < 0)
3461 continue; 3462 continue;
3462 if (rdev->new_raid_disk >= mddev->raid_disks) 3463 if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3465 continue; 3466 continue;
3466 sysfs_unlink_rdev(mddev, rdev); 3467 sysfs_unlink_rdev(mddev, rdev);
3467 } 3468 }
3468 list_for_each_entry(rdev, &mddev->disks, same_set) { 3469 rdev_for_each(rdev, mddev) {
3469 if (rdev->raid_disk < 0) 3470 if (rdev->raid_disk < 0)
3470 continue; 3471 continue;
3471 if (rdev->new_raid_disk == rdev->raid_disk) 3472 if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
4796 * the only valid external interface is through the md 4797 * the only valid external interface is through the md
4797 * device. 4798 * device.
4798 */ 4799 */
4799 list_for_each_entry(rdev, &mddev->disks, same_set) { 4800 rdev_for_each(rdev, mddev) {
4800 if (test_bit(Faulty, &rdev->flags)) 4801 if (test_bit(Faulty, &rdev->flags))
4801 continue; 4802 continue;
4802 sync_blockdev(rdev->bdev); 4803 sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
4867 struct md_rdev *rdev2; 4868 struct md_rdev *rdev2;
4868 int warned = 0; 4869 int warned = 0;
4869 4870
4870 list_for_each_entry(rdev, &mddev->disks, same_set) 4871 rdev_for_each(rdev, mddev)
4871 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4872 rdev_for_each(rdev2, mddev) {
4872 if (rdev < rdev2 && 4873 if (rdev < rdev2 &&
4873 rdev->bdev->bd_contains == 4874 rdev->bdev->bd_contains ==
4874 rdev2->bdev->bd_contains) { 4875 rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
4945 mddev->in_sync = 1; 4946 mddev->in_sync = 1;
4946 smp_wmb(); 4947 smp_wmb();
4947 mddev->ready = 1; 4948 mddev->ready = 1;
4948 list_for_each_entry(rdev, &mddev->disks, same_set) 4949 rdev_for_each(rdev, mddev)
4949 if (rdev->raid_disk >= 0) 4950 if (rdev->raid_disk >= 0)
4950 if (sysfs_link_rdev(mddev, rdev)) 4951 if (sysfs_link_rdev(mddev, rdev))
4951 /* failure here is OK */; 4952 /* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
5073 mddev->changed = 0; 5074 mddev->changed = 0;
5074 mddev->degraded = 0; 5075 mddev->degraded = 0;
5075 mddev->safemode = 0; 5076 mddev->safemode = 0;
5077 mddev->merge_check_needed = 0;
5076 mddev->bitmap_info.offset = 0; 5078 mddev->bitmap_info.offset = 0;
5077 mddev->bitmap_info.default_offset = 0; 5079 mddev->bitmap_info.default_offset = 0;
5078 mddev->bitmap_info.chunksize = 0; 5080 mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
5175 /* tell userspace to handle 'inactive' */ 5177 /* tell userspace to handle 'inactive' */
5176 sysfs_notify_dirent_safe(mddev->sysfs_state); 5178 sysfs_notify_dirent_safe(mddev->sysfs_state);
5177 5179
5178 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5179 if (rdev->raid_disk >= 0) 5181 if (rdev->raid_disk >= 0)
5180 sysfs_unlink_rdev(mddev, rdev); 5182 sysfs_unlink_rdev(mddev, rdev);
5181 5183
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
5226 5228
5227 printk(KERN_INFO "md: running: "); 5229 printk(KERN_INFO "md: running: ");
5228 5230
5229 list_for_each_entry(rdev, &mddev->disks, same_set) { 5231 rdev_for_each(rdev, mddev) {
5230 char b[BDEVNAME_SIZE]; 5232 char b[BDEVNAME_SIZE];
5231 printk("<%s>", bdevname(rdev->bdev,b)); 5233 printk("<%s>", bdevname(rdev->bdev,b));
5232 } 5234 }
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5356 struct md_rdev *rdev; 5358 struct md_rdev *rdev;
5357 5359
5358 nr=working=insync=failed=spare=0; 5360 nr=working=insync=failed=spare=0;
5359 list_for_each_entry(rdev, &mddev->disks, same_set) { 5361 rdev_for_each(rdev, mddev) {
5360 nr++; 5362 nr++;
5361 if (test_bit(Faulty, &rdev->flags)) 5363 if (test_bit(Faulty, &rdev->flags))
5362 failed++; 5364 failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
5923 * grow, and re-add. 5925 * grow, and re-add.
5924 */ 5926 */
5925 return -EBUSY; 5927 return -EBUSY;
5926 list_for_each_entry(rdev, &mddev->disks, same_set) { 5928 rdev_for_each(rdev, mddev) {
5927 sector_t avail = rdev->sectors; 5929 sector_t avail = rdev->sectors;
5928 5930
5929 if (fit && (num_sectors == 0 || num_sectors > avail)) 5931 if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
6724 struct mddev *mddev = v; 6726 struct mddev *mddev = v;
6725 sector_t sectors; 6727 sector_t sectors;
6726 struct md_rdev *rdev; 6728 struct md_rdev *rdev;
6727 struct bitmap *bitmap;
6728 6729
6729 if (v == (void*)1) { 6730 if (v == (void*)1) {
6730 struct md_personality *pers; 6731 struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6758 } 6759 }
6759 6760
6760 sectors = 0; 6761 sectors = 0;
6761 list_for_each_entry(rdev, &mddev->disks, same_set) { 6762 rdev_for_each(rdev, mddev) {
6762 char b[BDEVNAME_SIZE]; 6763 char b[BDEVNAME_SIZE];
6763 seq_printf(seq, " %s[%d]", 6764 seq_printf(seq, " %s[%d]",
6764 bdevname(rdev->bdev,b), rdev->desc_nr); 6765 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6812 } else 6813 } else
6813 seq_printf(seq, "\n "); 6814 seq_printf(seq, "\n ");
6814 6815
6815 if ((bitmap = mddev->bitmap)) { 6816 bitmap_status(seq, mddev->bitmap);
6816 unsigned long chunk_kb;
6817 unsigned long flags;
6818 spin_lock_irqsave(&bitmap->lock, flags);
6819 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6820 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6821 "%lu%s chunk",
6822 bitmap->pages - bitmap->missing_pages,
6823 bitmap->pages,
6824 (bitmap->pages - bitmap->missing_pages)
6825 << (PAGE_SHIFT - 10),
6826 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6827 chunk_kb ? "KB" : "B");
6828 if (bitmap->file) {
6829 seq_printf(seq, ", file: ");
6830 seq_path(seq, &bitmap->file->f_path, " \t\n");
6831 }
6832
6833 seq_printf(seq, "\n");
6834 spin_unlock_irqrestore(&bitmap->lock, flags);
6835 }
6836 6817
6837 seq_printf(seq, "\n"); 6818 seq_printf(seq, "\n");
6838 } 6819 }
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
7170 max_sectors = mddev->dev_sectors; 7151 max_sectors = mddev->dev_sectors;
7171 j = MaxSector; 7152 j = MaxSector;
7172 rcu_read_lock(); 7153 rcu_read_lock();
7173 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7154 rdev_for_each_rcu(rdev, mddev)
7174 if (rdev->raid_disk >= 0 && 7155 if (rdev->raid_disk >= 0 &&
7175 !test_bit(Faulty, &rdev->flags) && 7156 !test_bit(Faulty, &rdev->flags) &&
7176 !test_bit(In_sync, &rdev->flags) && 7157 !test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
7342 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7323 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7343 mddev->curr_resync = MaxSector; 7324 mddev->curr_resync = MaxSector;
7344 rcu_read_lock(); 7325 rcu_read_lock();
7345 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7326 rdev_for_each_rcu(rdev, mddev)
7346 if (rdev->raid_disk >= 0 && 7327 if (rdev->raid_disk >= 0 &&
7347 mddev->delta_disks >= 0 && 7328 mddev->delta_disks >= 0 &&
7348 !test_bit(Faulty, &rdev->flags) && 7329 !test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7388 7369
7389 mddev->curr_resync_completed = 0; 7370 mddev->curr_resync_completed = 0;
7390 7371
7391 list_for_each_entry(rdev, &mddev->disks, same_set) 7372 rdev_for_each(rdev, mddev)
7392 if (rdev->raid_disk >= 0 && 7373 if (rdev->raid_disk >= 0 &&
7393 !test_bit(Blocked, &rdev->flags) && 7374 !test_bit(Blocked, &rdev->flags) &&
7394 (test_bit(Faulty, &rdev->flags) || 7375 (test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7406 "degraded"); 7387 "degraded");
7407 7388
7408 7389
7409 list_for_each_entry(rdev, &mddev->disks, same_set) { 7390 rdev_for_each(rdev, mddev) {
7410 if (rdev->raid_disk >= 0 && 7391 if (rdev->raid_disk >= 0 &&
7411 !test_bit(In_sync, &rdev->flags) && 7392 !test_bit(In_sync, &rdev->flags) &&
7412 !test_bit(Faulty, &rdev->flags)) 7393 !test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
7451 * do the superblock for an incrementally recovered device 7432 * do the superblock for an incrementally recovered device
7452 * written out. 7433 * written out.
7453 */ 7434 */
7454 list_for_each_entry(rdev, &mddev->disks, same_set) 7435 rdev_for_each(rdev, mddev)
7455 if (!mddev->degraded || 7436 if (!mddev->degraded ||
7456 test_bit(In_sync, &rdev->flags)) 7437 test_bit(In_sync, &rdev->flags))
7457 rdev->saved_raid_disk = -1; 7438 rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
7529 * failed devices. 7510 * failed devices.
7530 */ 7511 */
7531 struct md_rdev *rdev; 7512 struct md_rdev *rdev;
7532 list_for_each_entry(rdev, &mddev->disks, same_set) 7513 rdev_for_each(rdev, mddev)
7533 if (rdev->raid_disk >= 0 && 7514 if (rdev->raid_disk >= 0 &&
7534 !test_bit(Blocked, &rdev->flags) && 7515 !test_bit(Blocked, &rdev->flags) &&
7535 test_bit(Faulty, &rdev->flags) && 7516 test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
8040 return; 8021 return;
8041 write_seqlock_irq(&bb->lock); 8022 write_seqlock_irq(&bb->lock);
8042 8023
8043 if (bb->changed == 0) { 8024 if (bb->changed == 0 && bb->unacked_exist) {
8044 u64 *p = bb->page; 8025 u64 *p = bb->page;
8045 int i; 8026 int i;
8046 for (i = 0; i < bb->count ; i++) { 8027 for (i = 0; i < bb->count ; i++) {
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
8157 struct mddev *mddev; 8138 struct mddev *mddev;
8158 int need_delay = 0; 8139 int need_delay = 0;
8159 8140
8160 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 8141 for_each_mddev(mddev, tmp) {
8161 8142 if (mddev_trylock(mddev)) {
8162 printk(KERN_INFO "md: stopping all md devices.\n"); 8143 __md_stop_writes(mddev);
8163 8144 mddev->safemode = 2;
8164 for_each_mddev(mddev, tmp) { 8145 mddev_unlock(mddev);
8165 if (mddev_trylock(mddev)) {
8166 /* Force a switch to readonly even array
8167 * appears to still be in use. Hence
8168 * the '100'.
8169 */
8170 md_set_readonly(mddev, 100);
8171 mddev_unlock(mddev);
8172 }
8173 need_delay = 1;
8174 } 8146 }
8175 /* 8147 need_delay = 1;
8176 * certain more exotic SCSI devices are known to be
8177 * volatile wrt too early system reboots. While the
8178 * right place to handle this issue is the given
8179 * driver, we do want to have a safe RAID driver ...
8180 */
8181 if (need_delay)
8182 mdelay(1000*1);
8183 } 8148 }
8149 /*
8150 * certain more exotic SCSI devices are known to be
8151 * volatile wrt too early system reboots. While the
8152 * right place to handle this issue is the given
8153 * driver, we do want to have a safe RAID driver ...
8154 */
8155 if (need_delay)
8156 mdelay(1000*1);
8157
8184 return NOTIFY_DONE; 8158 return NOTIFY_DONE;
8185} 8159}
8186 8160
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2b..1c2063ccf48e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
128enum flag_bits { 128enum flag_bits {
129 Faulty, /* device is known to have a fault */ 129 Faulty, /* device is known to have a fault */
130 In_sync, /* device is in_sync with rest of array */ 130 In_sync, /* device is in_sync with rest of array */
131 Unmerged, /* device is being added to array and should
132 * be considerred for bvec_merge_fn but not
133 * yet for actual IO
134 */
131 WriteMostly, /* Avoid reading if at all possible */ 135 WriteMostly, /* Avoid reading if at all possible */
132 AutoDetected, /* added by auto-detect */ 136 AutoDetected, /* added by auto-detect */
133 Blocked, /* An error occurred but has not yet 137 Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
345 int degraded; /* whether md should consider 349 int degraded; /* whether md should consider
346 * adding a spare 350 * adding a spare
347 */ 351 */
352 int merge_check_needed; /* at least one
353 * member device
354 * has a
355 * merge_bvec_fn */
348 356
349 atomic_t recovery_active; /* blocks scheduled, but not written */ 357 atomic_t recovery_active; /* blocks scheduled, but not written */
350 wait_queue_head_t recovery_wait; 358 wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
519/* 527/*
520 * iterates through the 'same array disks' ringlist 528 * iterates through the 'same array disks' ringlist
521 */ 529 */
522#define rdev_for_each(rdev, tmp, mddev) \ 530#define rdev_for_each(rdev, mddev) \
531 list_for_each_entry(rdev, &((mddev)->disks), same_set)
532
533#define rdev_for_each_safe(rdev, tmp, mddev) \
523 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 534 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
524 535
525#define rdev_for_each_rcu(rdev, mddev) \ 536#define rdev_for_each_rcu(rdev, mddev) \
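Most of the churn in md.c above is the mechanical conversion enabled by this macro change: the old three-argument rdev_for_each() was really a safe iteration and becomes rdev_for_each_safe(), while plain walks drop the scratch variable. A kernel-context fragment, not a standalone program (do_something(), maybe_remove() and my_mddev are placeholders), showing what a converted caller looks like:

	struct md_rdev *rdev, *tmp;

	/* old style: open-coded walk, or rdev_for_each(rdev, tmp, mddev) */
	list_for_each_entry(rdev, &my_mddev->disks, same_set)
		do_something(rdev);

	/* new style: plain iteration needs no scratch variable ... */
	rdev_for_each(rdev, my_mddev)
		do_something(rdev);

	/* ... and callers that may unlink rdevs while walking say so explicitly */
	rdev_for_each_safe(rdev, tmp, my_mddev)
		maybe_remove(rdev);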
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660e..9339e67fcc79 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
428 } 428 }
429 429
430 working_disks = 0; 430 working_disks = 0;
431 list_for_each_entry(rdev, &mddev->disks, same_set) { 431 rdev_for_each(rdev, mddev) {
432 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
433 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
434 disk_idx >= mddev->raid_disks) 434 disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
108 return &n->keys[le32_to_cpu(n->header.max_entries)]; 108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109} 109}
110 110
111/* 111static inline void *value_ptr(struct node *n, uint32_t index)
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{ 112{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size)); 113 uint32_t value_size = le32_to_cpu(n->header.value_size);
117 return value_base(n) + (value_size * index); 114 return value_base(n) + (value_size * index);
118} 115}
119 116
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
61 if (shift < 0) { 61 if (shift < 0) {
62 shift = -shift; 62 shift = -shift;
63 BUG_ON(shift > nr_entries); 63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); 64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0), 65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift), 66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64)); 67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size), 68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift, value_size), 69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size); 70 (nr_entries - shift) * value_size);
71 } else { 71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift), 73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0), 74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64)); 75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size), 76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0, value_size), 77 value_ptr(n, 0),
78 nr_entries * value_size); 78 nr_entries * value_size);
79 } 79 }
80} 80}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
91 memcpy(key_ptr(left, nr_left), 91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0), 92 key_ptr(right, 0),
93 shift * sizeof(__le64)); 93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size), 94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0, value_size), 95 value_ptr(right, 0),
96 shift * value_size); 96 shift * value_size);
97 } else { 97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0), 99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift), 100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64)); 101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size), 102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift, value_size), 103 value_ptr(left, nr_left - shift),
104 shift * value_size); 104 shift * value_size);
105 } 105 }
106} 106}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
120 key_ptr(n, index + 1), 120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64)); 121 nr_to_copy * sizeof(__le64));
122 122
123 memmove(value_ptr(n, index, value_size), 123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1, value_size), 124 value_ptr(n, index + 1),
125 nr_to_copy * value_size); 125 nr_to_copy * value_size);
126 } 126 }
127 127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129} 129}
130 130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n) 131static unsigned merge_threshold(struct node *n)
137{ 132{
138 /* 133 return le32_to_cpu(n->header.max_entries) / 3;
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143} 134}
144 135
145struct child { 136struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
175 if (inc) 166 if (inc)
176 inc_children(info->tm, result->n, &le64_type); 167 inc_children(info->tm, result->n, &le64_type);
177 168
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = 169 *((__le64 *) value_ptr(parent, index)) =
179 cpu_to_le64(dm_block_location(result->block)); 170 cpu_to_le64(dm_block_location(result->block));
180 171
181 return 0; 172 return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
188 179
189static void shift(struct node *left, struct node *right, int count) 180static void shift(struct node *left, struct node *right, int count)
190{ 181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count) 191 if (!count)
192 return; 192 return;
193 193
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
199 node_shift(right, count); 199 node_shift(right, count);
200 } 200 }
201 201
202 left->header.nr_entries = 202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); 203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209} 204}
210 205
211static void __rebalance2(struct dm_btree_info *info, struct node *parent, 206static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
215 struct node *right = r->n; 210 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
218 214
219 if (nr_left + nr_right <= merge_threshold(left)) { 215 if (nr_left + nr_right < threshold) {
220 /* 216 /*
221 * Merge 217 * Merge
222 */ 218 */
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
234 * Rebalance. 230 * Rebalance.
235 */ 231 */
236 unsigned target_left = (nr_left + nr_right) / 2; 232 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left); 233 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0]; 234 *key_ptr(parent, r->index) = right->keys[0];
242 } 235 }
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
272 return exit_child(info, &right); 265 return exit_child(info, &right);
273} 266}
274 267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct node *left, struct node *center, struct node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct node *left, struct node *center, struct node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
275static void __rebalance3(struct dm_btree_info *info, struct node *parent, 346static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r) 347 struct child *l, struct child *c, struct child *r)
277{ 348{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286 356
287 unsigned target; 357 unsigned threshold = merge_threshold(left) * 4 + 1;
288 358
289 BUG_ON(left->header.max_entries != center->header.max_entries); 359 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries); 360 BUG_ON(center->header.max_entries != right->header.max_entries);
291 361
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { 362 if ((nr_left + nr_center + nr_right) < threshold)
293 /* 363 delete_center_node(info, parent, l, c, r, left, center, right,
294 * Delete center node: 364 nr_left, nr_center, nr_right);
295 * 365 else
296 * We dump as many entries from center as possible into 366 redistribute3(info, parent, l, c, r, left, center, right,
297 * left, then the rest in right, then rebalance2. This 367 nr_left, nr_center, nr_right);
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341} 368}
342 369
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
441 if (r) 468 if (r)
442 return r; 469 return r;
443 470
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0; 471 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449 473
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
496 */ 520 */
497 if (shadow_has_parent(s)) { 521 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), 523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
500 &location, sizeof(__le64)); 524 &location, sizeof(__le64));
501 } 525 }
502 526
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
553 577
554 if (info->value_type.dec) 578 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context, 579 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size)); 580 value_ptr(n, index));
557 581
558 delete_at(n, index); 582 delete_at(n, index);
559 } 583 }
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
74 dm_tm_inc(tm, value64(n, i)); 74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc) 75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++) 76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, 77 vt->inc(vt->context, value_ptr(n, i));
78 value_ptr(n, i, vt->size));
79} 78}
80 79
81static int insert_at(size_t value_size, struct node *node, unsigned index, 80static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
281 280
282 for (i = 0; i < f->nr_children; i++) 281 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context, 282 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size)); 283 value_ptr(f->n, i));
285 } 284 }
286 f->current_child = f->nr_children; 285 f->current_child = f->nr_children;
287 } 286 }
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
320 } while (!(flags & LEAF_NODE)); 319 } while (!(flags & LEAF_NODE));
321 320
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 321 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); 322 memcpy(v, value_ptr(ro_node(s), i), value_size);
324 323
325 return 0; 324 return 0;
326} 325}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
432 431
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? 432 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size; 433 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), 434 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
436 size * nr_right); 435 size * nr_right);
437 436
438 /* 437 /*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
443 pn = dm_block_data(parent); 442 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left)); 443 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location); 444 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), 445 memcpy_disk(value_ptr(pn, parent_index),
447 &location, sizeof(__le64)); 446 &location, sizeof(__le64));
448 447
449 location = cpu_to_le64(dm_block_location(right)); 448 location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
529 528
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 529 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size; 530 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); 531 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), 532 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
534 nr_right * size); 533 nr_right * size);
535 534
536 /* new_parent should just point to l and r now */ 535 /* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
545 val = cpu_to_le64(dm_block_location(left)); 544 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val); 545 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0]; 546 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); 547 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
549 548
550 val = cpu_to_le64(dm_block_location(right)); 549 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val); 550 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0]; 551 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); 552 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
554 553
555 /* 554 /*
556 * rejig the spine. This is ugly, since it knows too 555 * rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 594 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596 595
597 __dm_bless_for_disk(&location); 596 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), 597 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
599 &location, sizeof(__le64)); 598 &location, sizeof(__le64));
600 } 599 }
601 600
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
710 (!info->value_type.equal || 709 (!info->value_type.equal ||
711 !info->value_type.equal( 710 !info->value_type.equal(
712 info->value_type.context, 711 info->value_type.context,
713 value_ptr(n, index, info->value_type.size), 712 value_ptr(n, index),
714 value))) { 713 value))) {
715 info->value_type.dec(info->value_type.context, 714 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size)); 715 value_ptr(n, index));
717 } 716 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size), 717 memcpy_disk(value_ptr(n, index),
719 value, info->value_type.size); 718 value, info->value_type.size);
720 } 719 }
721 720
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
405 if (r < 0) 405 if (r < 0)
406 return r; 406 return r;
407 407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) { 408 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info, 409 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root, 410 ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
414 if (r) 412 if (r)
415 return r; 413 return r;
416 } 414 }
417#endif
418 415
419 } else { 416 } else {
420 __le32 le_rc = cpu_to_le32(ref_count); 417 __le32 le_rc = cpu_to_le32(ref_count);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e34..6f31f5596e01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
91 91
92 if (!conf) 92 if (!conf)
93 return -ENOMEM; 93 return -ENOMEM;
94 list_for_each_entry(rdev1, &mddev->disks, same_set) { 94 rdev_for_each(rdev1, mddev) {
95 pr_debug("md/raid0:%s: looking at %s\n", 95 pr_debug("md/raid0:%s: looking at %s\n",
96 mdname(mddev), 96 mdname(mddev),
97 bdevname(rdev1->bdev, b)); 97 bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
102 sector_div(sectors, mddev->chunk_sectors); 102 sector_div(sectors, mddev->chunk_sectors);
103 rdev1->sectors = sectors * mddev->chunk_sectors; 103 rdev1->sectors = sectors * mddev->chunk_sectors;
104 104
105 list_for_each_entry(rdev2, &mddev->disks, same_set) { 105 rdev_for_each(rdev2, mddev) {
106 pr_debug("md/raid0:%s: comparing %s(%llu)" 106 pr_debug("md/raid0:%s: comparing %s(%llu)"
107 " with %s(%llu)\n", 107 " with %s(%llu)\n",
108 mdname(mddev), 108 mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
157 smallest = NULL; 157 smallest = NULL;
158 dev = conf->devlist; 158 dev = conf->devlist;
159 err = -EINVAL; 159 err = -EINVAL;
160 list_for_each_entry(rdev1, &mddev->disks, same_set) { 160 rdev_for_each(rdev1, mddev) {
161 int j = rdev1->raid_disk; 161 int j = rdev1->raid_disk;
162 162
163 if (mddev->level == 10) { 163 if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
188 188
189 disk_stack_limits(mddev->gendisk, rdev1->bdev, 189 disk_stack_limits(mddev->gendisk, rdev1->bdev,
190 rdev1->data_offset << 9); 190 rdev1->data_offset << 9);
191 /* as we don't honour merge_bvec_fn, we must never risk
192 * violating it, so limit ->max_segments to 1, lying within
193 * a single page.
194 */
195 191
196 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { 192 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
197 blk_queue_max_segments(mddev->queue, 1); 193 conf->has_merge_bvec = 1;
198 blk_queue_segment_boundary(mddev->queue, 194
199 PAGE_CACHE_SIZE - 1);
200 }
201 if (!smallest || (rdev1->sectors < smallest->sectors)) 195 if (!smallest || (rdev1->sectors < smallest->sectors))
202 smallest = rdev1; 196 smallest = rdev1;
203 cnt++; 197 cnt++;
@@ -290,8 +284,64 @@ abort:
290 return err; 284 return err;
291} 285}
292 286
287/* Find the zone which holds a particular offset
288 * Update *sectorp to be an offset in that zone
289 */
290static struct strip_zone *find_zone(struct r0conf *conf,
291 sector_t *sectorp)
292{
293 int i;
294 struct strip_zone *z = conf->strip_zone;
295 sector_t sector = *sectorp;
296
297 for (i = 0; i < conf->nr_strip_zones; i++)
298 if (sector < z[i].zone_end) {
299 if (i)
300 *sectorp = sector - z[i-1].zone_end;
301 return z + i;
302 }
303 BUG();
304}
305
306/*
307 * remaps the bio to the target device. we separate two flows.
308 * power 2 flow and a general flow for the sake of performance
309*/
310static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
311 sector_t sector, sector_t *sector_offset)
312{
313 unsigned int sect_in_chunk;
314 sector_t chunk;
315 struct r0conf *conf = mddev->private;
316 int raid_disks = conf->strip_zone[0].nb_dev;
317 unsigned int chunk_sects = mddev->chunk_sectors;
318
319 if (is_power_of_2(chunk_sects)) {
320 int chunksect_bits = ffz(~chunk_sects);
321 /* find the sector offset inside the chunk */
322 sect_in_chunk = sector & (chunk_sects - 1);
323 sector >>= chunksect_bits;
324 /* chunk in zone */
325 chunk = *sector_offset;
326 /* quotient is the chunk in real device*/
327 sector_div(chunk, zone->nb_dev << chunksect_bits);
328 } else{
329 sect_in_chunk = sector_div(sector, chunk_sects);
330 chunk = *sector_offset;
331 sector_div(chunk, chunk_sects * zone->nb_dev);
332 }
333 /*
334 * position the bio over the real device
335 * real sector = chunk in device + starting of zone
336 * + the position in the chunk
337 */
338 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
339 return conf->devlist[(zone - conf->strip_zone)*raid_disks
340 + sector_div(sector, zone->nb_dev)];
341}
342
293/** 343/**
294 * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged 344 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
295 * @q: request queue 345 * @q: request queue
296 * @bvm: properties of new bio 346 * @bvm: properties of new bio
297 * @biovec: the request that could be merged to it. 347 * @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
303 struct bio_vec *biovec) 353 struct bio_vec *biovec)
304{ 354{
305 struct mddev *mddev = q->queuedata; 355 struct mddev *mddev = q->queuedata;
356 struct r0conf *conf = mddev->private;
306 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 357 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
358 sector_t sector_offset = sector;
307 int max; 359 int max;
308 unsigned int chunk_sectors = mddev->chunk_sectors; 360 unsigned int chunk_sectors = mddev->chunk_sectors;
309 unsigned int bio_sectors = bvm->bi_size >> 9; 361 unsigned int bio_sectors = bvm->bi_size >> 9;
362 struct strip_zone *zone;
363 struct md_rdev *rdev;
364 struct request_queue *subq;
310 365
311 if (is_power_of_2(chunk_sectors)) 366 if (is_power_of_2(chunk_sectors))
312 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 367 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
314 else 369 else
315 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 370 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
316 + bio_sectors)) << 9; 371 + bio_sectors)) << 9;
317 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 372 if (max < 0)
373 max = 0; /* bio_add cannot handle a negative return */
318 if (max <= biovec->bv_len && bio_sectors == 0) 374 if (max <= biovec->bv_len && bio_sectors == 0)
319 return biovec->bv_len; 375 return biovec->bv_len;
320 else 376 if (max < biovec->bv_len)
377 /* too small already, no need to check further */
378 return max;
379 if (!conf->has_merge_bvec)
380 return max;
381
382 /* May need to check subordinate device */
383 sector = sector_offset;
384 zone = find_zone(mddev->private, &sector_offset);
385 rdev = map_sector(mddev, zone, sector, &sector_offset);
386 subq = bdev_get_queue(rdev->bdev);
387 if (subq->merge_bvec_fn) {
388 bvm->bi_bdev = rdev->bdev;
389 bvm->bi_sector = sector_offset + zone->dev_start +
390 rdev->data_offset;
391 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
392 } else
321 return max; 393 return max;
322} 394}
323 395
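
The hunk above first clamps a candidate merge at the chunk boundary and, only when the array has at least one member with its own merge_bvec_fn (conf->has_merge_bvec), remaps the probe and asks that member as well. The small model below illustrates the resulting cascade of limits; chunk_boundary_limit() and member_bvec_limit() are hypothetical stand-ins for the real queries, not kernel APIs.

    #include <stdio.h>

    /* Hypothetical stand-ins for the real queries made by the hunk above. */
    static int chunk_boundary_limit(void) { return 4096; } /* bytes left before the chunk boundary */
    static int member_bvec_limit(void)    { return 1024; } /* member's merge_bvec_fn answer, in bytes */

    /* Model of the cascade: the chunk boundary gives an upper bound; only if
     * the array has a member with a merge_bvec_fn is that member asked too,
     * and the final answer is the smaller of the two limits. */
    static int mergeable_limit(int has_merge_bvec, int bv_len)
    {
            int max = chunk_boundary_limit();
            int sub;

            if (max < 0)
                    max = 0;        /* bio_add cannot handle a negative return */
            if (max < bv_len)
                    return max;     /* too small already, no need to ask the member */
            if (!has_merge_bvec)
                    return max;
            sub = member_bvec_limit();
            return max < sub ? max : sub;
    }

    int main(void)
    {
            printf("without member check: %d\n", mergeable_limit(0, 512));
            printf("with member check:    %d\n", mergeable_limit(1, 512));
            return 0;
    }
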
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
329 WARN_ONCE(sectors || raid_disks, 401 WARN_ONCE(sectors || raid_disks,
330 "%s does not support generic reshape\n", __func__); 402 "%s does not support generic reshape\n", __func__);
331 403
332 list_for_each_entry(rdev, &mddev->disks, same_set) 404 rdev_for_each(rdev, mddev)
333 array_sectors += rdev->sectors; 405 array_sectors += rdev->sectors;
334 406
335 return array_sectors; 407 return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
397 return 0; 469 return 0;
398} 470}
399 471
400/* Find the zone which holds a particular offset
401 * Update *sectorp to be an offset in that zone
402 */
403static struct strip_zone *find_zone(struct r0conf *conf,
404 sector_t *sectorp)
405{
406 int i;
407 struct strip_zone *z = conf->strip_zone;
408 sector_t sector = *sectorp;
409
410 for (i = 0; i < conf->nr_strip_zones; i++)
411 if (sector < z[i].zone_end) {
412 if (i)
413 *sectorp = sector - z[i-1].zone_end;
414 return z + i;
415 }
416 BUG();
417}
418
419/*
420 * remaps the bio to the target device. we separate two flows.
421 * power 2 flow and a general flow for the sake of perfromance
422*/
423static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
424 sector_t sector, sector_t *sector_offset)
425{
426 unsigned int sect_in_chunk;
427 sector_t chunk;
428 struct r0conf *conf = mddev->private;
429 int raid_disks = conf->strip_zone[0].nb_dev;
430 unsigned int chunk_sects = mddev->chunk_sectors;
431
432 if (is_power_of_2(chunk_sects)) {
433 int chunksect_bits = ffz(~chunk_sects);
434 /* find the sector offset inside the chunk */
435 sect_in_chunk = sector & (chunk_sects - 1);
436 sector >>= chunksect_bits;
437 /* chunk in zone */
438 chunk = *sector_offset;
439 /* quotient is the chunk in real device*/
440 sector_div(chunk, zone->nb_dev << chunksect_bits);
441 } else{
442 sect_in_chunk = sector_div(sector, chunk_sects);
443 chunk = *sector_offset;
444 sector_div(chunk, chunk_sects * zone->nb_dev);
445 }
446 /*
447 * position the bio over the real device
448 * real sector = chunk in device + starting of zone
449 * + the position in the chunk
450 */
451 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
452 return conf->devlist[(zone - conf->strip_zone)*raid_disks
453 + sector_div(sector, zone->nb_dev)];
454}
455
456/* 472/*
457 * Is the I/O distributed over one or more chunks? 473 * Is the I/O distributed over one or more chunks?
458*/ 474*/
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
505 } 521 }
506 522
507 sector_offset = bio->bi_sector; 523 sector_offset = bio->bi_sector;
508 zone = find_zone(mddev->private, &sector_offset); 524 zone = find_zone(mddev->private, &sector_offset);
509 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 525 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
510 &sector_offset); 526 &sector_offset);
511 bio->bi_bdev = tmp_dev->bdev; 527 bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
543 return ERR_PTR(-EINVAL); 559 return ERR_PTR(-EINVAL);
544 } 560 }
545 561
546 list_for_each_entry(rdev, &mddev->disks, same_set) { 562 rdev_for_each(rdev, mddev) {
547 /* check slot number for a disk */ 563 /* check slot number for a disk */
548 if (rdev->raid_disk == mddev->raid_disks-1) { 564 if (rdev->raid_disk == mddev->raid_disks-1) {
549 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 565 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4c..05539d9c97f0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
4struct strip_zone { 4struct strip_zone {
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 5 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 6 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 7 int nb_dev; /* # of devices attached to the zone */
8}; 8};
9 9
10struct r0conf { 10struct r0conf {
11 struct strip_zone *strip_zone; 11 struct strip_zone *strip_zone;
12 struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 12 struct md_rdev **devlist; /* lists of rdevs, pointed to
13 int nr_strip_zones; 13 * by strip_zone->dev */
14 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
14}; 17};
15 18
16#endif 19#endif
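
The new has_merge_bvec field caches whether any member device exports a merge_bvec_fn, so raid0_mergeable_bvec() can skip the per-request remapping for plain members. How the flag is populated is not shown in this part of the diff; the snippet below is only a userspace model of the idea, with a made-up struct member type.

    #include <stdbool.h>
    #include <stdio.h>

    struct member { bool has_merge_bvec_fn; };

    /* Model of what r0conf.has_merge_bvec records: "does any member device
     * export a merge_bvec_fn", so the per-request remapping can be skipped
     * when no member needs it. */
    static int compute_has_merge_bvec(const struct member *m, int n)
    {
            for (int i = 0; i < n; i++)
                    if (m[i].has_merge_bvec_fn)
                            return 1;
            return 0;
    }

    int main(void)
    {
            struct member members[3] = { {false}, {true}, {false} };
            printf("has_merge_bvec = %d\n", compute_has_merge_bvec(members, 3));
            return 0;
    }
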
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a368db2431a5..4a40a200d769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 523 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 524 if (r1_bio->bios[disk] == IO_BLOCKED
525 || rdev == NULL 525 || rdev == NULL
526 || test_bit(Unmerged, &rdev->flags)
526 || test_bit(Faulty, &rdev->flags)) 527 || test_bit(Faulty, &rdev->flags))
527 continue; 528 continue;
528 if (!test_bit(In_sync, &rdev->flags) && 529 if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
614 return best_disk; 615 return best_disk;
615} 616}
616 617
618static int raid1_mergeable_bvec(struct request_queue *q,
619 struct bvec_merge_data *bvm,
620 struct bio_vec *biovec)
621{
622 struct mddev *mddev = q->queuedata;
623 struct r1conf *conf = mddev->private;
624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
625 int max = biovec->bv_len;
626
627 if (mddev->merge_check_needed) {
628 int disk;
629 rcu_read_lock();
630 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
631 struct md_rdev *rdev = rcu_dereference(
632 conf->mirrors[disk].rdev);
633 if (rdev && !test_bit(Faulty, &rdev->flags)) {
634 struct request_queue *q =
635 bdev_get_queue(rdev->bdev);
636 if (q->merge_bvec_fn) {
637 bvm->bi_sector = sector +
638 rdev->data_offset;
639 bvm->bi_bdev = rdev->bdev;
640 max = min(max, q->merge_bvec_fn(
641 q, bvm, biovec));
642 }
643 }
644 }
645 rcu_read_unlock();
646 }
647 return max;
648
649}
650
617int md_raid1_congested(struct mddev *mddev, int bits) 651int md_raid1_congested(struct mddev *mddev, int bits)
618{ 652{
619 struct r1conf *conf = mddev->private; 653 struct r1conf *conf = mddev->private;
@@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
624 return 1; 658 return 1;
625 659
626 rcu_read_lock(); 660 rcu_read_lock();
627 for (i = 0; i < conf->raid_disks; i++) { 661 for (i = 0; i < conf->raid_disks * 2; i++) {
628 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 662 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
629 if (rdev && !test_bit(Faulty, &rdev->flags)) { 663 if (rdev && !test_bit(Faulty, &rdev->flags)) {
630 struct request_queue *q = bdev_get_queue(rdev->bdev); 664 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
737 spin_lock_irq(&conf->resync_lock); 771 spin_lock_irq(&conf->resync_lock);
738 if (conf->barrier) { 772 if (conf->barrier) {
739 conf->nr_waiting++; 773 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 774 /* Wait for the barrier to drop.
775 * However if there are already pending
776 * requests (preventing the barrier from
777 * rising completely), and the
778 * pre-process bio queue isn't empty,
779 * then don't wait, as we need to empty
780 * that queue to get the nr_pending
781 * count down.
782 */
783 wait_event_lock_irq(conf->wait_barrier,
784 !conf->barrier ||
785 (conf->nr_pending &&
786 current->bio_list &&
787 !bio_list_empty(current->bio_list)),
741 conf->resync_lock, 788 conf->resync_lock,
742 ); 789 );
743 conf->nr_waiting--; 790 conf->nr_waiting--;
744 } 791 }
745 conf->nr_pending++; 792 conf->nr_pending++;
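
The comment added to wait_barrier() above describes a potential deadlock: the resync barrier cannot rise while requests are pending, and some of those pending requests may still be queued on the waiting thread's own bio list. The predicate below is a minimal model of the new wait condition (illustrative names only, not the kernel API).

    #include <stdbool.h>
    #include <stdio.h>

    /* Model of the wait condition: proceed either when the barrier has
     * dropped, or when there are pending requests and our own bio list is
     * non-empty (waiting here would keep nr_pending from ever reaching zero
     * and so deadlock against raise_barrier()). */
    static bool may_proceed(int barrier, int nr_pending, bool own_bio_list_nonempty)
    {
            return !barrier || (nr_pending && own_bio_list_nonempty);
    }

    int main(void)
    {
            printf("%d\n", may_proceed(1, 0, false)); /* 0: must wait for resync */
            printf("%d\n", may_proceed(0, 0, false)); /* 1: no barrier raised */
            printf("%d\n", may_proceed(1, 3, true));  /* 1: drain our own queue first */
            return 0;
    }
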
@@ -1002,7 +1049,8 @@ read_again:
1002 break; 1049 break;
1003 } 1050 }
1004 r1_bio->bios[i] = NULL; 1051 r1_bio->bios[i] = NULL;
1005 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1052 if (!rdev || test_bit(Faulty, &rdev->flags)
1053 || test_bit(Unmerged, &rdev->flags)) {
1006 if (i < conf->raid_disks) 1054 if (i < conf->raid_disks)
1007 set_bit(R1BIO_Degraded, &r1_bio->state); 1055 set_bit(R1BIO_Degraded, &r1_bio->state);
1008 continue; 1056 continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1322 struct mirror_info *p; 1370 struct mirror_info *p;
1323 int first = 0; 1371 int first = 0;
1324 int last = conf->raid_disks - 1; 1372 int last = conf->raid_disks - 1;
1373 struct request_queue *q = bdev_get_queue(rdev->bdev);
1325 1374
1326 if (mddev->recovery_disabled == conf->recovery_disabled) 1375 if (mddev->recovery_disabled == conf->recovery_disabled)
1327 return -EBUSY; 1376 return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1329 if (rdev->raid_disk >= 0) 1378 if (rdev->raid_disk >= 0)
1330 first = last = rdev->raid_disk; 1379 first = last = rdev->raid_disk;
1331 1380
1381 if (q->merge_bvec_fn) {
1382 set_bit(Unmerged, &rdev->flags);
1383 mddev->merge_check_needed = 1;
1384 }
1385
1332 for (mirror = first; mirror <= last; mirror++) { 1386 for (mirror = first; mirror <= last; mirror++) {
1333 p = conf->mirrors+mirror; 1387 p = conf->mirrors+mirror;
1334 if (!p->rdev) { 1388 if (!p->rdev) {
1335 1389
1336 disk_stack_limits(mddev->gendisk, rdev->bdev, 1390 disk_stack_limits(mddev->gendisk, rdev->bdev,
1337 rdev->data_offset << 9); 1391 rdev->data_offset << 9);
1338 /* as we don't honour merge_bvec_fn, we must
1339 * never risk violating it, so limit
1340 * ->max_segments to one lying with a single
1341 * page, as a one page request is never in
1342 * violation.
1343 */
1344 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1345 blk_queue_max_segments(mddev->queue, 1);
1346 blk_queue_segment_boundary(mddev->queue,
1347 PAGE_CACHE_SIZE - 1);
1348 }
1349 1392
1350 p->head_position = 0; 1393 p->head_position = 0;
1351 rdev->raid_disk = mirror; 1394 rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1370 break; 1413 break;
1371 } 1414 }
1372 } 1415 }
1416 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1417 /* Some requests might not have seen this new
1418 * merge_bvec_fn. We must wait for them to complete
1419 * before merging the device fully.
1420 * First we make sure any code which has tested
1421 * our function has submitted the request, then
1422 * we wait for all outstanding requests to complete.
1423 */
1424 synchronize_sched();
1425 raise_barrier(conf);
1426 lower_barrier(conf);
1427 clear_bit(Unmerged, &rdev->flags);
1428 }
1373 md_integrity_add_rdev(rdev, mddev); 1429 md_integrity_add_rdev(rdev, mddev);
1374 print_conf(conf); 1430 print_conf(conf);
1375 return err; 1431 return err;
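
The Unmerged handling added above follows a publish-then-quiesce pattern: the new rdev is flagged so the I/O paths ignore it for merging decisions, every request that might have sampled the old state is allowed to finish (synchronize_sched() plus one raise/lower barrier cycle), and only then is the flag cleared. The sketch below is a schematic of that ordering; the two stub functions are placeholders, not the kernel primitives.

    #include <stdio.h>

    /* Stubs standing in for the kernel primitives used above. */
    static void wait_for_current_submitters(void) { puts("synchronize with submitters"); }
    static void drain_in_flight_requests(void)    { puts("raise + lower barrier"); }

    static int unmerged;    /* models the Unmerged rdev flag */

    /* Schematic of the hot-add path: publish the device as not-yet-mergeable,
     * let every request that may have tested the merge function complete,
     * then allow full merging. */
    static void enable_merging(void)
    {
            unmerged = 1;                   /* set_bit(Unmerged, &rdev->flags) */
            wait_for_current_submitters();  /* synchronize_sched() */
            drain_in_flight_requests();     /* raise_barrier(); lower_barrier(); */
            unmerged = 0;                   /* clear_bit(Unmerged, &rdev->flags) */
    }

    int main(void)
    {
            enable_merging();
            printf("unmerged=%d\n", unmerged);
            return 0;
    }
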
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2491 2547
2492 err = -EINVAL; 2548 err = -EINVAL;
2493 spin_lock_init(&conf->device_lock); 2549 spin_lock_init(&conf->device_lock);
2494 list_for_each_entry(rdev, &mddev->disks, same_set) { 2550 rdev_for_each(rdev, mddev) {
2495 int disk_idx = rdev->raid_disk; 2551 int disk_idx = rdev->raid_disk;
2496 if (disk_idx >= mddev->raid_disks 2552 if (disk_idx >= mddev->raid_disks
2497 || disk_idx < 0) 2553 || disk_idx < 0)
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
2609 if (IS_ERR(conf)) 2665 if (IS_ERR(conf))
2610 return PTR_ERR(conf); 2666 return PTR_ERR(conf);
2611 2667
2612 list_for_each_entry(rdev, &mddev->disks, same_set) { 2668 rdev_for_each(rdev, mddev) {
2613 if (!mddev->gendisk) 2669 if (!mddev->gendisk)
2614 continue; 2670 continue;
2615 disk_stack_limits(mddev->gendisk, rdev->bdev, 2671 disk_stack_limits(mddev->gendisk, rdev->bdev,
2616 rdev->data_offset << 9); 2672 rdev->data_offset << 9);
2617 /* as we don't honour merge_bvec_fn, we must never risk
2618 * violating it, so limit ->max_segments to 1 lying within
2619 * a single page, as a one page request is never in violation.
2620 */
2621 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2622 blk_queue_max_segments(mddev->queue, 1);
2623 blk_queue_segment_boundary(mddev->queue,
2624 PAGE_CACHE_SIZE - 1);
2625 }
2626 } 2673 }
2627 2674
2628 mddev->degraded = 0; 2675 mddev->degraded = 0;
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
2656 if (mddev->queue) { 2703 if (mddev->queue) {
2657 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2704 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2658 mddev->queue->backing_dev_info.congested_data = mddev; 2705 mddev->queue->backing_dev_info.congested_data = mddev;
2706 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2659 } 2707 }
2660 return md_integrity_register(mddev); 2708 return md_integrity_register(mddev);
2661} 2709}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e8aa213f0d5..3540316886f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -67,6 +67,7 @@ static int max_queued_requests = 1024;
67 67
68static void allow_barrier(struct r10conf *conf); 68static void allow_barrier(struct r10conf *conf);
69static void lower_barrier(struct r10conf *conf); 69static void lower_barrier(struct r10conf *conf);
70static int enough(struct r10conf *conf, int ignore);
70 71
71static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 72static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
72{ 73{
@@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error)
347 * wait for the 'master' bio. 348 * wait for the 'master' bio.
348 */ 349 */
349 set_bit(R10BIO_Uptodate, &r10_bio->state); 350 set_bit(R10BIO_Uptodate, &r10_bio->state);
351 } else {
352 /* If all other devices that store this block have
353 * failed, we want to return the error upwards rather
354 * than fail the last device. Here we redefine
355 * "uptodate" to mean "Don't want to retry"
356 */
357 unsigned long flags;
358 spin_lock_irqsave(&conf->device_lock, flags);
359 if (!enough(conf, rdev->raid_disk))
360 uptodate = 1;
361 spin_unlock_irqrestore(&conf->device_lock, flags);
362 }
363 if (uptodate) {
350 raid_end_bio_io(r10_bio); 364 raid_end_bio_io(r10_bio);
351 rdev_dec_pending(rdev, conf->mddev); 365 rdev_dec_pending(rdev, conf->mddev);
352 } else { 366 } else {
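
The else branch added above redefines "uptodate" when a read fails on the last readable copy: if enough() reports that no other device can serve the block, the error is completed upwards instead of being retried forever. A tiny model of that decision follows; working_copies_excluding() is a hypothetical stand-in for enough().

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for enough(): how many other devices still hold a
     * readable copy of this block once the failing one is ignored? */
    static int working_copies_excluding(int failed_disk)
    {
            (void)failed_disk;
            return 0;       /* pretend every other copy is gone */
    }

    /* Model of the decision above: retry the read on another copy if one
     * exists, otherwise report the error rather than fail the last device. */
    static bool treat_as_uptodate(bool read_ok, int failed_disk)
    {
            if (read_ok)
                    return true;
            return working_copies_excluding(failed_disk) == 0;
    }

    int main(void)
    {
            if (treat_as_uptodate(false, 2))
                    puts("no usable copy left: complete the bio with the error");
            else
                    puts("another copy exists: retry the read");
            return 0;
    }
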
@@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
572 * @biovec: the request that could be merged to it. 586 * @biovec: the request that could be merged to it.
573 * 587 *
574 * Return amount of bytes we can accept at this offset 588 * Return amount of bytes we can accept at this offset
575 * If near_copies == raid_disk, there are no striping issues, 589 * This requires checking for end-of-chunk if near_copies != raid_disks,
576 * but in that case, the function isn't called at all. 590 * and for subordinate merge_bvec_fns if merge_check_needed.
577 */ 591 */
578static int raid10_mergeable_bvec(struct request_queue *q, 592static int raid10_mergeable_bvec(struct request_queue *q,
579 struct bvec_merge_data *bvm, 593 struct bvec_merge_data *bvm,
580 struct bio_vec *biovec) 594 struct bio_vec *biovec)
581{ 595{
582 struct mddev *mddev = q->queuedata; 596 struct mddev *mddev = q->queuedata;
597 struct r10conf *conf = mddev->private;
583 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
584 int max; 599 int max;
585 unsigned int chunk_sectors = mddev->chunk_sectors; 600 unsigned int chunk_sectors = mddev->chunk_sectors;
586 unsigned int bio_sectors = bvm->bi_size >> 9; 601 unsigned int bio_sectors = bvm->bi_size >> 9;
587 602
588 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 603 if (conf->near_copies < conf->raid_disks) {
589 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 604 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
590 if (max <= biovec->bv_len && bio_sectors == 0) 605 + bio_sectors)) << 9;
591 return biovec->bv_len; 606 if (max < 0)
592 else 607 /* bio_add cannot handle a negative return */
593 return max; 608 max = 0;
609 if (max <= biovec->bv_len && bio_sectors == 0)
610 return biovec->bv_len;
611 } else
612 max = biovec->bv_len;
613
614 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio;
616 int s;
617 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock();
620 for (s = 0; s < conf->copies; s++) {
621 int disk = r10_bio.devs[s].devnum;
622 struct md_rdev *rdev = rcu_dereference(
623 conf->mirrors[disk].rdev);
624 if (rdev && !test_bit(Faulty, &rdev->flags)) {
625 struct request_queue *q =
626 bdev_get_queue(rdev->bdev);
627 if (q->merge_bvec_fn) {
628 bvm->bi_sector = r10_bio.devs[s].addr
629 + rdev->data_offset;
630 bvm->bi_bdev = rdev->bdev;
631 max = min(max, q->merge_bvec_fn(
632 q, bvm, biovec));
633 }
634 }
635 rdev = rcu_dereference(conf->mirrors[disk].replacement);
636 if (rdev && !test_bit(Faulty, &rdev->flags)) {
637 struct request_queue *q =
638 bdev_get_queue(rdev->bdev);
639 if (q->merge_bvec_fn) {
640 bvm->bi_sector = r10_bio.devs[s].addr
641 + rdev->data_offset;
642 bvm->bi_bdev = rdev->bdev;
643 max = min(max, q->merge_bvec_fn(
644 q, bvm, biovec));
645 }
646 }
647 }
648 rcu_read_unlock();
649 }
650 return max;
594} 651}
595 652
596/* 653/*
@@ -654,11 +711,12 @@ retry:
654 disk = r10_bio->devs[slot].devnum; 711 disk = r10_bio->devs[slot].devnum;
655 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
656 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 713 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
714 test_bit(Unmerged, &rdev->flags) ||
657 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 715 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
658 rdev = rcu_dereference(conf->mirrors[disk].rdev); 716 rdev = rcu_dereference(conf->mirrors[disk].rdev);
659 if (rdev == NULL) 717 if (rdev == NULL ||
660 continue; 718 test_bit(Faulty, &rdev->flags) ||
661 if (test_bit(Faulty, &rdev->flags)) 719 test_bit(Unmerged, &rdev->flags))
662 continue; 720 continue;
663 if (!test_bit(In_sync, &rdev->flags) && 721 if (!test_bit(In_sync, &rdev->flags) &&
664 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 722 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
849 spin_lock_irq(&conf->resync_lock); 907 spin_lock_irq(&conf->resync_lock);
850 if (conf->barrier) { 908 if (conf->barrier) {
851 conf->nr_waiting++; 909 conf->nr_waiting++;
852 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 910 /* Wait for the barrier to drop.
911 * However if there are already pending
912 * requests (preventing the barrier from
913 * rising completely), and the
914 * pre-process bio queue isn't empty,
915 * then don't wait, as we need to empty
916 * that queue to get the nr_pending
917 * count down.
918 */
919 wait_event_lock_irq(conf->wait_barrier,
920 !conf->barrier ||
921 (conf->nr_pending &&
922 current->bio_list &&
923 !bio_list_empty(current->bio_list)),
853 conf->resync_lock, 924 conf->resync_lock,
854 ); 925 );
855 conf->nr_waiting--; 926 conf->nr_waiting--;
856 } 927 }
857 conf->nr_pending++; 928 conf->nr_pending++;
@@ -1107,12 +1178,14 @@ retry_write:
1107 blocked_rdev = rrdev; 1178 blocked_rdev = rrdev;
1108 break; 1179 break;
1109 } 1180 }
1110 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1181 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1182 || test_bit(Unmerged, &rrdev->flags)))
1111 rrdev = NULL; 1183 rrdev = NULL;
1112 1184
1113 r10_bio->devs[i].bio = NULL; 1185 r10_bio->devs[i].bio = NULL;
1114 r10_bio->devs[i].repl_bio = NULL; 1186 r10_bio->devs[i].repl_bio = NULL;
1115 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1187 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1188 test_bit(Unmerged, &rdev->flags)) {
1116 set_bit(R10BIO_Degraded, &r10_bio->state); 1189 set_bit(R10BIO_Degraded, &r10_bio->state);
1117 continue; 1190 continue;
1118 } 1191 }
@@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1463 int mirror; 1536 int mirror;
1464 int first = 0; 1537 int first = 0;
1465 int last = conf->raid_disks - 1; 1538 int last = conf->raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev);
1466 1540
1467 if (mddev->recovery_cp < MaxSector) 1541 if (mddev->recovery_cp < MaxSector)
1468 /* only hot-add to in-sync arrays, as recovery is 1542 /* only hot-add to in-sync arrays, as recovery is
1469 * very different from resync 1543 * very different from resync
1470 */ 1544 */
1471 return -EBUSY; 1545 return -EBUSY;
1472 if (!enough(conf, -1)) 1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
1473 return -EINVAL; 1547 return -EINVAL;
1474 1548
1475 if (rdev->raid_disk >= 0) 1549 if (rdev->raid_disk >= 0)
1476 first = last = rdev->raid_disk; 1550 first = last = rdev->raid_disk;
1477 1551
1552 if (q->merge_bvec_fn) {
1553 set_bit(Unmerged, &rdev->flags);
1554 mddev->merge_check_needed = 1;
1555 }
1556
1478 if (rdev->saved_raid_disk >= first && 1557 if (rdev->saved_raid_disk >= first &&
1479 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1558 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1480 mirror = rdev->saved_raid_disk; 1559 mirror = rdev->saved_raid_disk;
@@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1494 err = 0; 1573 err = 0;
1495 disk_stack_limits(mddev->gendisk, rdev->bdev, 1574 disk_stack_limits(mddev->gendisk, rdev->bdev,
1496 rdev->data_offset << 9); 1575 rdev->data_offset << 9);
1497 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1498 blk_queue_max_segments(mddev->queue, 1);
1499 blk_queue_segment_boundary(mddev->queue,
1500 PAGE_CACHE_SIZE - 1);
1501 }
1502 conf->fullsync = 1; 1576 conf->fullsync = 1;
1503 rcu_assign_pointer(p->replacement, rdev); 1577 rcu_assign_pointer(p->replacement, rdev);
1504 break; 1578 break;
@@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1506 1580
1507 disk_stack_limits(mddev->gendisk, rdev->bdev, 1581 disk_stack_limits(mddev->gendisk, rdev->bdev,
1508 rdev->data_offset << 9); 1582 rdev->data_offset << 9);
1509 /* as we don't honour merge_bvec_fn, we must
1510 * never risk violating it, so limit
1511 * ->max_segments to one lying with a single
1512 * page, as a one page request is never in
1513 * violation.
1514 */
1515 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1516 blk_queue_max_segments(mddev->queue, 1);
1517 blk_queue_segment_boundary(mddev->queue,
1518 PAGE_CACHE_SIZE - 1);
1519 }
1520 1583
1521 p->head_position = 0; 1584 p->head_position = 0;
1522 p->recovery_disabled = mddev->recovery_disabled - 1; 1585 p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1527 rcu_assign_pointer(p->rdev, rdev); 1590 rcu_assign_pointer(p->rdev, rdev);
1528 break; 1591 break;
1529 } 1592 }
1530 1593 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1594 /* Some requests might not have seen this new
1595 * merge_bvec_fn. We must wait for them to complete
1596 * before merging the device fully.
1597 * First we make sure any code which has tested
1598 * our function has submitted the request, then
1599 * we wait for all outstanding requests to complete.
1600 */
1601 synchronize_sched();
1602 raise_barrier(conf, 0);
1603 lower_barrier(conf);
1604 clear_bit(Unmerged, &rdev->flags);
1605 }
1531 md_integrity_add_rdev(rdev, mddev); 1606 md_integrity_add_rdev(rdev, mddev);
1532 print_conf(conf); 1607 print_conf(conf);
1533 return err; 1608 return err;
@@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
1668 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1743 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1669 if (repl) 1744 if (repl)
1670 rdev = conf->mirrors[d].replacement; 1745 rdev = conf->mirrors[d].replacement;
1671 if (!rdev) { 1746 else
1672 smp_mb();
1673 rdev = conf->mirrors[d].rdev; 1747 rdev = conf->mirrors[d].rdev;
1674 }
1675 1748
1676 if (!uptodate) { 1749 if (!uptodate) {
1677 if (repl) 1750 if (repl)
@@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2052 "md/raid10:%s: %s: Failing raid device\n", 2125 "md/raid10:%s: %s: Failing raid device\n",
2053 mdname(mddev), b); 2126 mdname(mddev), b);
2054 md_error(mddev, conf->mirrors[d].rdev); 2127 md_error(mddev, conf->mirrors[d].rdev);
2128 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2055 return; 2129 return;
2056 } 2130 }
2057 2131
@@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2072 d = r10_bio->devs[sl].devnum; 2146 d = r10_bio->devs[sl].devnum;
2073 rdev = rcu_dereference(conf->mirrors[d].rdev); 2147 rdev = rcu_dereference(conf->mirrors[d].rdev);
2074 if (rdev && 2148 if (rdev &&
2149 !test_bit(Unmerged, &rdev->flags) &&
2075 test_bit(In_sync, &rdev->flags) && 2150 test_bit(In_sync, &rdev->flags) &&
2076 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2151 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2077 &first_bad, &bad_sectors) == 0) { 2152 &first_bad, &bad_sectors) == 0) {
@@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2105 rdev, 2180 rdev,
2106 r10_bio->devs[r10_bio->read_slot].addr 2181 r10_bio->devs[r10_bio->read_slot].addr
2107 + sect, 2182 + sect,
2108 s, 0)) 2183 s, 0)) {
2109 md_error(mddev, rdev); 2184 md_error(mddev, rdev);
2185 r10_bio->devs[r10_bio->read_slot].bio
2186 = IO_BLOCKED;
2187 }
2110 break; 2188 break;
2111 } 2189 }
2112 2190
@@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2122 d = r10_bio->devs[sl].devnum; 2200 d = r10_bio->devs[sl].devnum;
2123 rdev = rcu_dereference(conf->mirrors[d].rdev); 2201 rdev = rcu_dereference(conf->mirrors[d].rdev);
2124 if (!rdev || 2202 if (!rdev ||
2203 test_bit(Unmerged, &rdev->flags) ||
2125 !test_bit(In_sync, &rdev->flags)) 2204 !test_bit(In_sync, &rdev->flags))
2126 continue; 2205 continue;
2127 2206
@@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2299 * This is all done synchronously while the array is 2378 * This is all done synchronously while the array is
2300 * frozen. 2379 * frozen.
2301 */ 2380 */
2381 bio = r10_bio->devs[slot].bio;
2382 bdevname(bio->bi_bdev, b);
2383 bio_put(bio);
2384 r10_bio->devs[slot].bio = NULL;
2385
2302 if (mddev->ro == 0) { 2386 if (mddev->ro == 0) {
2303 freeze_array(conf); 2387 freeze_array(conf);
2304 fix_read_error(conf, mddev, r10_bio); 2388 fix_read_error(conf, mddev, r10_bio);
2305 unfreeze_array(conf); 2389 unfreeze_array(conf);
2306 } 2390 } else
2391 r10_bio->devs[slot].bio = IO_BLOCKED;
2392
2307 rdev_dec_pending(rdev, mddev); 2393 rdev_dec_pending(rdev, mddev);
2308 2394
2309 bio = r10_bio->devs[slot].bio;
2310 bdevname(bio->bi_bdev, b);
2311 r10_bio->devs[slot].bio =
2312 mddev->ro ? IO_BLOCKED : NULL;
2313read_more: 2395read_more:
2314 rdev = read_balance(conf, r10_bio, &max_sectors); 2396 rdev = read_balance(conf, r10_bio, &max_sectors);
2315 if (rdev == NULL) { 2397 if (rdev == NULL) {
@@ -2318,13 +2400,10 @@ read_more:
2318 mdname(mddev), b, 2400 mdname(mddev), b,
2319 (unsigned long long)r10_bio->sector); 2401 (unsigned long long)r10_bio->sector);
2320 raid_end_bio_io(r10_bio); 2402 raid_end_bio_io(r10_bio);
2321 bio_put(bio);
2322 return; 2403 return;
2323 } 2404 }
2324 2405
2325 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2406 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2326 if (bio)
2327 bio_put(bio);
2328 slot = r10_bio->read_slot; 2407 slot = r10_bio->read_slot;
2329 printk_ratelimited( 2408 printk_ratelimited(
2330 KERN_ERR 2409 KERN_ERR
@@ -2360,7 +2439,6 @@ read_more:
2360 mbio->bi_phys_segments++; 2439 mbio->bi_phys_segments++;
2361 spin_unlock_irq(&conf->device_lock); 2440 spin_unlock_irq(&conf->device_lock);
2362 generic_make_request(bio); 2441 generic_make_request(bio);
2363 bio = NULL;
2364 2442
2365 r10_bio = mempool_alloc(conf->r10bio_pool, 2443 r10_bio = mempool_alloc(conf->r10bio_pool,
2366 GFP_NOIO); 2444 GFP_NOIO);
@@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev)
3225 blk_queue_io_opt(mddev->queue, chunk_size * 3303 blk_queue_io_opt(mddev->queue, chunk_size *
3226 (conf->raid_disks / conf->near_copies)); 3304 (conf->raid_disks / conf->near_copies));
3227 3305
3228 list_for_each_entry(rdev, &mddev->disks, same_set) { 3306 rdev_for_each(rdev, mddev) {
3229 3307
3230 disk_idx = rdev->raid_disk; 3308 disk_idx = rdev->raid_disk;
3231 if (disk_idx >= conf->raid_disks 3309 if (disk_idx >= conf->raid_disks
@@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev)
3243 disk->rdev = rdev; 3321 disk->rdev = rdev;
3244 } 3322 }
3245 3323
3246 disk->rdev = rdev;
3247 disk_stack_limits(mddev->gendisk, rdev->bdev, 3324 disk_stack_limits(mddev->gendisk, rdev->bdev,
3248 rdev->data_offset << 9); 3325 rdev->data_offset << 9);
3249 /* as we don't honour merge_bvec_fn, we must never risk
3250 * violating it, so limit max_segments to 1 lying
3251 * within a single page.
3252 */
3253 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
3254 blk_queue_max_segments(mddev->queue, 1);
3255 blk_queue_segment_boundary(mddev->queue,
3256 PAGE_CACHE_SIZE - 1);
3257 }
3258 3326
3259 disk->head_position = 0; 3327 disk->head_position = 0;
3260 } 3328 }
@@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev)
3318 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3386 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3319 } 3387 }
3320 3388
3321 if (conf->near_copies < conf->raid_disks) 3389 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3322 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3323 3390
3324 if (md_integrity_register(mddev)) 3391 if (md_integrity_register(mddev))
3325 goto out_free_conf; 3392 goto out_free_conf;
@@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3369 } 3436 }
3370} 3437}
3371 3438
3439static int raid10_resize(struct mddev *mddev, sector_t sectors)
3440{
3441 /* Resize of 'far' arrays is not supported.
3442 * For 'near' and 'offset' arrays we can set the
3443 * number of sectors used to be an appropriate multiple
3444 * of the chunk size.
3445 * For 'offset', this is far_copies*chunksize.
3446 * For 'near' the multiplier is the LCM of
3447 * near_copies and raid_disks.
3448 * So if far_copies > 1 && !far_offset, fail.
3450 * Else find LCM(raid_disks, near_copies)*far_copies and
3450 * multiply by chunk_size. Then round to this number.
3451 * This is mostly done by raid10_size()
3452 */
3453 struct r10conf *conf = mddev->private;
3454 sector_t oldsize, size;
3455
3456 if (conf->far_copies > 1 && !conf->far_offset)
3457 return -EINVAL;
3458
3459 oldsize = raid10_size(mddev, 0, 0);
3460 size = raid10_size(mddev, sectors, 0);
3461 md_set_array_sectors(mddev, size);
3462 if (mddev->array_sectors > size)
3463 return -EINVAL;
3464 set_capacity(mddev->gendisk, mddev->array_sectors);
3465 revalidate_disk(mddev->gendisk);
3466 if (sectors > mddev->dev_sectors &&
3467 mddev->recovery_cp > oldsize) {
3468 mddev->recovery_cp = oldsize;
3469 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3470 }
3471 mddev->dev_sectors = sectors;
3472 mddev->resync_max_sectors = size;
3473 return 0;
3474}
3475
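
The raid10_resize() comment above says the usable size must be a multiple of LCM(raid_disks, near_copies) * far_copies chunks for 'near' and 'offset' layouts (the driver gets this via raid10_size()). Purely as an illustration of that rounding rule, under the stated assumptions about the layout parameters:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t gcd(uint64_t a, uint64_t b)
    {
            while (b) {
                    uint64_t t = a % b;
                    a = b;
                    b = t;
            }
            return a;
    }

    /* Round a requested size (in sectors) down to the multiple described in
     * the comment: LCM(raid_disks, near_copies) * far_copies chunks. */
    static uint64_t round_to_raid10_multiple(uint64_t sectors, unsigned chunk_sectors,
                                             unsigned raid_disks, unsigned near_copies,
                                             unsigned far_copies)
    {
            uint64_t lcm = (uint64_t)raid_disks / gcd(raid_disks, near_copies) * near_copies;
            uint64_t step = lcm * far_copies * chunk_sectors;

            return sectors - sectors % step;
    }

    int main(void)
    {
            /* 4 disks, near_copies=2, far_copies=1, 1024-sector chunks:
             * step = LCM(4,2) * 1 * 1024 = 4096 sectors. */
            printf("%llu\n", (unsigned long long)
                   round_to_raid10_multiple(1000000, 1024, 4, 2, 1));
            return 0;
    }
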
3372static void *raid10_takeover_raid0(struct mddev *mddev) 3476static void *raid10_takeover_raid0(struct mddev *mddev)
3373{ 3477{
3374 struct md_rdev *rdev; 3478 struct md_rdev *rdev;
@@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3392 3496
3393 conf = setup_conf(mddev); 3497 conf = setup_conf(mddev);
3394 if (!IS_ERR(conf)) { 3498 if (!IS_ERR(conf)) {
3395 list_for_each_entry(rdev, &mddev->disks, same_set) 3499 rdev_for_each(rdev, mddev)
3396 if (rdev->raid_disk >= 0) 3500 if (rdev->raid_disk >= 0)
3397 rdev->new_raid_disk = rdev->raid_disk * 2; 3501 rdev->new_raid_disk = rdev->raid_disk * 2;
3398 conf->barrier = 1; 3502 conf->barrier = 1;
@@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality =
3438 .sync_request = sync_request, 3542 .sync_request = sync_request,
3439 .quiesce = raid10_quiesce, 3543 .quiesce = raid10_quiesce,
3440 .size = raid10_size, 3544 .size = raid10_size,
3545 .resize = raid10_resize,
3441 .takeover = raid10_takeover, 3546 .takeover = raid10_takeover,
3442}; 3547};
3443 3548
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62b..23ac880bba9a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
208 md_wakeup_thread(conf->mddev->thread); 208 md_wakeup_thread(conf->mddev->thread);
209 } else { 209 } else {
210 BUG_ON(stripe_operations_active(sh)); 210 BUG_ON(stripe_operations_active(sh));
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
212 atomic_dec(&conf->preread_active_stripes); 212 if (atomic_dec_return(&conf->preread_active_stripes)
213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 213 < IO_THRESHOLD)
214 md_wakeup_thread(conf->mddev->thread); 214 md_wakeup_thread(conf->mddev->thread);
215 }
216 atomic_dec(&conf->active_stripes); 215 atomic_dec(&conf->active_stripes);
217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 list_add_tail(&sh->lru, &conf->inactive_list); 217 list_add_tail(&sh->lru, &conf->inactive_list);
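
The __release_stripe() change above folds the decrement and the threshold test into a single atomic_dec_return(), so the value compared against IO_THRESHOLD is exactly the one this decrement produced rather than whatever a later atomic_read() observes after other CPUs have touched the counter. The C11 snippet below contrasts the two patterns with illustrative names; it is not the kernel atomics API.

    #include <stdatomic.h>
    #include <stdio.h>

    #define IO_THRESHOLD 1

    static atomic_int preread_active = 3;

    /* Racy variant: another CPU may change the counter between the decrement
     * and the read, so the wakeup test can see a stale value. */
    static int release_racy(void)
    {
            atomic_fetch_sub(&preread_active, 1);
            return atomic_load(&preread_active) < IO_THRESHOLD;
    }

    /* Pattern used by the patch: test the value produced by this decrement
     * (atomic_fetch_sub returns the old value, so subtract 1). */
    static int release_combined(void)
    {
            return atomic_fetch_sub(&preread_active, 1) - 1 < IO_THRESHOLD;
    }

    int main(void)
    {
            printf("wake after racy release:     %d\n", release_racy());
            printf("wake after combined release: %d\n", release_combined());
            return 0;
    }
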
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4843 4842
4844 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4843 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4845 4844
4846 list_for_each_entry(rdev, &mddev->disks, same_set) { 4845 rdev_for_each(rdev, mddev) {
4847 raid_disk = rdev->raid_disk; 4846 raid_disk = rdev->raid_disk;
4848 if (raid_disk >= max_disks 4847 if (raid_disk >= max_disks
4849 || raid_disk < 0) 4848 || raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
5178 blk_queue_io_opt(mddev->queue, chunk_size * 5177 blk_queue_io_opt(mddev->queue, chunk_size *
5179 (conf->raid_disks - conf->max_degraded)); 5178 (conf->raid_disks - conf->max_degraded));
5180 5179
5181 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5182 disk_stack_limits(mddev->gendisk, rdev->bdev, 5181 disk_stack_limits(mddev->gendisk, rdev->bdev,
5183 rdev->data_offset << 9); 5182 rdev->data_offset << 9);
5184 } 5183 }
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5362 if (mddev->recovery_disabled == conf->recovery_disabled) 5361 if (mddev->recovery_disabled == conf->recovery_disabled)
5363 return -EBUSY; 5362 return -EBUSY;
5364 5363
5365 if (has_failed(conf)) 5364 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5366 /* no point adding a device */ 5365 /* no point adding a device */
5367 return -EINVAL; 5366 return -EINVAL;
5368 5367
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5501 if (!check_stripe_cache(mddev)) 5500 if (!check_stripe_cache(mddev))
5502 return -ENOSPC; 5501 return -ENOSPC;
5503 5502
5504 list_for_each_entry(rdev, &mddev->disks, same_set) 5503 rdev_for_each(rdev, mddev)
5505 if (!test_bit(In_sync, &rdev->flags) 5504 if (!test_bit(In_sync, &rdev->flags)
5506 && !test_bit(Faulty, &rdev->flags)) 5505 && !test_bit(Faulty, &rdev->flags))
5507 spares++; 5506 spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
5547 * such devices during the reshape and confusion could result. 5546 * such devices during the reshape and confusion could result.
5548 */ 5547 */
5549 if (mddev->delta_disks >= 0) { 5548 if (mddev->delta_disks >= 0) {
5550 int added_devices = 0; 5549 rdev_for_each(rdev, mddev)
5551 list_for_each_entry(rdev, &mddev->disks, same_set)
5552 if (rdev->raid_disk < 0 && 5550 if (rdev->raid_disk < 0 &&
5553 !test_bit(Faulty, &rdev->flags)) { 5551 !test_bit(Faulty, &rdev->flags)) {
5554 if (raid5_add_disk(mddev, rdev) == 0) { 5552 if (raid5_add_disk(mddev, rdev) == 0) {
5555 if (rdev->raid_disk 5553 if (rdev->raid_disk
5556 >= conf->previous_raid_disks) { 5554 >= conf->previous_raid_disks)
5557 set_bit(In_sync, &rdev->flags); 5555 set_bit(In_sync, &rdev->flags);
5558 added_devices++; 5556 else
5559 } else
5560 rdev->recovery_offset = 0; 5557 rdev->recovery_offset = 0;
5561 5558
5562 if (sysfs_link_rdev(mddev, rdev)) 5559 if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
5566 && !test_bit(Faulty, &rdev->flags)) { 5563 && !test_bit(Faulty, &rdev->flags)) {
5567 /* This is a spare that was manually added */ 5564 /* This is a spare that was manually added */
5568 set_bit(In_sync, &rdev->flags); 5565 set_bit(In_sync, &rdev->flags);
5569 added_devices++;
5570 } 5566 }
5571 5567
5572 /* When a reshape changes the number of devices, 5568 /* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5592 spin_lock_irq(&conf->device_lock); 5588 spin_lock_irq(&conf->device_lock);
5593 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5589 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5594 conf->reshape_progress = MaxSector; 5590 conf->reshape_progress = MaxSector;
5591 mddev->reshape_position = MaxSector;
5595 spin_unlock_irq(&conf->device_lock); 5592 spin_unlock_irq(&conf->device_lock);
5596 return -EAGAIN; 5593 return -EAGAIN;
5597 } 5594 }