author     Linus Torvalds <torvalds@linux-foundation.org>   2016-01-21 21:19:38 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-01-21 21:19:38 -0500
commit     641203549a21ba6a701aecd05c3dfc969ec670cc (patch)
tree       5e3d177c380ed811b5bf37e0bf9b8098416a9bc6
parent     404a47410c26a115123885977053e9a1a4460929 (diff)
parent     e93d12ae3be91d18b2a46deebb90a3f516db3d3c (diff)
Merge branch 'for-4.5/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "This is the block driver pull request for 4.5, with the exception of
  NVMe, which is in a separate branch and will be posted after this one.

  This pull request contains:

   - A set of bcache stability fixes, which have been acked by Kent.
     These have been used and tested for more than a year by the
     community, so it's about time that they got in.

   - A set of drbd updates from the drbd team (Andreas, Lars, Philipp)
     and Markus Elfring, Oleg Drokin.

   - A set of fixes for xen blkback/front from the usual suspects, (Bob,
     Konrad) as well as community based fixes from Kiri, Julien, and
     Peng.

   - A 2038 time fix for sx8 from Shraddha, with a fix from me.

   - A small mtip32xx cleanup from Zhu Yanjun.

   - A null_blk division fix from Arnd"

* 'for-4.5/drivers' of git://git.kernel.dk/linux-block: (71 commits)
  null_blk: use sector_div instead of do_div
  mtip32xx: restrict variables visible in current code module
  xen/blkfront: Fix crash if backend doesn't follow the right states.
  xen/blkback: Fix two memory leaks.
  xen/blkback: make st_ statistics per ring
  xen/blkfront: Handle non-indirect grant with 64KB pages
  xen-blkfront: Introduce blkif_ring_get_request
  xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()
  xen/blkback: Free resources if connect_ring failed.
  xen/blocks: Return -EXX instead of -1
  xen/blkback: make pool of persistent grants and free pages per-queue
  xen/blkback: get the number of hardware queues/rings from blkfront
  xen/blkback: pseudo support for multi hardware queues/rings
  xen/blkback: separate ring information out of struct xen_blkif
  xen/blkfront: correct setting for xen_blkif_max_ring_order
  xen/blkfront: make persistent grants pool per-queue
  xen/blkfront: Remove duplicate setting of ->xbdev.
  xen/blkfront: Cleanup of comments, fix unaligned variables, and syntax errors.
  xen/blkfront: negotiate number of queues/rings to be used with backend
  xen/blkfront: split per device io_lock
  ...
-rw-r--r--  MAINTAINERS                              |   11
-rw-r--r--  drivers/block/drbd/drbd_actlog.c         |  323
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c         |   22
-rw-r--r--  drivers/block/drbd/drbd_debugfs.c        |   10
-rw-r--r--  drivers/block/drbd/drbd_int.h            |  111
-rw-r--r--  drivers/block/drbd/drbd_main.c           |   74
-rw-r--r--  drivers/block/drbd/drbd_nl.c             | 1361
-rw-r--r--  drivers/block/drbd/drbd_proc.c           |    6
-rw-r--r--  drivers/block/drbd/drbd_protocol.h       |    2
-rw-r--r--  drivers/block/drbd/drbd_receiver.c       |  254
-rw-r--r--  drivers/block/drbd/drbd_req.c            |  147
-rw-r--r--  drivers/block/drbd/drbd_req.h            |   17
-rw-r--r--  drivers/block/drbd/drbd_state.c          |  428
-rw-r--r--  drivers/block/drbd/drbd_state.h          |    6
-rw-r--r--  drivers/block/drbd/drbd_state_change.h   |   63
-rw-r--r--  drivers/block/drbd/drbd_worker.c         |  105
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c        |    6
-rw-r--r--  drivers/block/null_blk.c                 |    8
-rw-r--r--  drivers/block/sx8.c                      |    7
-rw-r--r--  drivers/block/xen-blkback/blkback.c      |  391
-rw-r--r--  drivers/block/xen-blkback/common.h       |   86
-rw-r--r--  drivers/block/xen-blkback/xenbus.c       |  416
-rw-r--r--  drivers/block/xen-blkfront.c             | 1061
-rw-r--r--  drivers/md/bcache/btree.c                |    5
-rw-r--r--  drivers/md/bcache/super.c                |   16
-rw-r--r--  drivers/md/bcache/writeback.c            |   37
-rw-r--r--  drivers/md/bcache/writeback.h            |    3
-rw-r--r--  include/linux/drbd.h                     |   26
-rw-r--r--  include/linux/drbd_genl.h                |  149
-rw-r--r--  include/linux/idr.h                      |   14
-rw-r--r--  include/linux/lru_cache.h                |    2
-rw-r--r--  include/xen/interface/io/blkif.h         |   48
-rw-r--r--  lib/lru_cache.c                          |    4
33 files changed, 3893 insertions, 1326 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 45d2717760fc..b8a717c4f863 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt*
 F: drivers/scsi/dpt/
 
 DRBD DRIVER
-P: Philipp Reisner
-P: Lars Ellenberg
-M: drbd-dev@lists.linbit.com
-L: drbd-user@lists.linbit.com
+M: Philipp Reisner <philipp.reisner@linbit.com>
+M: Lars Ellenberg <lars.ellenberg@linbit.com>
+L: drbd-dev@lists.linbit.com
 W: http://www.drbd.org
-T: git git://git.drbd.org/linux-2.6-drbd.git drbd
-T: git git://git.drbd.org/drbd-8.3.git
+T: git git://git.linbit.com/linux-drbd.git
+T: git git://git.linbit.com/drbd-8.4.git
 S: Supported
 F: drivers/block/drbd/
 F: lib/lru_cache.c
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index b3868e7a1ffd..10459a145062 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
288 return need_transaction; 288 return need_transaction;
289} 289}
290 290
291static int al_write_transaction(struct drbd_device *device); 291#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
292/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
293 * are still coupled, or assume too much about their relation.
294 * Code below will not work if this is violated.
295 * Will be cleaned up with some followup patch.
296 */
297# error FIXME
298#endif
299
300static unsigned int al_extent_to_bm_page(unsigned int al_enr)
301{
302 return al_enr >>
303 /* bit to page */
304 ((PAGE_SHIFT + 3) -
305 /* al extent number to bit */
306 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
307}
308
309static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
310{
311 const unsigned int stripes = device->ldev->md.al_stripes;
312 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
313
314 /* transaction number, modulo on-disk ring buffer wrap around */
315 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
316
317 /* ... to aligned 4k on disk block */
318 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
319
320 /* ... to 512 byte sector in activity log */
321 t *= 8;
322
323 /* ... plus offset to the on disk position */
324 return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
325}
326
327static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
328{
329 struct lc_element *e;
330 sector_t sector;
331 int i, mx;
332 unsigned extent_nr;
333 unsigned crc = 0;
334 int err = 0;
335
336 memset(buffer, 0, sizeof(*buffer));
337 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
338 buffer->tr_number = cpu_to_be32(device->al_tr_number);
339
340 i = 0;
341
342 /* Even though no one can start to change this list
343 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
344 * lc_try_lock_for_transaction() --, someone may still
345 * be in the process of changing it. */
346 spin_lock_irq(&device->al_lock);
347 list_for_each_entry(e, &device->act_log->to_be_changed, list) {
348 if (i == AL_UPDATES_PER_TRANSACTION) {
349 i++;
350 break;
351 }
352 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
353 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
354 if (e->lc_number != LC_FREE)
355 drbd_bm_mark_for_writeout(device,
356 al_extent_to_bm_page(e->lc_number));
357 i++;
358 }
359 spin_unlock_irq(&device->al_lock);
360 BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
361
362 buffer->n_updates = cpu_to_be16(i);
363 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
364 buffer->update_slot_nr[i] = cpu_to_be16(-1);
365 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
366 }
367
368 buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
369 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
370
371 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
372 device->act_log->nr_elements - device->al_tr_cycle);
373 for (i = 0; i < mx; i++) {
374 unsigned idx = device->al_tr_cycle + i;
375 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
376 buffer->context[i] = cpu_to_be32(extent_nr);
377 }
378 for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
379 buffer->context[i] = cpu_to_be32(LC_FREE);
380
381 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
382 if (device->al_tr_cycle >= device->act_log->nr_elements)
383 device->al_tr_cycle = 0;
384
385 sector = al_tr_number_to_on_disk_sector(device);
386
387 crc = crc32c(0, buffer, 4096);
388 buffer->crc32c = cpu_to_be32(crc);
389
390 if (drbd_bm_write_hinted(device))
391 err = -EIO;
392 else {
393 bool write_al_updates;
394 rcu_read_lock();
395 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
396 rcu_read_unlock();
397 if (write_al_updates) {
398 if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
399 err = -EIO;
400 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
401 } else {
402 device->al_tr_number++;
403 device->al_writ_cnt++;
404 }
405 }
406 }
407
408 return err;
409}
410
411static int al_write_transaction(struct drbd_device *device)
412{
413 struct al_transaction_on_disk *buffer;
414 int err;
415
416 if (!get_ldev(device)) {
417 drbd_err(device, "disk is %s, cannot start al transaction\n",
418 drbd_disk_str(device->state.disk));
419 return -EIO;
420 }
421
422 /* The bitmap write may have failed, causing a state change. */
423 if (device->state.disk < D_INCONSISTENT) {
424 drbd_err(device,
425 "disk is %s, cannot write al transaction\n",
426 drbd_disk_str(device->state.disk));
427 put_ldev(device);
428 return -EIO;
429 }
430
431 /* protects md_io_buffer, al_tr_cycle, ... */
432 buffer = drbd_md_get_buffer(device, __func__);
433 if (!buffer) {
434 drbd_err(device, "disk failed while waiting for md_io buffer\n");
435 put_ldev(device);
436 return -ENODEV;
437 }
438
439 err = __al_write_transaction(device, buffer);
440
441 drbd_md_put_buffer(device);
442 put_ldev(device);
443
444 return err;
445}
446
292 447
293void drbd_al_begin_io_commit(struct drbd_device *device) 448void drbd_al_begin_io_commit(struct drbd_device *device)
294{ 449{
@@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
420 wake_up(&device->al_wait); 575 wake_up(&device->al_wait);
421} 576}
422 577
423#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
424/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
425 * are still coupled, or assume too much about their relation.
426 * Code below will not work if this is violated.
427 * Will be cleaned up with some followup patch.
428 */
429# error FIXME
430#endif
431
432static unsigned int al_extent_to_bm_page(unsigned int al_enr)
433{
434 return al_enr >>
435 /* bit to page */
436 ((PAGE_SHIFT + 3) -
437 /* al extent number to bit */
438 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
439}
440
441static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
442{
443 const unsigned int stripes = device->ldev->md.al_stripes;
444 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
445
446 /* transaction number, modulo on-disk ring buffer wrap around */
447 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
448
449 /* ... to aligned 4k on disk block */
450 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
451
452 /* ... to 512 byte sector in activity log */
453 t *= 8;
454
455 /* ... plus offset to the on disk position */
456 return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
457}
458
459int al_write_transaction(struct drbd_device *device)
460{
461 struct al_transaction_on_disk *buffer;
462 struct lc_element *e;
463 sector_t sector;
464 int i, mx;
465 unsigned extent_nr;
466 unsigned crc = 0;
467 int err = 0;
468
469 if (!get_ldev(device)) {
470 drbd_err(device, "disk is %s, cannot start al transaction\n",
471 drbd_disk_str(device->state.disk));
472 return -EIO;
473 }
474
475 /* The bitmap write may have failed, causing a state change. */
476 if (device->state.disk < D_INCONSISTENT) {
477 drbd_err(device,
478 "disk is %s, cannot write al transaction\n",
479 drbd_disk_str(device->state.disk));
480 put_ldev(device);
481 return -EIO;
482 }
483
484 /* protects md_io_buffer, al_tr_cycle, ... */
485 buffer = drbd_md_get_buffer(device, __func__);
486 if (!buffer) {
487 drbd_err(device, "disk failed while waiting for md_io buffer\n");
488 put_ldev(device);
489 return -ENODEV;
490 }
491
492 memset(buffer, 0, sizeof(*buffer));
493 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
494 buffer->tr_number = cpu_to_be32(device->al_tr_number);
495
496 i = 0;
497
498 /* Even though no one can start to change this list
499 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
500 * lc_try_lock_for_transaction() --, someone may still
501 * be in the process of changing it. */
502 spin_lock_irq(&device->al_lock);
503 list_for_each_entry(e, &device->act_log->to_be_changed, list) {
504 if (i == AL_UPDATES_PER_TRANSACTION) {
505 i++;
506 break;
507 }
508 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
509 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
510 if (e->lc_number != LC_FREE)
511 drbd_bm_mark_for_writeout(device,
512 al_extent_to_bm_page(e->lc_number));
513 i++;
514 }
515 spin_unlock_irq(&device->al_lock);
516 BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
517
518 buffer->n_updates = cpu_to_be16(i);
519 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
520 buffer->update_slot_nr[i] = cpu_to_be16(-1);
521 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
522 }
523
524 buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
525 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
526
527 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
528 device->act_log->nr_elements - device->al_tr_cycle);
529 for (i = 0; i < mx; i++) {
530 unsigned idx = device->al_tr_cycle + i;
531 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
532 buffer->context[i] = cpu_to_be32(extent_nr);
533 }
534 for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
535 buffer->context[i] = cpu_to_be32(LC_FREE);
536
537 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
538 if (device->al_tr_cycle >= device->act_log->nr_elements)
539 device->al_tr_cycle = 0;
540
541 sector = al_tr_number_to_on_disk_sector(device);
542
543 crc = crc32c(0, buffer, 4096);
544 buffer->crc32c = cpu_to_be32(crc);
545
546 if (drbd_bm_write_hinted(device))
547 err = -EIO;
548 else {
549 bool write_al_updates;
550 rcu_read_lock();
551 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
552 rcu_read_unlock();
553 if (write_al_updates) {
554 if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
555 err = -EIO;
556 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
557 } else {
558 device->al_tr_number++;
559 device->al_writ_cnt++;
560 }
561 }
562 }
563
564 drbd_md_put_buffer(device);
565 put_ldev(device);
566
567 return err;
568}
569
570static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 578static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
571{ 579{
572 int rv; 580 int rv;
@@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
606 wake_up(&device->al_wait); 614 wake_up(&device->al_wait);
607} 615}
608 616
609int drbd_initialize_al(struct drbd_device *device, void *buffer) 617int drbd_al_initialize(struct drbd_device *device, void *buffer)
610{ 618{
611 struct al_transaction_on_disk *al = buffer; 619 struct al_transaction_on_disk *al = buffer;
612 struct drbd_md *md = &device->ldev->md; 620 struct drbd_md *md = &device->ldev->md;
613 sector_t al_base = md->md_offset + md->al_offset;
614 int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 621 int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
615 int i; 622 int i;
616 623
617 memset(al, 0, 4096); 624 __al_write_transaction(device, al);
618 al->magic = cpu_to_be32(DRBD_AL_MAGIC); 625 /* There may or may not have been a pending transaction. */
619 al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); 626 spin_lock_irq(&device->al_lock);
620 al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); 627 lc_committed(device->act_log);
628 spin_unlock_irq(&device->al_lock);
621 629
622 for (i = 0; i < al_size_4k; i++) { 630 /* The rest of the transactions will have an empty "updates" list, and
623 int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); 631 * are written out only to provide the context, and to initialize the
632 * on-disk ring buffer. */
633 for (i = 1; i < al_size_4k; i++) {
634 int err = __al_write_transaction(device, al);
624 if (err) 635 if (err)
625 return err; 636 return err;
626 } 637 }
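
The new al_tr_number_to_on_disk_sector() above maps a transaction number onto the striped on-disk activity-log ring buffer. The arithmetic is self-contained and easy to sanity-check outside the kernel; here is a minimal userspace sketch of the same placement math, using made-up geometry values (4 stripes of 16 4k blocks, arbitrary offsets) rather than anything read from DRBD meta data:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the placement math in al_tr_number_to_on_disk_sector(),
 * with invented geometry: 4 stripes of 16 4k-blocks each (64 slots),
 * and arbitrary metadata/AL offsets in 512-byte sectors. */
static uint64_t al_slot_to_sector(unsigned int tr_number,
				  unsigned int stripes,
				  unsigned int stripe_size_4k,
				  uint64_t md_offset, int64_t al_offset)
{
	unsigned int al_size_4k = stripes * stripe_size_4k;
	unsigned int t = tr_number % al_size_4k;          /* ring buffer wrap */

	t = (t % stripes) * stripe_size_4k + t / stripes; /* round-robin over stripes */
	return md_offset + al_offset + (uint64_t)t * 8;   /* 4k block -> 512b sector */
}

int main(void)
{
	/* consecutive transactions land on different stripes */
	for (unsigned int tr = 0; tr < 6; tr++)
		printf("transaction %u -> sector %llu\n", tr,
		       (unsigned long long)al_slot_to_sector(tr, 4, 16, 1024, 8));
	return 0;
}
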
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 9462d2752850..0dabc9b93725 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -24,7 +24,7 @@
24 24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26 26
27#include <linux/bitops.h> 27#include <linux/bitmap.h>
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/drbd.h> 30#include <linux/drbd.h>
@@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device)
479 * this masks out the remaining bits. 479 * this masks out the remaining bits.
480 * Returns the number of bits cleared. 480 * Returns the number of bits cleared.
481 */ 481 */
482#ifndef BITS_PER_PAGE
482#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) 483#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
483#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) 484#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
485#else
486# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
487# error "ambiguous BITS_PER_PAGE"
488# endif
489#endif
484#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) 490#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
485static int bm_clear_surplus(struct drbd_bitmap *b) 491static int bm_clear_surplus(struct drbd_bitmap *b)
486{ 492{
@@ -559,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
559 unsigned long *p_addr; 565 unsigned long *p_addr;
560 unsigned long bits = 0; 566 unsigned long bits = 0;
561 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; 567 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
562 int idx, i, last_word; 568 int idx, last_word;
563 569
564 /* all but last page */ 570 /* all but last page */
565 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { 571 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
566 p_addr = __bm_map_pidx(b, idx); 572 p_addr = __bm_map_pidx(b, idx);
567 for (i = 0; i < LWPP; i++) 573 bits += bitmap_weight(p_addr, BITS_PER_PAGE);
568 bits += hweight_long(p_addr[i]);
569 __bm_unmap(p_addr); 574 __bm_unmap(p_addr);
570 cond_resched(); 575 cond_resched();
571 } 576 }
572 /* last (or only) page */ 577 /* last (or only) page */
573 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; 578 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
574 p_addr = __bm_map_pidx(b, idx); 579 p_addr = __bm_map_pidx(b, idx);
575 for (i = 0; i < last_word; i++) 580 bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
576 bits += hweight_long(p_addr[i]);
577 p_addr[last_word] &= cpu_to_lel(mask); 581 p_addr[last_word] &= cpu_to_lel(mask);
578 bits += hweight_long(p_addr[last_word]); 582 bits += hweight_long(p_addr[last_word]);
579 /* 32bit arch, may have an unused padding long */ 583 /* 32bit arch, may have an unused padding long */
@@ -1419,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1419 int bits; 1423 int bits;
1420 int changed = 0; 1424 int changed = 0;
1421 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); 1425 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
1426
1427 /* I think it is more cache line friendly to hweight_long then set to ~0UL,
1428 * than to first bitmap_weight() all words, then bitmap_fill() all words */
1422 for (i = first_word; i < last_word; i++) { 1429 for (i = first_word; i < last_word; i++) {
1423 bits = hweight_long(paddr[i]); 1430 bits = hweight_long(paddr[i]);
1424 paddr[i] = ~0UL; 1431 paddr[i] = ~0UL;
@@ -1628,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
1628 int n = e-s; 1635 int n = e-s;
1629 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); 1636 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1630 bm = p_addr + MLPP(s); 1637 bm = p_addr + MLPP(s);
1631 while (n--) 1638 count += bitmap_weight(bm, n * BITS_PER_LONG);
1632 count += hweight_long(*bm++);
1633 bm_unmap(p_addr); 1639 bm_unmap(p_addr);
1634 } else { 1640 } else {
1635 drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); 1641 drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
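
The bm_count_bits() and drbd_bm_e_weight() changes above replace open-coded hweight_long() loops with bitmap_weight(), which counts set bits over a region given in bits rather than words. A rough userspace equivalent of that population count, using a compiler builtin instead of the kernel helper (illustrative only, not the kernel implementation):

#include <stdio.h>

/* Count set bits in the first nbits of an unsigned-long array,
 * roughly what bitmap_weight() does in the kernel. */
static unsigned long weight(const unsigned long *map, unsigned int nbits)
{
	const unsigned int bpl = 8 * sizeof(unsigned long);
	unsigned long bits = 0;
	unsigned int i;

	for (i = 0; i < nbits / bpl; i++)
		bits += (unsigned long)__builtin_popcountl(map[i]);
	if (nbits % bpl)	/* partial last word: mask off the unused high bits */
		bits += (unsigned long)__builtin_popcountl(
			map[i] & ((1UL << (nbits % bpl)) - 1));
	return bits;
}

int main(void)
{
	unsigned long map[3] = { 0xF0F0F0F0UL, 0x3UL, 0UL };
	/* prints 18 for both 32-bit and 64-bit unsigned long */
	printf("%lu bits set in the first 66 bits\n", weight(map, 66));
	return 0;
}
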
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 6b88a35fb048..96a0107a72ea 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored)
771 return 0; 771 return 0;
772} 772}
773 773
774static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
775{
776 struct drbd_device *device = m->private;
777 seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
778 return 0;
779}
780
774#define drbd_debugfs_device_attr(name) \ 781#define drbd_debugfs_device_attr(name) \
775static int device_ ## name ## _open(struct inode *inode, struct file *file) \ 782static int device_ ## name ## _open(struct inode *inode, struct file *file) \
776{ \ 783{ \
@@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests)
796drbd_debugfs_device_attr(act_log_extents) 803drbd_debugfs_device_attr(act_log_extents)
797drbd_debugfs_device_attr(resync_extents) 804drbd_debugfs_device_attr(resync_extents)
798drbd_debugfs_device_attr(data_gen_id) 805drbd_debugfs_device_attr(data_gen_id)
806drbd_debugfs_device_attr(ed_gen_id)
799 807
800void drbd_debugfs_device_add(struct drbd_device *device) 808void drbd_debugfs_device_add(struct drbd_device *device)
801{ 809{
@@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
839 DCF(act_log_extents); 847 DCF(act_log_extents);
840 DCF(resync_extents); 848 DCF(resync_extents);
841 DCF(data_gen_id); 849 DCF(data_gen_id);
850 DCF(ed_gen_id);
842#undef DCF 851#undef DCF
843 return; 852 return;
844 853
@@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device)
854 drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); 863 drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
855 drbd_debugfs_remove(&device->debugfs_vol_resync_extents); 864 drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
856 drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); 865 drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
866 drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
857 drbd_debugfs_remove(&device->debugfs_vol); 867 drbd_debugfs_remove(&device->debugfs_vol);
858} 868}
859 869
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e66d453a5f2b..b6844feb9f9b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -77,13 +77,6 @@ extern int fault_devs;
77extern char usermode_helper[]; 77extern char usermode_helper[];
78 78
79 79
80/* I don't remember why XCPU ...
81 * This is used to wake the asender,
82 * and to interrupt sending the sending task
83 * on disconnect.
84 */
85#define DRBD_SIG SIGXCPU
86
87/* This is used to stop/restart our threads. 80/* This is used to stop/restart our threads.
88 * Cannot use SIGTERM nor SIGKILL, since these 81 * Cannot use SIGTERM nor SIGKILL, since these
89 * are sent out by init on runlevel changes 82 * are sent out by init on runlevel changes
@@ -292,6 +285,9 @@ struct drbd_device_work {
292 285
293extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); 286extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
294 287
288extern void lock_all_resources(void);
289extern void unlock_all_resources(void);
290
295struct drbd_request { 291struct drbd_request {
296 struct drbd_work w; 292 struct drbd_work w;
297 struct drbd_device *device; 293 struct drbd_device *device;
@@ -504,7 +500,6 @@ enum {
504 500
505 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 501 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
506 502
507 SUSPEND_IO, /* suspend application io */
508 BITMAP_IO, /* suspend application io; 503 BITMAP_IO, /* suspend application io;
509 once no more io in flight, start bitmap io */ 504 once no more io in flight, start bitmap io */
510 BITMAP_IO_QUEUED, /* Started bitmap IO */ 505 BITMAP_IO_QUEUED, /* Started bitmap IO */
@@ -632,12 +627,6 @@ struct bm_io_work {
632 void (*done)(struct drbd_device *device, int rv); 627 void (*done)(struct drbd_device *device, int rv);
633}; 628};
634 629
635enum write_ordering_e {
636 WO_none,
637 WO_drain_io,
638 WO_bdev_flush,
639};
640
641struct fifo_buffer { 630struct fifo_buffer {
642 unsigned int head_index; 631 unsigned int head_index;
643 unsigned int size; 632 unsigned int size;
@@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size);
650enum { 639enum {
651 NET_CONGESTED, /* The data socket is congested */ 640 NET_CONGESTED, /* The data socket is congested */
652 RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ 641 RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
653 SEND_PING, /* whether asender should send a ping asap */ 642 SEND_PING,
654 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
655 GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ 643 GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
656 CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ 644 CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
657 CONN_WD_ST_CHG_OKAY, 645 CONN_WD_ST_CHG_OKAY,
@@ -670,6 +658,8 @@ enum {
670 DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ 658 DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
671}; 659};
672 660
661enum which_state { NOW, OLD = NOW, NEW };
662
673struct drbd_resource { 663struct drbd_resource {
674 char *name; 664 char *name;
675#ifdef CONFIG_DEBUG_FS 665#ifdef CONFIG_DEBUG_FS
@@ -755,7 +745,8 @@ struct drbd_connection {
755 unsigned long last_reconnect_jif; 745 unsigned long last_reconnect_jif;
756 struct drbd_thread receiver; 746 struct drbd_thread receiver;
757 struct drbd_thread worker; 747 struct drbd_thread worker;
758 struct drbd_thread asender; 748 struct drbd_thread ack_receiver;
749 struct workqueue_struct *ack_sender;
759 750
760 /* cached pointers, 751 /* cached pointers,
761 * so we can look up the oldest pending requests more quickly. 752 * so we can look up the oldest pending requests more quickly.
@@ -774,6 +765,8 @@ struct drbd_connection {
774 struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; 765 struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
775 766
776 struct { 767 struct {
768 unsigned long last_sent_barrier_jif;
769
777 /* whether this sender thread 770 /* whether this sender thread
778 * has processed a single write yet. */ 771 * has processed a single write yet. */
779 bool seen_any_write_yet; 772 bool seen_any_write_yet;
@@ -788,6 +781,17 @@ struct drbd_connection {
788 } send; 781 } send;
789}; 782};
790 783
784static inline bool has_net_conf(struct drbd_connection *connection)
785{
786 bool has_net_conf;
787
788 rcu_read_lock();
789 has_net_conf = rcu_dereference(connection->net_conf);
790 rcu_read_unlock();
791
792 return has_net_conf;
793}
794
791void __update_timing_details( 795void __update_timing_details(
792 struct drbd_thread_timing_details *tdp, 796 struct drbd_thread_timing_details *tdp,
793 unsigned int *cb_nr, 797 unsigned int *cb_nr,
@@ -811,6 +815,7 @@ struct drbd_peer_device {
811 struct list_head peer_devices; 815 struct list_head peer_devices;
812 struct drbd_device *device; 816 struct drbd_device *device;
813 struct drbd_connection *connection; 817 struct drbd_connection *connection;
818 struct work_struct send_acks_work;
814#ifdef CONFIG_DEBUG_FS 819#ifdef CONFIG_DEBUG_FS
815 struct dentry *debugfs_peer_dev; 820 struct dentry *debugfs_peer_dev;
816#endif 821#endif
@@ -829,6 +834,7 @@ struct drbd_device {
829 struct dentry *debugfs_vol_act_log_extents; 834 struct dentry *debugfs_vol_act_log_extents;
830 struct dentry *debugfs_vol_resync_extents; 835 struct dentry *debugfs_vol_resync_extents;
831 struct dentry *debugfs_vol_data_gen_id; 836 struct dentry *debugfs_vol_data_gen_id;
837 struct dentry *debugfs_vol_ed_gen_id;
832#endif 838#endif
833 839
834 unsigned int vnr; /* volume number within the connection */ 840 unsigned int vnr; /* volume number within the connection */
@@ -873,6 +879,7 @@ struct drbd_device {
873 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 879 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
874 atomic_t unacked_cnt; /* Need to send replies for */ 880 atomic_t unacked_cnt; /* Need to send replies for */
875 atomic_t local_cnt; /* Waiting for local completion */ 881 atomic_t local_cnt; /* Waiting for local completion */
882 atomic_t suspend_cnt;
876 883
877 /* Interval tree of pending local requests */ 884 /* Interval tree of pending local requests */
878 struct rb_root read_requests; 885 struct rb_root read_requests;
@@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev
1020 return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); 1027 return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
1021} 1028}
1022 1029
1030static inline struct drbd_peer_device *
1031conn_peer_device(struct drbd_connection *connection, int volume_number)
1032{
1033 return idr_find(&connection->peer_devices, volume_number);
1034}
1035
1023#define for_each_resource(resource, _resources) \ 1036#define for_each_resource(resource, _resources) \
1024 list_for_each_entry(resource, _resources, resources) 1037 list_for_each_entry(resource, _resources, resources)
1025 1038
@@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
1113extern int drbd_send_bitmap(struct drbd_device *device); 1126extern int drbd_send_bitmap(struct drbd_device *device);
1114extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); 1127extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
1115extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); 1128extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
1116extern void drbd_free_ldev(struct drbd_backing_dev *ldev); 1129extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
1117extern void drbd_device_cleanup(struct drbd_device *device); 1130extern void drbd_device_cleanup(struct drbd_device *device);
1118void drbd_print_uuids(struct drbd_device *device, const char *text); 1131void drbd_print_uuids(struct drbd_device *device, const char *text);
1119 1132
@@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set;
1424/* to allocate from that set */ 1437/* to allocate from that set */
1425extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); 1438extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
1426 1439
1427extern rwlock_t global_state_lock; 1440extern struct mutex resources_mutex;
1428 1441
1429extern int conn_lowest_minor(struct drbd_connection *connection); 1442extern int conn_lowest_minor(struct drbd_connection *connection);
1430extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); 1443extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
@@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1454 1467
1455 1468
1456/* drbd_nl.c */ 1469/* drbd_nl.c */
1470
1471extern struct mutex notification_mutex;
1472
1457extern void drbd_suspend_io(struct drbd_device *device); 1473extern void drbd_suspend_io(struct drbd_device *device);
1458extern void drbd_resume_io(struct drbd_device *device); 1474extern void drbd_resume_io(struct drbd_device *device);
1459extern char *ppsize(char *buf, unsigned long long size); 1475extern char *ppsize(char *buf, unsigned long long size);
@@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
1536 1552
1537/* drbd_receiver.c */ 1553/* drbd_receiver.c */
1538extern int drbd_receiver(struct drbd_thread *thi); 1554extern int drbd_receiver(struct drbd_thread *thi);
1539extern int drbd_asender(struct drbd_thread *thi); 1555extern int drbd_ack_receiver(struct drbd_thread *thi);
1556extern void drbd_send_ping_wf(struct work_struct *ws);
1557extern void drbd_send_acks_wf(struct work_struct *ws);
1540extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1558extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
1541extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 1559extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
1542 bool throttle_if_app_is_waiting); 1560 bool throttle_if_app_is_waiting);
@@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
1649#define drbd_rs_failed_io(device, sector, size) \ 1667#define drbd_rs_failed_io(device, sector, size) \
1650 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) 1668 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
1651extern void drbd_al_shrink(struct drbd_device *device); 1669extern void drbd_al_shrink(struct drbd_device *device);
1652extern int drbd_initialize_al(struct drbd_device *, void *); 1670extern int drbd_al_initialize(struct drbd_device *, void *);
1653 1671
1654/* drbd_nl.c */ 1672/* drbd_nl.c */
1655/* state info broadcast */ 1673/* state info broadcast */
@@ -1668,6 +1686,29 @@ struct sib_info {
1668}; 1686};
1669void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); 1687void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
1670 1688
1689extern void notify_resource_state(struct sk_buff *,
1690 unsigned int,
1691 struct drbd_resource *,
1692 struct resource_info *,
1693 enum drbd_notification_type);
1694extern void notify_device_state(struct sk_buff *,
1695 unsigned int,
1696 struct drbd_device *,
1697 struct device_info *,
1698 enum drbd_notification_type);
1699extern void notify_connection_state(struct sk_buff *,
1700 unsigned int,
1701 struct drbd_connection *,
1702 struct connection_info *,
1703 enum drbd_notification_type);
1704extern void notify_peer_device_state(struct sk_buff *,
1705 unsigned int,
1706 struct drbd_peer_device *,
1707 struct peer_device_info *,
1708 enum drbd_notification_type);
1709extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
1710 struct drbd_connection *, const char *, int);
1711
1671/* 1712/*
1672 * inline helper functions 1713 * inline helper functions
1673 *************************/ 1714 *************************/
@@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r
1694 return 0; 1735 return 0;
1695} 1736}
1696 1737
1697static inline enum drbd_state_rv
1698_drbd_set_state(struct drbd_device *device, union drbd_state ns,
1699 enum chg_state_flags flags, struct completion *done)
1700{
1701 enum drbd_state_rv rv;
1702
1703 read_lock(&global_state_lock);
1704 rv = __drbd_set_state(device, ns, flags, done);
1705 read_unlock(&global_state_lock);
1706
1707 return rv;
1708}
1709
1710static inline union drbd_state drbd_read_state(struct drbd_device *device) 1738static inline union drbd_state drbd_read_state(struct drbd_device *device)
1711{ 1739{
1712 struct drbd_resource *resource = device->resource; 1740 struct drbd_resource *resource = device->resource;
@@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit)
1937 1965
1938extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); 1966extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
1939 1967
1940static inline void wake_asender(struct drbd_connection *connection) 1968/* To get the ack_receiver out of the blocking network stack,
1969 * so it can change its sk_rcvtimeo from idle- to ping-timeout,
1970 * and send a ping, we need to send a signal.
1971 * Which signal we send is irrelevant. */
1972static inline void wake_ack_receiver(struct drbd_connection *connection)
1941{ 1973{
1942 if (test_bit(SIGNAL_ASENDER, &connection->flags)) 1974 struct task_struct *task = connection->ack_receiver.task;
1943 force_sig(DRBD_SIG, connection->asender.task); 1975 if (task && get_t_state(&connection->ack_receiver) == RUNNING)
1976 force_sig(SIGXCPU, task);
1944} 1977}
1945 1978
1946static inline void request_ping(struct drbd_connection *connection) 1979static inline void request_ping(struct drbd_connection *connection)
1947{ 1980{
1948 set_bit(SEND_PING, &connection->flags); 1981 set_bit(SEND_PING, &connection->flags);
1949 wake_asender(connection); 1982 wake_ack_receiver(connection);
1950} 1983}
1951 1984
1952extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); 1985extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
@@ -2230,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
2230 2263
2231 if (drbd_suspended(device)) 2264 if (drbd_suspended(device))
2232 return false; 2265 return false;
2233 if (test_bit(SUSPEND_IO, &device->flags)) 2266 if (atomic_read(&device->suspend_cnt))
2234 return false; 2267 return false;
2235 2268
2236 /* to avoid potential deadlock or bitmap corruption, 2269 /* to avoid potential deadlock or bitmap corruption,
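
Several hunks above replace the single SUSPEND_IO flag with an atomic suspend_cnt, so that multiple threads can independently suspend and resume IO without one resume clearing another caller's suspension (the drbd_nl.c hunk further below adds a comment to that effect). The counting pattern itself, sketched with C11 atomics in userspace as an analogue, not the DRBD code:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int suspend_cnt;

static void suspend_io(void)  { atomic_fetch_add(&suspend_cnt, 1); }

/* Only the final resume actually re-enables IO (the kernel code also
 * wakes waiters at that point). */
static bool resume_io(void)   { return atomic_fetch_sub(&suspend_cnt, 1) == 1; }

static bool may_start_io(void) { return atomic_load(&suspend_cnt) == 0; }

int main(void)
{
	suspend_io();                                   /* e.g. a resize in progress */
	suspend_io();                                   /* e.g. a detach in progress */
	printf("io allowed: %d\n", may_start_io());     /* 0 */
	resume_io();
	printf("io allowed: %d\n", may_start_io());     /* still 0 */
	printf("last resume: %d\n", resume_io());       /* 1: IO may continue */
	return 0;
}
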
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 74d97f4bac34..5b43dfb79819 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
117 */ 117 */
118struct idr drbd_devices; 118struct idr drbd_devices;
119struct list_head drbd_resources; 119struct list_head drbd_resources;
120struct mutex resources_mutex;
120 121
121struct kmem_cache *drbd_request_cache; 122struct kmem_cache *drbd_request_cache;
122struct kmem_cache *drbd_ee_cache; /* peer requests */ 123struct kmem_cache *drbd_ee_cache; /* peer requests */
@@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str
1435 /* long elapsed = (long)(jiffies - device->last_received); */ 1436 /* long elapsed = (long)(jiffies - device->last_received); */
1436 1437
1437 drop_it = connection->meta.socket == sock 1438 drop_it = connection->meta.socket == sock
1438 || !connection->asender.task 1439 || !connection->ack_receiver.task
1439 || get_t_state(&connection->asender) != RUNNING 1440 || get_t_state(&connection->ack_receiver) != RUNNING
1440 || connection->cstate < C_WF_REPORT_PARAMS; 1441 || connection->cstate < C_WF_REPORT_PARAMS;
1441 1442
1442 if (drop_it) 1443 if (drop_it)
@@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock,
1793 drbd_update_congested(connection); 1794 drbd_update_congested(connection);
1794 } 1795 }
1795 do { 1796 do {
1796 /* STRANGE
1797 * tcp_sendmsg does _not_ use its size parameter at all ?
1798 *
1799 * -EAGAIN on timeout, -EINTR on signal.
1800 */
1801/* THINK
1802 * do we need to block DRBD_SIG if sock == &meta.socket ??
1803 * otherwise wake_asender() might interrupt some send_*Ack !
1804 */
1805 rv = kernel_sendmsg(sock, &msg, &iov, 1, size); 1797 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1806 if (rv == -EAGAIN) { 1798 if (rv == -EAGAIN) {
1807 if (we_should_drop_the_connection(connection, sock)) 1799 if (we_should_drop_the_connection(connection, sock))
@@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
2000 drbd_bm_cleanup(device); 1992 drbd_bm_cleanup(device);
2001 } 1993 }
2002 1994
2003 drbd_free_ldev(device->ldev); 1995 drbd_backing_dev_free(device, device->ldev);
2004 device->ldev = NULL; 1996 device->ldev = NULL;
2005 1997
2006 clear_bit(AL_SUSPENDED, &device->flags); 1998 clear_bit(AL_SUSPENDED, &device->flags);
@@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref)
2179 if (device->this_bdev) 2171 if (device->this_bdev)
2180 bdput(device->this_bdev); 2172 bdput(device->this_bdev);
2181 2173
2182 drbd_free_ldev(device->ldev); 2174 drbd_backing_dev_free(device, device->ldev);
2183 device->ldev = NULL; 2175 device->ldev = NULL;
2184 2176
2185 drbd_release_all_peer_reqs(device); 2177 drbd_release_all_peer_reqs(device);
@@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
2563 cpumask_copy(resource->cpu_mask, new_cpu_mask); 2555 cpumask_copy(resource->cpu_mask, new_cpu_mask);
2564 for_each_connection_rcu(connection, resource) { 2556 for_each_connection_rcu(connection, resource) {
2565 connection->receiver.reset_cpu_mask = 1; 2557 connection->receiver.reset_cpu_mask = 1;
2566 connection->asender.reset_cpu_mask = 1; 2558 connection->ack_receiver.reset_cpu_mask = 1;
2567 connection->worker.reset_cpu_mask = 1; 2559 connection->worker.reset_cpu_mask = 1;
2568 } 2560 }
2569 } 2561 }
@@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
2590 kref_init(&resource->kref); 2582 kref_init(&resource->kref);
2591 idr_init(&resource->devices); 2583 idr_init(&resource->devices);
2592 INIT_LIST_HEAD(&resource->connections); 2584 INIT_LIST_HEAD(&resource->connections);
2593 resource->write_ordering = WO_bdev_flush; 2585 resource->write_ordering = WO_BDEV_FLUSH;
2594 list_add_tail_rcu(&resource->resources, &drbd_resources); 2586 list_add_tail_rcu(&resource->resources, &drbd_resources);
2595 mutex_init(&resource->conf_update); 2587 mutex_init(&resource->conf_update);
2596 mutex_init(&resource->adm_mutex); 2588 mutex_init(&resource->adm_mutex);
@@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2652 connection->receiver.connection = connection; 2644 connection->receiver.connection = connection;
2653 drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); 2645 drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
2654 connection->worker.connection = connection; 2646 connection->worker.connection = connection;
2655 drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); 2647 drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
2656 connection->asender.connection = connection; 2648 connection->ack_receiver.connection = connection;
2657 2649
2658 kref_init(&connection->kref); 2650 kref_init(&connection->kref);
2659 2651
@@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device)
2702{ 2694{
2703 /* opencoded create_singlethread_workqueue(), 2695 /* opencoded create_singlethread_workqueue(),
2704 * to be able to say "drbd%d", ..., minor */ 2696 * to be able to say "drbd%d", ..., minor */
2705 device->submit.wq = alloc_workqueue("drbd%u_submit", 2697 device->submit.wq =
2706 WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); 2698 alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
2707 if (!device->submit.wq) 2699 if (!device->submit.wq)
2708 return -ENOMEM; 2700 return -ENOMEM;
2709 2701
@@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2820 goto out_idr_remove_from_resource; 2812 goto out_idr_remove_from_resource;
2821 } 2813 }
2822 kref_get(&connection->kref); 2814 kref_get(&connection->kref);
2815 INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
2823 } 2816 }
2824 2817
2825 if (init_submitter(device)) { 2818 if (init_submitter(device)) {
@@ -2923,7 +2916,7 @@ static int __init drbd_init(void)
2923 drbd_proc = NULL; /* play safe for drbd_cleanup */ 2916 drbd_proc = NULL; /* play safe for drbd_cleanup */
2924 idr_init(&drbd_devices); 2917 idr_init(&drbd_devices);
2925 2918
2926 rwlock_init(&global_state_lock); 2919 mutex_init(&resources_mutex);
2927 INIT_LIST_HEAD(&drbd_resources); 2920 INIT_LIST_HEAD(&drbd_resources);
2928 2921
2929 err = drbd_genl_register(); 2922 err = drbd_genl_register();
@@ -2971,18 +2964,6 @@ fail:
2971 return err; 2964 return err;
2972} 2965}
2973 2966
2974void drbd_free_ldev(struct drbd_backing_dev *ldev)
2975{
2976 if (ldev == NULL)
2977 return;
2978
2979 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2980 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2981
2982 kfree(ldev->disk_conf);
2983 kfree(ldev);
2984}
2985
2986static void drbd_free_one_sock(struct drbd_socket *ds) 2967static void drbd_free_one_sock(struct drbd_socket *ds)
2987{ 2968{
2988 struct socket *s; 2969 struct socket *s;
@@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
3277 * and read it. */ 3258 * and read it. */
3278 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; 3259 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3279 bdev->md.md_offset = drbd_md_ss(bdev); 3260 bdev->md.md_offset = drbd_md_ss(bdev);
3261 /* Even for (flexible or indexed) external meta data,
3262 * initially restrict us to the 4k superblock for now.
3263 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
3264 bdev->md.md_size_sect = 8;
3280 3265
3281 if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { 3266 if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
3282 /* NOTE: can't do normal error processing here as this is 3267 /* NOTE: can't do normal error processing here as this is
@@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
3578 3563
3579 spin_lock_irq(&device->resource->req_lock); 3564 spin_lock_irq(&device->resource->req_lock);
3580 set_bit(BITMAP_IO, &device->flags); 3565 set_bit(BITMAP_IO, &device->flags);
3581 if (atomic_read(&device->ap_bio_cnt) == 0) { 3566 /* don't wait for pending application IO if the caller indicates that
3567 * application IO does not conflict anyways. */
3568 if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
3582 if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) 3569 if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
3583 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 3570 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
3584 &device->bm_io_work.w); 3571 &device->bm_io_work.w);
@@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
3746 return 0; 3733 return 0;
3747} 3734}
3748 3735
3736void lock_all_resources(void)
3737{
3738 struct drbd_resource *resource;
3739 int __maybe_unused i = 0;
3740
3741 mutex_lock(&resources_mutex);
3742 local_irq_disable();
3743 for_each_resource(resource, &drbd_resources)
3744 spin_lock_nested(&resource->req_lock, i++);
3745}
3746
3747void unlock_all_resources(void)
3748{
3749 struct drbd_resource *resource;
3750
3751 for_each_resource(resource, &drbd_resources)
3752 spin_unlock(&resource->req_lock);
3753 local_irq_enable();
3754 mutex_unlock(&resources_mutex);
3755}
3756
3749#ifdef CONFIG_DRBD_FAULT_INJECTION 3757#ifdef CONFIG_DRBD_FAULT_INJECTION
3750/* Fault insertion support including random number generator shamelessly 3758/* Fault insertion support including random number generator shamelessly
3751 * stolen from kernel/rcutorture.c */ 3759 * stolen from kernel/rcutorture.c */
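
The lock_all_resources()/unlock_all_resources() helpers added above take every resource's req_lock in list order while holding resources_mutex (with interrupts disabled and lockdep nesting annotations); acquiring the locks in one fixed order under an outer mutex is what keeps the bulk acquisition deadlock-free. The same idea in a userspace sketch with pthreads, using a hypothetical fixed-size resource array rather than the kernel primitives:

#include <pthread.h>
#include <stdio.h>

#define NR_RESOURCES 3

static pthread_mutex_t resources_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t req_lock[NR_RESOURCES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Take all per-resource locks in one fixed (list) order, guarded by an
 * outer mutex so two "lock all" callers cannot interleave. */
static void lock_all_resources(void)
{
	pthread_mutex_lock(&resources_mutex);
	for (int i = 0; i < NR_RESOURCES; i++)
		pthread_mutex_lock(&req_lock[i]);
}

static void unlock_all_resources(void)
{
	for (int i = 0; i < NR_RESOURCES; i++)
		pthread_mutex_unlock(&req_lock[i]);
	pthread_mutex_unlock(&resources_mutex);
}

int main(void)
{
	lock_all_resources();
	printf("holding all %d resource locks\n", NR_RESOURCES);
	unlock_all_resources();
	return 0;
}
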
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e80cbefbc2b5..c055c5e12f24 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -36,6 +36,7 @@
36#include "drbd_int.h" 36#include "drbd_int.h"
37#include "drbd_protocol.h" 37#include "drbd_protocol.h"
38#include "drbd_req.h" 38#include "drbd_req.h"
39#include "drbd_state_change.h"
39#include <asm/unaligned.h> 40#include <asm/unaligned.h>
40#include <linux/drbd_limits.h> 41#include <linux/drbd_limits.h>
41#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
75int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); 76int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
76/* .dumpit */ 77/* .dumpit */
77int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); 78int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
79int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
80int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
81int drbd_adm_dump_devices_done(struct netlink_callback *cb);
82int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
83int drbd_adm_dump_connections_done(struct netlink_callback *cb);
84int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
85int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
86int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
78 87
79#include <linux/drbd_genl_api.h> 88#include <linux/drbd_genl_api.h>
80#include "drbd_nla.h" 89#include "drbd_nla.h"
81#include <linux/genl_magic_func.h> 90#include <linux/genl_magic_func.h>
82 91
92static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
93static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
94
95DEFINE_MUTEX(notification_mutex);
96
83/* used blkdev_get_by_path, to claim our meta data device(s) */ 97/* used blkdev_get_by_path, to claim our meta data device(s) */
84static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; 98static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
85 99
@@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
349 sib.sib_reason = SIB_HELPER_PRE; 363 sib.sib_reason = SIB_HELPER_PRE;
350 sib.helper_name = cmd; 364 sib.helper_name = cmd;
351 drbd_bcast_event(device, &sib); 365 drbd_bcast_event(device, &sib);
366 notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
352 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); 367 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
353 if (ret) 368 if (ret)
354 drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", 369 drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
361 sib.sib_reason = SIB_HELPER_POST; 376 sib.sib_reason = SIB_HELPER_POST;
362 sib.helper_exit_code = ret; 377 sib.helper_exit_code = ret;
363 drbd_bcast_event(device, &sib); 378 drbd_bcast_event(device, &sib);
379 notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
364 380
365 if (current == connection->worker.task) 381 if (current == connection->worker.task)
366 clear_bit(CALLBACK_PENDING, &connection->flags); 382 clear_bit(CALLBACK_PENDING, &connection->flags);
@@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
388 404
389 drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); 405 drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
390 /* TODO: conn_bcast_event() ?? */ 406 /* TODO: conn_bcast_event() ?? */
407 notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
391 408
392 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); 409 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
393 if (ret) 410 if (ret)
@@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
399 usermode_helper, cmd, resource_name, 416 usermode_helper, cmd, resource_name,
400 (ret >> 8) & 0xff, ret); 417 (ret >> 8) & 0xff, ret);
401 /* TODO: conn_bcast_event() ?? */ 418 /* TODO: conn_bcast_event() ?? */
419 notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
402 420
403 if (ret < 0) /* Ignore any ERRNOs we got. */ 421 if (ret < 0) /* Ignore any ERRNOs we got. */
404 ret = 0; 422 ret = 0;
@@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size)
847 * and can be long lived. 865 * and can be long lived.
848 * This changes an device->flag, is triggered by drbd internals, 866 * This changes an device->flag, is triggered by drbd internals,
849 * and should be short-lived. */ 867 * and should be short-lived. */
868/* It needs to be a counter, since multiple threads might
869 independently suspend and resume IO. */
850void drbd_suspend_io(struct drbd_device *device) 870void drbd_suspend_io(struct drbd_device *device)
851{ 871{
852 set_bit(SUSPEND_IO, &device->flags); 872 atomic_inc(&device->suspend_cnt);
853 if (drbd_suspended(device)) 873 if (drbd_suspended(device))
854 return; 874 return;
855 wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); 875 wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
@@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device)
857 877
858void drbd_resume_io(struct drbd_device *device) 878void drbd_resume_io(struct drbd_device *device)
859{ 879{
860 clear_bit(SUSPEND_IO, &device->flags); 880 if (atomic_dec_and_test(&device->suspend_cnt))
861 wake_up(&device->misc_wait); 881 wake_up(&device->misc_wait);
862} 882}
863 883
864/** 884/**
@@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device)
871enum determine_dev_size 891enum determine_dev_size
872drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) 892drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
873{ 893{
874 sector_t prev_first_sect, prev_size; /* previous meta location */ 894 struct md_offsets_and_sizes {
875 sector_t la_size_sect, u_size; 895 u64 last_agreed_sect;
896 u64 md_offset;
897 s32 al_offset;
898 s32 bm_offset;
899 u32 md_size_sect;
900
901 u32 al_stripes;
902 u32 al_stripe_size_4k;
903 } prev;
904 sector_t u_size, size;
876 struct drbd_md *md = &device->ldev->md; 905 struct drbd_md *md = &device->ldev->md;
877 u32 prev_al_stripe_size_4k;
878 u32 prev_al_stripes;
879 sector_t size;
880 char ppb[10]; 906 char ppb[10];
881 void *buffer; 907 void *buffer;
882 908
883 int md_moved, la_size_changed; 909 int md_moved, la_size_changed;
884 enum determine_dev_size rv = DS_UNCHANGED; 910 enum determine_dev_size rv = DS_UNCHANGED;
885 911
886 /* race: 912 /* We may change the on-disk offsets of our meta data below. Lock out
887 * application request passes inc_ap_bio, 913 * anything that may cause meta data IO, to avoid acting on incomplete
888 * but then cannot get an AL-reference. 914 * layout changes or scribbling over meta data that is in the process
889 * this function later may wait on ap_bio_cnt == 0. -> deadlock. 915 * of being moved.
890 * 916 *
891 * to avoid that: 917 * Move is not exactly correct, btw, currently we have all our meta
892 * Suspend IO right here. 918 * data in core memory, to "move" it we just write it all out, there
893 * still lock the act_log to not trigger ASSERTs there. 919 * are no reads. */
894 */
895 drbd_suspend_io(device); 920 drbd_suspend_io(device);
896 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ 921 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
897 if (!buffer) { 922 if (!buffer) {
@@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
899 return DS_ERROR; 924 return DS_ERROR;
900 } 925 }
901 926
902 /* no wait necessary anymore, actually we could assert that */ 927 /* remember current offset and sizes */
903 wait_event(device->al_wait, lc_try_lock(device->act_log)); 928 prev.last_agreed_sect = md->la_size_sect;
904 929 prev.md_offset = md->md_offset;
905 prev_first_sect = drbd_md_first_sector(device->ldev); 930 prev.al_offset = md->al_offset;
906 prev_size = device->ldev->md.md_size_sect; 931 prev.bm_offset = md->bm_offset;
907 la_size_sect = device->ldev->md.la_size_sect; 932 prev.md_size_sect = md->md_size_sect;
933 prev.al_stripes = md->al_stripes;
934 prev.al_stripe_size_4k = md->al_stripe_size_4k;
908 935
909 if (rs) { 936 if (rs) {
910 /* rs is non NULL if we should change the AL layout only */ 937 /* rs is non NULL if we should change the AL layout only */
911
912 prev_al_stripes = md->al_stripes;
913 prev_al_stripe_size_4k = md->al_stripe_size_4k;
914
915 md->al_stripes = rs->al_stripes; 938 md->al_stripes = rs->al_stripes;
916 md->al_stripe_size_4k = rs->al_stripe_size / 4; 939 md->al_stripe_size_4k = rs->al_stripe_size / 4;
917 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; 940 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
@@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
924 rcu_read_unlock(); 947 rcu_read_unlock();
925 size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); 948 size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
926 949
927 if (size < la_size_sect) { 950 if (size < prev.last_agreed_sect) {
928 if (rs && u_size == 0) { 951 if (rs && u_size == 0) {
929 /* Remove "rs &&" later. This check should always be active, but 952 /* Remove "rs &&" later. This check should always be active, but
930 right now the receiver expects the permissive behavior */ 953 right now the receiver expects the permissive behavior */
@@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
945 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); 968 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
946 if (unlikely(err)) { 969 if (unlikely(err)) {
947 /* currently there is only one error: ENOMEM! */ 970 /* currently there is only one error: ENOMEM! */
948 size = drbd_bm_capacity(device)>>1; 971 size = drbd_bm_capacity(device);
949 if (size == 0) { 972 if (size == 0) {
950 drbd_err(device, "OUT OF MEMORY! " 973 drbd_err(device, "OUT OF MEMORY! "
951 "Could not allocate bitmap!\n"); 974 "Could not allocate bitmap!\n");
952 } else { 975 } else {
953 drbd_err(device, "BM resizing failed. " 976 drbd_err(device, "BM resizing failed. "
954 "Leaving size unchanged at size = %lu KB\n", 977 "Leaving size unchanged\n");
955 (unsigned long)size);
956 } 978 }
957 rv = DS_ERROR; 979 rv = DS_ERROR;
958 } 980 }
959 /* racy, see comments above. */ 981 /* racy, see comments above. */
960 drbd_set_my_capacity(device, size); 982 drbd_set_my_capacity(device, size);
961 device->ldev->md.la_size_sect = size; 983 md->la_size_sect = size;
962 drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), 984 drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
963 (unsigned long long)size>>1); 985 (unsigned long long)size>>1);
964 } 986 }
965 if (rv <= DS_ERROR) 987 if (rv <= DS_ERROR)
966 goto err_out; 988 goto err_out;
967 989
968 la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); 990 la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
969 991
970 md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) 992 md_moved = prev.md_offset != md->md_offset
971 || prev_size != device->ldev->md.md_size_sect; 993 || prev.md_size_sect != md->md_size_sect;
972 994
973 if (la_size_changed || md_moved || rs) { 995 if (la_size_changed || md_moved || rs) {
974 u32 prev_flags; 996 u32 prev_flags;
@@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
977 * Clear the timer, to avoid scary "timer expired!" messages, 999 * Clear the timer, to avoid scary "timer expired!" messages,
978 * "Superblock" is written out at least twice below, anyways. */ 1000 * "Superblock" is written out at least twice below, anyways. */
979 del_timer(&device->md_sync_timer); 1001 del_timer(&device->md_sync_timer);
980 drbd_al_shrink(device); /* All extents inactive. */
981 1002
1003 /* We won't change the "al-extents" setting, we just may need
1004 * to move the on-disk location of the activity log ringbuffer.
1005 * Lock for transaction is good enough, it may well be "dirty"
1006 * or even "starving". */
1007 wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
1008
1009 /* mark current on-disk bitmap and activity log as unreliable */
982 prev_flags = md->flags; 1010 prev_flags = md->flags;
983 md->flags &= ~MDF_PRIMARY_IND; 1011 md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
984 drbd_md_write(device, buffer); 1012 drbd_md_write(device, buffer);
985 1013
1014 drbd_al_initialize(device, buffer);
1015
986 drbd_info(device, "Writing the whole bitmap, %s\n", 1016 drbd_info(device, "Writing the whole bitmap, %s\n",
987 la_size_changed && md_moved ? "size changed and md moved" : 1017 la_size_changed && md_moved ? "size changed and md moved" :
988 la_size_changed ? "size changed" : "md moved"); 1018 la_size_changed ? "size changed" : "md moved");
989 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ 1019 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
990 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 1020 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
991 "size changed", BM_LOCKED_MASK); 1021 "size changed", BM_LOCKED_MASK);
992 drbd_initialize_al(device, buffer);
993 1022
1023 /* on-disk bitmap and activity log is authoritative again
1024 * (unless there was an IO error meanwhile...) */
994 md->flags = prev_flags; 1025 md->flags = prev_flags;
995 drbd_md_write(device, buffer); 1026 drbd_md_write(device, buffer);
996 1027
@@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
999 md->al_stripes, md->al_stripe_size_4k * 4); 1030 md->al_stripes, md->al_stripe_size_4k * 4);
1000 } 1031 }
1001 1032
1002 if (size > la_size_sect) 1033 if (size > prev.last_agreed_sect)
1003 rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; 1034 rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
1004 if (size < la_size_sect) 1035 if (size < prev.last_agreed_sect)
1005 rv = DS_SHRUNK; 1036 rv = DS_SHRUNK;
1006 1037
1007 if (0) { 1038 if (0) {
1008 err_out: 1039 err_out:
1009 if (rs) { 1040 /* restore previous offset and sizes */
1010 md->al_stripes = prev_al_stripes; 1041 md->la_size_sect = prev.last_agreed_sect;
1011 md->al_stripe_size_4k = prev_al_stripe_size_4k; 1042 md->md_offset = prev.md_offset;
1012 md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; 1043 md->al_offset = prev.al_offset;
1013 1044 md->bm_offset = prev.bm_offset;
1014 drbd_md_set_sector_offsets(device, device->ldev); 1045 md->md_size_sect = prev.md_size_sect;
1015 } 1046 md->al_stripes = prev.al_stripes;
1047 md->al_stripe_size_4k = prev.al_stripe_size_4k;
1048 md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
1016 } 1049 }
1017 lc_unlock(device->act_log); 1050 lc_unlock(device->act_log);
1018 wake_up(&device->al_wait); 1051 wake_up(&device->al_wait);
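
The new prev struct at the top of drbd_determine_dev_size() exists so that every field drbd_md_set_sector_offsets() may rewrite can be restored in one assignment on the err_out path. A stripped-down sketch of that snapshot/rollback idiom (field names follow the diff, the surrounding code is illustrative only):

#include <stdint.h>
#include <stdio.h>

struct md_layout {
	uint64_t last_agreed_sect, md_offset;
	int32_t  al_offset, bm_offset;
	uint32_t md_size_sect, al_stripes, al_stripe_size_4k;
};

static int resize(struct md_layout *md, int fail)
{
	struct md_layout prev = *md;	/* remember current offsets and sizes */

	md->md_offset += 4096;		/* pretend the layout moved */
	md->al_stripes = 4;
	if (fail) {
		*md = prev;		/* restore previous offsets and sizes */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct md_layout md = { .md_offset = 8, .al_stripes = 1 };

	resize(&md, 1);
	printf("md_offset after failed resize: %llu\n",
	       (unsigned long long)md.md_offset);	/* still 8 */
	return 0;
}
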
@@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1115 lc_destroy(n); 1148 lc_destroy(n);
1116 return -EBUSY; 1149 return -EBUSY;
1117 } else { 1150 } else {
1118 if (t) 1151 lc_destroy(t);
1119 lc_destroy(t);
1120 } 1152 }
1121 drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ 1153 drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
1122 return 0; 1154 return 0;
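
The drbd_check_al_size() hunk drops the "if (t)" guard because lc_destroy(), like kfree(), is a no-op on a NULL pointer, so callers need not check. A userspace analogue of that convention (types and names are invented):

#include <stdlib.h>

struct lru_cache { void *elements; };

static void lc_destroy(struct lru_cache *lc)
{
	if (!lc)	/* tolerate NULL, precisely so callers don't have to */
		return;
	free(lc->elements);
	free(lc);
}

int main(void)
{
	lc_destroy(NULL);	/* safe: no-op */
	return 0;
}
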
@@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1151 if (b) { 1183 if (b) {
1152 struct drbd_connection *connection = first_peer_device(device)->connection; 1184 struct drbd_connection *connection = first_peer_device(device)->connection;
1153 1185
1186 blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
1187
1154 if (blk_queue_discard(b) && 1188 if (blk_queue_discard(b) &&
1155 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { 1189 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
1156 /* For now, don't allow more than one activity log extent worth of data 1190 /* We don't care, stacking below should fix it for the local device.
1157 * to be discarded in one go. We may need to rework drbd_al_begin_io() 1191 * Whether or not it is a suitable granularity on the remote device
1158 * to allow for even larger discard ranges */ 1192 * is not our problem, really. If you care, you need to
1159 blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); 1193 * use devices with similar topology on all peers. */
1160 1194 q->limits.discard_granularity = 512;
1161 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1195 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1162 /* REALLY? Is stacking secdiscard "legal"? */
1163 if (blk_queue_secdiscard(b))
1164 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
1165 } else { 1196 } else {
1166 blk_queue_max_discard_sectors(q, 0); 1197 blk_queue_max_discard_sectors(q, 0);
1167 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 1198 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1168 queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); 1199 q->limits.discard_granularity = 0;
1169 } 1200 }
1170 1201
1171 blk_queue_stack_limits(q, b); 1202 blk_queue_stack_limits(q, b);
@@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1177 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1208 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1178 } 1209 }
1179 } 1210 }
1211 /* To avoid confusion, if this queue does not support discard, clear
1212 * max_discard_sectors, which is what lsblk -D reports to the user. */
1213 if (!blk_queue_discard(q)) {
1214 blk_queue_max_discard_sectors(q, 0);
1215 q->limits.discard_granularity = 0;
1216 }
1180} 1217}
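
A compact model of the decision drbd_setup_queue_param() now makes: advertise discard on the DRBD queue only when the backing queue can discard and the peer either is not connected yet or negotiated FF_TRIM; otherwise zero both limits so "lsblk -D" does not report stale values. The struct and the maximum below are simplified stand-ins, not the real block-layer API:

#include <stdbool.h>
#include <stdio.h>

#define MAX_DISCARD_SECTORS 2048U	/* placeholder, not the driver constant */

struct queue_limits { unsigned int max_discard_sectors, discard_granularity; };

static void setup_discard(struct queue_limits *q, bool backing_can_discard,
			  bool connected, bool peer_has_trim)
{
	if (backing_can_discard && (!connected || peer_has_trim)) {
		q->max_discard_sectors = MAX_DISCARD_SECTORS;
		q->discard_granularity = 512;
	} else {
		q->max_discard_sectors = 0;
		q->discard_granularity = 0;
	}
}

int main(void)
{
	struct queue_limits q;

	setup_discard(&q, true, true, false);	/* connected peer without TRIM */
	printf("max_discard_sectors=%u granularity=%u\n",
	       q.max_discard_sectors, q.discard_granularity);
	return 0;
}
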
1181 1218
1182void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) 1219void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
@@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection)
1241 connection->cstate == C_STANDALONE; 1278 connection->cstate == C_STANDALONE;
1242 spin_unlock_irq(&connection->resource->req_lock); 1279 spin_unlock_irq(&connection->resource->req_lock);
1243 if (stop_threads) { 1280 if (stop_threads) {
1244 /* asender is implicitly stopped by receiver 1281 /* ack_receiver thread and ack_sender workqueue are implicitly
1245 * in conn_disconnect() */ 1282 * stopped by receiver in conn_disconnect() */
1246 drbd_thread_stop(&connection->receiver); 1283 drbd_thread_stop(&connection->receiver);
1247 drbd_thread_stop(&connection->worker); 1284 drbd_thread_stop(&connection->worker);
1248 } 1285 }
@@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1389 goto fail_unlock; 1426 goto fail_unlock;
1390 } 1427 }
1391 1428
1392 write_lock_irq(&global_state_lock); 1429 lock_all_resources();
1393 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); 1430 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1394 if (retcode == NO_ERROR) { 1431 if (retcode == NO_ERROR) {
1395 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 1432 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
1396 drbd_resync_after_changed(device); 1433 drbd_resync_after_changed(device);
1397 } 1434 }
1398 write_unlock_irq(&global_state_lock); 1435 unlock_all_resources();
1399 1436
1400 if (retcode != NO_ERROR) 1437 if (retcode != NO_ERROR)
1401 goto fail_unlock; 1438 goto fail_unlock;
@@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1418 set_bit(MD_NO_FUA, &device->flags); 1455 set_bit(MD_NO_FUA, &device->flags);
1419 1456
1420 if (write_ordering_changed(old_disk_conf, new_disk_conf)) 1457 if (write_ordering_changed(old_disk_conf, new_disk_conf))
1421 drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); 1458 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
1422 1459
1423 drbd_md_sync(device); 1460 drbd_md_sync(device);
1424 1461
@@ -1449,6 +1486,88 @@ success:
1449 return 0; 1486 return 0;
1450} 1487}
1451 1488
1489static struct block_device *open_backing_dev(struct drbd_device *device,
1490 const char *bdev_path, void *claim_ptr, bool do_bd_link)
1491{
1492 struct block_device *bdev;
1493 int err = 0;
1494
1495 bdev = blkdev_get_by_path(bdev_path,
1496 FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
1497 if (IS_ERR(bdev)) {
1498 drbd_err(device, "open(\"%s\") failed with %ld\n",
1499 bdev_path, PTR_ERR(bdev));
1500 return bdev;
1501 }
1502
1503 if (!do_bd_link)
1504 return bdev;
1505
1506 err = bd_link_disk_holder(bdev, device->vdisk);
1507 if (err) {
1508 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1509 drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
1510 bdev_path, err);
1511 bdev = ERR_PTR(err);
1512 }
1513 return bdev;
1514}
1515
1516static int open_backing_devices(struct drbd_device *device,
1517 struct disk_conf *new_disk_conf,
1518 struct drbd_backing_dev *nbc)
1519{
1520 struct block_device *bdev;
1521
1522 bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
1523 if (IS_ERR(bdev))
1524 return ERR_OPEN_DISK;
1525 nbc->backing_bdev = bdev;
1526
1527 /*
1528 * meta_dev_idx >= 0: external fixed size, possibly multiple
1529 * drbd sharing one meta device. TODO in that case, paranoia
1530 * check that [md_bdev, meta_dev_idx] is not yet used by some
1531 * other drbd minor! (if you use drbd.conf + drbdadm, that
1532 * should check it for you already; but if you don't, or
1533 * someone fooled it, we need to double check here)
1534 */
1535 bdev = open_backing_dev(device, new_disk_conf->meta_dev,
1536 /* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
1537 * if potentially shared with other drbd minors */
1538 (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
1539 /* avoid double bd_claim_by_disk() for the same (source,target) tuple,
1540 * as would happen with internal metadata. */
1541 (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
1542 new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
1543 if (IS_ERR(bdev))
1544 return ERR_OPEN_MD_DISK;
1545 nbc->md_bdev = bdev;
1546 return NO_ERROR;
1547}
1548
1549static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
1550 bool do_bd_unlink)
1551{
1552 if (!bdev)
1553 return;
1554 if (do_bd_unlink)
1555 bd_unlink_disk_holder(bdev, device->vdisk);
1556 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1557}
1558
1559void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
1560{
1561 if (ldev == NULL)
1562 return;
1563
1564 close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
1565 close_backing_dev(device, ldev->backing_bdev, true);
1566
1567 kfree(ldev->disk_conf);
1568 kfree(ldev);
1569}
1570
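
The open_backing_dev()/close_backing_dev() helpers above centralize what drbd_adm_attach() used to open-code: open the data and meta-data devices, and guarantee that a partially built drbd_backing_dev can always be torn down with one call. A userspace model of that acquire/release pairing, with file descriptors standing in for block_device handles (paths and error values are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct backing_dev { int backing_fd, md_fd; };

static int open_backing_devices(struct backing_dev *nbc,
				const char *data_path, const char *md_path)
{
	nbc->backing_fd = open(data_path, O_RDWR);
	if (nbc->backing_fd < 0)
		return -1;		/* analogous to ERR_OPEN_DISK */
	nbc->md_fd = open(md_path, O_RDWR);
	if (nbc->md_fd < 0)
		return -2;		/* analogous to ERR_OPEN_MD_DISK */
	return 0;
}

static void backing_dev_free(struct backing_dev *nbc)
{
	/* tolerate a partially opened state, mirroring close_backing_dev() */
	if (nbc->md_fd >= 0 && nbc->md_fd != nbc->backing_fd)
		close(nbc->md_fd);
	if (nbc->backing_fd >= 0)
		close(nbc->backing_fd);
}

int main(void)
{
	struct backing_dev nbc = { .backing_fd = -1, .md_fd = -1 };

	if (open_backing_devices(&nbc, "/dev/null", "/dev/null") < 0)
		fprintf(stderr, "attach failed\n");
	backing_dev_free(&nbc);	/* safe in either case */
	return 0;
}
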
1452int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) 1571int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1453{ 1572{
1454 struct drbd_config_context adm_ctx; 1573 struct drbd_config_context adm_ctx;
@@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1462 sector_t min_md_device_sectors; 1581 sector_t min_md_device_sectors;
1463 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ 1582 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1464 struct disk_conf *new_disk_conf = NULL; 1583 struct disk_conf *new_disk_conf = NULL;
1465 struct block_device *bdev;
1466 struct lru_cache *resync_lru = NULL; 1584 struct lru_cache *resync_lru = NULL;
1467 struct fifo_buffer *new_plan = NULL; 1585 struct fifo_buffer *new_plan = NULL;
1468 union drbd_state ns, os; 1586 union drbd_state ns, os;
@@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1478 device = adm_ctx.device; 1596 device = adm_ctx.device;
1479 mutex_lock(&adm_ctx.resource->adm_mutex); 1597 mutex_lock(&adm_ctx.resource->adm_mutex);
1480 peer_device = first_peer_device(device); 1598 peer_device = first_peer_device(device);
1481 connection = peer_device ? peer_device->connection : NULL; 1599 connection = peer_device->connection;
1482 conn_reconfig_start(connection); 1600 conn_reconfig_start(connection);
1483 1601
1484 /* if you want to reconfigure, please tear down first */ 1602 /* if you want to reconfigure, please tear down first */
@@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1539 goto fail; 1657 goto fail;
1540 } 1658 }
1541 1659
1542 write_lock_irq(&global_state_lock);
1543 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1544 write_unlock_irq(&global_state_lock);
1545 if (retcode != NO_ERROR)
1546 goto fail;
1547
1548 rcu_read_lock(); 1660 rcu_read_lock();
1549 nc = rcu_dereference(connection->net_conf); 1661 nc = rcu_dereference(connection->net_conf);
1550 if (nc) { 1662 if (nc) {
@@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1556 } 1668 }
1557 rcu_read_unlock(); 1669 rcu_read_unlock();
1558 1670
1559 bdev = blkdev_get_by_path(new_disk_conf->backing_dev, 1671 retcode = open_backing_devices(device, new_disk_conf, nbc);
1560 FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); 1672 if (retcode != NO_ERROR)
1561 if (IS_ERR(bdev)) {
1562 drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1563 PTR_ERR(bdev));
1564 retcode = ERR_OPEN_DISK;
1565 goto fail;
1566 }
1567 nbc->backing_bdev = bdev;
1568
1569 /*
1570 * meta_dev_idx >= 0: external fixed size, possibly multiple
1571 * drbd sharing one meta device. TODO in that case, paranoia
1572 * check that [md_bdev, meta_dev_idx] is not yet used by some
1573 * other drbd minor! (if you use drbd.conf + drbdadm, that
1574 * should check it for you already; but if you don't, or
1575 * someone fooled it, we need to double check here)
1576 */
1577 bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1578 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1579 (new_disk_conf->meta_dev_idx < 0) ?
1580 (void *)device : (void *)drbd_m_holder);
1581 if (IS_ERR(bdev)) {
1582 drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1583 PTR_ERR(bdev));
1584 retcode = ERR_OPEN_MD_DISK;
1585 goto fail; 1673 goto fail;
1586 }
1587 nbc->md_bdev = bdev;
1588 1674
1589 if ((nbc->backing_bdev == nbc->md_bdev) != 1675 if ((nbc->backing_bdev == nbc->md_bdev) !=
1590 (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || 1676 (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
@@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1707 goto force_diskless_dec; 1793 goto force_diskless_dec;
1708 } 1794 }
1709 1795
1796 lock_all_resources();
1797 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1798 if (retcode != NO_ERROR) {
1799 unlock_all_resources();
1800 goto force_diskless_dec;
1801 }
1802
1710 /* Reset the "barriers don't work" bits here, then force meta data to 1803 /* Reset the "barriers don't work" bits here, then force meta data to
1711 * be written, to ensure we determine if barriers are supported. */ 1804 * be written, to ensure we determine if barriers are supported. */
1712 if (new_disk_conf->md_flushes) 1805 if (new_disk_conf->md_flushes)
@@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1727 new_disk_conf = NULL; 1820 new_disk_conf = NULL;
1728 new_plan = NULL; 1821 new_plan = NULL;
1729 1822
1730 drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); 1823 drbd_resync_after_changed(device);
1824 drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
1825 unlock_all_resources();
1731 1826
1732 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) 1827 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1733 set_bit(CRASHED_PRIMARY, &device->flags); 1828 set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1875 fail: 1970 fail:
1876 conn_reconfig_done(connection); 1971 conn_reconfig_done(connection);
1877 if (nbc) { 1972 if (nbc) {
1878 if (nbc->backing_bdev) 1973 close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
1879 blkdev_put(nbc->backing_bdev, 1974 close_backing_dev(device, nbc->backing_bdev, true);
1880 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1881 if (nbc->md_bdev)
1882 blkdev_put(nbc->md_bdev,
1883 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1884 kfree(nbc); 1975 kfree(nbc);
1885 } 1976 }
1886 kfree(new_disk_conf); 1977 kfree(new_disk_conf);
@@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1895static int adm_detach(struct drbd_device *device, int force) 1986static int adm_detach(struct drbd_device *device, int force)
1896{ 1987{
1897 enum drbd_state_rv retcode; 1988 enum drbd_state_rv retcode;
1989 void *buffer;
1898 int ret; 1990 int ret;
1899 1991
1900 if (force) { 1992 if (force) {
@@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force)
1905 } 1997 }
1906 1998
1907 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ 1999 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1908 drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ 2000 buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
1909 retcode = drbd_request_state(device, NS(disk, D_FAILED)); 2001 if (buffer) {
1910 drbd_md_put_buffer(device); 2002 retcode = drbd_request_state(device, NS(disk, D_FAILED));
2003 drbd_md_put_buffer(device);
2004 } else /* already <= D_FAILED */
2005 retcode = SS_NOTHING_TO_DO;
1911 /* D_FAILED will transition to DISKLESS. */ 2006 /* D_FAILED will transition to DISKLESS. */
2007 drbd_resume_io(device);
1912 ret = wait_event_interruptible(device->misc_wait, 2008 ret = wait_event_interruptible(device->misc_wait,
1913 device->state.disk != D_FAILED); 2009 device->state.disk != D_FAILED);
1914 drbd_resume_io(device);
1915 if ((int)retcode == (int)SS_IS_DISKLESS) 2010 if ((int)retcode == (int)SS_IS_DISKLESS)
1916 retcode = SS_NOTHING_TO_DO; 2011 retcode = SS_NOTHING_TO_DO;
1917 if (ret) 2012 if (ret)
@@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2245 return 0; 2340 return 0;
2246} 2341}
2247 2342
2343static void connection_to_info(struct connection_info *info,
2344 struct drbd_connection *connection)
2345{
2346 info->conn_connection_state = connection->cstate;
2347 info->conn_role = conn_highest_peer(connection);
2348}
2349
2350static void peer_device_to_info(struct peer_device_info *info,
2351 struct drbd_peer_device *peer_device)
2352{
2353 struct drbd_device *device = peer_device->device;
2354
2355 info->peer_repl_state =
2356 max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
2357 info->peer_disk_state = device->state.pdsk;
2358 info->peer_resync_susp_user = device->state.user_isp;
2359 info->peer_resync_susp_peer = device->state.peer_isp;
2360 info->peer_resync_susp_dependency = device->state.aftr_isp;
2361}
2362
2248int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) 2363int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2249{ 2364{
2365 struct connection_info connection_info;
2366 enum drbd_notification_type flags;
2367 unsigned int peer_devices = 0;
2250 struct drbd_config_context adm_ctx; 2368 struct drbd_config_context adm_ctx;
2251 struct drbd_peer_device *peer_device; 2369 struct drbd_peer_device *peer_device;
2252 struct net_conf *old_net_conf, *new_net_conf = NULL; 2370 struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2347 connection->peer_addr_len = nla_len(adm_ctx.peer_addr); 2465 connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
2348 memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); 2466 memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
2349 2467
2468 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
2469 peer_devices++;
2470 }
2471
2472 connection_to_info(&connection_info, connection);
2473 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
2474 mutex_lock(&notification_mutex);
2475 notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
2476 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
2477 struct peer_device_info peer_device_info;
2478
2479 peer_device_to_info(&peer_device_info, peer_device);
2480 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
2481 notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
2482 }
2483 mutex_unlock(&notification_mutex);
2350 mutex_unlock(&adm_ctx.resource->conf_update); 2484 mutex_unlock(&adm_ctx.resource->conf_update);
2351 2485
2352 rcu_read_lock(); 2486 rcu_read_lock();
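
The notification block added to drbd_adm_connect() counts the peer devices first, then emits one NOTIFY_CREATE event per object with a continues flag on every event except the last, so a listener can tell when the burst is complete. A tiny model of that flag chaining (the numeric flag values here are invented):

#include <stdio.h>

#define NOTIFY_CREATE    0x1
#define NOTIFY_CONTINUES 0x8000

int main(void)
{
	unsigned int peer_devices = 3;	/* one connection event + 3 peer-device events */
	unsigned int flags;
	int i;

	flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
	printf("connection event: flags=%#x\n", NOTIFY_CREATE | flags);

	for (i = 0; i < 3; i++) {
		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
		printf("peer-device %d:  flags=%#x\n", i, NOTIFY_CREATE | flags);
	}
	/* the last event is emitted without NOTIFY_CONTINUES */
	return 0;
}
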
@@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
2428 drbd_err(connection, 2562 drbd_err(connection,
2429 "unexpected rv2=%d in conn_try_disconnect()\n", 2563 "unexpected rv2=%d in conn_try_disconnect()\n",
2430 rv2); 2564 rv2);
2565 /* Unlike in DRBD 9, the state engine has generated
2566 * NOTIFY_DESTROY events before clearing connection->net_conf. */
2431 } 2567 }
2432 return rv; 2568 return rv;
2433} 2569}
@@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2585 mutex_unlock(&device->resource->conf_update); 2721 mutex_unlock(&device->resource->conf_update);
2586 synchronize_rcu(); 2722 synchronize_rcu();
2587 kfree(old_disk_conf); 2723 kfree(old_disk_conf);
2724 new_disk_conf = NULL;
2588 } 2725 }
2589 2726
2590 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 2727 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
@@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2618 2755
2619 fail_ldev: 2756 fail_ldev:
2620 put_ldev(device); 2757 put_ldev(device);
2758 kfree(new_disk_conf);
2621 goto fail; 2759 goto fail;
2622} 2760}
2623 2761
@@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2855 mutex_lock(&adm_ctx.resource->adm_mutex); 2993 mutex_lock(&adm_ctx.resource->adm_mutex);
2856 device = adm_ctx.device; 2994 device = adm_ctx.device;
2857 if (test_bit(NEW_CUR_UUID, &device->flags)) { 2995 if (test_bit(NEW_CUR_UUID, &device->flags)) {
2858 drbd_uuid_new_current(device); 2996 if (get_ldev_if_state(device, D_ATTACHING)) {
2997 drbd_uuid_new_current(device);
2998 put_ldev(device);
2999 } else {
3000 /* This is effectively a multi-stage "forced down".
3001 * The NEW_CUR_UUID bit is supposedly only set, if we
3002 * lost the replication connection, and are configured
3003 * to freeze IO and wait for some fence-peer handler.
3004 * So we still don't have a replication connection.
3005 * And now we don't have a local disk either. After
3006 * resume, we will fail all pending and new IO, because
3007 * we don't have any data anymore. Which means we will
3008 * eventually be able to terminate all users of this
3009 * device, and then take it down. By bumping the
3010 * "effective" data uuid, we make sure that you really
3011 * need to tear down before you reconfigure, we will
3012 * the refuse to re-connect or re-attach (because no
3013 * matching real data uuid exists).
3014 */
3015 u64 val;
3016 get_random_bytes(&val, sizeof(u64));
3017 drbd_set_ed_uuid(device, val);
3018 drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
3019 }
2859 clear_bit(NEW_CUR_UUID, &device->flags); 3020 clear_bit(NEW_CUR_UUID, &device->flags);
2860 } 3021 }
2861 drbd_suspend_io(device); 3022 drbd_suspend_io(device);
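
When resume finds neither a replication link nor a local disk, the hunk above bumps only the "effective" data UUID with get_random_bytes(), so stale peers can never present a matching data generation again. A userspace sketch of drawing such a 64-bit value, with /dev/urandom standing in for the in-kernel RNG:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t val = 0;
	FILE *f = fopen("/dev/urandom", "rb");

	if (!f)
		return 1;
	if (fread(&val, sizeof(val), 1, f) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("new effective data uuid: %016llx\n", (unsigned long long)val);
	return 0;
}
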
@@ -2910,6 +3071,486 @@ nla_put_failure:
2910} 3071}
2911 3072
2912/* 3073/*
3074 * The generic netlink dump callbacks are called outside the genl_lock(), so
3075 * they cannot use the simple attribute parsing code which uses global
3076 * attribute tables.
3077 */
3078static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
3079{
3080 const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3081 const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3082 struct nlattr *nla;
3083
3084 nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
3085 DRBD_NLA_CFG_CONTEXT);
3086 if (!nla)
3087 return NULL;
3088 return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
3089}
3090
3091static void resource_to_info(struct resource_info *, struct drbd_resource *);
3092
3093int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
3094{
3095 struct drbd_genlmsghdr *dh;
3096 struct drbd_resource *resource;
3097 struct resource_info resource_info;
3098 struct resource_statistics resource_statistics;
3099 int err;
3100
3101 rcu_read_lock();
3102 if (cb->args[0]) {
3103 for_each_resource_rcu(resource, &drbd_resources)
3104 if (resource == (struct drbd_resource *)cb->args[0])
3105 goto found_resource;
3106 err = 0; /* resource was probably deleted */
3107 goto out;
3108 }
3109 resource = list_entry(&drbd_resources,
3110 struct drbd_resource, resources);
3111
3112found_resource:
3113 list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
3114 goto put_result;
3115 }
3116 err = 0;
3117 goto out;
3118
3119put_result:
3120 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3121 cb->nlh->nlmsg_seq, &drbd_genl_family,
3122 NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
3123 err = -ENOMEM;
3124 if (!dh)
3125 goto out;
3126 dh->minor = -1U;
3127 dh->ret_code = NO_ERROR;
3128 err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
3129 if (err)
3130 goto out;
3131 err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
3132 if (err)
3133 goto out;
3134 resource_to_info(&resource_info, resource);
3135 err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
3136 if (err)
3137 goto out;
3138 resource_statistics.res_stat_write_ordering = resource->write_ordering;
3139 err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
3140 if (err)
3141 goto out;
3142 cb->args[0] = (long)resource;
3143 genlmsg_end(skb, dh);
3144 err = 0;
3145
3146out:
3147 rcu_read_unlock();
3148 if (err)
3149 return err;
3150 return skb->len;
3151}
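
drbd_adm_dump_resources() is a netlink dump callback: it emits at most one resource per invocation and parks its position in cb->args[0] so the next invocation resumes where it stopped. The real code stores a resource pointer there; the model below stores an index, and every name is invented:

#include <stdio.h>

struct dump_cb { long args[3]; };

static const char *resources[] = { "r0", "r1", "r2" };

/* return 1 if an item was emitted, 0 when the dump is finished */
static int dump_one(struct dump_cb *cb)
{
	long idx = cb->args[0];

	if (idx >= (long)(sizeof(resources) / sizeof(resources[0])))
		return 0;
	printf("emit %s\n", resources[idx]);
	cb->args[0] = idx + 1;	/* remember where to resume */
	return 1;
}

int main(void)
{
	struct dump_cb cb = { { 0 } };

	while (dump_one(&cb))	/* the netlink core re-calls until done */
		;
	return 0;
}
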
3152
3153static void device_to_statistics(struct device_statistics *s,
3154 struct drbd_device *device)
3155{
3156 memset(s, 0, sizeof(*s));
3157 s->dev_upper_blocked = !may_inc_ap_bio(device);
3158 if (get_ldev(device)) {
3159 struct drbd_md *md = &device->ldev->md;
3160 u64 *history_uuids = (u64 *)s->history_uuids;
3161 struct request_queue *q;
3162 int n;
3163
3164 spin_lock_irq(&md->uuid_lock);
3165 s->dev_current_uuid = md->uuid[UI_CURRENT];
3166 BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
3167 for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
3168 history_uuids[n] = md->uuid[UI_HISTORY_START + n];
3169 for (; n < HISTORY_UUIDS; n++)
3170 history_uuids[n] = 0;
3171 s->history_uuids_len = HISTORY_UUIDS;
3172 spin_unlock_irq(&md->uuid_lock);
3173
3174 s->dev_disk_flags = md->flags;
3175 q = bdev_get_queue(device->ldev->backing_bdev);
3176 s->dev_lower_blocked =
3177 bdi_congested(&q->backing_dev_info,
3178 (1 << WB_async_congested) |
3179 (1 << WB_sync_congested));
3180 put_ldev(device);
3181 }
3182 s->dev_size = drbd_get_capacity(device->this_bdev);
3183 s->dev_read = device->read_cnt;
3184 s->dev_write = device->writ_cnt;
3185 s->dev_al_writes = device->al_writ_cnt;
3186 s->dev_bm_writes = device->bm_writ_cnt;
3187 s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
3188 s->dev_lower_pending = atomic_read(&device->local_cnt);
3189 s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
3190 s->dev_exposed_data_uuid = device->ed_uuid;
3191}
3192
3193static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
3194{
3195 if (cb->args[0]) {
3196 struct drbd_resource *resource =
3197 (struct drbd_resource *)cb->args[0];
3198 kref_put(&resource->kref, drbd_destroy_resource);
3199 }
3200
3201 return 0;
3202}
3203
3204int drbd_adm_dump_devices_done(struct netlink_callback *cb) {
3205 return put_resource_in_arg0(cb, 7);
3206}
3207
3208static void device_to_info(struct device_info *, struct drbd_device *);
3209
3210int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
3211{
3212 struct nlattr *resource_filter;
3213 struct drbd_resource *resource;
3214 struct drbd_device *uninitialized_var(device);
3215 int minor, err, retcode;
3216 struct drbd_genlmsghdr *dh;
3217 struct device_info device_info;
3218 struct device_statistics device_statistics;
3219 struct idr *idr_to_search;
3220
3221 resource = (struct drbd_resource *)cb->args[0];
3222 if (!cb->args[0] && !cb->args[1]) {
3223 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3224 if (resource_filter) {
3225 retcode = ERR_RES_NOT_KNOWN;
3226 resource = drbd_find_resource(nla_data(resource_filter));
3227 if (!resource)
3228 goto put_result;
3229 cb->args[0] = (long)resource;
3230 }
3231 }
3232
3233 rcu_read_lock();
3234 minor = cb->args[1];
3235 idr_to_search = resource ? &resource->devices : &drbd_devices;
3236 device = idr_get_next(idr_to_search, &minor);
3237 if (!device) {
3238 err = 0;
3239 goto out;
3240 }
3241 idr_for_each_entry_continue(idr_to_search, device, minor) {
3242 retcode = NO_ERROR;
3243 goto put_result; /* only one iteration */
3244 }
3245 err = 0;
3246 goto out; /* no more devices */
3247
3248put_result:
3249 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3250 cb->nlh->nlmsg_seq, &drbd_genl_family,
3251 NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
3252 err = -ENOMEM;
3253 if (!dh)
3254 goto out;
3255 dh->ret_code = retcode;
3256 dh->minor = -1U;
3257 if (retcode == NO_ERROR) {
3258 dh->minor = device->minor;
3259 err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
3260 if (err)
3261 goto out;
3262 if (get_ldev(device)) {
3263 struct disk_conf *disk_conf =
3264 rcu_dereference(device->ldev->disk_conf);
3265
3266 err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
3267 put_ldev(device);
3268 if (err)
3269 goto out;
3270 }
3271 device_to_info(&device_info, device);
3272 err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
3273 if (err)
3274 goto out;
3275
3276 device_to_statistics(&device_statistics, device);
3277 err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
3278 if (err)
3279 goto out;
3280 cb->args[1] = minor + 1;
3281 }
3282 genlmsg_end(skb, dh);
3283 err = 0;
3284
3285out:
3286 rcu_read_unlock();
3287 if (err)
3288 return err;
3289 return skb->len;
3290}
3291
3292int drbd_adm_dump_connections_done(struct netlink_callback *cb)
3293{
3294 return put_resource_in_arg0(cb, 6);
3295}
3296
3297enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
3298
3299int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
3300{
3301 struct nlattr *resource_filter;
3302 struct drbd_resource *resource = NULL, *next_resource;
3303 struct drbd_connection *uninitialized_var(connection);
3304 int err = 0, retcode;
3305 struct drbd_genlmsghdr *dh;
3306 struct connection_info connection_info;
3307 struct connection_statistics connection_statistics;
3308
3309 rcu_read_lock();
3310 resource = (struct drbd_resource *)cb->args[0];
3311 if (!cb->args[0]) {
3312 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3313 if (resource_filter) {
3314 retcode = ERR_RES_NOT_KNOWN;
3315 resource = drbd_find_resource(nla_data(resource_filter));
3316 if (!resource)
3317 goto put_result;
3318 cb->args[0] = (long)resource;
3319 cb->args[1] = SINGLE_RESOURCE;
3320 }
3321 }
3322 if (!resource) {
3323 if (list_empty(&drbd_resources))
3324 goto out;
3325 resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
3326 kref_get(&resource->kref);
3327 cb->args[0] = (long)resource;
3328 cb->args[1] = ITERATE_RESOURCES;
3329 }
3330
3331 next_resource:
3332 rcu_read_unlock();
3333 mutex_lock(&resource->conf_update);
3334 rcu_read_lock();
3335 if (cb->args[2]) {
3336 for_each_connection_rcu(connection, resource)
3337 if (connection == (struct drbd_connection *)cb->args[2])
3338 goto found_connection;
3339 /* connection was probably deleted */
3340 goto no_more_connections;
3341 }
3342 connection = list_entry(&resource->connections, struct drbd_connection, connections);
3343
3344found_connection:
3345 list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
3346 if (!has_net_conf(connection))
3347 continue;
3348 retcode = NO_ERROR;
3349 goto put_result; /* only one iteration */
3350 }
3351
3352no_more_connections:
3353 if (cb->args[1] == ITERATE_RESOURCES) {
3354 for_each_resource_rcu(next_resource, &drbd_resources) {
3355 if (next_resource == resource)
3356 goto found_resource;
3357 }
3358 /* resource was probably deleted */
3359 }
3360 goto out;
3361
3362found_resource:
3363 list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
3364 mutex_unlock(&resource->conf_update);
3365 kref_put(&resource->kref, drbd_destroy_resource);
3366 resource = next_resource;
3367 kref_get(&resource->kref);
3368 cb->args[0] = (long)resource;
3369 cb->args[2] = 0;
3370 goto next_resource;
3371 }
3372 goto out; /* no more resources */
3373
3374put_result:
3375 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3376 cb->nlh->nlmsg_seq, &drbd_genl_family,
3377 NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
3378 err = -ENOMEM;
3379 if (!dh)
3380 goto out;
3381 dh->ret_code = retcode;
3382 dh->minor = -1U;
3383 if (retcode == NO_ERROR) {
3384 struct net_conf *net_conf;
3385
3386 err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
3387 if (err)
3388 goto out;
3389 net_conf = rcu_dereference(connection->net_conf);
3390 if (net_conf) {
3391 err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
3392 if (err)
3393 goto out;
3394 }
3395 connection_to_info(&connection_info, connection);
3396 err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
3397 if (err)
3398 goto out;
3399 connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
3400 err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
3401 if (err)
3402 goto out;
3403 cb->args[2] = (long)connection;
3404 }
3405 genlmsg_end(skb, dh);
3406 err = 0;
3407
3408out:
3409 rcu_read_unlock();
3410 if (resource)
3411 mutex_unlock(&resource->conf_update);
3412 if (err)
3413 return err;
3414 return skb->len;
3415}
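
Both dump loops above prime their cursor with list_entry(&resource->connections, ...), pointing the typed cursor at the list head itself so that list_for_each_entry_continue_rcu() starts at the first real element. A self-contained model of that idiom with a hand-rolled intrusive list; as in the kernel, the "head as fake entry" pointer is never dereferenced beyond its embedded node:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct item { const char *name; struct list_head node; };

int main(void)
{
	struct item a = { "a" }, b = { "b" };
	struct list_head head = { &a.node, &b.node };

	a.node = (struct list_head){ &b.node, &head };
	b.node = (struct list_head){ &head, &a.node };

	/* Prime the cursor with the list head; only its embedded node is used. */
	struct item *cursor = container_of(&head, struct item, node);

	/* "continue" semantics: iterate starting after the cursor. */
	for (struct list_head *p = cursor->node.next; p != &head; p = p->next)
		printf("%s\n", container_of(p, struct item, node)->name);
	return 0;
}
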
3416
3417enum mdf_peer_flag {
3418 MDF_PEER_CONNECTED = 1 << 0,
3419 MDF_PEER_OUTDATED = 1 << 1,
3420 MDF_PEER_FENCING = 1 << 2,
3421 MDF_PEER_FULL_SYNC = 1 << 3,
3422};
3423
3424static void peer_device_to_statistics(struct peer_device_statistics *s,
3425 struct drbd_peer_device *peer_device)
3426{
3427 struct drbd_device *device = peer_device->device;
3428
3429 memset(s, 0, sizeof(*s));
3430 s->peer_dev_received = device->recv_cnt;
3431 s->peer_dev_sent = device->send_cnt;
3432 s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
3433 atomic_read(&device->rs_pending_cnt);
3434 s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
3435 s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
3436 s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
3437 if (get_ldev(device)) {
3438 struct drbd_md *md = &device->ldev->md;
3439
3440 spin_lock_irq(&md->uuid_lock);
3441 s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
3442 spin_unlock_irq(&md->uuid_lock);
3443 s->peer_dev_flags =
3444 (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
3445 MDF_PEER_CONNECTED : 0) +
3446 (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
3447 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
3448 MDF_PEER_OUTDATED : 0) +
3449 /* FIXME: MDF_PEER_FENCING? */
3450 (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
3451 MDF_PEER_FULL_SYNC : 0);
3452 put_ldev(device);
3453 }
3454}
3455
3456int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
3457{
3458 return put_resource_in_arg0(cb, 9);
3459}
3460
3461int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
3462{
3463 struct nlattr *resource_filter;
3464 struct drbd_resource *resource;
3465 struct drbd_device *uninitialized_var(device);
3466 struct drbd_peer_device *peer_device = NULL;
3467 int minor, err, retcode;
3468 struct drbd_genlmsghdr *dh;
3469 struct idr *idr_to_search;
3470
3471 resource = (struct drbd_resource *)cb->args[0];
3472 if (!cb->args[0] && !cb->args[1]) {
3473 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3474 if (resource_filter) {
3475 retcode = ERR_RES_NOT_KNOWN;
3476 resource = drbd_find_resource(nla_data(resource_filter));
3477 if (!resource)
3478 goto put_result;
3479 }
3480 cb->args[0] = (long)resource;
3481 }
3482
3483 rcu_read_lock();
3484 minor = cb->args[1];
3485 idr_to_search = resource ? &resource->devices : &drbd_devices;
3486 device = idr_find(idr_to_search, minor);
3487 if (!device) {
3488next_device:
3489 minor++;
3490 cb->args[2] = 0;
3491 device = idr_get_next(idr_to_search, &minor);
3492 if (!device) {
3493 err = 0;
3494 goto out;
3495 }
3496 }
3497 if (cb->args[2]) {
3498 for_each_peer_device(peer_device, device)
3499 if (peer_device == (struct drbd_peer_device *)cb->args[2])
3500 goto found_peer_device;
3501 /* peer device was probably deleted */
3502 goto next_device;
3503 }
3504 /* Make peer_device point to the list head (not the first entry). */
3505 peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
3506
3507found_peer_device:
3508 list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
3509 if (!has_net_conf(peer_device->connection))
3510 continue;
3511 retcode = NO_ERROR;
3512 goto put_result; /* only one iteration */
3513 }
3514 goto next_device;
3515
3516put_result:
3517 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3518 cb->nlh->nlmsg_seq, &drbd_genl_family,
3519 NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
3520 err = -ENOMEM;
3521 if (!dh)
3522 goto out;
3523 dh->ret_code = retcode;
3524 dh->minor = -1U;
3525 if (retcode == NO_ERROR) {
3526 struct peer_device_info peer_device_info;
3527 struct peer_device_statistics peer_device_statistics;
3528
3529 dh->minor = minor;
3530 err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
3531 if (err)
3532 goto out;
3533 peer_device_to_info(&peer_device_info, peer_device);
3534 err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
3535 if (err)
3536 goto out;
3537 peer_device_to_statistics(&peer_device_statistics, peer_device);
3538 err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
3539 if (err)
3540 goto out;
3541 cb->args[1] = minor;
3542 cb->args[2] = (long)peer_device;
3543 }
3544 genlmsg_end(skb, dh);
3545 err = 0;
3546
3547out:
3548 rcu_read_unlock();
3549 if (err)
3550 return err;
3551 return skb->len;
3552}
3553/*
2913 * Return the connection of @resource if @resource has exactly one connection. 3554 * Return the connection of @resource if @resource has exactly one connection.
2914 */ 3555 */
2915static struct drbd_connection *the_only_connection(struct drbd_resource *resource) 3556static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
@@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx)
3414 return NO_ERROR; 4055 return NO_ERROR;
3415} 4056}
3416 4057
4058static void resource_to_info(struct resource_info *info,
4059 struct drbd_resource *resource)
4060{
4061 info->res_role = conn_highest_role(first_connection(resource));
4062 info->res_susp = resource->susp;
4063 info->res_susp_nod = resource->susp_nod;
4064 info->res_susp_fen = resource->susp_fen;
4065}
4066
3417int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) 4067int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3418{ 4068{
4069 struct drbd_connection *connection;
3419 struct drbd_config_context adm_ctx; 4070 struct drbd_config_context adm_ctx;
3420 enum drbd_ret_code retcode; 4071 enum drbd_ret_code retcode;
3421 struct res_opts res_opts; 4072 struct res_opts res_opts;
@@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3449 } 4100 }
3450 4101
3451 /* not yet safe for genl_family.parallel_ops */ 4102 /* not yet safe for genl_family.parallel_ops */
3452 if (!conn_create(adm_ctx.resource_name, &res_opts)) 4103 mutex_lock(&resources_mutex);
4104 connection = conn_create(adm_ctx.resource_name, &res_opts);
4105 mutex_unlock(&resources_mutex);
4106
4107 if (connection) {
4108 struct resource_info resource_info;
4109
4110 mutex_lock(&notification_mutex);
4111 resource_to_info(&resource_info, connection->resource);
4112 notify_resource_state(NULL, 0, connection->resource,
4113 &resource_info, NOTIFY_CREATE);
4114 mutex_unlock(&notification_mutex);
4115 } else
3453 retcode = ERR_NOMEM; 4116 retcode = ERR_NOMEM;
4117
3454out: 4118out:
3455 drbd_adm_finish(&adm_ctx, info, retcode); 4119 drbd_adm_finish(&adm_ctx, info, retcode);
3456 return 0; 4120 return 0;
3457} 4121}
3458 4122
4123static void device_to_info(struct device_info *info,
4124 struct drbd_device *device)
4125{
4126 info->dev_disk_state = device->state.disk;
4127}
4128
4129
3459int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) 4130int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3460{ 4131{
3461 struct drbd_config_context adm_ctx; 4132 struct drbd_config_context adm_ctx;
@@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3490 4161
3491 mutex_lock(&adm_ctx.resource->adm_mutex); 4162 mutex_lock(&adm_ctx.resource->adm_mutex);
3492 retcode = drbd_create_device(&adm_ctx, dh->minor); 4163 retcode = drbd_create_device(&adm_ctx, dh->minor);
4164 if (retcode == NO_ERROR) {
4165 struct drbd_device *device;
4166 struct drbd_peer_device *peer_device;
4167 struct device_info info;
4168 unsigned int peer_devices = 0;
4169 enum drbd_notification_type flags;
4170
4171 device = minor_to_device(dh->minor);
4172 for_each_peer_device(peer_device, device) {
4173 if (!has_net_conf(peer_device->connection))
4174 continue;
4175 peer_devices++;
4176 }
4177
4178 device_to_info(&info, device);
4179 mutex_lock(&notification_mutex);
4180 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
4181 notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
4182 for_each_peer_device(peer_device, device) {
4183 struct peer_device_info peer_device_info;
4184
4185 if (!has_net_conf(peer_device->connection))
4186 continue;
4187 peer_device_to_info(&peer_device_info, peer_device);
4188 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
4189 notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
4190 NOTIFY_CREATE | flags);
4191 }
4192 mutex_unlock(&notification_mutex);
4193 }
3493 mutex_unlock(&adm_ctx.resource->adm_mutex); 4194 mutex_unlock(&adm_ctx.resource->adm_mutex);
3494out: 4195out:
3495 drbd_adm_finish(&adm_ctx, info, retcode); 4196 drbd_adm_finish(&adm_ctx, info, retcode);
@@ -3498,13 +4199,35 @@ out:
3498 4199
3499static enum drbd_ret_code adm_del_minor(struct drbd_device *device) 4200static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3500{ 4201{
4202 struct drbd_peer_device *peer_device;
4203
3501 if (device->state.disk == D_DISKLESS && 4204 if (device->state.disk == D_DISKLESS &&
3502 /* no need to be device->state.conn == C_STANDALONE && 4205 /* no need to be device->state.conn == C_STANDALONE &&
3503 * we may want to delete a minor from a live replication group. 4206 * we may want to delete a minor from a live replication group.
3504 */ 4207 */
3505 device->state.role == R_SECONDARY) { 4208 device->state.role == R_SECONDARY) {
4209 struct drbd_connection *connection =
4210 first_connection(device->resource);
4211
3506 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), 4212 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
3507 CS_VERBOSE + CS_WAIT_COMPLETE); 4213 CS_VERBOSE + CS_WAIT_COMPLETE);
4214
4215 /* If the state engine hasn't stopped the sender thread yet, we
4216 * need to flush the sender work queue before generating the
4217 * DESTROY events here. */
4218 if (get_t_state(&connection->worker) == RUNNING)
4219 drbd_flush_workqueue(&connection->sender_work);
4220
4221 mutex_lock(&notification_mutex);
4222 for_each_peer_device(peer_device, device) {
4223 if (!has_net_conf(peer_device->connection))
4224 continue;
4225 notify_peer_device_state(NULL, 0, peer_device, NULL,
4226 NOTIFY_DESTROY | NOTIFY_CONTINUES);
4227 }
4228 notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
4229 mutex_unlock(&notification_mutex);
4230
3508 drbd_delete_device(device); 4231 drbd_delete_device(device);
3509 return NO_ERROR; 4232 return NO_ERROR;
3510 } else 4233 } else
@@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource)
3541 if (!idr_is_empty(&resource->devices)) 4264 if (!idr_is_empty(&resource->devices))
3542 return ERR_RES_IN_USE; 4265 return ERR_RES_IN_USE;
3543 4266
4267 /* The state engine has stopped the sender thread, so we don't
4268 * need to flush the sender work queue before generating the
4269 * DESTROY event here. */
4270 mutex_lock(&notification_mutex);
4271 notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
4272 mutex_unlock(&notification_mutex);
4273
4274 mutex_lock(&resources_mutex);
3544 list_del_rcu(&resource->resources); 4275 list_del_rcu(&resource->resources);
4276 mutex_unlock(&resources_mutex);
3545 /* Make sure all threads have actually stopped: state handling only 4277 /* Make sure all threads have actually stopped: state handling only
3546 * does drbd_thread_stop_nowait(). */ 4278 * does drbd_thread_stop_nowait(). */
3547 list_for_each_entry(connection, &resource->connections, connections) 4279 list_for_each_entry(connection, &resource->connections, connections)
@@ -3637,7 +4369,6 @@ finish:
3637 4369
3638void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) 4370void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3639{ 4371{
3640 static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3641 struct sk_buff *msg; 4372 struct sk_buff *msg;
3642 struct drbd_genlmsghdr *d_out; 4373 struct drbd_genlmsghdr *d_out;
3643 unsigned seq; 4374 unsigned seq;
@@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3658 if (nla_put_status_info(msg, device, sib)) 4389 if (nla_put_status_info(msg, device, sib))
3659 goto nla_put_failure; 4390 goto nla_put_failure;
3660 genlmsg_end(msg, d_out); 4391 genlmsg_end(msg, d_out);
3661 err = drbd_genl_multicast_events(msg, 0); 4392 err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
3662 /* msg has been consumed or freed in netlink_broadcast() */ 4393 /* msg has been consumed or freed in netlink_broadcast() */
3663 if (err && err != -ESRCH) 4394 if (err && err != -ESRCH)
3664 goto failed; 4395 goto failed;
@@ -3672,3 +4403,405 @@ failed:
3672 "Event seq:%u sib_reason:%u\n", 4403 "Event seq:%u sib_reason:%u\n",
3673 err, seq, sib->sib_reason); 4404 err, seq, sib->sib_reason);
3674} 4405}
4406
4407static int nla_put_notification_header(struct sk_buff *msg,
4408 enum drbd_notification_type type)
4409{
4410 struct drbd_notification_header nh = {
4411 .nh_type = type,
4412 };
4413
4414 return drbd_notification_header_to_skb(msg, &nh, true);
4415}
4416
4417void notify_resource_state(struct sk_buff *skb,
4418 unsigned int seq,
4419 struct drbd_resource *resource,
4420 struct resource_info *resource_info,
4421 enum drbd_notification_type type)
4422{
4423 struct resource_statistics resource_statistics;
4424 struct drbd_genlmsghdr *dh;
4425 bool multicast = false;
4426 int err;
4427
4428 if (!skb) {
4429 seq = atomic_inc_return(&notify_genl_seq);
4430 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4431 err = -ENOMEM;
4432 if (!skb)
4433 goto failed;
4434 multicast = true;
4435 }
4436
4437 err = -EMSGSIZE;
4438 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
4439 if (!dh)
4440 goto nla_put_failure;
4441 dh->minor = -1U;
4442 dh->ret_code = NO_ERROR;
4443 if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
4444 nla_put_notification_header(skb, type) ||
4445 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4446 resource_info_to_skb(skb, resource_info, true)))
4447 goto nla_put_failure;
4448 resource_statistics.res_stat_write_ordering = resource->write_ordering;
4449 err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
4450 if (err)
4451 goto nla_put_failure;
4452 genlmsg_end(skb, dh);
4453 if (multicast) {
4454 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4455 /* skb has been consumed or freed in netlink_broadcast() */
4456 if (err && err != -ESRCH)
4457 goto failed;
4458 }
4459 return;
4460
4461nla_put_failure:
4462 nlmsg_free(skb);
4463failed:
4464 drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
4465 err, seq);
4466}
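
notify_resource_state() and the three siblings that follow serve two callers: with skb == NULL they allocate a fresh message and multicast it (spontaneous state events), while with a caller-supplied skb they append to a dump reply that is already being built. A small model of that broadcast-or-append dual use, with no netlink involved and every name invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void notify_state(char *reply_buf, size_t len, const char *event)
{
	char *buf = reply_buf;
	int multicast = 0;

	if (!buf) {			/* standalone event: build our own buffer */
		len = 128;
		buf = malloc(len);
		if (!buf)
			return;
		buf[0] = '\0';
		multicast = 1;
	}
	strncat(buf, event, len - strlen(buf) - 1);

	if (multicast) {		/* ...and broadcast plus free it ourselves */
		printf("multicast: %s\n", buf);
		free(buf);
	}				/* else: the caller owns the reply buffer */
}

int main(void)
{
	char reply[64] = "dump: ";

	notify_state(NULL, 0, "resource created");		/* event path */
	notify_state(reply, sizeof(reply), "resource exists");	/* dump path */
	printf("%s\n", reply);
	return 0;
}
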
4467
4468void notify_device_state(struct sk_buff *skb,
4469 unsigned int seq,
4470 struct drbd_device *device,
4471 struct device_info *device_info,
4472 enum drbd_notification_type type)
4473{
4474 struct device_statistics device_statistics;
4475 struct drbd_genlmsghdr *dh;
4476 bool multicast = false;
4477 int err;
4478
4479 if (!skb) {
4480 seq = atomic_inc_return(&notify_genl_seq);
4481 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4482 err = -ENOMEM;
4483 if (!skb)
4484 goto failed;
4485 multicast = true;
4486 }
4487
4488 err = -EMSGSIZE;
4489 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
4490 if (!dh)
4491 goto nla_put_failure;
4492 dh->minor = device->minor;
4493 dh->ret_code = NO_ERROR;
4494 if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
4495 nla_put_notification_header(skb, type) ||
4496 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4497 device_info_to_skb(skb, device_info, true)))
4498 goto nla_put_failure;
4499 device_to_statistics(&device_statistics, device);
4500 device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
4501 genlmsg_end(skb, dh);
4502 if (multicast) {
4503 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4504 /* skb has been consumed or freed in netlink_broadcast() */
4505 if (err && err != -ESRCH)
4506 goto failed;
4507 }
4508 return;
4509
4510nla_put_failure:
4511 nlmsg_free(skb);
4512failed:
4513 drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
4514 err, seq);
4515}
4516
4517void notify_connection_state(struct sk_buff *skb,
4518 unsigned int seq,
4519 struct drbd_connection *connection,
4520 struct connection_info *connection_info,
4521 enum drbd_notification_type type)
4522{
4523 struct connection_statistics connection_statistics;
4524 struct drbd_genlmsghdr *dh;
4525 bool multicast = false;
4526 int err;
4527
4528 if (!skb) {
4529 seq = atomic_inc_return(&notify_genl_seq);
4530 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4531 err = -ENOMEM;
4532 if (!skb)
4533 goto failed;
4534 multicast = true;
4535 }
4536
4537 err = -EMSGSIZE;
4538 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
4539 if (!dh)
4540 goto nla_put_failure;
4541 dh->minor = -1U;
4542 dh->ret_code = NO_ERROR;
4543 if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
4544 nla_put_notification_header(skb, type) ||
4545 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4546 connection_info_to_skb(skb, connection_info, true)))
4547 goto nla_put_failure;
4548 connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
4549 connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
4550 genlmsg_end(skb, dh);
4551 if (multicast) {
4552 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4553 /* skb has been consumed or freed in netlink_broadcast() */
4554 if (err && err != -ESRCH)
4555 goto failed;
4556 }
4557 return;
4558
4559nla_put_failure:
4560 nlmsg_free(skb);
4561failed:
4562 drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
4563 err, seq);
4564}
4565
4566void notify_peer_device_state(struct sk_buff *skb,
4567 unsigned int seq,
4568 struct drbd_peer_device *peer_device,
4569 struct peer_device_info *peer_device_info,
4570 enum drbd_notification_type type)
4571{
4572 struct peer_device_statistics peer_device_statistics;
4573 struct drbd_resource *resource = peer_device->device->resource;
4574 struct drbd_genlmsghdr *dh;
4575 bool multicast = false;
4576 int err;
4577
4578 if (!skb) {
4579 seq = atomic_inc_return(&notify_genl_seq);
4580 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4581 err = -ENOMEM;
4582 if (!skb)
4583 goto failed;
4584 multicast = true;
4585 }
4586
4587 err = -EMSGSIZE;
4588 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
4589 if (!dh)
4590 goto nla_put_failure;
4591 dh->minor = -1U;
4592 dh->ret_code = NO_ERROR;
4593 if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
4594 nla_put_notification_header(skb, type) ||
4595 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4596 peer_device_info_to_skb(skb, peer_device_info, true)))
4597 goto nla_put_failure;
4598 peer_device_to_statistics(&peer_device_statistics, peer_device);
4599 peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
4600 genlmsg_end(skb, dh);
4601 if (multicast) {
4602 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4603 /* skb has been consumed or freed in netlink_broadcast() */
4604 if (err && err != -ESRCH)
4605 goto failed;
4606 }
4607 return;
4608
4609nla_put_failure:
4610 nlmsg_free(skb);
4611failed:
4612 drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
4613 err, seq);
4614}
4615
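
The three notify_*_state helpers above share one skeleton; the only differences are the genetlink command, the cfg-context arguments, and the per-object info/statistics attributes. As a reader's aid (not a function in the patch), here is that skeleton condensed into one hypothetical routine, using notify_device_state() as the model and the drbd-internal helpers already declared above:

	static void notify_object_state_skeleton(struct sk_buff *skb, unsigned int seq,
						 struct drbd_device *device,
						 enum drbd_notification_type type)
	{
		struct drbd_genlmsghdr *dh;
		bool multicast = false;
		int err = -ENOMEM;

		if (!skb) {		/* unsolicited event: build and broadcast our own skb */
			seq = atomic_inc_return(&notify_genl_seq);
			skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
			if (!skb)
				goto failed;
			multicast = true;
		}			/* else: we are filling in someone else's dump reply */

		err = -EMSGSIZE;
		dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
		if (!dh)
			goto nla_put_failure;
		dh->minor = device->minor;	/* -1U for objects without a minor */
		dh->ret_code = NO_ERROR;
		/* nla_put_drbd_cfg_context(), nla_put_notification_header(),
		 * the object info (skipped for NOTIFY_DESTROY) and the
		 * statistics attributes go here, as in the functions above. */
		genlmsg_end(skb, dh);
		if (multicast) {
			err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
			if (err && err != -ESRCH)	/* -ESRCH: nobody listening */
				goto failed;
		}
		return;

	nla_put_failure:
		nlmsg_free(skb);
	failed:
		drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", err, seq);
	}

The skb == NULL case is the "spontaneous state change" broadcast path; the non-NULL case is used when these helpers are called from the initial-state dump further down.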
4616void notify_helper(enum drbd_notification_type type,
4617 struct drbd_device *device, struct drbd_connection *connection,
4618 const char *name, int status)
4619{
4620 struct drbd_resource *resource = device ? device->resource : connection->resource;
4621 struct drbd_helper_info helper_info;
4622 unsigned int seq = atomic_inc_return(&notify_genl_seq);
4623 struct sk_buff *skb = NULL;
4624 struct drbd_genlmsghdr *dh;
4625 int err;
4626
4627 strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
4628 helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
4629 helper_info.helper_status = status;
4630
4631 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4632 err = -ENOMEM;
4633 if (!skb)
4634 goto fail;
4635
4636 err = -EMSGSIZE;
4637 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
4638 if (!dh)
4639 goto fail;
4640 dh->minor = device ? device->minor : -1;
4641 dh->ret_code = NO_ERROR;
4642 mutex_lock(&notification_mutex);
4643 if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
4644 nla_put_notification_header(skb, type) ||
4645 drbd_helper_info_to_skb(skb, &helper_info, true))
4646 goto unlock_fail;
4647 genlmsg_end(skb, dh);
4648 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4649 skb = NULL;
4650 /* skb has been consumed or freed in netlink_broadcast() */
4651 if (err && err != -ESRCH)
4652 goto unlock_fail;
4653 mutex_unlock(&notification_mutex);
4654 return;
4655
4656unlock_fail:
4657 mutex_unlock(&notification_mutex);
4658fail:
4659 nlmsg_free(skb);
4660 drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
4661 err, seq);
4662}
4663
4664static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
4665{
4666 struct drbd_genlmsghdr *dh;
4667 int err;
4668
4669 err = -EMSGSIZE;
4670 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
4671 if (!dh)
4672 goto nla_put_failure;
4673 dh->minor = -1U;
4674 dh->ret_code = NO_ERROR;
4675 if (nla_put_notification_header(skb, NOTIFY_EXISTS))
4676 goto nla_put_failure;
4677 genlmsg_end(skb, dh);
4678 return;
4679
4680nla_put_failure:
4681 nlmsg_free(skb);
4682 pr_err("Error %d sending event. Event seq:%u\n", err, seq);
4683}
4684
4685static void free_state_changes(struct list_head *list)
4686{
4687 while (!list_empty(list)) {
4688 struct drbd_state_change *state_change =
4689 list_first_entry(list, struct drbd_state_change, list);
4690 list_del(&state_change->list);
4691 forget_state_change(state_change);
4692 }
4693}
4694
4695static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
4696{
4697 return 1 +
4698 state_change->n_connections +
4699 state_change->n_devices +
4700 state_change->n_devices * state_change->n_connections;
4701}
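
notifications_for_state_change() counts one resource event plus one event per connection, one per device, and one per (device, connection) pair. For example, 2 connections and 3 devices yield 1 + 2 + 3 + 2*3 = 12 notifications. A stand-alone arithmetic check (user-space, hypothetical name):

	#include <stdio.h>

	/* Mirrors notifications_for_state_change(): resource + connections +
	 * devices + peer devices (one per device/connection pair). */
	static unsigned int example_notification_count(unsigned int n_connections,
						       unsigned int n_devices)
	{
		return 1 + n_connections + n_devices + n_devices * n_connections;
	}

	int main(void)
	{
		/* 2 connections, 3 devices -> 1 + 2 + 3 + 6 = 12 dump entries */
		printf("%u\n", example_notification_count(2, 3));
		return 0;
	}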
4702
4703static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
4704{
4705 struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
4706 unsigned int seq = cb->args[2];
4707 unsigned int n;
4708 enum drbd_notification_type flags = 0;
4709
4710 /* There is no need for taking notification_mutex here: it doesn't
 4711 matter if the initial state events mix with later state change
4712 events; we can always tell the events apart by the NOTIFY_EXISTS
4713 flag. */
4714
4715 cb->args[5]--;
4716 if (cb->args[5] == 1) {
4717 notify_initial_state_done(skb, seq);
4718 goto out;
4719 }
4720 n = cb->args[4]++;
4721 if (cb->args[4] < cb->args[3])
4722 flags |= NOTIFY_CONTINUES;
4723 if (n < 1) {
4724 notify_resource_state_change(skb, seq, state_change->resource,
4725 NOTIFY_EXISTS | flags);
4726 goto next;
4727 }
4728 n--;
4729 if (n < state_change->n_connections) {
4730 notify_connection_state_change(skb, seq, &state_change->connections[n],
4731 NOTIFY_EXISTS | flags);
4732 goto next;
4733 }
4734 n -= state_change->n_connections;
4735 if (n < state_change->n_devices) {
4736 notify_device_state_change(skb, seq, &state_change->devices[n],
4737 NOTIFY_EXISTS | flags);
4738 goto next;
4739 }
4740 n -= state_change->n_devices;
4741 if (n < state_change->n_devices * state_change->n_connections) {
4742 notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
4743 NOTIFY_EXISTS | flags);
4744 goto next;
4745 }
4746
4747next:
4748 if (cb->args[4] == cb->args[3]) {
4749 struct drbd_state_change *next_state_change =
4750 list_entry(state_change->list.next,
4751 struct drbd_state_change, list);
4752 cb->args[0] = (long)next_state_change;
4753 cb->args[3] = notifications_for_state_change(next_state_change);
4754 cb->args[4] = 0;
4755 }
4756out:
4757 return skb->len;
4758}
4759
4760int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
4761{
4762 struct drbd_resource *resource;
4763 LIST_HEAD(head);
4764
4765 if (cb->args[5] >= 1) {
4766 if (cb->args[5] > 1)
4767 return get_initial_state(skb, cb);
4768 if (cb->args[0]) {
4769 struct drbd_state_change *state_change =
4770 (struct drbd_state_change *)cb->args[0];
4771
4772 /* connect list to head */
4773 list_add(&head, &state_change->list);
4774 free_state_changes(&head);
4775 }
4776 return 0;
4777 }
4778
4779 cb->args[5] = 2; /* number of iterations */
4780 mutex_lock(&resources_mutex);
4781 for_each_resource(resource, &drbd_resources) {
4782 struct drbd_state_change *state_change;
4783
4784 state_change = remember_old_state(resource, GFP_KERNEL);
4785 if (!state_change) {
4786 if (!list_empty(&head))
4787 free_state_changes(&head);
4788 mutex_unlock(&resources_mutex);
4789 return -ENOMEM;
4790 }
4791 copy_old_to_new_state_change(state_change);
4792 list_add_tail(&state_change->list, &head);
4793 cb->args[5] += notifications_for_state_change(state_change);
4794 }
4795 mutex_unlock(&resources_mutex);
4796
4797 if (!list_empty(&head)) {
4798 struct drbd_state_change *state_change =
4799 list_entry(head.next, struct drbd_state_change, list);
4800 cb->args[0] = (long)state_change;
4801 cb->args[3] = notifications_for_state_change(state_change);
4802 list_del(&head); /* detach list from head */
4803 }
4804
4805 cb->args[2] = cb->nlh->nlmsg_seq;
4806 return get_initial_state(skb, cb);
4807}
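
As a reader's aid (not part of the patch): drbd_adm_get_initial_state() and get_initial_state() communicate through the scratch slots of struct netlink_callback. The enum below only names the slot assignments visible in the two functions above; the identifiers are illustrative, not from the source.

	/* cb->args[] usage in the initial-state dump (names are hypothetical) */
	enum {
		ARG_STATE_CHANGE = 0,	/* current struct drbd_state_change *, cast to long */
		ARG_SEQ          = 2,	/* nlmsg_seq copied from the dump request */
		ARG_N_IN_CHANGE  = 3,	/* notifications in the current state change */
		ARG_NEXT_INDEX   = 4,	/* next notification to emit within it */
		ARG_REMAINING    = 5,	/* starts at 2 + total notifications; one event is
					 * emitted per dump call, DRBD_INITIAL_STATE_DONE is
					 * sent when it reaches 1, and the following call
					 * frees the snapshot and ends the dump */
	};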
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 3b10fa6cb039..6537b25db9c1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
245 char wp; 245 char wp;
246 246
247 static char write_ordering_chars[] = { 247 static char write_ordering_chars[] = {
248 [WO_none] = 'n', 248 [WO_NONE] = 'n',
249 [WO_drain_io] = 'd', 249 [WO_DRAIN_IO] = 'd',
250 [WO_bdev_flush] = 'f', 250 [WO_BDEV_FLUSH] = 'f',
251 }; 251 };
252 252
253 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 253 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 2da9104a3851..ef9245363dcc 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -23,7 +23,7 @@ enum drbd_packet {
23 P_AUTH_RESPONSE = 0x11, 23 P_AUTH_RESPONSE = 0x11,
24 P_STATE_CHG_REQ = 0x12, 24 P_STATE_CHG_REQ = 0x12,
25 25
26 /* asender (meta socket */ 26 /* (meta socket) */
27 P_PING = 0x13, 27 P_PING = 0x13,
28 P_PING_ACK = 0x14, 28 P_PING_ACK = 0x14,
29 P_RECV_ACK = 0x15, /* Used in protocol B */ 29 P_RECV_ACK = 0x15, /* Used in protocol B */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b4b5680ac6ad..1957fe8601dc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
215 } 215 }
216} 216}
217 217
218static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) 218static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
219{ 219{
220 LIST_HEAD(reclaimed); 220 LIST_HEAD(reclaimed);
221 struct drbd_peer_request *peer_req, *t; 221 struct drbd_peer_request *peer_req, *t;
@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
223 spin_lock_irq(&device->resource->req_lock); 223 spin_lock_irq(&device->resource->req_lock);
224 reclaim_finished_net_peer_reqs(device, &reclaimed); 224 reclaim_finished_net_peer_reqs(device, &reclaimed);
225 spin_unlock_irq(&device->resource->req_lock); 225 spin_unlock_irq(&device->resource->req_lock);
226
227 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 226 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
228 drbd_free_net_peer_req(device, peer_req); 227 drbd_free_net_peer_req(device, peer_req);
229} 228}
230 229
230static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
231{
232 struct drbd_peer_device *peer_device;
233 int vnr;
234
235 rcu_read_lock();
236 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
237 struct drbd_device *device = peer_device->device;
238 if (!atomic_read(&device->pp_in_use_by_net))
239 continue;
240
241 kref_get(&device->kref);
242 rcu_read_unlock();
243 drbd_reclaim_net_peer_reqs(device);
244 kref_put(&device->kref, drbd_destroy_device);
245 rcu_read_lock();
246 }
247 rcu_read_unlock();
248}
249
231/** 250/**
232 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) 251 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
233 * @device: DRBD device. 252 * @device: DRBD device.
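
The new conn_reclaim_net_peer_reqs() above uses the usual construct for calling a possibly sleeping function from inside an RCU-protected IDR walk; a short annotation of that pattern, not additional code:

	/*
	 * Pattern in conn_reclaim_net_peer_reqs():
	 *   - the peer_devices IDR is walked under rcu_read_lock(), so entries
	 *     cannot disappear while we inspect them, but we must not sleep;
	 *   - drbd_reclaim_net_peer_reqs() takes req_lock and may sleep, so the
	 *     loop first pins the device with kref_get(), drops the RCU read
	 *     lock, does the reclaim, drops the reference with kref_put(), and
	 *     re-acquires rcu_read_lock() before continuing the walk.
	 */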
@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
265 if (atomic_read(&device->pp_in_use) < mxb) 284 if (atomic_read(&device->pp_in_use) < mxb)
266 page = __drbd_alloc_pages(device, number); 285 page = __drbd_alloc_pages(device, number);
267 286
287 /* Try to keep the fast path fast, but occasionally we need
 288 * to reclaim the pages we lent to the network stack. */
289 if (page && atomic_read(&device->pp_in_use_by_net) > 512)
290 drbd_reclaim_net_peer_reqs(device);
291
268 while (page == NULL) { 292 while (page == NULL) {
269 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 293 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
270 294
271 drbd_kick_lo_and_reclaim_net(device); 295 drbd_reclaim_net_peer_reqs(device);
272 296
273 if (atomic_read(&device->pp_in_use) < mxb) { 297 if (atomic_read(&device->pp_in_use) < mxb) {
274 page = __drbd_alloc_pages(device, number); 298 page = __drbd_alloc_pages(device, number);
@@ -1099,7 +1123,15 @@ randomize:
1099 return 0; 1123 return 0;
1100 } 1124 }
1101 1125
1102 drbd_thread_start(&connection->asender); 1126 drbd_thread_start(&connection->ack_receiver);
1127 /* opencoded create_singlethread_workqueue(),
1128 * to be able to use format string arguments */
1129 connection->ack_sender =
1130 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
1131 if (!connection->ack_sender) {
1132 drbd_err(connection, "Failed to create workqueue ack_sender\n");
1133 return 0;
1134 }
1103 1135
1104 mutex_lock(&connection->resource->conf_update); 1136 mutex_lock(&connection->resource->conf_update);
1105 /* The discard_my_data flag is a single-shot modifier to the next 1137 /* The discard_my_data flag is a single-shot modifier to the next
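
The hunk above replaces the asender thread start with an ack_receiver thread plus a per-connection ordered workqueue. A minimal, in-kernel-context illustration of the workqueue call it uses (the example_ function name is not from the patch):

	#include <linux/workqueue.h>

	/* Equivalent of create_singlethread_workqueue(), but with a printf-style
	 * name so the resource name shows up in the workqueue/worker names.
	 * WQ_MEM_RECLAIM: the queue sits in the block I/O completion path and
	 * must be able to make forward progress under memory pressure. */
	static struct workqueue_struct *example_alloc_ack_sender(const char *resource_name)
	{
		return alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, resource_name);
	}

The matching destroy_workqueue() for connection->ack_sender is added in conn_disconnect() further down in this patch.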
@@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection)
1178 struct drbd_peer_device *peer_device; 1210 struct drbd_peer_device *peer_device;
1179 int vnr; 1211 int vnr;
1180 1212
1181 if (connection->resource->write_ordering >= WO_bdev_flush) { 1213 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1182 rcu_read_lock(); 1214 rcu_read_lock();
1183 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1215 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1184 struct drbd_device *device = peer_device->device; 1216 struct drbd_device *device = peer_device->device;
@@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection)
1203 /* would rather check on EOPNOTSUPP, but that is not reliable. 1235 /* would rather check on EOPNOTSUPP, but that is not reliable.
1204 * don't try again for ANY return value != 0 1236 * don't try again for ANY return value != 0
1205 * if (rv == -EOPNOTSUPP) */ 1237 * if (rv == -EOPNOTSUPP) */
1206 drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); 1238 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1207 } 1239 }
1208 put_ldev(device); 1240 put_ldev(device);
1209 kref_put(&device->kref, drbd_destroy_device); 1241 kref_put(&device->kref, drbd_destroy_device);
@@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1299 1331
1300 dc = rcu_dereference(bdev->disk_conf); 1332 dc = rcu_dereference(bdev->disk_conf);
1301 1333
1302 if (wo == WO_bdev_flush && !dc->disk_flushes) 1334 if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1303 wo = WO_drain_io; 1335 wo = WO_DRAIN_IO;
1304 if (wo == WO_drain_io && !dc->disk_drain) 1336 if (wo == WO_DRAIN_IO && !dc->disk_drain)
1305 wo = WO_none; 1337 wo = WO_NONE;
1306 1338
1307 return wo; 1339 return wo;
1308} 1340}
@@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1319 enum write_ordering_e pwo; 1351 enum write_ordering_e pwo;
1320 int vnr; 1352 int vnr;
1321 static char *write_ordering_str[] = { 1353 static char *write_ordering_str[] = {
1322 [WO_none] = "none", 1354 [WO_NONE] = "none",
1323 [WO_drain_io] = "drain", 1355 [WO_DRAIN_IO] = "drain",
1324 [WO_bdev_flush] = "flush", 1356 [WO_BDEV_FLUSH] = "flush",
1325 }; 1357 };
1326 1358
1327 pwo = resource->write_ordering; 1359 pwo = resource->write_ordering;
1328 if (wo != WO_bdev_flush) 1360 if (wo != WO_BDEV_FLUSH)
1329 wo = min(pwo, wo); 1361 wo = min(pwo, wo);
1330 rcu_read_lock(); 1362 rcu_read_lock();
1331 idr_for_each_entry(&resource->devices, device, vnr) { 1363 idr_for_each_entry(&resource->devices, device, vnr) {
@@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1343 rcu_read_unlock(); 1375 rcu_read_unlock();
1344 1376
1345 resource->write_ordering = wo; 1377 resource->write_ordering = wo;
1346 if (pwo != resource->write_ordering || wo == WO_bdev_flush) 1378 if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1347 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1379 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1348} 1380}
1349 1381
@@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
1380 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { 1412 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
1381 /* wait for all pending IO completions, before we start 1413 /* wait for all pending IO completions, before we start
1382 * zeroing things out. */ 1414 * zeroing things out. */
1383 conn_wait_active_ee_empty(first_peer_device(device)->connection); 1415 conn_wait_active_ee_empty(peer_req->peer_device->connection);
1384 /* add it to the active list now, 1416 /* add it to the active list now,
1385 * so we can find it to present it in debugfs */ 1417 * so we can find it to present it in debugfs */
1386 peer_req->submit_jif = jiffies; 1418 peer_req->submit_jif = jiffies;
@@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1508 rcu_read_unlock(); 1540 rcu_read_unlock();
1509} 1541}
1510 1542
1511static struct drbd_peer_device *
1512conn_peer_device(struct drbd_connection *connection, int volume_number)
1513{
1514 return idr_find(&connection->peer_devices, volume_number);
1515}
1516
1517static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) 1543static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1518{ 1544{
1519 int rv; 1545 int rv;
@@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1533 * Therefore we must send the barrier_ack after the barrier request was 1559 * Therefore we must send the barrier_ack after the barrier request was
1534 * completed. */ 1560 * completed. */
1535 switch (connection->resource->write_ordering) { 1561 switch (connection->resource->write_ordering) {
1536 case WO_none: 1562 case WO_NONE:
1537 if (rv == FE_RECYCLED) 1563 if (rv == FE_RECYCLED)
1538 return 0; 1564 return 0;
1539 1565
@@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1546 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); 1572 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1547 /* Fall through */ 1573 /* Fall through */
1548 1574
1549 case WO_bdev_flush: 1575 case WO_BDEV_FLUSH:
1550 case WO_drain_io: 1576 case WO_DRAIN_IO:
1551 conn_wait_active_ee_empty(connection); 1577 conn_wait_active_ee_empty(connection);
1552 drbd_flush(connection); 1578 drbd_flush(connection);
1553 1579
@@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
1752} 1778}
1753 1779
1754/* 1780/*
1755 * e_end_resync_block() is called in asender context via 1781 * e_end_resync_block() is called in ack_sender context via
1756 * drbd_finish_peer_reqs(). 1782 * drbd_finish_peer_reqs().
1757 */ 1783 */
1758static int e_end_resync_block(struct drbd_work *w, int unused) 1784static int e_end_resync_block(struct drbd_work *w, int unused)
@@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
1926} 1952}
1927 1953
1928/* 1954/*
1929 * e_end_block() is called in asender context via drbd_finish_peer_reqs(). 1955 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
1930 */ 1956 */
1931static int e_end_block(struct drbd_work *w, int cancel) 1957static int e_end_block(struct drbd_work *w, int cancel)
1932{ 1958{
@@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
1966 } else 1992 } else
1967 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 1993 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1968 1994
1969 drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 1995 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1970 1996
1971 return err; 1997 return err;
1972} 1998}
@@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
2098 } 2124 }
2099 2125
2100 rcu_read_lock(); 2126 rcu_read_lock();
2101 tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; 2127 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2102 rcu_read_unlock(); 2128 rcu_read_unlock();
2103 2129
2104 if (!tp) 2130 if (!tp)
@@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device,
2217 peer_req->w.cb = superseded ? e_send_superseded : 2243 peer_req->w.cb = superseded ? e_send_superseded :
2218 e_send_retry_write; 2244 e_send_retry_write;
2219 list_add_tail(&peer_req->w.list, &device->done_ee); 2245 list_add_tail(&peer_req->w.list, &device->done_ee);
2220 wake_asender(connection); 2246 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
2221 2247
2222 err = -ENOENT; 2248 err = -ENOENT;
2223 goto out; 2249 goto out;
@@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2364 if (dp_flags & DP_SEND_RECEIVE_ACK) { 2390 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2365 /* I really don't like it that the receiver thread 2391 /* I really don't like it that the receiver thread
2366 * sends on the msock, but anyways */ 2392 * sends on the msock, but anyways */
2367 drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); 2393 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
2368 } 2394 }
2369 2395
2370 if (tp) { 2396 if (tp) {
@@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
4056 os = ns = drbd_read_state(device); 4082 os = ns = drbd_read_state(device);
4057 spin_unlock_irq(&device->resource->req_lock); 4083 spin_unlock_irq(&device->resource->req_lock);
4058 4084
4059 /* If some other part of the code (asender thread, timeout) 4085 /* If some other part of the code (ack_receiver thread, timeout)
4060 * already decided to close the connection again, 4086 * already decided to close the connection again,
4061 * we must not "re-establish" it here. */ 4087 * we must not "re-establish" it here. */
4062 if (os.conn <= C_TEAR_DOWN) 4088 if (os.conn <= C_TEAR_DOWN)
@@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection)
4661 */ 4687 */
4662 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 4688 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4663 4689
4664 /* asender does not clean up anything. it must not interfere, either */ 4690 /* ack_receiver does not clean up anything. it must not interfere, either */
4665 drbd_thread_stop(&connection->asender); 4691 drbd_thread_stop(&connection->ack_receiver);
4692 if (connection->ack_sender) {
4693 destroy_workqueue(connection->ack_sender);
4694 connection->ack_sender = NULL;
4695 }
4666 drbd_free_sock(connection); 4696 drbd_free_sock(connection);
4667 4697
4668 rcu_read_lock(); 4698 rcu_read_lock();
@@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
5431 return 0; 5461 return 0;
5432} 5462}
5433 5463
5434static int connection_finish_peer_reqs(struct drbd_connection *connection) 5464struct meta_sock_cmd {
5465 size_t pkt_size;
5466 int (*fn)(struct drbd_connection *connection, struct packet_info *);
5467};
5468
5469static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5435{ 5470{
5436 struct drbd_peer_device *peer_device; 5471 long t;
5437 int vnr, not_empty = 0; 5472 struct net_conf *nc;
5438 5473
5439 do { 5474 rcu_read_lock();
5440 clear_bit(SIGNAL_ASENDER, &connection->flags); 5475 nc = rcu_dereference(connection->net_conf);
5441 flush_signals(current); 5476 t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5477 rcu_read_unlock();
5442 5478
5443 rcu_read_lock(); 5479 t *= HZ;
5444 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5480 if (ping_timeout)
5445 struct drbd_device *device = peer_device->device; 5481 t /= 10;
5446 kref_get(&device->kref);
5447 rcu_read_unlock();
5448 if (drbd_finish_peer_reqs(device)) {
5449 kref_put(&device->kref, drbd_destroy_device);
5450 return 1;
5451 }
5452 kref_put(&device->kref, drbd_destroy_device);
5453 rcu_read_lock();
5454 }
5455 set_bit(SIGNAL_ASENDER, &connection->flags);
5456 5482
5457 spin_lock_irq(&connection->resource->req_lock); 5483 connection->meta.socket->sk->sk_rcvtimeo = t;
5458 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5484}
5459 struct drbd_device *device = peer_device->device;
5460 not_empty = !list_empty(&device->done_ee);
5461 if (not_empty)
5462 break;
5463 }
5464 spin_unlock_irq(&connection->resource->req_lock);
5465 rcu_read_unlock();
5466 } while (not_empty);
5467 5485
5468 return 0; 5486static void set_ping_timeout(struct drbd_connection *connection)
5487{
5488 set_rcvtimeo(connection, 1);
5469} 5489}
5470 5490
5471struct asender_cmd { 5491static void set_idle_timeout(struct drbd_connection *connection)
5472 size_t pkt_size; 5492{
5473 int (*fn)(struct drbd_connection *connection, struct packet_info *); 5493 set_rcvtimeo(connection, 0);
5474}; 5494}
5475 5495
5476static struct asender_cmd asender_tbl[] = { 5496static struct meta_sock_cmd ack_receiver_tbl[] = {
5477 [P_PING] = { 0, got_Ping }, 5497 [P_PING] = { 0, got_Ping },
5478 [P_PING_ACK] = { 0, got_PingAck }, 5498 [P_PING_ACK] = { 0, got_PingAck },
5479 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5499 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = {
5493 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, 5513 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
5494}; 5514};
5495 5515
5496int drbd_asender(struct drbd_thread *thi) 5516int drbd_ack_receiver(struct drbd_thread *thi)
5497{ 5517{
5498 struct drbd_connection *connection = thi->connection; 5518 struct drbd_connection *connection = thi->connection;
5499 struct asender_cmd *cmd = NULL; 5519 struct meta_sock_cmd *cmd = NULL;
5500 struct packet_info pi; 5520 struct packet_info pi;
5521 unsigned long pre_recv_jif;
5501 int rv; 5522 int rv;
5502 void *buf = connection->meta.rbuf; 5523 void *buf = connection->meta.rbuf;
5503 int received = 0; 5524 int received = 0;
5504 unsigned int header_size = drbd_header_size(connection); 5525 unsigned int header_size = drbd_header_size(connection);
5505 int expect = header_size; 5526 int expect = header_size;
5506 bool ping_timeout_active = false; 5527 bool ping_timeout_active = false;
5507 struct net_conf *nc;
5508 int ping_timeo, tcp_cork, ping_int;
5509 struct sched_param param = { .sched_priority = 2 }; 5528 struct sched_param param = { .sched_priority = 2 };
5510 5529
5511 rv = sched_setscheduler(current, SCHED_RR, &param); 5530 rv = sched_setscheduler(current, SCHED_RR, &param);
5512 if (rv < 0) 5531 if (rv < 0)
5513 drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); 5532 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
5514 5533
5515 while (get_t_state(thi) == RUNNING) { 5534 while (get_t_state(thi) == RUNNING) {
5516 drbd_thread_current_set_cpu(thi); 5535 drbd_thread_current_set_cpu(thi);
5517 5536
5518 rcu_read_lock(); 5537 conn_reclaim_net_peer_reqs(connection);
5519 nc = rcu_dereference(connection->net_conf);
5520 ping_timeo = nc->ping_timeo;
5521 tcp_cork = nc->tcp_cork;
5522 ping_int = nc->ping_int;
5523 rcu_read_unlock();
5524 5538
5525 if (test_and_clear_bit(SEND_PING, &connection->flags)) { 5539 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5526 if (drbd_send_ping(connection)) { 5540 if (drbd_send_ping(connection)) {
5527 drbd_err(connection, "drbd_send_ping has failed\n"); 5541 drbd_err(connection, "drbd_send_ping has failed\n");
5528 goto reconnect; 5542 goto reconnect;
5529 } 5543 }
5530 connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; 5544 set_ping_timeout(connection);
5531 ping_timeout_active = true; 5545 ping_timeout_active = true;
5532 } 5546 }
5533 5547
5534 /* TODO: conditionally cork; it may hurt latency if we cork without 5548 pre_recv_jif = jiffies;
5535 much to send */
5536 if (tcp_cork)
5537 drbd_tcp_cork(connection->meta.socket);
5538 if (connection_finish_peer_reqs(connection)) {
5539 drbd_err(connection, "connection_finish_peer_reqs() failed\n");
5540 goto reconnect;
5541 }
5542 /* but unconditionally uncork unless disabled */
5543 if (tcp_cork)
5544 drbd_tcp_uncork(connection->meta.socket);
5545
5546 /* short circuit, recv_msg would return EINTR anyways. */
5547 if (signal_pending(current))
5548 continue;
5549
5550 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); 5549 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5551 clear_bit(SIGNAL_ASENDER, &connection->flags);
5552
5553 flush_signals(current);
5554 5550
5555 /* Note: 5551 /* Note:
5556 * -EINTR (on meta) we got a signal 5552 * -EINTR (on meta) we got a signal
@@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi)
5562 * rv < expected: "woken" by signal during receive 5558 * rv < expected: "woken" by signal during receive
5563 * rv == 0 : "connection shut down by peer" 5559 * rv == 0 : "connection shut down by peer"
5564 */ 5560 */
5565received_more:
5566 if (likely(rv > 0)) { 5561 if (likely(rv > 0)) {
5567 received += rv; 5562 received += rv;
5568 buf += rv; 5563 buf += rv;
@@ -5584,8 +5579,7 @@ received_more:
5584 } else if (rv == -EAGAIN) { 5579 } else if (rv == -EAGAIN) {
5585 /* If the data socket received something meanwhile, 5580 /* If the data socket received something meanwhile,
5586 * that is good enough: peer is still alive. */ 5581 * that is good enough: peer is still alive. */
5587 if (time_after(connection->last_received, 5582 if (time_after(connection->last_received, pre_recv_jif))
5588 jiffies - connection->meta.socket->sk->sk_rcvtimeo))
5589 continue; 5583 continue;
5590 if (ping_timeout_active) { 5584 if (ping_timeout_active) {
5591 drbd_err(connection, "PingAck did not arrive in time.\n"); 5585 drbd_err(connection, "PingAck did not arrive in time.\n");
@@ -5594,6 +5588,10 @@ received_more:
5594 set_bit(SEND_PING, &connection->flags); 5588 set_bit(SEND_PING, &connection->flags);
5595 continue; 5589 continue;
5596 } else if (rv == -EINTR) { 5590 } else if (rv == -EINTR) {
5591 /* maybe drbd_thread_stop(): the while condition will notice.
5592 * maybe woken for send_ping: we'll send a ping above,
5593 * and change the rcvtimeo */
5594 flush_signals(current);
5597 continue; 5595 continue;
5598 } else { 5596 } else {
5599 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 5597 drbd_err(connection, "sock_recvmsg returned %d\n", rv);
@@ -5603,8 +5601,8 @@ received_more:
5603 if (received == expect && cmd == NULL) { 5601 if (received == expect && cmd == NULL) {
5604 if (decode_header(connection, connection->meta.rbuf, &pi)) 5602 if (decode_header(connection, connection->meta.rbuf, &pi))
5605 goto reconnect; 5603 goto reconnect;
5606 cmd = &asender_tbl[pi.cmd]; 5604 cmd = &ack_receiver_tbl[pi.cmd];
5607 if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { 5605 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
5608 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", 5606 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5609 cmdname(pi.cmd), pi.cmd); 5607 cmdname(pi.cmd), pi.cmd);
5610 goto disconnect; 5608 goto disconnect;
@@ -5627,9 +5625,8 @@ received_more:
5627 5625
5628 connection->last_received = jiffies; 5626 connection->last_received = jiffies;
5629 5627
5630 if (cmd == &asender_tbl[P_PING_ACK]) { 5628 if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
5631 /* restore idle timeout */ 5629 set_idle_timeout(connection);
5632 connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5633 ping_timeout_active = false; 5630 ping_timeout_active = false;
5634 } 5631 }
5635 5632
@@ -5638,11 +5635,6 @@ received_more:
5638 expect = header_size; 5635 expect = header_size;
5639 cmd = NULL; 5636 cmd = NULL;
5640 } 5637 }
5641 if (test_bit(SEND_PING, &connection->flags))
5642 continue;
5643 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
5644 if (rv > 0)
5645 goto received_more;
5646 } 5638 }
5647 5639
5648 if (0) { 5640 if (0) {
@@ -5654,9 +5646,41 @@ reconnect:
5654disconnect: 5646disconnect:
5655 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 5647 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5656 } 5648 }
5657 clear_bit(SIGNAL_ASENDER, &connection->flags);
5658 5649
5659 drbd_info(connection, "asender terminated\n"); 5650 drbd_info(connection, "ack_receiver terminated\n");
5660 5651
5661 return 0; 5652 return 0;
5662} 5653}
5654
5655void drbd_send_acks_wf(struct work_struct *ws)
5656{
5657 struct drbd_peer_device *peer_device =
5658 container_of(ws, struct drbd_peer_device, send_acks_work);
5659 struct drbd_connection *connection = peer_device->connection;
5660 struct drbd_device *device = peer_device->device;
5661 struct net_conf *nc;
5662 int tcp_cork, err;
5663
5664 rcu_read_lock();
5665 nc = rcu_dereference(connection->net_conf);
5666 tcp_cork = nc->tcp_cork;
5667 rcu_read_unlock();
5668
5669 if (tcp_cork)
5670 drbd_tcp_cork(connection->meta.socket);
5671
5672 err = drbd_finish_peer_reqs(device);
5673 kref_put(&device->kref, drbd_destroy_device);
5674 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
5675 struct work_struct send_acks_work alive, which is in the peer_device object */
5676
5677 if (err) {
5678 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5679 return;
5680 }
5681
5682 if (tcp_cork)
5683 drbd_tcp_uncork(connection->meta.socket);
5684
5685 return;
5686}
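
As a reader's aid (not part of the patch), the pieces of the new ack path fit together as follows:

	/*
	 * Ack path after this patch:
	 *   1. Peer-write completion paths (drbd_endio_write_sec_final(), which
	 *      also takes the device kref released in drbd_send_acks_wf(), and
	 *      handle_write_conflicts() above) no longer wake the asender; they
	 *      queue_work(connection->ack_sender, &peer_device->send_acks_work).
	 *   2. drbd_send_acks_wf() runs from that ordered workqueue: it
	 *      optionally corks the meta socket, sends all pending ACKs via
	 *      drbd_finish_peer_reqs(), then uncorks, so ACKs for several
	 *      requests can share a TCP segment.
	 *   3. drbd_ack_receiver() (the renamed asender thread) is now a pure
	 *      reader of the meta socket: pings, ACK decoding, and timeouts.
	 */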
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3ae2c0086563..2255dcfebd2b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
453 kref_get(&req->kref); /* wait for the DONE */ 453 kref_get(&req->kref); /* wait for the DONE */
454 454
455 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { 455 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
456 /* potentially already completed in the asender thread */ 456 /* potentially already completed in the ack_receiver thread */
457 if (!(s & RQ_NET_DONE)) { 457 if (!(s & RQ_NET_DONE)) {
458 atomic_add(req->i.size >> 9, &device->ap_in_flight); 458 atomic_add(req->i.size >> 9, &device->ap_in_flight);
459 set_if_null_req_not_net_done(peer_device, req); 459 set_if_null_req_not_net_done(peer_device, req);
460 } 460 }
461 if (s & RQ_NET_PENDING) 461 if (req->rq_state & RQ_NET_PENDING)
462 set_if_null_req_ack_pending(peer_device, req); 462 set_if_null_req_ack_pending(peer_device, req);
463 } 463 }
464 464
@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
1095 return false; 1095 return false;
1096} 1096}
1097 1097
1098bool drbd_should_do_remote(union drbd_dev_state s)
1099{
1100 return s.pdsk == D_UP_TO_DATE ||
1101 (s.pdsk >= D_INCONSISTENT &&
1102 s.conn >= C_WF_BITMAP_T &&
1103 s.conn < C_AHEAD);
1104 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
1105 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
1106 states. */
1107}
1108
1109static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
1110{
1111 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
1112 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
1113 since we enter state C_AHEAD only if proto >= 96 */
1114}
1115
1098/* returns number of connections (== 1, for drbd 8.4) 1116/* returns number of connections (== 1, for drbd 8.4)
1099 * expected to actually write this data, 1117 * expected to actually write this data,
1100 * which does NOT include those that we are L_AHEAD for. */ 1118 * which does NOT include those that we are L_AHEAD for. */
@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1149 * stable storage, and this is a WRITE, we may not even submit 1167 * stable storage, and this is a WRITE, we may not even submit
1150 * this bio. */ 1168 * this bio. */
1151 if (get_ldev(device)) { 1169 if (get_ldev(device)) {
1152 req->pre_submit_jif = jiffies;
1153 if (drbd_insert_fault(device, 1170 if (drbd_insert_fault(device,
1154 rw == WRITE ? DRBD_FAULT_DT_WR 1171 rw == WRITE ? DRBD_FAULT_DT_WR
1155 : rw == READ ? DRBD_FAULT_DT_RD 1172 : rw == READ ? DRBD_FAULT_DT_RD
@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1293 &device->pending_master_completion[rw == WRITE]); 1310 &device->pending_master_completion[rw == WRITE]);
1294 if (req->private_bio) { 1311 if (req->private_bio) {
1295 /* needs to be marked within the same spinlock */ 1312 /* needs to be marked within the same spinlock */
1313 req->pre_submit_jif = jiffies;
1296 list_add_tail(&req->req_pending_local, 1314 list_add_tail(&req->req_pending_local,
1297 &device->pending_completion[rw == WRITE]); 1315 &device->pending_completion[rw == WRITE]);
1298 _req_mod(req, TO_BE_SUBMITTED); 1316 _req_mod(req, TO_BE_SUBMITTED);
@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
1513 return BLK_QC_T_NONE; 1531 return BLK_QC_T_NONE;
1514} 1532}
1515 1533
1534static bool net_timeout_reached(struct drbd_request *net_req,
1535 struct drbd_connection *connection,
1536 unsigned long now, unsigned long ent,
1537 unsigned int ko_count, unsigned int timeout)
1538{
1539 struct drbd_device *device = net_req->device;
1540
1541 if (!time_after(now, net_req->pre_send_jif + ent))
1542 return false;
1543
1544 if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
1545 return false;
1546
1547 if (net_req->rq_state & RQ_NET_PENDING) {
1548 drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
1549 jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
1550 return true;
1551 }
1552
1553 /* We received an ACK already (or are using protocol A),
1554 * but are waiting for the epoch closing barrier ack.
1555 * Check if we sent the barrier already. We should not blame the peer
1556 * for being unresponsive, if we did not even ask it yet. */
1557 if (net_req->epoch == connection->send.current_epoch_nr) {
1558 drbd_warn(device,
1559 "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
1560 jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
1561 return false;
1562 }
1563
1564 /* Worst case: we may have been blocked for whatever reason, then
1565 * suddenly are able to send a lot of requests (and epoch separating
1566 * barriers) in quick succession.
1567 * The timestamp of the net_req may be much too old and not correspond
1568 * to the sending time of the relevant unack'ed barrier packet, so
1569 * would trigger a spurious timeout. The latest barrier packet may
1570 * have a too recent timestamp to trigger the timeout, potentially miss
1571 * a timeout. Right now we don't have a place to conveniently store
1572 * these timestamps.
1573 * But in this particular situation, the application requests are still
1574 * completed to upper layers, DRBD should still "feel" responsive.
1575 * No need yet to kill this connection, it may still recover.
1576 * If not, eventually we will have queued enough into the network for
1577 * us to block. From that point of view, the timestamp of the last sent
1578 * barrier packet is relevant enough.
1579 */
1580 if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
1581 drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
1582 connection->send.last_sent_barrier_jif, now,
1583 jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
1584 return true;
1585 }
1586 return false;
1587}
1588
1589/* A request is considered timed out, if
1590 * - we have some effective timeout from the configuration,
1591 * with some state restrictions applied,
1592 * - the oldest request is waiting for a response from the network
1593 * resp. the local disk,
1594 * - the oldest request is in fact older than the effective timeout,
1595 * - the connection was established (resp. disk was attached)
1596 * for longer than the timeout already.
1597 * Note that for 32bit jiffies and very stable connections/disks,
 1598 * we may have a wrap-around, which is caught by
1599 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
1600 *
1601 * Side effect: once per 32bit wrap-around interval, which means every
1602 * ~198 days with 250 HZ, we have a window where the timeout would need
1603 * to expire twice (worst case) to become effective. Good enough.
1604 */
1605
1516void request_timer_fn(unsigned long data) 1606void request_timer_fn(unsigned long data)
1517{ 1607{
1518 struct drbd_device *device = (struct drbd_device *) data; 1608 struct drbd_device *device = (struct drbd_device *) data;
@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data)
1522 unsigned long oldest_submit_jif; 1612 unsigned long oldest_submit_jif;
1523 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 1613 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
1524 unsigned long now; 1614 unsigned long now;
1615 unsigned int ko_count = 0, timeout = 0;
1525 1616
1526 rcu_read_lock(); 1617 rcu_read_lock();
1527 nc = rcu_dereference(connection->net_conf); 1618 nc = rcu_dereference(connection->net_conf);
1528 if (nc && device->state.conn >= C_WF_REPORT_PARAMS) 1619 if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
1529 ent = nc->timeout * HZ/10 * nc->ko_count; 1620 ko_count = nc->ko_count;
1621 timeout = nc->timeout;
1622 }
1530 1623
1531 if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ 1624 if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
1532 dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; 1625 dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data)
1534 } 1627 }
1535 rcu_read_unlock(); 1628 rcu_read_unlock();
1536 1629
1630
1631 ent = timeout * HZ/10 * ko_count;
1537 et = min_not_zero(dt, ent); 1632 et = min_not_zero(dt, ent);
1538 1633
1539 if (!et) 1634 if (!et)
@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data)
1545 spin_lock_irq(&device->resource->req_lock); 1640 spin_lock_irq(&device->resource->req_lock);
1546 req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); 1641 req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
1547 req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); 1642 req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
1548 req_peer = connection->req_not_net_done; 1643
1549 /* maybe the oldest request waiting for the peer is in fact still 1644 /* maybe the oldest request waiting for the peer is in fact still
1550 * blocking in tcp sendmsg */ 1645 * blocking in tcp sendmsg. That's ok, though, that's handled via the
1551 if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) 1646 * socket send timeout, requesting a ping, and bumping ko-count in
1552 req_peer = connection->req_next; 1647 * we_should_drop_the_connection().
1648 */
1649
 1650 /* check the oldest request we successfully sent,
1651 * but which is still waiting for an ACK. */
1652 req_peer = connection->req_ack_pending;
1653
 1654 /* if we don't have such a request (e.g. protocol A)
 1655 * check the oldest request which is still waiting on its epoch
1656 * closing barrier ack. */
1657 if (!req_peer)
1658 req_peer = connection->req_not_net_done;
1553 1659
1554 /* evaluate the oldest peer request only in one timer! */ 1660 /* evaluate the oldest peer request only in one timer! */
1555 if (req_peer && req_peer->device != device) 1661 if (req_peer && req_peer->device != device)
@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data)
1566 : req_write ? req_write->pre_submit_jif 1672 : req_write ? req_write->pre_submit_jif
1567 : req_read ? req_read->pre_submit_jif : now; 1673 : req_read ? req_read->pre_submit_jif : now;
1568 1674
1569 /* The request is considered timed out, if 1675 if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
1570 * - we have some effective timeout from the configuration,
1571 * with above state restrictions applied,
1572 * - the oldest request is waiting for a response from the network
1573 * resp. the local disk,
1574 * - the oldest request is in fact older than the effective timeout,
1575 * - the connection was established (resp. disk was attached)
1576 * for longer than the timeout already.
1577 * Note that for 32bit jiffies and very stable connections/disks,
1578 * we may have a wrap around, which is catched by
1579 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
1580 *
1581 * Side effect: once per 32bit wrap-around interval, which means every
1582 * ~198 days with 250 HZ, we have a window where the timeout would need
1583 * to expire twice (worst case) to become effective. Good enough.
1584 */
1585 if (ent && req_peer &&
1586 time_after(now, req_peer->pre_send_jif + ent) &&
1587 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
1588 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
1589 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); 1676 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
1590 } 1677
1591 if (dt && oldest_submit_jif != now && 1678 if (dt && oldest_submit_jif != now &&
1592 time_after(now, oldest_submit_jif + dt) && 1679 time_after(now, oldest_submit_jif + dt) &&
1593 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 1680 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 9f6a04080e9f..bb2ef78165e5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req,
331 return rv; 331 return rv;
332} 332}
333 333
334static inline bool drbd_should_do_remote(union drbd_dev_state s) 334extern bool drbd_should_do_remote(union drbd_dev_state);
335{
336 return s.pdsk == D_UP_TO_DATE ||
337 (s.pdsk >= D_INCONSISTENT &&
338 s.conn >= C_WF_BITMAP_T &&
339 s.conn < C_AHEAD);
340 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
341 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
342 states. */
343}
344static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
345{
346 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
347 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
348 since we enter state C_AHEAD only if proto >= 96 */
349}
350 335
351#endif 336#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 2d7dd269b6a8..5a7ef7873b67 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -29,6 +29,7 @@
29#include "drbd_int.h" 29#include "drbd_int.h"
30#include "drbd_protocol.h" 30#include "drbd_protocol.h"
31#include "drbd_req.h" 31#include "drbd_req.h"
32#include "drbd_state_change.h"
32 33
33struct after_state_chg_work { 34struct after_state_chg_work {
34 struct drbd_work w; 35 struct drbd_work w;
@@ -37,6 +38,7 @@ struct after_state_chg_work {
37 union drbd_state ns; 38 union drbd_state ns;
38 enum chg_state_flags flags; 39 enum chg_state_flags flags;
39 struct completion *done; 40 struct completion *done;
41 struct drbd_state_change *state_change;
40}; 42};
41 43
42enum sanitize_state_warnings { 44enum sanitize_state_warnings {
@@ -48,9 +50,248 @@ enum sanitize_state_warnings {
48 IMPLICITLY_UPGRADED_PDSK, 50 IMPLICITLY_UPGRADED_PDSK,
49}; 51};
50 52
53static void count_objects(struct drbd_resource *resource,
54 unsigned int *n_devices,
55 unsigned int *n_connections)
56{
57 struct drbd_device *device;
58 struct drbd_connection *connection;
59 int vnr;
60
61 *n_devices = 0;
62 *n_connections = 0;
63
64 idr_for_each_entry(&resource->devices, device, vnr)
65 (*n_devices)++;
66 for_each_connection(connection, resource)
67 (*n_connections)++;
68}
69
70static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
71{
72 struct drbd_state_change *state_change;
73 unsigned int size, n;
74
75 size = sizeof(struct drbd_state_change) +
76 n_devices * sizeof(struct drbd_device_state_change) +
77 n_connections * sizeof(struct drbd_connection_state_change) +
78 n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
79 state_change = kmalloc(size, gfp);
80 if (!state_change)
81 return NULL;
82 state_change->n_devices = n_devices;
83 state_change->n_connections = n_connections;
84 state_change->devices = (void *)(state_change + 1);
85 state_change->connections = (void *)&state_change->devices[n_devices];
86 state_change->peer_devices = (void *)&state_change->connections[n_connections];
87 state_change->resource->resource = NULL;
88 for (n = 0; n < n_devices; n++)
89 state_change->devices[n].device = NULL;
90 for (n = 0; n < n_connections; n++)
91 state_change->connections[n].connection = NULL;
92 return state_change;
93}
94
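
alloc_state_change() above uses a single kmalloc() and carves the three variable-length arrays out of the tail of the allocation: state_change + 1 points just past the fixed-size header, and each array pointer starts where the previous array ends. A stand-alone user-space analogue of that layout trick, with hypothetical types:

	#include <stdlib.h>

	struct item { int old_val, new_val; };

	struct snapshot {
		unsigned int n_a, n_b;
		struct item *a;		/* n_a entries, stored right after the header */
		struct item *b;		/* n_b entries, stored after the a[] array */
	};

	static struct snapshot *snapshot_alloc(unsigned int n_a, unsigned int n_b)
	{
		struct snapshot *s;
		size_t size = sizeof(*s) + (n_a + n_b) * sizeof(struct item);

		s = malloc(size);	/* one allocation, one free */
		if (!s)
			return NULL;
		s->n_a = n_a;
		s->n_b = n_b;
		s->a = (struct item *)(s + 1);	/* first byte after the header */
		s->b = s->a + n_a;		/* first byte after a[] */
		return s;
	}

forget_state_change() further down can then drop every object reference and release the whole snapshot with a single kfree().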
95struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
96{
97 struct drbd_state_change *state_change;
98 struct drbd_device *device;
99 unsigned int n_devices;
100 struct drbd_connection *connection;
101 unsigned int n_connections;
102 int vnr;
103
104 struct drbd_device_state_change *device_state_change;
105 struct drbd_peer_device_state_change *peer_device_state_change;
106 struct drbd_connection_state_change *connection_state_change;
107
108 /* Caller holds req_lock spinlock.
109 * No state, no device IDR, no connections lists can change. */
110 count_objects(resource, &n_devices, &n_connections);
111 state_change = alloc_state_change(n_devices, n_connections, gfp);
112 if (!state_change)
113 return NULL;
114
115 kref_get(&resource->kref);
116 state_change->resource->resource = resource;
117 state_change->resource->role[OLD] =
118 conn_highest_role(first_connection(resource));
119 state_change->resource->susp[OLD] = resource->susp;
120 state_change->resource->susp_nod[OLD] = resource->susp_nod;
121 state_change->resource->susp_fen[OLD] = resource->susp_fen;
122
123 connection_state_change = state_change->connections;
124 for_each_connection(connection, resource) {
125 kref_get(&connection->kref);
126 connection_state_change->connection = connection;
127 connection_state_change->cstate[OLD] =
128 connection->cstate;
129 connection_state_change->peer_role[OLD] =
130 conn_highest_peer(connection);
131 connection_state_change++;
132 }
133
134 device_state_change = state_change->devices;
135 peer_device_state_change = state_change->peer_devices;
136 idr_for_each_entry(&resource->devices, device, vnr) {
137 kref_get(&device->kref);
138 device_state_change->device = device;
139 device_state_change->disk_state[OLD] = device->state.disk;
140
141 /* The peer_devices for each device have to be enumerated in
142 the order of the connections. We may not use for_each_peer_device() here. */
143 for_each_connection(connection, resource) {
144 struct drbd_peer_device *peer_device;
145
146 peer_device = conn_peer_device(connection, device->vnr);
147 peer_device_state_change->peer_device = peer_device;
148 peer_device_state_change->disk_state[OLD] =
149 device->state.pdsk;
150 peer_device_state_change->repl_state[OLD] =
151 max_t(enum drbd_conns,
152 C_WF_REPORT_PARAMS, device->state.conn);
153 peer_device_state_change->resync_susp_user[OLD] =
154 device->state.user_isp;
155 peer_device_state_change->resync_susp_peer[OLD] =
156 device->state.peer_isp;
157 peer_device_state_change->resync_susp_dependency[OLD] =
158 device->state.aftr_isp;
159 peer_device_state_change++;
160 }
161 device_state_change++;
162 }
163
164 return state_change;
165}
166
167static void remember_new_state(struct drbd_state_change *state_change)
168{
169 struct drbd_resource_state_change *resource_state_change;
170 struct drbd_resource *resource;
171 unsigned int n;
172
173 if (!state_change)
174 return;
175
176 resource_state_change = &state_change->resource[0];
177 resource = resource_state_change->resource;
178
179 resource_state_change->role[NEW] =
180 conn_highest_role(first_connection(resource));
181 resource_state_change->susp[NEW] = resource->susp;
182 resource_state_change->susp_nod[NEW] = resource->susp_nod;
183 resource_state_change->susp_fen[NEW] = resource->susp_fen;
184
185 for (n = 0; n < state_change->n_devices; n++) {
186 struct drbd_device_state_change *device_state_change =
187 &state_change->devices[n];
188 struct drbd_device *device = device_state_change->device;
189
190 device_state_change->disk_state[NEW] = device->state.disk;
191 }
192
193 for (n = 0; n < state_change->n_connections; n++) {
194 struct drbd_connection_state_change *connection_state_change =
195 &state_change->connections[n];
196 struct drbd_connection *connection =
197 connection_state_change->connection;
198
199 connection_state_change->cstate[NEW] = connection->cstate;
200 connection_state_change->peer_role[NEW] =
201 conn_highest_peer(connection);
202 }
203
204 for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
205 struct drbd_peer_device_state_change *peer_device_state_change =
206 &state_change->peer_devices[n];
207 struct drbd_device *device =
208 peer_device_state_change->peer_device->device;
209 union drbd_dev_state state = device->state;
210
211 peer_device_state_change->disk_state[NEW] = state.pdsk;
212 peer_device_state_change->repl_state[NEW] =
213 max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
214 peer_device_state_change->resync_susp_user[NEW] =
215 state.user_isp;
216 peer_device_state_change->resync_susp_peer[NEW] =
217 state.peer_isp;
218 peer_device_state_change->resync_susp_dependency[NEW] =
219 state.aftr_isp;
220 }
221}
222
223void copy_old_to_new_state_change(struct drbd_state_change *state_change)
224{
225 struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
226 unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
227
228#define OLD_TO_NEW(x) \
229 (x[NEW] = x[OLD])
230
231 OLD_TO_NEW(resource_state_change->role);
232 OLD_TO_NEW(resource_state_change->susp);
233 OLD_TO_NEW(resource_state_change->susp_nod);
234 OLD_TO_NEW(resource_state_change->susp_fen);
235
236 for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
237 struct drbd_connection_state_change *connection_state_change =
238 &state_change->connections[n_connection];
239
240 OLD_TO_NEW(connection_state_change->peer_role);
241 OLD_TO_NEW(connection_state_change->cstate);
242 }
243
244 for (n_device = 0; n_device < state_change->n_devices; n_device++) {
245 struct drbd_device_state_change *device_state_change =
246 &state_change->devices[n_device];
247
248 OLD_TO_NEW(device_state_change->disk_state);
249 }
250
251 n_peer_devices = state_change->n_devices * state_change->n_connections;
252 for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
253 struct drbd_peer_device_state_change *p =
254 &state_change->peer_devices[n_peer_device];
255
256 OLD_TO_NEW(p->disk_state);
257 OLD_TO_NEW(p->repl_state);
258 OLD_TO_NEW(p->resync_susp_user);
259 OLD_TO_NEW(p->resync_susp_peer);
260 OLD_TO_NEW(p->resync_susp_dependency);
261 }
262
263#undef OLD_TO_NEW
264}
265
266void forget_state_change(struct drbd_state_change *state_change)
267{
268 unsigned int n;
269
270 if (!state_change)
271 return;
272
273 if (state_change->resource->resource)
274 kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
275 for (n = 0; n < state_change->n_devices; n++) {
276 struct drbd_device *device = state_change->devices[n].device;
277
278 if (device)
279 kref_put(&device->kref, drbd_destroy_device);
280 }
281 for (n = 0; n < state_change->n_connections; n++) {
282 struct drbd_connection *connection =
283 state_change->connections[n].connection;
284
285 if (connection)
286 kref_put(&connection->kref, drbd_destroy_connection);
287 }
288 kfree(state_change);
289}
290
51static int w_after_state_ch(struct drbd_work *w, int unused); 291static int w_after_state_ch(struct drbd_work *w, int unused);
52static void after_state_ch(struct drbd_device *device, union drbd_state os, 292static void after_state_ch(struct drbd_device *device, union drbd_state os,
53 union drbd_state ns, enum chg_state_flags flags); 293 union drbd_state ns, enum chg_state_flags flags,
294 struct drbd_state_change *);
54static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); 295static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
55static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); 296static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
56static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); 297static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
@@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
93 return R_SECONDARY; 334 return R_SECONDARY;
94 return R_UNKNOWN; 335 return R_UNKNOWN;
95} 336}
337
96static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) 338static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
97{ 339{
98 if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) 340 if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
@@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device)
937 drbd_info(device, "Resumed AL updates\n"); 1179 drbd_info(device, "Resumed AL updates\n");
938} 1180}
939 1181
940/* helper for __drbd_set_state */ 1182/* helper for _drbd_set_state */
941static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) 1183static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
942{ 1184{
943 if (first_peer_device(device)->connection->agreed_pro_version < 90) 1185 if (first_peer_device(device)->connection->agreed_pro_version < 90)
@@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
965} 1207}
966 1208
967/** 1209/**
968 * __drbd_set_state() - Set a new DRBD state 1210 * _drbd_set_state() - Set a new DRBD state
969 * @device: DRBD device. 1211 * @device: DRBD device.
970 * @ns: new state. 1212 * @ns: new state.
971 * @flags: Flags 1213 * @flags: Flags
 972 * @done: Optional completion that will be completed after after_state_ch() has finished 1214 * @done: Optional completion that will be completed after after_state_ch() has finished
973 * 1215 *
974 * Caller needs to hold req_lock, and global_state_lock. Do not call directly. 1216 * Caller needs to hold req_lock. Do not call directly.
975 */ 1217 */
976enum drbd_state_rv 1218enum drbd_state_rv
977__drbd_set_state(struct drbd_device *device, union drbd_state ns, 1219_drbd_set_state(struct drbd_device *device, union drbd_state ns,
978 enum chg_state_flags flags, struct completion *done) 1220 enum chg_state_flags flags, struct completion *done)
979{ 1221{
980 struct drbd_peer_device *peer_device = first_peer_device(device); 1222 struct drbd_peer_device *peer_device = first_peer_device(device);
981 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1223 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
@@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
983 enum drbd_state_rv rv = SS_SUCCESS; 1225 enum drbd_state_rv rv = SS_SUCCESS;
984 enum sanitize_state_warnings ssw; 1226 enum sanitize_state_warnings ssw;
985 struct after_state_chg_work *ascw; 1227 struct after_state_chg_work *ascw;
1228 struct drbd_state_change *state_change;
986 1229
987 os = drbd_read_state(device); 1230 os = drbd_read_state(device);
988 1231
@@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1037 if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) 1280 if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
1038 clear_bit(RS_DONE, &device->flags); 1281 clear_bit(RS_DONE, &device->flags);
1039 1282
1283 /* FIXME: Have any flags been set earlier in this function already? */
1284 state_change = remember_old_state(device->resource, GFP_ATOMIC);
1285
1040 /* changes to local_cnt and device flags should be visible before 1286 /* changes to local_cnt and device flags should be visible before
1041 * changes to state, which again should be visible before anything else 1287 * changes to state, which again should be visible before anything else
1042 * depending on that change happens. */ 1288 * depending on that change happens. */
@@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1047 device->resource->susp_fen = ns.susp_fen; 1293 device->resource->susp_fen = ns.susp_fen;
1048 smp_wmb(); 1294 smp_wmb();
1049 1295
1296 remember_new_state(state_change);
1297
1050 /* put replicated vs not-replicated requests in separate epochs */ 1298 /* put replicated vs not-replicated requests in separate epochs */
1051 if (drbd_should_do_remote((union drbd_dev_state)os.i) != 1299 if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
1052 drbd_should_do_remote((union drbd_dev_state)ns.i)) 1300 drbd_should_do_remote((union drbd_dev_state)ns.i))
@@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1184 ascw->w.cb = w_after_state_ch; 1432 ascw->w.cb = w_after_state_ch;
1185 ascw->device = device; 1433 ascw->device = device;
1186 ascw->done = done; 1434 ascw->done = done;
1435 ascw->state_change = state_change;
1187 drbd_queue_work(&connection->sender_work, 1436 drbd_queue_work(&connection->sender_work,
1188 &ascw->w); 1437 &ascw->w);
1189 } else { 1438 } else {
@@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused)
1199 container_of(w, struct after_state_chg_work, w); 1448 container_of(w, struct after_state_chg_work, w);
1200 struct drbd_device *device = ascw->device; 1449 struct drbd_device *device = ascw->device;
1201 1450
1202 after_state_ch(device, ascw->os, ascw->ns, ascw->flags); 1451 after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
1452 forget_state_change(ascw->state_change);
1203 if (ascw->flags & CS_WAIT_COMPLETE) 1453 if (ascw->flags & CS_WAIT_COMPLETE)
1204 complete(ascw->done); 1454 complete(ascw->done);
1205 kfree(ascw); 1455 kfree(ascw);
@@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1234 D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); 1484 D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
1235 1485
1236 /* open coded non-blocking drbd_suspend_io(device); */ 1486 /* open coded non-blocking drbd_suspend_io(device); */
1237 set_bit(SUSPEND_IO, &device->flags); 1487 atomic_inc(&device->suspend_cnt);
1238 1488
1239 drbd_bm_lock(device, why, flags); 1489 drbd_bm_lock(device, why, flags);
1240 rv = io_fn(device); 1490 rv = io_fn(device);
@@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1245 return rv; 1495 return rv;
1246} 1496}
1247 1497
1498void notify_resource_state_change(struct sk_buff *skb,
1499 unsigned int seq,
1500 struct drbd_resource_state_change *resource_state_change,
1501 enum drbd_notification_type type)
1502{
1503 struct drbd_resource *resource = resource_state_change->resource;
1504 struct resource_info resource_info = {
1505 .res_role = resource_state_change->role[NEW],
1506 .res_susp = resource_state_change->susp[NEW],
1507 .res_susp_nod = resource_state_change->susp_nod[NEW],
1508 .res_susp_fen = resource_state_change->susp_fen[NEW],
1509 };
1510
1511 notify_resource_state(skb, seq, resource, &resource_info, type);
1512}
1513
1514void notify_connection_state_change(struct sk_buff *skb,
1515 unsigned int seq,
1516 struct drbd_connection_state_change *connection_state_change,
1517 enum drbd_notification_type type)
1518{
1519 struct drbd_connection *connection = connection_state_change->connection;
1520 struct connection_info connection_info = {
1521 .conn_connection_state = connection_state_change->cstate[NEW],
1522 .conn_role = connection_state_change->peer_role[NEW],
1523 };
1524
1525 notify_connection_state(skb, seq, connection, &connection_info, type);
1526}
1527
1528void notify_device_state_change(struct sk_buff *skb,
1529 unsigned int seq,
1530 struct drbd_device_state_change *device_state_change,
1531 enum drbd_notification_type type)
1532{
1533 struct drbd_device *device = device_state_change->device;
1534 struct device_info device_info = {
1535 .dev_disk_state = device_state_change->disk_state[NEW],
1536 };
1537
1538 notify_device_state(skb, seq, device, &device_info, type);
1539}
1540
1541void notify_peer_device_state_change(struct sk_buff *skb,
1542 unsigned int seq,
1543 struct drbd_peer_device_state_change *p,
1544 enum drbd_notification_type type)
1545{
1546 struct drbd_peer_device *peer_device = p->peer_device;
1547 struct peer_device_info peer_device_info = {
1548 .peer_repl_state = p->repl_state[NEW],
1549 .peer_disk_state = p->disk_state[NEW],
1550 .peer_resync_susp_user = p->resync_susp_user[NEW],
1551 .peer_resync_susp_peer = p->resync_susp_peer[NEW],
1552 .peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
1553 };
1554
1555 notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
1556}
1557
1558static void broadcast_state_change(struct drbd_state_change *state_change)
1559{
1560 struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
1561 bool resource_state_has_changed;
1562 unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
1563 void (*last_func)(struct sk_buff *, unsigned int, void *,
1564 enum drbd_notification_type) = NULL;
1565 void *uninitialized_var(last_arg);
1566
1567#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
1568#define FINAL_STATE_CHANGE(type) \
1569 ({ if (last_func) \
1570 last_func(NULL, 0, last_arg, type); \
1571 })
1572#define REMEMBER_STATE_CHANGE(func, arg, type) \
1573 ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
1574 last_func = (typeof(last_func))func; \
1575 last_arg = arg; \
1576 })
1577
1578 mutex_lock(&notification_mutex);
1579
1580 resource_state_has_changed =
1581 HAS_CHANGED(resource_state_change->role) ||
1582 HAS_CHANGED(resource_state_change->susp) ||
1583 HAS_CHANGED(resource_state_change->susp_nod) ||
1584 HAS_CHANGED(resource_state_change->susp_fen);
1585
1586 if (resource_state_has_changed)
1587 REMEMBER_STATE_CHANGE(notify_resource_state_change,
1588 resource_state_change, NOTIFY_CHANGE);
1589
1590 for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
1591 struct drbd_connection_state_change *connection_state_change =
1592 &state_change->connections[n_connection];
1593
1594 if (HAS_CHANGED(connection_state_change->peer_role) ||
1595 HAS_CHANGED(connection_state_change->cstate))
1596 REMEMBER_STATE_CHANGE(notify_connection_state_change,
1597 connection_state_change, NOTIFY_CHANGE);
1598 }
1599
1600 for (n_device = 0; n_device < state_change->n_devices; n_device++) {
1601 struct drbd_device_state_change *device_state_change =
1602 &state_change->devices[n_device];
1603
1604 if (HAS_CHANGED(device_state_change->disk_state))
1605 REMEMBER_STATE_CHANGE(notify_device_state_change,
1606 device_state_change, NOTIFY_CHANGE);
1607 }
1608
1609 n_peer_devices = state_change->n_devices * state_change->n_connections;
1610 for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
1611 struct drbd_peer_device_state_change *p =
1612 &state_change->peer_devices[n_peer_device];
1613
1614 if (HAS_CHANGED(p->disk_state) ||
1615 HAS_CHANGED(p->repl_state) ||
1616 HAS_CHANGED(p->resync_susp_user) ||
1617 HAS_CHANGED(p->resync_susp_peer) ||
1618 HAS_CHANGED(p->resync_susp_dependency))
1619 REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
1620 p, NOTIFY_CHANGE);
1621 }
1622
1623 FINAL_STATE_CHANGE(NOTIFY_CHANGE);
1624 mutex_unlock(&notification_mutex);
1625
1626#undef HAS_CHANGED
1627#undef FINAL_STATE_CHANGE
1628#undef REMEMBER_STATE_CHANGE
1629}
1630
1248/** 1631/**
1249 * after_state_ch() - Perform after state change actions that may sleep 1632 * after_state_ch() - Perform after state change actions that may sleep
1250 * @device: DRBD device. 1633 * @device: DRBD device.
@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1253 * @flags: Flags 1636 * @flags: Flags
1254 */ 1637 */
1255static void after_state_ch(struct drbd_device *device, union drbd_state os, 1638static void after_state_ch(struct drbd_device *device, union drbd_state os,
1256 union drbd_state ns, enum chg_state_flags flags) 1639 union drbd_state ns, enum chg_state_flags flags,
1640 struct drbd_state_change *state_change)
1257{ 1641{
1258 struct drbd_resource *resource = device->resource; 1642 struct drbd_resource *resource = device->resource;
1259 struct drbd_peer_device *peer_device = first_peer_device(device); 1643 struct drbd_peer_device *peer_device = first_peer_device(device);
1260 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1644 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1261 struct sib_info sib; 1645 struct sib_info sib;
1262 1646
1647 broadcast_state_change(state_change);
1648
1263 sib.sib_reason = SIB_STATE_CHANGE; 1649 sib.sib_reason = SIB_STATE_CHANGE;
1264 sib.os = os; 1650 sib.os = os;
1265 sib.ns = ns; 1651 sib.ns = ns;
@@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1377 } 1763 }
1378 1764
1379 if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { 1765 if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
1380 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && 1766 if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
1381 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1767 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1382 drbd_uuid_new_current(device); 1768 drbd_uuid_new_current(device);
1383 drbd_send_uuids(peer_device); 1769 drbd_send_uuids(peer_device);
@@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1444 if (os.disk != D_FAILED && ns.disk == D_FAILED) { 1830 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1445 enum drbd_io_error_p eh = EP_PASS_ON; 1831 enum drbd_io_error_p eh = EP_PASS_ON;
1446 int was_io_error = 0; 1832 int was_io_error = 0;
1447 /* corresponding get_ldev was in __drbd_set_state, to serialize 1833 /* corresponding get_ldev was in _drbd_set_state, to serialize
1448 * our cleanup here with the transition to D_DISKLESS. 1834 * our cleanup here with the transition to D_DISKLESS.
1449 * But it is still not safe to dereference ldev here, since 1835 * But it is still not safe to dereference ldev here, since
1450 * we might come from a failed Attach before ldev was set. */ 1836 * we might come from a failed Attach before ldev was set. */
@@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1455 1841
1456 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); 1842 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
1457 1843
1844 /* Intentionally call this handler first, before drbd_send_state().
1845 * See: 2932204 drbd: call local-io-error handler early
1846 * People may choose to hard-reset the box from this handler.
1847 * It is useful if this looks like a "regular node crash". */
1458 if (was_io_error && eh == EP_CALL_HELPER) 1848 if (was_io_error && eh == EP_CALL_HELPER)
1459 drbd_khelper(device, "local-io-error"); 1849 drbd_khelper(device, "local-io-error");
1460 1850
@@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work {
1572 union drbd_state ns_max; /* new, max state, over all devices */ 1962 union drbd_state ns_max; /* new, max state, over all devices */
1573 enum chg_state_flags flags; 1963 enum chg_state_flags flags;
1574 struct drbd_connection *connection; 1964 struct drbd_connection *connection;
1965 struct drbd_state_change *state_change;
1575}; 1966};
1576 1967
1577static int w_after_conn_state_ch(struct drbd_work *w, int unused) 1968static int w_after_conn_state_ch(struct drbd_work *w, int unused)
@@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1584 struct drbd_peer_device *peer_device; 1975 struct drbd_peer_device *peer_device;
1585 int vnr; 1976 int vnr;
1586 1977
1978 broadcast_state_change(acscw->state_change);
1979 forget_state_change(acscw->state_change);
1587 kfree(acscw); 1980 kfree(acscw);
1588 1981
1589 /* Upon network configuration, we need to start the receiver */ 1982 /* Upon network configuration, we need to start the receiver */
@@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1593 if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { 1986 if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
1594 struct net_conf *old_conf; 1987 struct net_conf *old_conf;
1595 1988
1989 mutex_lock(&notification_mutex);
1990 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1991 notify_peer_device_state(NULL, 0, peer_device, NULL,
1992 NOTIFY_DESTROY | NOTIFY_CONTINUES);
1993 notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
1994 mutex_unlock(&notification_mutex);
1995
1596 mutex_lock(&connection->resource->conf_update); 1996 mutex_lock(&connection->resource->conf_update);
1597 old_conf = connection->net_conf; 1997 old_conf = connection->net_conf;
1598 connection->my_addr_len = 0; 1998 connection->my_addr_len = 0;
@@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
1759 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) 2159 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
1760 ns.disk = os.disk; 2160 ns.disk = os.disk;
1761 2161
1762 rv = __drbd_set_state(device, ns, flags, NULL); 2162 rv = _drbd_set_state(device, ns, flags, NULL);
1763 if (rv < SS_SUCCESS) 2163 if (rv < SS_SUCCESS)
1764 BUG(); 2164 BUG();
1765 2165
@@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1823 enum drbd_conns oc = connection->cstate; 2223 enum drbd_conns oc = connection->cstate;
1824 union drbd_state ns_max, ns_min, os; 2224 union drbd_state ns_max, ns_min, os;
1825 bool have_mutex = false; 2225 bool have_mutex = false;
2226 struct drbd_state_change *state_change;
1826 2227
1827 if (mask.conn) { 2228 if (mask.conn) {
1828 rv = is_valid_conn_transition(oc, val.conn); 2229 rv = is_valid_conn_transition(oc, val.conn);
@@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1868 goto abort; 2269 goto abort;
1869 } 2270 }
1870 2271
2272 state_change = remember_old_state(connection->resource, GFP_ATOMIC);
1871 conn_old_common_state(connection, &os, &flags); 2273 conn_old_common_state(connection, &os, &flags);
1872 flags |= CS_DC_SUSP; 2274 flags |= CS_DC_SUSP;
1873 conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); 2275 conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
1874 conn_pr_state_change(connection, os, ns_max, flags); 2276 conn_pr_state_change(connection, os, ns_max, flags);
2277 remember_new_state(state_change);
1875 2278
1876 acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); 2279 acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
1877 if (acscw) { 2280 if (acscw) {
@@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1882 acscw->w.cb = w_after_conn_state_ch; 2285 acscw->w.cb = w_after_conn_state_ch;
1883 kref_get(&connection->kref); 2286 kref_get(&connection->kref);
1884 acscw->connection = connection; 2287 acscw->connection = connection;
2288 acscw->state_change = state_change;
1885 drbd_queue_work(&connection->sender_work, &acscw->w); 2289 drbd_queue_work(&connection->sender_work, &acscw->w);
1886 } else { 2290 } else {
1887 drbd_err(connection, "Could not kmalloc an acscw\n"); 2291 drbd_err(connection, "Could not kmalloc an acscw\n");
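
The drbd_state.c changes above follow a single pattern: remember_old_state() snapshots every object before the transition, remember_new_state() fills in the NEW slots afterwards, and broadcast_state_change() walks the snapshot, emitting one notification per object whose OLD and NEW values differ while flagging every notification except the last as "continues". The following is a minimal userspace sketch of that coalescing walk; the item_change struct, the broadcast() helper and the printed flags are invented stand-ins, not the kernel's API:

#include <stdio.h>

enum { OLD, NEW };

/* Invented stand-in for one tracked object's before/after snapshot. */
struct item_change { const char *name; int state[2]; };

#define HAS_CHANGED(s) ((s)[OLD] != (s)[NEW])

/* Emit one line per changed item; every line except the last says "continues". */
static void broadcast(struct item_change *items, int n)
{
	struct item_change *pending = NULL;

	for (int i = 0; i < n; i++) {
		if (!HAS_CHANGED(items[i].state))
			continue;
		if (pending)	/* flush the previously remembered change as "not final" */
			printf("%s: %d -> %d (continues)\n", pending->name,
			       pending->state[OLD], pending->state[NEW]);
		pending = &items[i];
	}
	if (pending)		/* the last remembered change is the final one */
		printf("%s: %d -> %d (final)\n", pending->name,
		       pending->state[OLD], pending->state[NEW]);
}

int main(void)
{
	struct item_change items[] = {
		{ "resource",   { 1, 1 } },	/* unchanged: skipped entirely */
		{ "connection", { 0, 2 } },
		{ "device",     { 3, 4 } },
	};

	broadcast(items, 3);	/* connection ... (continues), device ... (final) */
	return 0;
}

The kernel version keeps the pending notification as a function pointer plus argument (last_func/last_arg) so that resource, connection, device and peer-device notifications can all share one deferred-flush chain.
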
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index 7f53c40823cd..bd989536f888 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -122,9 +122,9 @@ extern enum drbd_state_rv
122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, 122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
123 union drbd_state, enum chg_state_flags); 123 union drbd_state, enum chg_state_flags);
124 124
125extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, 125extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
126 enum chg_state_flags, 126 enum chg_state_flags,
127 struct completion *done); 127 struct completion *done);
128extern void print_st_err(struct drbd_device *, union drbd_state, 128extern void print_st_err(struct drbd_device *, union drbd_state,
129 union drbd_state, int); 129 union drbd_state, int);
130 130
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
new file mode 100644
index 000000000000..9e503a1a0bfb
--- /dev/null
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -0,0 +1,63 @@
1#ifndef DRBD_STATE_CHANGE_H
2#define DRBD_STATE_CHANGE_H
3
4struct drbd_resource_state_change {
5 struct drbd_resource *resource;
6 enum drbd_role role[2];
7 bool susp[2];
8 bool susp_nod[2];
9 bool susp_fen[2];
10};
11
12struct drbd_device_state_change {
13 struct drbd_device *device;
14 enum drbd_disk_state disk_state[2];
15};
16
17struct drbd_connection_state_change {
18 struct drbd_connection *connection;
19 enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */
20 enum drbd_role peer_role[2];
21};
22
23struct drbd_peer_device_state_change {
24 struct drbd_peer_device *peer_device;
25 enum drbd_disk_state disk_state[2];
26 enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */
27 bool resync_susp_user[2];
28 bool resync_susp_peer[2];
29 bool resync_susp_dependency[2];
30};
31
32struct drbd_state_change {
33 struct list_head list;
34 unsigned int n_devices;
35 unsigned int n_connections;
36 struct drbd_resource_state_change resource[1];
37 struct drbd_device_state_change *devices;
38 struct drbd_connection_state_change *connections;
39 struct drbd_peer_device_state_change *peer_devices;
40};
41
42extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
43extern void copy_old_to_new_state_change(struct drbd_state_change *);
44extern void forget_state_change(struct drbd_state_change *);
45
46extern void notify_resource_state_change(struct sk_buff *,
47 unsigned int,
48 struct drbd_resource_state_change *,
49 enum drbd_notification_type type);
50extern void notify_connection_state_change(struct sk_buff *,
51 unsigned int,
52 struct drbd_connection_state_change *,
53 enum drbd_notification_type type);
54extern void notify_device_state_change(struct sk_buff *,
55 unsigned int,
56 struct drbd_device_state_change *,
57 enum drbd_notification_type type);
58extern void notify_peer_device_state_change(struct sk_buff *,
59 unsigned int,
60 struct drbd_peer_device_state_change *,
61 enum drbd_notification_type type);
62
63#endif /* DRBD_STATE_CHANGE_H */
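
The new header keeps each tracked value as a two-element array indexed by OLD and NEW, and stores one drbd_peer_device_state_change per (device, connection) pair in a single flat allocation of n_devices * n_connections entries, which is how the loops in drbd_state.c iterate it. A small sketch of that layout, with invented field names and assuming a device-major ordering:

#include <stdio.h>

enum { OLD, NEW };

/* Invented stand-in for drbd_peer_device_state_change. */
struct peer_state {
	unsigned int device, connection;
	int repl_state[2];		/* two-slot OLD/NEW snapshot */
};

int main(void)
{
	const unsigned int n_devices = 2, n_connections = 3;
	struct peer_state peers[2 * 3];

	/* Fill the flat array device-major: all connections of device 0, then device 1. */
	for (unsigned int d = 0; d < n_devices; d++)
		for (unsigned int c = 0; c < n_connections; c++) {
			struct peer_state *p = &peers[d * n_connections + c];
			p->device = d;
			p->connection = c;
			p->repl_state[OLD] = 0;
			p->repl_state[NEW] = (d == 1 && c == 2);	/* one pair changes */
		}

	/* One loop over all pairs, the way broadcast_state_change() walks them. */
	for (unsigned int n = 0; n < n_devices * n_connections; n++)
		if (peers[n].repl_state[OLD] != peers[n].repl_state[NEW])
			printf("device %u / connection %u changed\n",
			       peers[n].device, peers[n].connection);
	return 0;
}
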
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5578c1477ba6..eff716c27b1f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int);
55 * 55 *
56 */ 56 */
57 57
58
59/* About the global_state_lock
 60 Each state transition on a device holds a read lock. In case we have
61 to evaluate the resync after dependencies, we grab a write lock, because
62 we need stable states on all devices for that. */
63rwlock_t global_state_lock;
64
65/* used for synchronous meta data and bitmap IO 58/* used for synchronous meta data and bitmap IO
66 * submitted by drbd_md_sync_page_io() 59 * submitted by drbd_md_sync_page_io()
67 */ 60 */
@@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
120 unsigned long flags = 0; 113 unsigned long flags = 0;
121 struct drbd_peer_device *peer_device = peer_req->peer_device; 114 struct drbd_peer_device *peer_device = peer_req->peer_device;
122 struct drbd_device *device = peer_device->device; 115 struct drbd_device *device = peer_device->device;
116 struct drbd_connection *connection = peer_device->connection;
123 struct drbd_interval i; 117 struct drbd_interval i;
124 int do_wake; 118 int do_wake;
125 u64 block_id; 119 u64 block_id;
@@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
152 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ 146 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
153 if (peer_req->flags & EE_WAS_ERROR) 147 if (peer_req->flags & EE_WAS_ERROR)
154 __drbd_chk_io_error(device, DRBD_WRITE_ERROR); 148 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
149
150 if (connection->cstate >= C_WF_REPORT_PARAMS) {
151 kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
152 if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
153 kref_put(&device->kref, drbd_destroy_device);
154 }
155 spin_unlock_irqrestore(&device->resource->req_lock, flags); 155 spin_unlock_irqrestore(&device->resource->req_lock, flags);
156 156
157 if (block_id == ID_SYNCER) 157 if (block_id == ID_SYNCER)
@@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
163 if (do_al_complete_io) 163 if (do_al_complete_io)
164 drbd_al_complete_io(device, &i); 164 drbd_al_complete_io(device, &i);
165 165
166 wake_asender(peer_device->connection);
167 put_ldev(device); 166 put_ldev(device);
168} 167}
169 168
@@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio)
195 } 194 }
196} 195}
197 196
197void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
198{
199 panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
200 device->minor, device->resource->name, device->vnr);
201}
202
198/* read, readA or write requests on R_PRIMARY coming from drbd_make_request 203/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
199 */ 204 */
200void drbd_request_endio(struct bio *bio) 205void drbd_request_endio(struct bio *bio)
@@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio)
238 drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); 243 drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
239 244
240 if (!bio->bi_error) 245 if (!bio->bi_error)
241 panic("possible random memory corruption caused by delayed completion of aborted local request\n"); 246 drbd_panic_after_delayed_completion_of_aborted_request(device);
242 } 247 }
243 248
244 /* to avoid recursion in __req_mod */ 249 /* to avoid recursion in __req_mod */
@@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
1291 p->barrier = connection->send.current_epoch_nr; 1296 p->barrier = connection->send.current_epoch_nr;
1292 p->pad = 0; 1297 p->pad = 0;
1293 connection->send.current_epoch_writes = 0; 1298 connection->send.current_epoch_writes = 0;
1299 connection->send.last_sent_barrier_jif = jiffies;
1294 1300
1295 return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); 1301 return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1296} 1302}
@@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
1315 connection->send.seen_any_write_yet = true; 1321 connection->send.seen_any_write_yet = true;
1316 connection->send.current_epoch_nr = epoch; 1322 connection->send.current_epoch_nr = epoch;
1317 connection->send.current_epoch_writes = 0; 1323 connection->send.current_epoch_writes = 0;
1324 connection->send.last_sent_barrier_jif = jiffies;
1318 } 1325 }
1319} 1326}
1320 1327
@@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device)
1456} 1463}
1457 1464
1458/** 1465/**
1459 * _drbd_pause_after() - Pause resync on all devices that may not resync now 1466 * drbd_pause_after() - Pause resync on all devices that may not resync now
1460 * @device: DRBD device. 1467 * @device: DRBD device.
1461 * 1468 *
1462 * Called from process context only (admin command and after_state_ch). 1469 * Called from process context only (admin command and after_state_ch).
1463 */ 1470 */
1464static int _drbd_pause_after(struct drbd_device *device) 1471static bool drbd_pause_after(struct drbd_device *device)
1465{ 1472{
1473 bool changed = false;
1466 struct drbd_device *odev; 1474 struct drbd_device *odev;
1467 int i, rv = 0; 1475 int i;
1468 1476
1469 rcu_read_lock(); 1477 rcu_read_lock();
1470 idr_for_each_entry(&drbd_devices, odev, i) { 1478 idr_for_each_entry(&drbd_devices, odev, i) {
1471 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1479 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1472 continue; 1480 continue;
1473 if (!_drbd_may_sync_now(odev)) 1481 if (!_drbd_may_sync_now(odev) &&
1474 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) 1482 _drbd_set_state(_NS(odev, aftr_isp, 1),
1475 != SS_NOTHING_TO_DO); 1483 CS_HARD, NULL) != SS_NOTHING_TO_DO)
1484 changed = true;
1476 } 1485 }
1477 rcu_read_unlock(); 1486 rcu_read_unlock();
1478 1487
1479 return rv; 1488 return changed;
1480} 1489}
1481 1490
1482/** 1491/**
1483 * _drbd_resume_next() - Resume resync on all devices that may resync now 1492 * drbd_resume_next() - Resume resync on all devices that may resync now
1484 * @device: DRBD device. 1493 * @device: DRBD device.
1485 * 1494 *
1486 * Called from process context only (admin command and worker). 1495 * Called from process context only (admin command and worker).
1487 */ 1496 */
1488static int _drbd_resume_next(struct drbd_device *device) 1497static bool drbd_resume_next(struct drbd_device *device)
1489{ 1498{
1499 bool changed = false;
1490 struct drbd_device *odev; 1500 struct drbd_device *odev;
1491 int i, rv = 0; 1501 int i;
1492 1502
1493 rcu_read_lock(); 1503 rcu_read_lock();
1494 idr_for_each_entry(&drbd_devices, odev, i) { 1504 idr_for_each_entry(&drbd_devices, odev, i) {
1495 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1505 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1496 continue; 1506 continue;
1497 if (odev->state.aftr_isp) { 1507 if (odev->state.aftr_isp) {
1498 if (_drbd_may_sync_now(odev)) 1508 if (_drbd_may_sync_now(odev) &&
1499 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), 1509 _drbd_set_state(_NS(odev, aftr_isp, 0),
1500 CS_HARD, NULL) 1510 CS_HARD, NULL) != SS_NOTHING_TO_DO)
1501 != SS_NOTHING_TO_DO) ; 1511 changed = true;
1502 } 1512 }
1503 } 1513 }
1504 rcu_read_unlock(); 1514 rcu_read_unlock();
1505 return rv; 1515 return changed;
1506} 1516}
1507 1517
1508void resume_next_sg(struct drbd_device *device) 1518void resume_next_sg(struct drbd_device *device)
1509{ 1519{
1510 write_lock_irq(&global_state_lock); 1520 lock_all_resources();
1511 _drbd_resume_next(device); 1521 drbd_resume_next(device);
1512 write_unlock_irq(&global_state_lock); 1522 unlock_all_resources();
1513} 1523}
1514 1524
1515void suspend_other_sg(struct drbd_device *device) 1525void suspend_other_sg(struct drbd_device *device)
1516{ 1526{
1517 write_lock_irq(&global_state_lock); 1527 lock_all_resources();
1518 _drbd_pause_after(device); 1528 drbd_pause_after(device);
1519 write_unlock_irq(&global_state_lock); 1529 unlock_all_resources();
1520} 1530}
1521 1531
1522/* caller must hold global_state_lock */ 1532/* caller must lock_all_resources() */
1523enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) 1533enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1524{ 1534{
1525 struct drbd_device *odev; 1535 struct drbd_device *odev;
@@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min
1557 } 1567 }
1558} 1568}
1559 1569
1560/* caller must hold global_state_lock */ 1570/* caller must lock_all_resources() */
1561void drbd_resync_after_changed(struct drbd_device *device) 1571void drbd_resync_after_changed(struct drbd_device *device)
1562{ 1572{
1563 int changes; 1573 int changed;
1564 1574
1565 do { 1575 do {
1566 changes = _drbd_pause_after(device); 1576 changed = drbd_pause_after(device);
1567 changes |= _drbd_resume_next(device); 1577 changed |= drbd_resume_next(device);
1568 } while (changes); 1578 } while (changed);
1569} 1579}
1570 1580
1571void drbd_rs_controller_reset(struct drbd_device *device) 1581void drbd_rs_controller_reset(struct drbd_device *device)
@@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1685 } else { 1695 } else {
1686 mutex_lock(device->state_mutex); 1696 mutex_lock(device->state_mutex);
1687 } 1697 }
1688 clear_bit(B_RS_H_DONE, &device->flags);
1689 1698
1690 /* req_lock: serialize with drbd_send_and_submit() and others 1699 lock_all_resources();
1691 * global_state_lock: for stable sync-after dependencies */ 1700 clear_bit(B_RS_H_DONE, &device->flags);
1692 spin_lock_irq(&device->resource->req_lock);
1693 write_lock(&global_state_lock);
1694 /* Did some connection breakage or IO error race with us? */ 1701 /* Did some connection breakage or IO error race with us? */
1695 if (device->state.conn < C_CONNECTED 1702 if (device->state.conn < C_CONNECTED
1696 || !get_ldev_if_state(device, D_NEGOTIATING)) { 1703 || !get_ldev_if_state(device, D_NEGOTIATING)) {
1697 write_unlock(&global_state_lock); 1704 unlock_all_resources();
1698 spin_unlock_irq(&device->resource->req_lock); 1705 goto out;
1699 mutex_unlock(device->state_mutex);
1700 return;
1701 } 1706 }
1702 1707
1703 ns = drbd_read_state(device); 1708 ns = drbd_read_state(device);
@@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1711 else /* side == C_SYNC_SOURCE */ 1716 else /* side == C_SYNC_SOURCE */
1712 ns.pdsk = D_INCONSISTENT; 1717 ns.pdsk = D_INCONSISTENT;
1713 1718
1714 r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); 1719 r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1715 ns = drbd_read_state(device); 1720 ns = drbd_read_state(device);
1716 1721
1717 if (ns.conn < C_CONNECTED) 1722 if (ns.conn < C_CONNECTED)
@@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1732 device->rs_mark_left[i] = tw; 1737 device->rs_mark_left[i] = tw;
1733 device->rs_mark_time[i] = now; 1738 device->rs_mark_time[i] = now;
1734 } 1739 }
1735 _drbd_pause_after(device); 1740 drbd_pause_after(device);
1736 /* Forget potentially stale cached per resync extent bit-counts. 1741 /* Forget potentially stale cached per resync extent bit-counts.
1737 * Open coded drbd_rs_cancel_all(device), we already have IRQs 1742 * Open coded drbd_rs_cancel_all(device), we already have IRQs
1738 * disabled, and know the disk state is ok. */ 1743 * disabled, and know the disk state is ok. */
@@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1742 device->resync_wenr = LC_FREE; 1747 device->resync_wenr = LC_FREE;
1743 spin_unlock(&device->al_lock); 1748 spin_unlock(&device->al_lock);
1744 } 1749 }
1745 write_unlock(&global_state_lock); 1750 unlock_all_resources();
1746 spin_unlock_irq(&device->resource->req_lock);
1747 1751
1748 if (r == SS_SUCCESS) { 1752 if (r == SS_SUCCESS) {
1749 wake_up(&device->al_wait); /* for lc_reset() above */ 1753 wake_up(&device->al_wait); /* for lc_reset() above */
@@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1807 drbd_md_sync(device); 1811 drbd_md_sync(device);
1808 } 1812 }
1809 put_ldev(device); 1813 put_ldev(device);
1814out:
1810 mutex_unlock(device->state_mutex); 1815 mutex_unlock(device->state_mutex);
1811} 1816}
1812 1817
@@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device)
1836 device->act_log = NULL; 1841 device->act_log = NULL;
1837 1842
1838 __acquire(local); 1843 __acquire(local);
1839 drbd_free_ldev(device->ldev); 1844 drbd_backing_dev_free(device, device->ldev);
1840 device->ldev = NULL; 1845 device->ldev = NULL;
1841 __release(local); 1846 __release(local);
1842 1847
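
The reworked drbd_resync_after_changed() above re-runs drbd_pause_after() and drbd_resume_next() until a full pass reports no change, i.e. it iterates the resync-after dependency adjustments to a fixed point. A toy loop of the same shape, with made-up adjustment rules standing in for the two passes:

#include <stdbool.h>
#include <stdio.h>

static int value = 7;

/* Invented rules standing in for the pause/resume passes. */
static bool round_down_to_even(void)    { if (value & 1) { value--; return true; } return false; }
static bool clamp_to_at_most_four(void) { if (value > 4) { value = 4; return true; } return false; }

int main(void)
{
	bool changed;

	do {			/* re-run both passes until a full pass changes nothing */
		changed  = round_down_to_even();
		changed |= clamp_to_at_most_four();
	} while (changed);

	printf("fixed point: %d\n", value);	/* 7 -> 6 -> 4 */
	return 0;
}
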
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 15bec407ac37..9b180dbbd03c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -104,9 +104,9 @@
104/* Device instance number, incremented each time a device is probed. */ 104/* Device instance number, incremented each time a device is probed. */
105static int instance; 105static int instance;
106 106
107struct list_head online_list; 107static struct list_head online_list;
108struct list_head removing_list; 108static struct list_head removing_list;
109spinlock_t dev_lock; 109static spinlock_t dev_lock;
110 110
111/* 111/*
112 * Global variable used to hold the major block device number 112 * Global variable used to hold the major block device number
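
The mtip32xx hunk only narrows linkage: list heads and a lock used by a single file become static, so the symbols get internal linkage and cannot collide with identically named globals elsewhere in the kernel. The same distinction in a standalone sketch (names are illustrative):

#include <stdio.h>

/* External linkage: visible to every translation unit - avoid for private state. */
int shared_counter;

/* Internal linkage: only this file can see it, so the name cannot clash. */
static int private_counter;

int main(void)
{
	shared_counter++;
	private_counter++;
	printf("%d %d\n", shared_counter, private_counter);
	return 0;
}
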
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 95dff91135ad..6f9587156569 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -495,17 +495,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
495 id->ppaf.ch_offset = 56; 495 id->ppaf.ch_offset = 56;
496 id->ppaf.ch_len = 8; 496 id->ppaf.ch_len = 8;
497 497
498 do_div(size, bs); /* convert size to pages */ 498 sector_div(size, bs); /* convert size to pages */
499 do_div(size, 256); /* convert size to pgs per blk */ 499 size >>= 8; /* convert size to pgs per blk */
500 grp = &id->groups[0]; 500 grp = &id->groups[0];
501 grp->mtype = 0; 501 grp->mtype = 0;
502 grp->fmtype = 0; 502 grp->fmtype = 0;
503 grp->num_ch = 1; 503 grp->num_ch = 1;
504 grp->num_pg = 256; 504 grp->num_pg = 256;
505 blksize = size; 505 blksize = size;
506 do_div(size, (1 << 16)); 506 size >>= 16;
507 grp->num_lun = size + 1; 507 grp->num_lun = size + 1;
508 do_div(blksize, grp->num_lun); 508 sector_div(blksize, grp->num_lun);
509 grp->num_blk = blksize; 509 grp->num_blk = blksize;
510 grp->num_pln = 1; 510 grp->num_pln = 1;
511 511
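
In the null_blk hunk, the two divisions by powers of two (256 and 1 << 16) become right shifts, which give the same result for unsigned values, and the remaining divisions switch to sector_div(), the helper intended for sector_t operands. The shift equivalence is plain arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 123456789ULL;

	/* For unsigned integers, dividing by 2^k equals shifting right by k. */
	uint64_t by_div   = size / 256;	/* 2^8 */
	uint64_t by_shift = size >> 8;

	printf("%llu %llu %s\n",
	       (unsigned long long)by_div,
	       (unsigned long long)by_shift,
	       by_div == by_shift ? "equal" : "differ");
	return 0;
}
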
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 59c91d49b14b..ba4bfe933276 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -23,7 +23,7 @@
23#include <linux/workqueue.h> 23#include <linux/workqueue.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/time.h> 26#include <linux/ktime.h>
27#include <linux/hdreg.h> 27#include <linux/hdreg.h>
28#include <linux/dma-mapping.h> 28#include <linux/dma-mapping.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
671static unsigned int carm_fill_sync_time(struct carm_host *host, 671static unsigned int carm_fill_sync_time(struct carm_host *host,
672 unsigned int idx, void *mem) 672 unsigned int idx, void *mem)
673{ 673{
674 struct timeval tv;
675 struct carm_msg_sync_time *st = mem; 674 struct carm_msg_sync_time *st = mem;
676 675
677 do_gettimeofday(&tv); 676 time64_t tv = ktime_get_real_seconds();
678 677
679 memset(st, 0, sizeof(*st)); 678 memset(st, 0, sizeof(*st));
680 st->type = CARM_MSG_MISC; 679 st->type = CARM_MSG_MISC;
681 st->subtype = MISC_SET_TIME; 680 st->subtype = MISC_SET_TIME;
682 st->handle = cpu_to_le32(TAG_ENCODE(idx)); 681 st->handle = cpu_to_le32(TAG_ENCODE(idx));
683 st->timestamp = cpu_to_le32(tv.tv_sec); 682 st->timestamp = cpu_to_le32(tv);
684 683
685 return sizeof(struct carm_msg_sync_time); 684 return sizeof(struct carm_msg_sync_time);
686} 685}
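
The sx8 change replaces do_gettimeofday(), whose tv_sec field is only 32 bits wide on 32-bit systems and overflows in January 2038, with ktime_get_real_seconds(), which returns a 64-bit time64_t; the on-the-wire field stays 32 bits at the cpu_to_le32() conversion, but the in-kernel plumbing no longer truncates. The overflow date itself follows from simple arithmetic:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	/* Largest value a signed 32-bit seconds-since-1970 counter can hold. */
	time_t last = (time_t)INT32_MAX;	/* 2147483647 */
	struct tm tm;

	gmtime_r(&last, &tm);
	printf("32-bit time_t overflows after %04d-%02d-%02d %02d:%02d:%02d UTC\n",
	       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
	       tm.tm_hour, tm.tm_min, tm.tm_sec);	/* 2038-01-19 03:14:07 */
	return 0;
}
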
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 41fb1a917b17..4809c1501d7e 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants,
84 "Maximum number of grants to map persistently"); 84 "Maximum number of grants to map persistently");
85 85
86/* 86/*
87 * Maximum number of rings/queues blkback supports, allow as many queues as there
88 * are CPUs if user has not specified a value.
89 */
90unsigned int xenblk_max_queues;
91module_param_named(max_queues, xenblk_max_queues, uint, 0644);
92MODULE_PARM_DESC(max_queues,
93 "Maximum number of hardware queues per virtual disk." \
94 "By default it is the number of online CPUs.");
95
96/*
87 * Maximum order of pages to be used for the shared ring between front and 97 * Maximum order of pages to be used for the shared ring between front and
88 * backend, 4KB page granularity is used. 98 * backend, 4KB page granularity is used.
89 */ 99 */
@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
113/* Number of free pages to remove on each call to gnttab_free_pages */ 123/* Number of free pages to remove on each call to gnttab_free_pages */
114#define NUM_BATCH_FREE_PAGES 10 124#define NUM_BATCH_FREE_PAGES 10
115 125
116static inline int get_free_page(struct xen_blkif *blkif, struct page **page) 126static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
117{ 127{
118 unsigned long flags; 128 unsigned long flags;
119 129
120 spin_lock_irqsave(&blkif->free_pages_lock, flags); 130 spin_lock_irqsave(&ring->free_pages_lock, flags);
121 if (list_empty(&blkif->free_pages)) { 131 if (list_empty(&ring->free_pages)) {
122 BUG_ON(blkif->free_pages_num != 0); 132 BUG_ON(ring->free_pages_num != 0);
123 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 133 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
124 return gnttab_alloc_pages(1, page); 134 return gnttab_alloc_pages(1, page);
125 } 135 }
126 BUG_ON(blkif->free_pages_num == 0); 136 BUG_ON(ring->free_pages_num == 0);
127 page[0] = list_first_entry(&blkif->free_pages, struct page, lru); 137 page[0] = list_first_entry(&ring->free_pages, struct page, lru);
128 list_del(&page[0]->lru); 138 list_del(&page[0]->lru);
129 blkif->free_pages_num--; 139 ring->free_pages_num--;
130 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 140 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
131 141
132 return 0; 142 return 0;
133} 143}
134 144
135static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, 145static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
136 int num) 146 int num)
137{ 147{
138 unsigned long flags; 148 unsigned long flags;
139 int i; 149 int i;
140 150
141 spin_lock_irqsave(&blkif->free_pages_lock, flags); 151 spin_lock_irqsave(&ring->free_pages_lock, flags);
142 for (i = 0; i < num; i++) 152 for (i = 0; i < num; i++)
143 list_add(&page[i]->lru, &blkif->free_pages); 153 list_add(&page[i]->lru, &ring->free_pages);
144 blkif->free_pages_num += num; 154 ring->free_pages_num += num;
145 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 155 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
146} 156}
147 157
148static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) 158static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
149{ 159{
150 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ 160 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 struct page *page[NUM_BATCH_FREE_PAGES]; 161 struct page *page[NUM_BATCH_FREE_PAGES];
152 unsigned int num_pages = 0; 162 unsigned int num_pages = 0;
153 unsigned long flags; 163 unsigned long flags;
154 164
155 spin_lock_irqsave(&blkif->free_pages_lock, flags); 165 spin_lock_irqsave(&ring->free_pages_lock, flags);
156 while (blkif->free_pages_num > num) { 166 while (ring->free_pages_num > num) {
157 BUG_ON(list_empty(&blkif->free_pages)); 167 BUG_ON(list_empty(&ring->free_pages));
158 page[num_pages] = list_first_entry(&blkif->free_pages, 168 page[num_pages] = list_first_entry(&ring->free_pages,
159 struct page, lru); 169 struct page, lru);
160 list_del(&page[num_pages]->lru); 170 list_del(&page[num_pages]->lru);
161 blkif->free_pages_num--; 171 ring->free_pages_num--;
162 if (++num_pages == NUM_BATCH_FREE_PAGES) { 172 if (++num_pages == NUM_BATCH_FREE_PAGES) {
163 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 173 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
164 gnttab_free_pages(num_pages, page); 174 gnttab_free_pages(num_pages, page);
165 spin_lock_irqsave(&blkif->free_pages_lock, flags); 175 spin_lock_irqsave(&ring->free_pages_lock, flags);
166 num_pages = 0; 176 num_pages = 0;
167 } 177 }
168 } 178 }
169 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 179 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
170 if (num_pages != 0) 180 if (num_pages != 0)
171 gnttab_free_pages(num_pages, page); 181 gnttab_free_pages(num_pages, page);
172} 182}
173 183
174#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) 184#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
175 185
176static int do_block_io_op(struct xen_blkif *blkif); 186static int do_block_io_op(struct xen_blkif_ring *ring);
177static int dispatch_rw_block_io(struct xen_blkif *blkif, 187static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
178 struct blkif_request *req, 188 struct blkif_request *req,
179 struct pending_req *pending_req); 189 struct pending_req *pending_req);
180static void make_response(struct xen_blkif *blkif, u64 id, 190static void make_response(struct xen_blkif_ring *ring, u64 id,
181 unsigned short op, int st); 191 unsigned short op, int st);
182 192
183#define foreach_grant_safe(pos, n, rbtree, node) \ 193#define foreach_grant_safe(pos, n, rbtree, node) \
@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
190 200
191/* 201/*
192 * We don't need locking around the persistent grant helpers 202 * We don't need locking around the persistent grant helpers
193 * because blkback uses a single-thread for each backed, so we 203 * because blkback uses a single-thread for each backend, so we
194 * can be sure that this functions will never be called recursively. 204 * can be sure that this functions will never be called recursively.
195 * 205 *
196 * The only exception to that is put_persistent_grant, that can be called 206 * The only exception to that is put_persistent_grant, that can be called
@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
198 * bit operations to modify the flags of a persistent grant and to count 208 * bit operations to modify the flags of a persistent grant and to count
199 * the number of used grants. 209 * the number of used grants.
200 */ 210 */
201static int add_persistent_gnt(struct xen_blkif *blkif, 211static int add_persistent_gnt(struct xen_blkif_ring *ring,
202 struct persistent_gnt *persistent_gnt) 212 struct persistent_gnt *persistent_gnt)
203{ 213{
204 struct rb_node **new = NULL, *parent = NULL; 214 struct rb_node **new = NULL, *parent = NULL;
205 struct persistent_gnt *this; 215 struct persistent_gnt *this;
216 struct xen_blkif *blkif = ring->blkif;
206 217
207 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { 218 if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
208 if (!blkif->vbd.overflow_max_grants) 219 if (!blkif->vbd.overflow_max_grants)
209 blkif->vbd.overflow_max_grants = 1; 220 blkif->vbd.overflow_max_grants = 1;
210 return -EBUSY; 221 return -EBUSY;
211 } 222 }
212 /* Figure out where to put new node */ 223 /* Figure out where to put new node */
213 new = &blkif->persistent_gnts.rb_node; 224 new = &ring->persistent_gnts.rb_node;
214 while (*new) { 225 while (*new) {
215 this = container_of(*new, struct persistent_gnt, node); 226 this = container_of(*new, struct persistent_gnt, node);
216 227
@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
229 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 240 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
230 /* Add new node and rebalance tree. */ 241 /* Add new node and rebalance tree. */
231 rb_link_node(&(persistent_gnt->node), parent, new); 242 rb_link_node(&(persistent_gnt->node), parent, new);
232 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); 243 rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
233 blkif->persistent_gnt_c++; 244 ring->persistent_gnt_c++;
234 atomic_inc(&blkif->persistent_gnt_in_use); 245 atomic_inc(&ring->persistent_gnt_in_use);
235 return 0; 246 return 0;
236} 247}
237 248
238static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, 249static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
239 grant_ref_t gref) 250 grant_ref_t gref)
240{ 251{
241 struct persistent_gnt *data; 252 struct persistent_gnt *data;
242 struct rb_node *node = NULL; 253 struct rb_node *node = NULL;
243 254
244 node = blkif->persistent_gnts.rb_node; 255 node = ring->persistent_gnts.rb_node;
245 while (node) { 256 while (node) {
246 data = container_of(node, struct persistent_gnt, node); 257 data = container_of(node, struct persistent_gnt, node);
247 258
@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
255 return NULL; 266 return NULL;
256 } 267 }
257 set_bit(PERSISTENT_GNT_ACTIVE, data->flags); 268 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
258 atomic_inc(&blkif->persistent_gnt_in_use); 269 atomic_inc(&ring->persistent_gnt_in_use);
259 return data; 270 return data;
260 } 271 }
261 } 272 }
262 return NULL; 273 return NULL;
263} 274}
264 275
265static void put_persistent_gnt(struct xen_blkif *blkif, 276static void put_persistent_gnt(struct xen_blkif_ring *ring,
266 struct persistent_gnt *persistent_gnt) 277 struct persistent_gnt *persistent_gnt)
267{ 278{
268 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) 279 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
269 pr_alert_ratelimited("freeing a grant already unused\n"); 280 pr_alert_ratelimited("freeing a grant already unused\n");
270 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); 281 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
271 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 282 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
272 atomic_dec(&blkif->persistent_gnt_in_use); 283 atomic_dec(&ring->persistent_gnt_in_use);
273} 284}
274 285
275static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, 286static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
276 unsigned int num) 287 unsigned int num)
277{ 288{
278 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 289 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
303 unmap_data.count = segs_to_unmap; 314 unmap_data.count = segs_to_unmap;
304 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 315 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
305 316
306 put_free_pages(blkif, pages, segs_to_unmap); 317 put_free_pages(ring, pages, segs_to_unmap);
307 segs_to_unmap = 0; 318 segs_to_unmap = 0;
308 } 319 }
309 320
@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
320 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 331 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321 struct persistent_gnt *persistent_gnt; 332 struct persistent_gnt *persistent_gnt;
322 int segs_to_unmap = 0; 333 int segs_to_unmap = 0;
323 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); 334 struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
324 struct gntab_unmap_queue_data unmap_data; 335 struct gntab_unmap_queue_data unmap_data;
325 336
326 unmap_data.pages = pages; 337 unmap_data.pages = pages;
327 unmap_data.unmap_ops = unmap; 338 unmap_data.unmap_ops = unmap;
328 unmap_data.kunmap_ops = NULL; 339 unmap_data.kunmap_ops = NULL;
329 340
330 while(!list_empty(&blkif->persistent_purge_list)) { 341 while(!list_empty(&ring->persistent_purge_list)) {
331 persistent_gnt = list_first_entry(&blkif->persistent_purge_list, 342 persistent_gnt = list_first_entry(&ring->persistent_purge_list,
332 struct persistent_gnt, 343 struct persistent_gnt,
333 remove_node); 344 remove_node);
334 list_del(&persistent_gnt->remove_node); 345 list_del(&persistent_gnt->remove_node);
@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
343 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { 354 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
344 unmap_data.count = segs_to_unmap; 355 unmap_data.count = segs_to_unmap;
345 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 356 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
346 put_free_pages(blkif, pages, segs_to_unmap); 357 put_free_pages(ring, pages, segs_to_unmap);
347 segs_to_unmap = 0; 358 segs_to_unmap = 0;
348 } 359 }
349 kfree(persistent_gnt); 360 kfree(persistent_gnt);
@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
351 if (segs_to_unmap > 0) { 362 if (segs_to_unmap > 0) {
352 unmap_data.count = segs_to_unmap; 363 unmap_data.count = segs_to_unmap;
353 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 364 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
354 put_free_pages(blkif, pages, segs_to_unmap); 365 put_free_pages(ring, pages, segs_to_unmap);
355 } 366 }
356} 367}
357 368
358static void purge_persistent_gnt(struct xen_blkif *blkif) 369static void purge_persistent_gnt(struct xen_blkif_ring *ring)
359{ 370{
360 struct persistent_gnt *persistent_gnt; 371 struct persistent_gnt *persistent_gnt;
361 struct rb_node *n; 372 struct rb_node *n;
@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
363 bool scan_used = false, clean_used = false; 374 bool scan_used = false, clean_used = false;
364 struct rb_root *root; 375 struct rb_root *root;
365 376
366 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || 377 if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
367 (blkif->persistent_gnt_c == xen_blkif_max_pgrants && 378 (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
368 !blkif->vbd.overflow_max_grants)) { 379 !ring->blkif->vbd.overflow_max_grants)) {
369 return; 380 goto out;
370 } 381 }
371 382
372 if (work_busy(&blkif->persistent_purge_work)) { 383 if (work_busy(&ring->persistent_purge_work)) {
373 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); 384 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
374 return; 385 goto out;
375 } 386 }
376 387
377 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; 388 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
378 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; 389 num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
379 num_clean = min(blkif->persistent_gnt_c, num_clean); 390 num_clean = min(ring->persistent_gnt_c, num_clean);
380 if ((num_clean == 0) || 391 if ((num_clean == 0) ||
381 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) 392 (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
382 return; 393 goto out;
383 394
384 /* 395 /*
385 * At this point, we can assure that there will be no calls 396 * At this point, we can assure that there will be no calls
@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
394 405
395 pr_debug("Going to purge %u persistent grants\n", num_clean); 406 pr_debug("Going to purge %u persistent grants\n", num_clean);
396 407
397 BUG_ON(!list_empty(&blkif->persistent_purge_list)); 408 BUG_ON(!list_empty(&ring->persistent_purge_list));
398 root = &blkif->persistent_gnts; 409 root = &ring->persistent_gnts;
399purge_list: 410purge_list:
400 foreach_grant_safe(persistent_gnt, n, root, node) { 411 foreach_grant_safe(persistent_gnt, n, root, node) {
401 BUG_ON(persistent_gnt->handle == 412 BUG_ON(persistent_gnt->handle ==
@@ -414,7 +425,7 @@ purge_list:
414 425
415 rb_erase(&persistent_gnt->node, root); 426 rb_erase(&persistent_gnt->node, root);
416 list_add(&persistent_gnt->remove_node, 427 list_add(&persistent_gnt->remove_node,
417 &blkif->persistent_purge_list); 428 &ring->persistent_purge_list);
418 if (--num_clean == 0) 429 if (--num_clean == 0)
419 goto finished; 430 goto finished;
420 } 431 }
@@ -435,30 +446,32 @@ finished:
435 goto purge_list; 446 goto purge_list;
436 } 447 }
437 448
438 blkif->persistent_gnt_c -= (total - num_clean); 449 ring->persistent_gnt_c -= (total - num_clean);
439 blkif->vbd.overflow_max_grants = 0; 450 ring->blkif->vbd.overflow_max_grants = 0;
440 451
441 /* We can defer this work */ 452 /* We can defer this work */
442 schedule_work(&blkif->persistent_purge_work); 453 schedule_work(&ring->persistent_purge_work);
443 pr_debug("Purged %u/%u\n", (total - num_clean), total); 454 pr_debug("Purged %u/%u\n", (total - num_clean), total);
455
456out:
444 return; 457 return;
445} 458}
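
For reference, the purge sizing above works out to "clean a fixed percentage of the cap, plus however far the ring is over it, but never more than is tracked or than is currently idle". Below is a minimal userspace sketch of that arithmetic only; the 5% figure and the numbers in main() are assumptions for the example, standing in for LRU_PERCENT_CLEAN and the xen_blkif_max_pgrants module parameter.

#include <stdio.h>

#define LRU_PERCENT_CLEAN 5              /* assumed percentage for the sketch */

static unsigned int purge_count(unsigned int max_pgrants,
                                unsigned int gnt_c,
                                unsigned int gnt_in_use)
{
    unsigned int num_clean;

    if (gnt_c < max_pgrants)             /* under the cap: nothing to do */
        return 0;
    /* Base amount: a fixed percentage of the configured maximum ... */
    num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
    /* ... plus however far this ring is over the cap. */
    num_clean = gnt_c - max_pgrants + num_clean;
    /* Never purge more grants than the ring is tracking. */
    if (num_clean > gnt_c)
        num_clean = gnt_c;
    /* Grants still in use cannot be purged; give up if too many are. */
    if (num_clean == 0 || num_clean > gnt_c - gnt_in_use)
        return 0;
    return num_clean;
}

int main(void)
{
    /* e.g. a cap of 1056 grants, 1100 currently tracked, 20 in flight */
    printf("would purge %u grants\n", purge_count(1056, 1100, 20));
    return 0;
}
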
446 459
447/* 460/*
448 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 461 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
449 */ 462 */
450static struct pending_req *alloc_req(struct xen_blkif *blkif) 463static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
451{ 464{
452 struct pending_req *req = NULL; 465 struct pending_req *req = NULL;
453 unsigned long flags; 466 unsigned long flags;
454 467
455 spin_lock_irqsave(&blkif->pending_free_lock, flags); 468 spin_lock_irqsave(&ring->pending_free_lock, flags);
456 if (!list_empty(&blkif->pending_free)) { 469 if (!list_empty(&ring->pending_free)) {
457 req = list_entry(blkif->pending_free.next, struct pending_req, 470 req = list_entry(ring->pending_free.next, struct pending_req,
458 free_list); 471 free_list);
459 list_del(&req->free_list); 472 list_del(&req->free_list);
460 } 473 }
461 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 474 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
462 return req; 475 return req;
463} 476}
464 477
@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
466 * Return the 'pending_req' structure back to the freepool. We also 479 * Return the 'pending_req' structure back to the freepool. We also
467 * wake up the thread if it was waiting for a free page. 480 * wake up the thread if it was waiting for a free page.
468 */ 481 */
469static void free_req(struct xen_blkif *blkif, struct pending_req *req) 482static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
470{ 483{
471 unsigned long flags; 484 unsigned long flags;
472 int was_empty; 485 int was_empty;
473 486
474 spin_lock_irqsave(&blkif->pending_free_lock, flags); 487 spin_lock_irqsave(&ring->pending_free_lock, flags);
475 was_empty = list_empty(&blkif->pending_free); 488 was_empty = list_empty(&ring->pending_free);
476 list_add(&req->free_list, &blkif->pending_free); 489 list_add(&req->free_list, &ring->pending_free);
477 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 490 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
478 if (was_empty) 491 if (was_empty)
479 wake_up(&blkif->pending_free_wq); 492 wake_up(&ring->pending_free_wq);
480} 493}
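
alloc_req() and free_req() above are a lock-protected free pool with a wakeup on the empty-to-non-empty transition, now kept per ring instead of per device. A hedged userspace sketch of the same shape follows, with pthread primitives standing in for the kernel spinlock and wait queue; all names are illustrative.

#include <pthread.h>
#include <stdio.h>

struct pending_req {
    struct pending_req *next;
    /* per-request payload would live here */
};

struct ring_pool {
    pthread_mutex_t lock;
    pthread_cond_t nonempty;
    struct pending_req *free_list;
};

static struct pending_req *pool_alloc(struct ring_pool *p)
{
    struct pending_req *req;

    pthread_mutex_lock(&p->lock);
    req = p->free_list;                  /* NULL when the pool is exhausted */
    if (req)
        p->free_list = req->next;
    pthread_mutex_unlock(&p->lock);
    return req;
}

static void pool_free(struct ring_pool *p, struct pending_req *req)
{
    int was_empty;

    pthread_mutex_lock(&p->lock);
    was_empty = (p->free_list == NULL);
    req->next = p->free_list;
    p->free_list = req;
    pthread_mutex_unlock(&p->lock);
    if (was_empty)                       /* wake a waiter, as free_req() does */
        pthread_cond_signal(&p->nonempty);
}

int main(void)
{
    struct ring_pool pool = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, NULL
    };
    struct pending_req reqs[4];
    struct pending_req *r;

    for (int i = 0; i < 4; i++)          /* preallocate, as the driver does */
        pool_free(&pool, &reqs[i]);
    r = pool_alloc(&pool);
    printf("allocated from pool: %s\n", r ? "yes" : "no");
    pool_free(&pool, r);
    return 0;
}
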
481 494
482/* 495/*
@@ -556,10 +569,10 @@ abort:
556/* 569/*
557 * Notification from the guest OS. 570 * Notification from the guest OS.
558 */ 571 */
559static void blkif_notify_work(struct xen_blkif *blkif) 572static void blkif_notify_work(struct xen_blkif_ring *ring)
560{ 573{
561 blkif->waiting_reqs = 1; 574 ring->waiting_reqs = 1;
562 wake_up(&blkif->wq); 575 wake_up(&ring->wq);
563} 576}
564 577
565irqreturn_t xen_blkif_be_int(int irq, void *dev_id) 578irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
572 * SCHEDULER FUNCTIONS 585 * SCHEDULER FUNCTIONS
573 */ 586 */
574 587
575static void print_stats(struct xen_blkif *blkif) 588static void print_stats(struct xen_blkif_ring *ring)
576{ 589{
577 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 590 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
578 " | ds %4llu | pg: %4u/%4d\n", 591 " | ds %4llu | pg: %4u/%4d\n",
579 current->comm, blkif->st_oo_req, 592 current->comm, ring->st_oo_req,
580 blkif->st_rd_req, blkif->st_wr_req, 593 ring->st_rd_req, ring->st_wr_req,
581 blkif->st_f_req, blkif->st_ds_req, 594 ring->st_f_req, ring->st_ds_req,
582 blkif->persistent_gnt_c, 595 ring->persistent_gnt_c,
583 xen_blkif_max_pgrants); 596 xen_blkif_max_pgrants);
584 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 597 ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
585 blkif->st_rd_req = 0; 598 ring->st_rd_req = 0;
586 blkif->st_wr_req = 0; 599 ring->st_wr_req = 0;
587 blkif->st_oo_req = 0; 600 ring->st_oo_req = 0;
588 blkif->st_ds_req = 0; 601 ring->st_ds_req = 0;
589} 602}
590 603
591int xen_blkif_schedule(void *arg) 604int xen_blkif_schedule(void *arg)
592{ 605{
593 struct xen_blkif *blkif = arg; 606 struct xen_blkif_ring *ring = arg;
607 struct xen_blkif *blkif = ring->blkif;
594 struct xen_vbd *vbd = &blkif->vbd; 608 struct xen_vbd *vbd = &blkif->vbd;
595 unsigned long timeout; 609 unsigned long timeout;
596 int ret; 610 int ret;
597 611
598 xen_blkif_get(blkif); 612 xen_blkif_get(blkif);
599 613
614 set_freezable();
600 while (!kthread_should_stop()) { 615 while (!kthread_should_stop()) {
601 if (try_to_freeze()) 616 if (try_to_freeze())
602 continue; 617 continue;
@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
606 timeout = msecs_to_jiffies(LRU_INTERVAL); 621 timeout = msecs_to_jiffies(LRU_INTERVAL);
607 622
608 timeout = wait_event_interruptible_timeout( 623 timeout = wait_event_interruptible_timeout(
609 blkif->wq, 624 ring->wq,
610 blkif->waiting_reqs || kthread_should_stop(), 625 ring->waiting_reqs || kthread_should_stop(),
611 timeout); 626 timeout);
612 if (timeout == 0) 627 if (timeout == 0)
613 goto purge_gnt_list; 628 goto purge_gnt_list;
614 timeout = wait_event_interruptible_timeout( 629 timeout = wait_event_interruptible_timeout(
615 blkif->pending_free_wq, 630 ring->pending_free_wq,
616 !list_empty(&blkif->pending_free) || 631 !list_empty(&ring->pending_free) ||
617 kthread_should_stop(), 632 kthread_should_stop(),
618 timeout); 633 timeout);
619 if (timeout == 0) 634 if (timeout == 0)
620 goto purge_gnt_list; 635 goto purge_gnt_list;
621 636
622 blkif->waiting_reqs = 0; 637 ring->waiting_reqs = 0;
623 smp_mb(); /* clear flag *before* checking for work */ 638 smp_mb(); /* clear flag *before* checking for work */
624 639
625 ret = do_block_io_op(blkif); 640 ret = do_block_io_op(ring);
626 if (ret > 0) 641 if (ret > 0)
627 blkif->waiting_reqs = 1; 642 ring->waiting_reqs = 1;
628 if (ret == -EACCES) 643 if (ret == -EACCES)
629 wait_event_interruptible(blkif->shutdown_wq, 644 wait_event_interruptible(ring->shutdown_wq,
630 kthread_should_stop()); 645 kthread_should_stop());
631 646
632purge_gnt_list: 647purge_gnt_list:
633 if (blkif->vbd.feature_gnt_persistent && 648 if (blkif->vbd.feature_gnt_persistent &&
634 time_after(jiffies, blkif->next_lru)) { 649 time_after(jiffies, ring->next_lru)) {
635 purge_persistent_gnt(blkif); 650 purge_persistent_gnt(ring);
636 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); 651 ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
637 } 652 }
638 653
639 /* Shrink if we have more than xen_blkif_max_buffer_pages */ 654 /* Shrink if we have more than xen_blkif_max_buffer_pages */
640 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); 655 shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
641 656
642 if (log_stats && time_after(jiffies, blkif->st_print)) 657 if (log_stats && time_after(jiffies, ring->st_print))
643 print_stats(blkif); 658 print_stats(ring);
644 } 659 }
645 660
646 /* Drain pending purge work */ 661 /* Drain pending purge work */
647 flush_work(&blkif->persistent_purge_work); 662 flush_work(&ring->persistent_purge_work);
648 663
649 if (log_stats) 664 if (log_stats)
650 print_stats(blkif); 665 print_stats(ring);
651 666
652 blkif->xenblkd = NULL; 667 ring->xenblkd = NULL;
653 xen_blkif_put(blkif); 668 xen_blkif_put(blkif);
654 669
655 return 0; 670 return 0;
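
The per-ring kthread above follows a simple shape: sleep until the interrupt path flags work or shutdown is requested, clear the flag before servicing the ring, re-arm it if requests remain, and fall out periodically for LRU purging, page-pool shrinking and statistics. The stripped-down userspace model below shows only that loop shape; pthread primitives stand in for the kernel waitqueue, the periodic work is reduced to a comment, and all names and numbers are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct ring {
    pthread_mutex_t lock;
    pthread_cond_t wq;
    bool waiting_reqs;
    bool stop;
    int queued;                          /* pretend ring entries */
};

static int do_block_io(struct ring *r)   /* returns >0 if more work remains */
{
    if (r->queued > 0)
        r->queued--;
    return r->queued;
}

static void *ring_worker(void *arg)
{
    struct ring *r = arg;

    pthread_mutex_lock(&r->lock);
    while (!r->stop) {
        while (!r->waiting_reqs && !r->stop)
            pthread_cond_wait(&r->wq, &r->lock);
        if (r->stop)
            break;
        r->waiting_reqs = false;         /* clear the flag *before* the work */
        if (do_block_io(r) > 0)          /* (the real worker services the ring
                                          * without holding any lock) */
            r->waiting_reqs = true;      /* more requests: go round again */
        /* a timeout here would drive the periodic LRU purge, page-pool
         * shrink and statistics printing */
    }
    pthread_mutex_unlock(&r->lock);
    return NULL;
}

int main(void)
{
    struct ring r = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
        false, false, 3
    };
    pthread_t tid;

    pthread_create(&tid, NULL, ring_worker, &r);

    pthread_mutex_lock(&r.lock);
    r.waiting_reqs = true;               /* what blkif_notify_work() does */
    pthread_cond_signal(&r.wq);
    pthread_mutex_unlock(&r.lock);

    pthread_mutex_lock(&r.lock);
    r.stop = true;                       /* kthread_should_stop() equivalent */
    pthread_cond_signal(&r.wq);
    pthread_mutex_unlock(&r.lock);

    pthread_join(tid, NULL);
    printf("requests left unserviced: %d\n", r.queued);
    return 0;
}
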
@@ -658,22 +673,22 @@ purge_gnt_list:
658/* 673/*
659 * Remove persistent grants and empty the pool of free pages 674 * Remove persistent grants and empty the pool of free pages
660 */ 675 */
661void xen_blkbk_free_caches(struct xen_blkif *blkif) 676void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
662{ 677{
663 /* Free all persistent grant pages */ 678 /* Free all persistent grant pages */
664 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 679 if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
665 free_persistent_gnts(blkif, &blkif->persistent_gnts, 680 free_persistent_gnts(ring, &ring->persistent_gnts,
666 blkif->persistent_gnt_c); 681 ring->persistent_gnt_c);
667 682
668 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 683 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
669 blkif->persistent_gnt_c = 0; 684 ring->persistent_gnt_c = 0;
670 685
671 /* Since we are shutting down remove all pages from the buffer */ 686 /* Since we are shutting down remove all pages from the buffer */
672 shrink_free_pagepool(blkif, 0 /* All */); 687 shrink_free_pagepool(ring, 0 /* All */);
673} 688}
674 689
675static unsigned int xen_blkbk_unmap_prepare( 690static unsigned int xen_blkbk_unmap_prepare(
676 struct xen_blkif *blkif, 691 struct xen_blkif_ring *ring,
677 struct grant_page **pages, 692 struct grant_page **pages,
678 unsigned int num, 693 unsigned int num,
679 struct gnttab_unmap_grant_ref *unmap_ops, 694 struct gnttab_unmap_grant_ref *unmap_ops,
@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
683 698
684 for (i = 0; i < num; i++) { 699 for (i = 0; i < num; i++) {
685 if (pages[i]->persistent_gnt != NULL) { 700 if (pages[i]->persistent_gnt != NULL) {
686 put_persistent_gnt(blkif, pages[i]->persistent_gnt); 701 put_persistent_gnt(ring, pages[i]->persistent_gnt);
687 continue; 702 continue;
688 } 703 }
689 if (pages[i]->handle == BLKBACK_INVALID_HANDLE) 704 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
700 715
701static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) 716static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
702{ 717{
703 struct pending_req* pending_req = (struct pending_req*) (data->data); 718 struct pending_req *pending_req = (struct pending_req *)(data->data);
704 struct xen_blkif *blkif = pending_req->blkif; 719 struct xen_blkif_ring *ring = pending_req->ring;
720 struct xen_blkif *blkif = ring->blkif;
705 721
706 /* BUG_ON used to reproduce existing behaviour, 722 /* BUG_ON used to reproduce existing behaviour,
707 but is this the best way to deal with this? */ 723 but is this the best way to deal with this? */
708 BUG_ON(result); 724 BUG_ON(result);
709 725
710 put_free_pages(blkif, data->pages, data->count); 726 put_free_pages(ring, data->pages, data->count);
711 make_response(blkif, pending_req->id, 727 make_response(ring, pending_req->id,
712 pending_req->operation, pending_req->status); 728 pending_req->operation, pending_req->status);
713 free_req(blkif, pending_req); 729 free_req(ring, pending_req);
714 /* 730 /*
715 * Make sure the request is freed before releasing blkif, 731 * Make sure the request is freed before releasing blkif,
716 * or there could be a race between free_req and the 732 * or there could be a race between free_req and the
@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
723 * pending_free_wq if there's a drain going on, but it has 739 * pending_free_wq if there's a drain going on, but it has
724 * to be taken into account if the current model is changed. 740 * to be taken into account if the current model is changed.
725 */ 741 */
726 if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { 742 if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
727 complete(&blkif->drain_complete); 743 complete(&blkif->drain_complete);
728 } 744 }
729 xen_blkif_put(blkif); 745 xen_blkif_put(blkif);
@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
732static void xen_blkbk_unmap_and_respond(struct pending_req *req) 748static void xen_blkbk_unmap_and_respond(struct pending_req *req)
733{ 749{
734 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; 750 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
735 struct xen_blkif *blkif = req->blkif; 751 struct xen_blkif_ring *ring = req->ring;
736 struct grant_page **pages = req->segments; 752 struct grant_page **pages = req->segments;
737 unsigned int invcount; 753 unsigned int invcount;
738 754
739 invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, 755 invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
740 req->unmap, req->unmap_pages); 756 req->unmap, req->unmap_pages);
741 757
742 work->data = req; 758 work->data = req;
@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
757 * of hypercalls, but since this is only used in error paths there's 773 * of hypercalls, but since this is only used in error paths there's
758 * no real need. 774 * no real need.
759 */ 775 */
760static void xen_blkbk_unmap(struct xen_blkif *blkif, 776static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
761 struct grant_page *pages[], 777 struct grant_page *pages[],
762 int num) 778 int num)
763{ 779{
@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
768 784
769 while (num) { 785 while (num) {
770 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); 786 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
771 787
772 invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, 788 invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
773 unmap, unmap_pages); 789 unmap, unmap_pages);
774 if (invcount) { 790 if (invcount) {
775 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); 791 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
776 BUG_ON(ret); 792 BUG_ON(ret);
777 put_free_pages(blkif, unmap_pages, invcount); 793 put_free_pages(ring, unmap_pages, invcount);
778 } 794 }
779 pages += batch; 795 pages += batch;
780 num -= batch; 796 num -= batch;
781 } 797 }
782} 798}
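
xen_blkbk_unmap() above simply walks the segment array in bounded batches so that each unmap hypercall handles at most BLKIF_MAX_SEGMENTS_PER_REQUEST entries. A tiny sketch of that batching loop, with a stubbed-out batch operation; the batch size and grant references are made up.

#include <stdio.h>

#define BATCH 11       /* stands in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

static void unmap_batch(const int *grefs, unsigned int count)
{
    printf("unmapping %u grants starting at gref %d\n", count, grefs[0]);
}

static void unmap_all(const int *grefs, unsigned int num)
{
    while (num) {
        unsigned int batch = num < BATCH ? num : BATCH;

        unmap_batch(grefs, batch);       /* one bounded "hypercall" per batch */
        grefs += batch;
        num -= batch;
    }
}

int main(void)
{
    int grefs[30];

    for (int i = 0; i < 30; i++)
        grefs[i] = 100 + i;
    unmap_all(grefs, 30);                /* prints batches of 11, 11 and 8 */
    return 0;
}
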
783 799
784static int xen_blkbk_map(struct xen_blkif *blkif, 800static int xen_blkbk_map(struct xen_blkif_ring *ring,
785 struct grant_page *pages[], 801 struct grant_page *pages[],
786 int num, bool ro) 802 int num, bool ro)
787{ 803{
@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
794 int ret = 0; 810 int ret = 0;
795 int last_map = 0, map_until = 0; 811 int last_map = 0, map_until = 0;
796 int use_persistent_gnts; 812 int use_persistent_gnts;
813 struct xen_blkif *blkif = ring->blkif;
797 814
798 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 815 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
799 816
@@ -806,10 +823,11 @@ again:
806 for (i = map_until; i < num; i++) { 823 for (i = map_until; i < num; i++) {
807 uint32_t flags; 824 uint32_t flags;
808 825
809 if (use_persistent_gnts) 826 if (use_persistent_gnts) {
810 persistent_gnt = get_persistent_gnt( 827 persistent_gnt = get_persistent_gnt(
811 blkif, 828 ring,
812 pages[i]->gref); 829 pages[i]->gref);
830 }
813 831
814 if (persistent_gnt) { 832 if (persistent_gnt) {
815 /* 833 /*
@@ -819,7 +837,7 @@ again:
819 pages[i]->page = persistent_gnt->page; 837 pages[i]->page = persistent_gnt->page;
820 pages[i]->persistent_gnt = persistent_gnt; 838 pages[i]->persistent_gnt = persistent_gnt;
821 } else { 839 } else {
822 if (get_free_page(blkif, &pages[i]->page)) 840 if (get_free_page(ring, &pages[i]->page))
823 goto out_of_memory; 841 goto out_of_memory;
824 addr = vaddr(pages[i]->page); 842 addr = vaddr(pages[i]->page);
825 pages_to_gnt[segs_to_map] = pages[i]->page; 843 pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -852,7 +870,7 @@ again:
852 BUG_ON(new_map_idx >= segs_to_map); 870 BUG_ON(new_map_idx >= segs_to_map);
853 if (unlikely(map[new_map_idx].status != 0)) { 871 if (unlikely(map[new_map_idx].status != 0)) {
854 pr_debug("invalid buffer -- could not remap it\n"); 872 pr_debug("invalid buffer -- could not remap it\n");
855 put_free_pages(blkif, &pages[seg_idx]->page, 1); 873 put_free_pages(ring, &pages[seg_idx]->page, 1);
856 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; 874 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
857 ret |= 1; 875 ret |= 1;
858 goto next; 876 goto next;
@@ -862,7 +880,7 @@ again:
862 continue; 880 continue;
863 } 881 }
864 if (use_persistent_gnts && 882 if (use_persistent_gnts &&
865 blkif->persistent_gnt_c < xen_blkif_max_pgrants) { 883 ring->persistent_gnt_c < xen_blkif_max_pgrants) {
866 /* 884 /*
867 * We are using persistent grants, the grant is 885 * We are using persistent grants, the grant is
868 * not mapped but we might have room for it. 886 * not mapped but we might have room for it.
@@ -880,7 +898,7 @@ again:
880 persistent_gnt->gnt = map[new_map_idx].ref; 898 persistent_gnt->gnt = map[new_map_idx].ref;
881 persistent_gnt->handle = map[new_map_idx].handle; 899 persistent_gnt->handle = map[new_map_idx].handle;
882 persistent_gnt->page = pages[seg_idx]->page; 900 persistent_gnt->page = pages[seg_idx]->page;
883 if (add_persistent_gnt(blkif, 901 if (add_persistent_gnt(ring,
884 persistent_gnt)) { 902 persistent_gnt)) {
885 kfree(persistent_gnt); 903 kfree(persistent_gnt);
886 persistent_gnt = NULL; 904 persistent_gnt = NULL;
@@ -888,7 +906,7 @@ again:
888 } 906 }
889 pages[seg_idx]->persistent_gnt = persistent_gnt; 907 pages[seg_idx]->persistent_gnt = persistent_gnt;
890 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", 908 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
891 persistent_gnt->gnt, blkif->persistent_gnt_c, 909 persistent_gnt->gnt, ring->persistent_gnt_c,
892 xen_blkif_max_pgrants); 910 xen_blkif_max_pgrants);
893 goto next; 911 goto next;
894 } 912 }
@@ -913,7 +931,7 @@ next:
913 931
914out_of_memory: 932out_of_memory:
915 pr_alert("%s: out of memory\n", __func__); 933 pr_alert("%s: out of memory\n", __func__);
916 put_free_pages(blkif, pages_to_gnt, segs_to_map); 934 put_free_pages(ring, pages_to_gnt, segs_to_map);
917 return -ENOMEM; 935 return -ENOMEM;
918} 936}
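
The mapping path above checks the ring's persistent-grant tree first and only issues a grant map on a miss, keeping the new grant persistent while the ring is still below xen_blkif_max_pgrants. A much-simplified model of that cache-then-map flow follows: a small linear array stands in for the rb-tree, a printf stands in for the real grant-map hypercall, and the cap and grant references are assumptions.

#include <stdio.h>

#define CACHE_MAX 4                      /* stands in for xen_blkif_max_pgrants */

struct ring_cache {
    unsigned int count;
    unsigned int grefs[CACHE_MAX];
};

static int cache_lookup(const struct ring_cache *c, unsigned int gref)
{
    for (unsigned int i = 0; i < c->count; i++)
        if (c->grefs[i] == gref)
            return 1;                    /* hit: reuse the already-mapped page */
    return 0;
}

static void map_grant(struct ring_cache *c, unsigned int gref)
{
    if (cache_lookup(c, gref)) {
        printf("gref %u: persistent hit\n", gref);
        return;
    }
    printf("gref %u: mapping\n", gref);  /* stub for the real grant map */
    if (c->count < CACHE_MAX)            /* room left below the cap: keep it */
        c->grefs[c->count++] = gref;
}

int main(void)
{
    struct ring_cache cache = { 0 };
    unsigned int refs[] = { 7, 9, 7, 11, 13, 15, 7 };

    for (unsigned int i = 0; i < sizeof(refs) / sizeof(refs[0]); i++)
        map_grant(&cache, refs[i]);
    return 0;
}
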
919 937
@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
921{ 939{
922 int rc; 940 int rc;
923 941
924 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, 942 rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
925 pending_req->nr_segs, 943 pending_req->nr_segs,
926 (pending_req->operation != BLKIF_OP_READ)); 944 (pending_req->operation != BLKIF_OP_READ));
927 945
@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
934 struct phys_req *preq) 952 struct phys_req *preq)
935{ 953{
936 struct grant_page **pages = pending_req->indirect_pages; 954 struct grant_page **pages = pending_req->indirect_pages;
937 struct xen_blkif *blkif = pending_req->blkif; 955 struct xen_blkif_ring *ring = pending_req->ring;
938 int indirect_grefs, rc, n, nseg, i; 956 int indirect_grefs, rc, n, nseg, i;
939 struct blkif_request_segment *segments = NULL; 957 struct blkif_request_segment *segments = NULL;
940 958
@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
945 for (i = 0; i < indirect_grefs; i++) 963 for (i = 0; i < indirect_grefs; i++)
946 pages[i]->gref = req->u.indirect.indirect_grefs[i]; 964 pages[i]->gref = req->u.indirect.indirect_grefs[i];
947 965
948 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); 966 rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
949 if (rc) 967 if (rc)
950 goto unmap; 968 goto unmap;
951 969
@@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
977unmap: 995unmap:
978 if (segments) 996 if (segments)
979 kunmap_atomic(segments); 997 kunmap_atomic(segments);
980 xen_blkbk_unmap(blkif, pages, indirect_grefs); 998 xen_blkbk_unmap(ring, pages, indirect_grefs);
981 return rc; 999 return rc;
982} 1000}
983 1001
984static int dispatch_discard_io(struct xen_blkif *blkif, 1002static int dispatch_discard_io(struct xen_blkif_ring *ring,
985 struct blkif_request *req) 1003 struct blkif_request *req)
986{ 1004{
987 int err = 0; 1005 int err = 0;
988 int status = BLKIF_RSP_OKAY; 1006 int status = BLKIF_RSP_OKAY;
1007 struct xen_blkif *blkif = ring->blkif;
989 struct block_device *bdev = blkif->vbd.bdev; 1008 struct block_device *bdev = blkif->vbd.bdev;
990 unsigned long secure; 1009 unsigned long secure;
991 struct phys_req preq; 1010 struct phys_req preq;
@@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
1002 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); 1021 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
1003 goto fail_response; 1022 goto fail_response;
1004 } 1023 }
1005 blkif->st_ds_req++; 1024 ring->st_ds_req++;
1006 1025
1007 secure = (blkif->vbd.discard_secure && 1026 secure = (blkif->vbd.discard_secure &&
1008 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 1027 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1018,26 +1037,28 @@ fail_response:
1018 } else if (err) 1037 } else if (err)
1019 status = BLKIF_RSP_ERROR; 1038 status = BLKIF_RSP_ERROR;
1020 1039
1021 make_response(blkif, req->u.discard.id, req->operation, status); 1040 make_response(ring, req->u.discard.id, req->operation, status);
1022 xen_blkif_put(blkif); 1041 xen_blkif_put(blkif);
1023 return err; 1042 return err;
1024} 1043}
1025 1044
1026static int dispatch_other_io(struct xen_blkif *blkif, 1045static int dispatch_other_io(struct xen_blkif_ring *ring,
1027 struct blkif_request *req, 1046 struct blkif_request *req,
1028 struct pending_req *pending_req) 1047 struct pending_req *pending_req)
1029{ 1048{
1030 free_req(blkif, pending_req); 1049 free_req(ring, pending_req);
1031 make_response(blkif, req->u.other.id, req->operation, 1050 make_response(ring, req->u.other.id, req->operation,
1032 BLKIF_RSP_EOPNOTSUPP); 1051 BLKIF_RSP_EOPNOTSUPP);
1033 return -EIO; 1052 return -EIO;
1034} 1053}
1035 1054
1036static void xen_blk_drain_io(struct xen_blkif *blkif) 1055static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1037{ 1056{
1057 struct xen_blkif *blkif = ring->blkif;
1058
1038 atomic_set(&blkif->drain, 1); 1059 atomic_set(&blkif->drain, 1);
1039 do { 1060 do {
1040 if (atomic_read(&blkif->inflight) == 0) 1061 if (atomic_read(&ring->inflight) == 0)
1041 break; 1062 break;
1042 wait_for_completion_interruptible_timeout( 1063 wait_for_completion_interruptible_timeout(
1043 &blkif->drain_complete, HZ); 1064 &blkif->drain_complete, HZ);
@@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
1058 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && 1079 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
1059 (error == -EOPNOTSUPP)) { 1080 (error == -EOPNOTSUPP)) {
1060 pr_debug("flush diskcache op failed, not supported\n"); 1081 pr_debug("flush diskcache op failed, not supported\n");
1061 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); 1082 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1062 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1083 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1063 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && 1084 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
1064 (error == -EOPNOTSUPP)) { 1085 (error == -EOPNOTSUPP)) {
1065 pr_debug("write barrier op failed, not supported\n"); 1086 pr_debug("write barrier op failed, not supported\n");
1066 xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); 1087 xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1067 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1088 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1068 } else if (error) { 1089 } else if (error) {
1069 pr_debug("Buffer not up-to-date at end of operation," 1090 pr_debug("Buffer not up-to-date at end of operation,"
@@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio)
1097 * and transmute it to the block API to hand it over to the proper block disk. 1118 * and transmute it to the block API to hand it over to the proper block disk.
1098 */ 1119 */
1099static int 1120static int
1100__do_block_io_op(struct xen_blkif *blkif) 1121__do_block_io_op(struct xen_blkif_ring *ring)
1101{ 1122{
1102 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1123 union blkif_back_rings *blk_rings = &ring->blk_rings;
1103 struct blkif_request req; 1124 struct blkif_request req;
1104 struct pending_req *pending_req; 1125 struct pending_req *pending_req;
1105 RING_IDX rc, rp; 1126 RING_IDX rc, rp;
@@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif)
1112 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { 1133 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1113 rc = blk_rings->common.rsp_prod_pvt; 1134 rc = blk_rings->common.rsp_prod_pvt;
1114 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", 1135 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1115 rp, rc, rp - rc, blkif->vbd.pdevice); 1136 rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1116 return -EACCES; 1137 return -EACCES;
1117 } 1138 }
1118 while (rc != rp) { 1139 while (rc != rp) {
@@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif)
1125 break; 1146 break;
1126 } 1147 }
1127 1148
1128 pending_req = alloc_req(blkif); 1149 pending_req = alloc_req(ring);
1129 if (NULL == pending_req) { 1150 if (NULL == pending_req) {
1130 blkif->st_oo_req++; 1151 ring->st_oo_req++;
1131 more_to_do = 1; 1152 more_to_do = 1;
1132 break; 1153 break;
1133 } 1154 }
1134 1155
1135 switch (blkif->blk_protocol) { 1156 switch (ring->blkif->blk_protocol) {
1136 case BLKIF_PROTOCOL_NATIVE: 1157 case BLKIF_PROTOCOL_NATIVE:
1137 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); 1158 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1138 break; 1159 break;
@@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif)
1156 case BLKIF_OP_WRITE_BARRIER: 1177 case BLKIF_OP_WRITE_BARRIER:
1157 case BLKIF_OP_FLUSH_DISKCACHE: 1178 case BLKIF_OP_FLUSH_DISKCACHE:
1158 case BLKIF_OP_INDIRECT: 1179 case BLKIF_OP_INDIRECT:
1159 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1180 if (dispatch_rw_block_io(ring, &req, pending_req))
1160 goto done; 1181 goto done;
1161 break; 1182 break;
1162 case BLKIF_OP_DISCARD: 1183 case BLKIF_OP_DISCARD:
1163 free_req(blkif, pending_req); 1184 free_req(ring, pending_req);
1164 if (dispatch_discard_io(blkif, &req)) 1185 if (dispatch_discard_io(ring, &req))
1165 goto done; 1186 goto done;
1166 break; 1187 break;
1167 default: 1188 default:
1168 if (dispatch_other_io(blkif, &req, pending_req)) 1189 if (dispatch_other_io(ring, &req, pending_req))
1169 goto done; 1190 goto done;
1170 break; 1191 break;
1171 } 1192 }
@@ -1178,13 +1199,13 @@ done:
1178} 1199}
1179 1200
1180static int 1201static int
1181do_block_io_op(struct xen_blkif *blkif) 1202do_block_io_op(struct xen_blkif_ring *ring)
1182{ 1203{
1183 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1204 union blkif_back_rings *blk_rings = &ring->blk_rings;
1184 int more_to_do; 1205 int more_to_do;
1185 1206
1186 do { 1207 do {
1187 more_to_do = __do_block_io_op(blkif); 1208 more_to_do = __do_block_io_op(ring);
1188 if (more_to_do) 1209 if (more_to_do)
1189 break; 1210 break;
1190 1211
@@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif)
1197 * Transmutation of the 'struct blkif_request' to a proper 'struct bio' 1218 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
1198 * and call the 'submit_bio' to pass it to the underlying storage. 1219 * and call the 'submit_bio' to pass it to the underlying storage.
1199 */ 1220 */
1200static int dispatch_rw_block_io(struct xen_blkif *blkif, 1221static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1201 struct blkif_request *req, 1222 struct blkif_request *req,
1202 struct pending_req *pending_req) 1223 struct pending_req *pending_req)
1203{ 1224{
@@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1225 1246
1226 switch (req_operation) { 1247 switch (req_operation) {
1227 case BLKIF_OP_READ: 1248 case BLKIF_OP_READ:
1228 blkif->st_rd_req++; 1249 ring->st_rd_req++;
1229 operation = READ; 1250 operation = READ;
1230 break; 1251 break;
1231 case BLKIF_OP_WRITE: 1252 case BLKIF_OP_WRITE:
1232 blkif->st_wr_req++; 1253 ring->st_wr_req++;
1233 operation = WRITE_ODIRECT; 1254 operation = WRITE_ODIRECT;
1234 break; 1255 break;
1235 case BLKIF_OP_WRITE_BARRIER: 1256 case BLKIF_OP_WRITE_BARRIER:
1236 drain = true; 1257 drain = true;
1237 case BLKIF_OP_FLUSH_DISKCACHE: 1258 case BLKIF_OP_FLUSH_DISKCACHE:
1238 blkif->st_f_req++; 1259 ring->st_f_req++;
1239 operation = WRITE_FLUSH; 1260 operation = WRITE_FLUSH;
1240 break; 1261 break;
1241 default: 1262 default:
@@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1260 1281
1261 preq.nr_sects = 0; 1282 preq.nr_sects = 0;
1262 1283
1263 pending_req->blkif = blkif; 1284 pending_req->ring = ring;
1264 pending_req->id = req->u.rw.id; 1285 pending_req->id = req->u.rw.id;
1265 pending_req->operation = req_operation; 1286 pending_req->operation = req_operation;
1266 pending_req->status = BLKIF_RSP_OKAY; 1287 pending_req->status = BLKIF_RSP_OKAY;
@@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1287 goto fail_response; 1308 goto fail_response;
1288 } 1309 }
1289 1310
1290 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1311 if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1291 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", 1312 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1292 operation == READ ? "read" : "write", 1313 operation == READ ? "read" : "write",
1293 preq.sector_number, 1314 preq.sector_number,
1294 preq.sector_number + preq.nr_sects, 1315 preq.sector_number + preq.nr_sects,
1295 blkif->vbd.pdevice); 1316 ring->blkif->vbd.pdevice);
1296 goto fail_response; 1317 goto fail_response;
1297 } 1318 }
1298 1319
@@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1304 if (((int)preq.sector_number|(int)seg[i].nsec) & 1325 if (((int)preq.sector_number|(int)seg[i].nsec) &
1305 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { 1326 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1306 pr_debug("Misaligned I/O request from domain %d\n", 1327 pr_debug("Misaligned I/O request from domain %d\n",
1307 blkif->domid); 1328 ring->blkif->domid);
1308 goto fail_response; 1329 goto fail_response;
1309 } 1330 }
1310 } 1331 }
@@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1313 * issue the WRITE_FLUSH. 1334 * issue the WRITE_FLUSH.
1314 */ 1335 */
1315 if (drain) 1336 if (drain)
1316 xen_blk_drain_io(pending_req->blkif); 1337 xen_blk_drain_io(pending_req->ring);
1317 1338
1318 /* 1339 /*
1319 * If we have failed at this point, we need to undo the M2P override, 1340 * If we have failed at this point, we need to undo the M2P override,
@@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1328 * This corresponding xen_blkif_put is done in __end_block_io_op, or 1349 * This corresponding xen_blkif_put is done in __end_block_io_op, or
1329 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. 1350 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
1330 */ 1351 */
1331 xen_blkif_get(blkif); 1352 xen_blkif_get(ring->blkif);
1332 atomic_inc(&blkif->inflight); 1353 atomic_inc(&ring->inflight);
1333 1354
1334 for (i = 0; i < nseg; i++) { 1355 for (i = 0; i < nseg; i++) {
1335 while ((bio == NULL) || 1356 while ((bio == NULL) ||
@@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1377 blk_finish_plug(&plug); 1398 blk_finish_plug(&plug);
1378 1399
1379 if (operation == READ) 1400 if (operation == READ)
1380 blkif->st_rd_sect += preq.nr_sects; 1401 ring->st_rd_sect += preq.nr_sects;
1381 else if (operation & WRITE) 1402 else if (operation & WRITE)
1382 blkif->st_wr_sect += preq.nr_sects; 1403 ring->st_wr_sect += preq.nr_sects;
1383 1404
1384 return 0; 1405 return 0;
1385 1406
1386 fail_flush: 1407 fail_flush:
1387 xen_blkbk_unmap(blkif, pending_req->segments, 1408 xen_blkbk_unmap(ring, pending_req->segments,
1388 pending_req->nr_segs); 1409 pending_req->nr_segs);
1389 fail_response: 1410 fail_response:
1390 /* Haven't submitted any bio's yet. */ 1411 /* Haven't submitted any bio's yet. */
1391 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); 1412 make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1392 free_req(blkif, pending_req); 1413 free_req(ring, pending_req);
1393 msleep(1); /* back off a bit */ 1414 msleep(1); /* back off a bit */
1394 return -EIO; 1415 return -EIO;
1395 1416
@@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1407/* 1428/*
1408 * Put a response on the ring on how the operation fared. 1429 * Put a response on the ring on how the operation fared.
1409 */ 1430 */
1410static void make_response(struct xen_blkif *blkif, u64 id, 1431static void make_response(struct xen_blkif_ring *ring, u64 id,
1411 unsigned short op, int st) 1432 unsigned short op, int st)
1412{ 1433{
1413 struct blkif_response resp; 1434 struct blkif_response resp;
1414 unsigned long flags; 1435 unsigned long flags;
1415 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1436 union blkif_back_rings *blk_rings;
1416 int notify; 1437 int notify;
1417 1438
1418 resp.id = id; 1439 resp.id = id;
1419 resp.operation = op; 1440 resp.operation = op;
1420 resp.status = st; 1441 resp.status = st;
1421 1442
1422 spin_lock_irqsave(&blkif->blk_ring_lock, flags); 1443 spin_lock_irqsave(&ring->blk_ring_lock, flags);
1444 blk_rings = &ring->blk_rings;
1423 /* Place on the response ring for the relevant domain. */ 1445 /* Place on the response ring for the relevant domain. */
1424 switch (blkif->blk_protocol) { 1446 switch (ring->blkif->blk_protocol) {
1425 case BLKIF_PROTOCOL_NATIVE: 1447 case BLKIF_PROTOCOL_NATIVE:
1426 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), 1448 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
1427 &resp, sizeof(resp)); 1449 &resp, sizeof(resp));
@@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1439 } 1461 }
1440 blk_rings->common.rsp_prod_pvt++; 1462 blk_rings->common.rsp_prod_pvt++;
1441 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); 1463 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1442 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); 1464 spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1443 if (notify) 1465 if (notify)
1444 notify_remote_via_irq(blkif->irq); 1466 notify_remote_via_irq(ring->irq);
1445} 1467}
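
make_response() above now takes the per-ring lock, places the response at the ring's private producer index, publishes it, and kicks the ring's own event channel only when the frontend needs waking. The userspace model below shows that shape only; the event check mimics the spirit of the shared-ring push-and-notify macro but omits its memory barriers, and every name and size here is illustrative.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 32u                    /* power of two, illustrative */

struct resp { uint64_t id; int status; };

struct resp_ring {
    pthread_mutex_t lock;
    unsigned int rsp_prod_pvt;           /* private producer index */
    unsigned int rsp_prod;               /* index published to the frontend */
    unsigned int rsp_event;              /* frontend asks for a kick here */
    struct resp ring[RING_SIZE];
};

static void make_response(struct resp_ring *r, uint64_t id, int status)
{
    unsigned int old_prod, new_prod;
    int notify;

    pthread_mutex_lock(&r->lock);
    r->ring[r->rsp_prod_pvt % RING_SIZE] =
        (struct resp){ .id = id, .status = status };
    r->rsp_prod_pvt++;

    old_prod = r->rsp_prod;
    new_prod = r->rsp_prod_pvt;
    r->rsp_prod = new_prod;              /* publish the new producer index */
    /* kick only if the frontend's rsp_event falls in (old_prod, new_prod] */
    notify = (new_prod - r->rsp_event) < (new_prod - old_prod);
    pthread_mutex_unlock(&r->lock);

    if (notify)
        printf("notify frontend (response id=%llu)\n",
               (unsigned long long)id);
}

int main(void)
{
    struct resp_ring r = { .lock = PTHREAD_MUTEX_INITIALIZER, .rsp_event = 1 };

    make_response(&r, 42, 0);            /* rsp_event == 1, so this notifies */
    return 0;
}
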
1446 1468
1447static int __init xen_blkif_init(void) 1469static int __init xen_blkif_init(void)
@@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void)
1457 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; 1479 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1458 } 1480 }
1459 1481
1482 if (xenblk_max_queues == 0)
1483 xenblk_max_queues = num_online_cpus();
1484
1460 rc = xen_blkif_interface_init(); 1485 rc = xen_blkif_interface_init();
1461 if (rc) 1486 if (rc)
1462 goto failed_init; 1487 goto failed_init;
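
The init hunk above defaults the new xenblk_max_queues module parameter to the number of online CPUs when it is left at zero. The same fallback in trivial userspace form, with sysconf() standing in for num_online_cpus():

#include <stdio.h>
#include <unistd.h>

static unsigned int xenblk_max_queues;   /* 0 means "not set by the admin" */

int main(void)
{
    if (xenblk_max_queues == 0)
        xenblk_max_queues = (unsigned int)sysconf(_SC_NPROCESSORS_ONLN);
    printf("defaulting to %u queues per block device\n", xenblk_max_queues);
    return 0;
}
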
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index c929ae22764c..dea61f6ab8cb 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -46,6 +46,7 @@
46#include <xen/interface/io/protocols.h> 46#include <xen/interface/io/protocols.h>
47 47
48extern unsigned int xen_blkif_max_ring_order; 48extern unsigned int xen_blkif_max_ring_order;
49extern unsigned int xenblk_max_queues;
49/* 50/*
50 * This is the maximum number of segments that would be allowed in indirect 51 * This is the maximum number of segments that would be allowed in indirect
51 * requests. This value will also be passed to the frontend. 52 * requests. This value will also be passed to the frontend.
@@ -269,68 +270,79 @@ struct persistent_gnt {
269 struct list_head remove_node; 270 struct list_head remove_node;
270}; 271};
271 272
272struct xen_blkif { 273/* Per-ring information. */
273 /* Unique identifier for this interface. */ 274struct xen_blkif_ring {
274 domid_t domid;
275 unsigned int handle;
276 /* Physical parameters of the comms window. */ 275 /* Physical parameters of the comms window. */
277 unsigned int irq; 276 unsigned int irq;
278 /* Comms information. */
279 enum blkif_protocol blk_protocol;
280 union blkif_back_rings blk_rings; 277 union blkif_back_rings blk_rings;
281 void *blk_ring; 278 void *blk_ring;
282 /* The VBD attached to this interface. */
283 struct xen_vbd vbd;
284 /* Back pointer to the backend_info. */
285 struct backend_info *be;
286 /* Private fields. */ 279 /* Private fields. */
287 spinlock_t blk_ring_lock; 280 spinlock_t blk_ring_lock;
288 atomic_t refcnt;
289 281
290 wait_queue_head_t wq; 282 wait_queue_head_t wq;
291 /* for barrier (drain) requests */
292 struct completion drain_complete;
293 atomic_t drain;
294 atomic_t inflight; 283 atomic_t inflight;
295 /* One thread per one blkif. */ 284 /* One thread per blkif ring. */
296 struct task_struct *xenblkd; 285 struct task_struct *xenblkd;
297 unsigned int waiting_reqs; 286 unsigned int waiting_reqs;
298 287
299 /* tree to store persistent grants */ 288 /* List of all 'pending_req' available */
289 struct list_head pending_free;
290 /* And its spinlock. */
291 spinlock_t pending_free_lock;
292 wait_queue_head_t pending_free_wq;
293
294 /* Tree to store persistent grants. */
295 spinlock_t pers_gnts_lock;
300 struct rb_root persistent_gnts; 296 struct rb_root persistent_gnts;
301 unsigned int persistent_gnt_c; 297 unsigned int persistent_gnt_c;
302 atomic_t persistent_gnt_in_use; 298 atomic_t persistent_gnt_in_use;
303 unsigned long next_lru; 299 unsigned long next_lru;
304 300
305 /* used by the kworker that offload work from the persistent purge */ 301 /* Statistics. */
302 unsigned long st_print;
303 unsigned long long st_rd_req;
304 unsigned long long st_wr_req;
305 unsigned long long st_oo_req;
306 unsigned long long st_f_req;
307 unsigned long long st_ds_req;
308 unsigned long long st_rd_sect;
309 unsigned long long st_wr_sect;
310
311 /* Used by the kworker that offload work from the persistent purge. */
306 struct list_head persistent_purge_list; 312 struct list_head persistent_purge_list;
307 struct work_struct persistent_purge_work; 313 struct work_struct persistent_purge_work;
308 314
309 /* buffer of free pages to map grant refs */ 315 /* Buffer of free pages to map grant refs. */
310 spinlock_t free_pages_lock; 316 spinlock_t free_pages_lock;
311 int free_pages_num; 317 int free_pages_num;
312 struct list_head free_pages; 318 struct list_head free_pages;
313 319
314 /* List of all 'pending_req' available */
315 struct list_head pending_free;
316 /* And its spinlock. */
317 spinlock_t pending_free_lock;
318 wait_queue_head_t pending_free_wq;
319
320 /* statistics */
321 unsigned long st_print;
322 unsigned long long st_rd_req;
323 unsigned long long st_wr_req;
324 unsigned long long st_oo_req;
325 unsigned long long st_f_req;
326 unsigned long long st_ds_req;
327 unsigned long long st_rd_sect;
328 unsigned long long st_wr_sect;
329
330 struct work_struct free_work; 320 struct work_struct free_work;
331 /* Thread shutdown wait queue. */ 321 /* Thread shutdown wait queue. */
332 wait_queue_head_t shutdown_wq; 322 wait_queue_head_t shutdown_wq;
333 unsigned int nr_ring_pages; 323 struct xen_blkif *blkif;
324};
325
326struct xen_blkif {
327 /* Unique identifier for this interface. */
328 domid_t domid;
329 unsigned int handle;
330 /* Comms information. */
331 enum blkif_protocol blk_protocol;
332 /* The VBD attached to this interface. */
333 struct xen_vbd vbd;
334 /* Back pointer to the backend_info. */
335 struct backend_info *be;
336 atomic_t refcnt;
337 /* for barrier (drain) requests */
338 struct completion drain_complete;
339 atomic_t drain;
340
341 struct work_struct free_work;
342 unsigned int nr_ring_pages;
343 /* All rings for this device. */
344 struct xen_blkif_ring *rings;
345 unsigned int nr_rings;
334}; 346};
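
The common.h change above is the heart of the series: per-ring state (ring pages, locks, pools, statistics, the worker thread) moves into struct xen_blkif_ring, while struct xen_blkif keeps only device-wide state plus an array of rings; each ring carries a back pointer so code that only holds a ring can still reach the shared fields. A boiled-down sketch of that ownership shape, with field names trimmed for illustration:

#include <stdio.h>
#include <stdlib.h>

struct blk_device;

struct blk_ring {
    unsigned long long st_rd_req;        /* per-ring statistic */
    struct blk_device *dev;              /* back pointer to the shared state */
};

struct blk_device {
    unsigned int domid;                  /* device-wide, shared by all rings */
    unsigned int nr_rings;
    struct blk_ring *rings;              /* one entry per hardware queue */
};

int main(void)
{
    struct blk_device dev = { .domid = 1, .nr_rings = 4 };

    dev.rings = calloc(dev.nr_rings, sizeof(*dev.rings));
    if (!dev.rings)
        return 1;
    for (unsigned int i = 0; i < dev.nr_rings; i++)
        dev.rings[i].dev = &dev;         /* mirrors ring->blkif = blkif */
    printf("domain %u exposes %u rings\n",
           dev.rings[0].dev->domid, dev.nr_rings);
    free(dev.rings);
    return 0;
}
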
335 347
336struct seg_buf { 348struct seg_buf {
@@ -352,7 +364,7 @@ struct grant_page {
352 * response queued for it, with the saved 'id' passed back. 364 * response queued for it, with the saved 'id' passed back.
353 */ 365 */
354struct pending_req { 366struct pending_req {
355 struct xen_blkif *blkif; 367 struct xen_blkif_ring *ring;
356 u64 id; 368 u64 id;
357 int nr_segs; 369 int nr_segs;
358 atomic_t pendcnt; 370 atomic_t pendcnt;
@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
394irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 406irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
395int xen_blkif_schedule(void *arg); 407int xen_blkif_schedule(void *arg);
396int xen_blkif_purge_persistent(void *arg); 408int xen_blkif_purge_persistent(void *arg);
397void xen_blkbk_free_caches(struct xen_blkif *blkif); 409void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
398 410
399int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 411int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
400 struct backend_info *be, int state); 412 struct backend_info *be, int state);
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index f53cff42f8da..876763f7f13e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
86{ 86{
87 int err; 87 int err;
88 char name[BLKBACK_NAME_LEN]; 88 char name[BLKBACK_NAME_LEN];
89 struct xen_blkif_ring *ring;
90 int i;
89 91
90 /* Not ready to connect? */ 92 /* Not ready to connect? */
91 if (!blkif->irq || !blkif->vbd.bdev) 93 if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
92 return; 94 return;
93 95
94 /* Already connected? */ 96 /* Already connected? */
@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
113 } 115 }
114 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); 116 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
115 117
116 blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); 118 for (i = 0; i < blkif->nr_rings; i++) {
117 if (IS_ERR(blkif->xenblkd)) { 119 ring = &blkif->rings[i];
118 err = PTR_ERR(blkif->xenblkd); 120 ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
119 blkif->xenblkd = NULL; 121 if (IS_ERR(ring->xenblkd)) {
120 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 122 err = PTR_ERR(ring->xenblkd);
121 return; 123 ring->xenblkd = NULL;
124 xenbus_dev_fatal(blkif->be->dev, err,
125 "start %s-%d xenblkd", name, i);
126 goto out;
127 }
128 }
129 return;
130
131out:
132 while (--i >= 0) {
133 ring = &blkif->rings[i];
134 kthread_stop(ring->xenblkd);
135 }
136 return;
137}
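
xen_update_blkif_status() above now starts one xenblkd thread per ring and, if the i-th thread fails to start, stops the i threads already running before bailing out. A userspace sketch of that start-with-unwind pattern, with pthreads in place of kthreads and an artificial failure injected to exercise the unwind path; everything here is illustrative.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_RINGS 4

struct ring {
    pthread_t tid;
    volatile bool stop;
};

static void *ring_worker(void *arg)
{
    struct ring *r = arg;

    while (!r->stop)                     /* stand-in for the service loop */
        sched_yield();
    return NULL;
}

static void stop_ring(struct ring *r)
{
    r->stop = true;                      /* plays the role of kthread_stop() */
    pthread_join(r->tid, NULL);
}

static int start_all(struct ring *rings, int nr, int fail_at)
{
    int i;

    for (i = 0; i < nr; i++) {
        if (i == fail_at ||              /* simulate kthread_run() failing */
            pthread_create(&rings[i].tid, NULL, ring_worker, &rings[i]))
            goto out;
    }
    return 0;
out:
    while (--i >= 0)                     /* unwind the already-started workers */
        stop_ring(&rings[i]);
    return -1;
}

int main(void)
{
    struct ring rings[NR_RINGS];

    for (int i = 0; i < NR_RINGS; i++)
        rings[i].stop = false;
    if (start_all(rings, NR_RINGS, 2))
        fprintf(stderr, "could not start all ring workers, unwound\n");
    return 0;
}
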
138
139static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
140{
141 unsigned int r;
142
143 blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
144 if (!blkif->rings)
145 return -ENOMEM;
146
147 for (r = 0; r < blkif->nr_rings; r++) {
148 struct xen_blkif_ring *ring = &blkif->rings[r];
149
150 spin_lock_init(&ring->blk_ring_lock);
151 init_waitqueue_head(&ring->wq);
152 INIT_LIST_HEAD(&ring->pending_free);
153 INIT_LIST_HEAD(&ring->persistent_purge_list);
154 INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
155 spin_lock_init(&ring->free_pages_lock);
156 INIT_LIST_HEAD(&ring->free_pages);
157
158 spin_lock_init(&ring->pending_free_lock);
159 init_waitqueue_head(&ring->pending_free_wq);
160 init_waitqueue_head(&ring->shutdown_wq);
161 ring->blkif = blkif;
162 ring->st_print = jiffies;
163 xen_blkif_get(blkif);
122 } 164 }
165
166 return 0;
123} 167}
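
Note the xen_blkif_get() per ring in xen_blkif_alloc_rings() above: the device now holds one reference per ring on top of the allocation's own reference, and each is dropped as its ring is torn down in xen_blkif_disconnect(). A minimal refcount sketch of that lifetime rule, using C11 atomics in place of the kernel helpers; names and counts are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct device {
    atomic_int refcnt;
    unsigned int nr_rings;
};

static void device_get(struct device *d)
{
    atomic_fetch_add(&d->refcnt, 1);
}

static void device_put(struct device *d)
{
    if (atomic_fetch_sub(&d->refcnt, 1) == 1) {   /* dropped the last ref */
        printf("freeing device\n");
        free(d);
    }
}

int main(void)
{
    struct device *d = malloc(sizeof(*d));

    if (!d)
        return 1;
    atomic_init(&d->refcnt, 1);          /* the allocation's own reference */
    d->nr_rings = 2;
    for (unsigned int i = 0; i < d->nr_rings; i++)
        device_get(d);                   /* one reference per ring */
    device_put(d);                       /* ring 0 torn down */
    device_put(d);                       /* ring 1 torn down */
    device_put(d);                       /* final put frees the device */
    return 0;
}
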
124 168
125static struct xen_blkif *xen_blkif_alloc(domid_t domid) 169static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
133 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
134 178
135 blkif->domid = domid; 179 blkif->domid = domid;
136 spin_lock_init(&blkif->blk_ring_lock);
137 atomic_set(&blkif->refcnt, 1); 180 atomic_set(&blkif->refcnt, 1);
138 init_waitqueue_head(&blkif->wq);
139 init_completion(&blkif->drain_complete); 181 init_completion(&blkif->drain_complete);
140 atomic_set(&blkif->drain, 0);
141 blkif->st_print = jiffies;
142 blkif->persistent_gnts.rb_node = NULL;
143 spin_lock_init(&blkif->free_pages_lock);
144 INIT_LIST_HEAD(&blkif->free_pages);
145 INIT_LIST_HEAD(&blkif->persistent_purge_list);
146 blkif->free_pages_num = 0;
147 atomic_set(&blkif->persistent_gnt_in_use, 0);
148 atomic_set(&blkif->inflight, 0);
149 INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
150
151 INIT_LIST_HEAD(&blkif->pending_free);
152 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); 182 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
153 spin_lock_init(&blkif->pending_free_lock);
154 init_waitqueue_head(&blkif->pending_free_wq);
155 init_waitqueue_head(&blkif->shutdown_wq);
156 183
157 return blkif; 184 return blkif;
158} 185}
159 186
160static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, 187static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
161 unsigned int nr_grefs, unsigned int evtchn) 188 unsigned int nr_grefs, unsigned int evtchn)
162{ 189{
163 int err; 190 int err;
191 struct xen_blkif *blkif = ring->blkif;
164 192
165 /* Already connected through? */ 193 /* Already connected through? */
166 if (blkif->irq) 194 if (ring->irq)
167 return 0; 195 return 0;
168 196
169 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, 197 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
170 &blkif->blk_ring); 198 &ring->blk_ring);
171 if (err < 0) 199 if (err < 0)
172 return err; 200 return err;
173 201
@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
175 case BLKIF_PROTOCOL_NATIVE: 203 case BLKIF_PROTOCOL_NATIVE:
176 { 204 {
177 struct blkif_sring *sring; 205 struct blkif_sring *sring;
178 sring = (struct blkif_sring *)blkif->blk_ring; 206 sring = (struct blkif_sring *)ring->blk_ring;
179 BACK_RING_INIT(&blkif->blk_rings.native, sring, 207 BACK_RING_INIT(&ring->blk_rings.native, sring,
180 XEN_PAGE_SIZE * nr_grefs); 208 XEN_PAGE_SIZE * nr_grefs);
181 break; 209 break;
182 } 210 }
183 case BLKIF_PROTOCOL_X86_32: 211 case BLKIF_PROTOCOL_X86_32:
184 { 212 {
185 struct blkif_x86_32_sring *sring_x86_32; 213 struct blkif_x86_32_sring *sring_x86_32;
186 sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; 214 sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
187 BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, 215 BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
188 XEN_PAGE_SIZE * nr_grefs); 216 XEN_PAGE_SIZE * nr_grefs);
189 break; 217 break;
190 } 218 }
191 case BLKIF_PROTOCOL_X86_64: 219 case BLKIF_PROTOCOL_X86_64:
192 { 220 {
193 struct blkif_x86_64_sring *sring_x86_64; 221 struct blkif_x86_64_sring *sring_x86_64;
194 sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; 222 sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
195 BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, 223 BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
196 XEN_PAGE_SIZE * nr_grefs); 224 XEN_PAGE_SIZE * nr_grefs);
197 break; 225 break;
198 } 226 }
@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
202 230
203 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, 231 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
204 xen_blkif_be_int, 0, 232 xen_blkif_be_int, 0,
205 "blkif-backend", blkif); 233 "blkif-backend", ring);
206 if (err < 0) { 234 if (err < 0) {
207 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 235 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
208 blkif->blk_rings.common.sring = NULL; 236 ring->blk_rings.common.sring = NULL;
209 return err; 237 return err;
210 } 238 }
211 blkif->irq = err; 239 ring->irq = err;
212 240
213 return 0; 241 return 0;
214} 242}
@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
216static int xen_blkif_disconnect(struct xen_blkif *blkif) 244static int xen_blkif_disconnect(struct xen_blkif *blkif)
217{ 245{
218 struct pending_req *req, *n; 246 struct pending_req *req, *n;
219 int i = 0, j; 247 unsigned int j, r;
220 248
221 if (blkif->xenblkd) { 249 for (r = 0; r < blkif->nr_rings; r++) {
222 kthread_stop(blkif->xenblkd); 250 struct xen_blkif_ring *ring = &blkif->rings[r];
223 wake_up(&blkif->shutdown_wq); 251 unsigned int i = 0;
224 blkif->xenblkd = NULL;
225 }
226 252
227 /* The above kthread_stop() guarantees that at this point we 253 if (ring->xenblkd) {
228 * don't have any discard_io or other_io requests. So, checking 254 kthread_stop(ring->xenblkd);
229 * for inflight IO is enough. 255 wake_up(&ring->shutdown_wq);
230 */ 256 ring->xenblkd = NULL;
231 if (atomic_read(&blkif->inflight) > 0) 257 }
232 return -EBUSY;
233 258
234 if (blkif->irq) { 259 /* The above kthread_stop() guarantees that at this point we
235 unbind_from_irqhandler(blkif->irq, blkif); 260 * don't have any discard_io or other_io requests. So, checking
236 blkif->irq = 0; 261 * for inflight IO is enough.
237 } 262 */
263 if (atomic_read(&ring->inflight) > 0)
264 return -EBUSY;
238 265
239 if (blkif->blk_rings.common.sring) { 266 if (ring->irq) {
240 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 267 unbind_from_irqhandler(ring->irq, ring);
241 blkif->blk_rings.common.sring = NULL; 268 ring->irq = 0;
242 } 269 }
243 270
244 /* Remove all persistent grants and the cache of ballooned pages. */ 271 if (ring->blk_rings.common.sring) {
245 xen_blkbk_free_caches(blkif); 272 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
273 ring->blk_rings.common.sring = NULL;
274 }
246 275
247 /* Check that there is no request in use */ 276 /* Remove all persistent grants and the cache of ballooned pages. */
248 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { 277 xen_blkbk_free_caches(ring);
249 list_del(&req->free_list);
250 278
251 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) 279 /* Check that there is no request in use */
252 kfree(req->segments[j]); 280 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
281 list_del(&req->free_list);
253 282
254 for (j = 0; j < MAX_INDIRECT_PAGES; j++) 283 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
255 kfree(req->indirect_pages[j]); 284 kfree(req->segments[j]);
256 285
257 kfree(req); 286 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
258 i++; 287 kfree(req->indirect_pages[j]);
259 } 288
289 kfree(req);
290 i++;
291 }
260 292
261 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); 293 BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
294 BUG_ON(!list_empty(&ring->persistent_purge_list));
295 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
296 BUG_ON(!list_empty(&ring->free_pages));
297 BUG_ON(ring->free_pages_num != 0);
298 BUG_ON(ring->persistent_gnt_c != 0);
299 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
300 xen_blkif_put(blkif);
301 }
262 blkif->nr_ring_pages = 0; 302 blkif->nr_ring_pages = 0;
303 /*
304 * blkif->rings was allocated in connect_ring, so we should free it
305 * here.
306 */
307 kfree(blkif->rings);
308 blkif->rings = NULL;
309 blkif->nr_rings = 0;
263 310
264 return 0; 311 return 0;
265} 312}
@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
271 xen_vbd_free(&blkif->vbd); 318 xen_vbd_free(&blkif->vbd);
272 319
273 /* Make sure everything is drained before shutting down */ 320 /* Make sure everything is drained before shutting down */
274 BUG_ON(blkif->persistent_gnt_c != 0);
275 BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
276 BUG_ON(blkif->free_pages_num != 0);
277 BUG_ON(!list_empty(&blkif->persistent_purge_list));
278 BUG_ON(!list_empty(&blkif->free_pages));
279 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
280
281 kmem_cache_free(xen_blkif_cachep, blkif); 321 kmem_cache_free(xen_blkif_cachep, blkif);
282} 322}
283 323
@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
296 * sysfs interface for VBD I/O requests 336 * sysfs interface for VBD I/O requests
297 */ 337 */
298 338
299#define VBD_SHOW(name, format, args...) \ 339#define VBD_SHOW_ALLRING(name, format) \
300 static ssize_t show_##name(struct device *_dev, \ 340 static ssize_t show_##name(struct device *_dev, \
301 struct device_attribute *attr, \ 341 struct device_attribute *attr, \
302 char *buf) \ 342 char *buf) \
303 { \ 343 { \
304 struct xenbus_device *dev = to_xenbus_device(_dev); \ 344 struct xenbus_device *dev = to_xenbus_device(_dev); \
305 struct backend_info *be = dev_get_drvdata(&dev->dev); \ 345 struct backend_info *be = dev_get_drvdata(&dev->dev); \
346 struct xen_blkif *blkif = be->blkif; \
347 unsigned int i; \
348 unsigned long long result = 0; \
306 \ 349 \
307 return sprintf(buf, format, ##args); \ 350 if (!blkif->rings) \
351 goto out; \
352 \
353 for (i = 0; i < blkif->nr_rings; i++) { \
354 struct xen_blkif_ring *ring = &blkif->rings[i]; \
355 \
356 result += ring->st_##name; \
357 } \
358 \
359out: \
360 return sprintf(buf, format, result); \
308 } \ 361 } \
309 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) 362 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
310 363
311VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); 364VBD_SHOW_ALLRING(oo_req, "%llu\n");
312VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); 365VBD_SHOW_ALLRING(rd_req, "%llu\n");
313VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); 366VBD_SHOW_ALLRING(wr_req, "%llu\n");
314VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); 367VBD_SHOW_ALLRING(f_req, "%llu\n");
315VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); 368VBD_SHOW_ALLRING(ds_req, "%llu\n");
316VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); 369VBD_SHOW_ALLRING(rd_sect, "%llu\n");
317VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); 370VBD_SHOW_ALLRING(wr_sect, "%llu\n");
318 371
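
With the statistics now per ring, each sysfs attribute generated by VBD_SHOW_ALLRING() above sums its counter across every ring of the device before printing a single value. Roughly what one expansion does, in plain userspace C with illustrative names:

#include <stdio.h>

struct ring_stats { unsigned long long st_oo_req; };

struct device_stats {
    unsigned int nr_rings;
    struct ring_stats *rings;
};

static unsigned long long total_oo_req(const struct device_stats *d)
{
    unsigned long long result = 0;

    if (!d->rings)                       /* no rings connected yet */
        return 0;
    for (unsigned int i = 0; i < d->nr_rings; i++)
        result += d->rings[i].st_oo_req;
    return result;
}

int main(void)
{
    struct ring_stats rings[3] = { { 5 }, { 7 }, { 1 } };
    struct device_stats dev = { 3, rings };

    printf("oo_req %llu\n", total_oo_req(&dev));
    return 0;
}
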
319static struct attribute *xen_vbdstat_attrs[] = { 372static struct attribute *xen_vbdstat_attrs[] = {
320 &dev_attr_oo_req.attr, 373 &dev_attr_oo_req.attr,
@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
332 .attrs = xen_vbdstat_attrs, 385 .attrs = xen_vbdstat_attrs,
333}; 386};
334 387
388#define VBD_SHOW(name, format, args...) \
389 static ssize_t show_##name(struct device *_dev, \
390 struct device_attribute *attr, \
391 char *buf) \
392 { \
393 struct xenbus_device *dev = to_xenbus_device(_dev); \
394 struct backend_info *be = dev_get_drvdata(&dev->dev); \
395 \
396 return sprintf(buf, format, ##args); \
397 } \
398 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
399
335VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); 400VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
336VBD_SHOW(mode, "%s\n", be->mode); 401VBD_SHOW(mode, "%s\n", be->mode);
337 402
@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
440 505
441 dev_set_drvdata(&dev->dev, NULL); 506 dev_set_drvdata(&dev->dev, NULL);
442 507
443 if (be->blkif) { 508 if (be->blkif)
444 xen_blkif_disconnect(be->blkif); 509 xen_blkif_disconnect(be->blkif);
445 xen_blkif_put(be->blkif);
446 }
447 510
511 /* Put the reference we set in xen_blkif_alloc(). */
512 xen_blkif_put(be->blkif);
448 kfree(be->mode); 513 kfree(be->mode);
449 kfree(be); 514 kfree(be);
450 return 0; 515 return 0;
@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
553 goto fail; 618 goto fail;
554 } 619 }
555 620
621 /* Multi-queue: advertise how many queues are supported by us. */
622 err = xenbus_printf(XBT_NIL, dev->nodename,
623 "multi-queue-max-queues", "%u", xenblk_max_queues);
624 if (err)
625 pr_warn("Error writing multi-queue-max-queues\n");
626
556 /* setup back pointer */ 627 /* setup back pointer */
557 be->blkif->be = be; 628 be->blkif->be = be;
558 629
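
The probe hunk above starts the multi-queue negotiation by advertising "multi-queue-max-queues" in xenstore; the frontend later writes back how many queues it wants, and the backend validates that count when the ring is connected. Below is a toy model of that validation step only; the exact policy shown (default to one queue when the frontend writes nothing, reject requests above the advertised maximum) is an assumption for illustration.

#include <stdio.h>

static int negotiate_queues(unsigned int backend_max,
                            unsigned int frontend_requested)
{
    if (frontend_requested == 0)         /* frontend wrote nothing */
        return 1;
    if (frontend_requested > backend_max)
        return -1;                       /* refuse, do not silently clamp */
    return (int)frontend_requested;
}

int main(void)
{
    printf("%d\n", negotiate_queues(4, 0));   /* -> 1  */
    printf("%d\n", negotiate_queues(4, 3));   /* -> 3  */
    printf("%d\n", negotiate_queues(4, 8));   /* -> -1 */
    return 0;
}
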
@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
708 } 779 }
709 780
710 err = connect_ring(be); 781 err = connect_ring(be);
711 if (err) 782 if (err) {
783 /*
784 * Clean up so that memory resources can be used by
785 * other devices. connect_ring reported already error.
786 */
787 xen_blkif_disconnect(be->blkif);
712 break; 788 break;
789 }
713 xen_update_blkif_status(be->blkif); 790 xen_update_blkif_status(be->blkif);
714 break; 791 break;
715 792
@@ -825,50 +902,43 @@ again:
825 xenbus_transaction_end(xbt, 1); 902 xenbus_transaction_end(xbt, 1);
826} 903}
827 904
828 905/*
829static int connect_ring(struct backend_info *be) 906 * Each ring may have multiple pages, depending on "ring-page-order".
907 */
908static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
830{ 909{
831 struct xenbus_device *dev = be->dev;
832 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; 910 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
833 unsigned int evtchn, nr_grefs, ring_page_order;
834 unsigned int pers_grants;
835 char protocol[64] = "";
836 struct pending_req *req, *n; 911 struct pending_req *req, *n;
837 int err, i, j; 912 int err, i, j;
913 struct xen_blkif *blkif = ring->blkif;
914 struct xenbus_device *dev = blkif->be->dev;
915 unsigned int ring_page_order, nr_grefs, evtchn;
838 916
839 pr_debug("%s %s\n", __func__, dev->otherend); 917 err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
840
841 err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
842 &evtchn); 918 &evtchn);
843 if (err != 1) { 919 if (err != 1) {
844 err = -EINVAL; 920 err = -EINVAL;
845 xenbus_dev_fatal(dev, err, "reading %s/event-channel", 921 xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
846 dev->otherend);
847 return err; 922 return err;
848 } 923 }
849 pr_info("event-channel %u\n", evtchn);
850 924
851 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", 925 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
852 &ring_page_order); 926 &ring_page_order);
853 if (err != 1) { 927 if (err != 1) {
854 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", 928 err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
855 "%u", &ring_ref[0]);
856 if (err != 1) { 929 if (err != 1) {
857 err = -EINVAL; 930 err = -EINVAL;
858 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", 931 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
859 dev->otherend);
860 return err; 932 return err;
861 } 933 }
862 nr_grefs = 1; 934 nr_grefs = 1;
863 pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
864 ring_ref[0]);
865 } else { 935 } else {
866 unsigned int i; 936 unsigned int i;
867 937
868 if (ring_page_order > xen_blkif_max_ring_order) { 938 if (ring_page_order > xen_blkif_max_ring_order) {
869 err = -EINVAL; 939 err = -EINVAL;
870 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", 940 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
871 dev->otherend, ring_page_order, 941 dir, ring_page_order,
872 xen_blkif_max_ring_order); 942 xen_blkif_max_ring_order);
873 return err; 943 return err;
874 } 944 }
@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
878 char ring_ref_name[RINGREF_NAME_LEN]; 948 char ring_ref_name[RINGREF_NAME_LEN];
879 949
880 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 950 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
881 err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, 951 err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
882 "%u", &ring_ref[i]); 952 "%u", &ring_ref[i]);
883 if (err != 1) { 953 if (err != 1) {
884 err = -EINVAL; 954 err = -EINVAL;
885 xenbus_dev_fatal(dev, err, "reading %s/%s", 955 xenbus_dev_fatal(dev, err, "reading %s/%s",
886 dev->otherend, ring_ref_name); 956 dir, ring_ref_name);
887 return err; 957 return err;
888 } 958 }
889 pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
890 } 959 }
891 } 960 }
892 961 blkif->nr_ring_pages = nr_grefs;
893 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
894 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
895 "%63s", protocol, NULL);
896 if (err)
897 strcpy(protocol, "unspecified, assuming default");
898 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
899 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
900 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
901 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
902 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
903 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
904 else {
905 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
906 return -1;
907 }
908 err = xenbus_gather(XBT_NIL, dev->otherend,
909 "feature-persistent", "%u",
910 &pers_grants, NULL);
911 if (err)
912 pers_grants = 0;
913
914 be->blkif->vbd.feature_gnt_persistent = pers_grants;
915 be->blkif->vbd.overflow_max_grants = 0;
916 be->blkif->nr_ring_pages = nr_grefs;
917
918 pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
919 nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
920 pers_grants ? "persistent grants" : "");
921 962
922 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { 963 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
923 req = kzalloc(sizeof(*req), GFP_KERNEL); 964 req = kzalloc(sizeof(*req), GFP_KERNEL);
924 if (!req) 965 if (!req)
925 goto fail; 966 goto fail;
926 list_add_tail(&req->free_list, &be->blkif->pending_free); 967 list_add_tail(&req->free_list, &ring->pending_free);
927 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 968 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
928 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); 969 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
929 if (!req->segments[j]) 970 if (!req->segments[j])
@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
938 } 979 }
939 980
940 /* Map the shared frame, irq etc. */ 981 /* Map the shared frame, irq etc. */
941 err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); 982 err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
942 if (err) { 983 if (err) {
943 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); 984 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
944 return err; 985 return err;
@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
947 return 0; 988 return 0;
948 989
949fail: 990fail:
950 list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { 991 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
951 list_del(&req->free_list); 992 list_del(&req->free_list);
952 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 993 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
953 if (!req->segments[j]) 994 if (!req->segments[j])
@@ -962,6 +1003,93 @@ fail:
962 kfree(req); 1003 kfree(req);
963 } 1004 }
964 return -ENOMEM; 1005 return -ENOMEM;
1006
1007}
1008
1009static int connect_ring(struct backend_info *be)
1010{
1011 struct xenbus_device *dev = be->dev;
1012 unsigned int pers_grants;
1013 char protocol[64] = "";
1014 int err, i;
1015 char *xspath;
1016 size_t xspathsize;
1017 const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
1018 unsigned int requested_num_queues = 0;
1019
1020 pr_debug("%s %s\n", __func__, dev->otherend);
1021
1022 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
1023 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
1024 "%63s", protocol, NULL);
1025 if (err)
1026 strcpy(protocol, "unspecified, assuming default");
1027 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
1028 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
1029 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
1030 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
1031 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
1032 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
1033 else {
1034 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
1035 return -ENOSYS;
1036 }
1037 err = xenbus_gather(XBT_NIL, dev->otherend,
1038 "feature-persistent", "%u",
1039 &pers_grants, NULL);
1040 if (err)
1041 pers_grants = 0;
1042
1043 be->blkif->vbd.feature_gnt_persistent = pers_grants;
1044 be->blkif->vbd.overflow_max_grants = 0;
1045
1046 /*
 1047	 * Read the number of hardware queues from the frontend.
1048 */
1049 err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
1050 "%u", &requested_num_queues);
1051 if (err < 0) {
1052 requested_num_queues = 1;
1053 } else {
1054 if (requested_num_queues > xenblk_max_queues
1055 || requested_num_queues == 0) {
1056 /* Buggy or malicious guest. */
1057 xenbus_dev_fatal(dev, err,
1058 "guest requested %u queues, exceeding the maximum of %u.",
1059 requested_num_queues, xenblk_max_queues);
1060 return -ENOSYS;
1061 }
1062 }
1063 be->blkif->nr_rings = requested_num_queues;
1064 if (xen_blkif_alloc_rings(be->blkif))
1065 return -ENOMEM;
1066
1067 pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
1068 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
1069 pers_grants ? "persistent grants" : "");
1070
1071 if (be->blkif->nr_rings == 1)
1072 return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
1073 else {
1074 xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
1075 xspath = kmalloc(xspathsize, GFP_KERNEL);
1076 if (!xspath) {
1077 xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
1078 return -ENOMEM;
1079 }
1080
1081 for (i = 0; i < be->blkif->nr_rings; i++) {
1082 memset(xspath, 0, xspathsize);
1083 snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
1084 err = read_per_ring_refs(&be->blkif->rings[i], xspath);
1085 if (err) {
1086 kfree(xspath);
1087 return err;
1088 }
1089 }
1090 kfree(xspath);
1091 }
1092 return 0;
965} 1093}
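To make the per-queue layout easier to picture, a small standalone sketch follows; it prints the xenstore keys read_per_ring_refs() ends up reading for a hypothetical two-queue frontend. The device path is invented for illustration; only the "%s/queue-%u" construction and the key names mirror the code above.

/* Illustrative sketch only: prints the per-queue xenstore keys for a
 * made-up two-queue frontend. */
#include <stdio.h>

int main(void)
{
	const char *otherend = "/local/domain/1/device/vbd/51712"; /* example */
	unsigned int nr_rings = 2, i;
	char xspath[64];

	for (i = 0; i < nr_rings; i++) {
		/* Same "%s/queue-%u" construction used by connect_ring(). */
		snprintf(xspath, sizeof(xspath), "%s/queue-%u", otherend, i);
		printf("%s/ring-ref\n", xspath);	/* or ring-ref0..N */
		printf("%s/event-channel\n", xspath);
	}
	/* With a single queue the same keys live directly under otherend,
	 * which keeps old single-ring frontends working unchanged. */
	return 0;
}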
966 1094
967static const struct xenbus_device_id xen_blkbk_ids[] = { 1095static const struct xenbus_device_id xen_blkbk_ids[] = {
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2fee2eef988d..8a8dc91c39f7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
60 60
61#include <asm/xen/hypervisor.h> 61#include <asm/xen/hypervisor.h>
62 62
63/*
64 * The minimal size of segment supported by the block framework is PAGE_SIZE.
65 * When Linux is using a different page size than Xen, it may not be possible
66 * to put all the data in a single segment.
 67 * This can happen when the backend doesn't support indirect descriptors and
68 * therefore the maximum amount of data that a request can carry is
69 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
70 *
71 * Note that we only support one extra request. So the Linux page size
72 * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
73 * 88KB.
74 */
75#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
76
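A quick arithmetic sketch of where the 44KB/88KB figures in the comment above come from, assuming the usual values of 11 segments per blkif request and a 4 KiB Xen page size (both defined elsewhere in the tree):

/* Illustrative arithmetic only; the two constants are assumed values. */
#include <stdio.h>

#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11	/* assumed */
#define XEN_PAGE_SIZE 4096			/* assumed */

int main(void)
{
	unsigned int per_req = BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE;

	printf("one request carries %u KB\n", per_req / 1024);		/* 44 */
	printf("two requests carry  %u KB\n", 2 * per_req / 1024);	/* 88 */
	/* A 64 KB Linux page spans 16 Xen pages, more than the 11 segments
	 * one request can hold, hence HAS_EXTRA_REQ above. */
	return 0;
}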
63enum blkif_state { 77enum blkif_state {
64 BLKIF_STATE_DISCONNECTED, 78 BLKIF_STATE_DISCONNECTED,
65 BLKIF_STATE_CONNECTED, 79 BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
72 struct list_head node; 86 struct list_head node;
73}; 87};
74 88
89enum blk_req_status {
90 REQ_WAITING,
91 REQ_DONE,
92 REQ_ERROR,
93 REQ_EOPNOTSUPP,
94};
95
75struct blk_shadow { 96struct blk_shadow {
76 struct blkif_request req; 97 struct blkif_request req;
77 struct request *request; 98 struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
79 struct grant **indirect_grants; 100 struct grant **indirect_grants;
80 struct scatterlist *sg; 101 struct scatterlist *sg;
81 unsigned int num_sg; 102 unsigned int num_sg;
103 enum blk_req_status status;
104
105 #define NO_ASSOCIATED_ID ~0UL
106 /*
107 * Id of the sibling if we ever need 2 requests when handling a
108 * block I/O request
109 */
110 unsigned long associated_id;
82}; 111};
83 112
84struct split_bio { 113struct split_bio {
@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
99module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); 128module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
100MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); 129MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
101 130
131static unsigned int xen_blkif_max_queues = 4;
132module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
133MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
134
102/* 135/*
103 * Maximum order of pages to be used for the shared ring between front and 136 * Maximum order of pages to be used for the shared ring between front and
104 * backend, 4KB page granularity is used. 137 * backend, 4KB page granularity is used.
@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
114 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) 147 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
115 148
116/* 149/*
117 * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 150 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
118 * characters are enough. Define to 20 to keep consist with backend. 151 * characters are enough. Define to 20 to keep consistent with backend.
119 */ 152 */
120#define RINGREF_NAME_LEN (20) 153#define RINGREF_NAME_LEN (20)
154/*
155 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
156 */
157#define QUEUE_NAME_LEN (17)
158
159/*
160 * Per-ring info.
161 * Every blkfront device can associate with one or more blkfront_ring_info,
 162 * depending on how many hardware queues/rings are to be used.
163 */
164struct blkfront_ring_info {
165 /* Lock to protect data in every ring buffer. */
166 spinlock_t ring_lock;
167 struct blkif_front_ring ring;
168 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
169 unsigned int evtchn, irq;
170 struct work_struct work;
171 struct gnttab_free_callback callback;
172 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
173 struct list_head indirect_pages;
174 struct list_head grants;
175 unsigned int persistent_gnts_c;
176 unsigned long shadow_free;
177 struct blkfront_info *dev_info;
178};
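As a rough standalone model (field names trimmed and hypothetical), this is how the new split between per-device and per-ring state is meant to be used: slow-path, device-wide fields stay in blkfront_info, while everything touched per I/O lives in blkfront_ring_info, reached through info->rinfo[i] and carrying a back-pointer to the device.

/* Illustrative layout only: a trimmed, hypothetical model of the
 * per-device vs. per-ring split. */
#include <stdio.h>
#include <stdlib.h>

struct dev_info;

struct ring_info {			/* stands in for blkfront_ring_info */
	unsigned int evtchn;
	unsigned long shadow_free;
	struct dev_info *dev_info;	/* back-pointer, as in the patch */
};

struct dev_info {			/* stands in for blkfront_info */
	unsigned int nr_rings;
	struct ring_info *rinfo;	/* array of nr_rings entries */
};

int main(void)
{
	struct dev_info info = { .nr_rings = 2 };
	unsigned int i;

	info.rinfo = calloc(info.nr_rings, sizeof(*info.rinfo));
	if (!info.rinfo)
		return 1;
	for (i = 0; i < info.nr_rings; i++)
		info.rinfo[i].dev_info = &info;	/* set up the back-pointers */

	printf("ring 1 belongs to a device with %u rings\n",
	       info.rinfo[1].dev_info->nr_rings);
	free(info.rinfo);
	return 0;
}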
121 179
122/* 180/*
123 * We have one of these per vbd, whether ide, scsi or 'other'. They 181 * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
126 */ 184 */
127struct blkfront_info 185struct blkfront_info
128{ 186{
129 spinlock_t io_lock;
130 struct mutex mutex; 187 struct mutex mutex;
131 struct xenbus_device *xbdev; 188 struct xenbus_device *xbdev;
132 struct gendisk *gd; 189 struct gendisk *gd;
133 int vdevice; 190 int vdevice;
134 blkif_vdev_t handle; 191 blkif_vdev_t handle;
135 enum blkif_state connected; 192 enum blkif_state connected;
136 int ring_ref[XENBUS_MAX_RING_GRANTS]; 193 /* Number of pages per ring buffer. */
137 unsigned int nr_ring_pages; 194 unsigned int nr_ring_pages;
138 struct blkif_front_ring ring;
139 unsigned int evtchn, irq;
140 struct request_queue *rq; 195 struct request_queue *rq;
141 struct work_struct work;
142 struct gnttab_free_callback callback;
143 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
144 struct list_head grants;
145 struct list_head indirect_pages;
146 unsigned int persistent_gnts_c;
147 unsigned long shadow_free;
148 unsigned int feature_flush; 196 unsigned int feature_flush;
149 unsigned int feature_discard:1; 197 unsigned int feature_discard:1;
150 unsigned int feature_secdiscard:1; 198 unsigned int feature_secdiscard:1;
@@ -155,6 +203,8 @@ struct blkfront_info
155 unsigned int max_indirect_segments; 203 unsigned int max_indirect_segments;
156 int is_ready; 204 int is_ready;
157 struct blk_mq_tag_set tag_set; 205 struct blk_mq_tag_set tag_set;
206 struct blkfront_ring_info *rinfo;
207 unsigned int nr_rings;
158}; 208};
159 209
160static unsigned int nr_minors; 210static unsigned int nr_minors;
@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
198 248
199#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) 249#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
200 250
201static int blkfront_setup_indirect(struct blkfront_info *info); 251static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
202static int blkfront_gather_backend_features(struct blkfront_info *info); 252static void blkfront_gather_backend_features(struct blkfront_info *info);
203 253
204static int get_id_from_freelist(struct blkfront_info *info) 254static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
205{ 255{
206 unsigned long free = info->shadow_free; 256 unsigned long free = rinfo->shadow_free;
207 BUG_ON(free >= BLK_RING_SIZE(info)); 257
208 info->shadow_free = info->shadow[free].req.u.rw.id; 258 BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
209 info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ 259 rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
260 rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
210 return free; 261 return free;
211} 262}
212 263
213static int add_id_to_freelist(struct blkfront_info *info, 264static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
214 unsigned long id) 265 unsigned long id)
215{ 266{
216 if (info->shadow[id].req.u.rw.id != id) 267 if (rinfo->shadow[id].req.u.rw.id != id)
217 return -EINVAL; 268 return -EINVAL;
218 if (info->shadow[id].request == NULL) 269 if (rinfo->shadow[id].request == NULL)
219 return -EINVAL; 270 return -EINVAL;
220 info->shadow[id].req.u.rw.id = info->shadow_free; 271 rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
221 info->shadow[id].request = NULL; 272 rinfo->shadow[id].request = NULL;
222 info->shadow_free = id; 273 rinfo->shadow_free = id;
223 return 0; 274 return 0;
224} 275}
225 276
226static int fill_grant_buffer(struct blkfront_info *info, int num) 277static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
227{ 278{
279 struct blkfront_info *info = rinfo->dev_info;
228 struct page *granted_page; 280 struct page *granted_page;
229 struct grant *gnt_list_entry, *n; 281 struct grant *gnt_list_entry, *n;
230 int i = 0; 282 int i = 0;
231 283
232 while(i < num) { 284 while (i < num) {
233 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); 285 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
234 if (!gnt_list_entry) 286 if (!gnt_list_entry)
235 goto out_of_memory; 287 goto out_of_memory;
@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
244 } 296 }
245 297
246 gnt_list_entry->gref = GRANT_INVALID_REF; 298 gnt_list_entry->gref = GRANT_INVALID_REF;
247 list_add(&gnt_list_entry->node, &info->grants); 299 list_add(&gnt_list_entry->node, &rinfo->grants);
248 i++; 300 i++;
249 } 301 }
250 302
@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
252 304
253out_of_memory: 305out_of_memory:
254 list_for_each_entry_safe(gnt_list_entry, n, 306 list_for_each_entry_safe(gnt_list_entry, n,
255 &info->grants, node) { 307 &rinfo->grants, node) {
256 list_del(&gnt_list_entry->node); 308 list_del(&gnt_list_entry->node);
257 if (info->feature_persistent) 309 if (info->feature_persistent)
258 __free_page(gnt_list_entry->page); 310 __free_page(gnt_list_entry->page);
@@ -263,17 +315,17 @@ out_of_memory:
263 return -ENOMEM; 315 return -ENOMEM;
264} 316}
265 317
266static struct grant *get_free_grant(struct blkfront_info *info) 318static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
267{ 319{
268 struct grant *gnt_list_entry; 320 struct grant *gnt_list_entry;
269 321
270 BUG_ON(list_empty(&info->grants)); 322 BUG_ON(list_empty(&rinfo->grants));
271 gnt_list_entry = list_first_entry(&info->grants, struct grant, 323 gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
272 node); 324 node);
273 list_del(&gnt_list_entry->node); 325 list_del(&gnt_list_entry->node);
274 326
275 if (gnt_list_entry->gref != GRANT_INVALID_REF) 327 if (gnt_list_entry->gref != GRANT_INVALID_REF)
276 info->persistent_gnts_c--; 328 rinfo->persistent_gnts_c--;
277 329
278 return gnt_list_entry; 330 return gnt_list_entry;
279} 331}
@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
289 341
290static struct grant *get_grant(grant_ref_t *gref_head, 342static struct grant *get_grant(grant_ref_t *gref_head,
291 unsigned long gfn, 343 unsigned long gfn,
292 struct blkfront_info *info) 344 struct blkfront_ring_info *rinfo)
293{ 345{
294 struct grant *gnt_list_entry = get_free_grant(info); 346 struct grant *gnt_list_entry = get_free_grant(rinfo);
347 struct blkfront_info *info = rinfo->dev_info;
295 348
296 if (gnt_list_entry->gref != GRANT_INVALID_REF) 349 if (gnt_list_entry->gref != GRANT_INVALID_REF)
297 return gnt_list_entry; 350 return gnt_list_entry;
@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
312} 365}
313 366
314static struct grant *get_indirect_grant(grant_ref_t *gref_head, 367static struct grant *get_indirect_grant(grant_ref_t *gref_head,
315 struct blkfront_info *info) 368 struct blkfront_ring_info *rinfo)
316{ 369{
317 struct grant *gnt_list_entry = get_free_grant(info); 370 struct grant *gnt_list_entry = get_free_grant(rinfo);
371 struct blkfront_info *info = rinfo->dev_info;
318 372
319 if (gnt_list_entry->gref != GRANT_INVALID_REF) 373 if (gnt_list_entry->gref != GRANT_INVALID_REF)
320 return gnt_list_entry; 374 return gnt_list_entry;
@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
326 struct page *indirect_page; 380 struct page *indirect_page;
327 381
328 /* Fetch a pre-allocated page to use for indirect grefs */ 382 /* Fetch a pre-allocated page to use for indirect grefs */
329 BUG_ON(list_empty(&info->indirect_pages)); 383 BUG_ON(list_empty(&rinfo->indirect_pages));
330 indirect_page = list_first_entry(&info->indirect_pages, 384 indirect_page = list_first_entry(&rinfo->indirect_pages,
331 struct page, lru); 385 struct page, lru);
332 list_del(&indirect_page->lru); 386 list_del(&indirect_page->lru);
333 gnt_list_entry->page = indirect_page; 387 gnt_list_entry->page = indirect_page;
@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
403 457
404static void blkif_restart_queue_callback(void *arg) 458static void blkif_restart_queue_callback(void *arg)
405{ 459{
406 struct blkfront_info *info = (struct blkfront_info *)arg; 460 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
407 schedule_work(&info->work); 461 schedule_work(&rinfo->work);
408} 462}
409 463
410static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) 464static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
456 return 0; 510 return 0;
457} 511}
458 512
459static int blkif_queue_discard_req(struct request *req) 513static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
514 struct request *req,
515 struct blkif_request **ring_req)
460{ 516{
461 struct blkfront_info *info = req->rq_disk->private_data; 517 unsigned long id;
518
519 *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
520 rinfo->ring.req_prod_pvt++;
521
522 id = get_id_from_freelist(rinfo);
523 rinfo->shadow[id].request = req;
524 rinfo->shadow[id].status = REQ_WAITING;
525 rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
526
527 (*ring_req)->u.rw.id = id;
528
529 return id;
530}
531
532static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
533{
534 struct blkfront_info *info = rinfo->dev_info;
462 struct blkif_request *ring_req; 535 struct blkif_request *ring_req;
463 unsigned long id; 536 unsigned long id;
464 537
465 /* Fill out a communications ring structure. */ 538 /* Fill out a communications ring structure. */
466 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 539 id = blkif_ring_get_request(rinfo, req, &ring_req);
467 id = get_id_from_freelist(info);
468 info->shadow[id].request = req;
469 540
470 ring_req->operation = BLKIF_OP_DISCARD; 541 ring_req->operation = BLKIF_OP_DISCARD;
471 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 542 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
476 else 547 else
477 ring_req->u.discard.flag = 0; 548 ring_req->u.discard.flag = 0;
478 549
479 info->ring.req_prod_pvt++;
480
481 /* Keep a private copy so we can reissue requests when recovering. */ 550 /* Keep a private copy so we can reissue requests when recovering. */
482 info->shadow[id].req = *ring_req; 551 rinfo->shadow[id].req = *ring_req;
483 552
484 return 0; 553 return 0;
485} 554}
@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
487struct setup_rw_req { 556struct setup_rw_req {
488 unsigned int grant_idx; 557 unsigned int grant_idx;
489 struct blkif_request_segment *segments; 558 struct blkif_request_segment *segments;
490 struct blkfront_info *info; 559 struct blkfront_ring_info *rinfo;
491 struct blkif_request *ring_req; 560 struct blkif_request *ring_req;
492 grant_ref_t gref_head; 561 grant_ref_t gref_head;
493 unsigned int id; 562 unsigned int id;
@@ -495,6 +564,9 @@ struct setup_rw_req {
495 bool need_copy; 564 bool need_copy;
496 unsigned int bvec_off; 565 unsigned int bvec_off;
497 char *bvec_data; 566 char *bvec_data;
567
568 bool require_extra_req;
569 struct blkif_request *extra_ring_req;
498}; 570};
499 571
500static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, 572static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
507 /* Convenient aliases */ 579 /* Convenient aliases */
508 unsigned int grant_idx = setup->grant_idx; 580 unsigned int grant_idx = setup->grant_idx;
509 struct blkif_request *ring_req = setup->ring_req; 581 struct blkif_request *ring_req = setup->ring_req;
510 struct blkfront_info *info = setup->info; 582 struct blkfront_ring_info *rinfo = setup->rinfo;
511 struct blk_shadow *shadow = &info->shadow[setup->id]; 583 /*
 584	 * We always use the shadow of the first request to store the list
 585	 * of grants associated with the block I/O request. This makes the
 586	 * completion easier to handle even if the block I/O request is
 587	 * split.
588 */
589 struct blk_shadow *shadow = &rinfo->shadow[setup->id];
590
591 if (unlikely(setup->require_extra_req &&
592 grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
593 /*
 594		 * We are using the second request; set up grant_idx
 595		 * to index into its segment array.
596 */
597 grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
598 ring_req = setup->extra_ring_req;
599 }
512 600
513 if ((ring_req->operation == BLKIF_OP_INDIRECT) && 601 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
514 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { 602 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
516 kunmap_atomic(setup->segments); 604 kunmap_atomic(setup->segments);
517 605
518 n = grant_idx / GRANTS_PER_INDIRECT_FRAME; 606 n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
519 gnt_list_entry = get_indirect_grant(&setup->gref_head, info); 607 gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
520 shadow->indirect_grants[n] = gnt_list_entry; 608 shadow->indirect_grants[n] = gnt_list_entry;
521 setup->segments = kmap_atomic(gnt_list_entry->page); 609 setup->segments = kmap_atomic(gnt_list_entry->page);
522 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; 610 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
523 } 611 }
524 612
525 gnt_list_entry = get_grant(&setup->gref_head, gfn, info); 613 gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
526 ref = gnt_list_entry->gref; 614 ref = gnt_list_entry->gref;
527 shadow->grants_used[grant_idx] = gnt_list_entry; 615 /*
616 * All the grants are stored in the shadow of the first
617 * request. Therefore we have to use the global index.
618 */
619 shadow->grants_used[setup->grant_idx] = gnt_list_entry;
528 620
529 if (setup->need_copy) { 621 if (setup->need_copy) {
530 void *shared_data; 622 void *shared_data;
@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
566 (setup->grant_idx)++; 658 (setup->grant_idx)++;
567} 659}
568 660
569static int blkif_queue_rw_req(struct request *req) 661static void blkif_setup_extra_req(struct blkif_request *first,
662 struct blkif_request *second)
570{ 663{
571 struct blkfront_info *info = req->rq_disk->private_data; 664 uint16_t nr_segments = first->u.rw.nr_segments;
572 struct blkif_request *ring_req; 665
573 unsigned long id; 666 /*
667 * The second request is only present when the first request uses
668 * all its segments. It's always the continuity of the first one.
669 */
670 first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
671
672 second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
673 second->u.rw.sector_number = first->u.rw.sector_number +
674 (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
675
676 second->u.rw.handle = first->u.rw.handle;
677 second->operation = first->operation;
678}
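A worked example of the split performed by blkif_setup_extra_req(), modelled standalone with assumed values (11 segments per request, 4 KiB Xen pages, 512-byte sectors): a 16-segment request becomes an 11-segment first half plus a 5-segment second half starting 88 sectors later.

/* Illustrative only: the bookkeeping of blkif_setup_extra_req() with plain
 * integers (11 segments per request, 4 KiB pages, 512-byte sectors assumed). */
#include <stdio.h>
#include <stdint.h>

#define MAX_SEGS	11
#define XEN_PAGE_SIZE	4096

struct rw_req {
	uint16_t nr_segments;
	uint64_t sector_number;
};

static void setup_extra_req(struct rw_req *first, struct rw_req *second)
{
	uint16_t nr_segments = first->nr_segments;

	first->nr_segments = MAX_SEGS;
	second->nr_segments = nr_segments - MAX_SEGS;
	/* The second request simply continues where the first one stops. */
	second->sector_number = first->sector_number +
				(MAX_SEGS * XEN_PAGE_SIZE) / 512;
}

int main(void)
{
	struct rw_req first = { .nr_segments = 16, .sector_number = 1000 };
	struct rw_req second = { 0 };

	setup_extra_req(&first, &second);
	printf("first:  %u segments at sector %llu\n",
	       (unsigned int)first.nr_segments,
	       (unsigned long long)first.sector_number);	/* 11 @ 1000 */
	printf("second: %u segments at sector %llu\n",
	       (unsigned int)second.nr_segments,
	       (unsigned long long)second.sector_number);	/* 5 @ 1088 */
	return 0;
}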
679
680static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
681{
682 struct blkfront_info *info = rinfo->dev_info;
683 struct blkif_request *ring_req, *extra_ring_req = NULL;
684 unsigned long id, extra_id = NO_ASSOCIATED_ID;
685 bool require_extra_req = false;
574 int i; 686 int i;
575 struct setup_rw_req setup = { 687 struct setup_rw_req setup = {
576 .grant_idx = 0, 688 .grant_idx = 0,
577 .segments = NULL, 689 .segments = NULL,
578 .info = info, 690 .rinfo = rinfo,
579 .need_copy = rq_data_dir(req) && info->feature_persistent, 691 .need_copy = rq_data_dir(req) && info->feature_persistent,
580 }; 692 };
581 693
@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
584 * existing persistent grants, or if we have to get new grants, 696 * existing persistent grants, or if we have to get new grants,
585 * as there are not sufficiently many free. 697 * as there are not sufficiently many free.
586 */ 698 */
587 bool new_persistent_gnts;
588 struct scatterlist *sg; 699 struct scatterlist *sg;
589 int num_sg, max_grefs, num_grant; 700 int num_sg, max_grefs, num_grant;
590 701
@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
596 */ 707 */
597 max_grefs += INDIRECT_GREFS(max_grefs); 708 max_grefs += INDIRECT_GREFS(max_grefs);
598 709
599 /* Check if we have enough grants to allocate a requests */ 710 /*
600 if (info->persistent_gnts_c < max_grefs) { 711 * We have to reserve 'max_grefs' grants because persistent
601 new_persistent_gnts = 1; 712 * grants are shared by all rings.
602 if (gnttab_alloc_grant_references( 713 */
603 max_grefs - info->persistent_gnts_c, 714 if (max_grefs > 0)
604 &setup.gref_head) < 0) { 715 if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
605 gnttab_request_free_callback( 716 gnttab_request_free_callback(
606 &info->callback, 717 &rinfo->callback,
607 blkif_restart_queue_callback, 718 blkif_restart_queue_callback,
608 info, 719 rinfo,
609 max_grefs); 720 max_grefs);
610 return 1; 721 return 1;
611 } 722 }
612 } else
613 new_persistent_gnts = 0;
614 723
615 /* Fill out a communications ring structure. */ 724 /* Fill out a communications ring structure. */
616 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 725 id = blkif_ring_get_request(rinfo, req, &ring_req);
617 id = get_id_from_freelist(info);
618 info->shadow[id].request = req;
619
620 BUG_ON(info->max_indirect_segments == 0 &&
621 GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
622 BUG_ON(info->max_indirect_segments &&
623 GREFS(req->nr_phys_segments) > info->max_indirect_segments);
624 726
625 num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); 727 num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
626 num_grant = 0; 728 num_grant = 0;
627 /* Calculate the number of grant used */ 729 /* Calculate the number of grant used */
628 for_each_sg(info->shadow[id].sg, sg, num_sg, i) 730 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
629 num_grant += gnttab_count_grant(sg->offset, sg->length); 731 num_grant += gnttab_count_grant(sg->offset, sg->length);
630 732
631 ring_req->u.rw.id = id; 733 require_extra_req = info->max_indirect_segments == 0 &&
632 info->shadow[id].num_sg = num_sg; 734 num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
633 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { 735 BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
736
737 rinfo->shadow[id].num_sg = num_sg;
738 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
739 likely(!require_extra_req)) {
634 /* 740 /*
635 * The indirect operation can only be a BLKIF_OP_READ or 741 * The indirect operation can only be a BLKIF_OP_READ or
636 * BLKIF_OP_WRITE 742 * BLKIF_OP_WRITE
@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
670 } 776 }
671 } 777 }
672 ring_req->u.rw.nr_segments = num_grant; 778 ring_req->u.rw.nr_segments = num_grant;
779 if (unlikely(require_extra_req)) {
780 extra_id = blkif_ring_get_request(rinfo, req,
781 &extra_ring_req);
782 /*
783 * Only the first request contains the scatter-gather
784 * list.
785 */
786 rinfo->shadow[extra_id].num_sg = 0;
787
788 blkif_setup_extra_req(ring_req, extra_ring_req);
789
790 /* Link the 2 requests together */
791 rinfo->shadow[extra_id].associated_id = id;
792 rinfo->shadow[id].associated_id = extra_id;
793 }
673 } 794 }
674 795
675 setup.ring_req = ring_req; 796 setup.ring_req = ring_req;
676 setup.id = id; 797 setup.id = id;
677 for_each_sg(info->shadow[id].sg, sg, num_sg, i) { 798
799 setup.require_extra_req = require_extra_req;
800 if (unlikely(require_extra_req))
801 setup.extra_ring_req = extra_ring_req;
802
803 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
678 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 804 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
679 805
680 if (setup.need_copy) { 806 if (setup.need_copy) {
@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
694 if (setup.segments) 820 if (setup.segments)
695 kunmap_atomic(setup.segments); 821 kunmap_atomic(setup.segments);
696 822
697 info->ring.req_prod_pvt++;
698
699 /* Keep a private copy so we can reissue requests when recovering. */ 823 /* Keep a private copy so we can reissue requests when recovering. */
700 info->shadow[id].req = *ring_req; 824 rinfo->shadow[id].req = *ring_req;
825 if (unlikely(require_extra_req))
826 rinfo->shadow[extra_id].req = *extra_ring_req;
701 827
702 if (new_persistent_gnts) 828 if (max_grefs > 0)
703 gnttab_free_grant_references(setup.gref_head); 829 gnttab_free_grant_references(setup.gref_head);
704 830
705 return 0; 831 return 0;
@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
711 * 837 *
712 * @req: a request struct 838 * @req: a request struct
713 */ 839 */
714static int blkif_queue_request(struct request *req) 840static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
715{ 841{
716 struct blkfront_info *info = req->rq_disk->private_data; 842 if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
717
718 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
719 return 1; 843 return 1;
720 844
721 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) 845 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
722 return blkif_queue_discard_req(req); 846 return blkif_queue_discard_req(req, rinfo);
723 else 847 else
724 return blkif_queue_rw_req(req); 848 return blkif_queue_rw_req(req, rinfo);
725} 849}
726 850
727static inline void flush_requests(struct blkfront_info *info) 851static inline void flush_requests(struct blkfront_ring_info *rinfo)
728{ 852{
729 int notify; 853 int notify;
730 854
731 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); 855 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
732 856
733 if (notify) 857 if (notify)
734 notify_remote_via_irq(info->irq); 858 notify_remote_via_irq(rinfo->irq);
735} 859}
736 860
737static inline bool blkif_request_flush_invalid(struct request *req, 861static inline bool blkif_request_flush_invalid(struct request *req,
@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
745} 869}
746 870
747static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, 871static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
748 const struct blk_mq_queue_data *qd) 872 const struct blk_mq_queue_data *qd)
749{ 873{
750 struct blkfront_info *info = qd->rq->rq_disk->private_data; 874 unsigned long flags;
875 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
751 876
752 blk_mq_start_request(qd->rq); 877 blk_mq_start_request(qd->rq);
753 spin_lock_irq(&info->io_lock); 878 spin_lock_irqsave(&rinfo->ring_lock, flags);
754 if (RING_FULL(&info->ring)) 879 if (RING_FULL(&rinfo->ring))
755 goto out_busy; 880 goto out_busy;
756 881
757 if (blkif_request_flush_invalid(qd->rq, info)) 882 if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
758 goto out_err; 883 goto out_err;
759 884
760 if (blkif_queue_request(qd->rq)) 885 if (blkif_queue_request(qd->rq, rinfo))
761 goto out_busy; 886 goto out_busy;
762 887
763 flush_requests(info); 888 flush_requests(rinfo);
764 spin_unlock_irq(&info->io_lock); 889 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
765 return BLK_MQ_RQ_QUEUE_OK; 890 return BLK_MQ_RQ_QUEUE_OK;
766 891
767out_err: 892out_err:
768 spin_unlock_irq(&info->io_lock); 893 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
769 return BLK_MQ_RQ_QUEUE_ERROR; 894 return BLK_MQ_RQ_QUEUE_ERROR;
770 895
771out_busy: 896out_busy:
772 spin_unlock_irq(&info->io_lock); 897 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
773 blk_mq_stop_hw_queue(hctx); 898 blk_mq_stop_hw_queue(hctx);
774 return BLK_MQ_RQ_QUEUE_BUSY; 899 return BLK_MQ_RQ_QUEUE_BUSY;
775} 900}
776 901
902static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
903 unsigned int index)
904{
905 struct blkfront_info *info = (struct blkfront_info *)data;
906
907 BUG_ON(info->nr_rings <= index);
908 hctx->driver_data = &info->rinfo[index];
909 return 0;
910}
911
777static struct blk_mq_ops blkfront_mq_ops = { 912static struct blk_mq_ops blkfront_mq_ops = {
778 .queue_rq = blkif_queue_rq, 913 .queue_rq = blkif_queue_rq,
779 .map_queue = blk_mq_map_queue, 914 .map_queue = blk_mq_map_queue,
915 .init_hctx = blk_mq_init_hctx,
780}; 916};
781 917
782static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, 918static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
788 924
789 memset(&info->tag_set, 0, sizeof(info->tag_set)); 925 memset(&info->tag_set, 0, sizeof(info->tag_set));
790 info->tag_set.ops = &blkfront_mq_ops; 926 info->tag_set.ops = &blkfront_mq_ops;
791 info->tag_set.nr_hw_queues = 1; 927 info->tag_set.nr_hw_queues = info->nr_rings;
792 info->tag_set.queue_depth = BLK_RING_SIZE(info); 928 if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
929 /*
 930		 * When indirect descriptors are not supported, the I/O request
 931		 * will be split between multiple requests in the ring.
 932		 * To avoid problems when sending the requests, divide the
 933		 * queue depth by 2.
934 */
935 info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
936 } else
937 info->tag_set.queue_depth = BLK_RING_SIZE(info);
793 info->tag_set.numa_node = NUMA_NO_NODE; 938 info->tag_set.numa_node = NUMA_NO_NODE;
794 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 939 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
795 info->tag_set.cmd_size = 0; 940 info->tag_set.cmd_size = 0;
796 info->tag_set.driver_data = info; 941 info->tag_set.driver_data = info;
797 942
798 if (blk_mq_alloc_tag_set(&info->tag_set)) 943 if (blk_mq_alloc_tag_set(&info->tag_set))
799 return -1; 944 return -EINVAL;
800 rq = blk_mq_init_queue(&info->tag_set); 945 rq = blk_mq_init_queue(&info->tag_set);
801 if (IS_ERR(rq)) { 946 if (IS_ERR(rq)) {
802 blk_mq_free_tag_set(&info->tag_set); 947 blk_mq_free_tag_set(&info->tag_set);
803 return -1; 948 return PTR_ERR(rq);
804 } 949 }
805 950
806 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 951 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
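A small standalone illustration of the queue-depth halving in the hunk above, assuming the typical 32 request slots per 4 KiB ring page: if every block I/O may occupy two slots, advertising a depth of 16 guarantees the ring never overflows.

/* Illustrative only: why the depth is halved when an I/O may need two
 * ring slots. 32 slots per 4 KiB ring page is assumed. */
#include <stdio.h>

int main(void)
{
	unsigned int nr_ring_pages = 1;
	unsigned int ring_size = 32 * nr_ring_pages;	/* assumed BLK_RING_SIZE */
	int may_need_extra_req = 1;	/* 64 KB pages, no indirect descriptors */
	unsigned int depth = may_need_extra_req ? ring_size / 2 : ring_size;

	/* Even if all 16 in-flight requests are split in two, they still
	 * fit into the 32 available ring slots. */
	printf("queue depth = %u\n", depth);
	return 0;
}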
@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1028 1173
1029static void xlvbd_release_gendisk(struct blkfront_info *info) 1174static void xlvbd_release_gendisk(struct blkfront_info *info)
1030{ 1175{
1031 unsigned int minor, nr_minors; 1176 unsigned int minor, nr_minors, i;
1032 1177
1033 if (info->rq == NULL) 1178 if (info->rq == NULL)
1034 return; 1179 return;
@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1036 /* No more blkif_request(). */ 1181 /* No more blkif_request(). */
1037 blk_mq_stop_hw_queues(info->rq); 1182 blk_mq_stop_hw_queues(info->rq);
1038 1183
1039 /* No more gnttab callback work. */ 1184 for (i = 0; i < info->nr_rings; i++) {
1040 gnttab_cancel_free_callback(&info->callback); 1185 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1041 1186
1042 /* Flush gnttab callback work. Must be done with no locks held. */ 1187 /* No more gnttab callback work. */
1043 flush_work(&info->work); 1188 gnttab_cancel_free_callback(&rinfo->callback);
1189
1190 /* Flush gnttab callback work. Must be done with no locks held. */
1191 flush_work(&rinfo->work);
1192 }
1044 1193
1045 del_gendisk(info->gd); 1194 del_gendisk(info->gd);
1046 1195
@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1056 info->gd = NULL; 1205 info->gd = NULL;
1057} 1206}
1058 1207
1059/* Must be called with io_lock holded */ 1208/* Already hold rinfo->ring_lock. */
1060static void kick_pending_request_queues(struct blkfront_info *info) 1209static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1061{ 1210{
1062 if (!RING_FULL(&info->ring)) 1211 if (!RING_FULL(&rinfo->ring))
1063 blk_mq_start_stopped_hw_queues(info->rq, true); 1212 blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
1064} 1213}
1065 1214
1066static void blkif_restart_queue(struct work_struct *work) 1215static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1067{ 1216{
1068 struct blkfront_info *info = container_of(work, struct blkfront_info, work); 1217 unsigned long flags;
1069 1218
1070 spin_lock_irq(&info->io_lock); 1219 spin_lock_irqsave(&rinfo->ring_lock, flags);
1071 if (info->connected == BLKIF_STATE_CONNECTED) 1220 kick_pending_request_queues_locked(rinfo);
1072 kick_pending_request_queues(info); 1221 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1073 spin_unlock_irq(&info->io_lock);
1074} 1222}
1075 1223
1076static void blkif_free(struct blkfront_info *info, int suspend) 1224static void blkif_restart_queue(struct work_struct *work)
1077{ 1225{
1078 struct grant *persistent_gnt; 1226 struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
1079 struct grant *n;
1080 int i, j, segs;
1081 1227
1082 /* Prevent new requests being issued until we fix things up. */ 1228 if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1083 spin_lock_irq(&info->io_lock); 1229 kick_pending_request_queues(rinfo);
1084 info->connected = suspend ? 1230}
1085 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1086 /* No more blkif_request(). */
1087 if (info->rq)
1088 blk_mq_stop_hw_queues(info->rq);
1089 1231
1090 /* Remove all persistent grants */ 1232static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1091 if (!list_empty(&info->grants)) { 1233{
1092 list_for_each_entry_safe(persistent_gnt, n, 1234 struct grant *persistent_gnt, *n;
1093 &info->grants, node) { 1235 struct blkfront_info *info = rinfo->dev_info;
1094 list_del(&persistent_gnt->node); 1236 int i, j, segs;
1095 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1096 gnttab_end_foreign_access(persistent_gnt->gref,
1097 0, 0UL);
1098 info->persistent_gnts_c--;
1099 }
1100 if (info->feature_persistent)
1101 __free_page(persistent_gnt->page);
1102 kfree(persistent_gnt);
1103 }
1104 }
1105 BUG_ON(info->persistent_gnts_c != 0);
1106 1237
1107 /* 1238 /*
1108 * Remove indirect pages, this only happens when using indirect 1239 * Remove indirect pages, this only happens when using indirect
1109 * descriptors but not persistent grants 1240 * descriptors but not persistent grants
1110 */ 1241 */
1111 if (!list_empty(&info->indirect_pages)) { 1242 if (!list_empty(&rinfo->indirect_pages)) {
1112 struct page *indirect_page, *n; 1243 struct page *indirect_page, *n;
1113 1244
1114 BUG_ON(info->feature_persistent); 1245 BUG_ON(info->feature_persistent);
1115 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 1246 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1116 list_del(&indirect_page->lru); 1247 list_del(&indirect_page->lru);
1117 __free_page(indirect_page); 1248 __free_page(indirect_page);
1118 } 1249 }
1119 } 1250 }
1120 1251
1252 /* Remove all persistent grants. */
1253 if (!list_empty(&rinfo->grants)) {
1254 list_for_each_entry_safe(persistent_gnt, n,
1255 &rinfo->grants, node) {
1256 list_del(&persistent_gnt->node);
1257 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1258 gnttab_end_foreign_access(persistent_gnt->gref,
1259 0, 0UL);
1260 rinfo->persistent_gnts_c--;
1261 }
1262 if (info->feature_persistent)
1263 __free_page(persistent_gnt->page);
1264 kfree(persistent_gnt);
1265 }
1266 }
1267 BUG_ON(rinfo->persistent_gnts_c != 0);
1268
1121 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1122 /* 1270 /*
1123 * Clear persistent grants present in requests already 1271 * Clear persistent grants present in requests already
1124 * on the shared ring 1272 * on the shared ring
1125 */ 1273 */
1126 if (!info->shadow[i].request) 1274 if (!rinfo->shadow[i].request)
1127 goto free_shadow; 1275 goto free_shadow;
1128 1276
1129 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 1277 segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1130 info->shadow[i].req.u.indirect.nr_segments : 1278 rinfo->shadow[i].req.u.indirect.nr_segments :
1131 info->shadow[i].req.u.rw.nr_segments; 1279 rinfo->shadow[i].req.u.rw.nr_segments;
1132 for (j = 0; j < segs; j++) { 1280 for (j = 0; j < segs; j++) {
1133 persistent_gnt = info->shadow[i].grants_used[j]; 1281 persistent_gnt = rinfo->shadow[i].grants_used[j];
1134 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1282 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1135 if (info->feature_persistent) 1283 if (info->feature_persistent)
1136 __free_page(persistent_gnt->page); 1284 __free_page(persistent_gnt->page);
1137 kfree(persistent_gnt); 1285 kfree(persistent_gnt);
1138 } 1286 }
1139 1287
1140 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) 1288 if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1141 /* 1289 /*
1142 * If this is not an indirect operation don't try to 1290 * If this is not an indirect operation don't try to
1143 * free indirect segments 1291 * free indirect segments
@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
1145 goto free_shadow; 1293 goto free_shadow;
1146 1294
1147 for (j = 0; j < INDIRECT_GREFS(segs); j++) { 1295 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
1148 persistent_gnt = info->shadow[i].indirect_grants[j]; 1296 persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1149 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1297 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1150 __free_page(persistent_gnt->page); 1298 __free_page(persistent_gnt->page);
1151 kfree(persistent_gnt); 1299 kfree(persistent_gnt);
1152 } 1300 }
1153 1301
1154free_shadow: 1302free_shadow:
1155 kfree(info->shadow[i].grants_used); 1303 kfree(rinfo->shadow[i].grants_used);
1156 info->shadow[i].grants_used = NULL; 1304 rinfo->shadow[i].grants_used = NULL;
1157 kfree(info->shadow[i].indirect_grants); 1305 kfree(rinfo->shadow[i].indirect_grants);
1158 info->shadow[i].indirect_grants = NULL; 1306 rinfo->shadow[i].indirect_grants = NULL;
1159 kfree(info->shadow[i].sg); 1307 kfree(rinfo->shadow[i].sg);
1160 info->shadow[i].sg = NULL; 1308 rinfo->shadow[i].sg = NULL;
1161 } 1309 }
1162 1310
1163 /* No more gnttab callback work. */ 1311 /* No more gnttab callback work. */
1164 gnttab_cancel_free_callback(&info->callback); 1312 gnttab_cancel_free_callback(&rinfo->callback);
1165 spin_unlock_irq(&info->io_lock);
1166 1313
1167 /* Flush gnttab callback work. Must be done with no locks held. */ 1314 /* Flush gnttab callback work. Must be done with no locks held. */
1168 flush_work(&info->work); 1315 flush_work(&rinfo->work);
1169 1316
1170 /* Free resources associated with old device channel. */ 1317 /* Free resources associated with old device channel. */
1171 for (i = 0; i < info->nr_ring_pages; i++) { 1318 for (i = 0; i < info->nr_ring_pages; i++) {
1172 if (info->ring_ref[i] != GRANT_INVALID_REF) { 1319 if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
1173 gnttab_end_foreign_access(info->ring_ref[i], 0, 0); 1320 gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
1174 info->ring_ref[i] = GRANT_INVALID_REF; 1321 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1175 } 1322 }
1176 } 1323 }
1177 free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); 1324 free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
1178 info->ring.sring = NULL; 1325 rinfo->ring.sring = NULL;
1179 1326
1180 if (info->irq) 1327 if (rinfo->irq)
1181 unbind_from_irqhandler(info->irq, info); 1328 unbind_from_irqhandler(rinfo->irq, rinfo);
1182 info->evtchn = info->irq = 0; 1329 rinfo->evtchn = rinfo->irq = 0;
1330}
1183 1331
1332static void blkif_free(struct blkfront_info *info, int suspend)
1333{
1334 unsigned int i;
1335
1336 /* Prevent new requests being issued until we fix things up. */
1337 info->connected = suspend ?
1338 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1339 /* No more blkif_request(). */
1340 if (info->rq)
1341 blk_mq_stop_hw_queues(info->rq);
1342
1343 for (i = 0; i < info->nr_rings; i++)
1344 blkif_free_ring(&info->rinfo[i]);
1345
1346 kfree(info->rinfo);
1347 info->rinfo = NULL;
1348 info->nr_rings = 0;
1184} 1349}
1185 1350
1186struct copy_from_grant { 1351struct copy_from_grant {
@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1209 kunmap_atomic(shared_data); 1374 kunmap_atomic(shared_data);
1210} 1375}
1211 1376
1212static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, 1377static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1378{
1379 switch (rsp)
1380 {
1381 case BLKIF_RSP_OKAY:
1382 return REQ_DONE;
1383 case BLKIF_RSP_EOPNOTSUPP:
1384 return REQ_EOPNOTSUPP;
1385 case BLKIF_RSP_ERROR:
1386 /* Fallthrough. */
1387 default:
1388 return REQ_ERROR;
1389 }
1390}
1391
1392/*
1393 * Get the final status of the block request based on two ring response
1394 */
1395static int blkif_get_final_status(enum blk_req_status s1,
1396 enum blk_req_status s2)
1397{
1398 BUG_ON(s1 == REQ_WAITING);
1399 BUG_ON(s2 == REQ_WAITING);
1400
1401 if (s1 == REQ_ERROR || s2 == REQ_ERROR)
1402 return BLKIF_RSP_ERROR;
1403 else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
1404 return BLKIF_RSP_EOPNOTSUPP;
1405 return BLKIF_RSP_OKAY;
1406}
1407
1408static bool blkif_completion(unsigned long *id,
1409 struct blkfront_ring_info *rinfo,
1213 struct blkif_response *bret) 1410 struct blkif_response *bret)
1214{ 1411{
1215 int i = 0; 1412 int i = 0;
1216 struct scatterlist *sg; 1413 struct scatterlist *sg;
1217 int num_sg, num_grant; 1414 int num_sg, num_grant;
1415 struct blkfront_info *info = rinfo->dev_info;
1416 struct blk_shadow *s = &rinfo->shadow[*id];
1218 struct copy_from_grant data = { 1417 struct copy_from_grant data = {
1219 .s = s,
1220 .grant_idx = 0, 1418 .grant_idx = 0,
1221 }; 1419 };
1222 1420
1223 num_grant = s->req.operation == BLKIF_OP_INDIRECT ? 1421 num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1224 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; 1422 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1423
1424 /* The I/O request may be split in two. */
1425 if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1426 struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1427
1428 /* Keep the status of the current response in shadow. */
1429 s->status = blkif_rsp_to_req_status(bret->status);
1430
 1431		/* Wait for the second response if it is not here yet. */
1432 if (s2->status == REQ_WAITING)
1433 return 0;
1434
1435 bret->status = blkif_get_final_status(s->status,
1436 s2->status);
1437
1438 /*
 1439		 * All the grants are stored in the first shadow in order
1440 * to make the completion code simpler.
1441 */
1442 num_grant += s2->req.u.rw.nr_segments;
1443
1444 /*
1445 * The two responses may not come in order. Only the
1446 * first request will store the scatter-gather list.
1447 */
1448 if (s2->num_sg != 0) {
1449 /* Update "id" with the ID of the first response. */
1450 *id = s->associated_id;
1451 s = s2;
1452 }
1453
1454 /*
 1455		 * We don't need the second request anymore, so recycle
 1456		 * it now.
1457 */
1458 if (add_id_to_freelist(rinfo, s->associated_id))
1459 WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
1460 info->gd->disk_name, s->associated_id);
1461 }
1462
1463 data.s = s;
1225 num_sg = s->num_sg; 1464 num_sg = s->num_sg;
1226 1465
1227 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { 1466 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1252 if (!info->feature_persistent) 1491 if (!info->feature_persistent)
1253 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1492 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1254 s->grants_used[i]->gref); 1493 s->grants_used[i]->gref);
1255 list_add(&s->grants_used[i]->node, &info->grants); 1494 list_add(&s->grants_used[i]->node, &rinfo->grants);
1256 info->persistent_gnts_c++; 1495 rinfo->persistent_gnts_c++;
1257 } else { 1496 } else {
1258 /* 1497 /*
1259 * If the grant is not mapped by the backend we end the 1498 * If the grant is not mapped by the backend we end the
@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1263 */ 1502 */
1264 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); 1503 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1265 s->grants_used[i]->gref = GRANT_INVALID_REF; 1504 s->grants_used[i]->gref = GRANT_INVALID_REF;
1266 list_add_tail(&s->grants_used[i]->node, &info->grants); 1505 list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
1267 } 1506 }
1268 } 1507 }
1269 if (s->req.operation == BLKIF_OP_INDIRECT) { 1508 if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1272 if (!info->feature_persistent) 1511 if (!info->feature_persistent)
1273 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1512 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1274 s->indirect_grants[i]->gref); 1513 s->indirect_grants[i]->gref);
1275 list_add(&s->indirect_grants[i]->node, &info->grants); 1514 list_add(&s->indirect_grants[i]->node, &rinfo->grants);
1276 info->persistent_gnts_c++; 1515 rinfo->persistent_gnts_c++;
1277 } else { 1516 } else {
1278 struct page *indirect_page; 1517 struct page *indirect_page;
1279 1518
@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1284 */ 1523 */
1285 if (!info->feature_persistent) { 1524 if (!info->feature_persistent) {
1286 indirect_page = s->indirect_grants[i]->page; 1525 indirect_page = s->indirect_grants[i]->page;
1287 list_add(&indirect_page->lru, &info->indirect_pages); 1526 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1288 } 1527 }
1289 s->indirect_grants[i]->gref = GRANT_INVALID_REF; 1528 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1290 list_add_tail(&s->indirect_grants[i]->node, &info->grants); 1529 list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
1291 } 1530 }
1292 } 1531 }
1293 } 1532 }
1533
1534 return 1;
1294} 1535}
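The split-request completion above boils down to "remember the first response, finish on the second, whichever order they arrive in". A tiny standalone model of that rule follows; all names are hypothetical.

/* Illustrative only: a tiny model of the "finish on the second half" rule
 * used for split requests. */
#include <stdio.h>

enum half_status { WAITING, DONE, ERROR };

struct half {
	enum half_status status;
	int associated_id;		/* index of the sibling half */
};

/* Returns 1 when the whole block I/O can be completed, 0 to keep waiting. */
static int complete_half(struct half *shadow, int id, enum half_status resp)
{
	struct half *s = &shadow[id];
	struct half *other = &shadow[s->associated_id];

	s->status = resp;
	if (other->status == WAITING)
		return 0;		/* first half seen: remember and wait */
	return 1;			/* both halves seen: complete now */
}

int main(void)
{
	struct half shadow[2] = {
		{ WAITING, 1 },		/* id 0 is paired with id 1 */
		{ WAITING, 0 },
	};

	printf("after first response:  complete? %d\n",
	       complete_half(shadow, 1, DONE));
	printf("after second response: complete? %d\n",
	       complete_half(shadow, 0, DONE));
	return 0;
}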
1295 1536
1296static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1537static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1299 struct blkif_response *bret; 1540 struct blkif_response *bret;
1300 RING_IDX i, rp; 1541 RING_IDX i, rp;
1301 unsigned long flags; 1542 unsigned long flags;
1302 struct blkfront_info *info = (struct blkfront_info *)dev_id; 1543 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
1544 struct blkfront_info *info = rinfo->dev_info;
1303 int error; 1545 int error;
1304 1546
1305 spin_lock_irqsave(&info->io_lock, flags); 1547 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
1306
1307 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1308 spin_unlock_irqrestore(&info->io_lock, flags);
1309 return IRQ_HANDLED; 1548 return IRQ_HANDLED;
1310 }
1311 1549
1550 spin_lock_irqsave(&rinfo->ring_lock, flags);
1312 again: 1551 again:
1313 rp = info->ring.sring->rsp_prod; 1552 rp = rinfo->ring.sring->rsp_prod;
1314 rmb(); /* Ensure we see queued responses up to 'rp'. */ 1553 rmb(); /* Ensure we see queued responses up to 'rp'. */
1315 1554
1316 for (i = info->ring.rsp_cons; i != rp; i++) { 1555 for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1317 unsigned long id; 1556 unsigned long id;
1318 1557
1319 bret = RING_GET_RESPONSE(&info->ring, i); 1558 bret = RING_GET_RESPONSE(&rinfo->ring, i);
1320 id = bret->id; 1559 id = bret->id;
1321 /* 1560 /*
1322 * The backend has messed up and given us an id that we would 1561 * The backend has messed up and given us an id that we would
@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1330 * the id is busted. */ 1569 * the id is busted. */
1331 continue; 1570 continue;
1332 } 1571 }
1333 req = info->shadow[id].request; 1572 req = rinfo->shadow[id].request;
1334 1573
1335 if (bret->operation != BLKIF_OP_DISCARD) 1574 if (bret->operation != BLKIF_OP_DISCARD) {
1336 blkif_completion(&info->shadow[id], info, bret); 1575 /*
1576 * We may need to wait for an extra response if the
1577 * I/O request is split in 2
1578 */
1579 if (!blkif_completion(&id, rinfo, bret))
1580 continue;
1581 }
1337 1582
1338 if (add_id_to_freelist(info, id)) { 1583 if (add_id_to_freelist(rinfo, id)) {
1339 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", 1584 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1340 info->gd->disk_name, op_name(bret->operation), id); 1585 info->gd->disk_name, op_name(bret->operation), id);
1341 continue; 1586 continue;
@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1364 error = -EOPNOTSUPP; 1609 error = -EOPNOTSUPP;
1365 } 1610 }
1366 if (unlikely(bret->status == BLKIF_RSP_ERROR && 1611 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
1367 info->shadow[id].req.u.rw.nr_segments == 0)) { 1612 rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
1368 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", 1613 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
1369 info->gd->disk_name, op_name(bret->operation)); 1614 info->gd->disk_name, op_name(bret->operation));
1370 error = -EOPNOTSUPP; 1615 error = -EOPNOTSUPP;
@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1389 } 1634 }
1390 } 1635 }
1391 1636
1392 info->ring.rsp_cons = i; 1637 rinfo->ring.rsp_cons = i;
1393 1638
1394 if (i != info->ring.req_prod_pvt) { 1639 if (i != rinfo->ring.req_prod_pvt) {
1395 int more_to_do; 1640 int more_to_do;
1396 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); 1641 RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1397 if (more_to_do) 1642 if (more_to_do)
1398 goto again; 1643 goto again;
1399 } else 1644 } else
1400 info->ring.sring->rsp_event = i + 1; 1645 rinfo->ring.sring->rsp_event = i + 1;
1401 1646
1402 kick_pending_request_queues(info); 1647 kick_pending_request_queues_locked(rinfo);
1403 1648
1404 spin_unlock_irqrestore(&info->io_lock, flags); 1649 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1405 1650
1406 return IRQ_HANDLED; 1651 return IRQ_HANDLED;
1407} 1652}
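
blkif_completion() above now returns a value: 0 means this shadow entry still has a sibling response outstanding because the I/O had to be split in two, so blkif_interrupt() skips the id recycling and the blk-mq completion for now; 1 means the request is really finished. A minimal sketch of that pattern, using a hypothetical pending_responses counter rather than the driver's actual bookkeeping:

	/*
	 * Illustrative sketch only -- not blkfront's real bookkeeping.
	 * Each shadow entry remembers how many ring responses it still
	 * expects; the completion helper reports whether the original
	 * request may be finished.
	 */
	struct shadow_entry {
		struct request *request;
		unsigned int pending_responses;	/* 2 if the I/O was split, else 1 */
	};

	/* Returns 1 when the last expected response has arrived. */
	static int shadow_response_done(struct shadow_entry *s)
	{
		return --s->pending_responses == 0;
	}
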
1408 1653
1409 1654
1410static int setup_blkring(struct xenbus_device *dev, 1655static int setup_blkring(struct xenbus_device *dev,
1411 struct blkfront_info *info) 1656 struct blkfront_ring_info *rinfo)
1412{ 1657{
1413 struct blkif_sring *sring; 1658 struct blkif_sring *sring;
1414 int err, i; 1659 int err, i;
1660 struct blkfront_info *info = rinfo->dev_info;
1415 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; 1661 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1416 grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; 1662 grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
1417 1663
1418 for (i = 0; i < info->nr_ring_pages; i++) 1664 for (i = 0; i < info->nr_ring_pages; i++)
1419 info->ring_ref[i] = GRANT_INVALID_REF; 1665 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1420 1666
1421 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, 1667 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
1422 get_order(ring_size)); 1668 get_order(ring_size));
@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
1425 return -ENOMEM; 1671 return -ENOMEM;
1426 } 1672 }
1427 SHARED_RING_INIT(sring); 1673 SHARED_RING_INIT(sring);
1428 FRONT_RING_INIT(&info->ring, sring, ring_size); 1674 FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1429 1675
1430 err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); 1676 err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
1431 if (err < 0) { 1677 if (err < 0) {
1432 free_pages((unsigned long)sring, get_order(ring_size)); 1678 free_pages((unsigned long)sring, get_order(ring_size));
1433 info->ring.sring = NULL; 1679 rinfo->ring.sring = NULL;
1434 goto fail; 1680 goto fail;
1435 } 1681 }
1436 for (i = 0; i < info->nr_ring_pages; i++) 1682 for (i = 0; i < info->nr_ring_pages; i++)
1437 info->ring_ref[i] = gref[i]; 1683 rinfo->ring_ref[i] = gref[i];
1438 1684
1439 err = xenbus_alloc_evtchn(dev, &info->evtchn); 1685 err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1440 if (err) 1686 if (err)
1441 goto fail; 1687 goto fail;
1442 1688
1443 err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, 1689 err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
1444 "blkif", info); 1690 "blkif", rinfo);
1445 if (err <= 0) { 1691 if (err <= 0) {
1446 xenbus_dev_fatal(dev, err, 1692 xenbus_dev_fatal(dev, err,
1447 "bind_evtchn_to_irqhandler failed"); 1693 "bind_evtchn_to_irqhandler failed");
1448 goto fail; 1694 goto fail;
1449 } 1695 }
1450 info->irq = err; 1696 rinfo->irq = err;
1451 1697
1452 return 0; 1698 return 0;
1453fail: 1699fail:
@@ -1455,6 +1701,53 @@ fail:
1455 return err; 1701 return err;
1456} 1702}
1457 1703
1704/*
1705 * Write out the per-ring/queue nodes, i.e. ring-ref(s) and event-channel; each
1706 * ring buffer may span multiple pages depending on ->nr_ring_pages.
1707 */
1708static int write_per_ring_nodes(struct xenbus_transaction xbt,
1709 struct blkfront_ring_info *rinfo, const char *dir)
1710{
1711 int err;
1712 unsigned int i;
1713 const char *message = NULL;
1714 struct blkfront_info *info = rinfo->dev_info;
1715
1716 if (info->nr_ring_pages == 1) {
1717 err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
1718 if (err) {
1719 message = "writing ring-ref";
1720 goto abort_transaction;
1721 }
1722 } else {
1723 for (i = 0; i < info->nr_ring_pages; i++) {
1724 char ring_ref_name[RINGREF_NAME_LEN];
1725
1726 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1727 err = xenbus_printf(xbt, dir, ring_ref_name,
1728 "%u", rinfo->ring_ref[i]);
1729 if (err) {
1730 message = "writing ring-ref";
1731 goto abort_transaction;
1732 }
1733 }
1734 }
1735
1736 err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
1737 if (err) {
1738 message = "writing event-channel";
1739 goto abort_transaction;
1740 }
1741
1742 return 0;
1743
1744abort_transaction:
1745 xenbus_transaction_end(xbt, 1);
1746 if (message)
1747 xenbus_dev_fatal(info->xbdev, err, "%s", message);
1748
1749 return err;
1750}
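
As a hedged illustration of the helper above (the grant references, event channel and the fixed-size path buffer are made up), writing the nodes for queue 0 of a two-page ring inside an already-open transaction looks like this:

	char path[32];	/* illustrative; the driver sizes this buffer dynamically */
	int err;

	snprintf(path, sizeof(path), "%s/queue-0", dev->nodename);
	err = write_per_ring_nodes(xbt, &info->rinfo[0], path);
	/*
	 * Assuming rinfo->ring_ref[] = { 8, 9 }, rinfo->evtchn = 5 and
	 * info->nr_ring_pages = 2, the transaction now contains:
	 *   <nodename>/queue-0/ring-ref0     = "8"
	 *   <nodename>/queue-0/ring-ref1     = "9"
	 *   <nodename>/queue-0/event-channel = "5"
	 */
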
1458 1751
1459/* Common code used when first setting up, and when resuming. */ 1752/* Common code used when first setting up, and when resuming. */
1460static int talk_to_blkback(struct xenbus_device *dev, 1753static int talk_to_blkback(struct xenbus_device *dev,
@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
1462{ 1755{
1463 const char *message = NULL; 1756 const char *message = NULL;
1464 struct xenbus_transaction xbt; 1757 struct xenbus_transaction xbt;
1465 int err, i; 1758 int err;
1466 unsigned int max_page_order = 0; 1759 unsigned int i, max_page_order = 0;
1467 unsigned int ring_page_order = 0; 1760 unsigned int ring_page_order = 0;
1468 1761
1469 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, 1762 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
1475 info->nr_ring_pages = 1 << ring_page_order; 1768 info->nr_ring_pages = 1 << ring_page_order;
1476 } 1769 }
1477 1770
1478 /* Create shared ring, alloc event channel. */ 1771 for (i = 0; i < info->nr_rings; i++) {
1479 err = setup_blkring(dev, info); 1772 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1480 if (err) 1773
1481 goto out; 1774 /* Create shared ring, alloc event channel. */
1775 err = setup_blkring(dev, rinfo);
1776 if (err)
1777 goto destroy_blkring;
1778 }
1482 1779
1483again: 1780again:
1484 err = xenbus_transaction_start(&xbt); 1781 err = xenbus_transaction_start(&xbt);
@@ -1487,38 +1784,49 @@ again:
1487 goto destroy_blkring; 1784 goto destroy_blkring;
1488 } 1785 }
1489 1786
1490 if (info->nr_ring_pages == 1) { 1787 if (info->nr_ring_pages > 1) {
1491 err = xenbus_printf(xbt, dev->nodename, 1788 err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1492 "ring-ref", "%u", info->ring_ref[0]); 1789 ring_page_order);
1493 if (err) { 1790 if (err) {
1494 message = "writing ring-ref"; 1791 message = "writing ring-page-order";
1495 goto abort_transaction; 1792 goto abort_transaction;
1496 } 1793 }
1794 }
1795
1796 /* We already got the number of queues/rings in _probe */
1797 if (info->nr_rings == 1) {
1798 err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
1799 if (err)
1800 goto destroy_blkring;
1497 } else { 1801 } else {
1498 err = xenbus_printf(xbt, dev->nodename, 1802 char *path;
1499 "ring-page-order", "%u", ring_page_order); 1803 size_t pathsize;
1804
1805 err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
1806 info->nr_rings);
1500 if (err) { 1807 if (err) {
1501 message = "writing ring-page-order"; 1808 message = "writing multi-queue-num-queues";
1502 goto abort_transaction; 1809 goto abort_transaction;
1503 } 1810 }
1504 1811
1505 for (i = 0; i < info->nr_ring_pages; i++) { 1812 pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1506 char ring_ref_name[RINGREF_NAME_LEN]; 1813 path = kmalloc(pathsize, GFP_KERNEL);
1814 if (!path) {
1815 err = -ENOMEM;
1816 message = "ENOMEM while writing ring references";
1817 goto abort_transaction;
1818 }
1507 1819
1508 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 1820 for (i = 0; i < info->nr_rings; i++) {
1509 err = xenbus_printf(xbt, dev->nodename, ring_ref_name, 1821 memset(path, 0, pathsize);
1510 "%u", info->ring_ref[i]); 1822 snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1823 err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
1511 if (err) { 1824 if (err) {
1512 message = "writing ring-ref"; 1825 kfree(path);
1513 goto abort_transaction; 1826 goto destroy_blkring;
1514 } 1827 }
1515 } 1828 }
1516 } 1829 kfree(path);
1517 err = xenbus_printf(xbt, dev->nodename,
1518 "event-channel", "%u", info->evtchn);
1519 if (err) {
1520 message = "writing event-channel";
1521 goto abort_transaction;
1522 } 1830 }
1523 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", 1831 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1524 XEN_IO_PROTO_ABI_NATIVE); 1832 XEN_IO_PROTO_ABI_NATIVE);
@@ -1540,9 +1848,14 @@ again:
1540 goto destroy_blkring; 1848 goto destroy_blkring;
1541 } 1849 }
1542 1850
1543 for (i = 0; i < BLK_RING_SIZE(info); i++) 1851 for (i = 0; i < info->nr_rings; i++) {
1544 info->shadow[i].req.u.rw.id = i+1; 1852 unsigned int j;
1545 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; 1853 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1854
1855 for (j = 0; j < BLK_RING_SIZE(info); j++)
1856 rinfo->shadow[j].req.u.rw.id = j + 1;
1857 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1858 }
1546 xenbus_switch_state(dev, XenbusStateInitialised); 1859 xenbus_switch_state(dev, XenbusStateInitialised);
1547 1860
1548 return 0; 1861 return 0;
@@ -1553,7 +1866,10 @@ again:
1553 xenbus_dev_fatal(dev, err, "%s", message); 1866 xenbus_dev_fatal(dev, err, "%s", message);
1554 destroy_blkring: 1867 destroy_blkring:
1555 blkif_free(info, 0); 1868 blkif_free(info, 0);
1556 out: 1869
1870 kfree(info);
1871 dev_set_drvdata(&dev->dev, NULL);
1872
1557 return err; 1873 return err;
1558} 1874}
1559 1875
@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
1567 const struct xenbus_device_id *id) 1883 const struct xenbus_device_id *id)
1568{ 1884{
1569 int err, vdevice; 1885 int err, vdevice;
1886 unsigned int r_index;
1570 struct blkfront_info *info; 1887 struct blkfront_info *info;
1888 unsigned int backend_max_queues = 0;
1571 1889
1572 /* FIXME: Use dynamic device id if this is not set. */ 1890 /* FIXME: Use dynamic device id if this is not set. */
1573 err = xenbus_scanf(XBT_NIL, dev->nodename, 1891 err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
1617 return -ENOMEM; 1935 return -ENOMEM;
1618 } 1936 }
1619 1937
1620 mutex_init(&info->mutex);
1621 spin_lock_init(&info->io_lock);
1622 info->xbdev = dev; 1938 info->xbdev = dev;
1939 /* Check if backend supports multiple queues. */
1940 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1941 "multi-queue-max-queues", "%u", &backend_max_queues);
1942 if (err < 0)
1943 backend_max_queues = 1;
1944
1945 info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
1946 /* We need at least one ring. */
1947 if (!info->nr_rings)
1948 info->nr_rings = 1;
1949
1950 info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
1951 if (!info->rinfo) {
1952 xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
1953 kfree(info);
1954 return -ENOMEM;
1955 }
1956
1957 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1958 struct blkfront_ring_info *rinfo;
1959
1960 rinfo = &info->rinfo[r_index];
1961 INIT_LIST_HEAD(&rinfo->indirect_pages);
1962 INIT_LIST_HEAD(&rinfo->grants);
1963 rinfo->dev_info = info;
1964 INIT_WORK(&rinfo->work, blkif_restart_queue);
1965 spin_lock_init(&rinfo->ring_lock);
1966 }
1967
1968 mutex_init(&info->mutex);
1623 info->vdevice = vdevice; 1969 info->vdevice = vdevice;
1624 INIT_LIST_HEAD(&info->grants);
1625 INIT_LIST_HEAD(&info->indirect_pages);
1626 info->persistent_gnts_c = 0;
1627 info->connected = BLKIF_STATE_DISCONNECTED; 1970 info->connected = BLKIF_STATE_DISCONNECTED;
1628 INIT_WORK(&info->work, blkif_restart_queue);
1629 1971
1630 /* Front end dir is a number, which is used as the id. */ 1972 /* Front end dir is a number, which is used as the id. */
1631 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); 1973 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
1649 1991
1650static int blkif_recover(struct blkfront_info *info) 1992static int blkif_recover(struct blkfront_info *info)
1651{ 1993{
1652 int i; 1994 unsigned int i, r_index;
1653 struct request *req, *n; 1995 struct request *req, *n;
1654 struct blk_shadow *copy; 1996 struct blk_shadow *copy;
1655 int rc; 1997 int rc;
@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
1660 struct split_bio *split_bio; 2002 struct split_bio *split_bio;
1661 struct list_head requests; 2003 struct list_head requests;
1662 2004
1663 /* Stage 1: Make a safe copy of the shadow state. */ 2005 blkfront_gather_backend_features(info);
1664 copy = kmemdup(info->shadow, sizeof(info->shadow),
1665 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1666 if (!copy)
1667 return -ENOMEM;
1668
1669 /* Stage 2: Set up free list. */
1670 memset(&info->shadow, 0, sizeof(info->shadow));
1671 for (i = 0; i < BLK_RING_SIZE(info); i++)
1672 info->shadow[i].req.u.rw.id = i+1;
1673 info->shadow_free = info->ring.req_prod_pvt;
1674 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1675
1676 rc = blkfront_gather_backend_features(info);
1677 if (rc) {
1678 kfree(copy);
1679 return rc;
1680 }
1681
1682 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; 2006 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1683 blk_queue_max_segments(info->rq, segs); 2007 blk_queue_max_segments(info->rq, segs);
1684 bio_list_init(&bio_list); 2008 bio_list_init(&bio_list);
1685 INIT_LIST_HEAD(&requests); 2009 INIT_LIST_HEAD(&requests);
1686 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1687 /* Not in use? */
1688 if (!copy[i].request)
1689 continue;
1690 2010
1691 /* 2011 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1692 * Get the bios in the request so we can re-queue them. 2012 struct blkfront_ring_info *rinfo;
1693 */ 2013
1694 if (copy[i].request->cmd_flags & 2014 rinfo = &info->rinfo[r_index];
1695 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { 2015 /* Stage 1: Make a safe copy of the shadow state. */
2016 copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
2017 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
2018 if (!copy)
2019 return -ENOMEM;
2020
2021 /* Stage 2: Set up free list. */
2022 memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
2023 for (i = 0; i < BLK_RING_SIZE(info); i++)
2024 rinfo->shadow[i].req.u.rw.id = i+1;
2025 rinfo->shadow_free = rinfo->ring.req_prod_pvt;
2026 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
2027
2028 rc = blkfront_setup_indirect(rinfo);
2029 if (rc) {
2030 kfree(copy);
2031 return rc;
2032 }
2033
2034 for (i = 0; i < BLK_RING_SIZE(info); i++) {
2035 /* Not in use? */
2036 if (!copy[i].request)
2037 continue;
2038
1696 /* 2039 /*
1697 * Flush operations don't contain bios, so 2040 * Get the bios in the request so we can re-queue them.
1698 * we need to requeue the whole request
1699 */ 2041 */
1700 list_add(&copy[i].request->queuelist, &requests); 2042 if (copy[i].request->cmd_flags &
1701 continue; 2043 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
2044 /*
2045 * Flush operations don't contain bios, so
2046 * we need to requeue the whole request
2047 */
2048 list_add(&copy[i].request->queuelist, &requests);
2049 continue;
2050 }
2051 merge_bio.head = copy[i].request->bio;
2052 merge_bio.tail = copy[i].request->biotail;
2053 bio_list_merge(&bio_list, &merge_bio);
2054 copy[i].request->bio = NULL;
2055 blk_end_request_all(copy[i].request, 0);
1702 } 2056 }
1703 merge_bio.head = copy[i].request->bio;
1704 merge_bio.tail = copy[i].request->biotail;
1705 bio_list_merge(&bio_list, &merge_bio);
1706 copy[i].request->bio = NULL;
1707 blk_end_request_all(copy[i].request, 0);
1708 }
1709
1710 kfree(copy);
1711 2057
2058 kfree(copy);
2059 }
1712 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2060 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1713 2061
1714 spin_lock_irq(&info->io_lock);
1715
1716 /* Now safe for us to use the shared ring */ 2062 /* Now safe for us to use the shared ring */
1717 info->connected = BLKIF_STATE_CONNECTED; 2063 info->connected = BLKIF_STATE_CONNECTED;
1718 2064
1719 /* Kick any other new requests queued since we resumed */ 2065 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1720 kick_pending_request_queues(info); 2066 struct blkfront_ring_info *rinfo;
2067
2068 rinfo = &info->rinfo[r_index];
2069 /* Kick any other new requests queued since we resumed */
2070 kick_pending_request_queues(rinfo);
2071 }
1721 2072
1722 list_for_each_entry_safe(req, n, &requests, queuelist) { 2073 list_for_each_entry_safe(req, n, &requests, queuelist) {
1723 /* Requeue pending requests (flush or discard) */ 2074 /* Requeue pending requests (flush or discard) */
@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
1725 BUG_ON(req->nr_phys_segments > segs); 2076 BUG_ON(req->nr_phys_segments > segs);
1726 blk_mq_requeue_request(req); 2077 blk_mq_requeue_request(req);
1727 } 2078 }
1728 spin_unlock_irq(&info->io_lock);
1729 blk_mq_kick_requeue_list(info->rq); 2079 blk_mq_kick_requeue_list(info->rq);
1730 2080
1731 while ((bio = bio_list_pop(&bio_list)) != NULL) { 2081 while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
1790 return err; 2140 return err;
1791} 2141}
1792 2142
1793static void 2143static void blkfront_closing(struct blkfront_info *info)
1794blkfront_closing(struct blkfront_info *info)
1795{ 2144{
1796 struct xenbus_device *xbdev = info->xbdev; 2145 struct xenbus_device *xbdev = info->xbdev;
1797 struct block_device *bdev = NULL; 2146 struct block_device *bdev = NULL;
@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1851 info->feature_secdiscard = !!discard_secure; 2200 info->feature_secdiscard = !!discard_secure;
1852} 2201}
1853 2202
1854static int blkfront_setup_indirect(struct blkfront_info *info) 2203static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
1855{ 2204{
1856 unsigned int psegs, grants; 2205 unsigned int psegs, grants;
1857 int err, i; 2206 int err, i;
2207 struct blkfront_info *info = rinfo->dev_info;
1858 2208
1859 if (info->max_indirect_segments == 0) 2209 if (info->max_indirect_segments == 0) {
1860 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2210 if (!HAS_EXTRA_REQ)
2211 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2212 else {
2213 /*
2214 * When an extra req is required, the maximum
2215 * number of grants supported is related to the
2216 * size of the Linux block segment.
2217 */
2218 grants = GRANTS_PER_PSEG;
2219 }
2220 }
1861 else 2221 else
1862 grants = info->max_indirect_segments; 2222 grants = info->max_indirect_segments;
1863 psegs = grants / GRANTS_PER_PSEG; 2223 psegs = grants / GRANTS_PER_PSEG;
1864 2224
1865 err = fill_grant_buffer(info, 2225 err = fill_grant_buffer(rinfo,
1866 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); 2226 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
1867 if (err) 2227 if (err)
1868 goto out_of_memory; 2228 goto out_of_memory;
@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1875 */ 2235 */
1876 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); 2236 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
1877 2237
1878 BUG_ON(!list_empty(&info->indirect_pages)); 2238 BUG_ON(!list_empty(&rinfo->indirect_pages));
1879 for (i = 0; i < num; i++) { 2239 for (i = 0; i < num; i++) {
1880 struct page *indirect_page = alloc_page(GFP_NOIO); 2240 struct page *indirect_page = alloc_page(GFP_NOIO);
1881 if (!indirect_page) 2241 if (!indirect_page)
1882 goto out_of_memory; 2242 goto out_of_memory;
1883 list_add(&indirect_page->lru, &info->indirect_pages); 2243 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1884 } 2244 }
1885 } 2245 }
1886 2246
1887 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2247 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1888 info->shadow[i].grants_used = kzalloc( 2248 rinfo->shadow[i].grants_used = kzalloc(
1889 sizeof(info->shadow[i].grants_used[0]) * grants, 2249 sizeof(rinfo->shadow[i].grants_used[0]) * grants,
1890 GFP_NOIO); 2250 GFP_NOIO);
1891 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); 2251 rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
1892 if (info->max_indirect_segments) 2252 if (info->max_indirect_segments)
1893 info->shadow[i].indirect_grants = kzalloc( 2253 rinfo->shadow[i].indirect_grants = kzalloc(
1894 sizeof(info->shadow[i].indirect_grants[0]) * 2254 sizeof(rinfo->shadow[i].indirect_grants[0]) *
1895 INDIRECT_GREFS(grants), 2255 INDIRECT_GREFS(grants),
1896 GFP_NOIO); 2256 GFP_NOIO);
1897 if ((info->shadow[i].grants_used == NULL) || 2257 if ((rinfo->shadow[i].grants_used == NULL) ||
1898 (info->shadow[i].sg == NULL) || 2258 (rinfo->shadow[i].sg == NULL) ||
1899 (info->max_indirect_segments && 2259 (info->max_indirect_segments &&
1900 (info->shadow[i].indirect_grants == NULL))) 2260 (rinfo->shadow[i].indirect_grants == NULL)))
1901 goto out_of_memory; 2261 goto out_of_memory;
1902 sg_init_table(info->shadow[i].sg, psegs); 2262 sg_init_table(rinfo->shadow[i].sg, psegs);
1903 } 2263 }
1904 2264
1905 2265
@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1907 2267
1908out_of_memory: 2268out_of_memory:
1909 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1910 kfree(info->shadow[i].grants_used); 2270 kfree(rinfo->shadow[i].grants_used);
1911 info->shadow[i].grants_used = NULL; 2271 rinfo->shadow[i].grants_used = NULL;
1912 kfree(info->shadow[i].sg); 2272 kfree(rinfo->shadow[i].sg);
1913 info->shadow[i].sg = NULL; 2273 rinfo->shadow[i].sg = NULL;
1914 kfree(info->shadow[i].indirect_grants); 2274 kfree(rinfo->shadow[i].indirect_grants);
1915 info->shadow[i].indirect_grants = NULL; 2275 rinfo->shadow[i].indirect_grants = NULL;
1916 } 2276 }
1917 if (!list_empty(&info->indirect_pages)) { 2277 if (!list_empty(&rinfo->indirect_pages)) {
1918 struct page *indirect_page, *n; 2278 struct page *indirect_page, *n;
1919 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 2279 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1920 list_del(&indirect_page->lru); 2280 list_del(&indirect_page->lru);
1921 __free_page(indirect_page); 2281 __free_page(indirect_page);
1922 } 2282 }
@@ -1927,7 +2287,7 @@ out_of_memory:
1927/* 2287/*
1928 * Gather all backend feature-* 2288 * Gather all backend feature-*
1929 */ 2289 */
1930static int blkfront_gather_backend_features(struct blkfront_info *info) 2290static void blkfront_gather_backend_features(struct blkfront_info *info)
1931{ 2291{
1932 int err; 2292 int err;
1933 int barrier, flush, discard, persistent; 2293 int barrier, flush, discard, persistent;
@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
1982 else 2342 else
1983 info->max_indirect_segments = min(indirect_segments, 2343 info->max_indirect_segments = min(indirect_segments,
1984 xen_blkif_max_segments); 2344 xen_blkif_max_segments);
1985
1986 return blkfront_setup_indirect(info);
1987} 2345}
1988 2346
1989/* 2347/*
@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
1996 unsigned long sector_size; 2354 unsigned long sector_size;
1997 unsigned int physical_sector_size; 2355 unsigned int physical_sector_size;
1998 unsigned int binfo; 2356 unsigned int binfo;
1999 int err; 2357 int err, i;
2000 2358
2001 switch (info->connected) { 2359 switch (info->connected) {
2002 case BLKIF_STATE_CONNECTED: 2360 case BLKIF_STATE_CONNECTED:
@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
2053 if (err != 1) 2411 if (err != 1)
2054 physical_sector_size = sector_size; 2412 physical_sector_size = sector_size;
2055 2413
2056 err = blkfront_gather_backend_features(info); 2414 blkfront_gather_backend_features(info);
2057 if (err) { 2415 for (i = 0; i < info->nr_rings; i++) {
2058 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", 2416 err = blkfront_setup_indirect(&info->rinfo[i]);
2059 info->xbdev->otherend); 2417 if (err) {
2060 return; 2418 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2419 info->xbdev->otherend);
2420 blkif_free(info, 0);
2421 break;
2422 }
2061 } 2423 }
2062 2424
2063 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, 2425 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
2071 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2433 xenbus_switch_state(info->xbdev, XenbusStateConnected);
2072 2434
2073 /* Kick pending requests. */ 2435 /* Kick pending requests. */
2074 spin_lock_irq(&info->io_lock);
2075 info->connected = BLKIF_STATE_CONNECTED; 2436 info->connected = BLKIF_STATE_CONNECTED;
2076 kick_pending_request_queues(info); 2437 for (i = 0; i < info->nr_rings; i++)
2077 spin_unlock_irq(&info->io_lock); 2438 kick_pending_request_queues(&info->rinfo[i]);
2078 2439
2079 add_disk(info->gd); 2440 add_disk(info->gd);
2080 2441
@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
2095 case XenbusStateInitWait: 2456 case XenbusStateInitWait:
2096 if (dev->state != XenbusStateInitialising) 2457 if (dev->state != XenbusStateInitialising)
2097 break; 2458 break;
2098 if (talk_to_blkback(dev, info)) { 2459 if (talk_to_blkback(dev, info))
2099 kfree(info);
2100 dev_set_drvdata(&dev->dev, NULL);
2101 break; 2460 break;
2102 }
2103 case XenbusStateInitialising: 2461 case XenbusStateInitialising:
2104 case XenbusStateInitialised: 2462 case XenbusStateInitialised:
2105 case XenbusStateReconfiguring: 2463 case XenbusStateReconfiguring:
@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
2108 break; 2466 break;
2109 2467
2110 case XenbusStateConnected: 2468 case XenbusStateConnected:
2469 if (dev->state != XenbusStateInitialised) {
2470 if (talk_to_blkback(dev, info))
2471 break;
2472 }
2111 blkfront_connect(info); 2473 blkfront_connect(info);
2112 break; 2474 break;
2113 2475
@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
2281static int __init xlblk_init(void) 2643static int __init xlblk_init(void)
2282{ 2644{
2283 int ret; 2645 int ret;
2646 int nr_cpus = num_online_cpus();
2284 2647
2285 if (!xen_domain()) 2648 if (!xen_domain())
2286 return -ENODEV; 2649 return -ENODEV;
@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
2288 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { 2651 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2289 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", 2652 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2290 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); 2653 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2291 xen_blkif_max_ring_order = 0; 2654 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2655 }
2656
2657 if (xen_blkif_max_queues > nr_cpus) {
2658 pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2659 xen_blkif_max_queues, nr_cpus);
2660 xen_blkif_max_queues = nr_cpus;
2292 } 2661 }
2293 2662
2294 if (!xen_has_pv_disk_devices()) 2663 if (!xen_has_pv_disk_devices())
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 83392f856dfd..22b9e34ceb75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
1741 do { 1741 do {
1742 ret = btree_root(gc_root, c, &op, &writes, &stats); 1742 ret = btree_root(gc_root, c, &op, &writes, &stats);
1743 closure_sync(&writes); 1743 closure_sync(&writes);
1744 cond_resched();
1744 1745
1745 if (ret && ret != -EAGAIN) 1746 if (ret && ret != -EAGAIN)
1746 pr_warn("gc failed!"); 1747 pr_warn("gc failed!");
@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2162 rw_lock(true, b, b->level); 2163 rw_lock(true, b, b->level);
2163 2164
2164 if (b->key.ptr[0] != btree_ptr || 2165 if (b->key.ptr[0] != btree_ptr ||
2165 b->seq != seq + 1) 2166 b->seq != seq + 1) {
2167 op->lock = b->level;
2166 goto out; 2168 goto out;
2169 }
2167 } 2170 }
2168 2171
2169 SET_KEY_PTRS(check_key, 1); 2172 SET_KEY_PTRS(check_key, 1);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093a3bf6..8d0ead98eb6e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
685 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || 685 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
686 sysfs_create_link(&c->kobj, &d->kobj, d->name), 686 sysfs_create_link(&c->kobj, &d->kobj, d->name),
687 "Couldn't create device <-> cache set symlinks"); 687 "Couldn't create device <-> cache set symlinks");
688
689 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
688} 690}
689 691
690static void bcache_device_detach(struct bcache_device *d) 692static void bcache_device_detach(struct bcache_device *d)
@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
847 buf[SB_LABEL_SIZE] = '\0'; 849 buf[SB_LABEL_SIZE] = '\0';
848 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); 850 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
849 851
850 if (atomic_xchg(&dc->running, 1)) 852 if (atomic_xchg(&dc->running, 1)) {
853 kfree(env[1]);
854 kfree(env[2]);
851 return; 855 return;
856 }
852 857
853 if (!d->c && 858 if (!d->c &&
854 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { 859 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1933 else 1938 else
1934 err = "device busy"; 1939 err = "device busy";
1935 mutex_unlock(&bch_register_lock); 1940 mutex_unlock(&bch_register_lock);
1941 if (attr == &ksysfs_register_quiet)
1942 goto out;
1936 } 1943 }
1937 goto err; 1944 goto err;
1938 } 1945 }
@@ -1971,8 +1978,7 @@ out:
1971err_close: 1978err_close:
1972 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1979 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1973err: 1980err:
1974 if (attr != &ksysfs_register_quiet) 1981 pr_info("error opening %s: %s", path, err);
1975 pr_info("error opening %s: %s", path, err);
1976 ret = -EINVAL; 1982 ret = -EINVAL;
1977 goto out; 1983 goto out;
1978} 1984}
@@ -2066,8 +2072,10 @@ static int __init bcache_init(void)
2066 closure_debug_init(); 2072 closure_debug_init();
2067 2073
2068 bcache_major = register_blkdev(0, "bcache"); 2074 bcache_major = register_blkdev(0, "bcache");
2069 if (bcache_major < 0) 2075 if (bcache_major < 0) {
2076 unregister_reboot_notifier(&reboot);
2070 return bcache_major; 2077 return bcache_major;
2078 }
2071 2079
2072 if (!(bcache_wq = create_workqueue("bcache")) || 2080 if (!(bcache_wq = create_workqueue("bcache")) ||
2073 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2081 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b23f88d9f18c..b9346cd9cda1 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
323 323
324static bool dirty_pred(struct keybuf *buf, struct bkey *k) 324static bool dirty_pred(struct keybuf *buf, struct bkey *k)
325{ 325{
326 struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
327
328 BUG_ON(KEY_INODE(k) != dc->disk.id);
329
326 return KEY_DIRTY(k); 330 return KEY_DIRTY(k);
327} 331}
328 332
@@ -372,11 +376,24 @@ next:
372 } 376 }
373} 377}
374 378
379/*
380 * Returns true if we scanned the entire disk
381 */
375static bool refill_dirty(struct cached_dev *dc) 382static bool refill_dirty(struct cached_dev *dc)
376{ 383{
377 struct keybuf *buf = &dc->writeback_keys; 384 struct keybuf *buf = &dc->writeback_keys;
385 struct bkey start = KEY(dc->disk.id, 0, 0);
378 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); 386 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
379 bool searched_from_start = false; 387 struct bkey start_pos;
388
389 /*
390 * make sure keybuf pos is inside the range for this disk - at bringup
391 * we might not be attached yet, so this disk's inode nr isn't
392 * initialized yet
393 */
394 if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
395 bkey_cmp(&buf->last_scanned, &end) > 0)
396 buf->last_scanned = start;
380 397
381 if (dc->partial_stripes_expensive) { 398 if (dc->partial_stripes_expensive) {
382 refill_full_stripes(dc); 399 refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
384 return false; 401 return false;
385 } 402 }
386 403
387 if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 404 start_pos = buf->last_scanned;
388 buf->last_scanned = KEY(dc->disk.id, 0, 0);
389 searched_from_start = true;
390 }
391
392 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 405 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
393 406
394 return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; 407 if (bkey_cmp(&buf->last_scanned, &end) < 0)
408 return false;
409
410 /*
411 * If we get to the end, start scanning again from the beginning, and
412 * only scan up to where we initially started scanning from:
413 */
414 buf->last_scanned = start;
415 bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
416
417 return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
395} 418}
396 419
397static int bch_writeback_thread(void *arg) 420static int bch_writeback_thread(void *arg)
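
The rewritten refill_dirty() above scans from the remembered position towards the end of this disk's keyspace and, only if that scan reaches the end, wraps around and scans from the start up to where the pass began, reporting a full pass only when that second scan gets back to the original position. A self-contained toy in plain C (integers standing in for bkeys, and a scan() that never fills its buffer) showing the same control flow:

	#include <stdbool.h>
	#include <stdio.h>

	#define KEYSPACE_END 100		/* stand-in for the end-of-disk key */

	/* Pretend scan: returns the position it stopped at (here always 'to'). */
	static int scan(int from, int to)
	{
		printf("scanned [%d, %d)\n", from, to);
		return to;
	}

	/* Returns true only when the whole keyspace was covered in this pass. */
	static bool refill(int *last_scanned)
	{
		int start_pos = *last_scanned;

		*last_scanned = scan(*last_scanned, KEYSPACE_END);
		if (*last_scanned < KEYSPACE_END)
			return false;		/* buffer filled before the end */

		/* Wrap around and finish the pass where it began. */
		*last_scanned = scan(0, start_pos);
		return *last_scanned >= start_pos;
	}

	int main(void)
	{
		int pos = 40;

		printf("full pass: %d\n", refill(&pos));	/* prints 1 */
		return 0;
	}
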
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab187b79..073a042aed24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
63 63
64static inline void bch_writeback_queue(struct cached_dev *dc) 64static inline void bch_writeback_queue(struct cached_dev *dc)
65{ 65{
66 wake_up_process(dc->writeback_thread); 66 if (!IS_ERR_OR_NULL(dc->writeback_thread))
67 wake_up_process(dc->writeback_thread);
67} 68}
68 69
69static inline void bch_writeback_add(struct cached_dev *dc) 70static inline void bch_writeback_add(struct cached_dev *dc)
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 8723f2a99e15..d6b3c9943a2c 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -25,7 +25,6 @@
25*/ 25*/
26#ifndef DRBD_H 26#ifndef DRBD_H
27#define DRBD_H 27#define DRBD_H
28#include <linux/connector.h>
29#include <asm/types.h> 28#include <asm/types.h>
30 29
31#ifdef __KERNEL__ 30#ifdef __KERNEL__
@@ -52,7 +51,7 @@
52#endif 51#endif
53 52
54extern const char *drbd_buildtag(void); 53extern const char *drbd_buildtag(void);
55#define REL_VERSION "8.4.5" 54#define REL_VERSION "8.4.6"
56#define API_VERSION 1 55#define API_VERSION 1
57#define PRO_VERSION_MIN 86 56#define PRO_VERSION_MIN 86
58#define PRO_VERSION_MAX 101 57#define PRO_VERSION_MAX 101
@@ -339,6 +338,8 @@ enum drbd_state_rv {
339#define MDF_AL_CLEAN (1 << 7) 338#define MDF_AL_CLEAN (1 << 7)
340#define MDF_AL_DISABLED (1 << 8) 339#define MDF_AL_DISABLED (1 << 8)
341 340
341#define MAX_PEERS 32
342
342enum drbd_uuid_index { 343enum drbd_uuid_index {
343 UI_CURRENT, 344 UI_CURRENT,
344 UI_BITMAP, 345 UI_BITMAP,
@@ -349,14 +350,35 @@ enum drbd_uuid_index {
349 UI_EXTENDED_SIZE /* Everything. */ 350 UI_EXTENDED_SIZE /* Everything. */
350}; 351};
351 352
353#define HISTORY_UUIDS MAX_PEERS
354
352enum drbd_timeout_flag { 355enum drbd_timeout_flag {
353 UT_DEFAULT = 0, 356 UT_DEFAULT = 0,
354 UT_DEGRADED = 1, 357 UT_DEGRADED = 1,
355 UT_PEER_OUTDATED = 2, 358 UT_PEER_OUTDATED = 2,
356}; 359};
357 360
361enum drbd_notification_type {
362 NOTIFY_EXISTS,
363 NOTIFY_CREATE,
364 NOTIFY_CHANGE,
365 NOTIFY_DESTROY,
366 NOTIFY_CALL,
367 NOTIFY_RESPONSE,
368
369 NOTIFY_CONTINUES = 0x8000,
370 NOTIFY_FLAGS = NOTIFY_CONTINUES,
371};
372
358#define UUID_JUST_CREATED ((__u64)4) 373#define UUID_JUST_CREATED ((__u64)4)
359 374
375enum write_ordering_e {
376 WO_NONE,
377 WO_DRAIN_IO,
378 WO_BDEV_FLUSH,
379 WO_BIO_BARRIER
380};
381
360/* magic numbers used in meta data and network packets */ 382/* magic numbers used in meta data and network packets */
361#define DRBD_MAGIC 0x83740267 383#define DRBD_MAGIC 0x83740267
362#define DRBD_MAGIC_BIG 0x835a 384#define DRBD_MAGIC_BIG 0x835a
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 7b131ed8f9c6..2d0e5ad5de9d 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
250 __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) 250 __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
251) 251)
252 252
253GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
254 __u32_field(1, 0, res_role)
255 __flg_field(2, 0, res_susp)
256 __flg_field(3, 0, res_susp_nod)
257 __flg_field(4, 0, res_susp_fen)
258 /* __flg_field(5, 0, res_weak) */
259)
260
261GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info,
262 __u32_field(1, 0, dev_disk_state)
263)
264
265GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info,
266 __u32_field(1, 0, conn_connection_state)
267 __u32_field(2, 0, conn_role)
268)
269
270GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info,
271 __u32_field(1, 0, peer_repl_state)
272 __u32_field(2, 0, peer_disk_state)
273 __u32_field(3, 0, peer_resync_susp_user)
274 __u32_field(4, 0, peer_resync_susp_peer)
275 __u32_field(5, 0, peer_resync_susp_dependency)
276)
277
278GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics,
279 __u32_field(1, 0, res_stat_write_ordering)
280)
281
282GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics,
283 __u64_field(1, 0, dev_size) /* (sectors) */
284 __u64_field(2, 0, dev_read) /* (sectors) */
285 __u64_field(3, 0, dev_write) /* (sectors) */
286 __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */
287 __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */
288 __u32_field(6, 0, dev_upper_pending) /* application requests in progress */
289 __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */
290 __flg_field(8, 0, dev_upper_blocked)
291 __flg_field(9, 0, dev_lower_blocked)
292 __flg_field(10, 0, dev_al_suspended) /* activity log suspended */
293 __u64_field(11, 0, dev_exposed_data_uuid)
294 __u64_field(12, 0, dev_current_uuid)
295 __u32_field(13, 0, dev_disk_flags)
296 __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64))
297)
298
299GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics,
300 __flg_field(1, 0, conn_congested)
301)
302
303GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
304 __u64_field(1, 0, peer_dev_received) /* sectors */
305 __u64_field(2, 0, peer_dev_sent) /* sectors */
306 __u32_field(3, 0, peer_dev_pending) /* number of requests */
307 __u32_field(4, 0, peer_dev_unacked) /* number of requests */
308 __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */
309 __u64_field(6, 0, peer_dev_resync_failed) /* sectors */
310 __u64_field(7, 0, peer_dev_bitmap_uuid)
311 __u32_field(9, 0, peer_dev_flags)
312)
313
314GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header,
315 __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type)
316)
317
318GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info,
319 __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32)
320 __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status)
321)
322
253/* 323/*
254 * Notifications and commands (genlmsghdr->cmd) 324 * Notifications and commands (genlmsghdr->cmd)
255 */ 325 */
@@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
382 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) 452 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
383GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), 453GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
384 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) 454 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
455
456GENL_op(DRBD_ADM_GET_RESOURCES, 30,
457 GENL_op_init(
458 .dumpit = drbd_adm_dump_resources,
459 ),
460 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
461 GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
462 GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
463
464GENL_op(DRBD_ADM_GET_DEVICES, 31,
465 GENL_op_init(
466 .dumpit = drbd_adm_dump_devices,
467 .done = drbd_adm_dump_devices_done,
468 ),
469 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
470 GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
471 GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
472
473GENL_op(DRBD_ADM_GET_CONNECTIONS, 32,
474 GENL_op_init(
475 .dumpit = drbd_adm_dump_connections,
476 .done = drbd_adm_dump_connections_done,
477 ),
478 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
479 GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
480 GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
481
482GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33,
483 GENL_op_init(
484 .dumpit = drbd_adm_dump_peer_devices,
485 .done = drbd_adm_dump_peer_devices_done,
486 ),
487 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
488 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
489 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
490
491GENL_notification(
492 DRBD_RESOURCE_STATE, 34, events,
493 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
494 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
495 GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED)
496 GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED))
497
498GENL_notification(
499 DRBD_DEVICE_STATE, 35, events,
500 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
501 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
502 GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED)
503 GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED))
504
505GENL_notification(
506 DRBD_CONNECTION_STATE, 36, events,
507 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
508 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
509 GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED)
510 GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED))
511
512GENL_notification(
513 DRBD_PEER_DEVICE_STATE, 37, events,
514 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
515 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
516 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED)
517 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED))
518
519GENL_op(
520 DRBD_ADM_GET_INITIAL_STATE, 38,
521 GENL_op_init(
522 .dumpit = drbd_adm_get_initial_state,
523 ),
524 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY))
525
526GENL_notification(
527 DRBD_HELPER, 40, events,
528 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
529 GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED))
530
531GENL_notification(
532 DRBD_INITIAL_STATE_DONE, 41, events,
533 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED))
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 013fd9bc4cb6..083d61e92706 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id)
135#define idr_for_each_entry(idp, entry, id) \ 135#define idr_for_each_entry(idp, entry, id) \
136 for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) 136 for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
137 137
138/**
139 * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
140 * @idp: idr handle
141 * @entry: the type * to use as cursor
142 * @id: id entry's key
143 *
144 * Continue to iterate over list of given type, continuing after
145 * the current position.
146 */
147#define idr_for_each_entry_continue(idp, entry, id) \
148 for ((entry) = idr_get_next((idp), &(id)); \
149 entry; \
150 ++id, (entry) = idr_get_next((idp), &(id)))
151
138/* 152/*
139 * IDA - IDR based id allocator, use when translation from id to 153 * IDA - IDR based id allocator, use when translation from id to
140 * pointer isn't necessary. 154 * pointer isn't necessary.
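
A hedged usage sketch of the new macro: registry (a populated struct idr), struct item, process() and last_id are all hypothetical; the point is that the walk resumes at the id where a previous, interrupted pass stopped instead of restarting from zero:

	struct item *entry;		/* hypothetical value type stored in the idr */
	int id = last_id;		/* id reached before the walk was interrupted */

	idr_for_each_entry_continue(&registry, entry, id) {
		/* visits the entry stored at 'id' (if populated) and every later one */
		process(entry);
	}
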
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 46262284de47..04fc6e6c7ff0 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
264extern void lc_committed(struct lru_cache *lc); 264extern void lc_committed(struct lru_cache *lc);
265 265
266struct seq_file; 266struct seq_file;
267extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); 267extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
268 268
269extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, 269extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
270 void (*detail) (struct seq_file *, struct lc_element *)); 270 void (*detail) (struct seq_file *, struct lc_element *));
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c33e1c489eb2..8b8cfadf7833 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t;
28typedef uint64_t blkif_sector_t; 28typedef uint64_t blkif_sector_t;
29 29
30/* 30/*
31 * Multiple hardware queues/rings:
32 * If supported, the backend will write the key "multi-queue-max-queues" to
33 * the directory for that vbd, and set its value to the maximum supported
34 * number of queues.
35 * Frontends that are aware of this feature and wish to use it can write the
36 * key "multi-queue-num-queues" with the number they wish to use, which must be
37 * greater than zero, and no more than the value reported by the backend in
38 * "multi-queue-max-queues".
39 *
40 * For frontends requesting just one queue, the usual event-channel and
41 * ring-ref keys are written as before, simplifying the backend processing
42 * to avoid distinguishing between a frontend that doesn't understand the
43 * multi-queue feature, and one that does, but requested only one queue.
44 *
45 * Frontends requesting two or more queues must not write the toplevel
46 * event-channel and ring-ref keys, instead writing those keys under sub-keys
47 * having the name "queue-N", where N is the integer ID of the queue/ring to
48 * which those keys belong. Queues are indexed from zero.
49 * For example, a frontend with two queues must write the following set of
50 * queue-related keys:
51 *
52 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
53 * /local/domain/1/device/vbd/0/queue-0 = ""
54 * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
55 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
56 * /local/domain/1/device/vbd/0/queue-1 = ""
57 * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
58 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
59 *
60 * It is also possible to use multiple queues/rings together with the
61 * multi-page ring buffer feature.
62 * For example, a frontend that requests two queues/rings, where each ring
63 * buffer is two pages long, must write the following set of related keys:
64 *
65 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
66 * /local/domain/1/device/vbd/0/ring-page-order = "1"
67 * /local/domain/1/device/vbd/0/queue-0 = ""
68 * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
69 * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
70 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
71 * /local/domain/1/device/vbd/0/queue-1 = ""
72 * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
73 * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
74 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
75 *
76 */
77
78/*
31 * REQUEST CODES. 79 * REQUEST CODES.
32 */ 80 */
33#define BLKIF_OP_READ 0 81#define BLKIF_OP_READ 0
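
To make the layout documented above concrete, here is a hedged sketch of how the reading side could consume it; frontend_path stands in for the frontend's xenstore directory, and the real consumer lives in xen-blkback/xenbus.c, which is not part of this excerpt:

	unsigned int nr_queues, i, evtchn;
	char dir[64];
	int err;

	err = xenbus_scanf(XBT_NIL, frontend_path, "multi-queue-num-queues",
			   "%u", &nr_queues);
	if (err != 1)
		nr_queues = 1;	/* key absent: legacy single-queue layout */

	for (i = 0; i < nr_queues; i++) {
		if (nr_queues == 1)	/* toplevel keys, no queue-N subdirectory */
			snprintf(dir, sizeof(dir), "%s", frontend_path);
		else
			snprintf(dir, sizeof(dir), "%s/queue-%u", frontend_path, i);

		err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn);
		/*
		 * ring-ref (or ring-ref0..ring-refN) is read from the same
		 * directory, honouring the toplevel ring-page-order key.
		 */
	}
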
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 028f5d996eef..28ba40b99337 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc)
238 * @seq: the seq_file to print into 238 * @seq: the seq_file to print into
239 * @lc: the lru cache to print statistics of 239 * @lc: the lru cache to print statistics of
240 */ 240 */
241size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) 241void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
242{ 242{
243 /* NOTE: 243 /* NOTE:
244 * total calls to lc_get are 244 * total calls to lc_get are
@@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
250 seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", 250 seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
251 lc->name, lc->used, lc->nr_elements, 251 lc->name, lc->used, lc->nr_elements,
252 lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); 252 lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
253
254 return 0;
255} 253}
256 254
257static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) 255static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)