author    Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
commit    0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0 (patch)
tree      cefccd67dc1f27bb45830f6b8065dd4a1c05e83b
parent    9697e9da84299d0d715d515dd2cc48f1eceb277d (diff)
parent    796baeeef85a40b3495a907fb7425086e7010102 (diff)
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount
  of improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien
     and Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal
     of rrpc 1.2 in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing
     from Johannes.

   - Support for blk-throttle for tracking reads and writes
     separately. From Joseph Qi. A few cleanups/fixes also for
     blk-throttle from Weiping.

   - Series from Mike Snitzer that enables dm to register its queue
     more logically, something that's always been problematic on dm
     since it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping
     and quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The
     previous scheme relied on atomic bits, but it had races where we
     would think a request had timed out if it got reused at the
     wrong time.

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all.
     From me.

   - sgl_alloc/free helpers from Bart.

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
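As a quick illustration of the sgl_alloc/free helpers mentioned above (added to
lib/scatterlist.c and include/linux/scatterlist.h by this pull, behind a new
Kconfig symbol), the sketch below shows how a caller might allocate and free a
scatterlist backed by freshly allocated pages. This is not code from the merge:
the function name, the 1 MiB size, and the error handling are illustrative
assumptions only.

	#include <linux/scatterlist.h>
	#include <linux/sizes.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	static int example_alloc_sgl(void)
	{
		unsigned int nents;
		struct scatterlist *sgl;

		/* Allocate an sg list covering 1 MiB of newly allocated pages. */
		sgl = sgl_alloc(SZ_1M, GFP_KERNEL, &nents);
		if (!sgl)
			return -ENOMEM;

		/* ... dma_map_sg() and use the nents entries here ... */

		/* Frees both the pages and the sg table. */
		sgl_free(sgl);
		return 0;
	}
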
-rw-r--r--  block/bfq-cgroup.c | 7
-rw-r--r--  block/bfq-iosched.c | 529
-rw-r--r--  block/bfq-iosched.h | 19
-rw-r--r--  block/bfq-wf2q.c | 7
-rw-r--r--  block/bio-integrity.c | 1
-rw-r--r--  block/bio.c | 30
-rw-r--r--  block/blk-core.c | 87
-rw-r--r--  block/blk-exec.c | 2
-rw-r--r--  block/blk-lib.c | 12
-rw-r--r--  block/blk-map.c | 4
-rw-r--r--  block/blk-merge.c | 13
-rw-r--r--  block/blk-mq-debugfs.c | 22
-rw-r--r--  block/blk-mq-sched.c | 3
-rw-r--r--  block/blk-mq-sched.h | 2
-rw-r--r--  block/blk-mq-sysfs.c | 9
-rw-r--r--  block/blk-mq-tag.c | 13
-rw-r--r--  block/blk-mq.c | 667
-rw-r--r--  block/blk-mq.h | 52
-rw-r--r--  block/blk-sysfs.c | 47
-rw-r--r--  block/blk-throttle.c | 146
-rw-r--r--  block/blk-timeout.c | 26
-rw-r--r--  block/blk-zoned.c | 42
-rw-r--r--  block/blk.h | 46
-rw-r--r--  block/bounce.c | 33
-rw-r--r--  block/bsg-lib.c | 3
-rw-r--r--  block/bsg.c | 40
-rw-r--r--  block/deadline-iosched.c | 114
-rw-r--r--  block/elevator.c | 12
-rw-r--r--  block/genhd.c | 23
-rw-r--r--  block/mq-deadline.c | 141
-rw-r--r--  block/partitions/msdos.c | 4
-rw-r--r--  block/scsi_ioctl.c | 34
-rw-r--r--  crypto/Kconfig | 1
-rw-r--r--  crypto/scompress.c | 51
-rw-r--r--  drivers/block/DAC960.c | 160
-rw-r--r--  drivers/block/Kconfig | 4
-rw-r--r--  drivers/block/aoe/aoe.h | 3
-rw-r--r--  drivers/block/aoe/aoecmd.c | 48
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 2
-rw-r--r--  drivers/block/null_blk.c | 290
-rw-r--r--  drivers/block/pktcdvd.c | 12
-rw-r--r--  drivers/block/smart1,2.h | 278
-rw-r--r--  drivers/block/zram/zram_drv.c | 2
-rw-r--r--  drivers/lightnvm/Kconfig | 7
-rw-r--r--  drivers/lightnvm/Makefile | 1
-rw-r--r--  drivers/lightnvm/core.c | 462
-rw-r--r--  drivers/lightnvm/pblk-cache.c | 5
-rw-r--r--  drivers/lightnvm/pblk-core.c | 55
-rw-r--r--  drivers/lightnvm/pblk-gc.c | 23
-rw-r--r--  drivers/lightnvm/pblk-init.c | 104
-rw-r--r--  drivers/lightnvm/pblk-map.c | 2
-rw-r--r--  drivers/lightnvm/pblk-rb.c | 111
-rw-r--r--  drivers/lightnvm/pblk-read.c | 35
-rw-r--r--  drivers/lightnvm/pblk-recovery.c | 43
-rw-r--r--  drivers/lightnvm/pblk-rl.c | 54
-rw-r--r--  drivers/lightnvm/pblk-sysfs.c | 15
-rw-r--r--  drivers/lightnvm/pblk-write.c | 23
-rw-r--r--  drivers/lightnvm/pblk.h | 163
-rw-r--r--  drivers/lightnvm/rrpc.c | 1625
-rw-r--r--  drivers/lightnvm/rrpc.h | 290
-rw-r--r--  drivers/md/bcache/alloc.c | 19
-rw-r--r--  drivers/md/bcache/bcache.h | 24
-rw-r--r--  drivers/md/bcache/btree.c | 10
-rw-r--r--  drivers/md/bcache/closure.c | 47
-rw-r--r--  drivers/md/bcache/closure.h | 60
-rw-r--r--  drivers/md/bcache/debug.c | 7
-rw-r--r--  drivers/md/bcache/io.c | 13
-rw-r--r--  drivers/md/bcache/movinggc.c | 2
-rw-r--r--  drivers/md/bcache/request.c | 29
-rw-r--r--  drivers/md/bcache/super.c | 27
-rw-r--r--  drivers/md/bcache/util.c | 34
-rw-r--r--  drivers/md/bcache/util.h | 1
-rw-r--r--  drivers/md/bcache/writeback.c | 203
-rw-r--r--  drivers/md/bcache/writeback.h | 12
-rw-r--r--  drivers/md/dm-crypt.c | 1
-rw-r--r--  drivers/md/dm-mpath.c | 19
-rw-r--r--  drivers/md/dm-rq.c | 28
-rw-r--r--  drivers/md/dm.c | 21
-rw-r--r--  drivers/nvme/host/Makefile | 4
-rw-r--r--  drivers/nvme/host/core.c | 134
-rw-r--r--  drivers/nvme/host/fabrics.c | 22
-rw-r--r--  drivers/nvme/host/fabrics.h | 2
-rw-r--r--  drivers/nvme/host/fc.c | 7
-rw-r--r--  drivers/nvme/host/lightnvm.c | 185
-rw-r--r--  drivers/nvme/host/multipath.c | 44
-rw-r--r--  drivers/nvme/host/nvme.h | 9
-rw-r--r--  drivers/nvme/host/pci.c | 216
-rw-r--r--  drivers/nvme/host/rdma.c | 6
-rw-r--r--  drivers/nvme/host/trace.c | 130
-rw-r--r--  drivers/nvme/host/trace.h | 165
-rw-r--r--  drivers/nvme/target/Kconfig | 2
-rw-r--r--  drivers/nvme/target/core.c | 14
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 2
-rw-r--r--  drivers/nvme/target/fc.c | 60
-rw-r--r--  drivers/nvme/target/fcloop.c | 244
-rw-r--r--  drivers/nvme/target/loop.c | 3
-rw-r--r--  drivers/nvme/target/rdma.c | 83
-rw-r--r--  drivers/target/Kconfig | 1
-rw-r--r--  drivers/target/target_core_transport.c | 46
-rw-r--r--  fs/btrfs/compression.c | 4
-rw-r--r--  fs/btrfs/extent_io.c | 11
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/inode.c | 8
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/f2fs/data.c | 2
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  include/linux/bio.h | 24
-rw-r--r--  include/linux/blk-cgroup.h | 8
-rw-r--r--  include/linux/blk-mq.h | 3
-rw-r--r--  include/linux/blk_types.h | 28
-rw-r--r--  include/linux/blkdev.h | 172
-rw-r--r--  include/linux/bvec.h | 9
-rw-r--r--  include/linux/elevator.h | 2
-rw-r--r--  include/linux/genhd.h | 5
-rw-r--r--  include/linux/lightnvm.h | 125
-rw-r--r--  include/linux/nvme.h | 22
-rw-r--r--  include/linux/scatterlist.h | 11
-rw-r--r--  include/uapi/linux/lightnvm.h | 9
-rw-r--r--  kernel/irq/affinity.c | 30
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  lib/Kconfig | 4
-rw-r--r--  lib/sbitmap.c | 2
-rw-r--r--  lib/scatterlist.c | 127
-rw-r--r--  mm/page_io.c | 4
124 files changed, 3884 insertions, 4729 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index da1525ec4c87..d819dc77fe65 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
775 unsigned long flags; 775 unsigned long flags;
776 int i; 776 int i;
777 777
778 spin_lock_irqsave(&bfqd->lock, flags);
779
778 if (!entity) /* root group */ 780 if (!entity) /* root group */
779 return; 781 goto put_async_queues;
780 782
781 spin_lock_irqsave(&bfqd->lock, flags);
782 /* 783 /*
783 * Empty all service_trees belonging to this group before 784 * Empty all service_trees belonging to this group before
784 * deactivating the group itself. 785 * deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
809 } 810 }
810 811
811 __bfq_deactivate_entity(entity, false); 812 __bfq_deactivate_entity(entity, false);
813
814put_async_queues:
812 bfq_put_async_queues(bfqd, bfqg); 815 bfq_put_async_queues(bfqd, bfqg);
813 816
814 spin_unlock_irqrestore(&bfqd->lock, flags); 817 spin_unlock_irqrestore(&bfqd->lock, flags);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bcb6d21baf12..47e6ec7427c4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
166/* Default timeout values, in jiffies, approximating CFQ defaults. */ 166/* Default timeout values, in jiffies, approximating CFQ defaults. */
167const int bfq_timeout = HZ / 8; 167const int bfq_timeout = HZ / 8;
168 168
169/*
170 * Time limit for merging (see comments in bfq_setup_cooperator). Set
171 * to the slowest value that, in our tests, proved to be effective in
172 * removing false positives, while not causing true positives to miss
173 * queue merging.
174 *
175 * As can be deduced from the low time limit below, queue merging, if
176 * successful, happens at the very beginning of the I/O of the involved
177 * cooperating processes, as a consequence of the arrival of the very
178 * first requests from each cooperator. After that, there is very
179 * little chance to find cooperators.
180 */
181static const unsigned long bfq_merge_time_limit = HZ/10;
182
169static struct kmem_cache *bfq_pool; 183static struct kmem_cache *bfq_pool;
170 184
171/* Below this threshold (in ns), we consider thinktime immediate. */ 185/* Below this threshold (in ns), we consider thinktime immediate. */
@@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;
178#define BFQQ_SEEK_THR (sector_t)(8 * 100) 192#define BFQQ_SEEK_THR (sector_t)(8 * 100)
179#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 193#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
180#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) 194#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
181#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) 195#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
182 196
183/* Min number of samples required to perform peak-rate update */ 197/* Min number of samples required to perform peak-rate update */
184#define BFQ_RATE_MIN_SAMPLES 32 198#define BFQ_RATE_MIN_SAMPLES 32
@@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;
195 * interactive applications automatically, using the following formula: 209 * interactive applications automatically, using the following formula:
196 * duration = (R / r) * T, where r is the peak rate of the device, and 210 * duration = (R / r) * T, where r is the peak rate of the device, and
197 * R and T are two reference parameters. 211 * R and T are two reference parameters.
198 * In particular, R is the peak rate of the reference device (see below), 212 * In particular, R is the peak rate of the reference device (see
199 * and T is a reference time: given the systems that are likely to be 213 * below), and T is a reference time: given the systems that are
200 * installed on the reference device according to its speed class, T is 214 * likely to be installed on the reference device according to its
201 * about the maximum time needed, under BFQ and while reading two files in 215 * speed class, T is about the maximum time needed, under BFQ and
202 * parallel, to load typical large applications on these systems. 216 * while reading two files in parallel, to load typical large
203 * In practice, the slower/faster the device at hand is, the more/less it 217 * applications on these systems (see the comments on
204 * takes to load applications with respect to the reference device. 218 * max_service_from_wr below, for more details on how T is obtained).
205 * Accordingly, the longer/shorter BFQ grants weight raising to interactive 219 * In practice, the slower/faster the device at hand is, the more/less
206 * applications. 220 * it takes to load applications with respect to the reference device.
221 * Accordingly, the longer/shorter BFQ grants weight raising to
222 * interactive applications.
207 * 223 *
208 * BFQ uses four different reference pairs (R, T), depending on: 224 * BFQ uses four different reference pairs (R, T), depending on:
209 * . whether the device is rotational or non-rotational; 225 * . whether the device is rotational or non-rotational;
@@ -240,6 +256,60 @@ static int T_slow[2];
240static int T_fast[2]; 256static int T_fast[2];
241static int device_speed_thresh[2]; 257static int device_speed_thresh[2];
242 258
259/*
260 * BFQ uses the above-detailed, time-based weight-raising mechanism to
261 * privilege interactive tasks. This mechanism is vulnerable to the
262 * following false positives: I/O-bound applications that will go on
263 * doing I/O for much longer than the duration of weight
264 * raising. These applications have basically no benefit from being
265 * weight-raised at the beginning of their I/O. On the opposite end,
266 * while being weight-raised, these applications
267 * a) unjustly steal throughput to applications that may actually need
268 * low latency;
269 * b) make BFQ uselessly perform device idling; device idling results
270 * in loss of device throughput with most flash-based storage, and may
271 * increase latencies when used purposelessly.
272 *
273 * BFQ tries to reduce these problems, by adopting the following
274 * countermeasure. To introduce this countermeasure, we need first to
275 * finish explaining how the duration of weight-raising for
276 * interactive tasks is computed.
277 *
278 * For a bfq_queue deemed as interactive, the duration of weight
279 * raising is dynamically adjusted, as a function of the estimated
280 * peak rate of the device, so as to be equal to the time needed to
281 * execute the 'largest' interactive task we benchmarked so far. By
282 * largest task, we mean the task for which each involved process has
283 * to do more I/O than for any of the other tasks we benchmarked. This
284 * reference interactive task is the start-up of LibreOffice Writer,
285 * and in this task each process/bfq_queue needs to have at most ~110K
286 * sectors transferred.
287 *
288 * This last piece of information enables BFQ to reduce the actual
289 * duration of weight-raising for at least one class of I/O-bound
290 * applications: those doing sequential or quasi-sequential I/O. An
291 * example is file copy. In fact, once started, the main I/O-bound
292 * processes of these applications usually consume the above 110K
293 * sectors in much less time than the processes of an application that
294 * is starting, because these I/O-bound processes will greedily devote
295 * almost all their CPU cycles only to their target,
296 * throughput-friendly I/O operations. This is even more true if BFQ
297 * happens to be underestimating the device peak rate, and thus
298 * overestimating the duration of weight raising. But, according to
299 * our measurements, once transferred 110K sectors, these processes
300 * have no right to be weight-raised any longer.
301 *
302 * Basing on the last consideration, BFQ ends weight-raising for a
303 * bfq_queue if the latter happens to have received an amount of
304 * service at least equal to the following constant. The constant is
305 * set to slightly more than 110K, to have a minimum safety margin.
306 *
307 * This early ending of weight-raising reduces the amount of time
308 * during which interactive false positives cause the two problems
309 * described at the beginning of these comments.
310 */
311static const unsigned long max_service_from_wr = 120000;
312
243#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) 313#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
244#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) 314#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
245 315
@@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
403 } 473 }
404} 474}
405 475
476/*
477 * See the comments on bfq_limit_depth for the purpose of
478 * the depths set in the function.
479 */
480static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
481{
482 bfqd->sb_shift = bt->sb.shift;
483
484 /*
485 * In-word depths if no bfq_queue is being weight-raised:
486 * leaving 25% of tags only for sync reads.
487 *
488 * In next formulas, right-shift the value
489 * (1U<<bfqd->sb_shift), instead of computing directly
490 * (1U<<(bfqd->sb_shift - something)), to be robust against
491 * any possible value of bfqd->sb_shift, without having to
492 * limit 'something'.
493 */
494 /* no more than 50% of tags for async I/O */
495 bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
496 /*
497 * no more than 75% of tags for sync writes (25% extra tags
498 * w.r.t. async I/O, to prevent async I/O from starving sync
499 * writes)
500 */
501 bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
502
503 /*
504 * In-word depths in case some bfq_queue is being weight-
505 * raised: leaving ~63% of tags for sync reads. This is the
506 * highest percentage for which, in our tests, application
507 * start-up times didn't suffer from any regression due to tag
508 * shortage.
509 */
510 /* no more than ~18% of tags for async I/O */
511 bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
512 /* no more than ~37% of tags for sync writes (~20% extra tags) */
513 bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
514}
515
516/*
517 * Async I/O can easily starve sync I/O (both sync reads and sync
518 * writes), by consuming all tags. Similarly, storms of sync writes,
519 * such as those that sync(2) may trigger, can starve sync reads.
520 * Limit depths of async I/O and sync writes so as to counter both
521 * problems.
522 */
523static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
524{
525 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
526 struct bfq_data *bfqd = data->q->elevator->elevator_data;
527 struct sbitmap_queue *bt;
528
529 if (op_is_sync(op) && !op_is_write(op))
530 return;
531
532 if (data->flags & BLK_MQ_REQ_RESERVED) {
533 if (unlikely(!tags->nr_reserved_tags)) {
534 WARN_ON_ONCE(1);
535 return;
536 }
537 bt = &tags->breserved_tags;
538 } else
539 bt = &tags->bitmap_tags;
540
541 if (unlikely(bfqd->sb_shift != bt->sb.shift))
542 bfq_update_depths(bfqd, bt);
543
544 data->shallow_depth =
545 bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
546
547 bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
548 __func__, bfqd->wr_busy_queues, op_is_sync(op),
549 data->shallow_depth);
550}
551
406static struct bfq_queue * 552static struct bfq_queue *
407bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, 553bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
408 sector_t sector, struct rb_node **ret_parent, 554 sector_t sector, struct rb_node **ret_parent,
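[For illustration only; this note is not part of the diff above.] A worked
example of the depth formulas added in bfq_update_depths(), assuming
bfqd->sb_shift = 6, i.e. 64 tags per sbitmap word:

    word_depths[0][0] = max(64 >> 1, 1)        = 32   /* 50% of tags: async I/O, no wr queue  */
    word_depths[0][1] = max((64 * 3) >> 2, 1)  = 48   /* 75%: sync writes, no wr queue        */
    word_depths[1][0] = max((64 * 3) >> 4, 1)  = 12   /* ~18%: async I/O, wr queue active     */
    word_depths[1][1] = max((64 * 6) >> 4, 1)  = 24   /* ~37%: sync writes, wr queue active   */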
@@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
444 return bfqq; 590 return bfqq;
445} 591}
446 592
593static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
594{
595 return bfqq->service_from_backlogged > 0 &&
596 time_is_before_jiffies(bfqq->first_IO_time +
597 bfq_merge_time_limit);
598}
599
447void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) 600void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
448{ 601{
449 struct rb_node **p, *parent; 602 struct rb_node **p, *parent;
@@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
454 bfqq->pos_root = NULL; 607 bfqq->pos_root = NULL;
455 } 608 }
456 609
610 /*
611 * bfqq cannot be merged any longer (see comments in
612 * bfq_setup_cooperator): no point in adding bfqq into the
613 * position tree.
614 */
615 if (bfq_too_late_for_merging(bfqq))
616 return;
617
457 if (bfq_class_idle(bfqq)) 618 if (bfq_class_idle(bfqq))
458 return; 619 return;
459 if (!bfqq->next_rq) 620 if (!bfqq->next_rq)
@@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
1247 if (old_wr_coeff == 1 && wr_or_deserves_wr) { 1408 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
1248 /* start a weight-raising period */ 1409 /* start a weight-raising period */
1249 if (interactive) { 1410 if (interactive) {
1411 bfqq->service_from_wr = 0;
1250 bfqq->wr_coeff = bfqd->bfq_wr_coeff; 1412 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1251 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); 1413 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1252 } else { 1414 } else {
@@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,
1627 rb_erase(&bfqq->pos_node, bfqq->pos_root); 1789 rb_erase(&bfqq->pos_node, bfqq->pos_root);
1628 bfqq->pos_root = NULL; 1790 bfqq->pos_root = NULL;
1629 } 1791 }
1792 } else {
1793 bfq_pos_tree_add_move(bfqd, bfqq);
1630 } 1794 }
1631 1795
1632 if (rq->cmd_flags & REQ_META) 1796 if (rq->cmd_flags & REQ_META)
@@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1933static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, 2097static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
1934 struct bfq_queue *new_bfqq) 2098 struct bfq_queue *new_bfqq)
1935{ 2099{
2100 if (bfq_too_late_for_merging(new_bfqq))
2101 return false;
2102
1936 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || 2103 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
1937 (bfqq->ioprio_class != new_bfqq->ioprio_class)) 2104 (bfqq->ioprio_class != new_bfqq->ioprio_class))
1938 return false; 2105 return false;
@@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
1957} 2124}
1958 2125
1959/* 2126/*
1960 * If this function returns true, then bfqq cannot be merged. The idea
1961 * is that true cooperation happens very early after processes start
1962 * to do I/O. Usually, late cooperations are just accidental false
1963 * positives. In case bfqq is weight-raised, such false positives
1964 * would evidently degrade latency guarantees for bfqq.
1965 */
1966static bool wr_from_too_long(struct bfq_queue *bfqq)
1967{
1968 return bfqq->wr_coeff > 1 &&
1969 time_is_before_jiffies(bfqq->last_wr_start_finish +
1970 msecs_to_jiffies(100));
1971}
1972
1973/*
1974 * Attempt to schedule a merge of bfqq with the currently in-service 2127 * Attempt to schedule a merge of bfqq with the currently in-service
1975 * queue or with a close queue among the scheduled queues. Return 2128 * queue or with a close queue among the scheduled queues. Return
1976 * NULL if no merge was scheduled, a pointer to the shared bfq_queue 2129 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
@@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
1983 * to maintain. Besides, in such a critical condition as an out of memory, 2136 * to maintain. Besides, in such a critical condition as an out of memory,
1984 * the benefits of queue merging may be little relevant, or even negligible. 2137 * the benefits of queue merging may be little relevant, or even negligible.
1985 * 2138 *
1986 * Weight-raised queues can be merged only if their weight-raising
1987 * period has just started. In fact cooperating processes are usually
1988 * started together. Thus, with this filter we avoid false positives
1989 * that would jeopardize low-latency guarantees.
1990 *
1991 * WARNING: queue merging may impair fairness among non-weight raised 2139 * WARNING: queue merging may impair fairness among non-weight raised
1992 * queues, for at least two reasons: 1) the original weight of a 2140 * queues, for at least two reasons: 1) the original weight of a
1993 * merged queue may change during the merged state, 2) even being the 2141 * merged queue may change during the merged state, 2) even being the
@@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2001{ 2149{
2002 struct bfq_queue *in_service_bfqq, *new_bfqq; 2150 struct bfq_queue *in_service_bfqq, *new_bfqq;
2003 2151
2152 /*
2153 * Prevent bfqq from being merged if it has been created too
2154 * long ago. The idea is that true cooperating processes, and
2155 * thus their associated bfq_queues, are supposed to be
2156 * created shortly after each other. This is the case, e.g.,
2157 * for KVM/QEMU and dump I/O threads. Basing on this
2158 * assumption, the following filtering greatly reduces the
2159 * probability that two non-cooperating processes, which just
2160 * happen to do close I/O for some short time interval, have
2161 * their queues merged by mistake.
2162 */
2163 if (bfq_too_late_for_merging(bfqq))
2164 return NULL;
2165
2004 if (bfqq->new_bfqq) 2166 if (bfqq->new_bfqq)
2005 return bfqq->new_bfqq; 2167 return bfqq->new_bfqq;
2006 2168
2007 if (!io_struct || 2169 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
2008 wr_from_too_long(bfqq) ||
2009 unlikely(bfqq == &bfqd->oom_bfqq))
2010 return NULL; 2170 return NULL;
2011 2171
2012 /* If there is only one backlogged queue, don't search. */ 2172 /* If there is only one backlogged queue, don't search. */
@@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2015 2175
2016 in_service_bfqq = bfqd->in_service_queue; 2176 in_service_bfqq = bfqd->in_service_queue;
2017 2177
2018 if (!in_service_bfqq || in_service_bfqq == bfqq 2178 if (in_service_bfqq && in_service_bfqq != bfqq &&
2019 || wr_from_too_long(in_service_bfqq) || 2179 likely(in_service_bfqq != &bfqd->oom_bfqq) &&
2020 unlikely(in_service_bfqq == &bfqd->oom_bfqq)) 2180 bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
2021 goto check_scheduled;
2022
2023 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
2024 bfqq->entity.parent == in_service_bfqq->entity.parent && 2181 bfqq->entity.parent == in_service_bfqq->entity.parent &&
2025 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { 2182 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
2026 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); 2183 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2032 * queues. The only thing we need is that the bio/request is not 2189 * queues. The only thing we need is that the bio/request is not
2033 * NULL, as we need it to establish whether a cooperator exists. 2190 * NULL, as we need it to establish whether a cooperator exists.
2034 */ 2191 */
2035check_scheduled:
2036 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, 2192 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
2037 bfq_io_struct_pos(io_struct, request)); 2193 bfq_io_struct_pos(io_struct, request));
2038 2194
2039 if (new_bfqq && !wr_from_too_long(new_bfqq) && 2195 if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
2040 likely(new_bfqq != &bfqd->oom_bfqq) &&
2041 bfq_may_be_close_cooperator(bfqq, new_bfqq)) 2196 bfq_may_be_close_cooperator(bfqq, new_bfqq))
2042 return bfq_setup_merge(bfqq, new_bfqq); 2197 return bfq_setup_merge(bfqq, new_bfqq);
2043 2198
@@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
2062 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); 2217 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
2063 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); 2218 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
2064 if (unlikely(bfq_bfqq_just_created(bfqq) && 2219 if (unlikely(bfq_bfqq_just_created(bfqq) &&
2065 !bfq_bfqq_in_large_burst(bfqq))) { 2220 !bfq_bfqq_in_large_burst(bfqq) &&
2221 bfqq->bfqd->low_latency)) {
2066 /* 2222 /*
2067 * bfqq being merged right after being created: bfqq 2223 * bfqq being merged right after being created: bfqq
2068 * would have deserved interactive weight raising, but 2224 * would have deserved interactive weight raising, but
@@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2917 * whereas soft_rt_next_start is set to infinity for applications that do 3073 * whereas soft_rt_next_start is set to infinity for applications that do
2918 * not. 3074 * not.
2919 * 3075 *
2920 * Unfortunately, even a greedy application may happen to behave in an 3076 * Unfortunately, even a greedy (i.e., I/O-bound) application may
2921 * isochronous way if the CPU load is high. In fact, the application may 3077 * happen to meet, occasionally or systematically, both the above
2922 * stop issuing requests while the CPUs are busy serving other processes, 3078 * bandwidth and isochrony requirements. This may happen at least in
2923 * then restart, then stop again for a while, and so on. In addition, if 3079 * the following circumstances. First, if the CPU load is high. The
2924 * the disk achieves a low enough throughput with the request pattern 3080 * application may stop issuing requests while the CPUs are busy
2925 * issued by the application (e.g., because the request pattern is random 3081 * serving other processes, then restart, then stop again for a while,
2926 * and/or the device is slow), then the application may meet the above 3082 * and so on. The other circumstances are related to the storage
2927 * bandwidth requirement too. To prevent such a greedy application to be 3083 * device: the storage device is highly loaded or reaches a low-enough
2928 * deemed as soft real-time, a further rule is used in the computation of 3084 * throughput with the I/O of the application (e.g., because the I/O
2929 * soft_rt_next_start: soft_rt_next_start must be higher than the current 3085 * is random and/or the device is slow). In all these cases, the
2930 * time plus the maximum time for which the arrival of a request is waited 3086 * I/O of the application may be simply slowed down enough to meet
2931 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. 3087 * the bandwidth and isochrony requirements. To reduce the probability
2932 * This filters out greedy applications, as the latter issue instead their 3088 * that greedy applications are deemed as soft real-time in these
2933 * next request as soon as possible after the last one has been completed 3089 * corner cases, a further rule is used in the computation of
2934 * (in contrast, when a batch of requests is completed, a soft real-time 3090 * soft_rt_next_start: the return value of this function is forced to
2935 * application spends some time processing data). 3091 * be higher than the maximum between the following two quantities.
3092 *
3093 * (a) Current time plus: (1) the maximum time for which the arrival
3094 * of a request is waited for when a sync queue becomes idle,
3095 * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
3096 * postpone for a moment the reason for adding a few extra
3097 * jiffies; we get back to it after next item (b). Lower-bounding
3098 * the return value of this function with the current time plus
3099 * bfqd->bfq_slice_idle tends to filter out greedy applications,
3100 * because the latter issue their next request as soon as possible
3101 * after the last one has been completed. In contrast, a soft
3102 * real-time application spends some time processing data, after a
3103 * batch of its requests has been completed.
2936 * 3104 *
2937 * Unfortunately, the last filter may easily generate false positives if 3105 * (b) Current value of bfqq->soft_rt_next_start. As pointed out
2938 * only bfqd->bfq_slice_idle is used as a reference time interval and one 3106 * above, greedy applications may happen to meet both the
2939 * or both the following cases occur: 3107 * bandwidth and isochrony requirements under heavy CPU or
2940 * 1) HZ is so low that the duration of a jiffy is comparable to or higher 3108 * storage-device load. In more detail, in these scenarios, these
2941 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with 3109 * applications happen, only for limited time periods, to do I/O
2942 * HZ=100. 3110 * slowly enough to meet all the requirements described so far,
3111 * including the filtering in above item (a). These slow-speed
3112 * time intervals are usually interspersed between other time
3113 * intervals during which these applications do I/O at a very high
3114 * speed. Fortunately, exactly because of the high speed of the
3115 * I/O in the high-speed intervals, the values returned by this
3116 * function happen to be so high, near the end of any such
3117 * high-speed interval, to be likely to fall *after* the end of
3118 * the low-speed time interval that follows. These high values are
3119 * stored in bfqq->soft_rt_next_start after each invocation of
3120 * this function. As a consequence, if the last value of
3121 * bfqq->soft_rt_next_start is constantly used to lower-bound the
3122 * next value that this function may return, then, from the very
3123 * beginning of a low-speed interval, bfqq->soft_rt_next_start is
3124 * likely to be constantly kept so high that any I/O request
3125 * issued during the low-speed interval is considered as arriving
3126 * too soon for the application to be deemed as soft
3127 * real-time. Then, in the high-speed interval that follows, the
3128 * application will not be deemed as soft real-time, just because
3129 * it will do I/O at a high speed. And so on.
3130 *
3131 * Getting back to the filtering in item (a), in the following two
3132 * cases this filtering might be easily passed by a greedy
3133 * application, if the reference quantity was just
3134 * bfqd->bfq_slice_idle:
3135 * 1) HZ is so low that the duration of a jiffy is comparable to or
3136 * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
3137 * devices with HZ=100. The time granularity may be so coarse
3138 * that the approximation, in jiffies, of bfqd->bfq_slice_idle
3139 * is rather lower than the exact value.
2943 * 2) jiffies, instead of increasing at a constant rate, may stop increasing 3140 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2944 * for a while, then suddenly 'jump' by several units to recover the lost 3141 * for a while, then suddenly 'jump' by several units to recover the lost
2945 * increments. This seems to happen, e.g., inside virtual machines. 3142 * increments. This seems to happen, e.g., inside virtual machines.
2946 * To address this issue, we do not use as a reference time interval just 3143 * To address this issue, in the filtering in (a) we do not use as a
2947 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In 3144 * reference time interval just bfqd->bfq_slice_idle, but
2948 * particular we add the minimum number of jiffies for which the filter 3145 * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
2949 * seems to be quite precise also in embedded systems and KVM/QEMU virtual 3146 * minimum number of jiffies for which the filter seems to be quite
2950 * machines. 3147 * precise also in embedded systems and KVM/QEMU virtual machines.
2951 */ 3148 */
2952static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, 3149static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2953 struct bfq_queue *bfqq) 3150 struct bfq_queue *bfqq)
2954{ 3151{
2955 return max(bfqq->last_idle_bklogged + 3152 return max3(bfqq->soft_rt_next_start,
2956 HZ * bfqq->service_from_backlogged / 3153 bfqq->last_idle_bklogged +
2957 bfqd->bfq_wr_max_softrt_rate, 3154 HZ * bfqq->service_from_backlogged /
2958 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); 3155 bfqd->bfq_wr_max_softrt_rate,
3156 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
2959} 3157}
2960 3158
2961/** 3159/**
@@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
3000 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); 3198 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
3001 3199
3002 /* 3200 /*
3003 * Increase service_from_backlogged before next statement,
3004 * because the possible next invocation of
3005 * bfq_bfqq_charge_time would likely inflate
3006 * entity->service. In contrast, service_from_backlogged must
3007 * contain real service, to enable the soft real-time
3008 * heuristic to correctly compute the bandwidth consumed by
3009 * bfqq.
3010 */
3011 bfqq->service_from_backlogged += entity->service;
3012
3013 /*
3014 * As above explained, charge slow (typically seeky) and 3201 * As above explained, charge slow (typically seeky) and
3015 * timed-out queues with the time and not the service 3202 * timed-out queues with the time and not the service
3016 * received, to favor sequential workloads. 3203 * received, to favor sequential workloads.
@@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3535 bfqq->entity.prio_changed = 1; 3722 bfqq->entity.prio_changed = 1;
3536 } 3723 }
3537 } 3724 }
3725 if (bfqq->wr_coeff > 1 &&
3726 bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
3727 bfqq->service_from_wr > max_service_from_wr) {
3728 /* see comments on max_service_from_wr */
3729 bfq_bfqq_end_wr(bfqq);
3730 }
3538 } 3731 }
3539 /* 3732 /*
3540 * To improve latency (for this or other queues), immediately 3733 * To improve latency (for this or other queues), immediately
@@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3630 } 3823 }
3631 3824
3632 /* 3825 /*
3633 * We exploit the put_rq_private hook to decrement 3826 * We exploit the bfq_finish_request hook to decrement
3634 * rq_in_driver, but put_rq_private will not be 3827 * rq_in_driver, but bfq_finish_request will not be
3635 * invoked on this request. So, to avoid unbalance, 3828 * invoked on this request. So, to avoid unbalance,
3636 * just start this request, without incrementing 3829 * just start this request, without incrementing
3637 * rq_in_driver. As a negative consequence, 3830 * rq_in_driver. As a negative consequence,
@@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3640 * bfq_schedule_dispatch to be invoked uselessly. 3833 * bfq_schedule_dispatch to be invoked uselessly.
3641 * 3834 *
3642 * As for implementing an exact solution, the 3835 * As for implementing an exact solution, the
3643 * put_request hook, if defined, is probably invoked 3836 * bfq_finish_request hook, if defined, is probably
3644 * also on this request. So, by exploiting this hook, 3837 * invoked also on this request. So, by exploiting
3645 * we could 1) increment rq_in_driver here, and 2) 3838 * this hook, we could 1) increment rq_in_driver here,
3646 * decrement it in put_request. Such a solution would 3839 * and 2) decrement it in bfq_finish_request. Such a
3647 * let the value of the counter be always accurate, 3840 * solution would let the value of the counter be
3648 * but it would entail using an extra interface 3841 * always accurate, but it would entail using an extra
3649 * function. This cost seems higher than the benefit, 3842 * interface function. This cost seems higher than the
3650 * being the frequency of non-elevator-private 3843 * benefit, being the frequency of non-elevator-private
3651 * requests very low. 3844 * requests very low.
3652 */ 3845 */
3653 goto start_rq; 3846 goto start_rq;
@@ -3689,35 +3882,16 @@ exit:
3689 return rq; 3882 return rq;
3690} 3883}
3691 3884
3692static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3693{
3694 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3695 struct request *rq;
3696#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 3885#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3697 struct bfq_queue *in_serv_queue, *bfqq; 3886static void bfq_update_dispatch_stats(struct request_queue *q,
3698 bool waiting_rq, idle_timer_disabled; 3887 struct request *rq,
3699#endif 3888 struct bfq_queue *in_serv_queue,
3700 3889 bool idle_timer_disabled)
3701 spin_lock_irq(&bfqd->lock); 3890{
3702 3891 struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
3703#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3704 in_serv_queue = bfqd->in_service_queue;
3705 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
3706
3707 rq = __bfq_dispatch_request(hctx);
3708
3709 idle_timer_disabled =
3710 waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
3711
3712#else
3713 rq = __bfq_dispatch_request(hctx);
3714#endif
3715 spin_unlock_irq(&bfqd->lock);
3716 3892
3717#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3718 bfqq = rq ? RQ_BFQQ(rq) : NULL;
3719 if (!idle_timer_disabled && !bfqq) 3893 if (!idle_timer_disabled && !bfqq)
3720 return rq; 3894 return;
3721 3895
3722 /* 3896 /*
3723 * rq and bfqq are guaranteed to exist until this function 3897 * rq and bfqq are guaranteed to exist until this function
@@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3732 * In addition, the following queue lock guarantees that 3906 * In addition, the following queue lock guarantees that
3733 * bfqq_group(bfqq) exists as well. 3907 * bfqq_group(bfqq) exists as well.
3734 */ 3908 */
3735 spin_lock_irq(hctx->queue->queue_lock); 3909 spin_lock_irq(q->queue_lock);
3736 if (idle_timer_disabled) 3910 if (idle_timer_disabled)
3737 /* 3911 /*
3738 * Since the idle timer has been disabled, 3912 * Since the idle timer has been disabled,
@@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3751 bfqg_stats_set_start_empty_time(bfqg); 3925 bfqg_stats_set_start_empty_time(bfqg);
3752 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); 3926 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
3753 } 3927 }
3754 spin_unlock_irq(hctx->queue->queue_lock); 3928 spin_unlock_irq(q->queue_lock);
3929}
3930#else
3931static inline void bfq_update_dispatch_stats(struct request_queue *q,
3932 struct request *rq,
3933 struct bfq_queue *in_serv_queue,
3934 bool idle_timer_disabled) {}
3755#endif 3935#endif
3756 3936
3937static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3938{
3939 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3940 struct request *rq;
3941 struct bfq_queue *in_serv_queue;
3942 bool waiting_rq, idle_timer_disabled;
3943
3944 spin_lock_irq(&bfqd->lock);
3945
3946 in_serv_queue = bfqd->in_service_queue;
3947 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
3948
3949 rq = __bfq_dispatch_request(hctx);
3950
3951 idle_timer_disabled =
3952 waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
3953
3954 spin_unlock_irq(&bfqd->lock);
3955
3956 bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
3957 idle_timer_disabled);
3958
3757 return rq; 3959 return rq;
3758} 3960}
3759 3961
@@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4002 bfqq->split_time = bfq_smallest_from_now(); 4204 bfqq->split_time = bfq_smallest_from_now();
4003 4205
4004 /* 4206 /*
4005 * Set to the value for which bfqq will not be deemed as 4207 * To not forget the possibly high bandwidth consumed by a
4006 * soft rt when it becomes backlogged. 4208 * process/queue in the recent past,
4209 * bfq_bfqq_softrt_next_start() returns a value at least equal
4210 * to the current value of bfqq->soft_rt_next_start (see
4211 * comments on bfq_bfqq_softrt_next_start). Set
4212 * soft_rt_next_start to now, to mean that bfqq has consumed
4213 * no bandwidth so far.
4007 */ 4214 */
4008 bfqq->soft_rt_next_start = bfq_greatest_from_now(); 4215 bfqq->soft_rt_next_start = jiffies;
4009 4216
4010 /* first request is almost certainly seeky */ 4217 /* first request is almost certainly seeky */
4011 bfqq->seek_history = 1; 4218 bfqq->seek_history = 1;
@@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4276 return idle_timer_disabled; 4483 return idle_timer_disabled;
4277} 4484}
4278 4485
4486#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4487static void bfq_update_insert_stats(struct request_queue *q,
4488 struct bfq_queue *bfqq,
4489 bool idle_timer_disabled,
4490 unsigned int cmd_flags)
4491{
4492 if (!bfqq)
4493 return;
4494
4495 /*
4496 * bfqq still exists, because it can disappear only after
4497 * either it is merged with another queue, or the process it
4498 * is associated with exits. But both actions must be taken by
4499 * the same process currently executing this flow of
4500 * instructions.
4501 *
4502 * In addition, the following queue lock guarantees that
4503 * bfqq_group(bfqq) exists as well.
4504 */
4505 spin_lock_irq(q->queue_lock);
4506 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4507 if (idle_timer_disabled)
4508 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4509 spin_unlock_irq(q->queue_lock);
4510}
4511#else
4512static inline void bfq_update_insert_stats(struct request_queue *q,
4513 struct bfq_queue *bfqq,
4514 bool idle_timer_disabled,
4515 unsigned int cmd_flags) {}
4516#endif
4517
4279static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 4518static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4280 bool at_head) 4519 bool at_head)
4281{ 4520{
4282 struct request_queue *q = hctx->queue; 4521 struct request_queue *q = hctx->queue;
4283 struct bfq_data *bfqd = q->elevator->elevator_data; 4522 struct bfq_data *bfqd = q->elevator->elevator_data;
4284#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4285 struct bfq_queue *bfqq = RQ_BFQQ(rq); 4523 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4286 bool idle_timer_disabled = false; 4524 bool idle_timer_disabled = false;
4287 unsigned int cmd_flags; 4525 unsigned int cmd_flags;
4288#endif
4289 4526
4290 spin_lock_irq(&bfqd->lock); 4527 spin_lock_irq(&bfqd->lock);
4291 if (blk_mq_sched_try_insert_merge(q, rq)) { 4528 if (blk_mq_sched_try_insert_merge(q, rq)) {
@@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4304 else 4541 else
4305 list_add_tail(&rq->queuelist, &bfqd->dispatch); 4542 list_add_tail(&rq->queuelist, &bfqd->dispatch);
4306 } else { 4543 } else {
4307#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4308 idle_timer_disabled = __bfq_insert_request(bfqd, rq); 4544 idle_timer_disabled = __bfq_insert_request(bfqd, rq);
4309 /* 4545 /*
4310 * Update bfqq, because, if a queue merge has occurred 4546 * Update bfqq, because, if a queue merge has occurred
@@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4312 * redirected into a new queue. 4548 * redirected into a new queue.
4313 */ 4549 */
4314 bfqq = RQ_BFQQ(rq); 4550 bfqq = RQ_BFQQ(rq);
4315#else
4316 __bfq_insert_request(bfqd, rq);
4317#endif
4318 4551
4319 if (rq_mergeable(rq)) { 4552 if (rq_mergeable(rq)) {
4320 elv_rqhash_add(q, rq); 4553 elv_rqhash_add(q, rq);
@@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4323 } 4556 }
4324 } 4557 }
4325 4558
4326#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4327 /* 4559 /*
4328 * Cache cmd_flags before releasing scheduler lock, because rq 4560 * Cache cmd_flags before releasing scheduler lock, because rq
4329 * may disappear afterwards (for example, because of a request 4561 * may disappear afterwards (for example, because of a request
4330 * merge). 4562 * merge).
4331 */ 4563 */
4332 cmd_flags = rq->cmd_flags; 4564 cmd_flags = rq->cmd_flags;
4333#endif 4565
4334 spin_unlock_irq(&bfqd->lock); 4566 spin_unlock_irq(&bfqd->lock);
4335 4567
4336#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 4568 bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
4337 if (!bfqq) 4569 cmd_flags);
4338 return;
4339 /*
4340 * bfqq still exists, because it can disappear only after
4341 * either it is merged with another queue, or the process it
4342 * is associated with exits. But both actions must be taken by
4343 * the same process currently executing this flow of
4344 * instruction.
4345 *
4346 * In addition, the following queue lock guarantees that
4347 * bfqq_group(bfqq) exists as well.
4348 */
4349 spin_lock_irq(q->queue_lock);
4350 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4351 if (idle_timer_disabled)
4352 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4353 spin_unlock_irq(q->queue_lock);
4354#endif
4355} 4570}
4356 4571
4357static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, 4572static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
4482 bfq_schedule_dispatch(bfqd); 4697 bfq_schedule_dispatch(bfqd);
4483} 4698}
4484 4699
4485static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) 4700static void bfq_finish_request_body(struct bfq_queue *bfqq)
4486{ 4701{
4487 bfqq->allocated--; 4702 bfqq->allocated--;
4488 4703
@@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)
4512 spin_lock_irqsave(&bfqd->lock, flags); 4727 spin_lock_irqsave(&bfqd->lock, flags);
4513 4728
4514 bfq_completed_request(bfqq, bfqd); 4729 bfq_completed_request(bfqq, bfqd);
4515 bfq_put_rq_priv_body(bfqq); 4730 bfq_finish_request_body(bfqq);
4516 4731
4517 spin_unlock_irqrestore(&bfqd->lock, flags); 4732 spin_unlock_irqrestore(&bfqd->lock, flags);
4518 } else { 4733 } else {
@@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)
4533 bfqg_stats_update_io_remove(bfqq_group(bfqq), 4748 bfqg_stats_update_io_remove(bfqq_group(bfqq),
4534 rq->cmd_flags); 4749 rq->cmd_flags);
4535 } 4750 }
4536 bfq_put_rq_priv_body(bfqq); 4751 bfq_finish_request_body(bfqq);
4537 } 4752 }
4538 4753
4539 rq->elv.priv[0] = NULL; 4754 rq->elv.priv[0] = NULL;
@@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
4818 hrtimer_cancel(&bfqd->idle_slice_timer); 5033 hrtimer_cancel(&bfqd->idle_slice_timer);
4819 5034
4820#ifdef CONFIG_BFQ_GROUP_IOSCHED 5035#ifdef CONFIG_BFQ_GROUP_IOSCHED
5036 /* release oom-queue reference to root group */
5037 bfqg_and_blkg_put(bfqd->root_group);
5038
4821 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); 5039 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
4822#else 5040#else
4823 spin_lock_irq(&bfqd->lock); 5041 spin_lock_irq(&bfqd->lock);
@@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
5206 5424
5207static struct elevator_type iosched_bfq_mq = { 5425static struct elevator_type iosched_bfq_mq = {
5208 .ops.mq = { 5426 .ops.mq = {
5427 .limit_depth = bfq_limit_depth,
5209 .prepare_request = bfq_prepare_request, 5428 .prepare_request = bfq_prepare_request,
5210 .finish_request = bfq_finish_request, 5429 .finish_request = bfq_finish_request,
5211 .exit_icq = bfq_exit_icq, 5430 .exit_icq = bfq_exit_icq,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 91c4390903a1..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
337 * last transition from idle to backlogged. 337 * last transition from idle to backlogged.
338 */ 338 */
339 unsigned long service_from_backlogged; 339 unsigned long service_from_backlogged;
340 /*
341 * Cumulative service received from the @bfq_queue since its
342 * last transition to weight-raised state.
343 */
344 unsigned long service_from_wr;
340 345
341 /* 346 /*
342 * Value of wr start time when switching to soft rt 347 * Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
344 unsigned long wr_start_at_switch_to_srt; 349 unsigned long wr_start_at_switch_to_srt;
345 350
346 unsigned long split_time; /* time of last split */ 351 unsigned long split_time; /* time of last split */
352
353 unsigned long first_IO_time; /* time of first I/O for this queue */
347}; 354};
348 355
349/** 356/**
@@ -627,6 +634,18 @@ struct bfq_data {
627 struct bfq_io_cq *bio_bic; 634 struct bfq_io_cq *bio_bic;
628 /* bfqq associated with the task issuing current bio for merging */ 635 /* bfqq associated with the task issuing current bio for merging */
629 struct bfq_queue *bio_bfqq; 636 struct bfq_queue *bio_bfqq;
637
638 /*
639 * Cached sbitmap shift, used to compute depth limits in
640 * bfq_update_depths.
641 */
642 unsigned int sb_shift;
643
644 /*
645 * Depth limits used in bfq_limit_depth (see comments on the
646 * function)
647 */
648 unsigned int word_depths[2][2];
630}; 649};
631 650
632enum bfqq_state_flags { 651enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index e495d3f9b4b0..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
835 struct bfq_entity *entity = &bfqq->entity; 835 struct bfq_entity *entity = &bfqq->entity;
836 struct bfq_service_tree *st; 836 struct bfq_service_tree *st;
837 837
838 if (!bfqq->service_from_backlogged)
839 bfqq->first_IO_time = jiffies;
840
841 if (bfqq->wr_coeff > 1)
842 bfqq->service_from_wr += served;
843
844 bfqq->service_from_backlogged += served;
838 for_each_entity(entity) { 845 for_each_entity(entity) {
839 st = bfq_entity_service_tree(entity); 846 st = bfq_entity_service_tree(entity);
840 847
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
374/** 374/**
375 * __bio_integrity_endio - Integrity I/O completion function 375 * __bio_integrity_endio - Integrity I/O completion function
376 * @bio: Protected bio 376 * @bio: Protected bio
377 * @error: Pointer to errno
378 * 377 *
379 * Description: Completion for integrity I/O 378 * Description: Completion for integrity I/O
380 * 379 *
diff --git a/block/bio.c b/block/bio.c
index 9ef6cf3addb3..e1708db48258 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -971,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
971EXPORT_SYMBOL(bio_advance); 971EXPORT_SYMBOL(bio_advance);
972 972
973/** 973/**
974 * bio_alloc_pages - allocates a single page for each bvec in a bio
975 * @bio: bio to allocate pages for
976 * @gfp_mask: flags for allocation
977 *
978 * Allocates pages up to @bio->bi_vcnt.
979 *
980 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
981 * freed.
982 */
983int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
984{
985 int i;
986 struct bio_vec *bv;
987
988 bio_for_each_segment_all(bv, bio, i) {
989 bv->bv_page = alloc_page(gfp_mask);
990 if (!bv->bv_page) {
991 while (--bv >= bio->bi_io_vec)
992 __free_page(bv->bv_page);
993 return -ENOMEM;
994 }
995 }
996
997 return 0;
998}
999EXPORT_SYMBOL(bio_alloc_pages);
1000
1001/**
1002 * bio_copy_data - copy contents of data buffers from one chain of bios to 974 * bio_copy_data - copy contents of data buffers from one chain of bios to
1003 * another 975 * another
1004 * @src: source bio list 976 * @src: source bio list
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
1838 bio_advance(bio, split->bi_iter.bi_size); 1810 bio_advance(bio, split->bi_iter.bi_size);
1839 1811
1840 if (bio_flagged(bio, BIO_TRACE_COMPLETION)) 1812 if (bio_flagged(bio, BIO_TRACE_COMPLETION))
1841 bio_set_flag(bio, BIO_TRACE_COMPLETION); 1813 bio_set_flag(split, BIO_TRACE_COMPLETION);
1842 1814
1843 return split; 1815 return split;
1844} 1816}
diff --git a/block/blk-core.c b/block/blk-core.c
index 3ba4326a63b5..a2005a485335 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
126 rq->start_time = jiffies; 126 rq->start_time = jiffies;
127 set_start_time_ns(rq); 127 set_start_time_ns(rq);
128 rq->part = NULL; 128 rq->part = NULL;
129 seqcount_init(&rq->gstate_seq);
130 u64_stats_init(&rq->aborted_gstate_sync);
129} 131}
130EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
131 133
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
699 queue_flag_set(QUEUE_FLAG_DEAD, q); 701 queue_flag_set(QUEUE_FLAG_DEAD, q);
700 spin_unlock_irq(lock); 702 spin_unlock_irq(lock);
701 703
704 /*
705 * make sure all in-progress dispatch are completed because
706 * blk_freeze_queue() can only complete all requests, and
707 * dispatch may still be in-progress since we dispatch requests
708 * from more than one contexts
709 */
710 if (q->mq_ops)
711 blk_mq_quiesce_queue(q);
712
702 /* for synchronous bio-based driver finish in-flight integrity i/o */ 713 /* for synchronous bio-based driver finish in-flight integrity i/o */
703 blk_flush_integrity(); 714 blk_flush_integrity();
704 715
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1646 1657
1647 lockdep_assert_held(q->queue_lock); 1658 lockdep_assert_held(q->queue_lock);
1648 1659
1660 blk_req_zone_write_unlock(req);
1649 blk_pm_put_request(req); 1661 blk_pm_put_request(req);
1650 1662
1651 elv_completed_request(q, req); 1663 elv_completed_request(q, req);
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
2055 2067
2056#endif /* CONFIG_FAIL_MAKE_REQUEST */ 2068#endif /* CONFIG_FAIL_MAKE_REQUEST */
2057 2069
2070static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
2071{
2072 if (part->policy && op_is_write(bio_op(bio))) {
2073 char b[BDEVNAME_SIZE];
2074
2075 printk(KERN_ERR
2076 "generic_make_request: Trying to write "
2077 "to read-only block-device %s (partno %d)\n",
2078 bio_devname(bio, b), part->partno);
2079 return true;
2080 }
2081
2082 return false;
2083}
2084
2058/* 2085/*
2059 * Remap block n of partition p to block n+start(p) of the disk. 2086 * Remap block n of partition p to block n+start(p) of the disk.
2060 */ 2087 */
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
2063 struct hd_struct *p; 2090 struct hd_struct *p;
2064 int ret = 0; 2091 int ret = 0;
2065 2092
2093 rcu_read_lock();
2094 p = __disk_get_part(bio->bi_disk, bio->bi_partno);
2095 if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
2096 bio_check_ro(bio, p))) {
2097 ret = -EIO;
2098 goto out;
2099 }
2100
2066 /* 2101 /*
2067 * Zone reset does not include bi_size so bio_sectors() is always 0. 2102 * Zone reset does not include bi_size so bio_sectors() is always 0.
2068 * Include a test for the reset op code and perform the remap if needed. 2103 * Include a test for the reset op code and perform the remap if needed.
2069 */ 2104 */
2070 if (!bio->bi_partno || 2105 if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
2071 (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) 2106 goto out;
2072 return 0;
2073 2107
2074 rcu_read_lock(); 2108 bio->bi_iter.bi_sector += p->start_sect;
2075 p = __disk_get_part(bio->bi_disk, bio->bi_partno); 2109 bio->bi_partno = 0;
2076 if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { 2110 trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
2077 bio->bi_iter.bi_sector += p->start_sect; 2111 bio->bi_iter.bi_sector - p->start_sect);
2078 bio->bi_partno = 0;
2079 trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
2080 bio->bi_iter.bi_sector - p->start_sect);
2081 } else {
2082 printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
2083 ret = -EIO;
2084 }
2085 rcu_read_unlock();
2086 2112
2113out:
2114 rcu_read_unlock();
2087 return ret; 2115 return ret;
2088} 2116}
2089 2117
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
2142 * For a REQ_NOWAIT based request, return -EOPNOTSUPP 2170 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2143 * if queue is not a request based queue. 2171 * if queue is not a request based queue.
2144 */ 2172 */
2145
2146 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) 2173 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
2147 goto not_supported; 2174 goto not_supported;
2148 2175
2149 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) 2176 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
2150 goto end_io; 2177 goto end_io;
2151 2178
2152 if (blk_partition_remap(bio)) 2179 if (!bio->bi_partno) {
2153 goto end_io; 2180 if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
2181 goto end_io;
2182 } else {
2183 if (blk_partition_remap(bio))
2184 goto end_io;
2185 }
2154 2186
2155 if (bio_check_eod(bio, nr_sectors)) 2187 if (bio_check_eod(bio, nr_sectors))
2156 goto end_io; 2188 goto end_io;
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
2493 * bypass a potential scheduler on the bottom device for 2525 * bypass a potential scheduler on the bottom device for
2494 * insert. 2526 * insert.
2495 */ 2527 */
2496 blk_mq_request_bypass_insert(rq, true); 2528 return blk_mq_request_issue_directly(rq);
2497 return BLK_STS_OK;
2498 } 2529 }
2499 2530
2500 spin_lock_irqsave(q->queue_lock, flags); 2531 spin_lock_irqsave(q->queue_lock, flags);
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
2846 wbt_issue(req->q->rq_wb, &req->issue_stat); 2877 wbt_issue(req->q->rq_wb, &req->issue_stat);
2847 } 2878 }
2848 2879
2849 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); 2880 BUG_ON(blk_rq_is_complete(req));
2850 blk_add_timer(req); 2881 blk_add_timer(req);
2851} 2882}
2852EXPORT_SYMBOL(blk_start_request); 2883EXPORT_SYMBOL(blk_start_request);
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
3415} 3446}
3416EXPORT_SYMBOL(kblockd_mod_delayed_work_on); 3447EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
3417 3448
3418int kblockd_schedule_delayed_work(struct delayed_work *dwork,
3419 unsigned long delay)
3420{
3421 return queue_delayed_work(kblockd_workqueue, dwork, delay);
3422}
3423EXPORT_SYMBOL(kblockd_schedule_delayed_work);
3424
3425int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
3426 unsigned long delay)
3427{
3428 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
3429}
3430EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
3431
3432/** 3449/**
3433 * blk_start_plug - initialize blk_plug and track it inside the task_struct 3450 * blk_start_plug - initialize blk_plug and track it inside the task_struct
3434 * @plug: The &struct blk_plug that needs to be initialized 3451 * @plug: The &struct blk_plug that needs to be initialized
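Several of the blk-core.c hunks above revolve around the new bio_check_ro() helper: a write-class operation aimed at a read-only partition is logged and failed, both on the partition-remap path and for bios that target part0 directly, and the blk-lib.c helpers further down add the matching bdev_read_only() guards that return -EPERM before any bios are built. A rough userspace sketch of the check; the op encoding and field names here are stand-ins, not the kernel's REQ_OP_* or hd_struct definitions:

    #include <stdbool.h>
    #include <stdio.h>

    enum op { OP_READ, OP_WRITE, OP_DISCARD, OP_WRITE_ZEROES };

    static bool op_is_write(enum op op)
    {
        return op != OP_READ;       /* discard/write-zeroes count as writes */
    }

    struct part {
        const char *name;
        int partno;
        bool read_only;             /* "policy" in the kernel's hd_struct */
    };

    static bool check_ro(const struct part *p, enum op op)
    {
        if (p->read_only && op_is_write(op)) {
            fprintf(stderr,
                "trying to write to read-only block device %s (partno %d)\n",
                p->name, p->partno);
            return true;            /* caller fails the request */
        }
        return false;
    }

    int main(void)
    {
        struct part p = { .name = "sda1", .partno = 1, .read_only = true };

        printf("rejected: %d\n", check_ro(&p, OP_WRITE_ZEROES));
        return 0;
    }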
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
61 * be reused after dying flag is set 61 * be reused after dying flag is set
62 */ 62 */
63 if (q->mq_ops) { 63 if (q->mq_ops) {
64 blk_mq_sched_insert_request(rq, at_head, true, false, false); 64 blk_mq_sched_insert_request(rq, at_head, true, false);
65 return; 65 return;
66 } 66 }
67 67
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2bc544ce3d2e..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
37 if (!q) 37 if (!q)
38 return -ENXIO; 38 return -ENXIO;
39 39
40 if (bdev_read_only(bdev))
41 return -EPERM;
42
40 if (flags & BLKDEV_DISCARD_SECURE) { 43 if (flags & BLKDEV_DISCARD_SECURE) {
41 if (!blk_queue_secure_erase(q)) 44 if (!blk_queue_secure_erase(q))
42 return -EOPNOTSUPP; 45 return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
156 if (!q) 159 if (!q)
157 return -ENXIO; 160 return -ENXIO;
158 161
162 if (bdev_read_only(bdev))
163 return -EPERM;
164
159 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; 165 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
160 if ((sector | nr_sects) & bs_mask) 166 if ((sector | nr_sects) & bs_mask)
161 return -EINVAL; 167 return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
233 if (!q) 239 if (!q)
234 return -ENXIO; 240 return -ENXIO;
235 241
242 if (bdev_read_only(bdev))
243 return -EPERM;
244
236 /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ 245 /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
237 max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); 246 max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
238 247
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
287 if (!q) 296 if (!q)
288 return -ENXIO; 297 return -ENXIO;
289 298
299 if (bdev_read_only(bdev))
300 return -EPERM;
301
290 while (nr_sects != 0) { 302 while (nr_sects != 0) {
291 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), 303 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
292 gfp_mask); 304 gfp_mask);
diff --git a/block/blk-map.c b/block/blk-map.c
index d3a94719f03f..db9373bd31ac 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
119 unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); 119 unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
120 struct bio *bio = NULL; 120 struct bio *bio = NULL;
121 struct iov_iter i; 121 struct iov_iter i;
122 int ret; 122 int ret = -EINVAL;
123 123
124 if (!iter_is_iovec(iter)) 124 if (!iter_is_iovec(iter))
125 goto fail; 125 goto fail;
@@ -148,7 +148,7 @@ unmap_rq:
148 __blk_rq_unmap_user(bio); 148 __blk_rq_unmap_user(bio);
149fail: 149fail:
150 rq->bio = NULL; 150 rq->bio = NULL;
151 return -EINVAL; 151 return ret;
152} 152}
153EXPORT_SYMBOL(blk_rq_map_user_iov); 153EXPORT_SYMBOL(blk_rq_map_user_iov);
154 154
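The blk-map.c change above stops blk_rq_map_user_iov() from collapsing every failure into -EINVAL; the error actually hit while mapping is preserved and only the "nothing valid to map" cases keep the -EINVAL default. The pattern in isolation, with copy_chunk() as a made-up stand-in for the failing helper:

    #include <errno.h>
    #include <stdio.h>

    static int copy_chunk(int idx)
    {
        return idx == 2 ? -ENOMEM : 0;      /* pretend the third chunk fails */
    }

    static int map_chunks(int nr)
    {
        int ret = -EINVAL;                  /* default: invalid input */
        int i;

        if (nr <= 0)
            goto fail;

        for (i = 0; i < nr; i++) {
            ret = copy_chunk(i);
            if (ret)
                goto fail;                  /* propagate the real error */
        }
        return 0;

    fail:
        return ret;                         /* -ENOMEM here, not -EINVAL */
    }

    int main(void)
    {
        printf("map_chunks(4) = %d\n", map_chunks(4));
        return 0;
    }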
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5dedd57dff6..8452fc7164cc 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
128 nsegs++; 128 nsegs++;
129 sectors = max_sectors; 129 sectors = max_sectors;
130 } 130 }
131 if (sectors) 131 goto split;
132 goto split;
133 /* Make this single bvec as the 1st segment */
134 } 132 }
135 133
136 if (bvprvp && blk_queue_cluster(q)) { 134 if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
146 bvprvp = &bvprv; 144 bvprvp = &bvprv;
147 sectors += bv.bv_len >> 9; 145 sectors += bv.bv_len >> 9;
148 146
149 if (nsegs == 1 && seg_size > front_seg_size)
150 front_seg_size = seg_size;
151 continue; 147 continue;
152 } 148 }
153new_segment: 149new_segment:
154 if (nsegs == queue_max_segments(q)) 150 if (nsegs == queue_max_segments(q))
155 goto split; 151 goto split;
156 152
153 if (nsegs == 1 && seg_size > front_seg_size)
154 front_seg_size = seg_size;
155
157 nsegs++; 156 nsegs++;
158 bvprv = bv; 157 bvprv = bv;
159 bvprvp = &bvprv; 158 bvprvp = &bvprv;
160 seg_size = bv.bv_len; 159 seg_size = bv.bv_len;
161 sectors += bv.bv_len >> 9; 160 sectors += bv.bv_len >> 9;
162 161
163 if (nsegs == 1 && seg_size > front_seg_size)
164 front_seg_size = seg_size;
165 } 162 }
166 163
167 do_split = false; 164 do_split = false;
@@ -174,6 +171,8 @@ split:
174 bio = new; 171 bio = new;
175 } 172 }
176 173
174 if (nsegs == 1 && seg_size > front_seg_size)
175 front_seg_size = seg_size;
177 bio->bi_seg_front_size = front_seg_size; 176 bio->bi_seg_front_size = front_seg_size;
178 if (seg_size > bio->bi_seg_back_size) 177 if (seg_size > bio->bi_seg_back_size)
179 bio->bi_seg_back_size = seg_size; 178 bio->bi_seg_back_size = seg_size;
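The blk-merge.c hunk moves the front_seg_size update so that the size of the first segment is also captured when the scan ends, or splits, while still inside that first segment, instead of only when a second segment is started. A simplified, self-contained version of the loop, using fixed-length chunks instead of bvecs and a made-up MAX_SEG_SIZE:

    #include <stdio.h>

    #define MAX_SEG_SIZE 16

    static unsigned scan_segments(const unsigned *len, int n, unsigned *front_seg_size)
    {
        unsigned nsegs = 0, seg_size = 0, front = 0;

        for (int i = 0; i < n; i++) {
            if (nsegs && seg_size + len[i] <= MAX_SEG_SIZE) {
                seg_size += len[i];         /* merge into the current segment */
                continue;
            }
            /* starting a new segment: close out the previous one */
            if (nsegs == 1 && seg_size > front)
                front = seg_size;
            nsegs++;
            seg_size = len[i];
        }
        /* the fix: the first segment may also be the last one */
        if (nsegs == 1 && seg_size > front)
            front = seg_size;

        *front_seg_size = front;
        return nsegs;
    }

    int main(void)
    {
        unsigned lens[] = { 4, 4, 20, 8 };
        unsigned front;
        unsigned nsegs = scan_segments(lens, 4, &front);

        printf("nsegs=%u front_seg_size=%u\n", nsegs, front);
        return 0;
    }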
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b56a4f35720d..21cbc1f071c6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
289 RQF_NAME(HASHED), 289 RQF_NAME(HASHED),
290 RQF_NAME(STATS), 290 RQF_NAME(STATS),
291 RQF_NAME(SPECIAL_PAYLOAD), 291 RQF_NAME(SPECIAL_PAYLOAD),
292 RQF_NAME(ZONE_WRITE_LOCKED),
293 RQF_NAME(MQ_TIMEOUT_EXPIRED),
294 RQF_NAME(MQ_POLL_SLEPT),
292}; 295};
293#undef RQF_NAME 296#undef RQF_NAME
294 297
295#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
296static const char *const rqaf_name[] = {
297 RQAF_NAME(COMPLETE),
298 RQAF_NAME(STARTED),
299 RQAF_NAME(POLL_SLEPT),
300};
301#undef RQAF_NAME
302
303int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) 298int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
304{ 299{
305 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; 300 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
316 seq_puts(m, ", .rq_flags="); 311 seq_puts(m, ", .rq_flags=");
317 blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, 312 blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
318 ARRAY_SIZE(rqf_name)); 313 ARRAY_SIZE(rqf_name));
319 seq_puts(m, ", .atomic_flags="); 314 seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
320 blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
321 seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, 315 seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
322 rq->internal_tag); 316 rq->internal_tag);
323 if (mq_ops->show_rq) 317 if (mq_ops->show_rq)
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
409 const struct show_busy_params *params = data; 403 const struct show_busy_params *params = data;
410 404
411 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && 405 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
412 test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 406 blk_mq_rq_state(rq) != MQ_RQ_IDLE)
413 __blk_mq_debugfs_rq_show(params->m, 407 __blk_mq_debugfs_rq_show(params->m,
414 list_entry_rq(&rq->queuelist)); 408 list_entry_rq(&rq->queuelist));
415} 409}
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
703 const struct blk_mq_debugfs_attr *attr = m->private; 697 const struct blk_mq_debugfs_attr *attr = m->private;
704 void *data = d_inode(file->f_path.dentry->d_parent)->i_private; 698 void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
705 699
706 if (!attr->write) 700 /*
701 * Attributes that only implement .seq_ops are read-only and 'attr' is
702 * the same with 'data' in this case.
703 */
704 if (attr == data || !attr->write)
707 return -EPERM; 705 return -EPERM;
708 706
709 return attr->write(data, buf, count, ppos); 707 return attr->write(data, buf, count, ppos);
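The blk-mq-debugfs.c fix above matters because attributes that only provide .seq_ops register the attribute itself as the inode's private data, so a write would have treated the attribute as the queue or hctx object. A small userspace analogue of the guard, with simplified stand-in structures rather than the real blk_mq_debugfs_attr:

    #include <errno.h>
    #include <stddef.h>
    #include <stdio.h>

    struct dbg_attr {
        const char *name;
        int (*show)(void *data);
        int (*write)(void *data, const char *buf, size_t len);
    };

    static int attr_write(const struct dbg_attr *attr, void *data,
                          const char *buf, size_t len)
    {
        /*
         * For seq_ops-only attributes the attribute doubles as the private
         * data, hence the attr == data comparison; either way, a missing
         * ->write handler means the file is read-only.
         */
        if ((const void *)attr == data || !attr->write)
            return -EPERM;

        return attr->write(data, buf, len);
    }

    static int show_state(void *data) { (void)data; return 0; }

    int main(void)
    {
        struct dbg_attr ro = { .name = "state", .show = show_state };

        printf("write to %s: %d\n", ro.name, attr_write(&ro, &ro, "x", 1));
        return 0;
    }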
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c117bd8fd1f6..55c0a745b427 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
172 WRITE_ONCE(hctx->dispatch_from, ctx); 172 WRITE_ONCE(hctx->dispatch_from, ctx);
173} 173}
174 174
175/* return true if hw queue need to be run again */
176void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) 175void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
177{ 176{
178 struct request_queue *q = hctx->queue; 177 struct request_queue *q = hctx->queue;
@@ -428,7 +427,7 @@ done:
428} 427}
429 428
430void blk_mq_sched_insert_request(struct request *rq, bool at_head, 429void blk_mq_sched_insert_request(struct request *rq, bool at_head,
431 bool run_queue, bool async, bool can_block) 430 bool run_queue, bool async)
432{ 431{
433 struct request_queue *q = rq->q; 432 struct request_queue *q = rq->q;
434 struct elevator_queue *e = q->elevator; 433 struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
19 19
20void blk_mq_sched_insert_request(struct request *rq, bool at_head, 20void blk_mq_sched_insert_request(struct request *rq, bool at_head,
21 bool run_queue, bool async, bool can_block); 21 bool run_queue, bool async);
22void blk_mq_sched_insert_requests(struct request_queue *q, 22void blk_mq_sched_insert_requests(struct request_queue *q,
23 struct blk_mq_ctx *ctx, 23 struct blk_mq_ctx *ctx,
24 struct list_head *list, bool run_queue_async); 24 struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
248 return ret; 248 return ret;
249} 249}
250 250
251static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 251void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
252{ 252{
253 struct blk_mq_hw_ctx *hctx; 253 struct blk_mq_hw_ctx *hctx;
254 int i; 254 int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
265 q->mq_sysfs_init_done = false; 265 q->mq_sysfs_init_done = false;
266} 266}
267 267
268void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
269{
270 mutex_lock(&q->sysfs_lock);
271 __blk_mq_unregister_dev(dev, q);
272 mutex_unlock(&q->sysfs_lock);
273}
274
275void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) 268void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
276{ 269{
277 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 270 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c81b40ecd3f1..336dde07b230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
134 ws = bt_wait_ptr(bt, data->hctx); 134 ws = bt_wait_ptr(bt, data->hctx);
135 drop_ctx = data->ctx == NULL; 135 drop_ctx = data->ctx == NULL;
136 do { 136 do {
137 prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
138
139 tag = __blk_mq_get_tag(data, bt);
140 if (tag != -1)
141 break;
142
143 /* 137 /*
144 * We're out of tags on this hardware queue, kick any 138 * We're out of tags on this hardware queue, kick any
145 * pending IO submits before going to sleep waiting for 139 * pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
155 if (tag != -1) 149 if (tag != -1)
156 break; 150 break;
157 151
152 prepare_to_wait_exclusive(&ws->wait, &wait,
153 TASK_UNINTERRUPTIBLE);
154
155 tag = __blk_mq_get_tag(data, bt);
156 if (tag != -1)
157 break;
158
158 if (data->ctx) 159 if (data->ctx)
159 blk_mq_put_ctx(data->ctx); 160 blk_mq_put_ctx(data->ctx);
160 161
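The blk-mq-tag.c reordering above does two things: it kicks the hardware queue before going to sleep, in case running pending submissions frees a tag, and it switches to an exclusive wait so a freed tag wakes only one waiter instead of the whole herd. Below is a loose pthread analogue of that shape; the mutex/condvar pair only stands in for the wait-queue discipline and does not reproduce the lockless prepare-then-recheck ordering the kernel relies on, and kick_pending_io() is a no-op placeholder for running the queue:

    #include <pthread.h>
    #include <stdio.h>

    struct tag_pool {
        pthread_mutex_t lock;
        pthread_cond_t free_tag;       /* one signal per freed tag */
        int available;
    };

    /* placeholder for "run the hw queue so pending I/O may free a tag" */
    static void kick_pending_io(struct tag_pool *p) { (void)p; }

    static int try_get_tag(struct tag_pool *p)
    {
        int ok;

        pthread_mutex_lock(&p->lock);
        ok = p->available > 0;
        if (ok)
            p->available--;
        pthread_mutex_unlock(&p->lock);
        return ok ? 0 : -1;
    }

    static int get_tag(struct tag_pool *p)
    {
        if (try_get_tag(p) == 0)
            return 0;

        /* kick first: a queue run may free a tag and spare us the sleep */
        kick_pending_io(p);
        if (try_get_tag(p) == 0)
            return 0;

        /* sleep; pthread_cond_signal() in put_tag() wakes a single waiter,
         * much like prepare_to_wait_exclusive() limits wakeups to one task */
        pthread_mutex_lock(&p->lock);
        while (p->available == 0)
            pthread_cond_wait(&p->free_tag, &p->lock);
        p->available--;
        pthread_mutex_unlock(&p->lock);
        return 0;
    }

    static void put_tag(struct tag_pool *p)
    {
        pthread_mutex_lock(&p->lock);
        p->available++;
        pthread_mutex_unlock(&p->lock);
        pthread_cond_signal(&p->free_tag);
    }

    int main(void)
    {
        static struct tag_pool p = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .free_tag = PTHREAD_COND_INITIALIZER,
            .available = 1,
        };

        get_tag(&p);
        put_tag(&p);
        printf("available=%d\n", p.available);
        return 0;
    }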
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3d3797327491..01f271d40825 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
95{ 95{
96 struct mq_inflight *mi = priv; 96 struct mq_inflight *mi = priv;
97 97
98 if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && 98 if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
99 !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
100 /* 99 /*
101 * index[0] counts the specific partition that was asked 100 * index[0] counts the specific partition that was asked
102 * for. index[1] counts the ones that are active on the 101 * for. index[1] counts the ones that are active on the
@@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
222 221
223 queue_for_each_hw_ctx(q, hctx, i) { 222 queue_for_each_hw_ctx(q, hctx, i) {
224 if (hctx->flags & BLK_MQ_F_BLOCKING) 223 if (hctx->flags & BLK_MQ_F_BLOCKING)
225 synchronize_srcu(hctx->queue_rq_srcu); 224 synchronize_srcu(hctx->srcu);
226 else 225 else
227 rcu = true; 226 rcu = true;
228 } 227 }
@@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
272{ 271{
273 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 272 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
274 struct request *rq = tags->static_rqs[tag]; 273 struct request *rq = tags->static_rqs[tag];
275 274 req_flags_t rq_flags = 0;
276 rq->rq_flags = 0;
277 275
278 if (data->flags & BLK_MQ_REQ_INTERNAL) { 276 if (data->flags & BLK_MQ_REQ_INTERNAL) {
279 rq->tag = -1; 277 rq->tag = -1;
280 rq->internal_tag = tag; 278 rq->internal_tag = tag;
281 } else { 279 } else {
282 if (blk_mq_tag_busy(data->hctx)) { 280 if (blk_mq_tag_busy(data->hctx)) {
283 rq->rq_flags = RQF_MQ_INFLIGHT; 281 rq_flags = RQF_MQ_INFLIGHT;
284 atomic_inc(&data->hctx->nr_active); 282 atomic_inc(&data->hctx->nr_active);
285 } 283 }
286 rq->tag = tag; 284 rq->tag = tag;
@@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
288 data->hctx->tags->rqs[rq->tag] = rq; 286 data->hctx->tags->rqs[rq->tag] = rq;
289 } 287 }
290 288
291 INIT_LIST_HEAD(&rq->queuelist);
292 /* csd/requeue_work/fifo_time is initialized before use */ 289 /* csd/requeue_work/fifo_time is initialized before use */
293 rq->q = data->q; 290 rq->q = data->q;
294 rq->mq_ctx = data->ctx; 291 rq->mq_ctx = data->ctx;
292 rq->rq_flags = rq_flags;
293 rq->cpu = -1;
295 rq->cmd_flags = op; 294 rq->cmd_flags = op;
296 if (data->flags & BLK_MQ_REQ_PREEMPT) 295 if (data->flags & BLK_MQ_REQ_PREEMPT)
297 rq->rq_flags |= RQF_PREEMPT; 296 rq->rq_flags |= RQF_PREEMPT;
298 if (blk_queue_io_stat(data->q)) 297 if (blk_queue_io_stat(data->q))
299 rq->rq_flags |= RQF_IO_STAT; 298 rq->rq_flags |= RQF_IO_STAT;
300 /* do not touch atomic flags, it needs atomic ops against the timer */ 299 INIT_LIST_HEAD(&rq->queuelist);
301 rq->cpu = -1;
302 INIT_HLIST_NODE(&rq->hash); 300 INIT_HLIST_NODE(&rq->hash);
303 RB_CLEAR_NODE(&rq->rb_node); 301 RB_CLEAR_NODE(&rq->rb_node);
304 rq->rq_disk = NULL; 302 rq->rq_disk = NULL;
305 rq->part = NULL; 303 rq->part = NULL;
306 rq->start_time = jiffies; 304 rq->start_time = jiffies;
307#ifdef CONFIG_BLK_CGROUP
308 rq->rl = NULL;
309 set_start_time_ns(rq);
310 rq->io_start_time_ns = 0;
311#endif
312 rq->nr_phys_segments = 0; 305 rq->nr_phys_segments = 0;
313#if defined(CONFIG_BLK_DEV_INTEGRITY) 306#if defined(CONFIG_BLK_DEV_INTEGRITY)
314 rq->nr_integrity_segments = 0; 307 rq->nr_integrity_segments = 0;
@@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
316 rq->special = NULL; 309 rq->special = NULL;
317 /* tag was already set */ 310 /* tag was already set */
318 rq->extra_len = 0; 311 rq->extra_len = 0;
312 rq->__deadline = 0;
319 313
320 INIT_LIST_HEAD(&rq->timeout_list); 314 INIT_LIST_HEAD(&rq->timeout_list);
321 rq->timeout = 0; 315 rq->timeout = 0;
@@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
324 rq->end_io_data = NULL; 318 rq->end_io_data = NULL;
325 rq->next_rq = NULL; 319 rq->next_rq = NULL;
326 320
321#ifdef CONFIG_BLK_CGROUP
322 rq->rl = NULL;
323 set_start_time_ns(rq);
324 rq->io_start_time_ns = 0;
325#endif
326
327 data->ctx->rq_dispatched[op_is_sync(op)]++; 327 data->ctx->rq_dispatched[op_is_sync(op)]++;
328 return rq; 328 return rq;
329} 329}
@@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
443 blk_queue_exit(q); 443 blk_queue_exit(q);
444 return ERR_PTR(-EXDEV); 444 return ERR_PTR(-EXDEV);
445 } 445 }
446 cpu = cpumask_first(alloc_data.hctx->cpumask); 446 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
447 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 447 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
448 448
449 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 449 rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq)
485 if (blk_rq_rl(rq)) 485 if (blk_rq_rl(rq))
486 blk_put_rl(blk_rq_rl(rq)); 486 blk_put_rl(blk_rq_rl(rq));
487 487
488 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 488 blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
489 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
490 if (rq->tag != -1) 489 if (rq->tag != -1)
491 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 490 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
492 if (sched_tag != -1) 491 if (sched_tag != -1)
@@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq)
532 bool shared = false; 531 bool shared = false;
533 int cpu; 532 int cpu;
534 533
534 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
535 blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
536
535 if (rq->internal_tag != -1) 537 if (rq->internal_tag != -1)
536 blk_mq_sched_completed_request(rq); 538 blk_mq_sched_completed_request(rq);
537 if (rq->rq_flags & RQF_STATS) { 539 if (rq->rq_flags & RQF_STATS) {
@@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq)
559 put_cpu(); 561 put_cpu();
560} 562}
561 563
564static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
565 __releases(hctx->srcu)
566{
567 if (!(hctx->flags & BLK_MQ_F_BLOCKING))
568 rcu_read_unlock();
569 else
570 srcu_read_unlock(hctx->srcu, srcu_idx);
571}
572
573static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
574 __acquires(hctx->srcu)
575{
576 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
577 /* shut up gcc false positive */
578 *srcu_idx = 0;
579 rcu_read_lock();
580 } else
581 *srcu_idx = srcu_read_lock(hctx->srcu);
582}
583
584static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
585{
586 unsigned long flags;
587
588 /*
589 * blk_mq_rq_aborted_gstate() is used from the completion path and
590 * can thus be called from irq context. u64_stats_fetch in the
591 * middle of update on the same CPU leads to lockup. Disable irq
592 * while updating.
593 */
594 local_irq_save(flags);
595 u64_stats_update_begin(&rq->aborted_gstate_sync);
596 rq->aborted_gstate = gstate;
597 u64_stats_update_end(&rq->aborted_gstate_sync);
598 local_irq_restore(flags);
599}
600
601static u64 blk_mq_rq_aborted_gstate(struct request *rq)
602{
603 unsigned int start;
604 u64 aborted_gstate;
605
606 do {
607 start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
608 aborted_gstate = rq->aborted_gstate;
609 } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
610
611 return aborted_gstate;
612}
613
562/** 614/**
563 * blk_mq_complete_request - end I/O on a request 615 * blk_mq_complete_request - end I/O on a request
564 * @rq: the request being processed 616 * @rq: the request being processed
@@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq)
570void blk_mq_complete_request(struct request *rq) 622void blk_mq_complete_request(struct request *rq)
571{ 623{
572 struct request_queue *q = rq->q; 624 struct request_queue *q = rq->q;
625 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
626 int srcu_idx;
573 627
574 if (unlikely(blk_should_fake_timeout(q))) 628 if (unlikely(blk_should_fake_timeout(q)))
575 return; 629 return;
576 if (!blk_mark_rq_complete(rq)) 630
631 /*
632 * If @rq->aborted_gstate equals the current instance, timeout is
633 * claiming @rq and we lost. This is synchronized through
634 * hctx_lock(). See blk_mq_timeout_work() for details.
635 *
636 * Completion path never blocks and we can directly use RCU here
637 * instead of hctx_lock() which can be either RCU or SRCU.
638 * However, that would complicate paths which want to synchronize
639 * against us. Let stay in sync with the issue path so that
640 * hctx_lock() covers both issue and completion paths.
641 */
642 hctx_lock(hctx, &srcu_idx);
643 if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
577 __blk_mq_complete_request(rq); 644 __blk_mq_complete_request(rq);
645 hctx_unlock(hctx, srcu_idx);
578} 646}
579EXPORT_SYMBOL(blk_mq_complete_request); 647EXPORT_SYMBOL(blk_mq_complete_request);
580 648
581int blk_mq_request_started(struct request *rq) 649int blk_mq_request_started(struct request *rq)
582{ 650{
583 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 651 return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
584} 652}
585EXPORT_SYMBOL_GPL(blk_mq_request_started); 653EXPORT_SYMBOL_GPL(blk_mq_request_started);
586 654
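Most of the blk-mq.c changes in this series revolve around ->gstate: the low bits hold the request state and the rest form a generation that advances each time the request is put in flight, so a timeout that recorded an older generation in ->aborted_gstate can no longer claim a reused request. The sketch below shows only the packing idea; the bit widths and helpers are illustrative, and the real code additionally protects the fields with gstate_seq and the u64_stats sync for 32-bit safety:

    #include <stdint.h>
    #include <stdio.h>

    #define MQ_RQ_STATE_BITS 2
    #define MQ_RQ_STATE_MASK ((1ULL << MQ_RQ_STATE_BITS) - 1)
    #define MQ_RQ_GEN_INC    (1ULL << MQ_RQ_STATE_BITS)

    enum rq_state { RQ_IDLE = 0, RQ_IN_FLIGHT = 1, RQ_COMPLETE = 2 };

    static inline unsigned rq_state(uint64_t gstate)
    {
        return gstate & MQ_RQ_STATE_MASK;
    }

    static inline uint64_t rq_set_state(uint64_t gstate, enum rq_state st)
    {
        gstate &= ~MQ_RQ_STATE_MASK;
        /* entering IN_FLIGHT starts a new "generation" of the request */
        if (st == RQ_IN_FLIGHT)
            gstate += MQ_RQ_GEN_INC;
        return gstate | st;
    }

    int main(void)
    {
        uint64_t gstate = 0;                  /* idle, generation 0 */
        uint64_t aborted;

        gstate = rq_set_state(gstate, RQ_IN_FLIGHT);
        aborted = gstate;                     /* timeout claims this instance */
        gstate = rq_set_state(gstate, RQ_IDLE);
        gstate = rq_set_state(gstate, RQ_IN_FLIGHT);  /* request reused */

        /* the stale claim no longer matches the live generation */
        printf("timeout still owns rq: %s (state=%u)\n",
               aborted == gstate ? "yes" : "no", rq_state(gstate));
        return 0;
    }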
@@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq)
598 wbt_issue(q->rq_wb, &rq->issue_stat); 666 wbt_issue(q->rq_wb, &rq->issue_stat);
599 } 667 }
600 668
601 blk_add_timer(rq); 669 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
602
603 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
604 670
605 /* 671 /*
606 * Mark us as started and clear complete. Complete might have been 672 * Mark @rq in-flight which also advances the generation number,
607 * set if requeue raced with timeout, which then marked it as 673 * and register for timeout. Protect with a seqcount to allow the
608 * complete. So be sure to clear complete again when we start 674 * timeout path to read both @rq->gstate and @rq->deadline
609 * the request, otherwise we'll ignore the completion event. 675 * coherently.
610 * 676 *
611 * Ensure that ->deadline is visible before we set STARTED, such that 677 * This is the only place where a request is marked in-flight. If
612 * blk_mq_check_expired() is guaranteed to observe our ->deadline when 678 * the timeout path reads an in-flight @rq->gstate, the
613 * it observes STARTED. 679 * @rq->deadline it reads together under @rq->gstate_seq is
680 * guaranteed to be the matching one.
614 */ 681 */
615 smp_wmb(); 682 preempt_disable();
616 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 683 write_seqcount_begin(&rq->gstate_seq);
617 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { 684
618 /* 685 blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
619 * Coherence order guarantees these consecutive stores to a 686 blk_add_timer(rq);
620 * single variable propagate in the specified order. Thus the 687
621 * clear_bit() is ordered _after_ the set bit. See 688 write_seqcount_end(&rq->gstate_seq);
622 * blk_mq_check_expired(). 689 preempt_enable();
623 *
624 * (the bits must be part of the same byte for this to be
625 * true).
626 */
627 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
628 }
629 690
630 if (q->dma_drain_size && blk_rq_bytes(rq)) { 691 if (q->dma_drain_size && blk_rq_bytes(rq)) {
631 /* 692 /*
@@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq)
639EXPORT_SYMBOL(blk_mq_start_request); 700EXPORT_SYMBOL(blk_mq_start_request);
640 701
641/* 702/*
642 * When we reach here because queue is busy, REQ_ATOM_COMPLETE 703 * When we reach here because queue is busy, it's safe to change the state
643 * flag isn't set yet, so there may be race with timeout handler, 704 * to IDLE without checking @rq->aborted_gstate because we should still be
644 * but given rq->deadline is just set in .queue_rq() under 705 * holding the RCU read lock and thus protected against timeout.
645 * this situation, the race won't be possible in reality because
646 * rq->timeout should be set as big enough to cover the window
647 * between blk_mq_start_request() called from .queue_rq() and
648 * clearing REQ_ATOM_STARTED here.
649 */ 706 */
650static void __blk_mq_requeue_request(struct request *rq) 707static void __blk_mq_requeue_request(struct request *rq)
651{ 708{
@@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq)
657 wbt_requeue(q->rq_wb, &rq->issue_stat); 714 wbt_requeue(q->rq_wb, &rq->issue_stat);
658 blk_mq_sched_requeue_request(rq); 715 blk_mq_sched_requeue_request(rq);
659 716
660 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 717 if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
718 blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
661 if (q->dma_drain_size && blk_rq_bytes(rq)) 719 if (q->dma_drain_size && blk_rq_bytes(rq))
662 rq->nr_phys_segments--; 720 rq->nr_phys_segments--;
663 } 721 }
@@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
689 747
690 rq->rq_flags &= ~RQF_SOFTBARRIER; 748 rq->rq_flags &= ~RQF_SOFTBARRIER;
691 list_del_init(&rq->queuelist); 749 list_del_init(&rq->queuelist);
692 blk_mq_sched_insert_request(rq, true, false, false, true); 750 blk_mq_sched_insert_request(rq, true, false, false);
693 } 751 }
694 752
695 while (!list_empty(&rq_list)) { 753 while (!list_empty(&rq_list)) {
696 rq = list_entry(rq_list.next, struct request, queuelist); 754 rq = list_entry(rq_list.next, struct request, queuelist);
697 list_del_init(&rq->queuelist); 755 list_del_init(&rq->queuelist);
698 blk_mq_sched_insert_request(rq, false, false, false, true); 756 blk_mq_sched_insert_request(rq, false, false, false);
699 } 757 }
700 758
701 blk_mq_run_hw_queues(q, false); 759 blk_mq_run_hw_queues(q, false);
@@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
729 787
730void blk_mq_kick_requeue_list(struct request_queue *q) 788void blk_mq_kick_requeue_list(struct request_queue *q)
731{ 789{
732 kblockd_schedule_delayed_work(&q->requeue_work, 0); 790 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
733} 791}
734EXPORT_SYMBOL(blk_mq_kick_requeue_list); 792EXPORT_SYMBOL(blk_mq_kick_requeue_list);
735 793
@@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
755struct blk_mq_timeout_data { 813struct blk_mq_timeout_data {
756 unsigned long next; 814 unsigned long next;
757 unsigned int next_set; 815 unsigned int next_set;
816 unsigned int nr_expired;
758}; 817};
759 818
760void blk_mq_rq_timed_out(struct request *req, bool reserved) 819static void blk_mq_rq_timed_out(struct request *req, bool reserved)
761{ 820{
762 const struct blk_mq_ops *ops = req->q->mq_ops; 821 const struct blk_mq_ops *ops = req->q->mq_ops;
763 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 822 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
764 823
765 /* 824 req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
766 * We know that complete is set at this point. If STARTED isn't set
767 * anymore, then the request isn't active and the "timeout" should
768 * just be ignored. This can happen due to the bitflag ordering.
769 * Timeout first checks if STARTED is set, and if it is, assumes
770 * the request is active. But if we race with completion, then
771 * both flags will get cleared. So check here again, and ignore
772 * a timeout event with a request that isn't active.
773 */
774 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
775 return;
776 825
777 if (ops->timeout) 826 if (ops->timeout)
778 ret = ops->timeout(req, reserved); 827 ret = ops->timeout(req, reserved);
@@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
782 __blk_mq_complete_request(req); 831 __blk_mq_complete_request(req);
783 break; 832 break;
784 case BLK_EH_RESET_TIMER: 833 case BLK_EH_RESET_TIMER:
834 /*
835 * As nothing prevents from completion happening while
836 * ->aborted_gstate is set, this may lead to ignored
837 * completions and further spurious timeouts.
838 */
839 blk_mq_rq_update_aborted_gstate(req, 0);
785 blk_add_timer(req); 840 blk_add_timer(req);
786 blk_clear_rq_complete(req);
787 break; 841 break;
788 case BLK_EH_NOT_HANDLED: 842 case BLK_EH_NOT_HANDLED:
789 break; 843 break;
@@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
797 struct request *rq, void *priv, bool reserved) 851 struct request *rq, void *priv, bool reserved)
798{ 852{
799 struct blk_mq_timeout_data *data = priv; 853 struct blk_mq_timeout_data *data = priv;
800 unsigned long deadline; 854 unsigned long gstate, deadline;
855 int start;
801 856
802 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 857 might_sleep();
803 return;
804 858
805 /* 859 if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
806 * Ensures that if we see STARTED we must also see our 860 return;
807 * up-to-date deadline, see blk_mq_start_request().
808 */
809 smp_rmb();
810 861
811 deadline = READ_ONCE(rq->deadline); 862 /* read coherent snapshots of @rq->state_gen and @rq->deadline */
863 while (true) {
864 start = read_seqcount_begin(&rq->gstate_seq);
865 gstate = READ_ONCE(rq->gstate);
866 deadline = blk_rq_deadline(rq);
867 if (!read_seqcount_retry(&rq->gstate_seq, start))
868 break;
869 cond_resched();
870 }
812 871
813 /* 872 /* if in-flight && overdue, mark for abortion */
814 * The rq being checked may have been freed and reallocated 873 if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
815 * out already here, we avoid this race by checking rq->deadline 874 time_after_eq(jiffies, deadline)) {
816 * and REQ_ATOM_COMPLETE flag together: 875 blk_mq_rq_update_aborted_gstate(rq, gstate);
817 * 876 data->nr_expired++;
818 * - if rq->deadline is observed as new value because of 877 hctx->nr_expired++;
819 * reusing, the rq won't be timed out because of timing.
820 * - if rq->deadline is observed as previous value,
821 * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
822 * because we put a barrier between setting rq->deadline
823 * and clearing the flag in blk_mq_start_request(), so
824 * this rq won't be timed out too.
825 */
826 if (time_after_eq(jiffies, deadline)) {
827 if (!blk_mark_rq_complete(rq)) {
828 /*
829 * Again coherence order ensures that consecutive reads
830 * from the same variable must be in that order. This
831 * ensures that if we see COMPLETE clear, we must then
832 * see STARTED set and we'll ignore this timeout.
833 *
834 * (There's also the MB implied by the test_and_clear())
835 */
836 blk_mq_rq_timed_out(rq, reserved);
837 }
838 } else if (!data->next_set || time_after(data->next, deadline)) { 878 } else if (!data->next_set || time_after(data->next, deadline)) {
839 data->next = deadline; 879 data->next = deadline;
840 data->next_set = 1; 880 data->next_set = 1;
841 } 881 }
842} 882}
843 883
884static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
885 struct request *rq, void *priv, bool reserved)
886{
887 /*
888 * We marked @rq->aborted_gstate and waited for RCU. If there were
889 * completions that we lost to, they would have finished and
890 * updated @rq->gstate by now; otherwise, the completion path is
891 * now guaranteed to see @rq->aborted_gstate and yield. If
892 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
893 */
894 if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
895 READ_ONCE(rq->gstate) == rq->aborted_gstate)
896 blk_mq_rq_timed_out(rq, reserved);
897}
898
844static void blk_mq_timeout_work(struct work_struct *work) 899static void blk_mq_timeout_work(struct work_struct *work)
845{ 900{
846 struct request_queue *q = 901 struct request_queue *q =
@@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
848 struct blk_mq_timeout_data data = { 903 struct blk_mq_timeout_data data = {
849 .next = 0, 904 .next = 0,
850 .next_set = 0, 905 .next_set = 0,
906 .nr_expired = 0,
851 }; 907 };
908 struct blk_mq_hw_ctx *hctx;
852 int i; 909 int i;
853 910
854 /* A deadlock might occur if a request is stuck requiring a 911 /* A deadlock might occur if a request is stuck requiring a
@@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work)
867 if (!percpu_ref_tryget(&q->q_usage_counter)) 924 if (!percpu_ref_tryget(&q->q_usage_counter))
868 return; 925 return;
869 926
927 /* scan for the expired ones and set their ->aborted_gstate */
870 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 928 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
871 929
930 if (data.nr_expired) {
931 bool has_rcu = false;
932
933 /*
934 * Wait till everyone sees ->aborted_gstate. The
935 * sequential waits for SRCUs aren't ideal. If this ever
936 * becomes a problem, we can add per-hw_ctx rcu_head and
937 * wait in parallel.
938 */
939 queue_for_each_hw_ctx(q, hctx, i) {
940 if (!hctx->nr_expired)
941 continue;
942
943 if (!(hctx->flags & BLK_MQ_F_BLOCKING))
944 has_rcu = true;
945 else
946 synchronize_srcu(hctx->srcu);
947
948 hctx->nr_expired = 0;
949 }
950 if (has_rcu)
951 synchronize_rcu();
952
953 /* terminate the ones we won */
954 blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
955 }
956
872 if (data.next_set) { 957 if (data.next_set) {
873 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 958 data.next = blk_rq_timeout(round_jiffies_up(data.next));
874 mod_timer(&q->timeout, data.next); 959 mod_timer(&q->timeout, data.next);
875 } else { 960 } else {
876 struct blk_mq_hw_ctx *hctx; 961 /*
877 962 * Request timeouts are handled as a forward rolling timer. If
963 * we end up here it means that no requests are pending and
964 * also that no request has been pending for a while. Mark
965 * each hctx as idle.
966 */
878 queue_for_each_hw_ctx(q, hctx, i) { 967 queue_for_each_hw_ctx(q, hctx, i) {
879 /* the hctx may be unmapped, so check it here */ 968 /* the hctx may be unmapped, so check it here */
880 if (blk_mq_hw_queue_mapped(hctx)) 969 if (blk_mq_hw_queue_mapped(hctx))
@@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1010 1099
1011/* 1100/*
1012 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1101 * Mark us waiting for a tag. For shared tags, this involves hooking us into
1013 * the tag wakeups. For non-shared tags, we can simply mark us nedeing a 1102 * the tag wakeups. For non-shared tags, we can simply mark us needing a
1014 * restart. For both caes, take care to check the condition again after 1103 * restart. For both cases, take care to check the condition again after
1015 * marking us as waiting. 1104 * marking us as waiting.
1016 */ 1105 */
1017static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, 1106static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
1018 struct request *rq) 1107 struct request *rq)
1019{ 1108{
1020 struct blk_mq_hw_ctx *this_hctx = *hctx; 1109 struct blk_mq_hw_ctx *this_hctx = *hctx;
1021 bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
1022 struct sbq_wait_state *ws; 1110 struct sbq_wait_state *ws;
1023 wait_queue_entry_t *wait; 1111 wait_queue_entry_t *wait;
1024 bool ret; 1112 bool ret;
1025 1113
1026 if (!shared_tags) { 1114 if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1027 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) 1115 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
1028 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); 1116 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
1029 } else {
1030 wait = &this_hctx->dispatch_wait;
1031 if (!list_empty_careful(&wait->entry))
1032 return false;
1033 1117
1034 spin_lock(&this_hctx->lock); 1118 /*
1035 if (!list_empty(&wait->entry)) { 1119 * It's possible that a tag was freed in the window between the
1036 spin_unlock(&this_hctx->lock); 1120 * allocation failure and adding the hardware queue to the wait
1037 return false; 1121 * queue.
1038 } 1122 *
1123 * Don't clear RESTART here, someone else could have set it.
1124 * At most this will cost an extra queue run.
1125 */
1126 return blk_mq_get_driver_tag(rq, hctx, false);
1127 }
1039 1128
1040 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); 1129 wait = &this_hctx->dispatch_wait;
1041 add_wait_queue(&ws->wait, wait); 1130 if (!list_empty_careful(&wait->entry))
1131 return false;
1132
1133 spin_lock(&this_hctx->lock);
1134 if (!list_empty(&wait->entry)) {
1135 spin_unlock(&this_hctx->lock);
1136 return false;
1042 } 1137 }
1043 1138
1139 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
1140 add_wait_queue(&ws->wait, wait);
1141
1044 /* 1142 /*
1045 * It's possible that a tag was freed in the window between the 1143 * It's possible that a tag was freed in the window between the
1046 * allocation failure and adding the hardware queue to the wait 1144 * allocation failure and adding the hardware queue to the wait
1047 * queue. 1145 * queue.
1048 */ 1146 */
1049 ret = blk_mq_get_driver_tag(rq, hctx, false); 1147 ret = blk_mq_get_driver_tag(rq, hctx, false);
1050 1148 if (!ret) {
1051 if (!shared_tags) {
1052 /*
1053 * Don't clear RESTART here, someone else could have set it.
1054 * At most this will cost an extra queue run.
1055 */
1056 return ret;
1057 } else {
1058 if (!ret) {
1059 spin_unlock(&this_hctx->lock);
1060 return false;
1061 }
1062
1063 /*
1064 * We got a tag, remove ourselves from the wait queue to ensure
1065 * someone else gets the wakeup.
1066 */
1067 spin_lock_irq(&ws->wait.lock);
1068 list_del_init(&wait->entry);
1069 spin_unlock_irq(&ws->wait.lock);
1070 spin_unlock(&this_hctx->lock); 1149 spin_unlock(&this_hctx->lock);
1071 return true; 1150 return false;
1072 } 1151 }
1152
1153 /*
1154 * We got a tag, remove ourselves from the wait queue to ensure
1155 * someone else gets the wakeup.
1156 */
1157 spin_lock_irq(&ws->wait.lock);
1158 list_del_init(&wait->entry);
1159 spin_unlock_irq(&ws->wait.lock);
1160 spin_unlock(&this_hctx->lock);
1161
1162 return true;
1073} 1163}
1074 1164
1075bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, 1165bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1206 /* 1296 /*
1207 * We should be running this queue from one of the CPUs that 1297 * We should be running this queue from one of the CPUs that
1208 * are mapped to it. 1298 * are mapped to it.
1299 *
1300 * There are at least two related races now between setting
1301 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
1302 * __blk_mq_run_hw_queue():
1303 *
1304 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
1305 * but later it becomes online, then this warning is harmless
1306 * at all
1307 *
1308 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
1309 * but later it becomes offline, then the warning can't be
1310 * triggered, and we depend on blk-mq timeout handler to
1311 * handle dispatched requests to this hctx
1209 */ 1312 */
1210 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 1313 if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1211 cpu_online(hctx->next_cpu)); 1314 cpu_online(hctx->next_cpu)) {
1315 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
1316 raw_smp_processor_id(),
1317 cpumask_empty(hctx->cpumask) ? "inactive": "active");
1318 dump_stack();
1319 }
1212 1320
1213 /* 1321 /*
1214 * We can't run the queue inline with ints disabled. Ensure that 1322 * We can't run the queue inline with ints disabled. Ensure that
@@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1216 */ 1324 */
1217 WARN_ON_ONCE(in_interrupt()); 1325 WARN_ON_ONCE(in_interrupt());
1218 1326
1219 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1327 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1220 rcu_read_lock();
1221 blk_mq_sched_dispatch_requests(hctx);
1222 rcu_read_unlock();
1223 } else {
1224 might_sleep();
1225 1328
1226 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1329 hctx_lock(hctx, &srcu_idx);
1227 blk_mq_sched_dispatch_requests(hctx); 1330 blk_mq_sched_dispatch_requests(hctx);
1228 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1331 hctx_unlock(hctx, srcu_idx);
1229 }
1230} 1332}
1231 1333
1232/* 1334/*
@@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1237 */ 1339 */
1238static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1340static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1239{ 1341{
1342 bool tried = false;
1343
1240 if (hctx->queue->nr_hw_queues == 1) 1344 if (hctx->queue->nr_hw_queues == 1)
1241 return WORK_CPU_UNBOUND; 1345 return WORK_CPU_UNBOUND;
1242 1346
1243 if (--hctx->next_cpu_batch <= 0) { 1347 if (--hctx->next_cpu_batch <= 0) {
1244 int next_cpu; 1348 int next_cpu;
1245 1349select_cpu:
1246 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 1350 next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
1351 cpu_online_mask);
1247 if (next_cpu >= nr_cpu_ids) 1352 if (next_cpu >= nr_cpu_ids)
1248 next_cpu = cpumask_first(hctx->cpumask); 1353 next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
1249 1354
1250 hctx->next_cpu = next_cpu; 1355 /*
1356 * No online CPU is found, so have to make sure hctx->next_cpu
1357 * is set correctly for not breaking workqueue.
1358 */
1359 if (next_cpu >= nr_cpu_ids)
1360 hctx->next_cpu = cpumask_first(hctx->cpumask);
1361 else
1362 hctx->next_cpu = next_cpu;
1251 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1363 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1252 } 1364 }
1253 1365
1366 /*
1367 * Do unbound schedule if we can't find a online CPU for this hctx,
1368 * and it should only happen in the path of handling CPU DEAD.
1369 */
1370 if (!cpu_online(hctx->next_cpu)) {
1371 if (!tried) {
1372 tried = true;
1373 goto select_cpu;
1374 }
1375
1376 /*
1377 * Make sure to re-select CPU next time once after CPUs
1378 * in hctx->cpumask become online again.
1379 */
1380 hctx->next_cpu_batch = 1;
1381 return WORK_CPU_UNBOUND;
1382 }
1254 return hctx->next_cpu; 1383 return hctx->next_cpu;
1255} 1384}
1256 1385
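The blk_mq_hctx_next_cpu() rework above restricts the round-robin to CPUs that are both in the hctx's map and currently online, retries the selection once, and falls back to an unbound work item when nothing suitable is left, for example while CPUs in the map are being unplugged. A self-contained sketch of the selection itself, using plain bitmaps instead of struct cpumask, a local sentinel for the unbound case, and ignoring the batching counter:

    #include <stdio.h>

    #define NR_CPUS 8
    #define CPU_UNBOUND (-1)   /* sentinel: let the workqueue pick a CPU */

    static int next_cpu_and(unsigned mask, unsigned online, int prev)
    {
        int cpu;

        /* like cpumask_next_and(): first bit after prev set in both masks */
        for (cpu = prev + 1; cpu < NR_CPUS; cpu++)
            if ((mask & online) & (1u << cpu))
                return cpu;
        /* wrap around, like cpumask_first_and() */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            if ((mask & online) & (1u << cpu))
                return cpu;
        return NR_CPUS;                       /* no online CPU in the mask */
    }

    static int pick_next_cpu(unsigned hctx_mask, unsigned online, int *next_cpu)
    {
        int cpu = next_cpu_and(hctx_mask, online, *next_cpu);

        if (cpu >= NR_CPUS)
            return CPU_UNBOUND;               /* e.g. during CPU hot-unplug */

        *next_cpu = cpu;
        return cpu;
    }

    int main(void)
    {
        unsigned hctx_mask = 0x0c;            /* CPUs 2 and 3 */
        unsigned online    = 0x0b;            /* CPUs 0, 1 and 3 online */
        int next_cpu = -1;

        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));
        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));

        online = 0x03;                        /* CPU 3 goes offline */
        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));
        return 0;
    }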
@@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1274 put_cpu(); 1403 put_cpu();
1275 } 1404 }
1276 1405
1277 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1406 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1278 &hctx->run_work, 1407 msecs_to_jiffies(msecs));
1279 msecs_to_jiffies(msecs));
1280} 1408}
1281 1409
1282void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1410void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1287 1415
1288bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1416bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1289{ 1417{
1290 if (blk_mq_hctx_has_pending(hctx)) { 1418 int srcu_idx;
1419 bool need_run;
1420
1421 /*
1422 * When queue is quiesced, we may be switching io scheduler, or
1423 * updating nr_hw_queues, or other things, and we can't run queue
1424 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1425 *
1426 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1427 * quiesced.
1428 */
1429 hctx_lock(hctx, &srcu_idx);
1430 need_run = !blk_queue_quiesced(hctx->queue) &&
1431 blk_mq_hctx_has_pending(hctx);
1432 hctx_unlock(hctx, srcu_idx);
1433
1434 if (need_run) {
1291 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1435 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1292 return true; 1436 return true;
1293 } 1437 }
@@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1595 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); 1739 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1596} 1740}
1597 1741
1598static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1742static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1599 struct request *rq, 1743 struct request *rq,
1600 blk_qc_t *cookie, bool may_sleep) 1744 blk_qc_t *cookie)
1601{ 1745{
1602 struct request_queue *q = rq->q; 1746 struct request_queue *q = rq->q;
1603 struct blk_mq_queue_data bd = { 1747 struct blk_mq_queue_data bd = {
@@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1606 }; 1750 };
1607 blk_qc_t new_cookie; 1751 blk_qc_t new_cookie;
1608 blk_status_t ret; 1752 blk_status_t ret;
1753
1754 new_cookie = request_to_qc_t(hctx, rq);
1755
1756 /*
1757 * For OK queue, we are done. For error, caller may kill it.
1758 * Any other error (busy), just add it to our list as we
1759 * previously would have done.
1760 */
1761 ret = q->mq_ops->queue_rq(hctx, &bd);
1762 switch (ret) {
1763 case BLK_STS_OK:
1764 *cookie = new_cookie;
1765 break;
1766 case BLK_STS_RESOURCE:
1767 __blk_mq_requeue_request(rq);
1768 break;
1769 default:
1770 *cookie = BLK_QC_T_NONE;
1771 break;
1772 }
1773
1774 return ret;
1775}
1776
1777static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1778 struct request *rq,
1779 blk_qc_t *cookie,
1780 bool bypass_insert)
1781{
1782 struct request_queue *q = rq->q;
1609 bool run_queue = true; 1783 bool run_queue = true;
1610 1784
1611 /* RCU or SRCU read lock is needed before checking quiesced flag */ 1785 /*
1786 * RCU or SRCU read lock is needed before checking quiesced flag.
1787 *
1788 * When queue is stopped or quiesced, ignore 'bypass_insert' from
1789 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
1790 * and avoid driver to try to dispatch again.
1791 */
1612 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 1792 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1613 run_queue = false; 1793 run_queue = false;
1794 bypass_insert = false;
1614 goto insert; 1795 goto insert;
1615 } 1796 }
1616 1797
1617 if (q->elevator) 1798 if (q->elevator && !bypass_insert)
1618 goto insert; 1799 goto insert;
1619 1800
1620 if (!blk_mq_get_driver_tag(rq, NULL, false)) 1801 if (!blk_mq_get_driver_tag(rq, NULL, false))
@@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1625 goto insert; 1806 goto insert;
1626 } 1807 }
1627 1808
1628 new_cookie = request_to_qc_t(hctx, rq); 1809 return __blk_mq_issue_directly(hctx, rq, cookie);
1629
1630 /*
1631 * For OK queue, we are done. For error, kill it. Any other
1632 * error (busy), just add it to our list as we previously
1633 * would have done
1634 */
1635 ret = q->mq_ops->queue_rq(hctx, &bd);
1636 switch (ret) {
1637 case BLK_STS_OK:
1638 *cookie = new_cookie;
1639 return;
1640 case BLK_STS_RESOURCE:
1641 __blk_mq_requeue_request(rq);
1642 goto insert;
1643 default:
1644 *cookie = BLK_QC_T_NONE;
1645 blk_mq_end_request(rq, ret);
1646 return;
1647 }
1648
1649insert: 1810insert:
1650 blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); 1811 if (bypass_insert)
1812 return BLK_STS_RESOURCE;
1813
1814 blk_mq_sched_insert_request(rq, false, run_queue, false);
1815 return BLK_STS_OK;
1651} 1816}
1652 1817
1653static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1818static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1654 struct request *rq, blk_qc_t *cookie) 1819 struct request *rq, blk_qc_t *cookie)
1655{ 1820{
1656 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1821 blk_status_t ret;
1657 rcu_read_lock(); 1822 int srcu_idx;
1658 __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1659 rcu_read_unlock();
1660 } else {
1661 unsigned int srcu_idx;
1662 1823
1663 might_sleep(); 1824 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1664 1825
1665 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1826 hctx_lock(hctx, &srcu_idx);
1666 __blk_mq_try_issue_directly(hctx, rq, cookie, true); 1827
1667 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1828 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1668 } 1829 if (ret == BLK_STS_RESOURCE)
1830 blk_mq_sched_insert_request(rq, false, true, false);
1831 else if (ret != BLK_STS_OK)
1832 blk_mq_end_request(rq, ret);
1833
1834 hctx_unlock(hctx, srcu_idx);
1835}
1836
1837blk_status_t blk_mq_request_issue_directly(struct request *rq)
1838{
1839 blk_status_t ret;
1840 int srcu_idx;
1841 blk_qc_t unused_cookie;
1842 struct blk_mq_ctx *ctx = rq->mq_ctx;
1843 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1844
1845 hctx_lock(hctx, &srcu_idx);
1846 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
1847 hctx_unlock(hctx, srcu_idx);
1848
1849 return ret;
1669} 1850}
1670 1851
1671static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1852static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
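The restructuring above splits the old direct-issue helper so that blk_insert_cloned_request(), via the new blk_mq_request_issue_directly(), can get BLK_STS_RESOURCE back and let a stacked driver such as device-mapper requeue, while the normal make_request path keeps falling back to a scheduler insert. A compressed sketch of that control flow, with the two levels collapsed into one helper and stand-in status values:

    #include <stdio.h>

    enum status { STS_OK, STS_RESOURCE, STS_IOERR };

    static void scheduler_insert(int rq)     { printf("rq %d -> scheduler\n", rq); }
    static void end_request(int rq, int err) { printf("rq %d ended, err %d\n", rq, err); }

    /* pretend odd-numbered requests hit a busy driver, rq 3 a hard error */
    static enum status driver_queue_rq(int rq)
    {
        if (rq == 3)
            return STS_IOERR;
        return (rq & 1) ? STS_RESOURCE : STS_OK;
    }

    static enum status try_issue_directly(int rq, int bypass_insert)
    {
        enum status ret = driver_queue_rq(rq);

        if (ret == STS_RESOURCE && bypass_insert)
            return STS_RESOURCE;        /* stacked caller handles the requeue */
        if (ret == STS_RESOURCE) {
            scheduler_insert(rq);       /* normal submission path falls back */
            return STS_OK;
        }
        if (ret != STS_OK)
            end_request(rq, ret);       /* hard error: fail the request */
        return ret;
    }

    int main(void)
    {
        /* make_request-style caller: never sees STS_RESOURCE */
        printf("make_request: %d\n", try_issue_directly(1, 0));
        /* cloned-request-style caller: gets the busy status back */
        printf("cloned insert: %d\n", try_issue_directly(1, 1));
        /* hard driver error is ended, not requeued */
        printf("hard error: %d\n", try_issue_directly(3, 0));
        return 0;
    }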
@@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1776 } else if (q->elevator) { 1957 } else if (q->elevator) {
1777 blk_mq_put_ctx(data.ctx); 1958 blk_mq_put_ctx(data.ctx);
1778 blk_mq_bio_to_request(rq, bio); 1959 blk_mq_bio_to_request(rq, bio);
1779 blk_mq_sched_insert_request(rq, false, true, true, true); 1960 blk_mq_sched_insert_request(rq, false, true, true);
1780 } else { 1961 } else {
1781 blk_mq_put_ctx(data.ctx); 1962 blk_mq_put_ctx(data.ctx);
1782 blk_mq_bio_to_request(rq, bio); 1963 blk_mq_bio_to_request(rq, bio);
@@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order)
1869 return (size_t)PAGE_SIZE << order; 2050 return (size_t)PAGE_SIZE << order;
1870} 2051}
1871 2052
2053static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2054 unsigned int hctx_idx, int node)
2055{
2056 int ret;
2057
2058 if (set->ops->init_request) {
2059 ret = set->ops->init_request(set, rq, hctx_idx, node);
2060 if (ret)
2061 return ret;
2062 }
2063
2064 seqcount_init(&rq->gstate_seq);
2065 u64_stats_init(&rq->aborted_gstate_sync);
2066 return 0;
2067}
2068
1872int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2069int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1873 unsigned int hctx_idx, unsigned int depth) 2070 unsigned int hctx_idx, unsigned int depth)
1874{ 2071{
@@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1930 struct request *rq = p; 2127 struct request *rq = p;
1931 2128
1932 tags->static_rqs[i] = rq; 2129 tags->static_rqs[i] = rq;
1933 if (set->ops->init_request) { 2130 if (blk_mq_init_request(set, rq, hctx_idx, node)) {
1934 if (set->ops->init_request(set, rq, hctx_idx, 2131 tags->static_rqs[i] = NULL;
1935 node)) { 2132 goto fail;
1936 tags->static_rqs[i] = NULL;
1937 goto fail;
1938 }
1939 } 2133 }
1940 2134
1941 p += rq_size; 2135 p += rq_size;
@@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
1994{ 2188{
1995 blk_mq_debugfs_unregister_hctx(hctx); 2189 blk_mq_debugfs_unregister_hctx(hctx);
1996 2190
1997 blk_mq_tag_idle(hctx); 2191 if (blk_mq_hw_queue_mapped(hctx))
2192 blk_mq_tag_idle(hctx);
1998 2193
1999 if (set->ops->exit_request) 2194 if (set->ops->exit_request)
2000 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 2195 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
@@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
2005 set->ops->exit_hctx(hctx, hctx_idx); 2200 set->ops->exit_hctx(hctx, hctx_idx);
2006 2201
2007 if (hctx->flags & BLK_MQ_F_BLOCKING) 2202 if (hctx->flags & BLK_MQ_F_BLOCKING)
2008 cleanup_srcu_struct(hctx->queue_rq_srcu); 2203 cleanup_srcu_struct(hctx->srcu);
2009 2204
2010 blk_mq_remove_cpuhp(hctx); 2205 blk_mq_remove_cpuhp(hctx);
2011 blk_free_flush_queue(hctx->fq); 2206 blk_free_flush_queue(hctx->fq);
@@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
2074 if (!hctx->fq) 2269 if (!hctx->fq)
2075 goto sched_exit_hctx; 2270 goto sched_exit_hctx;
2076 2271
2077 if (set->ops->init_request && 2272 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2078 set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
2079 node))
2080 goto free_fq; 2273 goto free_fq;
2081 2274
2082 if (hctx->flags & BLK_MQ_F_BLOCKING) 2275 if (hctx->flags & BLK_MQ_F_BLOCKING)
2083 init_srcu_struct(hctx->queue_rq_srcu); 2276 init_srcu_struct(hctx->srcu);
2084 2277
2085 blk_mq_debugfs_register_hctx(q, hctx); 2278 blk_mq_debugfs_register_hctx(q, hctx);
2086 2279
@@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
2116 INIT_LIST_HEAD(&__ctx->rq_list); 2309 INIT_LIST_HEAD(&__ctx->rq_list);
2117 __ctx->queue = q; 2310 __ctx->queue = q;
2118 2311
2119 /* If the cpu isn't present, the cpu is mapped to first hctx */
2120 if (!cpu_present(i))
2121 continue;
2122
2123 hctx = blk_mq_map_queue(q, i);
2124
2125 /* 2312 /*
2126 * Set local node, IFF we have more than one hw queue. If 2313 * Set local node, IFF we have more than one hw queue. If
2127 * not, we remain on the home node of the device 2314 * not, we remain on the home node of the device
2128 */ 2315 */
2316 hctx = blk_mq_map_queue(q, i);
2129 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2317 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2130 hctx->numa_node = local_memory_node(cpu_to_node(i)); 2318 hctx->numa_node = local_memory_node(cpu_to_node(i));
2131 } 2319 }
@@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2182 * 2370 *
2183 * If the cpu isn't present, the cpu is mapped to first hctx. 2371 * If the cpu isn't present, the cpu is mapped to first hctx.
2184 */ 2372 */
2185 for_each_present_cpu(i) { 2373 for_each_possible_cpu(i) {
2186 hctx_idx = q->mq_map[i]; 2374 hctx_idx = q->mq_map[i];
2187 /* unmapped hw queue can be remapped after CPU topo changed */ 2375 /* unmapped hw queue can be remapped after CPU topo changed */
2188 if (!set->tags[hctx_idx] && 2376 if (!set->tags[hctx_idx] &&
@@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2236 /* 2424 /*
2237 * Initialize batch roundrobin counts 2425 * Initialize batch roundrobin counts
2238 */ 2426 */
2239 hctx->next_cpu = cpumask_first(hctx->cpumask); 2427 hctx->next_cpu = cpumask_first_and(hctx->cpumask,
2428 cpu_online_mask);
2240 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2429 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2241 } 2430 }
2242} 2431}
@@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2369{ 2558{
2370 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2559 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2371 2560
2372 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), 2561 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2373 __alignof__(struct blk_mq_hw_ctx)) != 2562 __alignof__(struct blk_mq_hw_ctx)) !=
2374 sizeof(struct blk_mq_hw_ctx)); 2563 sizeof(struct blk_mq_hw_ctx));
2375 2564
@@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2386 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2575 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2387 2576
2388 blk_mq_sysfs_unregister(q); 2577 blk_mq_sysfs_unregister(q);
2578
2579 /* protect against switching io scheduler */
2580 mutex_lock(&q->sysfs_lock);
2389 for (i = 0; i < set->nr_hw_queues; i++) { 2581 for (i = 0; i < set->nr_hw_queues; i++) {
2390 int node; 2582 int node;
2391 2583
@@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2430 } 2622 }
2431 } 2623 }
2432 q->nr_hw_queues = i; 2624 q->nr_hw_queues = i;
2625 mutex_unlock(&q->sysfs_lock);
2433 blk_mq_sysfs_register(q); 2626 blk_mq_sysfs_register(q);
2434} 2627}
2435 2628
@@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2601 2794
2602static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2795static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2603{ 2796{
2604 if (set->ops->map_queues) 2797 if (set->ops->map_queues) {
2798 int cpu;
2799 /*
2800 * transport .map_queues is usually done in the following
2801 * way:
2802 *
2803 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2804 * mask = get_cpu_mask(queue)
2805 * for_each_cpu(cpu, mask)
2806 * set->mq_map[cpu] = queue;
2807 * }
2808 *
2809 * When we need to remap, the table has to be cleared for
2810 * killing stale mapping since one CPU may not be mapped
2811 * to any hw queue.
2812 */
2813 for_each_possible_cpu(cpu)
2814 set->mq_map[cpu] = 0;
2815
2605 return set->ops->map_queues(set); 2816 return set->ops->map_queues(set);
2606 else 2817 } else
2607 return blk_mq_map_queues(set); 2818 return blk_mq_map_queues(set);
2608} 2819}
2609 2820
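The comment added to blk_mq_update_queue_map() explains why set->mq_map is cleared before calling the driver's .map_queues: after a topology change a possible CPU may no longer be claimed by any hardware queue, and a stale entry would keep pointing at one. A small standalone sketch of that remap rule, with made-up names standing in for the tag-set fields:

/*
 * Standalone model (assumed names, not the kernel API) of why mq_map is
 * zeroed before the driver remaps: unclaimed CPUs must fall back to
 * hardware queue 0 rather than keep a stale mapping.
 */
#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

static unsigned int mq_map[NR_CPUS];

/* A driver-style mapping: each queue claims the CPUs in its mask. */
static void driver_map_queues(const unsigned int *queue_of_cpu, int nr_cpus)
{
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		unsigned int q = queue_of_cpu[cpu];

		if (q != (unsigned int)-1)	/* -1: CPU not claimed */
			mq_map[cpu] = q;
	}
}

static void remap(const unsigned int *queue_of_cpu)
{
	/* Kill stale mappings first; unclaimed CPUs land on queue 0. */
	memset(mq_map, 0, sizeof(mq_map));
	driver_map_queues(queue_of_cpu, NR_CPUS);
}

int main(void)
{
	unsigned int before[NR_CPUS] = { 0, 0, 1, 1 };
	unsigned int after[NR_CPUS]  = { 0, 0, 0, (unsigned int)-1 };

	remap(before);
	remap(after);	/* CPU 3 no longer claimed: mapped to queue 0 */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> hw queue %u\n", cpu, mq_map[cpu]);
	return 0;
}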
@@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2712 return -EINVAL; 2923 return -EINVAL;
2713 2924
2714 blk_mq_freeze_queue(q); 2925 blk_mq_freeze_queue(q);
2926 blk_mq_quiesce_queue(q);
2715 2927
2716 ret = 0; 2928 ret = 0;
2717 queue_for_each_hw_ctx(q, hctx, i) { 2929 queue_for_each_hw_ctx(q, hctx, i) {
@@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2735 if (!ret) 2947 if (!ret)
2736 q->nr_requests = nr; 2948 q->nr_requests = nr;
2737 2949
2950 blk_mq_unquiesce_queue(q);
2738 blk_mq_unfreeze_queue(q); 2951 blk_mq_unfreeze_queue(q);
2739 2952
2740 return ret; 2953 return ret;
@@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2850 unsigned int nsecs; 3063 unsigned int nsecs;
2851 ktime_t kt; 3064 ktime_t kt;
2852 3065
2853 if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) 3066 if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
2854 return false; 3067 return false;
2855 3068
2856 /* 3069 /*
@@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2870 if (!nsecs) 3083 if (!nsecs)
2871 return false; 3084 return false;
2872 3085
2873 set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 3086 rq->rq_flags |= RQF_MQ_POLL_SLEPT;
2874 3087
2875 /* 3088 /*
2876 * This will be replaced with the stats tracking code, using 3089 * This will be replaced with the stats tracking code, using
@@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2884 3097
2885 hrtimer_init_sleeper(&hs, current); 3098 hrtimer_init_sleeper(&hs, current);
2886 do { 3099 do {
2887 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 3100 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
2888 break; 3101 break;
2889 set_current_state(TASK_UNINTERRUPTIBLE); 3102 set_current_state(TASK_UNINTERRUPTIBLE);
2890 hrtimer_start_expires(&hs.timer, mode); 3103 hrtimer_start_expires(&hs.timer, mode);
@@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2970 3183
2971static int __init blk_mq_init(void) 3184static int __init blk_mq_init(void)
2972{ 3185{
2973 /*
2974 * See comment in block/blk.h rq_atomic_flags enum
2975 */
2976 BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
2977 (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
2978
2979 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 3186 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2980 blk_mq_hctx_notify_dead); 3187 blk_mq_hctx_notify_dead);
2981 return 0; 3188 return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf62..88c558f71819 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,20 @@ struct blk_mq_ctx {
27 struct kobject kobj; 27 struct kobject kobj;
28} ____cacheline_aligned_in_smp; 28} ____cacheline_aligned_in_smp;
29 29
30/*
31 * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
32 * and the upper bits the generation number.
33 */
34enum mq_rq_state {
35 MQ_RQ_IDLE = 0,
36 MQ_RQ_IN_FLIGHT = 1,
37 MQ_RQ_COMPLETE = 2,
38
39 MQ_RQ_STATE_BITS = 2,
40 MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
41 MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
42};
43
30void blk_mq_freeze_queue(struct request_queue *q); 44void blk_mq_freeze_queue(struct request_queue *q);
31void blk_mq_free_queue(struct request_queue *q); 45void blk_mq_free_queue(struct request_queue *q);
32int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 46int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
60void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 74void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
61 struct list_head *list); 75 struct list_head *list);
62 76
77/* Used by blk_insert_cloned_request() to issue request directly */
78blk_status_t blk_mq_request_issue_directly(struct request *rq);
79
63/* 80/*
64 * CPU -> queue mappings 81 * CPU -> queue mappings
65 */ 82 */
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
81extern void blk_mq_sysfs_unregister(struct request_queue *q); 98extern void blk_mq_sysfs_unregister(struct request_queue *q);
82extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); 99extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
83 100
84extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
85
86void blk_mq_release(struct request_queue *q); 101void blk_mq_release(struct request_queue *q);
87 102
103/**
104 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
105 * @rq: target request.
106 */
107static inline int blk_mq_rq_state(struct request *rq)
108{
109 return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
110}
111
112/**
113 * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
114 * @rq: target request.
115 * @state: new state to set.
116 *
117 * Set @rq's state to @state. The caller is responsible for ensuring that
118 * there are no other updaters. A request can transition into IN_FLIGHT
119 * only from IDLE and doing so increments the generation number.
120 */
121static inline void blk_mq_rq_update_state(struct request *rq,
122 enum mq_rq_state state)
123{
124 u64 old_val = READ_ONCE(rq->gstate);
125 u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
126
127 if (state == MQ_RQ_IN_FLIGHT) {
128 WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
129 new_val += MQ_RQ_GEN_INC;
130 }
131
132 /* avoid exposing interim values */
133 WRITE_ONCE(rq->gstate, new_val);
134}
135
88static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 136static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
89 unsigned int cpu) 137 unsigned int cpu)
90{ 138{
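blk-mq.h now packs a request's MQ_RQ_* state into the low two bits of rq->gstate and a generation counter into the remaining bits, bumping the generation on every IDLE to IN_FLIGHT transition so the reworked timeout code can tell a reused request from the one it armed a timer for. A standalone sketch of that encoding follows (plain loads and stores standing in for READ_ONCE()/WRITE_ONCE(), illustrative names only):

/*
 * Standalone sketch of the gstate encoding above: low two bits hold the
 * state, the rest a generation counter bumped on each IDLE -> IN_FLIGHT
 * transition.
 */
#include <stdint.h>
#include <stdio.h>

enum rq_state { RQ_IDLE = 0, RQ_IN_FLIGHT = 1, RQ_COMPLETE = 2 };

#define RQ_STATE_BITS	2
#define RQ_STATE_MASK	((1u << RQ_STATE_BITS) - 1)
#define RQ_GEN_INC	(1u << RQ_STATE_BITS)

static inline unsigned int rq_state(uint64_t gstate)
{
	return gstate & RQ_STATE_MASK;
}

static inline uint64_t rq_gen(uint64_t gstate)
{
	return gstate >> RQ_STATE_BITS;
}

static uint64_t rq_update_state(uint64_t gstate, enum rq_state state)
{
	uint64_t new_val = (gstate & ~(uint64_t)RQ_STATE_MASK) | state;

	/* Only the IDLE -> IN_FLIGHT transition starts a new generation. */
	if (state == RQ_IN_FLIGHT)
		new_val += RQ_GEN_INC;
	return new_val;
}

int main(void)
{
	uint64_t g = 0;	/* IDLE, generation 0 */

	g = rq_update_state(g, RQ_IN_FLIGHT);
	g = rq_update_state(g, RQ_COMPLETE);
	g = rq_update_state(g, RQ_IDLE);
	g = rq_update_state(g, RQ_IN_FLIGHT);	/* request reused */

	printf("state=%u generation=%llu\n", rq_state(g),
	       (unsigned long long)rq_gen(g));	/* state=1 generation=2 */
	return 0;
}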
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 870484eaed1f..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
853 .release = blk_release_queue, 853 .release = blk_release_queue,
854}; 854};
855 855
856/**
857 * blk_register_queue - register a block layer queue with sysfs
858 * @disk: Disk of which the request queue should be registered with sysfs.
859 */
856int blk_register_queue(struct gendisk *disk) 860int blk_register_queue(struct gendisk *disk)
857{ 861{
858 int ret; 862 int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
909 if (q->request_fn || (q->mq_ops && q->elevator)) { 913 if (q->request_fn || (q->mq_ops && q->elevator)) {
910 ret = elv_register_queue(q); 914 ret = elv_register_queue(q);
911 if (ret) { 915 if (ret) {
916 mutex_unlock(&q->sysfs_lock);
912 kobject_uevent(&q->kobj, KOBJ_REMOVE); 917 kobject_uevent(&q->kobj, KOBJ_REMOVE);
913 kobject_del(&q->kobj); 918 kobject_del(&q->kobj);
914 blk_trace_remove_sysfs(dev); 919 blk_trace_remove_sysfs(dev);
915 kobject_put(&dev->kobj); 920 kobject_put(&dev->kobj);
916 goto unlock; 921 return ret;
917 } 922 }
918 } 923 }
919 ret = 0; 924 ret = 0;
@@ -921,7 +926,15 @@ unlock:
921 mutex_unlock(&q->sysfs_lock); 926 mutex_unlock(&q->sysfs_lock);
922 return ret; 927 return ret;
923} 928}
929EXPORT_SYMBOL_GPL(blk_register_queue);
924 930
931/**
932 * blk_unregister_queue - counterpart of blk_register_queue()
933 * @disk: Disk of which the request queue should be unregistered from sysfs.
934 *
935 * Note: the caller is responsible for guaranteeing that this function is called
936 * after blk_register_queue() has finished.
937 */
925void blk_unregister_queue(struct gendisk *disk) 938void blk_unregister_queue(struct gendisk *disk)
926{ 939{
927 struct request_queue *q = disk->queue; 940 struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
929 if (WARN_ON(!q)) 942 if (WARN_ON(!q))
930 return; 943 return;
931 944
932 mutex_lock(&q->sysfs_lock); 945 /* Return early if disk->queue was never registered. */
933 queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); 946 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
934 mutex_unlock(&q->sysfs_lock); 947 return;
935 948
936 wbt_exit(q); 949 /*
950 * Since sysfs_remove_dir() prevents adding new directory entries
951 * before removal of existing entries starts, protect against
952 * concurrent elv_iosched_store() calls.
953 */
954 mutex_lock(&q->sysfs_lock);
937 955
956 spin_lock_irq(q->queue_lock);
957 queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
958 spin_unlock_irq(q->queue_lock);
938 959
960 /*
961 * Remove the sysfs attributes before unregistering the queue data
962 * structures that can be modified through sysfs.
963 */
939 if (q->mq_ops) 964 if (q->mq_ops)
940 blk_mq_unregister_dev(disk_to_dev(disk), q); 965 blk_mq_unregister_dev(disk_to_dev(disk), q);
941 966 mutex_unlock(&q->sysfs_lock);
942 if (q->request_fn || (q->mq_ops && q->elevator))
943 elv_unregister_queue(q);
944 967
945 kobject_uevent(&q->kobj, KOBJ_REMOVE); 968 kobject_uevent(&q->kobj, KOBJ_REMOVE);
946 kobject_del(&q->kobj); 969 kobject_del(&q->kobj);
947 blk_trace_remove_sysfs(disk_to_dev(disk)); 970 blk_trace_remove_sysfs(disk_to_dev(disk));
971
972 wbt_exit(q);
973
974 mutex_lock(&q->sysfs_lock);
975 if (q->request_fn || (q->mq_ops && q->elevator))
976 elv_unregister_queue(q);
977 mutex_unlock(&q->sysfs_lock);
978
948 kobject_put(&disk_to_dev(disk)->kobj); 979 kobject_put(&disk_to_dev(disk)->kobj);
949} 980}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d19f416d6101..c5a131673733 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,9 +216,9 @@ struct throtl_data
216 216
217 unsigned int scale; 217 unsigned int scale;
218 218
219 struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; 219 struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
220 struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; 220 struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
221 struct latency_bucket __percpu *latency_buckets; 221 struct latency_bucket __percpu *latency_buckets[2];
222 unsigned long last_calculate_time; 222 unsigned long last_calculate_time;
223 unsigned long filtered_latency; 223 unsigned long filtered_latency;
224 224
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
1511 .seq_show = blkg_print_stat_bytes, 1511 .seq_show = blkg_print_stat_bytes,
1512 }, 1512 },
1513 { 1513 {
1514 .name = "throttle.io_service_bytes_recursive",
1515 .private = (unsigned long)&blkcg_policy_throtl,
1516 .seq_show = blkg_print_stat_bytes_recursive,
1517 },
1518 {
1514 .name = "throttle.io_serviced", 1519 .name = "throttle.io_serviced",
1515 .private = (unsigned long)&blkcg_policy_throtl, 1520 .private = (unsigned long)&blkcg_policy_throtl,
1516 .seq_show = blkg_print_stat_ios, 1521 .seq_show = blkg_print_stat_ios,
1517 }, 1522 },
1523 {
1524 .name = "throttle.io_serviced_recursive",
1525 .private = (unsigned long)&blkcg_policy_throtl,
1526 .seq_show = blkg_print_stat_ios_recursive,
1527 },
1518 { } /* terminate */ 1528 { } /* terminate */
1519}; 1529};
1520 1530
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
2040#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2050#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2041static void throtl_update_latency_buckets(struct throtl_data *td) 2051static void throtl_update_latency_buckets(struct throtl_data *td)
2042{ 2052{
2043 struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; 2053 struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
2044 int i, cpu; 2054 int i, cpu, rw;
2045 unsigned long last_latency = 0; 2055 unsigned long last_latency[2] = { 0 };
2046 unsigned long latency; 2056 unsigned long latency[2];
2047 2057
2048 if (!blk_queue_nonrot(td->queue)) 2058 if (!blk_queue_nonrot(td->queue))
2049 return; 2059 return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
2052 td->last_calculate_time = jiffies; 2062 td->last_calculate_time = jiffies;
2053 2063
2054 memset(avg_latency, 0, sizeof(avg_latency)); 2064 memset(avg_latency, 0, sizeof(avg_latency));
2055 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 2065 for (rw = READ; rw <= WRITE; rw++) {
2056 struct latency_bucket *tmp = &td->tmp_buckets[i]; 2066 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2057 2067 struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
2058 for_each_possible_cpu(cpu) { 2068
2059 struct latency_bucket *bucket; 2069 for_each_possible_cpu(cpu) {
2060 2070 struct latency_bucket *bucket;
2061 /* this isn't race free, but ok in practice */ 2071
2062 bucket = per_cpu_ptr(td->latency_buckets, cpu); 2072 /* this isn't race free, but ok in practice */
2063 tmp->total_latency += bucket[i].total_latency; 2073 bucket = per_cpu_ptr(td->latency_buckets[rw],
2064 tmp->samples += bucket[i].samples; 2074 cpu);
2065 bucket[i].total_latency = 0; 2075 tmp->total_latency += bucket[i].total_latency;
2066 bucket[i].samples = 0; 2076 tmp->samples += bucket[i].samples;
2067 } 2077 bucket[i].total_latency = 0;
2078 bucket[i].samples = 0;
2079 }
2068 2080
2069 if (tmp->samples >= 32) { 2081 if (tmp->samples >= 32) {
2070 int samples = tmp->samples; 2082 int samples = tmp->samples;
2071 2083
2072 latency = tmp->total_latency; 2084 latency[rw] = tmp->total_latency;
2073 2085
2074 tmp->total_latency = 0; 2086 tmp->total_latency = 0;
2075 tmp->samples = 0; 2087 tmp->samples = 0;
2076 latency /= samples; 2088 latency[rw] /= samples;
2077 if (latency == 0) 2089 if (latency[rw] == 0)
2078 continue; 2090 continue;
2079 avg_latency[i].latency = latency; 2091 avg_latency[rw][i].latency = latency[rw];
2092 }
2080 } 2093 }
2081 } 2094 }
2082 2095
2083 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 2096 for (rw = READ; rw <= WRITE; rw++) {
2084 if (!avg_latency[i].latency) { 2097 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2085 if (td->avg_buckets[i].latency < last_latency) 2098 if (!avg_latency[rw][i].latency) {
2086 td->avg_buckets[i].latency = last_latency; 2099 if (td->avg_buckets[rw][i].latency < last_latency[rw])
2087 continue; 2100 td->avg_buckets[rw][i].latency =
2088 } 2101 last_latency[rw];
2102 continue;
2103 }
2089 2104
2090 if (!td->avg_buckets[i].valid) 2105 if (!td->avg_buckets[rw][i].valid)
2091 latency = avg_latency[i].latency; 2106 latency[rw] = avg_latency[rw][i].latency;
2092 else 2107 else
2093 latency = (td->avg_buckets[i].latency * 7 + 2108 latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
2094 avg_latency[i].latency) >> 3; 2109 avg_latency[rw][i].latency) >> 3;
2095 2110
2096 td->avg_buckets[i].latency = max(latency, last_latency); 2111 td->avg_buckets[rw][i].latency = max(latency[rw],
2097 td->avg_buckets[i].valid = true; 2112 last_latency[rw]);
2098 last_latency = td->avg_buckets[i].latency; 2113 td->avg_buckets[rw][i].valid = true;
2114 last_latency[rw] = td->avg_buckets[rw][i].latency;
2115 }
2099 } 2116 }
2100 2117
2101 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 2118 for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
2102 throtl_log(&td->service_queue, 2119 throtl_log(&td->service_queue,
2103 "Latency bucket %d: latency=%ld, valid=%d", i, 2120 "Latency bucket %d: read latency=%ld, read valid=%d, "
2104 td->avg_buckets[i].latency, td->avg_buckets[i].valid); 2121 "write latency=%ld, write valid=%d", i,
2122 td->avg_buckets[READ][i].latency,
2123 td->avg_buckets[READ][i].valid,
2124 td->avg_buckets[WRITE][i].latency,
2125 td->avg_buckets[WRITE][i].valid);
2105} 2126}
2106#else 2127#else
2107static inline void throtl_update_latency_buckets(struct throtl_data *td) 2128static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
2242 struct latency_bucket *latency; 2263 struct latency_bucket *latency;
2243 int index; 2264 int index;
2244 2265
2245 if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || 2266 if (!td || td->limit_index != LIMIT_LOW ||
2267 !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
2246 !blk_queue_nonrot(td->queue)) 2268 !blk_queue_nonrot(td->queue))
2247 return; 2269 return;
2248 2270
2249 index = request_bucket_index(size); 2271 index = request_bucket_index(size);
2250 2272
2251 latency = get_cpu_ptr(td->latency_buckets); 2273 latency = get_cpu_ptr(td->latency_buckets[op]);
2252 latency[index].total_latency += time; 2274 latency[index].total_latency += time;
2253 latency[index].samples++; 2275 latency[index].samples++;
2254 put_cpu_ptr(td->latency_buckets); 2276 put_cpu_ptr(td->latency_buckets[op]);
2255} 2277}
2256 2278
2257void blk_throtl_stat_add(struct request *rq, u64 time_ns) 2279void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
2270 unsigned long finish_time; 2292 unsigned long finish_time;
2271 unsigned long start_time; 2293 unsigned long start_time;
2272 unsigned long lat; 2294 unsigned long lat;
2295 int rw = bio_data_dir(bio);
2273 2296
2274 tg = bio->bi_cg_private; 2297 tg = bio->bi_cg_private;
2275 if (!tg) 2298 if (!tg)
@@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
2298 2321
2299 bucket = request_bucket_index( 2322 bucket = request_bucket_index(
2300 blk_stat_size(&bio->bi_issue_stat)); 2323 blk_stat_size(&bio->bi_issue_stat));
2301 threshold = tg->td->avg_buckets[bucket].latency + 2324 threshold = tg->td->avg_buckets[rw][bucket].latency +
2302 tg->latency_target; 2325 tg->latency_target;
2303 if (lat > threshold) 2326 if (lat > threshold)
2304 tg->bad_bio_cnt++; 2327 tg->bad_bio_cnt++;
@@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
2391 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 2414 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
2392 if (!td) 2415 if (!td)
2393 return -ENOMEM; 2416 return -ENOMEM;
2394 td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * 2417 td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
2395 LATENCY_BUCKET_SIZE, __alignof__(u64)); 2418 LATENCY_BUCKET_SIZE, __alignof__(u64));
2396 if (!td->latency_buckets) { 2419 if (!td->latency_buckets[READ]) {
2420 kfree(td);
2421 return -ENOMEM;
2422 }
2423 td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
2424 LATENCY_BUCKET_SIZE, __alignof__(u64));
2425 if (!td->latency_buckets[WRITE]) {
2426 free_percpu(td->latency_buckets[READ]);
2397 kfree(td); 2427 kfree(td);
2398 return -ENOMEM; 2428 return -ENOMEM;
2399 } 2429 }
@@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
2412 /* activate policy */ 2442 /* activate policy */
2413 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 2443 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
2414 if (ret) { 2444 if (ret) {
2415 free_percpu(td->latency_buckets); 2445 free_percpu(td->latency_buckets[READ]);
2446 free_percpu(td->latency_buckets[WRITE]);
2416 kfree(td); 2447 kfree(td);
2417 } 2448 }
2418 return ret; 2449 return ret;
@@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
2423 BUG_ON(!q->td); 2454 BUG_ON(!q->td);
2424 throtl_shutdown_wq(q); 2455 throtl_shutdown_wq(q);
2425 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 2456 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
2426 free_percpu(q->td->latency_buckets); 2457 free_percpu(q->td->latency_buckets[READ]);
2458 free_percpu(q->td->latency_buckets[WRITE]);
2427 kfree(q->td); 2459 kfree(q->td);
2428} 2460}
2429 2461
@@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
2441 } else { 2473 } else {
2442 td->throtl_slice = DFL_THROTL_SLICE_HD; 2474 td->throtl_slice = DFL_THROTL_SLICE_HD;
2443 td->filtered_latency = LATENCY_FILTERED_HD; 2475 td->filtered_latency = LATENCY_FILTERED_HD;
2444 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 2476 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2445 td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; 2477 td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
2478 td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
2479 }
2446 } 2480 }
2447#ifndef CONFIG_BLK_DEV_THROTTLING_LOW 2481#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
2448 /* if no low limit, use previous default */ 2482 /* if no low limit, use previous default */
2449 td->throtl_slice = DFL_THROTL_SLICE_HD; 2483 td->throtl_slice = DFL_THROTL_SLICE_HD;
2450#endif 2484#endif
2451 2485
2452 td->track_bio_latency = !q->mq_ops && !q->request_fn; 2486 td->track_bio_latency = !queue_is_rq_based(q);
2453 if (!td->track_bio_latency) 2487 if (!td->track_bio_latency)
2454 blk_stat_enable_accounting(q); 2488 blk_stat_enable_accounting(q);
2455} 2489}
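blk-throttle now keeps separate latency buckets for reads and writes and smooths each bucket with the same 7/8-weight moving average as before, clamped so a larger-IO bucket never reports less latency than the bucket below it. A standalone sketch of that per-direction update (demo array names, with a non-zero average standing in for the original 'valid' flag):

/*
 * Standalone sketch of per-direction bucket averaging: each (rw, bucket)
 * pair keeps a smoothed latency, updated with a 7/8-weight moving
 * average and clamped to be monotonic across buckets.
 */
#include <stdio.h>

#define NR_BUCKETS 4
enum { RD = 0, WR = 1 };

static unsigned long avg[2][NR_BUCKETS];	/* smoothed latency, usec */

static void update_buckets(const unsigned long sample[2][NR_BUCKETS])
{
	for (int rw = RD; rw <= WR; rw++) {
		unsigned long last = 0;

		for (int i = 0; i < NR_BUCKETS; i++) {
			unsigned long lat = sample[rw][i];

			if (!lat) {
				/* no samples: never fall below smaller IO */
				if (avg[rw][i] < last)
					avg[rw][i] = last;
			} else {
				if (avg[rw][i])
					lat = (avg[rw][i] * 7 + lat) >> 3;
				avg[rw][i] = lat > last ? lat : last;
			}
			last = avg[rw][i];
		}
	}
}

int main(void)
{
	unsigned long sample[2][NR_BUCKETS] = {
		[RD] = { 100, 0, 300, 250 },
		[WR] = { 400, 500, 0, 0 },
	};

	update_buckets(sample);
	for (int i = 0; i < NR_BUCKETS; i++)
		printf("bucket %d: read=%lu write=%lu\n",
		       i, avg[RD][i], avg[WR][i]);
	return 0;
}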
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb30..a05e3676d24a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
112static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 112static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
113 unsigned int *next_set) 113 unsigned int *next_set)
114{ 114{
115 if (time_after_eq(jiffies, rq->deadline)) { 115 const unsigned long deadline = blk_rq_deadline(rq);
116
117 if (time_after_eq(jiffies, deadline)) {
116 list_del_init(&rq->timeout_list); 118 list_del_init(&rq->timeout_list);
117 119
118 /* 120 /*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
120 */ 122 */
121 if (!blk_mark_rq_complete(rq)) 123 if (!blk_mark_rq_complete(rq))
122 blk_rq_timed_out(rq); 124 blk_rq_timed_out(rq);
123 } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { 125 } else if (!*next_set || time_after(*next_timeout, deadline)) {
124 *next_timeout = rq->deadline; 126 *next_timeout = deadline;
125 *next_set = 1; 127 *next_set = 1;
126 } 128 }
127} 129}
@@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
156 */ 158 */
157void blk_abort_request(struct request *req) 159void blk_abort_request(struct request *req)
158{ 160{
159 if (blk_mark_rq_complete(req))
160 return;
161
162 if (req->q->mq_ops) { 161 if (req->q->mq_ops) {
163 blk_mq_rq_timed_out(req, false); 162 /*
163 * All we need to ensure is that timeout scan takes place
164 * immediately and that scan sees the new timeout value.
165 * No need for fancy synchronizations.
166 */
167 blk_rq_set_deadline(req, jiffies);
168 mod_timer(&req->q->timeout, 0);
164 } else { 169 } else {
170 if (blk_mark_rq_complete(req))
171 return;
165 blk_delete_timer(req); 172 blk_delete_timer(req);
166 blk_rq_timed_out(req); 173 blk_rq_timed_out(req);
167 } 174 }
@@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
208 if (!req->timeout) 215 if (!req->timeout)
209 req->timeout = q->rq_timeout; 216 req->timeout = q->rq_timeout;
210 217
211 WRITE_ONCE(req->deadline, jiffies + req->timeout); 218 blk_rq_set_deadline(req, jiffies + req->timeout);
219 req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
212 220
213 /* 221 /*
214 * Only the non-mq case needs to add the request to a protected list. 222 * Only the non-mq case needs to add the request to a protected list.
@@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)
222 * than an existing one, modify the timer. Round up to next nearest 230 * than an existing one, modify the timer. Round up to next nearest
223 * second. 231 * second.
224 */ 232 */
225 expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); 233 expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
226 234
227 if (!timer_pending(&q->timeout) || 235 if (!timer_pending(&q->timeout) ||
228 time_before(expiry, q->timeout.expires)) { 236 time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
22} 22}
23 23
24/* 24/*
25 * Return true if a request is a write requests that needs zone write locking.
26 */
27bool blk_req_needs_zone_write_lock(struct request *rq)
28{
29 if (!rq->q->seq_zones_wlock)
30 return false;
31
32 if (blk_rq_is_passthrough(rq))
33 return false;
34
35 switch (req_op(rq)) {
36 case REQ_OP_WRITE_ZEROES:
37 case REQ_OP_WRITE_SAME:
38 case REQ_OP_WRITE:
39 return blk_rq_zone_is_seq(rq);
40 default:
41 return false;
42 }
43}
44EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
45
46void __blk_req_zone_write_lock(struct request *rq)
47{
48 if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
49 rq->q->seq_zones_wlock)))
50 return;
51
52 WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
53 rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
54}
55EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
56
57void __blk_req_zone_write_unlock(struct request *rq)
58{
59 rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
60 if (rq->q->seq_zones_wlock)
61 WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
62 rq->q->seq_zones_wlock));
63}
64EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
65
66/*
25 * Check that a zone report belongs to the partition. 67 * Check that a zone report belongs to the partition.
26 * If yes, fix its start sector and write pointer, copy it in the 68 * If yes, fix its start sector and write pointer, copy it in the
27 * zone information array and return true. Return false otherwise. 69 * zone information array and return true. Return false otherwise.
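blk-zoned.c introduces a per-zone write lock built on a bitmap: a write request takes its target zone's bit with test_and_set_bit() before dispatch and clears it on completion, so at most one write per sequential zone is in flight. A standalone sketch of that locking scheme, using C11 atomic_fetch_or()/atomic_fetch_and() in place of the kernel bitops:

/*
 * Standalone sketch of a per-zone write lock: one bit per sequential
 * zone, taken with an atomic test-and-set before dispatch and cleared
 * on completion.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 8
#define BITS_PER_WORD (8 * sizeof(unsigned long))

static _Atomic unsigned long zone_wlock[(NR_ZONES + BITS_PER_WORD - 1) /
					BITS_PER_WORD];

static bool zone_write_trylock(unsigned int zone)
{
	unsigned long mask = 1UL << (zone % BITS_PER_WORD);
	unsigned long old = atomic_fetch_or(&zone_wlock[zone / BITS_PER_WORD],
					    mask);

	return !(old & mask);	/* true if we took the lock */
}

static void zone_write_unlock(unsigned int zone)
{
	unsigned long mask = 1UL << (zone % BITS_PER_WORD);

	atomic_fetch_and(&zone_wlock[zone / BITS_PER_WORD], ~mask);
}

int main(void)
{
	printf("zone 3 lock: %d\n", zone_write_trylock(3));		/* 1 */
	printf("zone 3 lock again: %d\n", zone_write_trylock(3));	/* 0 */
	zone_write_unlock(3);
	printf("zone 3 lock after unlock: %d\n", zone_write_trylock(3)); /* 1 */
	return 0;
}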
diff --git a/block/blk.h b/block/blk.h
index 442098aa9463..46db5dc83dcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes);
120void blk_account_io_done(struct request *req); 120void blk_account_io_done(struct request *req);
121 121
122/* 122/*
123 * Internal atomic flags for request handling
124 */
125enum rq_atomic_flags {
126 /*
127 * Keep these two bits first - not because we depend on the
128 * value of them, but we do depend on them being in the same
129 * byte of storage to ensure ordering on writes. Keeping them
130 * first will achieve that nicely.
131 */
132 REQ_ATOM_COMPLETE = 0,
133 REQ_ATOM_STARTED,
134
135 REQ_ATOM_POLL_SLEPT,
136};
137
138/*
139 * EH timer and IO completion will both attempt to 'grab' the request, make 123 * EH timer and IO completion will both attempt to 'grab' the request, make
140 * sure that only one of them succeeds 124 * sure that only one of them succeeds. Steal the bottom bit of the
125 * __deadline field for this.
141 */ 126 */
142static inline int blk_mark_rq_complete(struct request *rq) 127static inline int blk_mark_rq_complete(struct request *rq)
143{ 128{
144 return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 129 return test_and_set_bit(0, &rq->__deadline);
145} 130}
146 131
147static inline void blk_clear_rq_complete(struct request *rq) 132static inline void blk_clear_rq_complete(struct request *rq)
148{ 133{
149 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 134 clear_bit(0, &rq->__deadline);
135}
136
137static inline bool blk_rq_is_complete(struct request *rq)
138{
139 return test_bit(0, &rq->__deadline);
150} 140}
151 141
152/* 142/*
@@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
172 e->type->ops.sq.elevator_deactivate_req_fn(q, rq); 162 e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
173} 163}
174 164
165int elv_register_queue(struct request_queue *q);
166void elv_unregister_queue(struct request_queue *q);
167
175struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); 168struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
176 169
177#ifdef CONFIG_FAIL_IO_TIMEOUT 170#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
246} 239}
247 240
248/* 241/*
242 * Steal a bit from this field for legacy IO path atomic IO marking. Note that
243 * setting the deadline clears the bottom bit, potentially clearing the
244 * completed bit. The user has to be OK with this (current ones are fine).
245 */
246static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
247{
248 rq->__deadline = time & ~0x1UL;
249}
250
251static inline unsigned long blk_rq_deadline(struct request *rq)
252{
253 return rq->__deadline & ~0x1UL;
254}
255
256/*
249 * Internal io_context interface 257 * Internal io_context interface
250 */ 258 */
251void get_io_context(struct io_context *ioc); 259void get_io_context(struct io_context *ioc);
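blk.h replaces the REQ_ATOM_COMPLETE atomic flag with bit 0 of the request's __deadline word: jiffies deadlines never need that bit, blk_rq_set_deadline() masks it off, and blk_mark_rq_complete() claims it with test_and_set_bit(). A standalone sketch of the same bit-stealing idea (a single global word instead of a struct request, for brevity):

/*
 * Standalone sketch of stealing bit 0 of a jiffies-style deadline for a
 * "complete" marker.  As the comment above warns, storing a new deadline
 * also clears the marker.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long __deadline;

static void rq_set_deadline(unsigned long time)
{
	__deadline = time & ~0x1UL;	/* also clears the complete bit */
}

static unsigned long rq_deadline(void)
{
	return __deadline & ~0x1UL;
}

/* Returns the previous value, like test_and_set_bit(0, ...). */
static bool rq_mark_complete(void)
{
	bool was_set = __deadline & 0x1UL;

	__deadline |= 0x1UL;
	return was_set;
}

int main(void)
{
	rq_set_deadline(1000);
	printf("first mark: %d\n", rq_mark_complete());		/* 0: we won */
	printf("second mark: %d\n", rq_mark_complete());	/* 1: already done */
	printf("deadline: %lu\n", rq_deadline());		/* 1000, bit 0 masked */
	return 0;
}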
diff --git a/block/bounce.c b/block/bounce.c
index 1d05c422c932..6a3e68292273 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
113static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 113static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
114{ 114{
115 unsigned char *vfrom; 115 unsigned char *vfrom;
116 struct bio_vec tovec, *fromvec = from->bi_io_vec; 116 struct bio_vec tovec, fromvec;
117 struct bvec_iter iter; 117 struct bvec_iter iter;
118 /*
119 * The bio of @from is created by bounce, so we can iterate
120 * its bvec from start to end, but the @from->bi_iter can't be
121 * trusted because it might be changed by splitting.
122 */
123 struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
118 124
119 bio_for_each_segment(tovec, to, iter) { 125 bio_for_each_segment(tovec, to, iter) {
120 if (tovec.bv_page != fromvec->bv_page) { 126 fromvec = bio_iter_iovec(from, from_iter);
127 if (tovec.bv_page != fromvec.bv_page) {
121 /* 128 /*
122 * fromvec->bv_offset and fromvec->bv_len might have 129 * fromvec->bv_offset and fromvec->bv_len might have
123 * been modified by the block layer, so use the original 130 * been modified by the block layer, so use the original
124 * copy, bounce_copy_vec already uses tovec->bv_len 131 * copy, bounce_copy_vec already uses tovec->bv_len
125 */ 132 */
126 vfrom = page_address(fromvec->bv_page) + 133 vfrom = page_address(fromvec.bv_page) +
127 tovec.bv_offset; 134 tovec.bv_offset;
128 135
129 bounce_copy_vec(&tovec, vfrom); 136 bounce_copy_vec(&tovec, vfrom);
130 flush_dcache_page(tovec.bv_page); 137 flush_dcache_page(tovec.bv_page);
131 } 138 }
132 139 bio_advance_iter(from, &from_iter, tovec.bv_len);
133 fromvec++;
134 } 140 }
135} 141}
136 142
137static void bounce_end_io(struct bio *bio, mempool_t *pool) 143static void bounce_end_io(struct bio *bio, mempool_t *pool)
138{ 144{
139 struct bio *bio_orig = bio->bi_private; 145 struct bio *bio_orig = bio->bi_private;
140 struct bio_vec *bvec, *org_vec; 146 struct bio_vec *bvec, orig_vec;
141 int i; 147 int i;
142 int start = bio_orig->bi_iter.bi_idx; 148 struct bvec_iter orig_iter = bio_orig->bi_iter;
143 149
144 /* 150 /*
145 * free up bounce indirect pages used 151 * free up bounce indirect pages used
146 */ 152 */
147 bio_for_each_segment_all(bvec, bio, i) { 153 bio_for_each_segment_all(bvec, bio, i) {
148 org_vec = bio_orig->bi_io_vec + i + start; 154 orig_vec = bio_iter_iovec(bio_orig, orig_iter);
149 155 if (bvec->bv_page != orig_vec.bv_page) {
150 if (bvec->bv_page == org_vec->bv_page) 156 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
151 continue; 157 mempool_free(bvec->bv_page, pool);
152 158 }
153 dec_zone_page_state(bvec->bv_page, NR_BOUNCE); 159 bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
154 mempool_free(bvec->bv_page, pool);
155 } 160 }
156 161
157 bio_orig->bi_status = bio->bi_status; 162 bio_orig->bi_status = bio->bi_status;
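The bounce code above stops indexing bio_orig->bi_io_vec directly and instead walks the original bio with its own bvec_iter, advancing it by the bytes consumed from the bounce bio, which keeps working once a single bvec can cover more than one page. A standalone sketch of that iterator pattern over variable-length segments (invented seg/seg_iter types, not the kernel bvec API):

/*
 * Standalone sketch of the iterator style the bounce code switches to:
 * keep a private iterator over the source vector and advance it by the
 * bytes consumed on the other side, instead of assuming one entry per
 * step.
 */
#include <stdio.h>

struct seg { const char *name; unsigned int len; };

struct seg_iter { unsigned int idx; unsigned int done; };	/* offset in seg */

/* Return the portion of the current segment the iterator points at. */
static struct seg iter_cur(const struct seg *v, const struct seg_iter *it)
{
	struct seg s = v[it->idx];

	s.len -= it->done;
	return s;
}

static void iter_advance(const struct seg *v, struct seg_iter *it,
			 unsigned int bytes)
{
	while (bytes) {
		unsigned int left = v[it->idx].len - it->done;
		unsigned int step = bytes < left ? bytes : left;

		it->done += step;
		bytes -= step;
		if (it->done == v[it->idx].len) {
			it->idx++;
			it->done = 0;
		}
	}
}

int main(void)
{
	/* "from" has one big segment, "to" consumes it in small steps. */
	struct seg from[] = { { "A", 3072 }, { "B", 1024 } };
	unsigned int to_lens[] = { 1024, 1024, 1024, 1024 };
	struct seg_iter it = { 0, 0 };

	for (unsigned int i = 0; i < 4; i++) {
		struct seg cur = iter_cur(from, &it);

		printf("step %u copies from segment %s (%u bytes left)\n",
		       i, cur.name, cur.len);
		iter_advance(from, &it, to_lens[i]);
	}
	return 0;
}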
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 15d25ccd51a5..1474153f73e3 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -30,7 +30,7 @@
30 30
31/** 31/**
32 * bsg_teardown_job - routine to teardown a bsg job 32 * bsg_teardown_job - routine to teardown a bsg job
33 * @job: bsg_job that is to be torn down 33 * @kref: kref inside bsg_job that is to be torn down
34 */ 34 */
35static void bsg_teardown_job(struct kref *kref) 35static void bsg_teardown_job(struct kref *kref)
36{ 36{
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
251 * @name: device to give bsg device 251 * @name: device to give bsg device
252 * @job_fn: bsg job handler 252 * @job_fn: bsg job handler
253 * @dd_job_size: size of LLD data needed for each job 253 * @dd_job_size: size of LLD data needed for each job
254 * @release: @dev release function
254 */ 255 */
255struct request_queue *bsg_setup_queue(struct device *dev, const char *name, 256struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
256 bsg_job_fn *job_fn, int dd_job_size, 257 bsg_job_fn *job_fn, int dd_job_size,
diff --git a/block/bsg.c b/block/bsg.c
index 452f94f1c5d4..a1bcbb6ba50b 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -32,6 +32,9 @@
32#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" 32#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
33#define BSG_VERSION "0.4" 33#define BSG_VERSION "0.4"
34 34
35#define bsg_dbg(bd, fmt, ...) \
36 pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
37
35struct bsg_device { 38struct bsg_device {
36 struct request_queue *queue; 39 struct request_queue *queue;
37 spinlock_t lock; 40 spinlock_t lock;
@@ -55,14 +58,6 @@ enum {
55#define BSG_DEFAULT_CMDS 64 58#define BSG_DEFAULT_CMDS 64
56#define BSG_MAX_DEVS 32768 59#define BSG_MAX_DEVS 32768
57 60
58#undef BSG_DEBUG
59
60#ifdef BSG_DEBUG
61#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
62#else
63#define dprintk(fmt, args...)
64#endif
65
66static DEFINE_MUTEX(bsg_mutex); 61static DEFINE_MUTEX(bsg_mutex);
67static DEFINE_IDR(bsg_minor_idr); 62static DEFINE_IDR(bsg_minor_idr);
68 63
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
123 118
124 bc->bd = bd; 119 bc->bd = bd;
125 INIT_LIST_HEAD(&bc->list); 120 INIT_LIST_HEAD(&bc->list);
126 dprintk("%s: returning free cmd %p\n", bd->name, bc); 121 bsg_dbg(bd, "returning free cmd %p\n", bc);
127 return bc; 122 return bc;
128out: 123out:
129 spin_unlock_irq(&bd->lock); 124 spin_unlock_irq(&bd->lock);
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
222 if (!bcd->class_dev) 217 if (!bcd->class_dev)
223 return ERR_PTR(-ENXIO); 218 return ERR_PTR(-ENXIO);
224 219
225 dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, 220 bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
221 (unsigned long long) hdr->dout_xferp,
226 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, 222 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
227 hdr->din_xfer_len); 223 hdr->din_xfer_len);
228 224
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
299 struct bsg_device *bd = bc->bd; 295 struct bsg_device *bd = bc->bd;
300 unsigned long flags; 296 unsigned long flags;
301 297
302 dprintk("%s: finished rq %p bc %p, bio %p\n", 298 bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
303 bd->name, rq, bc, bc->bio); 299 rq, bc, bc->bio);
304 300
305 bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); 301 bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
306 302
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
333 list_add_tail(&bc->list, &bd->busy_list); 329 list_add_tail(&bc->list, &bd->busy_list);
334 spin_unlock_irq(&bd->lock); 330 spin_unlock_irq(&bd->lock);
335 331
336 dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); 332 bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
337 333
338 rq->end_io_data = bc; 334 rq->end_io_data = bc;
339 blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); 335 blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
379 } 375 }
380 } while (1); 376 } while (1);
381 377
382 dprintk("%s: returning done %p\n", bd->name, bc); 378 bsg_dbg(bd, "returning done %p\n", bc);
383 379
384 return bc; 380 return bc;
385} 381}
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
390 struct scsi_request *req = scsi_req(rq); 386 struct scsi_request *req = scsi_req(rq);
391 int ret = 0; 387 int ret = 0;
392 388
393 dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); 389 pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
394 /* 390 /*
395 * fill in all the output members 391 * fill in all the output members
396 */ 392 */
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
469 struct bsg_command *bc; 465 struct bsg_command *bc;
470 int ret, tret; 466 int ret, tret;
471 467
472 dprintk("%s: entered\n", bd->name); 468 bsg_dbg(bd, "entered\n");
473 469
474 /* 470 /*
475 * wait for all commands to complete 471 * wait for all commands to complete
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
572 int ret; 568 int ret;
573 ssize_t bytes_read; 569 ssize_t bytes_read;
574 570
575 dprintk("%s: read %zd bytes\n", bd->name, count); 571 bsg_dbg(bd, "read %zd bytes\n", count);
576 572
577 bsg_set_block(bd, file); 573 bsg_set_block(bd, file);
578 574
@@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
646 ssize_t bytes_written; 642 ssize_t bytes_written;
647 int ret; 643 int ret;
648 644
649 dprintk("%s: write %zd bytes\n", bd->name, count); 645 bsg_dbg(bd, "write %zd bytes\n", count);
650 646
651 if (unlikely(uaccess_kernel())) 647 if (unlikely(uaccess_kernel()))
652 return -EINVAL; 648 return -EINVAL;
@@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
664 if (!bytes_written || err_block_err(ret)) 660 if (!bytes_written || err_block_err(ret))
665 bytes_written = ret; 661 bytes_written = ret;
666 662
667 dprintk("%s: returning %zd\n", bd->name, bytes_written); 663 bsg_dbg(bd, "returning %zd\n", bytes_written);
668 return bytes_written; 664 return bytes_written;
669} 665}
670 666
@@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
717 hlist_del(&bd->dev_list); 713 hlist_del(&bd->dev_list);
718 mutex_unlock(&bsg_mutex); 714 mutex_unlock(&bsg_mutex);
719 715
720 dprintk("%s: tearing down\n", bd->name); 716 bsg_dbg(bd, "tearing down\n");
721 717
722 /* 718 /*
723 * close can always block 719 * close can always block
@@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
744 struct file *file) 740 struct file *file)
745{ 741{
746 struct bsg_device *bd; 742 struct bsg_device *bd;
747#ifdef BSG_DEBUG
748 unsigned char buf[32]; 743 unsigned char buf[32];
749#endif
750 744
751 if (!blk_queue_scsi_passthrough(rq)) { 745 if (!blk_queue_scsi_passthrough(rq)) {
752 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); 746 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
771 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); 765 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
772 766
773 strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); 767 strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
774 dprintk("bound to <%s>, max queue %d\n", 768 bsg_dbg(bd, "bound to <%s>, max queue %d\n",
775 format_dev_t(buf, inode->i_rdev), bd->max_queue); 769 format_dev_t(buf, inode->i_rdev), bd->max_queue);
776 770
777 mutex_unlock(&bsg_mutex); 771 mutex_unlock(&bsg_mutex);
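bsg's hand-rolled dprintk(), gated by a local BSG_DEBUG define, is replaced above by a bsg_dbg() macro built on pr_debug(), so the messages participate in dynamic debug. A standalone sketch of the same macro shape, with fprintf() under a DEBUG define standing in for pr_debug() (compile with -DDEBUG to see output):

/*
 * Standalone sketch of a per-device debug macro: prefixes the device
 * name and compiles away to nothing unless debugging is enabled.
 */
#include <stdio.h>

struct bsg_device { char name[32]; };

#ifdef DEBUG
#define bsg_dbg(bd, fmt, ...) \
	fprintf(stderr, "%s: " fmt, (bd)->name, ##__VA_ARGS__)
#else
#define bsg_dbg(bd, fmt, ...) do { } while (0)
#endif

int main(void)
{
	struct bsg_device bd = { "bsg0" };

	bsg_dbg(&bd, "queueing rq %p\n", (void *)&bd);
	return 0;
}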
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
50 int front_merges; 50 int front_merges;
51}; 51};
52 52
53static void deadline_move_request(struct deadline_data *, struct request *);
54
55static inline struct rb_root * 53static inline struct rb_root *
56deadline_rb_root(struct deadline_data *dd, struct request *rq) 54deadline_rb_root(struct deadline_data *dd, struct request *rq)
57{ 55{
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
100 struct deadline_data *dd = q->elevator->elevator_data; 98 struct deadline_data *dd = q->elevator->elevator_data;
101 const int data_dir = rq_data_dir(rq); 99 const int data_dir = rq_data_dir(rq);
102 100
101 /*
102 * This may be a requeue of a write request that has locked its
103 * target zone. If it is the case, this releases the zone lock.
104 */
105 blk_req_zone_write_unlock(rq);
106
103 deadline_add_rq_rb(dd, rq); 107 deadline_add_rq_rb(dd, rq);
104 108
105 /* 109 /*
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
190{ 194{
191 struct request_queue *q = rq->q; 195 struct request_queue *q = rq->q;
192 196
197 /*
198 * For a zoned block device, write requests must write lock their
199 * target zone.
200 */
201 blk_req_zone_write_lock(rq);
202
193 deadline_remove_request(q, rq); 203 deadline_remove_request(q, rq);
194 elv_dispatch_add_tail(q, rq); 204 elv_dispatch_add_tail(q, rq);
195} 205}
@@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
231} 241}
232 242
233/* 243/*
244 * For the specified data direction, return the next request to dispatch using
245 * arrival ordered lists.
246 */
247static struct request *
248deadline_fifo_request(struct deadline_data *dd, int data_dir)
249{
250 struct request *rq;
251
252 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
253 return NULL;
254
255 if (list_empty(&dd->fifo_list[data_dir]))
256 return NULL;
257
258 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
259 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
260 return rq;
261
262 /*
263 * Look for a write request that can be dispatched, that is one with
264 * an unlocked target zone.
265 */
266 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
267 if (blk_req_can_dispatch_to_zone(rq))
268 return rq;
269 }
270
271 return NULL;
272}
273
274/*
275 * For the specified data direction, return the next request to dispatch using
276 * sector position sorted lists.
277 */
278static struct request *
279deadline_next_request(struct deadline_data *dd, int data_dir)
280{
281 struct request *rq;
282
283 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
284 return NULL;
285
286 rq = dd->next_rq[data_dir];
287 if (!rq)
288 return NULL;
289
290 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
291 return rq;
292
293 /*
294 * Look for a write request that can be dispatched, that is one with
295 * an unlocked target zone.
296 */
297 while (rq) {
298 if (blk_req_can_dispatch_to_zone(rq))
299 return rq;
300 rq = deadline_latter_request(rq);
301 }
302
303 return NULL;
304}
305
306/*
234 * deadline_dispatch_requests selects the best request according to 307 * deadline_dispatch_requests selects the best request according to
235 * read/write expire, fifo_batch, etc 308 * read/write expire, fifo_batch, etc
236 */ 309 */
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
239 struct deadline_data *dd = q->elevator->elevator_data; 312 struct deadline_data *dd = q->elevator->elevator_data;
240 const int reads = !list_empty(&dd->fifo_list[READ]); 313 const int reads = !list_empty(&dd->fifo_list[READ]);
241 const int writes = !list_empty(&dd->fifo_list[WRITE]); 314 const int writes = !list_empty(&dd->fifo_list[WRITE]);
242 struct request *rq; 315 struct request *rq, *next_rq;
243 int data_dir; 316 int data_dir;
244 317
245 /* 318 /*
246 * batches are currently reads XOR writes 319 * batches are currently reads XOR writes
247 */ 320 */
248 if (dd->next_rq[WRITE]) 321 rq = deadline_next_request(dd, WRITE);
249 rq = dd->next_rq[WRITE]; 322 if (!rq)
250 else 323 rq = deadline_next_request(dd, READ);
251 rq = dd->next_rq[READ];
252 324
253 if (rq && dd->batching < dd->fifo_batch) 325 if (rq && dd->batching < dd->fifo_batch)
254 /* we have a next request are still entitled to batch */ 326 /* we have a next request are still entitled to batch */
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
262 if (reads) { 334 if (reads) {
263 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); 335 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
264 336
265 if (writes && (dd->starved++ >= dd->writes_starved)) 337 if (deadline_fifo_request(dd, WRITE) &&
338 (dd->starved++ >= dd->writes_starved))
266 goto dispatch_writes; 339 goto dispatch_writes;
267 340
268 data_dir = READ; 341 data_dir = READ;
@@ -291,21 +364,29 @@ dispatch_find_request:
291 /* 364 /*
292 * we are not running a batch, find best request for selected data_dir 365 * we are not running a batch, find best request for selected data_dir
293 */ 366 */
294 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { 367 next_rq = deadline_next_request(dd, data_dir);
368 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
295 /* 369 /*
296 * A deadline has expired, the last request was in the other 370 * A deadline has expired, the last request was in the other
297 * direction, or we have run out of higher-sectored requests. 371 * direction, or we have run out of higher-sectored requests.
298 * Start again from the request with the earliest expiry time. 372 * Start again from the request with the earliest expiry time.
299 */ 373 */
300 rq = rq_entry_fifo(dd->fifo_list[data_dir].next); 374 rq = deadline_fifo_request(dd, data_dir);
301 } else { 375 } else {
302 /* 376 /*
303 * The last req was the same dir and we have a next request in 377 * The last req was the same dir and we have a next request in
304 * sort order. No expired requests so continue on from here. 378 * sort order. No expired requests so continue on from here.
305 */ 379 */
306 rq = dd->next_rq[data_dir]; 380 rq = next_rq;
307 } 381 }
308 382
383 /*
384 * For a zoned block device, if we only have writes queued and none of
385 * them can be dispatched, rq will be NULL.
386 */
387 if (!rq)
388 return 0;
389
309 dd->batching = 0; 390 dd->batching = 0;
310 391
311dispatch_request: 392dispatch_request:
@@ -318,6 +399,16 @@ dispatch_request:
318 return 1; 399 return 1;
319} 400}
320 401
402/*
403 * For zoned block devices, write unlock the target zone of completed
404 * write requests.
405 */
406static void
407deadline_completed_request(struct request_queue *q, struct request *rq)
408{
409 blk_req_zone_write_unlock(rq);
410}
411
321static void deadline_exit_queue(struct elevator_queue *e) 412static void deadline_exit_queue(struct elevator_queue *e)
322{ 413{
323 struct deadline_data *dd = e->elevator_data; 414 struct deadline_data *dd = e->elevator_data;
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
439 .elevator_merged_fn = deadline_merged_request, 530 .elevator_merged_fn = deadline_merged_request,
440 .elevator_merge_req_fn = deadline_merged_requests, 531 .elevator_merge_req_fn = deadline_merged_requests,
441 .elevator_dispatch_fn = deadline_dispatch_requests, 532 .elevator_dispatch_fn = deadline_dispatch_requests,
533 .elevator_completed_req_fn = deadline_completed_request,
442 .elevator_add_req_fn = deadline_add_request, 534 .elevator_add_req_fn = deadline_add_request,
443 .elevator_former_req_fn = elv_rb_former_request, 535 .elevator_former_req_fn = elv_rb_former_request,
444 .elevator_latter_req_fn = elv_rb_latter_request, 536 .elevator_latter_req_fn = elv_rb_latter_request,
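Both deadline schedulers now pick dispatch candidates zone-aware: reads keep plain FIFO/sector order, but a write is only eligible when its target zone's write lock is free, which serializes writes per sequential zone. A standalone sketch of that FIFO selection rule (demo request/zone types, not the elevator API):

/*
 * Standalone sketch of zone-aware FIFO selection: reads dispatch in
 * arrival order, writes only if their target zone's write lock is free.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_ZONES 4

struct request {
	bool write;
	unsigned int zone;
};

static bool zone_locked[NR_ZONES];

static bool can_dispatch(const struct request *rq)
{
	return !rq->write || !zone_locked[rq->zone];
}

/* FIFO scan: first read, or first write whose zone is unlocked. */
static const struct request *fifo_pick(const struct request *fifo, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (can_dispatch(&fifo[i]))
			return &fifo[i];
	return NULL;	/* all queued writes target locked zones */
}

int main(void)
{
	struct request fifo[] = {
		{ .write = true, .zone = 1 },
		{ .write = true, .zone = 2 },
	};
	const struct request *rq;

	zone_locked[1] = true;	/* a write to zone 1 is already in flight */

	rq = fifo_pick(fifo, 2);
	if (rq)
		printf("dispatch write to zone %u\n", rq->zone);	/* zone 2 */
	return 0;
}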
diff --git a/block/elevator.c b/block/elevator.c
index 7bda083d5968..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
869 struct elevator_queue *e = q->elevator; 869 struct elevator_queue *e = q->elevator;
870 int error; 870 int error;
871 871
872 lockdep_assert_held(&q->sysfs_lock);
873
872 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 874 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
873 if (!error) { 875 if (!error) {
874 struct elv_fs_entry *attr = e->type->elevator_attrs; 876 struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
886 } 888 }
887 return error; 889 return error;
888} 890}
889EXPORT_SYMBOL(elv_register_queue);
890 891
891void elv_unregister_queue(struct request_queue *q) 892void elv_unregister_queue(struct request_queue *q)
892{ 893{
894 lockdep_assert_held(&q->sysfs_lock);
895
893 if (q) { 896 if (q) {
894 struct elevator_queue *e = q->elevator; 897 struct elevator_queue *e = q->elevator;
895 898
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
900 wbt_enable_default(q); 903 wbt_enable_default(q);
901 } 904 }
902} 905}
903EXPORT_SYMBOL(elv_unregister_queue);
904 906
905int elv_register(struct elevator_type *e) 907int elv_register(struct elevator_type *e)
906{ 908{
@@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
967{ 969{
968 int ret; 970 int ret;
969 971
972 lockdep_assert_held(&q->sysfs_lock);
973
970 blk_mq_freeze_queue(q); 974 blk_mq_freeze_queue(q);
975 blk_mq_quiesce_queue(q);
971 976
972 if (q->elevator) { 977 if (q->elevator) {
973 if (q->elevator->registered) 978 if (q->elevator->registered)
@@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
994 blk_add_trace_msg(q, "elv switch: none"); 999 blk_add_trace_msg(q, "elv switch: none");
995 1000
996out: 1001out:
1002 blk_mq_unquiesce_queue(q);
997 blk_mq_unfreeze_queue(q); 1003 blk_mq_unfreeze_queue(q);
998 return ret; 1004 return ret;
999} 1005}
@@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1010 bool old_registered = false; 1016 bool old_registered = false;
1011 int err; 1017 int err;
1012 1018
1019 lockdep_assert_held(&q->sysfs_lock);
1020
1013 if (q->mq_ops) 1021 if (q->mq_ops)
1014 return elevator_switch_mq(q, new_e); 1022 return elevator_switch_mq(q, new_e);
1015 1023
diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..88a53c188cb7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -629,16 +629,18 @@ exit:
629} 629}
630 630
631/** 631/**
632 * device_add_disk - add partitioning information to kernel list 632 * __device_add_disk - add disk information to kernel list
633 * @parent: parent device for the disk 633 * @parent: parent device for the disk
634 * @disk: per-device partitioning information 634 * @disk: per-device partitioning information
635 * @register_queue: register the queue if set to true
635 * 636 *
636 * This function registers the partitioning information in @disk 637 * This function registers the partitioning information in @disk
637 * with the kernel. 638 * with the kernel.
638 * 639 *
639 * FIXME: error handling 640 * FIXME: error handling
640 */ 641 */
641void device_add_disk(struct device *parent, struct gendisk *disk) 642static void __device_add_disk(struct device *parent, struct gendisk *disk,
643 bool register_queue)
642{ 644{
643 dev_t devt; 645 dev_t devt;
644 int retval; 646 int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
682 exact_match, exact_lock, disk); 684 exact_match, exact_lock, disk);
683 } 685 }
684 register_disk(parent, disk); 686 register_disk(parent, disk);
685 blk_register_queue(disk); 687 if (register_queue)
688 blk_register_queue(disk);
686 689
687 /* 690 /*
688 * Take an extra ref on queue which will be put on disk_release() 691 * Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
693 disk_add_events(disk); 696 disk_add_events(disk);
694 blk_integrity_add(disk); 697 blk_integrity_add(disk);
695} 698}
699
700void device_add_disk(struct device *parent, struct gendisk *disk)
701{
702 __device_add_disk(parent, disk, true);
703}
696EXPORT_SYMBOL(device_add_disk); 704EXPORT_SYMBOL(device_add_disk);
697 705
706void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
707{
708 __device_add_disk(parent, disk, false);
709}
710EXPORT_SYMBOL(device_add_disk_no_queue_reg);
711
698void del_gendisk(struct gendisk *disk) 712void del_gendisk(struct gendisk *disk)
699{ 713{
700 struct disk_part_iter piter; 714 struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
725 * Unregister bdi before releasing device numbers (as they can 739 * Unregister bdi before releasing device numbers (as they can
726 * get reused and we'd get clashes in sysfs). 740 * get reused and we'd get clashes in sysfs).
727 */ 741 */
728 bdi_unregister(disk->queue->backing_dev_info); 742 if (!(disk->flags & GENHD_FL_HIDDEN))
743 bdi_unregister(disk->queue->backing_dev_info);
729 blk_unregister_queue(disk); 744 blk_unregister_queue(disk);
730 } else { 745 } else {
731 WARN_ON(1); 746 WARN_ON(1);
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..c56f211c8440 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
59 int front_merges; 59 int front_merges;
60 60
61 spinlock_t lock; 61 spinlock_t lock;
62 spinlock_t zone_lock;
62 struct list_head dispatch; 63 struct list_head dispatch;
63}; 64};
64 65
@@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
192} 193}
193 194
194/* 195/*
196 * For the specified data direction, return the next request to
197 * dispatch using arrival ordered lists.
198 */
199static struct request *
200deadline_fifo_request(struct deadline_data *dd, int data_dir)
201{
202 struct request *rq;
203 unsigned long flags;
204
205 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
206 return NULL;
207
208 if (list_empty(&dd->fifo_list[data_dir]))
209 return NULL;
210
211 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
212 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
213 return rq;
214
215 /*
216 * Look for a write request that can be dispatched, that is one with
217 * an unlocked target zone.
218 */
219 spin_lock_irqsave(&dd->zone_lock, flags);
220 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
221 if (blk_req_can_dispatch_to_zone(rq))
222 goto out;
223 }
224 rq = NULL;
225out:
226 spin_unlock_irqrestore(&dd->zone_lock, flags);
227
228 return rq;
229}
230
231/*
232 * For the specified data direction, return the next request to
233 * dispatch using sector position sorted lists.
234 */
235static struct request *
236deadline_next_request(struct deadline_data *dd, int data_dir)
237{
238 struct request *rq;
239 unsigned long flags;
240
241 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
242 return NULL;
243
244 rq = dd->next_rq[data_dir];
245 if (!rq)
246 return NULL;
247
248 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
249 return rq;
250
251 /*
252 * Look for a write request that can be dispatched, that is one with
253 * an unlocked target zone.
254 */
255 spin_lock_irqsave(&dd->zone_lock, flags);
256 while (rq) {
257 if (blk_req_can_dispatch_to_zone(rq))
258 break;
259 rq = deadline_latter_request(rq);
260 }
261 spin_unlock_irqrestore(&dd->zone_lock, flags);
262
263 return rq;
264}
265
266/*
195 * deadline_dispatch_requests selects the best request according to 267 * deadline_dispatch_requests selects the best request according to
196 * read/write expire, fifo_batch, etc 268 * read/write expire, fifo_batch, etc
197 */ 269 */
198static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 270static struct request *__dd_dispatch_request(struct deadline_data *dd)
199{ 271{
200 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 272 struct request *rq, *next_rq;
201 struct request *rq;
202 bool reads, writes; 273 bool reads, writes;
203 int data_dir; 274 int data_dir;
204 275
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
214 /* 285 /*
215 * batches are currently reads XOR writes 286 * batches are currently reads XOR writes
216 */ 287 */
217 if (dd->next_rq[WRITE]) 288 rq = deadline_next_request(dd, WRITE);
218 rq = dd->next_rq[WRITE]; 289 if (!rq)
219 else 290 rq = deadline_next_request(dd, READ);
220 rq = dd->next_rq[READ];
221 291
222 if (rq && dd->batching < dd->fifo_batch) 292 if (rq && dd->batching < dd->fifo_batch)
 223 /* we have a next request and are still entitled to batch */ 293
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
231 if (reads) { 301 if (reads) {
232 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); 302 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
233 303
234 if (writes && (dd->starved++ >= dd->writes_starved)) 304 if (deadline_fifo_request(dd, WRITE) &&
305 (dd->starved++ >= dd->writes_starved))
235 goto dispatch_writes; 306 goto dispatch_writes;
236 307
237 data_dir = READ; 308 data_dir = READ;
@@ -260,21 +331,29 @@ dispatch_find_request:
260 /* 331 /*
261 * we are not running a batch, find best request for selected data_dir 332 * we are not running a batch, find best request for selected data_dir
262 */ 333 */
263 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { 334 next_rq = deadline_next_request(dd, data_dir);
335 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
264 /* 336 /*
265 * A deadline has expired, the last request was in the other 337 * A deadline has expired, the last request was in the other
266 * direction, or we have run out of higher-sectored requests. 338 * direction, or we have run out of higher-sectored requests.
267 * Start again from the request with the earliest expiry time. 339 * Start again from the request with the earliest expiry time.
268 */ 340 */
269 rq = rq_entry_fifo(dd->fifo_list[data_dir].next); 341 rq = deadline_fifo_request(dd, data_dir);
270 } else { 342 } else {
271 /* 343 /*
272 * The last req was the same dir and we have a next request in 344 * The last req was the same dir and we have a next request in
273 * sort order. No expired requests so continue on from here. 345 * sort order. No expired requests so continue on from here.
274 */ 346 */
275 rq = dd->next_rq[data_dir]; 347 rq = next_rq;
276 } 348 }
277 349
350 /*
351 * For a zoned block device, if we only have writes queued and none of
352 * them can be dispatched, rq will be NULL.
353 */
354 if (!rq)
355 return NULL;
356
278 dd->batching = 0; 357 dd->batching = 0;
279 358
280dispatch_request: 359dispatch_request:
@@ -284,17 +363,27 @@ dispatch_request:
284 dd->batching++; 363 dd->batching++;
285 deadline_move_request(dd, rq); 364 deadline_move_request(dd, rq);
286done: 365done:
366 /*
367 * If the request needs its target zone locked, do it.
368 */
369 blk_req_zone_write_lock(rq);
287 rq->rq_flags |= RQF_STARTED; 370 rq->rq_flags |= RQF_STARTED;
288 return rq; 371 return rq;
289} 372}
290 373
374/*
375 * One confusing aspect here is that we get called for a specific
 376 * hardware queue, but we may return a request that is for a
377 * different hardware queue. This is because mq-deadline has shared
378 * state for all hardware queues, in terms of sorting, FIFOs, etc.
379 */
291static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 380static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
292{ 381{
293 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 382 struct deadline_data *dd = hctx->queue->elevator->elevator_data;
294 struct request *rq; 383 struct request *rq;
295 384
296 spin_lock(&dd->lock); 385 spin_lock(&dd->lock);
297 rq = __dd_dispatch_request(hctx); 386 rq = __dd_dispatch_request(dd);
298 spin_unlock(&dd->lock); 387 spin_unlock(&dd->lock);
299 388
300 return rq; 389 return rq;
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
339 dd->front_merges = 1; 428 dd->front_merges = 1;
340 dd->fifo_batch = fifo_batch; 429 dd->fifo_batch = fifo_batch;
341 spin_lock_init(&dd->lock); 430 spin_lock_init(&dd->lock);
431 spin_lock_init(&dd->zone_lock);
342 INIT_LIST_HEAD(&dd->dispatch); 432 INIT_LIST_HEAD(&dd->dispatch);
343 433
344 q->elevator = eq; 434 q->elevator = eq;
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
395 struct deadline_data *dd = q->elevator->elevator_data; 485 struct deadline_data *dd = q->elevator->elevator_data;
396 const int data_dir = rq_data_dir(rq); 486 const int data_dir = rq_data_dir(rq);
397 487
488 /*
489 * This may be a requeue of a write request that has locked its
490 * target zone. If it is the case, this releases the zone lock.
491 */
492 blk_req_zone_write_unlock(rq);
493
398 if (blk_mq_sched_try_insert_merge(q, rq)) 494 if (blk_mq_sched_try_insert_merge(q, rq))
399 return; 495 return;
400 496
@@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
439 spin_unlock(&dd->lock); 535 spin_unlock(&dd->lock);
440} 536}
441 537
538/*
539 * For zoned block devices, write unlock the target zone of
540 * completed write requests. Do this while holding the zone lock
541 * spinlock so that the zone is never unlocked while deadline_fifo_request()
542 * while deadline_next_request() are executing.
543 */
544static void dd_completed_request(struct request *rq)
545{
546 struct request_queue *q = rq->q;
547
548 if (blk_queue_is_zoned(q)) {
549 struct deadline_data *dd = q->elevator->elevator_data;
550 unsigned long flags;
551
552 spin_lock_irqsave(&dd->zone_lock, flags);
553 blk_req_zone_write_unlock(rq);
554 spin_unlock_irqrestore(&dd->zone_lock, flags);
555 }
556}
557
442static bool dd_has_work(struct blk_mq_hw_ctx *hctx) 558static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
443{ 559{
444 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 560 struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
640 .ops.mq = { 756 .ops.mq = {
641 .insert_requests = dd_insert_requests, 757 .insert_requests = dd_insert_requests,
642 .dispatch_request = dd_dispatch_request, 758 .dispatch_request = dd_dispatch_request,
759 .completed_request = dd_completed_request,
643 .next_request = elv_rb_latter_request, 760 .next_request = elv_rb_latter_request,
644 .former_request = elv_rb_former_request, 761 .former_request = elv_rb_former_request,
645 .bio_merge = dd_bio_merge, 762 .bio_merge = dd_bio_merge,
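The mq-deadline hunks above gate write dispatch on a per-zone write lock: a zone is locked when a write is handed out by __dd_dispatch_request() and released again in dd_completed_request(), with dd->zone_lock serializing those transitions against the FIFO and sector-sorted scans. A minimal userspace sketch of that gating follows; the fixed-size zone bitmap, pthread spinlock, and fake_request type are illustrative stand-ins, not the kernel's blk_req_zone_write_lock()/blk_req_zone_write_unlock() implementation.

/* Userspace model of per-zone write locking (illustrative only). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_request {
	int zone;			/* target zone of a write */
};

static unsigned long zone_locked;	/* one bit per zone */
static pthread_spinlock_t zone_lock;	/* models dd->zone_lock */

/* Try to take the target zone lock; a write may only dispatch on success. */
static bool zone_write_trylock(struct fake_request *rq)
{
	bool ok;

	pthread_spin_lock(&zone_lock);
	ok = !(zone_locked & (1UL << rq->zone));
	if (ok)
		zone_locked |= 1UL << rq->zone;
	pthread_spin_unlock(&zone_lock);
	return ok;
}

/* Completion path: release the zone so the next queued write can go. */
static void zone_write_unlock(struct fake_request *rq)
{
	pthread_spin_lock(&zone_lock);
	zone_locked &= ~(1UL << rq->zone);
	pthread_spin_unlock(&zone_lock);
}

int main(void)
{
	struct fake_request a = { .zone = 3 }, b = { .zone = 3 };

	pthread_spin_init(&zone_lock, PTHREAD_PROCESS_PRIVATE);

	printf("dispatch a: %s\n", zone_write_trylock(&a) ? "ok" : "deferred");
	printf("dispatch b: %s\n", zone_write_trylock(&b) ? "ok" : "deferred");
	zone_write_unlock(&a);		/* a completes */
	printf("dispatch b again: %s\n", zone_write_trylock(&b) ? "ok" : "deferred");

	pthread_spin_destroy(&zone_lock);
	return 0;
}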
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 0af3a3db6fb0..82c44f7df911 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
301 continue; 301 continue;
302 bsd_start = le32_to_cpu(p->p_offset); 302 bsd_start = le32_to_cpu(p->p_offset);
303 bsd_size = le32_to_cpu(p->p_size); 303 bsd_size = le32_to_cpu(p->p_size);
304 if (memcmp(flavour, "bsd\0", 4) == 0) 304 /* FreeBSD has relative offset if C partition offset is zero */
305 if (memcmp(flavour, "bsd\0", 4) == 0 &&
306 le32_to_cpu(l->d_partitions[2].p_offset) == 0)
305 bsd_start += offset; 307 bsd_start += offset;
306 if (offset == bsd_start && size == bsd_size) 308 if (offset == bsd_start && size == bsd_size)
307 /* full parent partition, we have it already */ 309 /* full parent partition, we have it already */
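The parse_bsd() change above only adds the parent slice offset when the label flavour is "bsd" and the whole-disk 'c' partition starts at sector 0, i.e. when a FreeBSD label stores relative offsets. A small standalone C illustration of that rule, with made-up sample values:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * FreeBSD ("bsd") disklabels store partition offsets relative to the
 * slice when the whole-disk 'c' partition starts at 0; otherwise the
 * offsets are already absolute.  Mirrors the check in the hunk above.
 */
static uint32_t bsd_abs_start(const char *flavour, uint32_t c_offset,
			      uint32_t p_offset, uint32_t slice_offset)
{
	uint32_t start = p_offset;

	if (memcmp(flavour, "bsd\0", 4) == 0 && c_offset == 0)
		start += slice_offset;
	return start;
}

int main(void)
{
	/* made-up sample: slice at LBA 2048, partition 64 sectors in */
	printf("%u\n", (unsigned)bsd_abs_start("bsd", 0, 64, 2048));	/* 2112 */
	printf("%u\n", (unsigned)bsd_abs_start("bsd", 2048, 2112, 2048)); /* 2112 */
	return 0;
}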
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index edcfff974527..60b471f8621b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -384,9 +384,10 @@ out_put_request:
384 384
385/** 385/**
386 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl 386 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
387 * @file: file this ioctl operates on (optional)
388 * @q: request queue to send scsi commands down 387 * @q: request queue to send scsi commands down
 389 * @disk: gendisk to operate on (optional) 388 * @disk: gendisk to operate on (optional)
389 * @mode: mode used to open the file through which the ioctl has been
390 * submitted
390 * @sic: userspace structure describing the command to perform 391 * @sic: userspace structure describing the command to perform
391 * 392 *
392 * Send down the scsi command described by @sic to the device below 393 * Send down the scsi command described by @sic to the device below
@@ -415,10 +416,10 @@ out_put_request:
415 * Positive numbers returned are the compacted SCSI error codes (4 416 * Positive numbers returned are the compacted SCSI error codes (4
416 * bytes in one int) where the lowest byte is the SCSI status. 417 * bytes in one int) where the lowest byte is the SCSI status.
417 */ 418 */
418#define OMAX_SB_LEN 16 /* For backward compatibility */
419int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, 419int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
420 struct scsi_ioctl_command __user *sic) 420 struct scsi_ioctl_command __user *sic)
421{ 421{
422 enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
422 struct request *rq; 423 struct request *rq;
423 struct scsi_request *req; 424 struct scsi_request *req;
424 int err; 425 int err;
@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
692 if (bd && bd == bd->bd_contains) 693 if (bd && bd == bd->bd_contains)
693 return 0; 694 return 0;
694 695
695 /* Actually none of these is particularly useful on a partition,
696 * but they are safe.
697 */
698 switch (cmd) {
699 case SCSI_IOCTL_GET_IDLUN:
700 case SCSI_IOCTL_GET_BUS_NUMBER:
701 case SCSI_IOCTL_GET_PCI:
702 case SCSI_IOCTL_PROBE_HOST:
703 case SG_GET_VERSION_NUM:
704 case SG_SET_TIMEOUT:
705 case SG_GET_TIMEOUT:
706 case SG_GET_RESERVED_SIZE:
707 case SG_SET_RESERVED_SIZE:
708 case SG_EMULATED_HOST:
709 return 0;
710 case CDROM_GET_CAPABILITY:
711 /* Keep this until we remove the printk below. udev sends it
712 * and we do not want to spam dmesg about it. CD-ROMs do
713 * not have partitions, so we get here only for disks.
714 */
715 return -ENOIOCTLCMD;
716 default:
717 break;
718 }
719
720 if (capable(CAP_SYS_RAWIO)) 696 if (capable(CAP_SYS_RAWIO))
721 return 0; 697 return 0;
722 698
723 /* In particular, rule out all resets and host-specific ioctls. */
724 printk_ratelimited(KERN_WARNING
725 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
726
727 return -ENOIOCTLCMD; 699 return -ENOIOCTLCMD;
728} 700}
729EXPORT_SYMBOL(scsi_verify_blk_ioctl); 701EXPORT_SYMBOL(scsi_verify_blk_ioctl);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index f7911963bb79..20360e040425 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -106,6 +106,7 @@ config CRYPTO_KPP
106config CRYPTO_ACOMP2 106config CRYPTO_ACOMP2
107 tristate 107 tristate
108 select CRYPTO_ALGAPI2 108 select CRYPTO_ALGAPI2
109 select SGL_ALLOC
109 110
110config CRYPTO_ACOMP 111config CRYPTO_ACOMP
111 tristate 112 tristate
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 2075e2c4e7df..968bbcf65c94 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
140 return ret; 140 return ret;
141} 141}
142 142
143static void crypto_scomp_sg_free(struct scatterlist *sgl)
144{
145 int i, n;
146 struct page *page;
147
148 if (!sgl)
149 return;
150
151 n = sg_nents(sgl);
152 for_each_sg(sgl, sgl, n, i) {
153 page = sg_page(sgl);
154 if (page)
155 __free_page(page);
156 }
157
158 kfree(sgl);
159}
160
161static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp)
162{
163 struct scatterlist *sgl;
164 struct page *page;
165 int i, n;
166
167 n = ((size - 1) >> PAGE_SHIFT) + 1;
168
169 sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp);
170 if (!sgl)
171 return NULL;
172
173 sg_init_table(sgl, n);
174
175 for (i = 0; i < n; i++) {
176 page = alloc_page(gfp);
177 if (!page)
178 goto err;
179 sg_set_page(sgl + i, page, PAGE_SIZE, 0);
180 }
181
182 return sgl;
183
184err:
185 sg_mark_end(sgl + i);
186 crypto_scomp_sg_free(sgl);
187 return NULL;
188}
189
190static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) 143static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
191{ 144{
192 struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); 145 struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
220 scratch_dst, &req->dlen, *ctx); 173 scratch_dst, &req->dlen, *ctx);
221 if (!ret) { 174 if (!ret) {
222 if (!req->dst) { 175 if (!req->dst) {
223 req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC); 176 req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
224 if (!req->dst) 177 if (!req->dst)
225 goto out; 178 goto out;
226 } 179 }
@@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm)
274 227
275 crt->compress = scomp_acomp_compress; 228 crt->compress = scomp_acomp_compress;
276 crt->decompress = scomp_acomp_decompress; 229 crt->decompress = scomp_acomp_decompress;
277 crt->dst_free = crypto_scomp_sg_free; 230 crt->dst_free = sgl_free;
278 crt->reqsize = sizeof(void *); 231 crt->reqsize = sizeof(void *);
279 232
280 return 0; 233 return 0;
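The scompress.c hunks drop the driver-local scatterlist allocator in favour of the generic sgl_alloc()/sgl_free() helpers selected via SGL_ALLOC. The sketch below only models the allocation arithmetic those helpers take over (enough page-sized chunks to cover a byte count, freed as a batch); the plain pointer array is a userspace stand-in, not the kernel's struct scatterlist.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Allocate ceil(size / PAGE_SIZE) page-sized chunks, or NULL on failure. */
static void **chunks_alloc(size_t size, size_t *nents)
{
	size_t n = (size + PAGE_SIZE - 1) / PAGE_SIZE;	/* pages needed */
	void **chunks = calloc(n, sizeof(*chunks));
	size_t i;

	if (!chunks)
		return NULL;
	for (i = 0; i < n; i++) {
		chunks[i] = malloc(PAGE_SIZE);
		if (!chunks[i])
			goto err;
	}
	*nents = n;
	return chunks;
err:
	while (i--)
		free(chunks[i]);
	free(chunks);
	return NULL;
}

/* Release every chunk and the table itself. */
static void chunks_free(void **chunks, size_t nents)
{
	for (size_t i = 0; i < nents; i++)
		free(chunks[i]);
	free(chunks);
}

int main(void)
{
	size_t nents = 0;
	void **sgl = chunks_alloc(10000, &nents);	/* needs 3 pages */

	printf("entries: %zu\n", sgl ? nents : 0);
	if (sgl)
		chunks_free(sgl, nents);
	return 0;
}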
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 442e777bdfb2..728075214959 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
6619 6619
6620#ifdef DAC960_GAM_MINOR 6620#ifdef DAC960_GAM_MINOR
6621 6621
6622/* 6622static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo)
6623 * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
6624*/
6625
6626static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6627 unsigned long Argument)
6628{ 6623{
6629 long ErrorCode = 0;
6630 if (!capable(CAP_SYS_ADMIN)) return -EACCES;
6631
6632 mutex_lock(&DAC960_mutex);
6633 switch (Request)
6634 {
6635 case DAC960_IOCTL_GET_CONTROLLER_COUNT:
6636 ErrorCode = DAC960_ControllerCount;
6637 break;
6638 case DAC960_IOCTL_GET_CONTROLLER_INFO:
6639 {
6640 DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
6641 (DAC960_ControllerInfo_T __user *) Argument;
6642 DAC960_ControllerInfo_T ControllerInfo; 6624 DAC960_ControllerInfo_T ControllerInfo;
6643 DAC960_Controller_T *Controller; 6625 DAC960_Controller_T *Controller;
6644 int ControllerNumber; 6626 int ControllerNumber;
6627 long ErrorCode;
6628
6645 if (UserSpaceControllerInfo == NULL) 6629 if (UserSpaceControllerInfo == NULL)
6646 ErrorCode = -EINVAL; 6630 ErrorCode = -EINVAL;
6647 else ErrorCode = get_user(ControllerNumber, 6631 else ErrorCode = get_user(ControllerNumber,
6648 &UserSpaceControllerInfo->ControllerNumber); 6632 &UserSpaceControllerInfo->ControllerNumber);
6649 if (ErrorCode != 0) 6633 if (ErrorCode != 0)
6650 break; 6634 goto out;
6651 ErrorCode = -ENXIO; 6635 ErrorCode = -ENXIO;
6652 if (ControllerNumber < 0 || 6636 if (ControllerNumber < 0 ||
6653 ControllerNumber > DAC960_ControllerCount - 1) { 6637 ControllerNumber > DAC960_ControllerCount - 1) {
6654 break; 6638 goto out;
6655 } 6639 }
6656 Controller = DAC960_Controllers[ControllerNumber]; 6640 Controller = DAC960_Controllers[ControllerNumber];
6657 if (Controller == NULL) 6641 if (Controller == NULL)
6658 break; 6642 goto out;
6659 memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); 6643 memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
6660 ControllerInfo.ControllerNumber = ControllerNumber; 6644 ControllerInfo.ControllerNumber = ControllerNumber;
6661 ControllerInfo.FirmwareType = Controller->FirmwareType; 6645 ControllerInfo.FirmwareType = Controller->FirmwareType;
@@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6670 strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); 6654 strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
6671 ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, 6655 ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
6672 sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); 6656 sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
6673 break; 6657out:
6674 } 6658 return ErrorCode;
6675 case DAC960_IOCTL_V1_EXECUTE_COMMAND: 6659}
6676 { 6660
6677 DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = 6661static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand)
6678 (DAC960_V1_UserCommand_T __user *) Argument; 6662{
6679 DAC960_V1_UserCommand_T UserCommand; 6663 DAC960_V1_UserCommand_T UserCommand;
6680 DAC960_Controller_T *Controller; 6664 DAC960_Controller_T *Controller;
6681 DAC960_Command_T *Command = NULL; 6665 DAC960_Command_T *Command = NULL;
@@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6688 int ControllerNumber, DataTransferLength; 6672 int ControllerNumber, DataTransferLength;
6689 unsigned char *DataTransferBuffer = NULL; 6673 unsigned char *DataTransferBuffer = NULL;
6690 dma_addr_t DataTransferBufferDMA; 6674 dma_addr_t DataTransferBufferDMA;
6675 long ErrorCode;
6676
6691 if (UserSpaceUserCommand == NULL) { 6677 if (UserSpaceUserCommand == NULL) {
6692 ErrorCode = -EINVAL; 6678 ErrorCode = -EINVAL;
6693 break; 6679 goto out;
6694 } 6680 }
6695 if (copy_from_user(&UserCommand, UserSpaceUserCommand, 6681 if (copy_from_user(&UserCommand, UserSpaceUserCommand,
6696 sizeof(DAC960_V1_UserCommand_T))) { 6682 sizeof(DAC960_V1_UserCommand_T))) {
6697 ErrorCode = -EFAULT; 6683 ErrorCode = -EFAULT;
6698 break; 6684 goto out;
6699 } 6685 }
6700 ControllerNumber = UserCommand.ControllerNumber; 6686 ControllerNumber = UserCommand.ControllerNumber;
6701 ErrorCode = -ENXIO; 6687 ErrorCode = -ENXIO;
6702 if (ControllerNumber < 0 || 6688 if (ControllerNumber < 0 ||
6703 ControllerNumber > DAC960_ControllerCount - 1) 6689 ControllerNumber > DAC960_ControllerCount - 1)
6704 break; 6690 goto out;
6705 Controller = DAC960_Controllers[ControllerNumber]; 6691 Controller = DAC960_Controllers[ControllerNumber];
6706 if (Controller == NULL) 6692 if (Controller == NULL)
6707 break; 6693 goto out;
6708 ErrorCode = -EINVAL; 6694 ErrorCode = -EINVAL;
6709 if (Controller->FirmwareType != DAC960_V1_Controller) 6695 if (Controller->FirmwareType != DAC960_V1_Controller)
6710 break; 6696 goto out;
6711 CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; 6697 CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
6712 DataTransferLength = UserCommand.DataTransferLength; 6698 DataTransferLength = UserCommand.DataTransferLength;
6713 if (CommandOpcode & 0x80) 6699 if (CommandOpcode & 0x80)
6714 break; 6700 goto out;
6715 if (CommandOpcode == DAC960_V1_DCDB) 6701 if (CommandOpcode == DAC960_V1_DCDB)
6716 { 6702 {
6717 if (copy_from_user(&DCDB, UserCommand.DCDB, 6703 if (copy_from_user(&DCDB, UserCommand.DCDB,
6718 sizeof(DAC960_V1_DCDB_T))) { 6704 sizeof(DAC960_V1_DCDB_T))) {
6719 ErrorCode = -EFAULT; 6705 ErrorCode = -EFAULT;
6720 break; 6706 goto out;
6721 } 6707 }
6722 if (DCDB.Channel >= DAC960_V1_MaxChannels) 6708 if (DCDB.Channel >= DAC960_V1_MaxChannels)
6723 break; 6709 goto out;
6724 if (!((DataTransferLength == 0 && 6710 if (!((DataTransferLength == 0 &&
6725 DCDB.Direction 6711 DCDB.Direction
6726 == DAC960_V1_DCDB_NoDataTransfer) || 6712 == DAC960_V1_DCDB_NoDataTransfer) ||
@@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6730 (DataTransferLength < 0 && 6716 (DataTransferLength < 0 &&
6731 DCDB.Direction 6717 DCDB.Direction
6732 == DAC960_V1_DCDB_DataTransferSystemToDevice))) 6718 == DAC960_V1_DCDB_DataTransferSystemToDevice)))
6733 break; 6719 goto out;
6734 if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) 6720 if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
6735 != abs(DataTransferLength)) 6721 != abs(DataTransferLength))
6736 break; 6722 goto out;
6737 DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, 6723 DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
6738 sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); 6724 sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
6739 if (DCDB_IOBUF == NULL) { 6725 if (DCDB_IOBUF == NULL) {
6740 ErrorCode = -ENOMEM; 6726 ErrorCode = -ENOMEM;
6741 break; 6727 goto out;
6742 } 6728 }
6743 } 6729 }
6744 ErrorCode = -ENOMEM; 6730 ErrorCode = -ENOMEM;
@@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6748 DataTransferLength, 6734 DataTransferLength,
6749 &DataTransferBufferDMA); 6735 &DataTransferBufferDMA);
6750 if (DataTransferBuffer == NULL) 6736 if (DataTransferBuffer == NULL)
6751 break; 6737 goto out;
6752 } 6738 }
6753 else if (DataTransferLength < 0) 6739 else if (DataTransferLength < 0)
6754 { 6740 {
6755 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, 6741 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
6756 -DataTransferLength, &DataTransferBufferDMA); 6742 -DataTransferLength, &DataTransferBufferDMA);
6757 if (DataTransferBuffer == NULL) 6743 if (DataTransferBuffer == NULL)
6758 break; 6744 goto out;
6759 if (copy_from_user(DataTransferBuffer, 6745 if (copy_from_user(DataTransferBuffer,
6760 UserCommand.DataTransferBuffer, 6746 UserCommand.DataTransferBuffer,
6761 -DataTransferLength)) { 6747 -DataTransferLength)) {
6762 ErrorCode = -EFAULT; 6748 ErrorCode = -EFAULT;
6763 break; 6749 goto out;
6764 } 6750 }
6765 } 6751 }
6766 if (CommandOpcode == DAC960_V1_DCDB) 6752 if (CommandOpcode == DAC960_V1_DCDB)
@@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6837 if (DCDB_IOBUF != NULL) 6823 if (DCDB_IOBUF != NULL)
6838 pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), 6824 pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
6839 DCDB_IOBUF, DCDB_IOBUFDMA); 6825 DCDB_IOBUF, DCDB_IOBUFDMA);
6840 break; 6826 out:
6841 } 6827 return ErrorCode;
6842 case DAC960_IOCTL_V2_EXECUTE_COMMAND: 6828}
6843 { 6829
6844 DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = 6830static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand)
6845 (DAC960_V2_UserCommand_T __user *) Argument; 6831{
6846 DAC960_V2_UserCommand_T UserCommand; 6832 DAC960_V2_UserCommand_T UserCommand;
6847 DAC960_Controller_T *Controller; 6833 DAC960_Controller_T *Controller;
6848 DAC960_Command_T *Command = NULL; 6834 DAC960_Command_T *Command = NULL;
@@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6855 dma_addr_t DataTransferBufferDMA; 6841 dma_addr_t DataTransferBufferDMA;
6856 unsigned char *RequestSenseBuffer = NULL; 6842 unsigned char *RequestSenseBuffer = NULL;
6857 dma_addr_t RequestSenseBufferDMA; 6843 dma_addr_t RequestSenseBufferDMA;
6844 long ErrorCode = -EINVAL;
6858 6845
6859 ErrorCode = -EINVAL;
6860 if (UserSpaceUserCommand == NULL) 6846 if (UserSpaceUserCommand == NULL)
6861 break; 6847 goto out;
6862 if (copy_from_user(&UserCommand, UserSpaceUserCommand, 6848 if (copy_from_user(&UserCommand, UserSpaceUserCommand,
6863 sizeof(DAC960_V2_UserCommand_T))) { 6849 sizeof(DAC960_V2_UserCommand_T))) {
6864 ErrorCode = -EFAULT; 6850 ErrorCode = -EFAULT;
6865 break; 6851 goto out;
6866 } 6852 }
6867 ErrorCode = -ENXIO; 6853 ErrorCode = -ENXIO;
6868 ControllerNumber = UserCommand.ControllerNumber; 6854 ControllerNumber = UserCommand.ControllerNumber;
6869 if (ControllerNumber < 0 || 6855 if (ControllerNumber < 0 ||
6870 ControllerNumber > DAC960_ControllerCount - 1) 6856 ControllerNumber > DAC960_ControllerCount - 1)
6871 break; 6857 goto out;
6872 Controller = DAC960_Controllers[ControllerNumber]; 6858 Controller = DAC960_Controllers[ControllerNumber];
6873 if (Controller == NULL) 6859 if (Controller == NULL)
6874 break; 6860 goto out;
6875 if (Controller->FirmwareType != DAC960_V2_Controller){ 6861 if (Controller->FirmwareType != DAC960_V2_Controller){
6876 ErrorCode = -EINVAL; 6862 ErrorCode = -EINVAL;
6877 break; 6863 goto out;
6878 } 6864 }
6879 DataTransferLength = UserCommand.DataTransferLength; 6865 DataTransferLength = UserCommand.DataTransferLength;
6880 ErrorCode = -ENOMEM; 6866 ErrorCode = -ENOMEM;
@@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6884 DataTransferLength, 6870 DataTransferLength,
6885 &DataTransferBufferDMA); 6871 &DataTransferBufferDMA);
6886 if (DataTransferBuffer == NULL) 6872 if (DataTransferBuffer == NULL)
6887 break; 6873 goto out;
6888 } 6874 }
6889 else if (DataTransferLength < 0) 6875 else if (DataTransferLength < 0)
6890 { 6876 {
6891 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, 6877 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
6892 -DataTransferLength, &DataTransferBufferDMA); 6878 -DataTransferLength, &DataTransferBufferDMA);
6893 if (DataTransferBuffer == NULL) 6879 if (DataTransferBuffer == NULL)
6894 break; 6880 goto out;
6895 if (copy_from_user(DataTransferBuffer, 6881 if (copy_from_user(DataTransferBuffer,
6896 UserCommand.DataTransferBuffer, 6882 UserCommand.DataTransferBuffer,
6897 -DataTransferLength)) { 6883 -DataTransferLength)) {
@@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7001 if (RequestSenseBuffer != NULL) 6987 if (RequestSenseBuffer != NULL)
7002 pci_free_consistent(Controller->PCIDevice, RequestSenseLength, 6988 pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
7003 RequestSenseBuffer, RequestSenseBufferDMA); 6989 RequestSenseBuffer, RequestSenseBufferDMA);
7004 break; 6990out:
7005 } 6991 return ErrorCode;
7006 case DAC960_IOCTL_V2_GET_HEALTH_STATUS: 6992}
7007 { 6993
7008 DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = 6994static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus)
7009 (DAC960_V2_GetHealthStatus_T __user *) Argument; 6995{
7010 DAC960_V2_GetHealthStatus_T GetHealthStatus; 6996 DAC960_V2_GetHealthStatus_T GetHealthStatus;
7011 DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; 6997 DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
7012 DAC960_Controller_T *Controller; 6998 DAC960_Controller_T *Controller;
7013 int ControllerNumber; 6999 int ControllerNumber;
7000 long ErrorCode;
7001
7014 if (UserSpaceGetHealthStatus == NULL) { 7002 if (UserSpaceGetHealthStatus == NULL) {
7015 ErrorCode = -EINVAL; 7003 ErrorCode = -EINVAL;
7016 break; 7004 goto out;
7017 } 7005 }
7018 if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, 7006 if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
7019 sizeof(DAC960_V2_GetHealthStatus_T))) { 7007 sizeof(DAC960_V2_GetHealthStatus_T))) {
7020 ErrorCode = -EFAULT; 7008 ErrorCode = -EFAULT;
7021 break; 7009 goto out;
7022 } 7010 }
7023 ErrorCode = -ENXIO; 7011 ErrorCode = -ENXIO;
7024 ControllerNumber = GetHealthStatus.ControllerNumber; 7012 ControllerNumber = GetHealthStatus.ControllerNumber;
7025 if (ControllerNumber < 0 || 7013 if (ControllerNumber < 0 ||
7026 ControllerNumber > DAC960_ControllerCount - 1) 7014 ControllerNumber > DAC960_ControllerCount - 1)
7027 break; 7015 goto out;
7028 Controller = DAC960_Controllers[ControllerNumber]; 7016 Controller = DAC960_Controllers[ControllerNumber];
7029 if (Controller == NULL) 7017 if (Controller == NULL)
7030 break; 7018 goto out;
7031 if (Controller->FirmwareType != DAC960_V2_Controller) { 7019 if (Controller->FirmwareType != DAC960_V2_Controller) {
7032 ErrorCode = -EINVAL; 7020 ErrorCode = -EINVAL;
7033 break; 7021 goto out;
7034 } 7022 }
7035 if (copy_from_user(&HealthStatusBuffer, 7023 if (copy_from_user(&HealthStatusBuffer,
7036 GetHealthStatus.HealthStatusBuffer, 7024 GetHealthStatus.HealthStatusBuffer,
7037 sizeof(DAC960_V2_HealthStatusBuffer_T))) { 7025 sizeof(DAC960_V2_HealthStatusBuffer_T))) {
7038 ErrorCode = -EFAULT; 7026 ErrorCode = -EFAULT;
7039 break; 7027 goto out;
7040 } 7028 }
7041 ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, 7029 ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
7042 !(Controller->V2.HealthStatusBuffer->StatusChangeCounter 7030 !(Controller->V2.HealthStatusBuffer->StatusChangeCounter
@@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7046 DAC960_MonitoringTimerInterval); 7034 DAC960_MonitoringTimerInterval);
7047 if (ErrorCode == -ERESTARTSYS) { 7035 if (ErrorCode == -ERESTARTSYS) {
7048 ErrorCode = -EINTR; 7036 ErrorCode = -EINTR;
7049 break; 7037 goto out;
7050 } 7038 }
7051 if (copy_to_user(GetHealthStatus.HealthStatusBuffer, 7039 if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
7052 Controller->V2.HealthStatusBuffer, 7040 Controller->V2.HealthStatusBuffer,
@@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7054 ErrorCode = -EFAULT; 7042 ErrorCode = -EFAULT;
7055 else 7043 else
7056 ErrorCode = 0; 7044 ErrorCode = 0;
7057 } 7045
7046out:
7047 return ErrorCode;
7048}
7049
7050/*
7051 * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
7052*/
7053
7054static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7055 unsigned long Argument)
7056{
7057 long ErrorCode = 0;
7058 void __user *argp = (void __user *)Argument;
7059 if (!capable(CAP_SYS_ADMIN)) return -EACCES;
7060
7061 mutex_lock(&DAC960_mutex);
7062 switch (Request)
7063 {
7064 case DAC960_IOCTL_GET_CONTROLLER_COUNT:
7065 ErrorCode = DAC960_ControllerCount;
7066 break;
7067 case DAC960_IOCTL_GET_CONTROLLER_INFO:
7068 ErrorCode = DAC960_gam_get_controller_info(argp);
7069 break;
7070 case DAC960_IOCTL_V1_EXECUTE_COMMAND:
7071 ErrorCode = DAC960_gam_v1_execute_command(argp);
7072 break;
7073 case DAC960_IOCTL_V2_EXECUTE_COMMAND:
7074 ErrorCode = DAC960_gam_v2_execute_command(argp);
7075 break;
7076 case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
7077 ErrorCode = DAC960_gam_v2_get_health_status(argp);
7058 break; 7078 break;
7059 default: 7079 default:
7060 ErrorCode = -ENOTTY; 7080 ErrorCode = -ENOTTY;
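The DAC960 rework splits the single ioctl switch into one helper per command, trading the old switch-case 'break' exits for a 'goto out' cleanup path so each helper owns its cleanup and returns its own error code. A compact standalone example of that idiom, with hypothetical names rather than the driver's:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Shape of the per-command helpers: validate, bail to a single cleanup
 * label on error, and return the error code to the ioctl dispatcher.
 */
static long demo_command(const int *user_arg)
{
	long err;
	int *buf = NULL;

	if (!user_arg) {
		err = -EINVAL;
		goto out;
	}
	buf = malloc(sizeof(*buf));
	if (!buf) {
		err = -ENOMEM;
		goto out;
	}
	*buf = *user_arg;	/* stands in for copy_from_user() */
	err = 0;
out:
	free(buf);		/* free(NULL) is a no-op */
	return err;
}

int main(void)
{
	int v = 42;

	printf("%ld %ld\n", demo_command(&v), demo_command(NULL));
	return 0;
}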
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 40579d0cb3d1..ad9b687a236a 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
20 tristate "Null test block driver" 20 tristate "Null test block driver"
21 select CONFIGFS_FS 21 select CONFIGFS_FS
22 22
23config BLK_DEV_NULL_BLK_FAULT_INJECTION
24 bool "Support fault injection for Null test block driver"
25 depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
26
23config BLK_DEV_FD 27config BLK_DEV_FD
24 tristate "Normal floppy disk support" 28 tristate "Normal floppy disk support"
25 depends on ARCH_MAY_HAVE_PC_FDC 29 depends on ARCH_MAY_HAVE_PC_FDC
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9220f8e833d0..c0ebda1283cc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -112,8 +112,7 @@ enum frame_flags {
112struct frame { 112struct frame {
113 struct list_head head; 113 struct list_head head;
114 u32 tag; 114 u32 tag;
115 struct timeval sent; /* high-res time packet was sent */ 115 ktime_t sent; /* high-res time packet was sent */
116 u32 sent_jiffs; /* low-res jiffies-based sent time */
117 ulong waited; 116 ulong waited;
118 ulong waited_total; 117 ulong waited_total;
119 struct aoetgt *t; /* parent target I belong to */ 118 struct aoetgt *t; /* parent target I belong to */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 812fed069708..540bb60cd071 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
398 398
399 skb = skb_clone(f->skb, GFP_ATOMIC); 399 skb = skb_clone(f->skb, GFP_ATOMIC);
400 if (skb) { 400 if (skb) {
401 do_gettimeofday(&f->sent); 401 f->sent = ktime_get();
402 f->sent_jiffs = (u32) jiffies;
403 __skb_queue_head_init(&queue); 402 __skb_queue_head_init(&queue);
404 __skb_queue_tail(&queue, skb); 403 __skb_queue_tail(&queue, skb);
405 aoenet_xmit(&queue); 404 aoenet_xmit(&queue);
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
489 skb = skb_clone(skb, GFP_ATOMIC); 488 skb = skb_clone(skb, GFP_ATOMIC);
490 if (skb == NULL) 489 if (skb == NULL)
491 return; 490 return;
492 do_gettimeofday(&f->sent); 491 f->sent = ktime_get();
493 f->sent_jiffs = (u32) jiffies;
494 __skb_queue_head_init(&queue); 492 __skb_queue_head_init(&queue);
495 __skb_queue_tail(&queue, skb); 493 __skb_queue_tail(&queue, skb);
496 aoenet_xmit(&queue); 494 aoenet_xmit(&queue);
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
499static int 497static int
500tsince_hr(struct frame *f) 498tsince_hr(struct frame *f)
501{ 499{
502 struct timeval now; 500 u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
503 int n;
504 501
505 do_gettimeofday(&now); 502 /* delta is normally under 4.2 seconds, avoid 64-bit division */
506 n = now.tv_usec - f->sent.tv_usec; 503 if (likely(delta <= UINT_MAX))
507 n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; 504 return (u32)delta / NSEC_PER_USEC;
508 505
509 if (n < 0) 506 /* avoid overflow after 71 minutes */
510 n = -n; 507 if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
508 return INT_MAX;
511 509
512 /* For relatively long periods, use jiffies to avoid 510 return div_u64(delta, NSEC_PER_USEC);
513 * discrepancies caused by updates to the system time.
514 *
515 * On system with HZ of 1000, 32-bits is over 49 days
516 * worth of jiffies, or over 71 minutes worth of usecs.
517 *
518 * Jiffies overflow is handled by subtraction of unsigned ints:
519 * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
520 * $3 = 4
521 * (gdb)
522 */
523 if (n > USEC_PER_SEC / 4) {
524 n = ((u32) jiffies) - f->sent_jiffs;
525 n *= USEC_PER_SEC / HZ;
526 }
527
528 return n;
529} 511}
530 512
531static int 513static int
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
589 nf->waited = 0; 571 nf->waited = 0;
590 nf->waited_total = f->waited_total; 572 nf->waited_total = f->waited_total;
591 nf->sent = f->sent; 573 nf->sent = f->sent;
592 nf->sent_jiffs = f->sent_jiffs;
593 f->skb = skb; 574 f->skb = skb;
594 575
595 return nf; 576 return nf;
@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
633 614
634 skb = skb_clone(f->skb, GFP_ATOMIC); 615 skb = skb_clone(f->skb, GFP_ATOMIC);
635 if (skb) { 616 if (skb) {
636 do_gettimeofday(&f->sent); 617 f->sent = ktime_get();
637 f->sent_jiffs = (u32) jiffies;
638 __skb_queue_head_init(&queue); 618 __skb_queue_head_init(&queue);
639 __skb_queue_tail(&queue, skb); 619 __skb_queue_tail(&queue, skb);
640 aoenet_xmit(&queue); 620 aoenet_xmit(&queue);
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
1432 d->timer.function = rexmit_timer; 1412 d->timer.function = rexmit_timer;
1433 1413
1434 skb = skb_clone(skb, GFP_ATOMIC); 1414 skb = skb_clone(skb, GFP_ATOMIC);
1435 if (skb) { 1415 if (skb)
1436 do_gettimeofday(&f->sent); 1416 f->sent = ktime_get();
1437 f->sent_jiffs = (u32) jiffies;
1438 }
1439 1417
1440 return skb; 1418 return skb;
1441} 1419}
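With the conversion above, aoe keeps a single ktime_t timestamp per frame and computes the elapsed time in microseconds, using a cheap 32-bit division while the delta fits in 32 bits (about 4.2 seconds) and clamping at INT_MAX once the value would overflow an int (about 71 minutes). The standalone sketch below mirrors that arithmetic on top of clock_gettime(CLOCK_MONOTONIC); the helper names are illustrative only.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_USEC 1000u

/* Monotonic timestamp in nanoseconds, standing in for ktime_get(). */
static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/*
 * Elapsed microseconds since 'sent', mirroring the new tsince_hr():
 * 32-bit division while the delta fits in 32 bits, INT_MAX once the
 * result would overflow an int.
 */
static int elapsed_usecs(uint64_t sent)
{
	uint64_t delta = mono_ns() - sent;

	if (delta <= UINT_MAX)
		return (uint32_t)delta / NSEC_PER_USEC;
	if (delta > (uint64_t)INT_MAX * NSEC_PER_USEC)
		return INT_MAX;
	return (int)(delta / NSEC_PER_USEC);
}

int main(void)
{
	uint64_t sent = mono_ns();
	struct timespec nap = { .tv_sec = 0, .tv_nsec = 2000000 }; /* 2 ms */

	nanosleep(&nap, NULL);
	printf("waited ~%d us\n", elapsed_usecs(sent));
	return 0;
}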
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index bd97908c766f..9f4e6f502b84 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
953 struct drbd_bm_aio_ctx *ctx = bio->bi_private; 953 struct drbd_bm_aio_ctx *ctx = bio->bi_private;
954 struct drbd_device *device = ctx->device; 954 struct drbd_device *device = ctx->device;
955 struct drbd_bitmap *b = device->bitmap; 955 struct drbd_bitmap *b = device->bitmap;
956 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); 956 unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
957 957
958 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && 958 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
959 !bm_test_page_unchanged(b->bm_pages[idx])) 959 !bm_test_page_unchanged(b->bm_pages[idx]))
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ad0477ae820f..6655893a3a7a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -12,9 +12,9 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/blk-mq.h> 13#include <linux/blk-mq.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <linux/lightnvm.h>
16#include <linux/configfs.h> 15#include <linux/configfs.h>
17#include <linux/badblocks.h> 16#include <linux/badblocks.h>
17#include <linux/fault-inject.h>
18 18
19#define SECTOR_SHIFT 9 19#define SECTOR_SHIFT 9
20#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 20#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
@@ -27,6 +27,10 @@
27#define TICKS_PER_SEC 50ULL 27#define TICKS_PER_SEC 50ULL
28#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) 28#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
29 29
30#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
31static DECLARE_FAULT_ATTR(null_timeout_attr);
32#endif
33
30static inline u64 mb_per_tick(int mbps) 34static inline u64 mb_per_tick(int mbps)
31{ 35{
32 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); 36 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
@@ -107,7 +111,6 @@ struct nullb_device {
107 unsigned int hw_queue_depth; /* queue depth */ 111 unsigned int hw_queue_depth; /* queue depth */
108 unsigned int index; /* index of the disk, only valid with a disk */ 112 unsigned int index; /* index of the disk, only valid with a disk */
109 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ 113 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
110 bool use_lightnvm; /* register as a LightNVM device */
111 bool blocking; /* blocking blk-mq device */ 114 bool blocking; /* blocking blk-mq device */
112 bool use_per_node_hctx; /* use per-node allocation for hardware context */ 115 bool use_per_node_hctx; /* use per-node allocation for hardware context */
113 bool power; /* power on/off the device */ 116 bool power; /* power on/off the device */
@@ -121,7 +124,6 @@ struct nullb {
121 unsigned int index; 124 unsigned int index;
122 struct request_queue *q; 125 struct request_queue *q;
123 struct gendisk *disk; 126 struct gendisk *disk;
124 struct nvm_dev *ndev;
125 struct blk_mq_tag_set *tag_set; 127 struct blk_mq_tag_set *tag_set;
126 struct blk_mq_tag_set __tag_set; 128 struct blk_mq_tag_set __tag_set;
127 unsigned int queue_depth; 129 unsigned int queue_depth;
@@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list);
139static struct mutex lock; 141static struct mutex lock;
140static int null_major; 142static int null_major;
141static DEFINE_IDA(nullb_indexes); 143static DEFINE_IDA(nullb_indexes);
142static struct kmem_cache *ppa_cache;
143static struct blk_mq_tag_set tag_set; 144static struct blk_mq_tag_set tag_set;
144 145
145enum { 146enum {
@@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE;
166module_param_named(home_node, g_home_node, int, S_IRUGO); 167module_param_named(home_node, g_home_node, int, S_IRUGO);
167MODULE_PARM_DESC(home_node, "Home node for the device"); 168MODULE_PARM_DESC(home_node, "Home node for the device");
168 169
170#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
171static char g_timeout_str[80];
172module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
173#endif
174
169static int g_queue_mode = NULL_Q_MQ; 175static int g_queue_mode = NULL_Q_MQ;
170 176
171static int null_param_store_val(const char *str, int *val, int min, int max) 177static int null_param_store_val(const char *str, int *val, int min, int max)
@@ -208,10 +214,6 @@ static int nr_devices = 1;
208module_param(nr_devices, int, S_IRUGO); 214module_param(nr_devices, int, S_IRUGO);
209MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 215MODULE_PARM_DESC(nr_devices, "Number of devices to register");
210 216
211static bool g_use_lightnvm;
212module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
213MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
214
215static bool g_blocking; 217static bool g_blocking;
216module_param_named(blocking, g_blocking, bool, S_IRUGO); 218module_param_named(blocking, g_blocking, bool, S_IRUGO);
217MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 219MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint);
345NULLB_DEVICE_ATTR(irqmode, uint); 347NULLB_DEVICE_ATTR(irqmode, uint);
346NULLB_DEVICE_ATTR(hw_queue_depth, uint); 348NULLB_DEVICE_ATTR(hw_queue_depth, uint);
347NULLB_DEVICE_ATTR(index, uint); 349NULLB_DEVICE_ATTR(index, uint);
348NULLB_DEVICE_ATTR(use_lightnvm, bool);
349NULLB_DEVICE_ATTR(blocking, bool); 350NULLB_DEVICE_ATTR(blocking, bool);
350NULLB_DEVICE_ATTR(use_per_node_hctx, bool); 351NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
351NULLB_DEVICE_ATTR(memory_backed, bool); 352NULLB_DEVICE_ATTR(memory_backed, bool);
@@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = {
455 &nullb_device_attr_irqmode, 456 &nullb_device_attr_irqmode,
456 &nullb_device_attr_hw_queue_depth, 457 &nullb_device_attr_hw_queue_depth,
457 &nullb_device_attr_index, 458 &nullb_device_attr_index,
458 &nullb_device_attr_use_lightnvm,
459 &nullb_device_attr_blocking, 459 &nullb_device_attr_blocking,
460 &nullb_device_attr_use_per_node_hctx, 460 &nullb_device_attr_use_per_node_hctx,
461 &nullb_device_attr_power, 461 &nullb_device_attr_power,
@@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void)
573 dev->blocksize = g_bs; 573 dev->blocksize = g_bs;
574 dev->irqmode = g_irqmode; 574 dev->irqmode = g_irqmode;
575 dev->hw_queue_depth = g_hw_queue_depth; 575 dev->hw_queue_depth = g_hw_queue_depth;
576 dev->use_lightnvm = g_use_lightnvm;
577 dev->blocking = g_blocking; 576 dev->blocking = g_blocking;
578 dev->use_per_node_hctx = g_use_per_node_hctx; 577 dev->use_per_node_hctx = g_use_per_node_hctx;
579 return dev; 578 return dev;
@@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
1352 return BLK_QC_T_NONE; 1351 return BLK_QC_T_NONE;
1353} 1352}
1354 1353
1354static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
1355{
1356 pr_info("null: rq %p timed out\n", rq);
1357 return BLK_EH_HANDLED;
1358}
1359
1355static int null_rq_prep_fn(struct request_queue *q, struct request *req) 1360static int null_rq_prep_fn(struct request_queue *q, struct request *req)
1356{ 1361{
1357 struct nullb *nullb = q->queuedata; 1362 struct nullb *nullb = q->queuedata;
@@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
1369 return BLKPREP_DEFER; 1374 return BLKPREP_DEFER;
1370} 1375}
1371 1376
1377static bool should_timeout_request(struct request *rq)
1378{
1379#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1380 if (g_timeout_str[0])
1381 return should_fail(&null_timeout_attr, 1);
1382#endif
1383
1384 return false;
1385}
1386
1372static void null_request_fn(struct request_queue *q) 1387static void null_request_fn(struct request_queue *q)
1373{ 1388{
1374 struct request *rq; 1389 struct request *rq;
@@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q)
1376 while ((rq = blk_fetch_request(q)) != NULL) { 1391 while ((rq = blk_fetch_request(q)) != NULL) {
1377 struct nullb_cmd *cmd = rq->special; 1392 struct nullb_cmd *cmd = rq->special;
1378 1393
1379 spin_unlock_irq(q->queue_lock); 1394 if (!should_timeout_request(rq)) {
1380 null_handle_cmd(cmd); 1395 spin_unlock_irq(q->queue_lock);
1381 spin_lock_irq(q->queue_lock); 1396 null_handle_cmd(cmd);
1397 spin_lock_irq(q->queue_lock);
1398 }
1382 } 1399 }
1383} 1400}
1384 1401
1402static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
1403{
1404 pr_info("null: rq %p timed out\n", rq);
1405 return BLK_EH_HANDLED;
1406}
1407
1385static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, 1408static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1386 const struct blk_mq_queue_data *bd) 1409 const struct blk_mq_queue_data *bd)
1387{ 1410{
@@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1399 1422
1400 blk_mq_start_request(bd->rq); 1423 blk_mq_start_request(bd->rq);
1401 1424
1402 return null_handle_cmd(cmd); 1425 if (!should_timeout_request(bd->rq))
1426 return null_handle_cmd(cmd);
1427
1428 return BLK_STS_OK;
1403} 1429}
1404 1430
1405static const struct blk_mq_ops null_mq_ops = { 1431static const struct blk_mq_ops null_mq_ops = {
1406 .queue_rq = null_queue_rq, 1432 .queue_rq = null_queue_rq,
1407 .complete = null_softirq_done_fn, 1433 .complete = null_softirq_done_fn,
1434 .timeout = null_timeout_rq,
1408}; 1435};
1409 1436
1410static void cleanup_queue(struct nullb_queue *nq) 1437static void cleanup_queue(struct nullb_queue *nq)
@@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb)
1423 kfree(nullb->queues); 1450 kfree(nullb->queues);
1424} 1451}
1425 1452
1426#ifdef CONFIG_NVM
1427
1428static void null_lnvm_end_io(struct request *rq, blk_status_t status)
1429{
1430 struct nvm_rq *rqd = rq->end_io_data;
1431
1432 /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
1433 rqd->error = status ? -EIO : 0;
1434 nvm_end_io(rqd);
1435
1436 blk_put_request(rq);
1437}
1438
1439static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
1440{
1441 struct request_queue *q = dev->q;
1442 struct request *rq;
1443 struct bio *bio = rqd->bio;
1444
1445 rq = blk_mq_alloc_request(q,
1446 op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
1447 if (IS_ERR(rq))
1448 return -ENOMEM;
1449
1450 blk_init_request_from_bio(rq, bio);
1451
1452 rq->end_io_data = rqd;
1453
1454 blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
1455
1456 return 0;
1457}
1458
1459static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
1460{
1461 struct nullb *nullb = dev->q->queuedata;
1462 sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
1463 sector_t blksize;
1464 struct nvm_id_group *grp;
1465
1466 id->ver_id = 0x1;
1467 id->vmnt = 0;
1468 id->cap = 0x2;
1469 id->dom = 0x1;
1470
1471 id->ppaf.blk_offset = 0;
1472 id->ppaf.blk_len = 16;
1473 id->ppaf.pg_offset = 16;
1474 id->ppaf.pg_len = 16;
1475 id->ppaf.sect_offset = 32;
1476 id->ppaf.sect_len = 8;
1477 id->ppaf.pln_offset = 40;
1478 id->ppaf.pln_len = 8;
1479 id->ppaf.lun_offset = 48;
1480 id->ppaf.lun_len = 8;
1481 id->ppaf.ch_offset = 56;
1482 id->ppaf.ch_len = 8;
1483
1484 sector_div(size, nullb->dev->blocksize); /* convert size to pages */
1485 size >>= 8; /* concert size to pgs pr blk */
1486 grp = &id->grp;
1487 grp->mtype = 0;
1488 grp->fmtype = 0;
1489 grp->num_ch = 1;
1490 grp->num_pg = 256;
1491 blksize = size;
1492 size >>= 16;
1493 grp->num_lun = size + 1;
1494 sector_div(blksize, grp->num_lun);
1495 grp->num_blk = blksize;
1496 grp->num_pln = 1;
1497
1498 grp->fpg_sz = nullb->dev->blocksize;
1499 grp->csecs = nullb->dev->blocksize;
1500 grp->trdt = 25000;
1501 grp->trdm = 25000;
1502 grp->tprt = 500000;
1503 grp->tprm = 500000;
1504 grp->tbet = 1500000;
1505 grp->tbem = 1500000;
1506 grp->mpos = 0x010101; /* single plane rwe */
1507 grp->cpar = nullb->dev->hw_queue_depth;
1508
1509 return 0;
1510}
1511
1512static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
1513{
1514 mempool_t *virtmem_pool;
1515
1516 virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
1517 if (!virtmem_pool) {
1518 pr_err("null_blk: Unable to create virtual memory pool\n");
1519 return NULL;
1520 }
1521
1522 return virtmem_pool;
1523}
1524
1525static void null_lnvm_destroy_dma_pool(void *pool)
1526{
1527 mempool_destroy(pool);
1528}
1529
1530static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
1531 gfp_t mem_flags, dma_addr_t *dma_handler)
1532{
1533 return mempool_alloc(pool, mem_flags);
1534}
1535
1536static void null_lnvm_dev_dma_free(void *pool, void *entry,
1537 dma_addr_t dma_handler)
1538{
1539 mempool_free(entry, pool);
1540}
1541
1542static struct nvm_dev_ops null_lnvm_dev_ops = {
1543 .identity = null_lnvm_id,
1544 .submit_io = null_lnvm_submit_io,
1545
1546 .create_dma_pool = null_lnvm_create_dma_pool,
1547 .destroy_dma_pool = null_lnvm_destroy_dma_pool,
1548 .dev_dma_alloc = null_lnvm_dev_dma_alloc,
1549 .dev_dma_free = null_lnvm_dev_dma_free,
1550
1551 /* Simulate nvme protocol restriction */
1552 .max_phys_sect = 64,
1553};
1554
1555static int null_nvm_register(struct nullb *nullb)
1556{
1557 struct nvm_dev *dev;
1558 int rv;
1559
1560 dev = nvm_alloc_dev(0);
1561 if (!dev)
1562 return -ENOMEM;
1563
1564 dev->q = nullb->q;
1565 memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
1566 dev->ops = &null_lnvm_dev_ops;
1567
1568 rv = nvm_register(dev);
1569 if (rv) {
1570 kfree(dev);
1571 return rv;
1572 }
1573 nullb->ndev = dev;
1574 return 0;
1575}
1576
1577static void null_nvm_unregister(struct nullb *nullb)
1578{
1579 nvm_unregister(nullb->ndev);
1580}
1581#else
1582static int null_nvm_register(struct nullb *nullb)
1583{
1584 pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
1585 return -EINVAL;
1586}
1587static void null_nvm_unregister(struct nullb *nullb) {}
1588#endif /* CONFIG_NVM */
1589
1590static void null_del_dev(struct nullb *nullb) 1453static void null_del_dev(struct nullb *nullb)
1591{ 1454{
1592 struct nullb_device *dev = nullb->dev; 1455 struct nullb_device *dev = nullb->dev;
@@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb)
1595 1458
1596 list_del_init(&nullb->list); 1459 list_del_init(&nullb->list);
1597 1460
1598 if (dev->use_lightnvm) 1461 del_gendisk(nullb->disk);
1599 null_nvm_unregister(nullb);
1600 else
1601 del_gendisk(nullb->disk);
1602 1462
1603 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1463 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
1604 hrtimer_cancel(&nullb->bw_timer); 1464 hrtimer_cancel(&nullb->bw_timer);
@@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb)
1610 if (dev->queue_mode == NULL_Q_MQ && 1470 if (dev->queue_mode == NULL_Q_MQ &&
1611 nullb->tag_set == &nullb->__tag_set) 1471 nullb->tag_set == &nullb->__tag_set)
1612 blk_mq_free_tag_set(nullb->tag_set); 1472 blk_mq_free_tag_set(nullb->tag_set);
1613 if (!dev->use_lightnvm) 1473 put_disk(nullb->disk);
1614 put_disk(nullb->disk);
1615 cleanup_queues(nullb); 1474 cleanup_queues(nullb);
1616 if (null_cache_active(nullb)) 1475 if (null_cache_active(nullb))
1617 null_free_device_storage(nullb->dev, true); 1476 null_free_device_storage(nullb->dev, true);
@@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev)
1775{ 1634{
1776 dev->blocksize = round_down(dev->blocksize, 512); 1635 dev->blocksize = round_down(dev->blocksize, 512);
1777 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1636 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
1778 if (dev->use_lightnvm && dev->blocksize != 4096)
1779 dev->blocksize = 4096;
1780
1781 if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
1782 dev->queue_mode = NULL_Q_MQ;
1783 1637
1784 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 1638 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
1785 if (dev->submit_queues != nr_online_nodes) 1639 if (dev->submit_queues != nr_online_nodes)
@@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev)
1805 dev->mbps = 0; 1659 dev->mbps = 0;
1806} 1660}
1807 1661
1662static bool null_setup_fault(void)
1663{
1664#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1665 if (!g_timeout_str[0])
1666 return true;
1667
1668 if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
1669 return false;
1670
1671 null_timeout_attr.verbose = 0;
1672#endif
1673 return true;
1674}
1675
1808static int null_add_dev(struct nullb_device *dev) 1676static int null_add_dev(struct nullb_device *dev)
1809{ 1677{
1810 struct nullb *nullb; 1678 struct nullb *nullb;
@@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev)
1838 if (rv) 1706 if (rv)
1839 goto out_cleanup_queues; 1707 goto out_cleanup_queues;
1840 1708
1709 if (!null_setup_fault())
1710 goto out_cleanup_queues;
1711
1712 nullb->tag_set->timeout = 5 * HZ;
1841 nullb->q = blk_mq_init_queue(nullb->tag_set); 1713 nullb->q = blk_mq_init_queue(nullb->tag_set);
1842 if (IS_ERR(nullb->q)) { 1714 if (IS_ERR(nullb->q)) {
1843 rv = -ENOMEM; 1715 rv = -ENOMEM;
@@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev)
1861 rv = -ENOMEM; 1733 rv = -ENOMEM;
1862 goto out_cleanup_queues; 1734 goto out_cleanup_queues;
1863 } 1735 }
1736
1737 if (!null_setup_fault())
1738 goto out_cleanup_blk_queue;
1739
1864 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 1740 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
1865 blk_queue_softirq_done(nullb->q, null_softirq_done_fn); 1741 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
1742 blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
1743 nullb->q->rq_timeout = 5 * HZ;
1866 rv = init_driver_queues(nullb); 1744 rv = init_driver_queues(nullb);
1867 if (rv) 1745 if (rv)
1868 goto out_cleanup_blk_queue; 1746 goto out_cleanup_blk_queue;
@@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev)
1895 1773
1896 sprintf(nullb->disk_name, "nullb%d", nullb->index); 1774 sprintf(nullb->disk_name, "nullb%d", nullb->index);
1897 1775
1898 if (dev->use_lightnvm) 1776 rv = null_gendisk_register(nullb);
1899 rv = null_nvm_register(nullb);
1900 else
1901 rv = null_gendisk_register(nullb);
1902
1903 if (rv) 1777 if (rv)
1904 goto out_cleanup_blk_queue; 1778 goto out_cleanup_blk_queue;
1905 1779
@@ -1938,18 +1812,6 @@ static int __init null_init(void)
1938 g_bs = PAGE_SIZE; 1812 g_bs = PAGE_SIZE;
1939 } 1813 }
1940 1814
1941 if (g_use_lightnvm && g_bs != 4096) {
1942 pr_warn("null_blk: LightNVM only supports 4k block size\n");
1943 pr_warn("null_blk: defaults block size to 4k\n");
1944 g_bs = 4096;
1945 }
1946
1947 if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
1948 pr_warn("null_blk: LightNVM only supported for blk-mq\n");
1949 pr_warn("null_blk: defaults queue mode to blk-mq\n");
1950 g_queue_mode = NULL_Q_MQ;
1951 }
1952
1953 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { 1815 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
1954 if (g_submit_queues != nr_online_nodes) { 1816 if (g_submit_queues != nr_online_nodes) {
1955 pr_warn("null_blk: submit_queues param is set to %u.\n", 1817 pr_warn("null_blk: submit_queues param is set to %u.\n",
@@ -1982,16 +1844,6 @@ static int __init null_init(void)
1982 goto err_conf; 1844 goto err_conf;
1983 } 1845 }
1984 1846
1985 if (g_use_lightnvm) {
1986 ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
1987 0, 0, NULL);
1988 if (!ppa_cache) {
1989 pr_err("null_blk: unable to create ppa cache\n");
1990 ret = -ENOMEM;
1991 goto err_ppa;
1992 }
1993 }
1994
1995 for (i = 0; i < nr_devices; i++) { 1847 for (i = 0; i < nr_devices; i++) {
1996 dev = null_alloc_dev(); 1848 dev = null_alloc_dev();
1997 if (!dev) { 1849 if (!dev) {
@@ -2015,8 +1867,6 @@ err_dev:
2015 null_del_dev(nullb); 1867 null_del_dev(nullb);
2016 null_free_dev(dev); 1868 null_free_dev(dev);
2017 } 1869 }
2018 kmem_cache_destroy(ppa_cache);
2019err_ppa:
2020 unregister_blkdev(null_major, "nullb"); 1870 unregister_blkdev(null_major, "nullb");
2021err_conf: 1871err_conf:
2022 configfs_unregister_subsystem(&nullb_subsys); 1872 configfs_unregister_subsystem(&nullb_subsys);
@@ -2047,8 +1897,6 @@ static void __exit null_exit(void)
2047 1897
2048 if (g_queue_mode == NULL_Q_MQ && shared_tags) 1898 if (g_queue_mode == NULL_Q_MQ && shared_tags)
2049 blk_mq_free_tag_set(&tag_set); 1899 blk_mq_free_tag_set(&tag_set);
2050
2051 kmem_cache_destroy(ppa_cache);
2052} 1900}
2053 1901
2054module_init(null_init); 1902module_init(null_init);
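
The null_blk changes above wire a fault-injection attribute into the request timeout path: null_setup_fault() parses the module parameter string into null_timeout_attr, and the queue gets a 5-second rq_timeout plus a timed-out handler. A minimal sketch of how those pieces fit together is below; null_timeout_attr, g_timeout_str and null_setup_fault() are taken from the hunks above, while should_timeout_request(), the buffer size and its use in the completion path are illustrative only, not a copy of the driver.

#include <linux/fault-inject.h>
#include <linux/blkdev.h>

static DECLARE_FAULT_ATTR(null_timeout_attr);
/* "<interval>,<probability>,<space>,<times>"; buffer size illustrative */
static char g_timeout_str[80];

static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (!g_timeout_str[0])
		return true;
	if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
		return false;
	null_timeout_attr.verbose = 0;
#endif
	return true;
}

/* Illustrative: decide whether to drop a request so that the block
 * layer's timeout handling (rq_timeout = 5 * HZ above) reclaims it. */
static bool should_timeout_request(struct request *rq)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (g_timeout_str[0])
		return should_fail(&null_timeout_attr, 1);
#endif
	return false;
}

A request the driver deliberately never completes is then finished by the null_rq_timed_out_fn() handler registered above, which lets the timeout path be exercised without real hardware misbehaving.
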
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 67974796c350..531a0915066b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2579 bdev = bdget(dev); 2579 bdev = bdget(dev);
2580 if (!bdev) 2580 if (!bdev)
2581 return -ENOMEM; 2581 return -ENOMEM;
2582 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2583 if (ret)
2584 return ret;
2582 if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { 2585 if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
2583 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); 2586 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
2584 bdput(bdev); 2587 blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
2585 return -EINVAL; 2588 return -EINVAL;
2586 } 2589 }
2587 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2588 if (ret)
2589 return ret;
2590 2590
2591 /* This is safe, since we have a reference from open(). */ 2591 /* This is safe, since we have a reference from open(). */
2592 __module_get(THIS_MODULE); 2592 __module_get(THIS_MODULE);
@@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2745 pd->pkt_dev = MKDEV(pktdev_major, idx); 2745 pd->pkt_dev = MKDEV(pktdev_major, idx);
2746 ret = pkt_new_dev(pd, dev); 2746 ret = pkt_new_dev(pd, dev);
2747 if (ret) 2747 if (ret)
2748 goto out_new_dev; 2748 goto out_mem2;
2749 2749
2750 /* inherit events of the host device */ 2750 /* inherit events of the host device */
2751 disk->events = pd->bdev->bd_disk->events; 2751 disk->events = pd->bdev->bd_disk->events;
@@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2763 mutex_unlock(&ctl_mutex); 2763 mutex_unlock(&ctl_mutex);
2764 return 0; 2764 return 0;
2765 2765
2766out_new_dev:
2767 blk_cleanup_queue(disk->queue);
2768out_mem2: 2766out_mem2:
2769 put_disk(disk); 2767 put_disk(disk);
2770out_mem: 2768out_mem:
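
The pkt_new_dev() hunk above reorders the open so the block device reference is taken before its queue is inspected, and the error path now drops that reference with the matching blkdev_put() instead of a bare bdput(). Reduced to the bare pattern it looks like the sketch below; the helper name is illustrative, the calls and mode flags are the ones used in the hunk.

#include <linux/fs.h>
#include <linux/blkdev.h>

static int pkt_claim_bdev(struct block_device *bdev)
{
	int ret;

	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
	if (ret)
		return ret;

	if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
		/* undo blkdev_get() with the same mode flags */
		blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
		return -EINVAL;
	}
	return 0;
}
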
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
deleted file mode 100644
index e5565fbaeb30..000000000000
--- a/drivers/block/smart1,2.h
+++ /dev/null
@@ -1,278 +0,0 @@
1/*
2 * Disk Array driver for Compaq SMART2 Controllers
3 * Copyright 1998 Compaq Computer Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
13 * NON INFRINGEMENT. See the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * Questions/Comments/Bugfixes to iss_storagedev@hp.com
20 *
21 * If you want to make changes, improve or add functionality to this
22 * driver, you'll probably need the Compaq Array Controller Interface
23 * Specificiation (Document number ECG086/1198)
24 */
25
26/*
27 * This file contains the controller communication implementation for
28 * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
29 * this should support:
30 *
31 * PCI:
32 * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
33 * Integerated SMART Array Controller, SMART-4200, SMART-4250ES
34 *
35 * EISA:
36 * SMART-2/E, SMART, IAES, IDA-2, IDA
37 */
38
39/*
40 * Memory mapped FIFO interface (SMART 42xx cards)
41 */
42static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
43{
44 writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
45}
46
47/*
48 * This card is the opposite of the other cards.
49 * 0 turns interrupts on...
50 * 0x08 turns them off...
51 */
52static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
53{
54 if (val)
55 { /* Turn interrupts on */
56 writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
57 } else /* Turn them off */
58 {
59 writel( S42XX_INTR_OFF,
60 h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
61 }
62}
63
64/*
65 * For older cards FIFO Full = 0.
66 * On this card 0 means there is room, anything else FIFO Full.
67 *
68 */
69static unsigned long smart4_fifo_full(ctlr_info_t *h)
70{
71
72 return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
73}
74
75/* This type of controller returns -1 if the fifo is empty,
76 * Not 0 like the others.
77 * And we need to let it know we read a value out
78 */
79static unsigned long smart4_completed(ctlr_info_t *h)
80{
81 long register_value
82 = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
83
84 /* Fifo is empty */
85 if( register_value == 0xffffffff)
86 return 0;
87
88 /* Need to let it know we got the reply */
89 /* We do this by writing a 0 to the port we just read from */
90 writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
91
92 return ((unsigned long) register_value);
93}
94
95 /*
96 * This hardware returns interrupt pending at a different place and
97 * it does not tell us if the fifo is empty, we will have check
98 * that by getting a 0 back from the command_completed call.
99 */
100static unsigned long smart4_intr_pending(ctlr_info_t *h)
101{
102 unsigned long register_value =
103 readl(h->vaddr + S42XX_INTR_STATUS);
104
105 if( register_value & S42XX_INTR_PENDING)
106 return FIFO_NOT_EMPTY;
107 return 0 ;
108}
109
110static struct access_method smart4_access = {
111 smart4_submit_command,
112 smart4_intr_mask,
113 smart4_fifo_full,
114 smart4_intr_pending,
115 smart4_completed,
116};
117
118/*
119 * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
120 */
121static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
122{
123 writel(c->busaddr, h->vaddr + COMMAND_FIFO);
124}
125
126static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
127{
128 writel(val, h->vaddr + INTR_MASK);
129}
130
131static unsigned long smart2_fifo_full(ctlr_info_t *h)
132{
133 return readl(h->vaddr + COMMAND_FIFO);
134}
135
136static unsigned long smart2_completed(ctlr_info_t *h)
137{
138 return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
139}
140
141static unsigned long smart2_intr_pending(ctlr_info_t *h)
142{
143 return readl(h->vaddr + INTR_PENDING);
144}
145
146static struct access_method smart2_access = {
147 smart2_submit_command,
148 smart2_intr_mask,
149 smart2_fifo_full,
150 smart2_intr_pending,
151 smart2_completed,
152};
153
154/*
155 * IO access for SMART-2/E cards
156 */
157static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
158{
159 outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
160}
161
162static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
163{
164 outl(val, h->io_mem_addr + INTR_MASK);
165}
166
167static unsigned long smart2e_fifo_full(ctlr_info_t *h)
168{
169 return inl(h->io_mem_addr + COMMAND_FIFO);
170}
171
172static unsigned long smart2e_completed(ctlr_info_t *h)
173{
174 return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
175}
176
177static unsigned long smart2e_intr_pending(ctlr_info_t *h)
178{
179 return inl(h->io_mem_addr + INTR_PENDING);
180}
181
182static struct access_method smart2e_access = {
183 smart2e_submit_command,
184 smart2e_intr_mask,
185 smart2e_fifo_full,
186 smart2e_intr_pending,
187 smart2e_completed,
188};
189
190/*
191 * IO access for older SMART-1 type cards
192 */
193#define SMART1_SYSTEM_MASK 0xC8E
194#define SMART1_SYSTEM_DOORBELL 0xC8F
195#define SMART1_LOCAL_MASK 0xC8C
196#define SMART1_LOCAL_DOORBELL 0xC8D
197#define SMART1_INTR_MASK 0xC89
198#define SMART1_LISTADDR 0xC90
199#define SMART1_LISTLEN 0xC94
200#define SMART1_TAG 0xC97
201#define SMART1_COMPLETE_ADDR 0xC98
202#define SMART1_LISTSTATUS 0xC9E
203
204#define CHANNEL_BUSY 0x01
205#define CHANNEL_CLEAR 0x02
206
207static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
208{
209 /*
210 * This __u16 is actually a bunch of control flags on SMART
211 * and below. We want them all to be zero.
212 */
213 c->hdr.size = 0;
214
215 outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
216
217 outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
218 outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
219
220 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
221}
222
223static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
224{
225 if (val == 1) {
226 outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
227 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
228 outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
229 outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
230 } else {
231 outb(0, h->io_mem_addr + 0xC8E);
232 }
233}
234
235static unsigned long smart1_fifo_full(ctlr_info_t *h)
236{
237 unsigned char chan;
238 chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
239 return chan;
240}
241
242static unsigned long smart1_completed(ctlr_info_t *h)
243{
244 unsigned char status;
245 unsigned long cmd;
246
247 if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
248 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
249
250 cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
251 status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
252
253 outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
254
255 /*
256 * this is x86 (actually compaq x86) only, so it's ok
257 */
258 if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
259 } else {
260 cmd = 0;
261 }
262 return cmd;
263}
264
265static unsigned long smart1_intr_pending(ctlr_info_t *h)
266{
267 unsigned char chan;
268 chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
269 return chan;
270}
271
272static struct access_method smart1_access = {
273 smart1_submit_command,
274 smart1_intr_mask,
275 smart1_fifo_full,
276 smart1_intr_pending,
277 smart1_completed,
278};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d70eba30003a..0afa6c8c3857 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
430 430
431static void zram_page_end_io(struct bio *bio) 431static void zram_page_end_io(struct bio *bio)
432{ 432{
433 struct page *page = bio->bi_io_vec[0].bv_page; 433 struct page *page = bio_first_page_all(bio);
434 434
435 page_endio(page, op_is_write(bio_op(bio)), 435 page_endio(page, op_is_write(bio_op(bio)),
436 blk_status_to_errno(bio->bi_status)); 436 blk_status_to_errno(bio->bi_status));
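
The zram hunk replaces direct indexing of bi_io_vec with the bio_first_page_all() helper, so the end-io path no longer assumes anything about how the bio's pages are laid out internally. A minimal end-io callback using the same helpers could look like the sketch below; the function name is made up, the helper calls mirror the hunk.

#include <linux/bio.h>
#include <linux/pagemap.h>

static void my_page_end_io(struct bio *bio)
{
	/* first page backing the bio, independent of the bvec layout */
	struct page *page = bio_first_page_all(bio);

	page_endio(page, op_is_write(bio_op(bio)),
		   blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}
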
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 2a953efec4e1..10c08982185a 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -27,13 +27,6 @@ config NVM_DEBUG
27 27
28 It is required to create/remove targets without IOCTLs. 28 It is required to create/remove targets without IOCTLs.
29 29
30config NVM_RRPC
31 tristate "Round-robin Hybrid Open-Channel SSD target"
32 ---help---
33 Allows an open-channel SSD to be exposed as a block device to the
34 host. The target is implemented using a linear mapping table and
35 cost-based garbage collection. It is optimized for 4K IO sizes.
36
37config NVM_PBLK 30config NVM_PBLK
38 tristate "Physical Block Device Open-Channel SSD target" 31 tristate "Physical Block Device Open-Channel SSD target"
39 ---help--- 32 ---help---
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 2c3fd9d2c08c..97d9d7c71550 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,7 +4,6 @@
4# 4#
5 5
6obj-$(CONFIG_NVM) := core.o 6obj-$(CONFIG_NVM) := core.o
7obj-$(CONFIG_NVM_RRPC) += rrpc.o
8obj-$(CONFIG_NVM_PBLK) += pblk.o 7obj-$(CONFIG_NVM_PBLK) += pblk.o
9pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ 8pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
10 pblk-write.o pblk-cache.o pblk-read.o \ 9 pblk-write.o pblk-cache.o pblk-read.o \
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 83249b43dd06..dcc9e621e651 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -45,12 +45,6 @@ struct nvm_dev_map {
45 int nr_chnls; 45 int nr_chnls;
46}; 46};
47 47
48struct nvm_area {
49 struct list_head list;
50 sector_t begin;
51 sector_t end; /* end is excluded */
52};
53
54static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) 48static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
55{ 49{
56 struct nvm_target *tgt; 50 struct nvm_target *tgt;
@@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
62 return NULL; 56 return NULL;
63} 57}
64 58
59static bool nvm_target_exists(const char *name)
60{
61 struct nvm_dev *dev;
62 struct nvm_target *tgt;
63 bool ret = false;
64
65 down_write(&nvm_lock);
66 list_for_each_entry(dev, &nvm_devices, devices) {
67 mutex_lock(&dev->mlock);
68 list_for_each_entry(tgt, &dev->targets, list) {
69 if (!strcmp(name, tgt->disk->disk_name)) {
70 ret = true;
71 mutex_unlock(&dev->mlock);
72 goto out;
73 }
74 }
75 mutex_unlock(&dev->mlock);
76 }
77
78out:
79 up_write(&nvm_lock);
80 return ret;
81}
82
65static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) 83static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
66{ 84{
67 int i; 85 int i;
@@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
104 if (clear) { 122 if (clear) {
105 for (j = 0; j < ch_map->nr_luns; j++) { 123 for (j = 0; j < ch_map->nr_luns; j++) {
106 int lun = j + lun_offs[j]; 124 int lun = j + lun_offs[j];
107 int lunid = (ch * dev->geo.luns_per_chnl) + lun; 125 int lunid = (ch * dev->geo.nr_luns) + lun;
108 126
109 WARN_ON(!test_and_clear_bit(lunid, 127 WARN_ON(!test_and_clear_bit(lunid,
110 dev->lun_map)); 128 dev->lun_map));
@@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
122} 140}
123 141
124static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, 142static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
125 int lun_begin, int lun_end) 143 u16 lun_begin, u16 lun_end,
144 u16 op)
126{ 145{
127 struct nvm_tgt_dev *tgt_dev = NULL; 146 struct nvm_tgt_dev *tgt_dev = NULL;
128 struct nvm_dev_map *dev_rmap = dev->rmap; 147 struct nvm_dev_map *dev_rmap = dev->rmap;
@@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
130 struct ppa_addr *luns; 149 struct ppa_addr *luns;
131 int nr_luns = lun_end - lun_begin + 1; 150 int nr_luns = lun_end - lun_begin + 1;
132 int luns_left = nr_luns; 151 int luns_left = nr_luns;
133 int nr_chnls = nr_luns / dev->geo.luns_per_chnl; 152 int nr_chnls = nr_luns / dev->geo.nr_luns;
134 int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; 153 int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
135 int bch = lun_begin / dev->geo.luns_per_chnl; 154 int bch = lun_begin / dev->geo.nr_luns;
136 int blun = lun_begin % dev->geo.luns_per_chnl; 155 int blun = lun_begin % dev->geo.nr_luns;
137 int lunid = 0; 156 int lunid = 0;
138 int lun_balanced = 1; 157 int lun_balanced = 1;
139 int prev_nr_luns; 158 int prev_nr_luns;
@@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
154 if (!luns) 173 if (!luns)
155 goto err_luns; 174 goto err_luns;
156 175
157 prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? 176 prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
158 dev->geo.luns_per_chnl : luns_left; 177 dev->geo.nr_luns : luns_left;
159 for (i = 0; i < nr_chnls; i++) { 178 for (i = 0; i < nr_chnls; i++) {
160 struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; 179 struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
161 int *lun_roffs = ch_rmap->lun_offs; 180 int *lun_roffs = ch_rmap->lun_offs;
162 struct nvm_ch_map *ch_map = &dev_map->chnls[i]; 181 struct nvm_ch_map *ch_map = &dev_map->chnls[i];
163 int *lun_offs; 182 int *lun_offs;
164 int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? 183 int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
165 dev->geo.luns_per_chnl : luns_left; 184 dev->geo.nr_luns : luns_left;
166 185
167 if (lun_balanced && prev_nr_luns != luns_in_chnl) 186 if (lun_balanced && prev_nr_luns != luns_in_chnl)
168 lun_balanced = 0; 187 lun_balanced = 0;
@@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
199 memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); 218 memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
200 /* Target device only owns a portion of the physical device */ 219 /* Target device only owns a portion of the physical device */
201 tgt_dev->geo.nr_chnls = nr_chnls; 220 tgt_dev->geo.nr_chnls = nr_chnls;
202 tgt_dev->geo.nr_luns = nr_luns; 221 tgt_dev->geo.all_luns = nr_luns;
203 tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; 222 tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
223 tgt_dev->geo.op = op;
204 tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; 224 tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
205 tgt_dev->q = dev->q; 225 tgt_dev->q = dev->q;
206 tgt_dev->map = dev_map; 226 tgt_dev->map = dev_map;
@@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = {
226 .owner = THIS_MODULE, 246 .owner = THIS_MODULE,
227}; 247};
228 248
229static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) 249static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
230{ 250{
231 struct nvm_tgt_type *tmp, *tt = NULL; 251 struct nvm_tgt_type *tt;
232 252
233 if (lock) 253 list_for_each_entry(tt, &nvm_tgt_types, list)
234 down_write(&nvm_tgtt_lock); 254 if (!strcmp(name, tt->name))
255 return tt;
235 256
236 list_for_each_entry(tmp, &nvm_tgt_types, list) 257 return NULL;
237 if (!strcmp(name, tmp->name)) { 258}
238 tt = tmp; 259
239 break; 260static struct nvm_tgt_type *nvm_find_target_type(const char *name)
240 } 261{
262 struct nvm_tgt_type *tt;
263
264 down_write(&nvm_tgtt_lock);
265 tt = __nvm_find_target_type(name);
266 up_write(&nvm_tgtt_lock);
241 267
242 if (lock)
243 up_write(&nvm_tgtt_lock);
244 return tt; 268 return tt;
245} 269}
246 270
271static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
272 int lun_end)
273{
274 if (lun_begin > lun_end || lun_end >= geo->all_luns) {
275 pr_err("nvm: lun out of bound (%u:%u > %u)\n",
276 lun_begin, lun_end, geo->all_luns - 1);
277 return -EINVAL;
278 }
279
280 return 0;
281}
282
283static int __nvm_config_simple(struct nvm_dev *dev,
284 struct nvm_ioctl_create_simple *s)
285{
286 struct nvm_geo *geo = &dev->geo;
287
288 if (s->lun_begin == -1 && s->lun_end == -1) {
289 s->lun_begin = 0;
290 s->lun_end = geo->all_luns - 1;
291 }
292
293 return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
294}
295
296static int __nvm_config_extended(struct nvm_dev *dev,
297 struct nvm_ioctl_create_extended *e)
298{
299 struct nvm_geo *geo = &dev->geo;
300
301 if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
302 e->lun_begin = 0;
303 e->lun_end = dev->geo.all_luns - 1;
304 }
305
306 /* op not set falls into target's default */
307 if (e->op == 0xFFFF)
308 e->op = NVM_TARGET_DEFAULT_OP;
309
310 if (e->op < NVM_TARGET_MIN_OP ||
311 e->op > NVM_TARGET_MAX_OP) {
312 pr_err("nvm: invalid over provisioning value\n");
313 return -EINVAL;
314 }
315
316 return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
317}
318
247static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) 319static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
248{ 320{
249 struct nvm_ioctl_create_simple *s = &create->conf.s; 321 struct nvm_ioctl_create_extended e;
250 struct request_queue *tqueue; 322 struct request_queue *tqueue;
251 struct gendisk *tdisk; 323 struct gendisk *tdisk;
252 struct nvm_tgt_type *tt; 324 struct nvm_tgt_type *tt;
@@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
255 void *targetdata; 327 void *targetdata;
256 int ret; 328 int ret;
257 329
258 tt = nvm_find_target_type(create->tgttype, 1); 330 switch (create->conf.type) {
331 case NVM_CONFIG_TYPE_SIMPLE:
332 ret = __nvm_config_simple(dev, &create->conf.s);
333 if (ret)
334 return ret;
335
336 e.lun_begin = create->conf.s.lun_begin;
337 e.lun_end = create->conf.s.lun_end;
338 e.op = NVM_TARGET_DEFAULT_OP;
339 break;
340 case NVM_CONFIG_TYPE_EXTENDED:
341 ret = __nvm_config_extended(dev, &create->conf.e);
342 if (ret)
343 return ret;
344
345 e = create->conf.e;
346 break;
347 default:
348 pr_err("nvm: config type not valid\n");
349 return -EINVAL;
350 }
351
352 tt = nvm_find_target_type(create->tgttype);
259 if (!tt) { 353 if (!tt) {
260 pr_err("nvm: target type %s not found\n", create->tgttype); 354 pr_err("nvm: target type %s not found\n", create->tgttype);
261 return -EINVAL; 355 return -EINVAL;
262 } 356 }
263 357
264 mutex_lock(&dev->mlock); 358 if (nvm_target_exists(create->tgtname)) {
265 t = nvm_find_target(dev, create->tgtname); 359 pr_err("nvm: target name already exists (%s)\n",
266 if (t) { 360 create->tgtname);
267 pr_err("nvm: target name already exists.\n");
268 mutex_unlock(&dev->mlock);
269 return -EINVAL; 361 return -EINVAL;
270 } 362 }
271 mutex_unlock(&dev->mlock);
272 363
273 ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); 364 ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
274 if (ret) 365 if (ret)
275 return ret; 366 return ret;
276 367
@@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
280 goto err_reserve; 371 goto err_reserve;
281 } 372 }
282 373
283 tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); 374 tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
284 if (!tgt_dev) { 375 if (!tgt_dev) {
285 pr_err("nvm: could not create target device\n"); 376 pr_err("nvm: could not create target device\n");
286 ret = -ENOMEM; 377 ret = -ENOMEM;
@@ -350,7 +441,7 @@ err_dev:
350err_t: 441err_t:
351 kfree(t); 442 kfree(t);
352err_reserve: 443err_reserve:
353 nvm_release_luns_err(dev, s->lun_begin, s->lun_end); 444 nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
354 return ret; 445 return ret;
355} 446}
356 447
@@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev)
420 for (i = 0; i < dev->geo.nr_chnls; i++) { 511 for (i = 0; i < dev->geo.nr_chnls; i++) {
421 struct nvm_ch_map *ch_rmap; 512 struct nvm_ch_map *ch_rmap;
422 int *lun_roffs; 513 int *lun_roffs;
423 int luns_in_chnl = dev->geo.luns_per_chnl; 514 int luns_in_chnl = dev->geo.nr_luns;
424 515
425 ch_rmap = &rmap->chnls[i]; 516 ch_rmap = &rmap->chnls[i];
426 517
@@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
524 nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); 615 nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
525} 616}
526 617
527void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
528 int len)
529{
530 struct nvm_geo *geo = &dev->geo;
531 struct nvm_dev_map *dev_rmap = dev->rmap;
532 u64 i;
533
534 for (i = 0; i < len; i++) {
535 struct nvm_ch_map *ch_rmap;
536 int *lun_roffs;
537 struct ppa_addr gaddr;
538 u64 pba = le64_to_cpu(entries[i]);
539 u64 diff;
540
541 if (!pba)
542 continue;
543
544 gaddr = linear_to_generic_addr(geo, pba);
545 ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
546 lun_roffs = ch_rmap->lun_offs;
547
548 diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
549 (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
550
551 entries[i] -= cpu_to_le64(diff);
552 }
553}
554EXPORT_SYMBOL(nvm_part_to_tgt);
555
556int nvm_register_tgt_type(struct nvm_tgt_type *tt) 618int nvm_register_tgt_type(struct nvm_tgt_type *tt)
557{ 619{
558 int ret = 0; 620 int ret = 0;
559 621
560 down_write(&nvm_tgtt_lock); 622 down_write(&nvm_tgtt_lock);
561 if (nvm_find_target_type(tt->name, 0)) 623 if (__nvm_find_target_type(tt->name))
562 ret = -EEXIST; 624 ret = -EEXIST;
563 else 625 else
564 list_add(&tt->list, &nvm_tgt_types); 626 list_add(&tt->list, &nvm_tgt_types);
@@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
726} 788}
727EXPORT_SYMBOL(nvm_submit_io_sync); 789EXPORT_SYMBOL(nvm_submit_io_sync);
728 790
729int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
730 int nr_ppas)
731{
732 struct nvm_geo *geo = &tgt_dev->geo;
733 struct nvm_rq rqd;
734 int ret;
735
736 memset(&rqd, 0, sizeof(struct nvm_rq));
737
738 rqd.opcode = NVM_OP_ERASE;
739 rqd.flags = geo->plane_mode >> 1;
740
741 ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
742 if (ret)
743 return ret;
744
745 ret = nvm_submit_io_sync(tgt_dev, &rqd);
746 if (ret) {
747 pr_err("rrpr: erase I/O submission failed: %d\n", ret);
748 goto free_ppa_list;
749 }
750
751free_ppa_list:
752 nvm_free_rqd_ppalist(tgt_dev, &rqd);
753
754 return ret;
755}
756EXPORT_SYMBOL(nvm_erase_sync);
757
758int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
759 nvm_l2p_update_fn *update_l2p, void *priv)
760{
761 struct nvm_dev *dev = tgt_dev->parent;
762
763 if (!dev->ops->get_l2p_tbl)
764 return 0;
765
766 return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
767}
768EXPORT_SYMBOL(nvm_get_l2p_tbl);
769
770int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
771{
772 struct nvm_dev *dev = tgt_dev->parent;
773 struct nvm_geo *geo = &dev->geo;
774 struct nvm_area *area, *prev, *next;
775 sector_t begin = 0;
776 sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
777
778 if (len > max_sectors)
779 return -EINVAL;
780
781 area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
782 if (!area)
783 return -ENOMEM;
784
785 prev = NULL;
786
787 spin_lock(&dev->lock);
788 list_for_each_entry(next, &dev->area_list, list) {
789 if (begin + len > next->begin) {
790 begin = next->end;
791 prev = next;
792 continue;
793 }
794 break;
795 }
796
797 if ((begin + len) > max_sectors) {
798 spin_unlock(&dev->lock);
799 kfree(area);
800 return -EINVAL;
801 }
802
803 area->begin = *lba = begin;
804 area->end = begin + len;
805
806 if (prev) /* insert into sorted order */
807 list_add(&area->list, &prev->list);
808 else
809 list_add(&area->list, &dev->area_list);
810 spin_unlock(&dev->lock);
811
812 return 0;
813}
814EXPORT_SYMBOL(nvm_get_area);
815
816void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
817{
818 struct nvm_dev *dev = tgt_dev->parent;
819 struct nvm_area *area;
820
821 spin_lock(&dev->lock);
822 list_for_each_entry(area, &dev->area_list, list) {
823 if (area->begin != begin)
824 continue;
825
826 list_del(&area->list);
827 spin_unlock(&dev->lock);
828 kfree(area);
829 return;
830 }
831 spin_unlock(&dev->lock);
832}
833EXPORT_SYMBOL(nvm_put_area);
834
835void nvm_end_io(struct nvm_rq *rqd) 791void nvm_end_io(struct nvm_rq *rqd)
836{ 792{
837 struct nvm_tgt_dev *tgt_dev = rqd->dev; 793 struct nvm_tgt_dev *tgt_dev = rqd->dev;
@@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
858 struct nvm_geo *geo = &dev->geo; 814 struct nvm_geo *geo = &dev->geo;
859 int blk, offset, pl, blktype; 815 int blk, offset, pl, blktype;
860 816
861 if (nr_blks != geo->blks_per_lun * geo->plane_mode) 817 if (nr_blks != geo->nr_chks * geo->plane_mode)
862 return -EINVAL; 818 return -EINVAL;
863 819
864 for (blk = 0; blk < geo->blks_per_lun; blk++) { 820 for (blk = 0; blk < geo->nr_chks; blk++) {
865 offset = blk * geo->plane_mode; 821 offset = blk * geo->plane_mode;
866 blktype = blks[offset]; 822 blktype = blks[offset];
867 823
@@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
877 blks[blk] = blktype; 833 blks[blk] = blktype;
878 } 834 }
879 835
880 return geo->blks_per_lun; 836 return geo->nr_chks;
881} 837}
882EXPORT_SYMBOL(nvm_bb_tbl_fold); 838EXPORT_SYMBOL(nvm_bb_tbl_fold);
883 839
@@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
892} 848}
893EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); 849EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
894 850
895static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
896{
897 struct nvm_geo *geo = &dev->geo;
898 int i;
899
900 dev->lps_per_blk = geo->pgs_per_blk;
901 dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
902 if (!dev->lptbl)
903 return -ENOMEM;
904
905 /* Just a linear array */
906 for (i = 0; i < dev->lps_per_blk; i++)
907 dev->lptbl[i] = i;
908
909 return 0;
910}
911
912static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
913{
914 int i, p;
915 struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
916
917 if (!mlc->num_pairs)
918 return 0;
919
920 dev->lps_per_blk = mlc->num_pairs;
921 dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
922 if (!dev->lptbl)
923 return -ENOMEM;
924
925 /* The lower page table encoding consists of a list of bytes, where each
926 * has a lower and an upper half. The first half byte maintains the
927 * increment value and every value after is an offset added to the
928 * previous incrementation value
929 */
930 dev->lptbl[0] = mlc->pairs[0] & 0xF;
931 for (i = 1; i < dev->lps_per_blk; i++) {
932 p = mlc->pairs[i >> 1];
933 if (i & 0x1) /* upper */
934 dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
935 else /* lower */
936 dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
937 }
938
939 return 0;
940}
941
942static int nvm_core_init(struct nvm_dev *dev) 851static int nvm_core_init(struct nvm_dev *dev)
943{ 852{
944 struct nvm_id *id = &dev->identity; 853 struct nvm_id *id = &dev->identity;
@@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev)
946 struct nvm_geo *geo = &dev->geo; 855 struct nvm_geo *geo = &dev->geo;
947 int ret; 856 int ret;
948 857
858 memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
859
860 if (grp->mtype != 0) {
861 pr_err("nvm: memory type not supported\n");
862 return -EINVAL;
863 }
864
949 /* Whole device values */ 865 /* Whole device values */
950 geo->nr_chnls = grp->num_ch; 866 geo->nr_chnls = grp->num_ch;
951 geo->luns_per_chnl = grp->num_lun; 867 geo->nr_luns = grp->num_lun;
952 868
953 /* Generic device values */ 869 /* Generic device geometry values */
954 geo->pgs_per_blk = grp->num_pg; 870 geo->ws_min = grp->ws_min;
955 geo->blks_per_lun = grp->num_blk; 871 geo->ws_opt = grp->ws_opt;
956 geo->nr_planes = grp->num_pln; 872 geo->ws_seq = grp->ws_seq;
957 geo->fpg_size = grp->fpg_sz; 873 geo->ws_per_chk = grp->ws_per_chk;
958 geo->pfpg_size = grp->fpg_sz * grp->num_pln; 874 geo->nr_chks = grp->num_chk;
959 geo->sec_size = grp->csecs; 875 geo->sec_size = grp->csecs;
960 geo->oob_size = grp->sos; 876 geo->oob_size = grp->sos;
961 geo->sec_per_pg = grp->fpg_sz / grp->csecs;
962 geo->mccap = grp->mccap; 877 geo->mccap = grp->mccap;
963 memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
964
965 geo->plane_mode = NVM_PLANE_SINGLE;
966 geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; 878 geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
967 879
968 if (grp->mpos & 0x020202) 880 geo->sec_per_chk = grp->clba;
969 geo->plane_mode = NVM_PLANE_DOUBLE; 881 geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
970 if (grp->mpos & 0x040404) 882 geo->all_luns = geo->nr_luns * geo->nr_chnls;
971 geo->plane_mode = NVM_PLANE_QUAD;
972 883
973 if (grp->mtype != 0) { 884 /* 1.2 spec device geometry values */
974 pr_err("nvm: memory type not supported\n"); 885 geo->plane_mode = 1 << geo->ws_seq;
975 return -EINVAL; 886 geo->nr_planes = geo->ws_opt / geo->ws_min;
976 } 887 geo->sec_per_pg = geo->ws_min;
977
978 /* calculated values */
979 geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; 888 geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
980 geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk;
981 geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
982 geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
983 889
984 dev->total_secs = geo->nr_luns * geo->sec_per_lun; 890 dev->total_secs = geo->all_luns * geo->sec_per_lun;
985 dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), 891 dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
986 sizeof(unsigned long), GFP_KERNEL); 892 sizeof(unsigned long), GFP_KERNEL);
987 if (!dev->lun_map) 893 if (!dev->lun_map)
988 return -ENOMEM; 894 return -ENOMEM;
989 895
990 switch (grp->fmtype) {
991 case NVM_ID_FMTYPE_SLC:
992 if (nvm_init_slc_tbl(dev, grp)) {
993 ret = -ENOMEM;
994 goto err_fmtype;
995 }
996 break;
997 case NVM_ID_FMTYPE_MLC:
998 if (nvm_init_mlc_tbl(dev, grp)) {
999 ret = -ENOMEM;
1000 goto err_fmtype;
1001 }
1002 break;
1003 default:
1004 pr_err("nvm: flash type not supported\n");
1005 ret = -EINVAL;
1006 goto err_fmtype;
1007 }
1008
1009 INIT_LIST_HEAD(&dev->area_list); 896 INIT_LIST_HEAD(&dev->area_list);
1010 INIT_LIST_HEAD(&dev->targets); 897 INIT_LIST_HEAD(&dev->targets);
1011 mutex_init(&dev->mlock); 898 mutex_init(&dev->mlock);
@@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev)
1031 dev->ops->destroy_dma_pool(dev->dma_pool); 918 dev->ops->destroy_dma_pool(dev->dma_pool);
1032 919
1033 nvm_unregister_map(dev); 920 nvm_unregister_map(dev);
1034 kfree(dev->lptbl);
1035 kfree(dev->lun_map); 921 kfree(dev->lun_map);
1036 kfree(dev); 922 kfree(dev);
1037} 923}
@@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev)
1062 948
1063 pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", 949 pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
1064 dev->name, geo->sec_per_pg, geo->nr_planes, 950 dev->name, geo->sec_per_pg, geo->nr_planes,
1065 geo->pgs_per_blk, geo->blks_per_lun, 951 geo->ws_per_chk, geo->nr_chks,
1066 geo->nr_luns, geo->nr_chnls); 952 geo->all_luns, geo->nr_chnls);
1067 return 0; 953 return 0;
1068err: 954err:
1069 pr_err("nvm: failed to initialize nvm\n"); 955 pr_err("nvm: failed to initialize nvm\n");
@@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
1135static int __nvm_configure_create(struct nvm_ioctl_create *create) 1021static int __nvm_configure_create(struct nvm_ioctl_create *create)
1136{ 1022{
1137 struct nvm_dev *dev; 1023 struct nvm_dev *dev;
1138 struct nvm_ioctl_create_simple *s;
1139 1024
1140 down_write(&nvm_lock); 1025 down_write(&nvm_lock);
1141 dev = nvm_find_nvm_dev(create->dev); 1026 dev = nvm_find_nvm_dev(create->dev);
@@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
1146 return -EINVAL; 1031 return -EINVAL;
1147 } 1032 }
1148 1033
1149 if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
1150 pr_err("nvm: config type not valid\n");
1151 return -EINVAL;
1152 }
1153 s = &create->conf.s;
1154
1155 if (s->lun_begin == -1 && s->lun_end == -1) {
1156 s->lun_begin = 0;
1157 s->lun_end = dev->geo.nr_luns - 1;
1158 }
1159
1160 if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
1161 pr_err("nvm: lun out of bound (%u:%u > %u)\n",
1162 s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
1163 return -EINVAL;
1164 }
1165
1166 return nvm_create_tgt(dev, create); 1034 return nvm_create_tgt(dev, create);
1167} 1035}
1168 1036
@@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
1262 if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) 1130 if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
1263 return -EFAULT; 1131 return -EFAULT;
1264 1132
1133 if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
1134 create.conf.e.rsv != 0) {
1135 pr_err("nvm: reserved config field in use\n");
1136 return -EINVAL;
1137 }
1138
1265 create.dev[DISK_NAME_LEN - 1] = '\0'; 1139 create.dev[DISK_NAME_LEN - 1] = '\0';
1266 create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; 1140 create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
1267 create.tgtname[DISK_NAME_LEN - 1] = '\0'; 1141 create.tgtname[DISK_NAME_LEN - 1] = '\0';
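
The core.c changes above introduce an extended create configuration that carries an over-provisioning ratio next to the LUN range: 0xFFFF in lun_begin/lun_end/op means "use the default" (with op falling back to NVM_TARGET_DEFAULT_OP), the value is range-checked against NVM_TARGET_MIN_OP/NVM_TARGET_MAX_OP, and the reserved field must be zero. A hedged sketch of filling in such a request, with the field and constant names taken from the hunks and the wrapper itself purely illustrative:

#include <linux/lightnvm.h>
#include <linux/string.h>

static void fill_extended_create(struct nvm_ioctl_create *create,
				 const char *dev, const char *tgt)
{
	memset(create, 0, sizeof(*create));

	strscpy(create->dev, dev, DISK_NAME_LEN);
	strscpy(create->tgtname, tgt, DISK_NAME_LEN);
	strscpy(create->tgttype, "pblk", NVM_TTYPE_NAME_MAX);

	create->conf.type = NVM_CONFIG_TYPE_EXTENDED;
	create->conf.e.lun_begin = 0xFFFF;	/* default: first LUN */
	create->conf.e.lun_end = 0xFFFF;	/* default: last LUN */
	create->conf.e.op = 0xFFFF;		/* default over-provisioning */
	create->conf.e.rsv = 0;			/* must stay zero */
}
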
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 0d227ef7d1b9..000fcad38136 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -19,12 +19,16 @@
19 19
20int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) 20int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
21{ 21{
22 struct request_queue *q = pblk->dev->q;
22 struct pblk_w_ctx w_ctx; 23 struct pblk_w_ctx w_ctx;
23 sector_t lba = pblk_get_lba(bio); 24 sector_t lba = pblk_get_lba(bio);
25 unsigned long start_time = jiffies;
24 unsigned int bpos, pos; 26 unsigned int bpos, pos;
25 int nr_entries = pblk_get_secs(bio); 27 int nr_entries = pblk_get_secs(bio);
26 int i, ret; 28 int i, ret;
27 29
30 generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
31
28 /* Update the write buffer head (mem) with the entries that we can 32 /* Update the write buffer head (mem) with the entries that we can
29 * write. The write in itself cannot fail, so there is no need to 33 * write. The write in itself cannot fail, so there is no need to
30 * rollback from here on. 34 * rollback from here on.
@@ -67,6 +71,7 @@ retry:
67 pblk_rl_inserted(&pblk->rl, nr_entries); 71 pblk_rl_inserted(&pblk->rl, nr_entries);
68 72
69out: 73out:
74 generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
70 pblk_write_should_kick(pblk); 75 pblk_write_should_kick(pblk);
71 return ret; 76 return ret;
72} 77}
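
The pblk-cache.c hunk adds per-target I/O accounting around the buffered write path, so writes absorbed by the write buffer (and completed before reaching the device) still show up in the gendisk statistics. Stripped down to the accounting pattern itself it looks like the fragment below, written as if inside the driver's own translation unit; the surrounding function is illustrative, the two helper calls and their arguments match the hunk.

static int my_buffered_write(struct pblk *pblk, struct bio *bio)
{
	struct request_queue *q = pblk->dev->q;
	unsigned long start_time = jiffies;

	generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);

	/* ... copy the bio payload into the write buffer here ... */

	generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
	return 0;
}
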
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 76516ee84e9a..0487b9340c1d 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
32 struct pblk_line *line; 32 struct pblk_line *line;
33 int pos; 33 int pos;
34 34
35 line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; 35 line = &pblk->lines[pblk_ppa_to_line(*ppa)];
36 pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); 36 pos = pblk_ppa_to_pos(&dev->geo, *ppa);
37 37
38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", 38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
39 line->id, pos); 39 line->id, pos);
@@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
48{ 48{
49 struct nvm_tgt_dev *dev = pblk->dev; 49 struct nvm_tgt_dev *dev = pblk->dev;
50 struct nvm_geo *geo = &dev->geo; 50 struct nvm_geo *geo = &dev->geo;
51 int pos = pblk_dev_ppa_to_pos(geo, *ppa); 51 int pos = pblk_ppa_to_pos(geo, *ppa);
52 52
53 pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); 53 pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
54 atomic_long_inc(&pblk->erase_failed); 54 atomic_long_inc(&pblk->erase_failed);
@@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
66{ 66{
67 struct pblk_line *line; 67 struct pblk_line *line;
68 68
69 line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; 69 line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
70 atomic_dec(&line->left_seblks); 70 atomic_dec(&line->left_seblks);
71 71
72 if (rqd->error) { 72 if (rqd->error) {
@@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
144 BUG_ON(pblk_ppa_empty(ppa)); 144 BUG_ON(pblk_ppa_empty(ppa));
145#endif 145#endif
146 146
147 line_id = pblk_tgt_ppa_to_line(ppa); 147 line_id = pblk_ppa_to_line(ppa);
148 line = &pblk->lines[line_id]; 148 line = &pblk->lines[line_id];
149 paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); 149 paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
150 150
@@ -650,7 +650,7 @@ next_rq:
650 } else { 650 } else {
651 for (i = 0; i < rqd.nr_ppas; ) { 651 for (i = 0; i < rqd.nr_ppas; ) {
652 struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); 652 struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
653 int pos = pblk_dev_ppa_to_pos(geo, ppa); 653 int pos = pblk_ppa_to_pos(geo, ppa);
654 int read_type = PBLK_READ_RANDOM; 654 int read_type = PBLK_READ_RANDOM;
655 655
656 if (pblk_io_aligned(pblk, rq_ppas)) 656 if (pblk_io_aligned(pblk, rq_ppas))
@@ -668,7 +668,7 @@ next_rq:
668 } 668 }
669 669
670 ppa = addr_to_gen_ppa(pblk, paddr, id); 670 ppa = addr_to_gen_ppa(pblk, paddr, id);
671 pos = pblk_dev_ppa_to_pos(geo, ppa); 671 pos = pblk_ppa_to_pos(geo, ppa);
672 } 672 }
673 673
674 if (pblk_boundary_paddr_checks(pblk, paddr + min)) { 674 if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
@@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
742 cmd_op = NVM_OP_PWRITE; 742 cmd_op = NVM_OP_PWRITE;
743 flags = pblk_set_progr_mode(pblk, PBLK_WRITE); 743 flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
744 lba_list = emeta_to_lbas(pblk, line->emeta->buf); 744 lba_list = emeta_to_lbas(pblk, line->emeta->buf);
745 } else if (dir == PBLK_READ) { 745 } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
746 bio_op = REQ_OP_READ; 746 bio_op = REQ_OP_READ;
747 cmd_op = NVM_OP_PREAD; 747 cmd_op = NVM_OP_PREAD;
748 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 748 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
802 if (rqd.error) { 802 if (rqd.error) {
803 if (dir == PBLK_WRITE) 803 if (dir == PBLK_WRITE)
804 pblk_log_write_err(pblk, &rqd); 804 pblk_log_write_err(pblk, &rqd);
805 else 805 else if (dir == PBLK_READ)
806 pblk_log_read_err(pblk, &rqd); 806 pblk_log_read_err(pblk, &rqd);
807 } 807 }
808 808
@@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
816{ 816{
817 u64 bpaddr = pblk_line_smeta_start(pblk, line); 817 u64 bpaddr = pblk_line_smeta_start(pblk, line);
818 818
819 return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); 819 return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
820} 820}
821 821
822int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, 822int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
@@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
854 struct nvm_geo *geo = &dev->geo; 854 struct nvm_geo *geo = &dev->geo;
855 855
856 pr_err("pblk: could not sync erase line:%d,blk:%d\n", 856 pr_err("pblk: could not sync erase line:%d,blk:%d\n",
857 pblk_dev_ppa_to_line(ppa), 857 pblk_ppa_to_line(ppa),
858 pblk_dev_ppa_to_pos(geo, ppa)); 858 pblk_ppa_to_pos(geo, ppa));
859 859
860 rqd.error = ret; 860 rqd.error = ret;
861 goto out; 861 goto out;
@@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
979 979
980 /* Start metadata */ 980 /* Start metadata */
981 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); 981 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
982 smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); 982 smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
983 983
984 /* Fill metadata among lines */ 984 /* Fill metadata among lines */
985 if (cur) { 985 if (cur) {
@@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1032 lm->sec_per_line); 1032 lm->sec_per_line);
1033 bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, 1033 bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
1034 lm->sec_per_line); 1034 lm->sec_per_line);
1035 line->sec_in_line -= geo->sec_per_blk; 1035 line->sec_in_line -= geo->sec_per_chk;
1036 if (bit >= lm->emeta_bb) 1036 if (bit >= lm->emeta_bb)
1037 nr_bb++; 1037 nr_bb++;
1038 } 1038 }
@@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
1145 } 1145 }
1146 spin_unlock(&l_mg->free_lock); 1146 spin_unlock(&l_mg->free_lock);
1147 1147
1148 pblk_rl_free_lines_dec(&pblk->rl, line); 1148 pblk_rl_free_lines_dec(&pblk->rl, line, true);
1149 1149
1150 if (!pblk_line_init_bb(pblk, line, 0)) { 1150 if (!pblk_line_init_bb(pblk, line, 0)) {
1151 list_add(&line->list, &l_mg->free_list); 1151 list_add(&line->list, &l_mg->free_list);
@@ -1233,7 +1233,7 @@ retry:
1233 l_mg->data_line = retry_line; 1233 l_mg->data_line = retry_line;
1234 spin_unlock(&l_mg->free_lock); 1234 spin_unlock(&l_mg->free_lock);
1235 1235
1236 pblk_rl_free_lines_dec(&pblk->rl, retry_line); 1236 pblk_rl_free_lines_dec(&pblk->rl, line, false);
1237 1237
1238 if (pblk_line_erase(pblk, retry_line)) 1238 if (pblk_line_erase(pblk, retry_line))
1239 goto retry; 1239 goto retry;
@@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1252{ 1252{
1253 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1253 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1254 struct pblk_line *line; 1254 struct pblk_line *line;
1255 int is_next = 0;
1256 1255
1257 spin_lock(&l_mg->free_lock); 1256 spin_lock(&l_mg->free_lock);
1258 line = pblk_line_get(pblk); 1257 line = pblk_line_get(pblk);
@@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1280 } else { 1279 } else {
1281 l_mg->data_next->seq_nr = l_mg->d_seq_nr++; 1280 l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
1282 l_mg->data_next->type = PBLK_LINETYPE_DATA; 1281 l_mg->data_next->type = PBLK_LINETYPE_DATA;
1283 is_next = 1;
1284 } 1282 }
1285 spin_unlock(&l_mg->free_lock); 1283 spin_unlock(&l_mg->free_lock);
1286 1284
@@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1290 return NULL; 1288 return NULL;
1291 } 1289 }
1292 1290
1293 pblk_rl_free_lines_dec(&pblk->rl, line);
1294 if (is_next)
1295 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
1296
1297retry_setup: 1291retry_setup:
1298 if (!pblk_line_init_metadata(pblk, line, NULL)) { 1292 if (!pblk_line_init_metadata(pblk, line, NULL)) {
1299 line = pblk_line_retry(pblk, line); 1293 line = pblk_line_retry(pblk, line);
@@ -1311,6 +1305,8 @@ retry_setup:
1311 goto retry_setup; 1305 goto retry_setup;
1312 } 1306 }
1313 1307
1308 pblk_rl_free_lines_dec(&pblk->rl, line, true);
1309
1314 return line; 1310 return line;
1315} 1311}
1316 1312
@@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
1395 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1391 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1396 struct pblk_line *cur, *new = NULL; 1392 struct pblk_line *cur, *new = NULL;
1397 unsigned int left_seblks; 1393 unsigned int left_seblks;
1398 int is_next = 0;
1399 1394
1400 cur = l_mg->data_line; 1395 cur = l_mg->data_line;
1401 new = l_mg->data_next; 1396 new = l_mg->data_next;
@@ -1444,6 +1439,8 @@ retry_setup:
1444 goto retry_setup; 1439 goto retry_setup;
1445 } 1440 }
1446 1441
1442 pblk_rl_free_lines_dec(&pblk->rl, new, true);
1443
1447 /* Allocate next line for preparation */ 1444 /* Allocate next line for preparation */
1448 spin_lock(&l_mg->free_lock); 1445 spin_lock(&l_mg->free_lock);
1449 l_mg->data_next = pblk_line_get(pblk); 1446 l_mg->data_next = pblk_line_get(pblk);
@@ -1457,13 +1454,9 @@ retry_setup:
1457 } else { 1454 } else {
1458 l_mg->data_next->seq_nr = l_mg->d_seq_nr++; 1455 l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
1459 l_mg->data_next->type = PBLK_LINETYPE_DATA; 1456 l_mg->data_next->type = PBLK_LINETYPE_DATA;
1460 is_next = 1;
1461 } 1457 }
1462 spin_unlock(&l_mg->free_lock); 1458 spin_unlock(&l_mg->free_lock);
1463 1459
1464 if (is_next)
1465 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
1466
1467out: 1460out:
1468 return new; 1461 return new;
1469} 1462}
@@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
1561 struct nvm_geo *geo = &dev->geo; 1554 struct nvm_geo *geo = &dev->geo;
1562 1555
1563 pr_err("pblk: could not async erase line:%d,blk:%d\n", 1556 pr_err("pblk: could not async erase line:%d,blk:%d\n",
1564 pblk_dev_ppa_to_line(ppa), 1557 pblk_ppa_to_line(ppa),
1565 pblk_dev_ppa_to_pos(geo, ppa)); 1558 pblk_ppa_to_pos(geo, ppa));
1566 } 1559 }
1567 1560
1568 return err; 1561 return err;
@@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
1746 struct nvm_tgt_dev *dev = pblk->dev; 1739 struct nvm_tgt_dev *dev = pblk->dev;
1747 struct nvm_geo *geo = &dev->geo; 1740 struct nvm_geo *geo = &dev->geo;
1748 struct pblk_lun *rlun; 1741 struct pblk_lun *rlun;
1749 int nr_luns = geo->nr_luns; 1742 int nr_luns = geo->all_luns;
1750 int bit = -1; 1743 int bit = -1;
1751 1744
1752 while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { 1745 while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
@@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
1884 1877
1885 /* If the L2P entry maps to a line, the reference is valid */ 1878 /* If the L2P entry maps to a line, the reference is valid */
1886 if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { 1879 if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
1887 int line_id = pblk_dev_ppa_to_line(ppa); 1880 int line_id = pblk_ppa_to_line(ppa);
1888 struct pblk_line *line = &pblk->lines[line_id]; 1881 struct pblk_line *line = &pblk->lines[line_id];
1889 1882
1890 kref_get(&line->ref); 1883 kref_get(&line->ref);
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 9c8e114c8a54..3d899383666e 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
169 * the line untouched. TODO: Implement a recovery routine that scans and 169 * the line untouched. TODO: Implement a recovery routine that scans and
170 * moves all sectors on the line. 170 * moves all sectors on the line.
171 */ 171 */
172 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); 172
173 ret = pblk_recov_check_emeta(pblk, emeta_buf);
174 if (ret) {
175 pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
176 goto fail_free_emeta;
177 }
178
179 lba_list = emeta_to_lbas(pblk, emeta_buf);
173 if (!lba_list) { 180 if (!lba_list) {
174 pr_err("pblk: could not interpret emeta (line %d)\n", line->id); 181 pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
175 goto fail_free_emeta; 182 goto fail_free_emeta;
@@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk)
519 } 526 }
520} 527}
521 528
522/*
523 * If flush_wq == 1 then no lock should be held by the caller since
524 * flush_workqueue can sleep
525 */
526static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
527{
528 pblk->gc.gc_active = 0;
529 pr_debug("pblk: gc stop\n");
530}
531
532void pblk_gc_should_stop(struct pblk *pblk) 529void pblk_gc_should_stop(struct pblk *pblk)
533{ 530{
534 struct pblk_gc *gc = &pblk->gc; 531 struct pblk_gc *gc = &pblk->gc;
535 532
536 if (gc->gc_active && !gc->gc_forced) 533 if (gc->gc_active && !gc->gc_forced)
537 pblk_gc_stop(pblk, 0); 534 gc->gc_active = 0;
538} 535}
539 536
540void pblk_gc_should_kick(struct pblk *pblk) 537void pblk_gc_should_kick(struct pblk *pblk)
@@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk)
660 657
661 gc->gc_enabled = 0; 658 gc->gc_enabled = 0;
662 del_timer_sync(&gc->gc_timer); 659 del_timer_sync(&gc->gc_timer);
663 pblk_gc_stop(pblk, 1); 660 gc->gc_active = 0;
664 661
665 if (gc->gc_ts) 662 if (gc->gc_ts)
666 kthread_stop(gc->gc_ts); 663 kthread_stop(gc->gc_ts);
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 695826a06b5d..93d671ca518e 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk)
169 } 169 }
170 ppaf.ch_len = power_len; 170 ppaf.ch_len = power_len;
171 171
172 power_len = get_count_order(geo->luns_per_chnl); 172 power_len = get_count_order(geo->nr_luns);
173 if (1 << power_len != geo->luns_per_chnl) { 173 if (1 << power_len != geo->nr_luns) {
174 pr_err("pblk: supports only power-of-two LUN config.\n"); 174 pr_err("pblk: supports only power-of-two LUN config.\n");
175 return -EINVAL; 175 return -EINVAL;
176 } 176 }
@@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk)
254 struct nvm_geo *geo = &dev->geo; 254 struct nvm_geo *geo = &dev->geo;
255 255
256 pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * 256 pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
257 geo->nr_planes * geo->nr_luns; 257 geo->nr_planes * geo->all_luns;
258 258
259 if (pblk_init_global_caches(pblk)) 259 if (pblk_init_global_caches(pblk))
260 return -ENOMEM; 260 return -ENOMEM;
@@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk)
270 if (!pblk->gen_ws_pool) 270 if (!pblk->gen_ws_pool)
271 goto free_page_bio_pool; 271 goto free_page_bio_pool;
272 272
273 pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); 273 pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
274 pblk_rec_cache);
274 if (!pblk->rec_pool) 275 if (!pblk->rec_pool)
275 goto free_gen_ws_pool; 276 goto free_gen_ws_pool;
276 277
277 pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, 278 pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
278 pblk_g_rq_cache); 279 pblk_g_rq_cache);
279 if (!pblk->r_rq_pool) 280 if (!pblk->r_rq_pool)
280 goto free_rec_pool; 281 goto free_rec_pool;
281 282
282 pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, 283 pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
283 pblk_g_rq_cache); 284 pblk_g_rq_cache);
284 if (!pblk->e_rq_pool) 285 if (!pblk->e_rq_pool)
285 goto free_r_rq_pool; 286 goto free_r_rq_pool;
286 287
287 pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, 288 pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
288 pblk_w_rq_cache); 289 pblk_w_rq_cache);
289 if (!pblk->w_rq_pool) 290 if (!pblk->w_rq_pool)
290 goto free_e_rq_pool; 291 goto free_e_rq_pool;
@@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk)
354 mempool_destroy(pblk->e_rq_pool); 355 mempool_destroy(pblk->e_rq_pool);
355 mempool_destroy(pblk->w_rq_pool); 356 mempool_destroy(pblk->w_rq_pool);
356 357
358 pblk_rwb_free(pblk);
359
357 pblk_free_global_caches(pblk); 360 pblk_free_global_caches(pblk);
358} 361}
359 362
@@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
409 u8 *blks; 412 u8 *blks;
410 int nr_blks, ret; 413 int nr_blks, ret;
411 414
412 nr_blks = geo->blks_per_lun * geo->plane_mode; 415 nr_blks = geo->nr_chks * geo->plane_mode;
413 blks = kmalloc(nr_blks, GFP_KERNEL); 416 blks = kmalloc(nr_blks, GFP_KERNEL);
414 if (!blks) 417 if (!blks)
415 return -ENOMEM; 418 return -ENOMEM;
@@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
482 int i, ret; 485 int i, ret;
483 486
484 /* TODO: Implement unbalanced LUN support */ 487 /* TODO: Implement unbalanced LUN support */
485 if (geo->luns_per_chnl < 0) { 488 if (geo->nr_luns < 0) {
486 pr_err("pblk: unbalanced LUN config.\n"); 489 pr_err("pblk: unbalanced LUN config.\n");
487 return -EINVAL; 490 return -EINVAL;
488 } 491 }
489 492
490 pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); 493 pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
494 GFP_KERNEL);
491 if (!pblk->luns) 495 if (!pblk->luns)
492 return -ENOMEM; 496 return -ENOMEM;
493 497
494 for (i = 0; i < geo->nr_luns; i++) { 498 for (i = 0; i < geo->all_luns; i++) {
495 /* Stripe across channels */ 499 /* Stripe across channels */
496 int ch = i % geo->nr_chnls; 500 int ch = i % geo->nr_chnls;
497 int lun_raw = i / geo->nr_chnls; 501 int lun_raw = i / geo->nr_chnls;
498 int lunid = lun_raw + ch * geo->luns_per_chnl; 502 int lunid = lun_raw + ch * geo->nr_luns;
499 503
500 rlun = &pblk->luns[i]; 504 rlun = &pblk->luns[i];
501 rlun->bppa = luns[lunid]; 505 rlun->bppa = luns[lunid];
@@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
577static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) 581static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
578{ 582{
579 struct nvm_tgt_dev *dev = pblk->dev; 583 struct nvm_tgt_dev *dev = pblk->dev;
584 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
585 struct pblk_line_meta *lm = &pblk->lm;
580 struct nvm_geo *geo = &dev->geo; 586 struct nvm_geo *geo = &dev->geo;
581 sector_t provisioned; 587 sector_t provisioned;
588 int sec_meta, blk_meta;
582 589
583 pblk->over_pct = 20; 590 if (geo->op == NVM_TARGET_DEFAULT_OP)
591 pblk->op = PBLK_DEFAULT_OP;
592 else
593 pblk->op = geo->op;
584 594
585 provisioned = nr_free_blks; 595 provisioned = nr_free_blks;
586 provisioned *= (100 - pblk->over_pct); 596 provisioned *= (100 - pblk->op);
587 sector_div(provisioned, 100); 597 sector_div(provisioned, 100);
588 598
599 pblk->op_blks = nr_free_blks - provisioned;
600
589 /* Internally pblk manages all free blocks, but all calculations based 601 /* Internally pblk manages all free blocks, but all calculations based
590 * on user capacity consider only provisioned blocks 602 * on user capacity consider only provisioned blocks
591 */ 603 */
592 pblk->rl.total_blocks = nr_free_blks; 604 pblk->rl.total_blocks = nr_free_blks;
593 pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; 605 pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
594 pblk->capacity = provisioned * geo->sec_per_blk; 606
607 /* Consider sectors used for metadata */
608 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
609 blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
610
611 pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
612
595 atomic_set(&pblk->rl.free_blocks, nr_free_blks); 613 atomic_set(&pblk->rl.free_blocks, nr_free_blks);
614 atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
596} 615}
597 616
598static int pblk_lines_alloc_metadata(struct pblk *pblk) 617static int pblk_lines_alloc_metadata(struct pblk *pblk)
@@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk)
683 int i, ret; 702 int i, ret;
684 703
685 pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); 704 pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
686 max_write_ppas = pblk->min_write_pgs * geo->nr_luns; 705 max_write_ppas = pblk->min_write_pgs * geo->all_luns;
687 pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? 706 pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
688 max_write_ppas : nvm_max_phys_sects(dev); 707 max_write_ppas : nvm_max_phys_sects(dev);
689 pblk_set_sec_per_write(pblk, pblk->min_write_pgs); 708 pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
@@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk)
693 return -EINVAL; 712 return -EINVAL;
694 } 713 }
695 714
696 div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); 715 div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
697 if (mod) { 716 if (mod) {
698 pr_err("pblk: bad configuration of sectors/pages\n"); 717 pr_err("pblk: bad configuration of sectors/pages\n");
699 return -EINVAL; 718 return -EINVAL;
700 } 719 }
701 720
702 l_mg->nr_lines = geo->blks_per_lun; 721 l_mg->nr_lines = geo->nr_chks;
703 l_mg->log_line = l_mg->data_line = NULL; 722 l_mg->log_line = l_mg->data_line = NULL;
704 l_mg->l_seq_nr = l_mg->d_seq_nr = 0; 723 l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
705 l_mg->nr_free_lines = 0; 724 l_mg->nr_free_lines = 0;
706 bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); 725 bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
707 726
708 lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; 727 lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
709 lm->blk_per_line = geo->nr_luns; 728 lm->blk_per_line = geo->all_luns;
710 lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 729 lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
711 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); 730 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
712 lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 731 lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
713 lm->mid_thrs = lm->sec_per_line / 2; 732 lm->mid_thrs = lm->sec_per_line / 2;
714 lm->high_thrs = lm->sec_per_line / 4; 733 lm->high_thrs = lm->sec_per_line / 4;
715 lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; 734 lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
716 735
717 /* Calculate necessary pages for smeta. See comment over struct 736 /* Calculate necessary pages for smeta. See comment over struct
718 * line_smeta definition 737 * line_smeta definition
@@ -742,12 +761,12 @@ add_emeta_page:
742 goto add_emeta_page; 761 goto add_emeta_page;
743 } 762 }
744 763
745 lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; 764 lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
746 765
747 lm->min_blk_line = 1; 766 lm->min_blk_line = 1;
748 if (geo->nr_luns > 1) 767 if (geo->all_luns > 1)
749 lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + 768 lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
750 lm->emeta_sec[0], geo->sec_per_blk); 769 lm->emeta_sec[0], geo->sec_per_chk);
751 770
752 if (lm->min_blk_line > lm->blk_per_line) { 771 if (lm->min_blk_line > lm->blk_per_line) {
753 pr_err("pblk: config. not supported. Min. LUN in line:%d\n", 772 pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
@@ -772,7 +791,7 @@ add_emeta_page:
772 goto fail_free_bb_template; 791 goto fail_free_bb_template;
773 } 792 }
774 793
775 bb_distance = (geo->nr_luns) * geo->sec_per_pl; 794 bb_distance = (geo->all_luns) * geo->sec_per_pl;
776 for (i = 0; i < lm->sec_per_line; i += bb_distance) 795 for (i = 0; i < lm->sec_per_line; i += bb_distance)
777 bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); 796 bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
778 797
@@ -844,7 +863,7 @@ add_emeta_page:
844 pblk_set_provision(pblk, nr_free_blks); 863 pblk_set_provision(pblk, nr_free_blks);
845 864
846 /* Cleanup per-LUN bad block lists - managed within lines on run-time */ 865 /* Cleanup per-LUN bad block lists - managed within lines on run-time */
847 for (i = 0; i < geo->nr_luns; i++) 866 for (i = 0; i < geo->all_luns; i++)
848 kfree(pblk->luns[i].bb_list); 867 kfree(pblk->luns[i].bb_list);
849 868
850 return 0; 869 return 0;
@@ -858,7 +877,7 @@ fail_free_bb_template:
858fail_free_meta: 877fail_free_meta:
859 pblk_line_meta_free(pblk); 878 pblk_line_meta_free(pblk);
860fail: 879fail:
861 for (i = 0; i < geo->nr_luns; i++) 880 for (i = 0; i < geo->all_luns; i++)
862 kfree(pblk->luns[i].bb_list); 881 kfree(pblk->luns[i].bb_list);
863 882
864 return ret; 883 return ret;
@@ -866,15 +885,19 @@ fail:
866 885
867static int pblk_writer_init(struct pblk *pblk) 886static int pblk_writer_init(struct pblk *pblk)
868{ 887{
869 timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
870 mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
871
872 pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); 888 pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
873 if (IS_ERR(pblk->writer_ts)) { 889 if (IS_ERR(pblk->writer_ts)) {
874 pr_err("pblk: could not allocate writer kthread\n"); 890 int err = PTR_ERR(pblk->writer_ts);
875 return PTR_ERR(pblk->writer_ts); 891
892 if (err != -EINTR)
893 pr_err("pblk: could not allocate writer kthread (%d)\n",
894 err);
895 return err;
876 } 896 }
877 897
898 timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
899 mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
900
878 return 0; 901 return 0;
879} 902}
880 903
@@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk)
910 pblk_pipeline_stop(pblk); 933 pblk_pipeline_stop(pblk);
911 pblk_writer_stop(pblk); 934 pblk_writer_stop(pblk);
912 pblk_rb_sync_l2p(&pblk->rwb); 935 pblk_rb_sync_l2p(&pblk->rwb);
913 pblk_rwb_free(pblk);
914 pblk_rl_free(&pblk->rl); 936 pblk_rl_free(&pblk->rl);
915 937
916 pr_debug("pblk: consistent tear down\n"); 938 pr_debug("pblk: consistent tear down\n");
@@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1025 1047
1026 ret = pblk_writer_init(pblk); 1048 ret = pblk_writer_init(pblk);
1027 if (ret) { 1049 if (ret) {
1028 pr_err("pblk: could not initialize write thread\n"); 1050 if (ret != -EINTR)
1051 pr_err("pblk: could not initialize write thread\n");
1029 goto fail_free_lines; 1052 goto fail_free_lines;
1030 } 1053 }
1031 1054
@@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1041 1064
1042 blk_queue_write_cache(tqueue, true, false); 1065 blk_queue_write_cache(tqueue, true, false);
1043 1066
1044 tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; 1067 tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
1045 tqueue->limits.discard_alignment = 0; 1068 tqueue->limits.discard_alignment = 0;
1046 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); 1069 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
1047 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); 1070 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
1048 1071
1049 pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", 1072 pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
1050 geo->nr_luns, pblk->l_mg.nr_lines, 1073 tdisk->disk_name,
1074 geo->all_luns, pblk->l_mg.nr_lines,
1051 (unsigned long long)pblk->rl.nr_secs, 1075 (unsigned long long)pblk->rl.nr_secs,
1052 pblk->rwb.nr_entries); 1076 pblk->rwb.nr_entries);
1053 1077
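
[Editor's note] The pblk-init.c hunks above replace the hard-coded 20% over-provisioning with the value supplied by the device geometry (falling back to PBLK_DEFAULT_OP, 11%), and subtract the blocks consumed by per-line metadata before computing the exposed capacity. A minimal userspace sketch of that arithmetic is shown below; the sample geometry numbers and the NVM_TARGET_DEFAULT_OP sentinel value are invented for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)    (((n) + (d) - 1) / (d))
#define PBLK_DEFAULT_OP       11         /* per the pblk.h hunk further down */
#define NVM_TARGET_DEFAULT_OP (-1)       /* placeholder sentinel for "not set" */

/* Sketch of the provisioning math in pblk_set_provision() (hypothetical values). */
int main(void)
{
	long nr_free_blks = 1020;        /* free chunks reported by the device */
	int geo_op = NVM_TARGET_DEFAULT_OP;
	int sec_per_chk = 4096;          /* sectors per chunk */
	int smeta_sec = 8, emeta_sec0 = 64, nr_free_lines = 60;

	int op = (geo_op == NVM_TARGET_DEFAULT_OP) ? PBLK_DEFAULT_OP : geo_op;
	long provisioned = nr_free_blks * (100 - op) / 100;
	long op_blks = nr_free_blks - provisioned;

	/* Blocks consumed by per-line start/end metadata. */
	long sec_meta = (long)(smeta_sec + emeta_sec0) * nr_free_lines;
	long blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);

	long capacity = (provisioned - blk_meta) * sec_per_chk;

	printf("op=%d%% op_blks=%ld capacity=%ld sectors\n", op, op_blks, capacity);
	return 0;
}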
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6f3ecde2140f..7445e6430c52 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
146 return; 146 return;
147 147
148 /* Erase blocks that are bad in this line but might not be in next */ 148 /* Erase blocks that are bad in this line but might not be in next */
149 if (unlikely(ppa_empty(*erase_ppa)) && 149 if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
150 bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { 150 bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
151 int bit = -1; 151 int bit = -1;
152 152
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b8f78e401482..ec8fc314646b 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
54 rb->seg_size = (1 << power_seg_sz); 54 rb->seg_size = (1 << power_seg_sz);
55 rb->nr_entries = (1 << power_size); 55 rb->nr_entries = (1 << power_size);
56 rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; 56 rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
57 rb->sync_point = EMPTY_ENTRY; 57 rb->flush_point = EMPTY_ENTRY;
58 58
59 spin_lock_init(&rb->w_lock); 59 spin_lock_init(&rb->w_lock);
60 spin_lock_init(&rb->s_lock); 60 spin_lock_init(&rb->s_lock);
@@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
112 up_write(&pblk_rb_lock); 112 up_write(&pblk_rb_lock);
113 113
114#ifdef CONFIG_NVM_DEBUG 114#ifdef CONFIG_NVM_DEBUG
115 atomic_set(&rb->inflight_sync_point, 0); 115 atomic_set(&rb->inflight_flush_point, 0);
116#endif 116#endif
117 117
118 /* 118 /*
@@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
226 pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, 226 pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
227 entry->cacheline); 227 entry->cacheline);
228 228
229 line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; 229 line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
230 kref_put(&line->ref, pblk_line_put); 230 kref_put(&line->ref, pblk_line_put);
231 clean_wctx(w_ctx); 231 clean_wctx(w_ctx);
232 rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); 232 rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
@@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
349 smp_store_release(&entry->w_ctx.flags, flags); 349 smp_store_release(&entry->w_ctx.flags, flags);
350} 350}
351 351
352static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, 352static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
353 unsigned int pos) 353 unsigned int pos)
354{ 354{
355 struct pblk_rb_entry *entry; 355 struct pblk_rb_entry *entry;
356 unsigned int subm, sync_point; 356 unsigned int sync, flush_point;
357 357
358 subm = READ_ONCE(rb->subm); 358 sync = READ_ONCE(rb->sync);
359
360 if (pos == sync)
361 return 0;
359 362
360#ifdef CONFIG_NVM_DEBUG 363#ifdef CONFIG_NVM_DEBUG
361 atomic_inc(&rb->inflight_sync_point); 364 atomic_inc(&rb->inflight_flush_point);
362#endif 365#endif
363 366
364 if (pos == subm) 367 flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
365 return 0; 368 entry = &rb->entries[flush_point];
366 369
367 sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); 370 pblk_rb_sync_init(rb, NULL);
368 entry = &rb->entries[sync_point];
369 371
370 /* Protect syncs */ 372 /* Protect flush points */
371 smp_store_release(&rb->sync_point, sync_point); 373 smp_store_release(&rb->flush_point, flush_point);
372 374
373 if (!bio) 375 if (bio)
374 return 0; 376 bio_list_add(&entry->w_ctx.bios, bio);
375 377
376 spin_lock_irq(&rb->s_lock); 378 pblk_rb_sync_end(rb, NULL);
377 bio_list_add(&entry->w_ctx.bios, bio);
378 spin_unlock_irq(&rb->s_lock);
379 379
380 return 1; 380 return bio ? 1 : 0;
381} 381}
382 382
383static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, 383static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
@@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
416 struct pblk *pblk = container_of(rb, struct pblk, rwb); 416 struct pblk *pblk = container_of(rb, struct pblk, rwb);
417 unsigned int mem = READ_ONCE(rb->mem); 417 unsigned int mem = READ_ONCE(rb->mem);
418 418
419 if (pblk_rb_sync_point_set(rb, NULL, mem)) 419 if (pblk_rb_flush_point_set(rb, NULL, mem))
420 return; 420 return;
421 421
422 pblk_write_should_kick(pblk); 422 pblk_write_should_kick(pblk);
@@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
440#ifdef CONFIG_NVM_DEBUG 440#ifdef CONFIG_NVM_DEBUG
441 atomic_long_inc(&pblk->nr_flush); 441 atomic_long_inc(&pblk->nr_flush);
442#endif 442#endif
443 if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) 443 if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
444 *io_ret = NVM_IO_OK; 444 *io_ret = NVM_IO_OK;
445 } 445 }
446 446
@@ -606,21 +606,6 @@ try:
606 return NVM_IO_ERR; 606 return NVM_IO_ERR;
607 } 607 }
608 608
609 if (flags & PBLK_FLUSH_ENTRY) {
610 unsigned int sync_point;
611
612 sync_point = READ_ONCE(rb->sync_point);
613 if (sync_point == pos) {
614 /* Protect syncs */
615 smp_store_release(&rb->sync_point, EMPTY_ENTRY);
616 }
617
618 flags &= ~PBLK_FLUSH_ENTRY;
619#ifdef CONFIG_NVM_DEBUG
620 atomic_dec(&rb->inflight_sync_point);
621#endif
622 }
623
624 flags &= ~PBLK_WRITTEN_DATA; 609 flags &= ~PBLK_WRITTEN_DATA;
625 flags |= PBLK_SUBMITTED_ENTRY; 610 flags |= PBLK_SUBMITTED_ENTRY;
626 611
@@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
730 715
731unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) 716unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
732{ 717{
733 unsigned int sync; 718 unsigned int sync, flush_point;
734 unsigned int i;
735
736 lockdep_assert_held(&rb->s_lock); 719 lockdep_assert_held(&rb->s_lock);
737 720
738 sync = READ_ONCE(rb->sync); 721 sync = READ_ONCE(rb->sync);
722 flush_point = READ_ONCE(rb->flush_point);
739 723
740 for (i = 0; i < nr_entries; i++) 724 if (flush_point != EMPTY_ENTRY) {
741 sync = (sync + 1) & (rb->nr_entries - 1); 725 unsigned int secs_to_flush;
726
727 secs_to_flush = pblk_rb_ring_count(flush_point, sync,
728 rb->nr_entries);
729 if (secs_to_flush < nr_entries) {
730 /* Protect flush points */
731 smp_store_release(&rb->flush_point, EMPTY_ENTRY);
732 }
733 }
734
735 sync = (sync + nr_entries) & (rb->nr_entries - 1);
742 736
743 /* Protect from counts */ 737 /* Protect from counts */
744 smp_store_release(&rb->sync, sync); 738 smp_store_release(&rb->sync, sync);
@@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
746 return sync; 740 return sync;
747} 741}
748 742
749unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) 743/* Calculate how many sectors to submit up to the current flush point. */
744unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
750{ 745{
751 unsigned int subm, sync_point; 746 unsigned int subm, sync, flush_point;
752 unsigned int count; 747 unsigned int submitted, to_flush;
753 748
754 /* Protect syncs */ 749 /* Protect flush points */
755 sync_point = smp_load_acquire(&rb->sync_point); 750 flush_point = smp_load_acquire(&rb->flush_point);
756 if (sync_point == EMPTY_ENTRY) 751 if (flush_point == EMPTY_ENTRY)
757 return 0; 752 return 0;
758 753
754 /* Protect syncs */
755 sync = smp_load_acquire(&rb->sync);
756
759 subm = READ_ONCE(rb->subm); 757 subm = READ_ONCE(rb->subm);
758 submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
760 759
761 /* The sync point itself counts as a sector to sync */ 760 /* The sync point itself counts as a sector to sync */
762 count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; 761 to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
763 762
764 return count; 763 return (submitted < to_flush) ? (to_flush - submitted) : 0;
765} 764}
766 765
767/* 766/*
@@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb)
801 800
802 if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && 801 if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
803 (rb->sync == rb->l2p_update) && 802 (rb->sync == rb->l2p_update) &&
804 (rb->sync_point == EMPTY_ENTRY)) { 803 (rb->flush_point == EMPTY_ENTRY)) {
805 goto out; 804 goto out;
806 } 805 }
807 806
@@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
848 queued_entries++; 847 queued_entries++;
849 spin_unlock_irq(&rb->s_lock); 848 spin_unlock_irq(&rb->s_lock);
850 849
851 if (rb->sync_point != EMPTY_ENTRY) 850 if (rb->flush_point != EMPTY_ENTRY)
852 offset = scnprintf(buf, PAGE_SIZE, 851 offset = scnprintf(buf, PAGE_SIZE,
853 "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", 852 "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
854 rb->nr_entries, 853 rb->nr_entries,
@@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
857 rb->sync, 856 rb->sync,
858 rb->l2p_update, 857 rb->l2p_update,
859#ifdef CONFIG_NVM_DEBUG 858#ifdef CONFIG_NVM_DEBUG
860 atomic_read(&rb->inflight_sync_point), 859 atomic_read(&rb->inflight_flush_point),
861#else 860#else
862 0, 861 0,
863#endif 862#endif
864 rb->sync_point, 863 rb->flush_point,
865 pblk_rb_read_count(rb), 864 pblk_rb_read_count(rb),
866 pblk_rb_space(rb), 865 pblk_rb_space(rb),
867 pblk_rb_sync_point_count(rb), 866 pblk_rb_flush_point_count(rb),
868 queued_entries); 867 queued_entries);
869 else 868 else
870 offset = scnprintf(buf, PAGE_SIZE, 869 offset = scnprintf(buf, PAGE_SIZE,
@@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
875 rb->sync, 874 rb->sync,
876 rb->l2p_update, 875 rb->l2p_update,
877#ifdef CONFIG_NVM_DEBUG 876#ifdef CONFIG_NVM_DEBUG
878 atomic_read(&rb->inflight_sync_point), 877 atomic_read(&rb->inflight_flush_point),
879#else 878#else
880 0, 879 0,
881#endif 880#endif
882 pblk_rb_read_count(rb), 881 pblk_rb_read_count(rb),
883 pblk_rb_space(rb), 882 pblk_rb_space(rb),
884 pblk_rb_sync_point_count(rb), 883 pblk_rb_flush_point_count(rb),
885 queued_entries); 884 queued_entries);
886 885
887 return offset; 886 return offset;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index ca79d8fb3e60..2f761283f43e 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
141 struct ppa_addr ppa = ppa_list[i]; 141 struct ppa_addr ppa = ppa_list[i];
142 struct pblk_line *line; 142 struct pblk_line *line;
143 143
144 line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; 144 line = &pblk->lines[pblk_ppa_to_line(ppa)];
145 kref_put(&line->ref, pblk_line_put_wq); 145 kref_put(&line->ref, pblk_line_put_wq);
146 } 146 }
147} 147}
@@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio)
158static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, 158static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
159 bool put_line) 159 bool put_line)
160{ 160{
161 struct nvm_tgt_dev *dev = pblk->dev;
161 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); 162 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
162 struct bio *bio = rqd->bio; 163 struct bio *bio = rqd->bio;
164 unsigned long start_time = r_ctx->start_time;
165
166 generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
163 167
164 if (rqd->error) 168 if (rqd->error)
165 pblk_log_read_err(pblk, rqd); 169 pblk_log_read_err(pblk, rqd);
@@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
193 __pblk_end_io_read(pblk, rqd, true); 197 __pblk_end_io_read(pblk, rqd, true);
194} 198}
195 199
196static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, 200static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
197 unsigned int bio_init_idx, 201 unsigned int bio_init_idx,
198 unsigned long *read_bitmap) 202 unsigned long *read_bitmap)
199{ 203{
200 struct bio *new_bio, *bio = rqd->bio; 204 struct bio *new_bio, *bio = rqd->bio;
201 struct pblk_sec_meta *meta_list = rqd->meta_list; 205 struct pblk_sec_meta *meta_list = rqd->meta_list;
@@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
270 i = 0; 274 i = 0;
271 hole = find_first_zero_bit(read_bitmap, nr_secs); 275 hole = find_first_zero_bit(read_bitmap, nr_secs);
272 do { 276 do {
273 int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); 277 int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
274 struct pblk_line *line = &pblk->lines[line_id]; 278 struct pblk_line *line = &pblk->lines[line_id];
275 279
276 kref_put(&line->ref, pblk_line_put); 280 kref_put(&line->ref, pblk_line_put);
@@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
306 return NVM_IO_OK; 310 return NVM_IO_OK;
307 311
308err: 312err:
313 pr_err("pblk: failed to perform partial read\n");
314
309 /* Free allocated pages in new bio */ 315 /* Free allocated pages in new bio */
310 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); 316 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
311 __pblk_end_io_read(pblk, rqd, false); 317 __pblk_end_io_read(pblk, rqd, false);
@@ -357,6 +363,7 @@ retry:
357int pblk_submit_read(struct pblk *pblk, struct bio *bio) 363int pblk_submit_read(struct pblk *pblk, struct bio *bio)
358{ 364{
359 struct nvm_tgt_dev *dev = pblk->dev; 365 struct nvm_tgt_dev *dev = pblk->dev;
366 struct request_queue *q = dev->q;
360 sector_t blba = pblk_get_lba(bio); 367 sector_t blba = pblk_get_lba(bio);
361 unsigned int nr_secs = pblk_get_secs(bio); 368 unsigned int nr_secs = pblk_get_secs(bio);
362 struct pblk_g_ctx *r_ctx; 369 struct pblk_g_ctx *r_ctx;
@@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
372 return NVM_IO_ERR; 379 return NVM_IO_ERR;
373 } 380 }
374 381
382 generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
383
375 bitmap_zero(&read_bitmap, nr_secs); 384 bitmap_zero(&read_bitmap, nr_secs);
376 385
377 rqd = pblk_alloc_rqd(pblk, PBLK_READ); 386 rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
383 rqd->end_io = pblk_end_io_read; 392 rqd->end_io = pblk_end_io_read;
384 393
385 r_ctx = nvm_rq_to_pdu(rqd); 394 r_ctx = nvm_rq_to_pdu(rqd);
395 r_ctx->start_time = jiffies;
386 r_ctx->lba = blba; 396 r_ctx->lba = blba;
387 397
388 /* Save the index for this bio's start. This is needed in case 398 /* Save the index for this bio's start. This is needed in case
@@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
422 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); 432 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
423 if (!int_bio) { 433 if (!int_bio) {
424 pr_err("pblk: could not clone read bio\n"); 434 pr_err("pblk: could not clone read bio\n");
425 return NVM_IO_ERR; 435 goto fail_end_io;
426 } 436 }
427 437
428 rqd->bio = int_bio; 438 rqd->bio = int_bio;
@@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
433 pr_err("pblk: read IO submission failed\n"); 443 pr_err("pblk: read IO submission failed\n");
434 if (int_bio) 444 if (int_bio)
435 bio_put(int_bio); 445 bio_put(int_bio);
436 return ret; 446 goto fail_end_io;
437 } 447 }
438 448
439 return NVM_IO_OK; 449 return NVM_IO_OK;
@@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
442 /* The read bio request could be partially filled by the write buffer, 452 /* The read bio request could be partially filled by the write buffer,
443 * but there are some holes that need to be read from the drive. 453 * but there are some holes that need to be read from the drive.
444 */ 454 */
445 ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); 455 return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
446 if (ret) {
447 pr_err("pblk: failed to perform partial read\n");
448 return ret;
449 }
450
451 return NVM_IO_OK;
452 456
453fail_rqd_free: 457fail_rqd_free:
454 pblk_free_rqd(pblk, rqd, PBLK_READ); 458 pblk_free_rqd(pblk, rqd, PBLK_READ);
455 return ret; 459 return ret;
460fail_end_io:
461 __pblk_end_io_read(pblk, rqd, false);
462 return ret;
456} 463}
457 464
458static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, 465static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
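
[Editor's note] The pblk-read.c changes record a start_time in the per-read context so that generic_start_io_acct()/generic_end_io_acct() bracket every user read, and route early failure paths through __pblk_end_io_read() so the accounting and line references are always released. The snippet below is only a userspace analogy for that submit/complete pairing, not the kernel API: the context layout, helper names, and the timestamp source are stand-ins.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for the per-read context carrying the start time,
 * mirroring the start_time field added to struct pblk_g_ctx. */
struct read_ctx {
	struct timespec start_time;
	unsigned long lba;
};

static void account_start(struct read_ctx *ctx)
{
	clock_gettime(CLOCK_MONOTONIC, &ctx->start_time);	/* like r_ctx->start_time = jiffies */
}

static void account_end(const struct read_ctx *ctx)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	printf("lba %lu serviced in %ld us\n", ctx->lba,
	       (long)((now.tv_sec - ctx->start_time.tv_sec) * 1000000L +
		      (now.tv_nsec - ctx->start_time.tv_nsec) / 1000L));
}

int main(void)
{
	struct read_ctx ctx = { .lba = 42 };

	account_start(&ctx);	/* submission path */
	account_end(&ctx);	/* completion path, also taken on error */
	return 0;
}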
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index eadb3eb5d4dc..1d5e961bf5e0 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
111 return 0; 111 return 0;
112} 112}
113 113
114__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) 114int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
115{ 115{
116 u32 crc; 116 u32 crc;
117 117
118 crc = pblk_calc_emeta_crc(pblk, emeta_buf); 118 crc = pblk_calc_emeta_crc(pblk, emeta_buf);
119 if (le32_to_cpu(emeta_buf->crc) != crc) 119 if (le32_to_cpu(emeta_buf->crc) != crc)
120 return NULL; 120 return 1;
121 121
122 if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) 122 if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
123 return NULL; 123 return 1;
124 124
125 return emeta_to_lbas(pblk, emeta_buf); 125 return 0;
126} 126}
127 127
128static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) 128static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
@@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
137 u64 nr_valid_lbas, nr_lbas = 0; 137 u64 nr_valid_lbas, nr_lbas = 0;
138 u64 i; 138 u64 i;
139 139
140 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); 140 lba_list = emeta_to_lbas(pblk, emeta_buf);
141 if (!lba_list) 141 if (!lba_list)
142 return 1; 142 return 1;
143 143
@@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
149 struct ppa_addr ppa; 149 struct ppa_addr ppa;
150 int pos; 150 int pos;
151 151
152 ppa = addr_to_pblk_ppa(pblk, i, line->id); 152 ppa = addr_to_gen_ppa(pblk, i, line->id);
153 pos = pblk_ppa_to_pos(geo, ppa); 153 pos = pblk_ppa_to_pos(geo, ppa);
154 154
155 /* Do not update bad blocks */ 155 /* Do not update bad blocks */
@@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
188 int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); 188 int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
189 189
190 return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - 190 return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
191 nr_bb * geo->sec_per_blk; 191 nr_bb * geo->sec_per_chk;
192} 192}
193 193
194struct pblk_recov_alloc { 194struct pblk_recov_alloc {
@@ -263,12 +263,12 @@ next_read_rq:
263 int pos; 263 int pos;
264 264
265 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); 265 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
266 pos = pblk_dev_ppa_to_pos(geo, ppa); 266 pos = pblk_ppa_to_pos(geo, ppa);
267 267
268 while (test_bit(pos, line->blk_bitmap)) { 268 while (test_bit(pos, line->blk_bitmap)) {
269 r_ptr_int += pblk->min_write_pgs; 269 r_ptr_int += pblk->min_write_pgs;
270 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); 270 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
271 pos = pblk_dev_ppa_to_pos(geo, ppa); 271 pos = pblk_ppa_to_pos(geo, ppa);
272 } 272 }
273 273
274 for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) 274 for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
@@ -288,7 +288,7 @@ next_read_rq:
288 /* At this point, the read should not fail. If it does, it is a problem 288 /* At this point, the read should not fail. If it does, it is a problem
289 * we cannot recover from here. Need FTL log. 289 * we cannot recover from here. Need FTL log.
290 */ 290 */
291 if (rqd->error) { 291 if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
292 pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); 292 pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
293 return -EINTR; 293 return -EINTR;
294 } 294 }
@@ -411,12 +411,12 @@ next_pad_rq:
411 int pos; 411 int pos;
412 412
413 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 413 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
414 ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); 414 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
415 pos = pblk_ppa_to_pos(geo, ppa); 415 pos = pblk_ppa_to_pos(geo, ppa);
416 416
417 while (test_bit(pos, line->blk_bitmap)) { 417 while (test_bit(pos, line->blk_bitmap)) {
418 w_ptr += pblk->min_write_pgs; 418 w_ptr += pblk->min_write_pgs;
419 ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); 419 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
420 pos = pblk_ppa_to_pos(geo, ppa); 420 pos = pblk_ppa_to_pos(geo, ppa);
421 } 421 }
422 422
@@ -541,12 +541,12 @@ next_rq:
541 541
542 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 542 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
543 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); 543 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
544 pos = pblk_dev_ppa_to_pos(geo, ppa); 544 pos = pblk_ppa_to_pos(geo, ppa);
545 545
546 while (test_bit(pos, line->blk_bitmap)) { 546 while (test_bit(pos, line->blk_bitmap)) {
547 w_ptr += pblk->min_write_pgs; 547 w_ptr += pblk->min_write_pgs;
548 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); 548 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
549 pos = pblk_dev_ppa_to_pos(geo, ppa); 549 pos = pblk_ppa_to_pos(geo, ppa);
550 } 550 }
551 551
552 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) 552 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
@@ -672,12 +672,12 @@ next_rq:
672 672
673 paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 673 paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
674 ppa = addr_to_gen_ppa(pblk, paddr, line->id); 674 ppa = addr_to_gen_ppa(pblk, paddr, line->id);
675 pos = pblk_dev_ppa_to_pos(geo, ppa); 675 pos = pblk_ppa_to_pos(geo, ppa);
676 676
677 while (test_bit(pos, line->blk_bitmap)) { 677 while (test_bit(pos, line->blk_bitmap)) {
678 paddr += pblk->min_write_pgs; 678 paddr += pblk->min_write_pgs;
679 ppa = addr_to_gen_ppa(pblk, paddr, line->id); 679 ppa = addr_to_gen_ppa(pblk, paddr, line->id);
680 pos = pblk_dev_ppa_to_pos(geo, ppa); 680 pos = pblk_ppa_to_pos(geo, ppa);
681 } 681 }
682 682
683 for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) 683 for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
@@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
817 817
818 while (emeta_secs) { 818 while (emeta_secs) {
819 emeta_start--; 819 emeta_start--;
820 ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); 820 ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
821 pos = pblk_ppa_to_pos(geo, ppa); 821 pos = pblk_ppa_to_pos(geo, ppa);
822 if (!test_bit(pos, line->blk_bitmap)) 822 if (!test_bit(pos, line->blk_bitmap))
823 emeta_secs--; 823 emeta_secs--;
@@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
938 goto next; 938 goto next;
939 } 939 }
940 940
941 if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
942 pblk_recov_l2p_from_oob(pblk, line);
943 goto next;
944 }
945
941 if (pblk_recov_l2p_from_emeta(pblk, line)) 946 if (pblk_recov_l2p_from_emeta(pblk, line))
942 pblk_recov_l2p_from_oob(pblk, line); 947 pblk_recov_l2p_from_oob(pblk, line);
943 948
@@ -984,10 +989,8 @@ next:
984 } 989 }
985 spin_unlock(&l_mg->free_lock); 990 spin_unlock(&l_mg->free_lock);
986 991
987 if (is_next) { 992 if (is_next)
988 pblk_line_erase(pblk, l_mg->data_next); 993 pblk_line_erase(pblk, l_mg->data_next);
989 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
990 }
991 994
992out: 995out:
993 if (found_lines != recovered_lines) 996 if (found_lines != recovered_lines)
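
[Editor's note] In pblk-recovery.c the old pblk_recov_get_lba_list() (returning a pointer or NULL) becomes pblk_recov_check_emeta(), a plain validity check that returns 0 on success, and the L2P scan now verifies the emeta CRC and magic before choosing between emeta-based and OOB-based recovery. A compact sketch of that decision flow follows; every helper, the toy CRC, and the magic value are stubs invented for illustration.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins only; the real code hashes the emeta buffer and
 * compares against the stored CRC and PBLK_MAGIC. */
struct line_emeta { uint32_t crc; uint32_t magic; };

#define PBLK_MAGIC 0x70626c6b	/* assumed value for the sketch */

static uint32_t calc_emeta_crc(const struct line_emeta *emeta)
{
	return emeta->magic ^ 0xa5a5a5a5;	/* toy CRC for demonstration */
}

/* Same contract as pblk_recov_check_emeta(): 0 = usable, non-zero = corrupt. */
static int recov_check_emeta(const struct line_emeta *emeta)
{
	if (emeta->crc != calc_emeta_crc(emeta))
		return 1;
	if (emeta->magic != PBLK_MAGIC)
		return 1;
	return 0;
}

static void recov_line(const struct line_emeta *emeta)
{
	if (recov_check_emeta(emeta)) {
		puts("emeta corrupt: recovering L2P from OOB area");
		return;
	}
	puts("emeta valid: recovering L2P from emeta LBA list");
}

int main(void)
{
	struct line_emeta good = { .magic = PBLK_MAGIC };
	struct line_emeta bad = { .magic = 0 };

	good.crc = calc_emeta_crc(&good);
	recov_line(&good);
	recov_line(&bad);
	return 0;
}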
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index dacc71922260..0d457b162f23 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
89 return atomic_read(&rl->free_blocks); 89 return atomic_read(&rl->free_blocks);
90} 90}
91 91
92/* 92unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
93 * We check for (i) the number of free blocks in the current LUN and (ii) the 93{
94 * total number of free blocks in the pblk instance. This is to even out the 94 return atomic_read(&rl->free_user_blocks);
95 * number of free blocks on each LUN when GC kicks in. 95}
96 * 96
97 * Only the total number of free blocks is used to configure the rate limiter. 97static void __pblk_rl_update_rates(struct pblk_rl *rl,
98 */ 98 unsigned long free_blocks)
99void pblk_rl_update_rates(struct pblk_rl *rl)
100{ 99{
101 struct pblk *pblk = container_of(rl, struct pblk, rl); 100 struct pblk *pblk = container_of(rl, struct pblk, rl);
102 unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
103 int max = rl->rb_budget; 101 int max = rl->rb_budget;
104 102
105 if (free_blocks >= rl->high) { 103 if (free_blocks >= rl->high) {
@@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl)
132 pblk_gc_should_stop(pblk); 130 pblk_gc_should_stop(pblk);
133} 131}
134 132
133void pblk_rl_update_rates(struct pblk_rl *rl)
134{
135 __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
136}
137
135void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) 138void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
136{ 139{
137 int blk_in_line = atomic_read(&line->blk_in_line); 140 int blk_in_line = atomic_read(&line->blk_in_line);
141 int free_blocks;
138 142
139 atomic_add(blk_in_line, &rl->free_blocks); 143 atomic_add(blk_in_line, &rl->free_blocks);
140 pblk_rl_update_rates(rl); 144 free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
145
146 __pblk_rl_update_rates(rl, free_blocks);
141} 147}
142 148
143void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) 149void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
150 bool used)
144{ 151{
145 int blk_in_line = atomic_read(&line->blk_in_line); 152 int blk_in_line = atomic_read(&line->blk_in_line);
153 int free_blocks;
146 154
147 atomic_sub(blk_in_line, &rl->free_blocks); 155 atomic_sub(blk_in_line, &rl->free_blocks);
148 pblk_rl_update_rates(rl); 156
157 if (used)
158 free_blocks = atomic_sub_return(blk_in_line,
159 &rl->free_user_blocks);
160 else
161 free_blocks = atomic_read(&rl->free_user_blocks);
162
163 __pblk_rl_update_rates(rl, free_blocks);
149} 164}
150 165
151int pblk_rl_high_thrs(struct pblk_rl *rl) 166int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl)
174void pblk_rl_init(struct pblk_rl *rl, int budget) 189void pblk_rl_init(struct pblk_rl *rl, int budget)
175{ 190{
176 struct pblk *pblk = container_of(rl, struct pblk, rl); 191 struct pblk *pblk = container_of(rl, struct pblk, rl);
192 struct nvm_tgt_dev *dev = pblk->dev;
193 struct nvm_geo *geo = &dev->geo;
194 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
177 struct pblk_line_meta *lm = &pblk->lm; 195 struct pblk_line_meta *lm = &pblk->lm;
178 int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; 196 int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
197 int sec_meta, blk_meta;
198
179 unsigned int rb_windows; 199 unsigned int rb_windows;
180 200
181 rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; 201 /* Consider sectors used for metadata */
182 rl->high_pw = get_count_order(rl->high); 202 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
203 blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
183 204
184 rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; 205 rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
185 if (rl->low < min_blocks) 206 rl->high_pw = get_count_order(rl->high);
186 rl->low = min_blocks;
187 207
188 rl->rsv_blocks = min_blocks; 208 rl->rsv_blocks = min_blocks;
189 209
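
[Editor's note] The pblk-rl.c hunks split the free-block accounting into a total count and a user-visible count (free_user_blocks, excluding over-provisioning), and derive the high watermark from the over-provisioned block budget minus metadata blocks and one line, instead of a fixed fraction of all blocks. The watermark arithmetic on its own is sketched below with invented geometry numbers; count_order() mimics what get_count_order() is used for here.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Smallest order with 2^order >= n. */
static int count_order(int n)
{
	int order = 0;

	while ((1 << order) < n)
		order++;
	return order;
}

/* Sketch of the new pblk_rl_init() watermark calculation (all values invented). */
int main(void)
{
	int op_blks = 113;		/* blocks held back by over-provisioning */
	int smeta_sec = 8, emeta_sec0 = 64;
	int nr_free_lines = 60, sec_per_chk = 4096;
	int blk_per_line = 16;

	int sec_meta = (smeta_sec + emeta_sec0) * nr_free_lines;
	int blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);

	/* Below this many user-free blocks the rate limiter starts throttling
	 * user I/O in favour of GC. */
	int high = op_blks - blk_meta - blk_per_line;

	printf("high=%d blocks, high_pw=%d\n", high, count_order(high));
	return 0;
}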
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index cd49e8875d4e..620bab853579 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
28 ssize_t sz = 0; 28 ssize_t sz = 0;
29 int i; 29 int i;
30 30
31 for (i = 0; i < geo->nr_luns; i++) { 31 for (i = 0; i < geo->all_luns; i++) {
32 int active = 1; 32 int active = 1;
33 33
34 rlun = &pblk->luns[i]; 34 rlun = &pblk->luns[i];
@@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
49 49
50static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) 50static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
51{ 51{
52 int free_blocks, total_blocks; 52 int free_blocks, free_user_blocks, total_blocks;
53 int rb_user_max, rb_user_cnt; 53 int rb_user_max, rb_user_cnt;
54 int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; 54 int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
55 55
56 free_blocks = atomic_read(&pblk->rl.free_blocks); 56 free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
57 free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
57 rb_user_max = pblk->rl.rb_user_max; 58 rb_user_max = pblk->rl.rb_user_max;
58 rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); 59 rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
59 rb_gc_max = pblk->rl.rb_gc_max; 60 rb_gc_max = pblk->rl.rb_gc_max;
@@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
64 total_blocks = pblk->rl.total_blocks; 65 total_blocks = pblk->rl.total_blocks;
65 66
66 return snprintf(page, PAGE_SIZE, 67 return snprintf(page, PAGE_SIZE,
67 "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", 68 "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
68 rb_user_cnt, 69 rb_user_cnt,
69 rb_user_max, 70 rb_user_max,
70 rb_gc_cnt, 71 rb_gc_cnt,
71 rb_gc_max, 72 rb_gc_max,
72 rb_state, 73 rb_state,
73 rb_budget, 74 rb_budget,
74 pblk->rl.low,
75 pblk->rl.high, 75 pblk->rl.high,
76 free_blocks, 76 free_blocks,
77 free_user_blocks,
77 total_blocks, 78 total_blocks,
78 READ_ONCE(pblk->rl.rb_user_active)); 79 READ_ONCE(pblk->rl.rb_user_active));
79} 80}
@@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
238 239
239 sz = snprintf(page, PAGE_SIZE - sz, 240 sz = snprintf(page, PAGE_SIZE - sz,
240 "line: nluns:%d, nblks:%d, nsecs:%d\n", 241 "line: nluns:%d, nblks:%d, nsecs:%d\n",
241 geo->nr_luns, lm->blk_per_line, lm->sec_per_line); 242 geo->all_luns, lm->blk_per_line, lm->sec_per_line);
242 243
243 sz += snprintf(page + sz, PAGE_SIZE - sz, 244 sz += snprintf(page + sz, PAGE_SIZE - sz,
244 "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", 245 "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
@@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
287 "blk_line:%d, sec_line:%d, sec_blk:%d\n", 288 "blk_line:%d, sec_line:%d, sec_blk:%d\n",
288 lm->blk_per_line, 289 lm->blk_per_line,
289 lm->sec_per_line, 290 lm->sec_per_line,
290 geo->sec_per_blk); 291 geo->sec_per_chk);
291 292
292 return sz; 293 return sz;
293} 294}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 6c1cafafef53..aae86ed60b98 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
21 struct pblk_c_ctx *c_ctx) 21 struct pblk_c_ctx *c_ctx)
22{ 22{
23 struct bio *original_bio; 23 struct bio *original_bio;
24 struct pblk_rb *rwb = &pblk->rwb;
24 unsigned long ret; 25 unsigned long ret;
25 int i; 26 int i;
26 27
27 for (i = 0; i < c_ctx->nr_valid; i++) { 28 for (i = 0; i < c_ctx->nr_valid; i++) {
28 struct pblk_w_ctx *w_ctx; 29 struct pblk_w_ctx *w_ctx;
30 int pos = c_ctx->sentry + i;
31 int flags;
32
33 w_ctx = pblk_rb_w_ctx(rwb, pos);
34 flags = READ_ONCE(w_ctx->flags);
35
36 if (flags & PBLK_FLUSH_ENTRY) {
37 flags &= ~PBLK_FLUSH_ENTRY;
38 /* Release flags on context. Protect from writes */
39 smp_store_release(&w_ctx->flags, flags);
40
41#ifdef CONFIG_NVM_DEBUG
42 atomic_dec(&rwb->inflight_flush_point);
43#endif
44 }
29 45
30 w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
31 while ((original_bio = bio_list_pop(&w_ctx->bios))) 46 while ((original_bio = bio_list_pop(&w_ctx->bios)))
32 bio_endio(original_bio); 47 bio_endio(original_bio);
33 } 48 }
@@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
439 struct pblk_line *meta_line; 454 struct pblk_line *meta_line;
440 int err; 455 int err;
441 456
442 ppa_set_empty(&erase_ppa); 457 pblk_ppa_set_empty(&erase_ppa);
443 458
444 /* Assign lbas to ppas and populate request structure */ 459 /* Assign lbas to ppas and populate request structure */
445 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); 460 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
@@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
457 return NVM_IO_ERR; 472 return NVM_IO_ERR;
458 } 473 }
459 474
460 if (!ppa_empty(erase_ppa)) { 475 if (!pblk_ppa_empty(erase_ppa)) {
461 /* Submit erase for next data line */ 476 /* Submit erase for next data line */
462 if (pblk_blk_erase_async(pblk, erase_ppa)) { 477 if (pblk_blk_erase_async(pblk, erase_ppa)) {
463 struct pblk_line *e_line = pblk_line_get_erase(pblk); 478 struct pblk_line *e_line = pblk_line_get_erase(pblk);
@@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk)
508 if (!secs_avail) 523 if (!secs_avail)
509 return 1; 524 return 1;
510 525
511 secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb); 526 secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
512 if (!secs_to_flush && secs_avail < pblk->min_write_pgs) 527 if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
513 return 1; 528 return 1;
514 529
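
[Editor's note] pblk-write.c now clears PBLK_FLUSH_ENTRY on the write-buffer context at write completion (where the flush-point debug counter is also decremented) and publishes the updated flags with smp_store_release(). Outside the kernel, the same publish-with-release pattern can be mimicked with C11 atomics; the sketch below is only an analogy for the memory-ordering intent, and the flag values and struct layout are illustrative, not the pblk definitions.

#include <stdatomic.h>
#include <stdio.h>

#define PBLK_FLUSH_ENTRY	(1 << 0)	/* illustrative flag values */
#define PBLK_SUBMITTED_ENTRY	(1 << 1)

/* Per-entry write context; flags are read and written across threads. */
struct w_ctx {
	_Atomic int flags;
};

/* Completion path: drop the flush marker and publish with release semantics,
 * analogous to the kernel's READ_ONCE()/smp_store_release() pairing. */
static void complete_entry(struct w_ctx *w_ctx)
{
	int flags = atomic_load_explicit(&w_ctx->flags, memory_order_relaxed);

	if (flags & PBLK_FLUSH_ENTRY)
		flags &= ~PBLK_FLUSH_ENTRY;	/* flush point satisfied */

	flags |= PBLK_SUBMITTED_ENTRY;
	atomic_store_explicit(&w_ctx->flags, flags, memory_order_release);
}

int main(void)
{
	struct w_ctx ctx = { .flags = PBLK_FLUSH_ENTRY };

	complete_entry(&ctx);
	printf("flags=0x%x\n", atomic_load(&ctx.flags));	/* prints 0x2 */
	return 0;
}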
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 59a64d461a5d..8c357fb6538e 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -51,17 +51,16 @@
51 51
52#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) 52#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
53 53
54#define pblk_for_each_lun(pblk, rlun, i) \
55 for ((i) = 0, rlun = &(pblk)->luns[0]; \
56 (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
57
58/* Static pool sizes */ 54/* Static pool sizes */
59#define PBLK_GEN_WS_POOL_SIZE (2) 55#define PBLK_GEN_WS_POOL_SIZE (2)
60 56
57#define PBLK_DEFAULT_OP (11)
58
61enum { 59enum {
62 PBLK_READ = READ, 60 PBLK_READ = READ,
63 PBLK_WRITE = WRITE,/* Write from write buffer */ 61 PBLK_WRITE = WRITE,/* Write from write buffer */
64 PBLK_WRITE_INT, /* Internal write - no write buffer */ 62 PBLK_WRITE_INT, /* Internal write - no write buffer */
63 PBLK_READ_RECOV, /* Recovery read - errors allowed */
65 PBLK_ERASE, 64 PBLK_ERASE,
66}; 65};
67 66
@@ -114,6 +113,7 @@ struct pblk_c_ctx {
114/* read context */ 113/* read context */
115struct pblk_g_ctx { 114struct pblk_g_ctx {
116 void *private; 115 void *private;
116 unsigned long start_time;
117 u64 lba; 117 u64 lba;
118}; 118};
119 119
@@ -170,7 +170,7 @@ struct pblk_rb {
170 * the last submitted entry that has 170 * the last submitted entry that has
171 * been successfully persisted to media 171 * been successfully persisted to media
172 */ 172 */
173 unsigned int sync_point; /* Sync point - last entry that must be 173 unsigned int flush_point; /* Sync point - last entry that must be
174 * flushed to the media. Used with 174 * flushed to the media. Used with
175 * REQ_FLUSH and REQ_FUA 175 * REQ_FLUSH and REQ_FUA
176 */ 176 */
@@ -193,7 +193,7 @@ struct pblk_rb {
193 spinlock_t s_lock; /* Sync lock */ 193 spinlock_t s_lock; /* Sync lock */
194 194
195#ifdef CONFIG_NVM_DEBUG 195#ifdef CONFIG_NVM_DEBUG
196 atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */ 196 atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
197#endif 197#endif
198}; 198};
199 199
@@ -256,9 +256,6 @@ struct pblk_rl {
256 unsigned int high; /* Upper threshold for rate limiter (free run - 256 unsigned int high; /* Upper threshold for rate limiter (free run -
257 * user I/O rate limiter 257 * user I/O rate limiter
258 */ 258 */
259 unsigned int low; /* Lower threshold for rate limiter (user I/O
260 * rate limiter - stall)
261 */
262 unsigned int high_pw; /* High rounded up as a power of 2 */ 259 unsigned int high_pw; /* High rounded up as a power of 2 */
263 260
264#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ 261#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
@@ -292,7 +289,9 @@ struct pblk_rl {
292 289
293 unsigned long long nr_secs; 290 unsigned long long nr_secs;
294 unsigned long total_blocks; 291 unsigned long total_blocks;
295 atomic_t free_blocks; 292
293 atomic_t free_blocks; /* Total number of free blocks (+ OP) */
294 atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
296}; 295};
297 296
298#define PBLK_LINE_EMPTY (~0U) 297#define PBLK_LINE_EMPTY (~0U)
@@ -583,7 +582,9 @@ struct pblk {
583 */ 582 */
584 583
585 sector_t capacity; /* Device capacity when bad blocks are subtracted */ 584 sector_t capacity; /* Device capacity when bad blocks are subtracted */
586 int over_pct; /* Percentage of device used for over-provisioning */ 585
586 int op; /* Percentage of device used for over-provisioning */
587 int op_blks; /* Number of blocks used for over-provisioning */
587 588
588 /* pblk provisioning values. Used by rate limiter */ 589 /* pblk provisioning values. Used by rate limiter */
589 struct pblk_rl rl; 590 struct pblk_rl rl;
@@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
691struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, 692struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
692 struct ppa_addr *ppa); 693 struct ppa_addr *ppa);
693void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); 694void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
694unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); 695unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
695 696
696unsigned int pblk_rb_read_count(struct pblk_rb *rb); 697unsigned int pblk_rb_read_count(struct pblk_rb *rb);
697unsigned int pblk_rb_sync_count(struct pblk_rb *rb); 698unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
@@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
812void pblk_submit_rec(struct work_struct *work); 813void pblk_submit_rec(struct work_struct *work);
813struct pblk_line *pblk_recov_l2p(struct pblk *pblk); 814struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
814int pblk_recov_pad(struct pblk *pblk); 815int pblk_recov_pad(struct pblk *pblk);
815__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); 816int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
816int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, 817int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
817 struct pblk_rec_ctx *recovery, u64 *comp_bits, 818 struct pblk_rec_ctx *recovery, u64 *comp_bits,
818 unsigned int comp); 819 unsigned int comp);
@@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl);
843void pblk_rl_update_rates(struct pblk_rl *rl); 844void pblk_rl_update_rates(struct pblk_rl *rl);
844int pblk_rl_high_thrs(struct pblk_rl *rl); 845int pblk_rl_high_thrs(struct pblk_rl *rl);
845unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); 846unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
847unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
846int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); 848int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
847void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); 849void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
848void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); 850void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
@@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
851void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); 853void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
852int pblk_rl_max_io(struct pblk_rl *rl); 854int pblk_rl_max_io(struct pblk_rl *rl);
853void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); 855void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
854void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); 856void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
857 bool used);
855int pblk_rl_is_limit(struct pblk_rl *rl); 858int pblk_rl_is_limit(struct pblk_rl *rl);
856 859
857/* 860/*
@@ -907,28 +910,47 @@ static inline int pblk_pad_distance(struct pblk *pblk)
907 struct nvm_tgt_dev *dev = pblk->dev; 910 struct nvm_tgt_dev *dev = pblk->dev;
908 struct nvm_geo *geo = &dev->geo; 911 struct nvm_geo *geo = &dev->geo;
909 912
-	return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+	return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
 }
 
-static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_line(struct ppa_addr p)
 {
 	return p.g.blk;
 }
 
-static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
 {
-	return p.g.blk;
+	return p.g.lun * geo->nr_chnls + p.g.ch;
 }
 
-static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+					      u64 line_id)
 {
-	return p.g.lun * geo->nr_chnls + p.g.ch;
+	struct ppa_addr ppa;
+
+	ppa.ppa = 0;
+	ppa.g.blk = line_id;
+	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+	return ppa;
 }
 
-/* A block within a line corresponds to the lun */
-static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+					    struct ppa_addr p)
 {
-	return p.g.lun * geo->nr_chnls + p.g.ch;
+	u64 paddr;
+
+	paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
+	paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+	paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+	paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+	paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+	return paddr;
 }
 
 static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
@@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
 	return ppa64;
 }
 
-static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
-						 sector_t lba)
-{
-	struct ppa_addr ppa;
-
-	if (pblk->ppaf_bitsize < 32) {
-		u32 *map = (u32 *)pblk->trans_map;
-
-		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
-	} else {
-		struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
-
-		ppa = map[lba];
-	}
-
-	return ppa;
-}
-
 static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
 {
 	u32 ppa32 = 0;
@@ -999,33 +1003,36 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
 	return ppa32;
 }
 
-static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
-				      struct ppa_addr ppa)
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+						 sector_t lba)
 {
+	struct ppa_addr ppa;
+
 	if (pblk->ppaf_bitsize < 32) {
 		u32 *map = (u32 *)pblk->trans_map;
 
-		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
 	} else {
-		u64 *map = (u64 *)pblk->trans_map;
+		struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
 
-		map[lba] = ppa.ppa;
+		ppa = map[lba];
 	}
+
+	return ppa;
 }
 
-static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
-					    struct ppa_addr p)
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+				      struct ppa_addr ppa)
 {
-	u64 paddr;
+	if (pblk->ppaf_bitsize < 32) {
+		u32 *map = (u32 *)pblk->trans_map;
 
-	paddr = 0;
-	paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
-	paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
-	paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
-	paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
-	paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+	} else {
+		u64 *map = (u64 *)pblk->trans_map;
 
-	return paddr;
+		map[lba] = ppa.ppa;
+	}
 }
 
 static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
@@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
 
 static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
 {
-	if (lppa.ppa == rppa.ppa)
-		return true;
-
-	return false;
+	return (lppa.ppa == rppa.ppa);
 }
 
 static inline int pblk_addr_in_cache(struct ppa_addr ppa)
@@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
 	return p;
 }
 
-static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
-					      u64 line_id)
-{
-	struct ppa_addr ppa;
-
-	ppa.ppa = 0;
-	ppa.g.blk = line_id;
-	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
-	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
-	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
-	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
-	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
-
-	return ppa;
-}
-
-static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
-					       u64 line_id)
-{
-	struct ppa_addr ppa;
-
-	ppa = addr_to_gen_ppa(pblk, paddr, line_id);
-
-	return ppa;
-}
-
 static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
 					    struct line_header *header)
 {
@@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
 
 		if (!ppa->c.is_cached &&
 			ppa->g.ch < geo->nr_chnls &&
-			ppa->g.lun < geo->luns_per_chnl &&
+			ppa->g.lun < geo->nr_luns &&
 			ppa->g.pl < geo->nr_planes &&
-			ppa->g.blk < geo->blks_per_lun &&
-			ppa->g.pg < geo->pgs_per_blk &&
+			ppa->g.blk < geo->nr_chks &&
+			ppa->g.pg < geo->ws_per_chk &&
 			ppa->g.sec < geo->sec_per_pg)
 			continue;
 
@@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
 
 	for (i = 0; i < rqd->nr_ppas; i++) {
 		ppa = ppa_list[i];
-		line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+		line = &pblk->lines[pblk_ppa_to_line(ppa)];
 
 		spin_lock(&line->lock);
 		if (line->state != PBLK_LINESTATE_OPEN) {
@@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
 	return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
 }
 
-static inline sector_t pblk_get_sector(sector_t lba)
-{
-	return lba * NR_PHY_IN_LOG;
-}
-
 static inline void pblk_setup_uuid(struct pblk *pblk)
 {
 	uuid_le uuid;
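
The addr_to_gen_ppa()/pblk_dev_ppa_to_line_addr() pair moved above is a plain mask-and-shift packing of a line-local physical address. Below is a minimal stand-alone sketch of that round trip; the field widths and offsets are made up for illustration (pblk derives the real masks and offsets from the device geometry at init time), so none of the constants come from the driver.

/* Hypothetical field layout: pg | lun | ch | pl | sec (low bits). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SEC_BITS 2
#define PL_BITS  1
#define CH_BITS  3
#define LUN_BITS 3
#define PG_BITS  8

#define SEC_OFF 0
#define PL_OFF  (SEC_OFF + SEC_BITS)
#define CH_OFF  (PL_OFF + PL_BITS)
#define LUN_OFF (CH_OFF + CH_BITS)
#define PG_OFF  (LUN_OFF + LUN_BITS)

struct gen_ppa { unsigned sec, pl, ch, lun, pg; };

/* pack: mirrors the shape of pblk_dev_ppa_to_line_addr() */
static uint64_t pack(struct gen_ppa p)
{
	return ((uint64_t)p.pg  << PG_OFF)  |
	       ((uint64_t)p.lun << LUN_OFF) |
	       ((uint64_t)p.ch  << CH_OFF)  |
	       ((uint64_t)p.pl  << PL_OFF)  |
	       ((uint64_t)p.sec << SEC_OFF);
}

/* unpack: mirrors the shape of addr_to_gen_ppa() */
static struct gen_ppa unpack(uint64_t paddr)
{
	struct gen_ppa p;

	p.sec = (paddr >> SEC_OFF) & ((1u << SEC_BITS) - 1);
	p.pl  = (paddr >> PL_OFF)  & ((1u << PL_BITS)  - 1);
	p.ch  = (paddr >> CH_OFF)  & ((1u << CH_BITS)  - 1);
	p.lun = (paddr >> LUN_OFF) & ((1u << LUN_BITS) - 1);
	p.pg  = (paddr >> PG_OFF)  & ((1u << PG_BITS)  - 1);
	return p;
}

int main(void)
{
	struct gen_ppa in = { .sec = 3, .pl = 1, .ch = 5, .lun = 2, .pg = 200 };
	struct gen_ppa out = unpack(pack(in));

	assert(in.sec == out.sec && in.pl == out.pl && in.ch == out.ch &&
	       in.lun == out.lun && in.pg == out.pg);
	printf("paddr=0x%llx round-trips\n", (unsigned long long)pack(in));
	return 0;
}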
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
deleted file mode 100644
index 0993c14be860..000000000000
--- a/drivers/lightnvm/rrpc.c
+++ /dev/null
@@ -1,1625 +0,0 @@
1/*
2 * Copyright (C) 2015 IT University of Copenhagen
3 * Initial release: Matias Bjorling <m@bjorling.me>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
15 */
16
17#include "rrpc.h"
18
19static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache;
20static DECLARE_RWSEM(rrpc_lock);
21
22static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
23 struct nvm_rq *rqd, unsigned long flags);
24
25#define rrpc_for_each_lun(rrpc, rlun, i) \
26 for ((i) = 0, rlun = &(rrpc)->luns[0]; \
27 (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)])
28
29static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
30{
31 struct nvm_tgt_dev *dev = rrpc->dev;
32 struct rrpc_block *rblk = a->rblk;
33 unsigned int pg_offset;
34
35 lockdep_assert_held(&rrpc->rev_lock);
36
37 if (a->addr == ADDR_EMPTY || !rblk)
38 return;
39
40 spin_lock(&rblk->lock);
41
42 div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset);
43 WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages));
44 rblk->nr_invalid_pages++;
45
46 spin_unlock(&rblk->lock);
47
48 rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY;
49}
50
51static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
52 unsigned int len)
53{
54 sector_t i;
55
56 spin_lock(&rrpc->rev_lock);
57 for (i = slba; i < slba + len; i++) {
58 struct rrpc_addr *gp = &rrpc->trans_map[i];
59
60 rrpc_page_invalidate(rrpc, gp);
61 gp->rblk = NULL;
62 }
63 spin_unlock(&rrpc->rev_lock);
64}
65
66static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
67 sector_t laddr, unsigned int pages)
68{
69 struct nvm_rq *rqd;
70 struct rrpc_inflight_rq *inf;
71
72 rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC);
73 if (!rqd)
74 return ERR_PTR(-ENOMEM);
75
76 inf = rrpc_get_inflight_rq(rqd);
77 if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) {
78 mempool_free(rqd, rrpc->rq_pool);
79 return NULL;
80 }
81
82 return rqd;
83}
84
85static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq *rqd)
86{
87 struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd);
88
89 rrpc_unlock_laddr(rrpc, inf);
90
91 mempool_free(rqd, rrpc->rq_pool);
92}
93
94static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
95{
96 sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
97 sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
98 struct nvm_rq *rqd;
99
100 while (1) {
101 rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
102 if (rqd)
103 break;
104
105 schedule();
106 }
107
108 if (IS_ERR(rqd)) {
109 pr_err("rrpc: unable to acquire inflight IO\n");
110 bio_io_error(bio);
111 return;
112 }
113
114 rrpc_invalidate_range(rrpc, slba, len);
115 rrpc_inflight_laddr_release(rrpc, rqd);
116}
117
118static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
119{
120 struct nvm_tgt_dev *dev = rrpc->dev;
121
122 return (rblk->next_page == dev->geo.sec_per_blk);
123}
124
125/* Calculate relative addr for the given block, considering instantiated LUNs */
126static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
127{
128 struct nvm_tgt_dev *dev = rrpc->dev;
129 struct rrpc_lun *rlun = rblk->rlun;
130
131 return rlun->id * dev->geo.sec_per_blk;
132}
133
134static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev,
135 struct rrpc_addr *gp)
136{
137 struct rrpc_block *rblk = gp->rblk;
138 struct rrpc_lun *rlun = rblk->rlun;
139 u64 addr = gp->addr;
140 struct ppa_addr paddr;
141
142 paddr.ppa = addr;
143 paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr);
144 paddr.g.ch = rlun->bppa.g.ch;
145 paddr.g.lun = rlun->bppa.g.lun;
146 paddr.g.blk = rblk->id;
147
148 return paddr;
149}
150
151/* requires lun->lock taken */
152static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
153 struct rrpc_block **cur_rblk)
154{
155 struct rrpc *rrpc = rlun->rrpc;
156
157 if (*cur_rblk) {
158 spin_lock(&(*cur_rblk)->lock);
159 WARN_ON(!block_is_full(rrpc, *cur_rblk));
160 spin_unlock(&(*cur_rblk)->lock);
161 }
162 *cur_rblk = new_rblk;
163}
164
165static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc,
166 struct rrpc_lun *rlun)
167{
168 struct rrpc_block *rblk = NULL;
169
170 if (list_empty(&rlun->free_list))
171 goto out;
172
173 rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list);
174
175 list_move_tail(&rblk->list, &rlun->used_list);
176 rblk->state = NVM_BLK_ST_TGT;
177 rlun->nr_free_blocks--;
178
179out:
180 return rblk;
181}
182
183static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
184 unsigned long flags)
185{
186 struct nvm_tgt_dev *dev = rrpc->dev;
187 struct rrpc_block *rblk;
188 int is_gc = flags & NVM_IOTYPE_GC;
189
190 spin_lock(&rlun->lock);
191 if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) {
192 pr_err("nvm: rrpc: cannot give block to non GC request\n");
193 spin_unlock(&rlun->lock);
194 return NULL;
195 }
196
197 rblk = __rrpc_get_blk(rrpc, rlun);
198 if (!rblk) {
199 pr_err("nvm: rrpc: cannot get new block\n");
200 spin_unlock(&rlun->lock);
201 return NULL;
202 }
203 spin_unlock(&rlun->lock);
204
205 bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk);
206 rblk->next_page = 0;
207 rblk->nr_invalid_pages = 0;
208 atomic_set(&rblk->data_cmnt_size, 0);
209
210 return rblk;
211}
212
213static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
214{
215 struct rrpc_lun *rlun = rblk->rlun;
216
217 spin_lock(&rlun->lock);
218 if (rblk->state & NVM_BLK_ST_TGT) {
219 list_move_tail(&rblk->list, &rlun->free_list);
220 rlun->nr_free_blocks++;
221 rblk->state = NVM_BLK_ST_FREE;
222 } else if (rblk->state & NVM_BLK_ST_BAD) {
223 list_move_tail(&rblk->list, &rlun->bb_list);
224 rblk->state = NVM_BLK_ST_BAD;
225 } else {
226 WARN_ON_ONCE(1);
227 pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n",
228 rlun->bppa.g.ch, rlun->bppa.g.lun,
229 rblk->id, rblk->state);
230 list_move_tail(&rblk->list, &rlun->bb_list);
231 }
232 spin_unlock(&rlun->lock);
233}
234
235static void rrpc_put_blks(struct rrpc *rrpc)
236{
237 struct rrpc_lun *rlun;
238 int i;
239
240 for (i = 0; i < rrpc->nr_luns; i++) {
241 rlun = &rrpc->luns[i];
242 if (rlun->cur)
243 rrpc_put_blk(rrpc, rlun->cur);
244 if (rlun->gc_cur)
245 rrpc_put_blk(rrpc, rlun->gc_cur);
246 }
247}
248
249static struct rrpc_lun *get_next_lun(struct rrpc *rrpc)
250{
251 int next = atomic_inc_return(&rrpc->next_lun);
252
253 return &rrpc->luns[next % rrpc->nr_luns];
254}
255
256static void rrpc_gc_kick(struct rrpc *rrpc)
257{
258 struct rrpc_lun *rlun;
259 unsigned int i;
260
261 for (i = 0; i < rrpc->nr_luns; i++) {
262 rlun = &rrpc->luns[i];
263 queue_work(rrpc->krqd_wq, &rlun->ws_gc);
264 }
265}
266
267/*
268 * timed GC every interval.
269 */
270static void rrpc_gc_timer(struct timer_list *t)
271{
272 struct rrpc *rrpc = from_timer(rrpc, t, gc_timer);
273
274 rrpc_gc_kick(rrpc);
275 mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
276}
277
278static void rrpc_end_sync_bio(struct bio *bio)
279{
280 struct completion *waiting = bio->bi_private;
281
282 if (bio->bi_status)
283 pr_err("nvm: gc request failed (%u).\n", bio->bi_status);
284
285 complete(waiting);
286}
287
288/*
289 * rrpc_move_valid_pages -- migrate live data off the block
290 * @rrpc: the 'rrpc' structure
291 * @block: the block from which to migrate live pages
292 *
293 * Description:
294 * GC algorithms may call this function to migrate remaining live
295 * pages off the block prior to erasing it. This function blocks
296 * further execution until the operation is complete.
297 */
298static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
299{
300 struct nvm_tgt_dev *dev = rrpc->dev;
301 struct request_queue *q = dev->q;
302 struct rrpc_rev_addr *rev;
303 struct nvm_rq *rqd;
304 struct bio *bio;
305 struct page *page;
306 int slot;
307 int nr_sec_per_blk = dev->geo.sec_per_blk;
308 u64 phys_addr;
309 DECLARE_COMPLETION_ONSTACK(wait);
310
311 if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk))
312 return 0;
313
314 bio = bio_alloc(GFP_NOIO, 1);
315 if (!bio) {
316 pr_err("nvm: could not alloc bio to gc\n");
317 return -ENOMEM;
318 }
319
320 page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
321
322 while ((slot = find_first_zero_bit(rblk->invalid_pages,
323 nr_sec_per_blk)) < nr_sec_per_blk) {
324
325 /* Lock laddr */
326 phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot;
327
328try:
329 spin_lock(&rrpc->rev_lock);
330 /* Get logical address from physical to logical table */
331 rev = &rrpc->rev_trans_map[phys_addr];
332 /* already updated by previous regular write */
333 if (rev->addr == ADDR_EMPTY) {
334 spin_unlock(&rrpc->rev_lock);
335 continue;
336 }
337
338 rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
339 if (IS_ERR_OR_NULL(rqd)) {
340 spin_unlock(&rrpc->rev_lock);
341 schedule();
342 goto try;
343 }
344
345 spin_unlock(&rrpc->rev_lock);
346
347 /* Perform read to do GC */
348 bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
349 bio_set_op_attrs(bio, REQ_OP_READ, 0);
350 bio->bi_private = &wait;
351 bio->bi_end_io = rrpc_end_sync_bio;
352
353 /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */
354 bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
355
356 if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
357 pr_err("rrpc: gc read failed.\n");
358 rrpc_inflight_laddr_release(rrpc, rqd);
359 goto finished;
360 }
361 wait_for_completion_io(&wait);
362 if (bio->bi_status) {
363 rrpc_inflight_laddr_release(rrpc, rqd);
364 goto finished;
365 }
366
367 bio_reset(bio);
368 reinit_completion(&wait);
369
370 bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
371 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
372 bio->bi_private = &wait;
373 bio->bi_end_io = rrpc_end_sync_bio;
374
375 bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
376
377 /* turn the command around and write the data back to a new
378 * address
379 */
380 if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
381 pr_err("rrpc: gc write failed.\n");
382 rrpc_inflight_laddr_release(rrpc, rqd);
383 goto finished;
384 }
385 wait_for_completion_io(&wait);
386
387 rrpc_inflight_laddr_release(rrpc, rqd);
388 if (bio->bi_status)
389 goto finished;
390
391 bio_reset(bio);
392 }
393
394finished:
395 mempool_free(page, rrpc->page_pool);
396 bio_put(bio);
397
398 if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) {
399 pr_err("nvm: failed to garbage collect block\n");
400 return -EIO;
401 }
402
403 return 0;
404}
405
406static void rrpc_block_gc(struct work_struct *work)
407{
408 struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
409 ws_gc);
410 struct rrpc *rrpc = gcb->rrpc;
411 struct rrpc_block *rblk = gcb->rblk;
412 struct rrpc_lun *rlun = rblk->rlun;
413 struct ppa_addr ppa;
414
415 mempool_free(gcb, rrpc->gcb_pool);
416 pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n",
417 rlun->bppa.g.ch, rlun->bppa.g.lun,
418 rblk->id);
419
420 if (rrpc_move_valid_pages(rrpc, rblk))
421 goto put_back;
422
423 ppa.ppa = 0;
424 ppa.g.ch = rlun->bppa.g.ch;
425 ppa.g.lun = rlun->bppa.g.lun;
426 ppa.g.blk = rblk->id;
427
428 if (nvm_erase_sync(rrpc->dev, &ppa, 1))
429 goto put_back;
430
431 rrpc_put_blk(rrpc, rblk);
432
433 return;
434
435put_back:
436 spin_lock(&rlun->lock);
437 list_add_tail(&rblk->prio, &rlun->prio_list);
438 spin_unlock(&rlun->lock);
439}
440
441/* the block with highest number of invalid pages, will be in the beginning
442 * of the list
443 */
444static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra,
445 struct rrpc_block *rb)
446{
447 if (ra->nr_invalid_pages == rb->nr_invalid_pages)
448 return ra;
449
450 return (ra->nr_invalid_pages < rb->nr_invalid_pages) ? rb : ra;
451}
452
453/* linearly find the block with highest number of invalid pages
454 * requires lun->lock
455 */
456static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun)
457{
458 struct list_head *prio_list = &rlun->prio_list;
459 struct rrpc_block *rblk, *max;
460
461 BUG_ON(list_empty(prio_list));
462
463 max = list_first_entry(prio_list, struct rrpc_block, prio);
464 list_for_each_entry(rblk, prio_list, prio)
465 max = rblk_max_invalid(max, rblk);
466
467 return max;
468}
469
470static void rrpc_lun_gc(struct work_struct *work)
471{
472 struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc);
473 struct rrpc *rrpc = rlun->rrpc;
474 struct nvm_tgt_dev *dev = rrpc->dev;
475 struct rrpc_block_gc *gcb;
476 unsigned int nr_blocks_need;
477
478 nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE;
479
480 if (nr_blocks_need < rrpc->nr_luns)
481 nr_blocks_need = rrpc->nr_luns;
482
483 spin_lock(&rlun->lock);
484 while (nr_blocks_need > rlun->nr_free_blocks &&
485 !list_empty(&rlun->prio_list)) {
486 struct rrpc_block *rblk = block_prio_find_max(rlun);
487
488 if (!rblk->nr_invalid_pages)
489 break;
490
491 gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
492 if (!gcb)
493 break;
494
495 list_del_init(&rblk->prio);
496
497 WARN_ON(!block_is_full(rrpc, rblk));
498
499 pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n",
500 rlun->bppa.g.ch, rlun->bppa.g.lun,
501 rblk->id);
502
503 gcb->rrpc = rrpc;
504 gcb->rblk = rblk;
505 INIT_WORK(&gcb->ws_gc, rrpc_block_gc);
506
507 queue_work(rrpc->kgc_wq, &gcb->ws_gc);
508
509 nr_blocks_need--;
510 }
511 spin_unlock(&rlun->lock);
512
513 /* TODO: Hint that request queue can be started again */
514}
515
516static void rrpc_gc_queue(struct work_struct *work)
517{
518 struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
519 ws_gc);
520 struct rrpc *rrpc = gcb->rrpc;
521 struct rrpc_block *rblk = gcb->rblk;
522 struct rrpc_lun *rlun = rblk->rlun;
523
524 spin_lock(&rlun->lock);
525 list_add_tail(&rblk->prio, &rlun->prio_list);
526 spin_unlock(&rlun->lock);
527
528 mempool_free(gcb, rrpc->gcb_pool);
529 pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n",
530 rlun->bppa.g.ch, rlun->bppa.g.lun,
531 rblk->id);
532}
533
534static const struct block_device_operations rrpc_fops = {
535 .owner = THIS_MODULE,
536};
537
538static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc)
539{
540 unsigned int i;
541 struct rrpc_lun *rlun, *max_free;
542
543 if (!is_gc)
544 return get_next_lun(rrpc);
545
546 /* during GC, we don't care about RR, instead we want to make
547 * sure that we maintain evenness between the block luns.
548 */
549 max_free = &rrpc->luns[0];
550 /* prevent GC-ing lun from devouring pages of a lun with
551 * little free blocks. We don't take the lock as we only need an
552 * estimate.
553 */
554 rrpc_for_each_lun(rrpc, rlun, i) {
555 if (rlun->nr_free_blocks > max_free->nr_free_blocks)
556 max_free = rlun;
557 }
558
559 return max_free;
560}
561
562static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr,
563 struct rrpc_block *rblk, u64 paddr)
564{
565 struct rrpc_addr *gp;
566 struct rrpc_rev_addr *rev;
567
568 BUG_ON(laddr >= rrpc->nr_sects);
569
570 gp = &rrpc->trans_map[laddr];
571 spin_lock(&rrpc->rev_lock);
572 if (gp->rblk)
573 rrpc_page_invalidate(rrpc, gp);
574
575 gp->addr = paddr;
576 gp->rblk = rblk;
577
578 rev = &rrpc->rev_trans_map[gp->addr];
579 rev->addr = laddr;
580 spin_unlock(&rrpc->rev_lock);
581
582 return gp;
583}
584
585static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
586{
587 u64 addr = ADDR_EMPTY;
588
589 spin_lock(&rblk->lock);
590 if (block_is_full(rrpc, rblk))
591 goto out;
592
593 addr = rblk->next_page;
594
595 rblk->next_page++;
596out:
597 spin_unlock(&rblk->lock);
598 return addr;
599}
600
601/* Map logical address to a physical page. The mapping implements a round robin
602 * approach and allocates a page from the next lun available.
603 *
604 * Returns rrpc_addr with the physical address and block. Returns NULL if no
605 * blocks in the next rlun are available.
606 */
607static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
608 int is_gc)
609{
610 struct nvm_tgt_dev *tgt_dev = rrpc->dev;
611 struct rrpc_lun *rlun;
612 struct rrpc_block *rblk, **cur_rblk;
613 struct rrpc_addr *p;
614 struct ppa_addr ppa;
615 u64 paddr;
616 int gc_force = 0;
617
618 ppa.ppa = ADDR_EMPTY;
619 rlun = rrpc_get_lun_rr(rrpc, is_gc);
620
621 if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4)
622 return ppa;
623
624 /*
625 * page allocation steps:
626 * 1. Try to allocate new page from current rblk
627 * 2a. If succeed, proceed to map it in and return
628 * 2b. If fail, first try to allocate a new block from media manger,
629 * and then retry step 1. Retry until the normal block pool is
630 * exhausted.
631 * 3. If exhausted, and garbage collector is requesting the block,
632 * go to the reserved block and retry step 1.
633 * In the case that this fails as well, or it is not GC
634 * requesting, report not able to retrieve a block and let the
635 * caller handle further processing.
636 */
637
638 spin_lock(&rlun->lock);
639 cur_rblk = &rlun->cur;
640 rblk = rlun->cur;
641retry:
642 paddr = rrpc_alloc_addr(rrpc, rblk);
643
644 if (paddr != ADDR_EMPTY)
645 goto done;
646
647 if (!list_empty(&rlun->wblk_list)) {
648new_blk:
649 rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block,
650 prio);
651 rrpc_set_lun_cur(rlun, rblk, cur_rblk);
652 list_del(&rblk->prio);
653 goto retry;
654 }
655 spin_unlock(&rlun->lock);
656
657 rblk = rrpc_get_blk(rrpc, rlun, gc_force);
658 if (rblk) {
659 spin_lock(&rlun->lock);
660 list_add_tail(&rblk->prio, &rlun->wblk_list);
661 /*
662 * another thread might already have added a new block,
663 * Therefore, make sure that one is used, instead of the
664 * one just added.
665 */
666 goto new_blk;
667 }
668
669 if (unlikely(is_gc) && !gc_force) {
670 /* retry from emergency gc block */
671 cur_rblk = &rlun->gc_cur;
672 rblk = rlun->gc_cur;
673 gc_force = 1;
674 spin_lock(&rlun->lock);
675 goto retry;
676 }
677
678 pr_err("rrpc: failed to allocate new block\n");
679 return ppa;
680done:
681 spin_unlock(&rlun->lock);
682 p = rrpc_update_map(rrpc, laddr, rblk, paddr);
683 if (!p)
684 return ppa;
685
686 /* return global address */
687 return rrpc_ppa_to_gaddr(tgt_dev, p);
688}
689
690static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
691{
692 struct rrpc_block_gc *gcb;
693
694 gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
695 if (!gcb) {
696 pr_err("rrpc: unable to queue block for gc.");
697 return;
698 }
699
700 gcb->rrpc = rrpc;
701 gcb->rblk = rblk;
702
703 INIT_WORK(&gcb->ws_gc, rrpc_gc_queue);
704 queue_work(rrpc->kgc_wq, &gcb->ws_gc);
705}
706
707static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p)
708{
709 struct rrpc_lun *rlun = NULL;
710 int i;
711
712 for (i = 0; i < rrpc->nr_luns; i++) {
713 if (rrpc->luns[i].bppa.g.ch == p.g.ch &&
714 rrpc->luns[i].bppa.g.lun == p.g.lun) {
715 rlun = &rrpc->luns[i];
716 break;
717 }
718 }
719
720 return rlun;
721}
722
723static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa)
724{
725 struct nvm_tgt_dev *dev = rrpc->dev;
726 struct rrpc_lun *rlun;
727 struct rrpc_block *rblk;
728
729 rlun = rrpc_ppa_to_lun(rrpc, ppa);
730 rblk = &rlun->blocks[ppa.g.blk];
731 rblk->state = NVM_BLK_ST_BAD;
732
733 nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);
734}
735
736static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
737{
738 void *comp_bits = &rqd->ppa_status;
739 struct ppa_addr ppa, prev_ppa;
740 int nr_ppas = rqd->nr_ppas;
741 int bit;
742
743 if (rqd->nr_ppas == 1)
744 __rrpc_mark_bad_block(rrpc, rqd->ppa_addr);
745
746 ppa_set_empty(&prev_ppa);
747 bit = -1;
748 while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
749 ppa = rqd->ppa_list[bit];
750 if (ppa_cmp_blk(ppa, prev_ppa))
751 continue;
752
753 __rrpc_mark_bad_block(rrpc, ppa);
754 }
755}
756
757static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
758 sector_t laddr, uint8_t npages)
759{
760 struct nvm_tgt_dev *dev = rrpc->dev;
761 struct rrpc_addr *p;
762 struct rrpc_block *rblk;
763 int cmnt_size, i;
764
765 for (i = 0; i < npages; i++) {
766 p = &rrpc->trans_map[laddr + i];
767 rblk = p->rblk;
768
769 cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
770 if (unlikely(cmnt_size == dev->geo.sec_per_blk))
771 rrpc_run_gc(rrpc, rblk);
772 }
773}
774
775static void rrpc_end_io(struct nvm_rq *rqd)
776{
777 struct rrpc *rrpc = rqd->private;
778 struct nvm_tgt_dev *dev = rrpc->dev;
779 struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
780 uint8_t npages = rqd->nr_ppas;
781 sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
782
783 if (bio_data_dir(rqd->bio) == WRITE) {
784 if (rqd->error == NVM_RSP_ERR_FAILWRITE)
785 rrpc_mark_bad_block(rrpc, rqd);
786
787 rrpc_end_io_write(rrpc, rrqd, laddr, npages);
788 }
789
790 bio_put(rqd->bio);
791
792 if (rrqd->flags & NVM_IOTYPE_GC)
793 return;
794
795 rrpc_unlock_rq(rrpc, rqd);
796
797 if (npages > 1)
798 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
799
800 mempool_free(rqd, rrpc->rq_pool);
801}
802
803static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
804 struct nvm_rq *rqd, unsigned long flags, int npages)
805{
806 struct nvm_tgt_dev *dev = rrpc->dev;
807 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
808 struct rrpc_addr *gp;
809 sector_t laddr = rrpc_get_laddr(bio);
810 int is_gc = flags & NVM_IOTYPE_GC;
811 int i;
812
813 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
814 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
815 return NVM_IO_REQUEUE;
816 }
817
818 for (i = 0; i < npages; i++) {
819 /* We assume that mapping occurs at 4KB granularity */
820 BUG_ON(!(laddr + i < rrpc->nr_sects));
821 gp = &rrpc->trans_map[laddr + i];
822
823 if (gp->rblk) {
824 rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp);
825 } else {
826 BUG_ON(is_gc);
827 rrpc_unlock_laddr(rrpc, r);
828 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
829 rqd->dma_ppa_list);
830 return NVM_IO_DONE;
831 }
832 }
833
834 rqd->opcode = NVM_OP_HBREAD;
835
836 return NVM_IO_OK;
837}
838
839static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
840 unsigned long flags)
841{
842 int is_gc = flags & NVM_IOTYPE_GC;
843 sector_t laddr = rrpc_get_laddr(bio);
844 struct rrpc_addr *gp;
845
846 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
847 return NVM_IO_REQUEUE;
848
849 BUG_ON(!(laddr < rrpc->nr_sects));
850 gp = &rrpc->trans_map[laddr];
851
852 if (gp->rblk) {
853 rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp);
854 } else {
855 BUG_ON(is_gc);
856 rrpc_unlock_rq(rrpc, rqd);
857 return NVM_IO_DONE;
858 }
859
860 rqd->opcode = NVM_OP_HBREAD;
861
862 return NVM_IO_OK;
863}
864
865static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
866 struct nvm_rq *rqd, unsigned long flags, int npages)
867{
868 struct nvm_tgt_dev *dev = rrpc->dev;
869 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
870 struct ppa_addr p;
871 sector_t laddr = rrpc_get_laddr(bio);
872 int is_gc = flags & NVM_IOTYPE_GC;
873 int i;
874
875 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
876 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
877 return NVM_IO_REQUEUE;
878 }
879
880 for (i = 0; i < npages; i++) {
881 /* We assume that mapping occurs at 4KB granularity */
882 p = rrpc_map_page(rrpc, laddr + i, is_gc);
883 if (p.ppa == ADDR_EMPTY) {
884 BUG_ON(is_gc);
885 rrpc_unlock_laddr(rrpc, r);
886 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
887 rqd->dma_ppa_list);
888 rrpc_gc_kick(rrpc);
889 return NVM_IO_REQUEUE;
890 }
891
892 rqd->ppa_list[i] = p;
893 }
894
895 rqd->opcode = NVM_OP_HBWRITE;
896
897 return NVM_IO_OK;
898}
899
900static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
901 struct nvm_rq *rqd, unsigned long flags)
902{
903 struct ppa_addr p;
904 int is_gc = flags & NVM_IOTYPE_GC;
905 sector_t laddr = rrpc_get_laddr(bio);
906
907 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
908 return NVM_IO_REQUEUE;
909
910 p = rrpc_map_page(rrpc, laddr, is_gc);
911 if (p.ppa == ADDR_EMPTY) {
912 BUG_ON(is_gc);
913 rrpc_unlock_rq(rrpc, rqd);
914 rrpc_gc_kick(rrpc);
915 return NVM_IO_REQUEUE;
916 }
917
918 rqd->ppa_addr = p;
919 rqd->opcode = NVM_OP_HBWRITE;
920
921 return NVM_IO_OK;
922}
923
924static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
925 struct nvm_rq *rqd, unsigned long flags, uint8_t npages)
926{
927 struct nvm_tgt_dev *dev = rrpc->dev;
928
929 if (npages > 1) {
930 rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
931 &rqd->dma_ppa_list);
932 if (!rqd->ppa_list) {
933 pr_err("rrpc: not able to allocate ppa list\n");
934 return NVM_IO_ERR;
935 }
936
937 if (bio_op(bio) == REQ_OP_WRITE)
938 return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
939 npages);
940
941 return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
942 }
943
944 if (bio_op(bio) == REQ_OP_WRITE)
945 return rrpc_write_rq(rrpc, bio, rqd, flags);
946
947 return rrpc_read_rq(rrpc, bio, rqd, flags);
948}
949
950static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
951 struct nvm_rq *rqd, unsigned long flags)
952{
953 struct nvm_tgt_dev *dev = rrpc->dev;
954 struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd);
955 uint8_t nr_pages = rrpc_get_pages(bio);
956 int bio_size = bio_sectors(bio) << 9;
957 int err;
958
959 if (bio_size < dev->geo.sec_size)
960 return NVM_IO_ERR;
961 else if (bio_size > dev->geo.max_rq_size)
962 return NVM_IO_ERR;
963
964 err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages);
965 if (err)
966 return err;
967
968 bio_get(bio);
969 rqd->bio = bio;
970 rqd->private = rrpc;
971 rqd->nr_ppas = nr_pages;
972 rqd->end_io = rrpc_end_io;
973 rrq->flags = flags;
974
975 err = nvm_submit_io(dev, rqd);
976 if (err) {
977 pr_err("rrpc: I/O submission failed: %d\n", err);
978 bio_put(bio);
979 if (!(flags & NVM_IOTYPE_GC)) {
980 rrpc_unlock_rq(rrpc, rqd);
981 if (rqd->nr_ppas > 1)
982 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
983 rqd->dma_ppa_list);
984 }
985 return NVM_IO_ERR;
986 }
987
988 return NVM_IO_OK;
989}
990
991static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
992{
993 struct rrpc *rrpc = q->queuedata;
994 struct nvm_rq *rqd;
995 int err;
996
997 blk_queue_split(q, &bio);
998
999 if (bio_op(bio) == REQ_OP_DISCARD) {
1000 rrpc_discard(rrpc, bio);
1001 return BLK_QC_T_NONE;
1002 }
1003
1004 rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
1005 memset(rqd, 0, sizeof(struct nvm_rq));
1006
1007 err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
1008 switch (err) {
1009 case NVM_IO_OK:
1010 return BLK_QC_T_NONE;
1011 case NVM_IO_ERR:
1012 bio_io_error(bio);
1013 break;
1014 case NVM_IO_DONE:
1015 bio_endio(bio);
1016 break;
1017 case NVM_IO_REQUEUE:
1018 spin_lock(&rrpc->bio_lock);
1019 bio_list_add(&rrpc->requeue_bios, bio);
1020 spin_unlock(&rrpc->bio_lock);
1021 queue_work(rrpc->kgc_wq, &rrpc->ws_requeue);
1022 break;
1023 }
1024
1025 mempool_free(rqd, rrpc->rq_pool);
1026 return BLK_QC_T_NONE;
1027}
1028
1029static void rrpc_requeue(struct work_struct *work)
1030{
1031 struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue);
1032 struct bio_list bios;
1033 struct bio *bio;
1034
1035 bio_list_init(&bios);
1036
1037 spin_lock(&rrpc->bio_lock);
1038 bio_list_merge(&bios, &rrpc->requeue_bios);
1039 bio_list_init(&rrpc->requeue_bios);
1040 spin_unlock(&rrpc->bio_lock);
1041
1042 while ((bio = bio_list_pop(&bios)))
1043 rrpc_make_rq(rrpc->disk->queue, bio);
1044}
1045
1046static void rrpc_gc_free(struct rrpc *rrpc)
1047{
1048 if (rrpc->krqd_wq)
1049 destroy_workqueue(rrpc->krqd_wq);
1050
1051 if (rrpc->kgc_wq)
1052 destroy_workqueue(rrpc->kgc_wq);
1053}
1054
1055static int rrpc_gc_init(struct rrpc *rrpc)
1056{
1057 rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND,
1058 rrpc->nr_luns);
1059 if (!rrpc->krqd_wq)
1060 return -ENOMEM;
1061
1062 rrpc->kgc_wq = alloc_workqueue("rrpc-bg", WQ_MEM_RECLAIM, 1);
1063 if (!rrpc->kgc_wq)
1064 return -ENOMEM;
1065
1066 timer_setup(&rrpc->gc_timer, rrpc_gc_timer, 0);
1067
1068 return 0;
1069}
1070
1071static void rrpc_map_free(struct rrpc *rrpc)
1072{
1073 vfree(rrpc->rev_trans_map);
1074 vfree(rrpc->trans_map);
1075}
1076
1077static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
1078{
1079 struct rrpc *rrpc = (struct rrpc *)private;
1080 struct nvm_tgt_dev *dev = rrpc->dev;
1081 struct rrpc_addr *addr = rrpc->trans_map + slba;
1082 struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
1083 struct rrpc_lun *rlun;
1084 struct rrpc_block *rblk;
1085 u64 i;
1086
1087 for (i = 0; i < nlb; i++) {
1088 struct ppa_addr gaddr;
1089 u64 pba = le64_to_cpu(entries[i]);
1090 unsigned int mod;
1091
1092 /* LNVM treats address-spaces as silos, LBA and PBA are
1093 * equally large and zero-indexed.
1094 */
1095 if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
1096 pr_err("nvm: L2P data entry is out of bounds!\n");
1097 pr_err("nvm: Maybe loaded an old target L2P\n");
1098 return -EINVAL;
1099 }
1100
1101 /* Address zero is a special one. The first page on a disk is
1102 * protected. As it often holds internal device boot
1103 * information.
1104 */
1105 if (!pba)
1106 continue;
1107
1108 div_u64_rem(pba, rrpc->nr_sects, &mod);
1109
1110 gaddr = rrpc_recov_addr(dev, pba);
1111 rlun = rrpc_ppa_to_lun(rrpc, gaddr);
1112 if (!rlun) {
1113 pr_err("rrpc: l2p corruption on lba %llu\n",
1114 slba + i);
1115 return -EINVAL;
1116 }
1117
1118 rblk = &rlun->blocks[gaddr.g.blk];
1119 if (!rblk->state) {
1120 /* at this point, we don't know anything about the
1121 * block. It's up to the FTL on top to re-etablish the
1122 * block state. The block is assumed to be open.
1123 */
1124 list_move_tail(&rblk->list, &rlun->used_list);
1125 rblk->state = NVM_BLK_ST_TGT;
1126 rlun->nr_free_blocks--;
1127 }
1128
1129 addr[i].addr = pba;
1130 addr[i].rblk = rblk;
1131 raddr[mod].addr = slba + i;
1132 }
1133
1134 return 0;
1135}
1136
1137static int rrpc_map_init(struct rrpc *rrpc)
1138{
1139 struct nvm_tgt_dev *dev = rrpc->dev;
1140 sector_t i;
1141 int ret;
1142
1143 rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects);
1144 if (!rrpc->trans_map)
1145 return -ENOMEM;
1146
1147 rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr)
1148 * rrpc->nr_sects);
1149 if (!rrpc->rev_trans_map)
1150 return -ENOMEM;
1151
1152 for (i = 0; i < rrpc->nr_sects; i++) {
1153 struct rrpc_addr *p = &rrpc->trans_map[i];
1154 struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i];
1155
1156 p->addr = ADDR_EMPTY;
1157 r->addr = ADDR_EMPTY;
1158 }
1159
1160 /* Bring up the mapping table from device */
1161 ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects,
1162 rrpc_l2p_update, rrpc);
1163 if (ret) {
1164 pr_err("nvm: rrpc: could not read L2P table.\n");
1165 return -EINVAL;
1166 }
1167
1168 return 0;
1169}
1170
1171/* Minimum pages needed within a lun */
1172#define PAGE_POOL_SIZE 16
1173#define ADDR_POOL_SIZE 64
1174
1175static int rrpc_core_init(struct rrpc *rrpc)
1176{
1177 down_write(&rrpc_lock);
1178 if (!rrpc_gcb_cache) {
1179 rrpc_gcb_cache = kmem_cache_create("rrpc_gcb",
1180 sizeof(struct rrpc_block_gc), 0, 0, NULL);
1181 if (!rrpc_gcb_cache) {
1182 up_write(&rrpc_lock);
1183 return -ENOMEM;
1184 }
1185
1186 rrpc_rq_cache = kmem_cache_create("rrpc_rq",
1187 sizeof(struct nvm_rq) + sizeof(struct rrpc_rq),
1188 0, 0, NULL);
1189 if (!rrpc_rq_cache) {
1190 kmem_cache_destroy(rrpc_gcb_cache);
1191 up_write(&rrpc_lock);
1192 return -ENOMEM;
1193 }
1194 }
1195 up_write(&rrpc_lock);
1196
1197 rrpc->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
1198 if (!rrpc->page_pool)
1199 return -ENOMEM;
1200
1201 rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns,
1202 rrpc_gcb_cache);
1203 if (!rrpc->gcb_pool)
1204 return -ENOMEM;
1205
1206 rrpc->rq_pool = mempool_create_slab_pool(64, rrpc_rq_cache);
1207 if (!rrpc->rq_pool)
1208 return -ENOMEM;
1209
1210 spin_lock_init(&rrpc->inflights.lock);
1211 INIT_LIST_HEAD(&rrpc->inflights.reqs);
1212
1213 return 0;
1214}
1215
1216static void rrpc_core_free(struct rrpc *rrpc)
1217{
1218 mempool_destroy(rrpc->page_pool);
1219 mempool_destroy(rrpc->gcb_pool);
1220 mempool_destroy(rrpc->rq_pool);
1221}
1222
1223static void rrpc_luns_free(struct rrpc *rrpc)
1224{
1225 struct rrpc_lun *rlun;
1226 int i;
1227
1228 if (!rrpc->luns)
1229 return;
1230
1231 for (i = 0; i < rrpc->nr_luns; i++) {
1232 rlun = &rrpc->luns[i];
1233 vfree(rlun->blocks);
1234 }
1235
1236 kfree(rrpc->luns);
1237}
1238
1239static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
1240{
1241 struct nvm_geo *geo = &dev->geo;
1242 struct rrpc_block *rblk;
1243 struct ppa_addr ppa;
1244 u8 *blks;
1245 int nr_blks;
1246 int i;
1247 int ret;
1248
1249 if (!dev->parent->ops->get_bb_tbl)
1250 return 0;
1251
1252 nr_blks = geo->blks_per_lun * geo->plane_mode;
1253 blks = kmalloc(nr_blks, GFP_KERNEL);
1254 if (!blks)
1255 return -ENOMEM;
1256
1257 ppa.ppa = 0;
1258 ppa.g.ch = rlun->bppa.g.ch;
1259 ppa.g.lun = rlun->bppa.g.lun;
1260
1261 ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
1262 if (ret) {
1263 pr_err("rrpc: could not get BB table\n");
1264 goto out;
1265 }
1266
1267 nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
1268 if (nr_blks < 0) {
1269 ret = nr_blks;
1270 goto out;
1271 }
1272
1273 for (i = 0; i < nr_blks; i++) {
1274 if (blks[i] == NVM_BLK_T_FREE)
1275 continue;
1276
1277 rblk = &rlun->blocks[i];
1278 list_move_tail(&rblk->list, &rlun->bb_list);
1279 rblk->state = NVM_BLK_ST_BAD;
1280 rlun->nr_free_blocks--;
1281 }
1282
1283out:
1284 kfree(blks);
1285 return ret;
1286}
1287
1288static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa)
1289{
1290 rlun->bppa.ppa = 0;
1291 rlun->bppa.g.ch = ppa.g.ch;
1292 rlun->bppa.g.lun = ppa.g.lun;
1293}
1294
1295static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
1296{
1297 struct nvm_tgt_dev *dev = rrpc->dev;
1298 struct nvm_geo *geo = &dev->geo;
1299 struct rrpc_lun *rlun;
1300 int i, j, ret = -EINVAL;
1301
1302 if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
1303 pr_err("rrpc: number of pages per block too high.");
1304 return -EINVAL;
1305 }
1306
1307 spin_lock_init(&rrpc->rev_lock);
1308
1309 rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun),
1310 GFP_KERNEL);
1311 if (!rrpc->luns)
1312 return -ENOMEM;
1313
1314 /* 1:1 mapping */
1315 for (i = 0; i < rrpc->nr_luns; i++) {
1316 rlun = &rrpc->luns[i];
1317 rlun->id = i;
1318 rrpc_set_lun_ppa(rlun, luns[i]);
1319 rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
1320 geo->blks_per_lun);
1321 if (!rlun->blocks) {
1322 ret = -ENOMEM;
1323 goto err;
1324 }
1325
1326 INIT_LIST_HEAD(&rlun->free_list);
1327 INIT_LIST_HEAD(&rlun->used_list);
1328 INIT_LIST_HEAD(&rlun->bb_list);
1329
1330 for (j = 0; j < geo->blks_per_lun; j++) {
1331 struct rrpc_block *rblk = &rlun->blocks[j];
1332
1333 rblk->id = j;
1334 rblk->rlun = rlun;
1335 rblk->state = NVM_BLK_T_FREE;
1336 INIT_LIST_HEAD(&rblk->prio);
1337 INIT_LIST_HEAD(&rblk->list);
1338 spin_lock_init(&rblk->lock);
1339
1340 list_add_tail(&rblk->list, &rlun->free_list);
1341 }
1342
1343 rlun->rrpc = rrpc;
1344 rlun->nr_free_blocks = geo->blks_per_lun;
1345 rlun->reserved_blocks = 2; /* for GC only */
1346
1347 INIT_LIST_HEAD(&rlun->prio_list);
1348 INIT_LIST_HEAD(&rlun->wblk_list);
1349
1350 INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
1351 spin_lock_init(&rlun->lock);
1352
1353 if (rrpc_bb_discovery(dev, rlun))
1354 goto err;
1355
1356 }
1357
1358 return 0;
1359err:
1360 return ret;
1361}
1362
1363/* returns 0 on success and stores the beginning address in *begin */
1364static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
1365{
1366 struct nvm_tgt_dev *dev = rrpc->dev;
1367 sector_t size = rrpc->nr_sects * dev->geo.sec_size;
1368 int ret;
1369
1370 size >>= 9;
1371
1372 ret = nvm_get_area(dev, begin, size);
1373 if (!ret)
1374 *begin >>= (ilog2(dev->geo.sec_size) - 9);
1375
1376 return ret;
1377}
1378
1379static void rrpc_area_free(struct rrpc *rrpc)
1380{
1381 struct nvm_tgt_dev *dev = rrpc->dev;
1382 sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
1383
1384 nvm_put_area(dev, begin);
1385}
1386
1387static void rrpc_free(struct rrpc *rrpc)
1388{
1389 rrpc_gc_free(rrpc);
1390 rrpc_map_free(rrpc);
1391 rrpc_core_free(rrpc);
1392 rrpc_luns_free(rrpc);
1393 rrpc_area_free(rrpc);
1394
1395 kfree(rrpc);
1396}
1397
1398static void rrpc_exit(void *private)
1399{
1400 struct rrpc *rrpc = private;
1401
1402 del_timer(&rrpc->gc_timer);
1403
1404 flush_workqueue(rrpc->krqd_wq);
1405 flush_workqueue(rrpc->kgc_wq);
1406
1407 rrpc_free(rrpc);
1408}
1409
1410static sector_t rrpc_capacity(void *private)
1411{
1412 struct rrpc *rrpc = private;
1413 struct nvm_tgt_dev *dev = rrpc->dev;
1414 sector_t reserved, provisioned;
1415
1416 /* cur, gc, and two emergency blocks for each lun */
1417 reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4;
1418 provisioned = rrpc->nr_sects - reserved;
1419
1420 if (reserved > rrpc->nr_sects) {
1421 pr_err("rrpc: not enough space available to expose storage.\n");
1422 return 0;
1423 }
1424
1425 sector_div(provisioned, 10);
1426 return provisioned * 9 * NR_PHY_IN_LOG;
1427}
1428
1429/*
1430 * Looks up the logical address from reverse trans map and check if its valid by
1431 * comparing the logical to physical address with the physical address.
1432 * Returns 0 on free, otherwise 1 if in use
1433 */
1434static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
1435{
1436 struct nvm_tgt_dev *dev = rrpc->dev;
1437 int offset;
1438 struct rrpc_addr *laddr;
1439 u64 bpaddr, paddr, pladdr;
1440
1441 bpaddr = block_to_rel_addr(rrpc, rblk);
1442 for (offset = 0; offset < dev->geo.sec_per_blk; offset++) {
1443 paddr = bpaddr + offset;
1444
1445 pladdr = rrpc->rev_trans_map[paddr].addr;
1446 if (pladdr == ADDR_EMPTY)
1447 continue;
1448
1449 laddr = &rrpc->trans_map[pladdr];
1450
1451 if (paddr == laddr->addr) {
1452 laddr->rblk = rblk;
1453 } else {
1454 set_bit(offset, rblk->invalid_pages);
1455 rblk->nr_invalid_pages++;
1456 }
1457 }
1458}
1459
1460static int rrpc_blocks_init(struct rrpc *rrpc)
1461{
1462 struct nvm_tgt_dev *dev = rrpc->dev;
1463 struct rrpc_lun *rlun;
1464 struct rrpc_block *rblk;
1465 int lun_iter, blk_iter;
1466
1467 for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) {
1468 rlun = &rrpc->luns[lun_iter];
1469
1470 for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun;
1471 blk_iter++) {
1472 rblk = &rlun->blocks[blk_iter];
1473 rrpc_block_map_update(rrpc, rblk);
1474 }
1475 }
1476
1477 return 0;
1478}
1479
1480static int rrpc_luns_configure(struct rrpc *rrpc)
1481{
1482 struct rrpc_lun *rlun;
1483 struct rrpc_block *rblk;
1484 int i;
1485
1486 for (i = 0; i < rrpc->nr_luns; i++) {
1487 rlun = &rrpc->luns[i];
1488
1489 rblk = rrpc_get_blk(rrpc, rlun, 0);
1490 if (!rblk)
1491 goto err;
1492 rrpc_set_lun_cur(rlun, rblk, &rlun->cur);
1493
1494 /* Emergency gc block */
1495 rblk = rrpc_get_blk(rrpc, rlun, 1);
1496 if (!rblk)
1497 goto err;
1498 rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur);
1499 }
1500
1501 return 0;
1502err:
1503 rrpc_put_blks(rrpc);
1504 return -EINVAL;
1505}
1506
1507static struct nvm_tgt_type tt_rrpc;
1508
1509static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1510 int flags)
1511{
1512 struct request_queue *bqueue = dev->q;
1513 struct request_queue *tqueue = tdisk->queue;
1514 struct nvm_geo *geo = &dev->geo;
1515 struct rrpc *rrpc;
1516 sector_t soffset;
1517 int ret;
1518
1519 if (!(dev->identity.dom & NVM_RSP_L2P)) {
1520 pr_err("nvm: rrpc: device does not support l2p (%x)\n",
1521 dev->identity.dom);
1522 return ERR_PTR(-EINVAL);
1523 }
1524
1525 rrpc = kzalloc(sizeof(struct rrpc), GFP_KERNEL);
1526 if (!rrpc)
1527 return ERR_PTR(-ENOMEM);
1528
1529 rrpc->dev = dev;
1530 rrpc->disk = tdisk;
1531
1532 bio_list_init(&rrpc->requeue_bios);
1533 spin_lock_init(&rrpc->bio_lock);
1534 INIT_WORK(&rrpc->ws_requeue, rrpc_requeue);
1535
1536 rrpc->nr_luns = geo->nr_luns;
1537 rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns;
1538
1539 /* simple round-robin strategy */
1540 atomic_set(&rrpc->next_lun, -1);
1541
1542 ret = rrpc_area_init(rrpc, &soffset);
1543 if (ret < 0) {
1544 pr_err("nvm: rrpc: could not initialize area\n");
1545 return ERR_PTR(ret);
1546 }
1547 rrpc->soffset = soffset;
1548
1549 ret = rrpc_luns_init(rrpc, dev->luns);
1550 if (ret) {
1551 pr_err("nvm: rrpc: could not initialize luns\n");
1552 goto err;
1553 }
1554
1555 ret = rrpc_core_init(rrpc);
1556 if (ret) {
1557 pr_err("nvm: rrpc: could not initialize core\n");
1558 goto err;
1559 }
1560
1561 ret = rrpc_map_init(rrpc);
1562 if (ret) {
1563 pr_err("nvm: rrpc: could not initialize maps\n");
1564 goto err;
1565 }
1566
1567 ret = rrpc_blocks_init(rrpc);
1568 if (ret) {
1569 pr_err("nvm: rrpc: could not initialize state for blocks\n");
1570 goto err;
1571 }
1572
1573 ret = rrpc_luns_configure(rrpc);
1574 if (ret) {
1575 pr_err("nvm: rrpc: not enough blocks available in LUNs.\n");
1576 goto err;
1577 }
1578
1579 ret = rrpc_gc_init(rrpc);
1580 if (ret) {
1581 pr_err("nvm: rrpc: could not initialize gc\n");
1582 goto err;
1583 }
1584
1585 /* inherit the size from the underlying device */
1586 blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
1587 blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
1588
1589 pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n",
1590 rrpc->nr_luns, (unsigned long long)rrpc->nr_sects);
1591
1592 mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
1593
1594 return rrpc;
1595err:
1596 rrpc_free(rrpc);
1597 return ERR_PTR(ret);
1598}
1599
1600/* round robin, page-based FTL, and cost-based GC */
1601static struct nvm_tgt_type tt_rrpc = {
1602 .name = "rrpc",
1603 .version = {1, 0, 0},
1604
1605 .make_rq = rrpc_make_rq,
1606 .capacity = rrpc_capacity,
1607
1608 .init = rrpc_init,
1609 .exit = rrpc_exit,
1610};
1611
1612static int __init rrpc_module_init(void)
1613{
1614 return nvm_register_tgt_type(&tt_rrpc);
1615}
1616
1617static void rrpc_module_exit(void)
1618{
1619 nvm_unregister_tgt_type(&tt_rrpc);
1620}
1621
1622module_init(rrpc_module_init);
1623module_exit(rrpc_module_exit);
1624MODULE_LICENSE("GPL v2");
1625MODULE_DESCRIPTION("Block-Device Target for Open-Channel SSDs");
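
The rrpc target being removed above kept a forward (logical-to-physical) and a reverse (physical-to-logical) table in lockstep: rrpc_update_map() pointed an LBA at its new physical page, and rrpc_page_invalidate() cleared the stale reverse entry so GC could treat the old page as garbage. Here is a toy user-space model of just that bookkeeping, with arbitrary sizes and an assumed ADDR_EMPTY sentinel; the locking, block state and bitmap handling are deliberately omitted.

#include <stdint.h>
#include <stdio.h>

#define NR_SECTS   16
#define ADDR_EMPTY UINT64_MAX

static uint64_t trans_map[NR_SECTS];     /* logical -> physical */
static uint64_t rev_trans_map[NR_SECTS]; /* physical -> logical */

static void map_init(void)
{
	for (int i = 0; i < NR_SECTS; i++)
		trans_map[i] = rev_trans_map[i] = ADDR_EMPTY;
}

/* Point lba at a new physical address and drop the stale reverse entry,
 * so a later GC pass sees the old physical page as invalid.
 */
static void map_update(uint64_t lba, uint64_t new_paddr)
{
	uint64_t old_paddr = trans_map[lba];

	if (old_paddr != ADDR_EMPTY)
		rev_trans_map[old_paddr] = ADDR_EMPTY; /* invalidate old page */

	trans_map[lba] = new_paddr;
	rev_trans_map[new_paddr] = lba;
}

int main(void)
{
	map_init();
	map_update(3, 7);   /* first write of lba 3 */
	map_update(3, 12);  /* rewrite: paddr 7 becomes garbage */

	printf("lba 3 -> paddr %llu, paddr 7 live: %s\n",
	       (unsigned long long)trans_map[3],
	       rev_trans_map[7] == ADDR_EMPTY ? "no" : "yes");
	return 0;
}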
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
deleted file mode 100644
index fdb6ff902903..000000000000
--- a/drivers/lightnvm/rrpc.h
+++ /dev/null
@@ -1,290 +0,0 @@
1/*
2 * Copyright (C) 2015 IT University of Copenhagen
3 * Initial release: Matias Bjorling <m@bjorling.me>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
15 */
16
17#ifndef RRPC_H_
18#define RRPC_H_
19
20#include <linux/blkdev.h>
21#include <linux/blk-mq.h>
22#include <linux/bio.h>
23#include <linux/module.h>
24#include <linux/kthread.h>
25#include <linux/vmalloc.h>
26
27#include <linux/lightnvm.h>
28
29/* Run only GC if less than 1/X blocks are free */
30#define GC_LIMIT_INVERSE 10
31#define GC_TIME_SECS 100
32
33#define RRPC_SECTOR (512)
34#define RRPC_EXPOSED_PAGE_SIZE (4096)
35
36#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR)
37
38struct rrpc_inflight {
39 struct list_head reqs;
40 spinlock_t lock;
41};
42
43struct rrpc_inflight_rq {
44 struct list_head list;
45 sector_t l_start;
46 sector_t l_end;
47};
48
49struct rrpc_rq {
50 struct rrpc_inflight_rq inflight_rq;
51 unsigned long flags;
52};
53
54struct rrpc_block {
55 int id; /* id inside of LUN */
56 struct rrpc_lun *rlun;
57
58 struct list_head prio; /* LUN CG list */
59 struct list_head list; /* LUN free, used, bb list */
60
61#define MAX_INVALID_PAGES_STORAGE 8
62 /* Bitmap for invalid page intries */
63 unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
64 /* points to the next writable page within a block */
65 unsigned int next_page;
66 /* number of pages that are invalid, wrt host page size */
67 unsigned int nr_invalid_pages;
68
69 int state;
70
71 spinlock_t lock;
72 atomic_t data_cmnt_size; /* data pages committed to stable storage */
73};
74
75struct rrpc_lun {
76 struct rrpc *rrpc;
77
78 int id;
79 struct ppa_addr bppa;
80
81 struct rrpc_block *cur, *gc_cur;
82 struct rrpc_block *blocks; /* Reference to block allocation */
83
84 struct list_head prio_list; /* Blocks that may be GC'ed */
85 struct list_head wblk_list; /* Queued blocks to be written to */
86
87 /* lun block lists */
88 struct list_head used_list; /* In-use blocks */
89 struct list_head free_list; /* Not used blocks i.e. released
90 * and ready for use
91 */
92 struct list_head bb_list; /* Bad blocks. Mutually exclusive with
93 * free_list and used_list
94 */
95 unsigned int nr_free_blocks; /* Number of unused blocks */
96
97 struct work_struct ws_gc;
98
99 int reserved_blocks;
100
101 spinlock_t lock;
102};
103
104struct rrpc {
105 struct nvm_tgt_dev *dev;
106 struct gendisk *disk;
107
108 sector_t soffset; /* logical sector offset */
109
110 int nr_luns;
111 struct rrpc_lun *luns;
112
113 /* calculated values */
114 unsigned long long nr_sects;
115
116 /* Write strategy variables. Move these into each for structure for each
117 * strategy
118 */
119 atomic_t next_lun; /* Whenever a page is written, this is updated
120 * to point to the next write lun
121 */
122
123 spinlock_t bio_lock;
124 struct bio_list requeue_bios;
125 struct work_struct ws_requeue;
126
127 /* Simple translation map of logical addresses to physical addresses.
128 * The logical addresses is known by the host system, while the physical
129 * addresses are used when writing to the disk block device.
130 */
131 struct rrpc_addr *trans_map;
132 /* also store a reverse map for garbage collection */
133 struct rrpc_rev_addr *rev_trans_map;
134 spinlock_t rev_lock;
135
136 struct rrpc_inflight inflights;
137
138 mempool_t *addr_pool;
139 mempool_t *page_pool;
140 mempool_t *gcb_pool;
141 mempool_t *rq_pool;
142
143 struct timer_list gc_timer;
144 struct workqueue_struct *krqd_wq;
145 struct workqueue_struct *kgc_wq;
146};
147
148struct rrpc_block_gc {
149 struct rrpc *rrpc;
150 struct rrpc_block *rblk;
151 struct work_struct ws_gc;
152};
153
154/* Logical to physical mapping */
155struct rrpc_addr {
156 u64 addr;
157 struct rrpc_block *rblk;
158};
159
160/* Physical to logical mapping */
161struct rrpc_rev_addr {
162 u64 addr;
163};
164
165static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
166 struct ppa_addr r)
167{
168 struct ppa_addr l;
169 int secs, pgs;
170 sector_t ppa = r.ppa;
171
172 l.ppa = 0;
173
174 div_u64_rem(ppa, geo->sec_per_pg, &secs);
175 l.g.sec = secs;
176
177 sector_div(ppa, geo->sec_per_pg);
178 div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
179 l.g.pg = pgs;
180
181 return l;
182}
183
184static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
185{
186 return linear_to_generic_addr(&dev->geo, pba);
187}
188
189static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
190{
191 struct nvm_tgt_dev *dev = rrpc->dev;
192 struct nvm_geo *geo = &dev->geo;
193 struct rrpc_lun *rlun = rblk->rlun;
194
195 return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
196}
197
198static inline sector_t rrpc_get_laddr(struct bio *bio)
199{
200 return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
201}
202
203static inline unsigned int rrpc_get_pages(struct bio *bio)
204{
205 return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
206}
207
208static inline sector_t rrpc_get_sector(sector_t laddr)
209{
210 return laddr * NR_PHY_IN_LOG;
211}
212
213static inline int request_intersects(struct rrpc_inflight_rq *r,
214 sector_t laddr_start, sector_t laddr_end)
215{
216 return (laddr_end >= r->l_start) && (laddr_start <= r->l_end);
217}
218
219static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
220 unsigned int pages, struct rrpc_inflight_rq *r)
221{
222 sector_t laddr_end = laddr + pages - 1;
223 struct rrpc_inflight_rq *rtmp;
224
225 WARN_ON(irqs_disabled());
226
227 spin_lock_irq(&rrpc->inflights.lock);
228 list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) {
229 if (unlikely(request_intersects(rtmp, laddr, laddr_end))) {
230 /* existing, overlapping request, come back later */
231 spin_unlock_irq(&rrpc->inflights.lock);
232 return 1;
233 }
234 }
235
236 r->l_start = laddr;
237 r->l_end = laddr_end;
238
239 list_add_tail(&r->list, &rrpc->inflights.reqs);
240 spin_unlock_irq(&rrpc->inflights.lock);
241 return 0;
242}
243
244static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
245 unsigned int pages,
246 struct rrpc_inflight_rq *r)
247{
248 BUG_ON((laddr + pages) > rrpc->nr_sects);
249
250 return __rrpc_lock_laddr(rrpc, laddr, pages, r);
251}
252
253static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd)
254{
255 struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
256
257 return &rrqd->inflight_rq;
258}
259
260static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
261 struct nvm_rq *rqd)
262{
263 sector_t laddr = rrpc_get_laddr(bio);
264 unsigned int pages = rrpc_get_pages(bio);
265 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
266
267 return rrpc_lock_laddr(rrpc, laddr, pages, r);
268}
269
270static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
271 struct rrpc_inflight_rq *r)
272{
273 unsigned long flags;
274
275 spin_lock_irqsave(&rrpc->inflights.lock, flags);
276 list_del_init(&r->list);
277 spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
278}
279
280static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
281{
282 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
283 uint8_t pages = rqd->nr_ppas;
284
285 BUG_ON((r->l_start + pages) > rrpc->nr_sects);
286
287 rrpc_unlock_laddr(rrpc, r);
288}
289
290#endif /* RRPC_H_ */
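
The inflight-request tracking in the removed rrpc.h reduces to an inclusive interval-overlap test: a new request has to back off whenever its LBA range intersects one already in flight. A minimal user-space version of just that predicate follows (the inflight list and spinlock are left out; the struct is a simplified stand-in for rrpc_inflight_rq).

#include <assert.h>
#include <stdint.h>

struct inflight_rq {
	uint64_t l_start;
	uint64_t l_end;	/* inclusive, as in rrpc_inflight_rq */
};

/* Two closed ranges overlap iff neither lies entirely before the other. */
static int request_intersects(const struct inflight_rq *r,
			      uint64_t laddr_start, uint64_t laddr_end)
{
	return laddr_end >= r->l_start && laddr_start <= r->l_end;
}

int main(void)
{
	struct inflight_rq r = { .l_start = 100, .l_end = 107 }; /* 8 pages */

	assert(request_intersects(&r, 104, 111));  /* tail overlap -> must wait */
	assert(!request_intersects(&r, 108, 115)); /* adjacent, no overlap */
	return 0;
}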
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc6d884..6cc6c0f9c3a9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -525,15 +525,21 @@ struct open_bucket {
525 525
526/* 526/*
527 * We keep multiple buckets open for writes, and try to segregate different 527 * We keep multiple buckets open for writes, and try to segregate different
528 * write streams for better cache utilization: first we look for a bucket where 528 * write streams for better cache utilization: first we try to segregate flash
529 * the last write to it was sequential with the current write, and failing that 529 * only volume write streams from cached devices, secondly we look for a bucket
530 * we look for a bucket that was last used by the same task. 530 * where the last write to it was sequential with the current write, and
531 * failing that we look for a bucket that was last used by the same task.
531 * 532 *
532 * The ideas is if you've got multiple tasks pulling data into the cache at the 533 * The ideas is if you've got multiple tasks pulling data into the cache at the
533 * same time, you'll get better cache utilization if you try to segregate their 534 * same time, you'll get better cache utilization if you try to segregate their
534 * data and preserve locality. 535 * data and preserve locality.
535 * 536 *
536 * For example, say you've starting Firefox at the same time you're copying a 537 * For example, dirty sectors of flash only volume is not reclaimable, if their
538 * dirty sectors mixed with dirty sectors of cached device, such buckets will
539 * be marked as dirty and won't be reclaimed, though the dirty data of cached
540 * device have been written back to backend device.
541 *
542 * And say you've starting Firefox at the same time you're copying a
537 * bunch of files. Firefox will likely end up being fairly hot and stay in the 543 * bunch of files. Firefox will likely end up being fairly hot and stay in the
538 * cache awhile, but the data you copied might not be; if you wrote all that 544 * cache awhile, but the data you copied might not be; if you wrote all that
539 * data to the same buckets it'd get invalidated at the same time. 545 * data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
550 struct open_bucket *ret, *ret_task = NULL; 556 struct open_bucket *ret, *ret_task = NULL;
551 557
552 list_for_each_entry_reverse(ret, &c->data_buckets, list) 558 list_for_each_entry_reverse(ret, &c->data_buckets, list)
553 if (!bkey_cmp(&ret->key, search)) 559 if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
560 UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
561 continue;
562 else if (!bkey_cmp(&ret->key, search))
554 goto found; 563 goto found;
555 else if (ret->last_write_point == write_point) 564 else if (ret->last_write_point == write_point)
556 ret_task = ret; 565 ret_task = ret;
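The new test in pick_data_bucket() above skips any open bucket whose owning device's flash-only flag differs from that of the incoming key, so flash-only volume writes never share a bucket with cached-device writes. Below is a minimal userspace sketch of the resulting selection order; struct open_bucket and pick_bucket() here are simplified stand-ins, not the kernel's bkey/UUID machinery.

#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-in for the kernel's open_bucket / bkey machinery. */
struct open_bucket {
	bool flash_only;              /* owner is a flash-only volume            */
	unsigned long last_write_point;
	unsigned long key;            /* last written offset, stands in for bkey */
};

/*
 * Walk candidate buckets in MRU order: never mix flash-only and
 * cached-device streams, prefer a bucket whose last write is contiguous
 * with this one, and fall back to a bucket last used by the same task.
 */
static struct open_bucket *pick_bucket(struct open_bucket *b, size_t n,
				       bool flash_only, unsigned long key,
				       unsigned long write_point)
{
	struct open_bucket *same_task = NULL;

	for (size_t i = 0; i < n; i++) {
		if (b[i].flash_only != flash_only)
			continue;                /* the new segregation rule    */
		if (b[i].key == key)
			return &b[i];            /* sequential with last write  */
		if (b[i].last_write_point == write_point)
			same_task = &b[i];
	}
	return same_task;                        /* may be NULL: allocate fresh */
}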
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..5e2d4e80198e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,14 +320,15 @@ struct cached_dev {
320 */ 320 */
321 atomic_t has_dirty; 321 atomic_t has_dirty;
322 322
323 struct bch_ratelimit writeback_rate;
324 struct delayed_work writeback_rate_update;
325
326 /* 323 /*
327 * Internal to the writeback code, so read_dirty() can keep track of 324 * Set to zero by things that touch the backing volume-- except
328 * where it's at. 325 * writeback. Incremented by writeback. Used to determine when to
326 * accelerate idle writeback.
329 */ 327 */
330 sector_t last_read; 328 atomic_t backing_idle;
329
330 struct bch_ratelimit writeback_rate;
331 struct delayed_work writeback_rate_update;
331 332
332 /* Limit number of writeback bios in flight */ 333 /* Limit number of writeback bios in flight */
333 struct semaphore in_flight; 334 struct semaphore in_flight;
@@ -336,6 +337,14 @@ struct cached_dev {
336 337
337 struct keybuf writeback_keys; 338 struct keybuf writeback_keys;
338 339
340 /*
341 * Order the write-half of writeback operations strongly in dispatch
342 * order. (Maintain LBA order; don't allow reads completing out of
343 * order to re-order the writes...)
344 */
345 struct closure_waitlist writeback_ordering_wait;
346 atomic_t writeback_sequence_next;
347
339 /* For tracking sequential IO */ 348 /* For tracking sequential IO */
340#define RECENT_IO_BITS 7 349#define RECENT_IO_BITS 7
341#define RECENT_IO (1 << RECENT_IO_BITS) 350#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
488 int caches_loaded; 497 int caches_loaded;
489 498
490 struct bcache_device **devices; 499 struct bcache_device **devices;
500 unsigned devices_max_used;
491 struct list_head cached_devs; 501 struct list_head cached_devs;
492 uint64_t cached_dev_sectors; 502 uint64_t cached_dev_sectors;
493 struct closure caching; 503 struct closure caching;
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
852 862
853/* Forward declarations */ 863/* Forward declarations */
854 864
855void bch_count_io_errors(struct cache *, blk_status_t, const char *); 865void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
856void bch_bbio_count_io_errors(struct cache_set *, struct bio *, 866void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
857 blk_status_t, const char *); 867 blk_status_t, const char *);
858void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, 868void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
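The new backing_idle counter in struct cached_dev is cleared by anything that sends a request to the backing device and bumped once per writeback pass, which lets writeback notice an idle backing disk and speed up. A rough sketch of that pattern with C11 atomics; the function names are illustrative, and the threshold of 3 is taken from the writeback.c changes later in this series.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int backing_idle;

/* Called on every foreground request to the backing device. */
static void note_foreground_io(void)
{
	atomic_store(&backing_idle, 0);
}

/*
 * Called once per writeback pass; after several passes with no foreground
 * I/O in between, the caller may drop its delay and write back at full speed.
 */
static bool backing_device_idle(void)
{
	return atomic_fetch_add(&backing_idle, 1) + 1 >= 3;
}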
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3dbe5e..bf3a48aa9a9a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i)); 420 bset_sector_offset(&b->keys, i));
421 421
422 if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 422 if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
423 int j; 423 int j;
424 struct bio_vec *bv; 424 struct bio_vec *bv;
425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
432 432
433 continue_at(cl, btree_node_write_done, NULL); 433 continue_at(cl, btree_node_write_done, NULL);
434 } else { 434 } else {
435 /* No problem for multipage bvec since the bio is just allocated */
435 b->bio->bi_vcnt = 0; 436 b->bio->bi_vcnt = 0;
436 bch_bio_map(b->bio, i); 437 bch_bio_map(b->bio, i);
437 438
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
1678 1679
1679 /* don't reclaim buckets to which writeback keys point */ 1680 /* don't reclaim buckets to which writeback keys point */
1680 rcu_read_lock(); 1681 rcu_read_lock();
1681 for (i = 0; i < c->nr_uuids; i++) { 1682 for (i = 0; i < c->devices_max_used; i++) {
1682 struct bcache_device *d = c->devices[i]; 1683 struct bcache_device *d = c->devices[i];
1683 struct cached_dev *dc; 1684 struct cached_dev *dc;
1684 struct keybuf_key *w, *n; 1685 struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
1803int bch_gc_thread_start(struct cache_set *c) 1804int bch_gc_thread_start(struct cache_set *c)
1804{ 1805{
1805 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); 1806 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1806 if (IS_ERR(c->gc_thread)) 1807 return PTR_ERR_OR_ZERO(c->gc_thread);
1807 return PTR_ERR(c->gc_thread);
1808
1809 return 0;
1810} 1808}
1811 1809
1812/* Initial partial gc */ 1810/* Initial partial gc */
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 1841d0359bac..7f12920c14f7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -8,6 +8,7 @@
8#include <linux/debugfs.h> 8#include <linux/debugfs.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/sched/debug.h>
11 12
12#include "closure.h" 13#include "closure.h"
13 14
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
18 BUG_ON(flags & CLOSURE_GUARD_MASK); 19 BUG_ON(flags & CLOSURE_GUARD_MASK);
19 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); 20 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
20 21
21 /* Must deliver precisely one wakeup */
22 if (r == 1 && (flags & CLOSURE_SLEEPING))
23 wake_up_process(cl->task);
24
25 if (!r) { 22 if (!r) {
26 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { 23 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
27 atomic_set(&cl->remaining, 24 atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
100} 97}
101EXPORT_SYMBOL(closure_wait); 98EXPORT_SYMBOL(closure_wait);
102 99
103/** 100struct closure_syncer {
104 * closure_sync - sleep until a closure has nothing left to wait on 101 struct task_struct *task;
105 * 102 int done;
106 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 103};
107 * the last refcount. 104
108 */ 105static void closure_sync_fn(struct closure *cl)
109void closure_sync(struct closure *cl)
110{ 106{
111 while (1) { 107 cl->s->done = 1;
112 __closure_start_sleep(cl); 108 wake_up_process(cl->s->task);
113 closure_set_ret_ip(cl); 109}
114 110
115 if ((atomic_read(&cl->remaining) & 111void __sched __closure_sync(struct closure *cl)
116 CLOSURE_REMAINING_MASK) == 1) 112{
117 break; 113 struct closure_syncer s = { .task = current };
118 114
115 cl->s = &s;
116 continue_at(cl, closure_sync_fn, NULL);
117
118 while (1) {
119 set_current_state(TASK_UNINTERRUPTIBLE);
120 if (s.done)
121 break;
119 schedule(); 122 schedule();
120 } 123 }
121 124
122 __closure_end_sleep(cl); 125 __set_current_state(TASK_RUNNING);
123} 126}
124EXPORT_SYMBOL(closure_sync); 127EXPORT_SYMBOL(__closure_sync);
125 128
126#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 129#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
127 130
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
168 cl, (void *) cl->ip, cl->fn, cl->parent, 171 cl, (void *) cl->ip, cl->fn, cl->parent,
169 r & CLOSURE_REMAINING_MASK); 172 r & CLOSURE_REMAINING_MASK);
170 173
171 seq_printf(f, "%s%s%s%s\n", 174 seq_printf(f, "%s%s\n",
172 test_bit(WORK_STRUCT_PENDING_BIT, 175 test_bit(WORK_STRUCT_PENDING_BIT,
173 work_data_bits(&cl->work)) ? "Q" : "", 176 work_data_bits(&cl->work)) ? "Q" : "",
174 r & CLOSURE_RUNNING ? "R" : "", 177 r & CLOSURE_RUNNING ? "R" : "");
175 r & CLOSURE_STACK ? "S" : "",
176 r & CLOSURE_SLEEPING ? "Sl" : "");
177 178
178 if (r & CLOSURE_WAITING) 179 if (r & CLOSURE_WAITING)
179 seq_printf(f, " W %pF\n", 180 seq_printf(f, " W %pF\n",
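The rewritten closure_sync() drops the CLOSURE_SLEEPING flag: the waiter parks an on-stack closure_syncer holding its task and a done flag, the final reference drop runs closure_sync_fn() which sets done and wakes the task, and the waiter loops until done. A userspace analogue of the same handshake, sketched with pthreads rather than the kernel's scheduler primitives:

#include <pthread.h>
#include <stdbool.h>

/* Userspace stand-in for struct closure_syncer: a done flag plus a way to
 * wake the single waiting thread. */
struct syncer {
	pthread_mutex_t lock;
	pthread_cond_t  wake;
	bool done;
};

#define SYNCER_INIT { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false }

/* Runs in whatever context drops the last reference (closure_sync_fn's role). */
static void syncer_finish(struct syncer *s)
{
	pthread_mutex_lock(&s->lock);
	s->done = true;
	pthread_cond_signal(&s->wake);        /* wake_up_process() analogue */
	pthread_mutex_unlock(&s->lock);
}

/* Runs in the thread that called closure_sync(): block until finished. */
static void syncer_wait(struct syncer *s)
{
	pthread_mutex_lock(&s->lock);
	while (!s->done)                      /* the TASK_UNINTERRUPTIBLE loop */
		pthread_cond_wait(&s->wake, &s->lock);
	pthread_mutex_unlock(&s->lock);
}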
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ccfbea6f9f6b..3b9dfc9962ad 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -103,6 +103,7 @@
103 */ 103 */
104 104
105struct closure; 105struct closure;
106struct closure_syncer;
106typedef void (closure_fn) (struct closure *); 107typedef void (closure_fn) (struct closure *);
107 108
108struct closure_waitlist { 109struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
115 * the thread that owns the closure, and cleared by the thread that's 116 * the thread that owns the closure, and cleared by the thread that's
116 * waking up the closure. 117 * waking up the closure.
117 * 118 *
118 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
119 * - indicates that cl->task is valid and closure_put() may wake it up.
120 * Only set or cleared by the thread that owns the closure.
121 *
122 * The rest are for debugging and don't affect behaviour: 119 * The rest are for debugging and don't affect behaviour:
123 * 120 *
124 * CLOSURE_RUNNING: Set when a closure is running (i.e. by 121 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
128 * continue_at() and closure_return() clear it for you, if you're doing 125 * continue_at() and closure_return() clear it for you, if you're doing
129 * something unusual you can use closure_set_dead() which also helps 126 * something unusual you can use closure_set_dead() which also helps
130 * annotate where references are being transferred. 127 * annotate where references are being transferred.
131 *
132 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
133 * closure with this flag set
134 */ 128 */
135 129
136 CLOSURE_BITS_START = (1 << 23), 130 CLOSURE_BITS_START = (1U << 26),
137 CLOSURE_DESTRUCTOR = (1 << 23), 131 CLOSURE_DESTRUCTOR = (1U << 26),
138 CLOSURE_WAITING = (1 << 25), 132 CLOSURE_WAITING = (1U << 28),
139 CLOSURE_SLEEPING = (1 << 27), 133 CLOSURE_RUNNING = (1U << 30),
140 CLOSURE_RUNNING = (1 << 29),
141 CLOSURE_STACK = (1 << 31),
142}; 134};
143 135
144#define CLOSURE_GUARD_MASK \ 136#define CLOSURE_GUARD_MASK \
145 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ 137 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
146 CLOSURE_RUNNING|CLOSURE_STACK) << 1)
147 138
148#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) 139#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
149#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) 140#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
152 union { 143 union {
153 struct { 144 struct {
154 struct workqueue_struct *wq; 145 struct workqueue_struct *wq;
155 struct task_struct *task; 146 struct closure_syncer *s;
156 struct llist_node list; 147 struct llist_node list;
157 closure_fn *fn; 148 closure_fn *fn;
158 }; 149 };
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
178void closure_put(struct closure *cl); 169void closure_put(struct closure *cl);
179void __closure_wake_up(struct closure_waitlist *list); 170void __closure_wake_up(struct closure_waitlist *list);
180bool closure_wait(struct closure_waitlist *list, struct closure *cl); 171bool closure_wait(struct closure_waitlist *list, struct closure *cl);
181void closure_sync(struct closure *cl); 172void __closure_sync(struct closure *cl);
173
174/**
175 * closure_sync - sleep until a closure has nothing left to wait on
176 *
177 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
178 * the last refcount.
179 */
180static inline void closure_sync(struct closure *cl)
181{
182 if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
183 __closure_sync(cl);
184}
182 185
183#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 186#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
184 187
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
215#endif 218#endif
216} 219}
217 220
218static inline void __closure_end_sleep(struct closure *cl)
219{
220 __set_current_state(TASK_RUNNING);
221
222 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
223 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
224}
225
226static inline void __closure_start_sleep(struct closure *cl)
227{
228 closure_set_ip(cl);
229 cl->task = current;
230 set_current_state(TASK_UNINTERRUPTIBLE);
231
232 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
233 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
234}
235
236static inline void closure_set_stopped(struct closure *cl) 221static inline void closure_set_stopped(struct closure *cl)
237{ 222{
238 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 223 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
241static inline void set_closure_fn(struct closure *cl, closure_fn *fn, 226static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
242 struct workqueue_struct *wq) 227 struct workqueue_struct *wq)
243{ 228{
244 BUG_ON(object_is_on_stack(cl));
245 closure_set_ip(cl); 229 closure_set_ip(cl);
246 cl->fn = fn; 230 cl->fn = fn;
247 cl->wq = wq; 231 cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
300static inline void closure_init_stack(struct closure *cl) 284static inline void closure_init_stack(struct closure *cl)
301{ 285{
302 memset(cl, 0, sizeof(struct closure)); 286 memset(cl, 0, sizeof(struct closure));
303 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 287 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
304} 288}
305 289
306/** 290/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
322 * This is because after calling continue_at() you no longer have a ref on @cl, 306 * This is because after calling continue_at() you no longer have a ref on @cl,
323 * and whatever @cl owns may be freed out from under you - a running closure fn 307 * and whatever @cl owns may be freed out from under you - a running closure fn
324 * has a ref on its own closure which continue_at() drops. 308 * has a ref on its own closure which continue_at() drops.
309 *
310 * Note you are expected to immediately return after using this macro.
325 */ 311 */
326#define continue_at(_cl, _fn, _wq) \ 312#define continue_at(_cl, _fn, _wq) \
327do { \ 313do { \
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c7a02c4900da..af89408befe8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
116 return; 116 return;
117 check->bi_opf = REQ_OP_READ; 117 check->bi_opf = REQ_OP_READ;
118 118
119 if (bio_alloc_pages(check, GFP_NOIO)) 119 if (bch_bio_alloc_pages(check, GFP_NOIO))
120 goto out_put; 120 goto out_put;
121 121
122 submit_bio_wait(check); 122 submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
251 251
252int __init bch_debug_init(struct kobject *kobj) 252int __init bch_debug_init(struct kobject *kobj)
253{ 253{
254 int ret = 0;
255
256 debug = debugfs_create_dir("bcache", NULL); 254 debug = debugfs_create_dir("bcache", NULL);
257 return ret; 255
256 return IS_ERR_OR_NULL(debug);
258} 257}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fac97ec2d0e2..a783c5a41ff1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
51 51
52/* IO errors */ 52/* IO errors */
53 53
54void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) 54void bch_count_io_errors(struct cache *ca,
55 blk_status_t error,
56 int is_read,
57 const char *m)
55{ 58{
56 /* 59 /*
57 * The halflife of an error is: 60 * The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
94 errors >>= IO_ERROR_SHIFT; 97 errors >>= IO_ERROR_SHIFT;
95 98
96 if (errors < ca->set->error_limit) 99 if (errors < ca->set->error_limit)
97 pr_err("%s: IO error on %s, recovering", 100 pr_err("%s: IO error on %s%s",
98 bdevname(ca->bdev, buf), m); 101 bdevname(ca->bdev, buf), m,
102 is_read ? ", recovering." : ".");
99 else 103 else
100 bch_cache_set_error(ca->set, 104 bch_cache_set_error(ca->set,
101 "%s: too many IO errors %s", 105 "%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
108{ 112{
109 struct bbio *b = container_of(bio, struct bbio, bio); 113 struct bbio *b = container_of(bio, struct bbio, bio);
110 struct cache *ca = PTR_CACHE(c, &b->key, 0); 114 struct cache *ca = PTR_CACHE(c, &b->key, 0);
115 int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
111 116
112 unsigned threshold = op_is_write(bio_op(bio)) 117 unsigned threshold = op_is_write(bio_op(bio))
113 ? c->congested_write_threshold_us 118 ? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
129 atomic_inc(&c->congested); 134 atomic_inc(&c->congested);
130 } 135 }
131 136
132 bch_count_io_errors(ca, error, m); 137 bch_count_io_errors(ca, error, is_read, m);
133} 138}
134 139
135void bch_bbio_endio(struct cache_set *c, struct bio *bio, 140void bch_bbio_endio(struct cache_set *c, struct bio *bio,
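bch_count_io_errors() now takes an is_read flag purely so the log message is honest: only failed reads can be transparently recovered (served from the backing device); failed writes cannot. A toy version of that reporting decision, with a plain counter standing in for the kernel's decaying error count:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: counts I/O errors on one cache device and decides
 * whether caching can keep going. */
struct cache_err {
	unsigned errors;
	unsigned error_limit;
};

static bool count_io_error(struct cache_err *ca, const char *what, bool is_read)
{
	if (++ca->errors < ca->error_limit) {
		/* Reads can fall back to the backing device, so only they
		 * are reported as recoverable. */
		fprintf(stderr, "cache: IO error %s%s\n", what,
			is_read ? ", recovering." : ".");
		return true;
	}
	fprintf(stderr, "cache: too many IO errors %s, disabling caching\n", what);
	return false;
}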
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d50c1c97da68..a24c3a95b2c0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
162 bio_set_op_attrs(bio, REQ_OP_READ, 0); 162 bio_set_op_attrs(bio, REQ_OP_READ, 0);
163 bio->bi_end_io = read_moving_endio; 163 bio->bi_end_io = read_moving_endio;
164 164
165 if (bio_alloc_pages(bio, GFP_KERNEL)) 165 if (bch_bio_alloc_pages(bio, GFP_KERNEL))
166 goto err; 166 goto err;
167 167
168 trace_bcache_gc_copy(&w->key); 168 trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021624f..1a46b41dac70 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
576{ 576{
577 struct search *s = container_of(cl, struct search, iop.cl); 577 struct search *s = container_of(cl, struct search, iop.cl);
578 struct bio *bio = &s->bio.bio; 578 struct bio *bio = &s->bio.bio;
579 struct cached_dev *dc;
579 int ret; 580 int ret;
580 581
581 bch_btree_op_init(&s->op, -1); 582 bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
588 return; 589 return;
589 } 590 }
590 591
592 /*
 593 * We might hit an error while searching the btree; if that happens,
 594 * ret is negative. In that case we must not recover data from the
 595 * backing device (when the cache device is dirty), because we don't
 596 * know whether all the bkeys the read request covers are clean.
 597 *
 598 * After such an error, s->iop.status still holds its initial value
 599 * from before s->bio.bio was submitted.
600 */
601 if (ret < 0) {
602 BUG_ON(ret == -EINTR);
603 if (s->d && s->d->c &&
604 !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
605 dc = container_of(s->d, struct cached_dev, disk);
606 if (dc && atomic_read(&dc->has_dirty))
607 s->recoverable = false;
608 }
609 if (!s->iop.status)
610 s->iop.status = BLK_STS_IOERR;
611 }
612
591 closure_return(cl); 613 closure_return(cl);
592} 614}
593 615
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
611static void bio_complete(struct search *s) 633static void bio_complete(struct search *s)
612{ 634{
613 if (s->orig_bio) { 635 if (s->orig_bio) {
614 struct request_queue *q = s->orig_bio->bi_disk->queue; 636 generic_end_io_acct(s->d->disk->queue,
615 generic_end_io_acct(q, bio_data_dir(s->orig_bio), 637 bio_data_dir(s->orig_bio),
616 &s->d->disk->part0, s->start_time); 638 &s->d->disk->part0, s->start_time);
617 639
618 trace_bcache_request_end(s->d, s->orig_bio); 640 trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
841 cache_bio->bi_private = &s->cl; 863 cache_bio->bi_private = &s->cl;
842 864
843 bch_bio_map(cache_bio, NULL); 865 bch_bio_map(cache_bio, NULL);
844 if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) 866 if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
845 goto out_put; 867 goto out_put;
846 868
847 if (reada) 869 if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
974 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 996 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
975 int rw = bio_data_dir(bio); 997 int rw = bio_data_dir(bio);
976 998
999 atomic_set(&dc->backing_idle, 0);
977 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); 1000 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
978 1001
979 bio_set_dev(bio, dc->bdev); 1002 bio_set_dev(bio, dc->bdev);
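The new error path in cache_lookup() refuses to "recover" a read from the backing device when the btree search itself failed on a non-flash-only device holding dirty data, since stale data could be returned; it also forces an I/O error status if none was set. A simplified sketch of that decision, with hypothetical field names in place of the kernel's search/cached_dev structures:

#include <stdbool.h>

/* Placeholder status codes standing in for the kernel's blk_status_t. */
enum sts { STS_OK = 0, STS_IOERR };

struct lookup_state {
	bool flash_only;    /* device is a flash-only volume                 */
	bool has_dirty;     /* the cache holds dirty data for this device    */
	bool recoverable;   /* may the read be retried on the backing device */
	enum sts status;
};

/*
 * Mirrors the new error path in cache_lookup(): if the btree search itself
 * failed on a dirty, non-flash-only device, retrying the read against the
 * backing device could return stale data, so mark it unrecoverable and make
 * sure an error status is reported.
 */
static void note_lookup_error(struct lookup_state *s, int ret)
{
	if (ret >= 0)
		return;
	if (!s->flash_only && s->has_dirty)
		s->recoverable = false;
	if (s->status == STS_OK)
		s->status = STS_IOERR;
}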
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4d28928dec5..133b81225ea9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
211 211
212static void __write_super(struct cache_sb *sb, struct bio *bio) 212static void __write_super(struct cache_sb *sb, struct bio *bio)
213{ 213{
214 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); 214 struct cache_sb *out = page_address(bio_first_page_all(bio));
215 unsigned i; 215 unsigned i;
216 216
217 bio->bi_iter.bi_sector = SB_SECTOR; 217 bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
274{ 274{
275 struct cache *ca = bio->bi_private; 275 struct cache *ca = bio->bi_private;
276 276
277 bch_count_io_errors(ca, bio->bi_status, "writing superblock"); 277 /* is_read = 0 */
278 bch_count_io_errors(ca, bio->bi_status, 0,
279 "writing superblock");
278 closure_put(&ca->set->sb_write); 280 closure_put(&ca->set->sb_write);
279} 281}
280 282
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
721 d->c = c; 723 d->c = c;
722 c->devices[id] = d; 724 c->devices[id] = d;
723 725
726 if (id >= c->devices_max_used)
727 c->devices_max_used = id + 1;
728
724 closure_get(&c->caching); 729 closure_get(&c->caching);
725} 730}
726 731
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
906 911
907 mutex_lock(&bch_register_lock); 912 mutex_lock(&bch_register_lock);
908 913
914 cancel_delayed_work_sync(&dc->writeback_rate_update);
915 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
916 kthread_stop(dc->writeback_thread);
917 dc->writeback_thread = NULL;
918 }
919
909 memset(&dc->sb.set_uuid, 0, 16); 920 memset(&dc->sb.set_uuid, 0, 16);
910 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); 921 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
911 922
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1166 dc->bdev->bd_holder = dc; 1177 dc->bdev->bd_holder = dc;
1167 1178
1168 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); 1179 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1169 dc->sb_bio.bi_io_vec[0].bv_page = sb_page; 1180 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1170 get_page(sb_page); 1181 get_page(sb_page);
1171 1182
1172 if (cached_dev_init(dc, sb->block_size << 9)) 1183 if (cached_dev_init(dc, sb->block_size << 9))
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
1261 struct uuid_entry *u; 1272 struct uuid_entry *u;
1262 1273
1263 for (u = c->uuids; 1274 for (u = c->uuids;
1264 u < c->uuids + c->nr_uuids && !ret; 1275 u < c->uuids + c->devices_max_used && !ret;
1265 u++) 1276 u++)
1266 if (UUID_FLASH_ONLY(u)) 1277 if (UUID_FLASH_ONLY(u))
1267 ret = flash_dev_run(c, u); 1278 ret = flash_dev_run(c, u);
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
1427 1438
1428 mutex_lock(&bch_register_lock); 1439 mutex_lock(&bch_register_lock);
1429 1440
1430 for (i = 0; i < c->nr_uuids; i++) 1441 for (i = 0; i < c->devices_max_used; i++)
1431 if (c->devices[i]) { 1442 if (c->devices[i]) {
1432 if (!UUID_FLASH_ONLY(&c->uuids[i]) && 1443 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1433 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { 1444 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1490 c->bucket_bits = ilog2(sb->bucket_size); 1501 c->bucket_bits = ilog2(sb->bucket_size);
1491 c->block_bits = ilog2(sb->block_size); 1502 c->block_bits = ilog2(sb->block_size);
1492 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1503 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1493 1504 c->devices_max_used = 0;
1494 c->btree_pages = bucket_pages(c); 1505 c->btree_pages = bucket_pages(c);
1495 if (c->btree_pages > BTREE_MAX_PAGES) 1506 if (c->btree_pages > BTREE_MAX_PAGES)
1496 c->btree_pages = max_t(int, c->btree_pages / 4, 1507 c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
1810 free_fifo(&ca->free[i]); 1821 free_fifo(&ca->free[i]);
1811 1822
1812 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1823 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1813 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1824 put_page(bio_first_page_all(&ca->sb_bio));
1814 1825
1815 if (!IS_ERR_OR_NULL(ca->bdev)) 1826 if (!IS_ERR_OR_NULL(ca->bdev))
1816 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1827 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
1864 ca->bdev->bd_holder = ca; 1875 ca->bdev->bd_holder = ca;
1865 1876
1866 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); 1877 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
1867 ca->sb_bio.bi_io_vec[0].bv_page = sb_page; 1878 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
1868 get_page(sb_page); 1879 get_page(sb_page);
1869 1880
1870 if (blk_queue_discard(bdev_get_queue(ca->bdev))) 1881 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e548b8b51322..a23cd6a14b74 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
249 : 0; 249 : 0;
250} 250}
251 251
252/*
253 * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
254 * the preferred way is bio_add_page, but in this case, bch_bio_map()
255 * assumes that the bvec table is empty, so it is safe to access
256 * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
257 * supported.
258 */
252void bch_bio_map(struct bio *bio, void *base) 259void bch_bio_map(struct bio *bio, void *base)
253{ 260{
254 size_t size = bio->bi_iter.bi_size; 261 size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
276 } 283 }
277} 284}
278 285
286/**
287 * bch_bio_alloc_pages - allocates a single page for each bvec in a bio
288 * @bio: bio to allocate pages for
289 * @gfp_mask: flags for allocation
290 *
291 * Allocates pages up to @bio->bi_vcnt.
292 *
293 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
294 * freed.
295 */
296int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
297{
298 int i;
299 struct bio_vec *bv;
300
301 bio_for_each_segment_all(bv, bio, i) {
302 bv->bv_page = alloc_page(gfp_mask);
303 if (!bv->bv_page) {
304 while (--bv >= bio->bi_io_vec)
305 __free_page(bv->bv_page);
306 return -ENOMEM;
307 }
308 }
309
310 return 0;
311}
312
279/* 313/*
280 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 314 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
281 * use permitted, subject to terms of PostgreSQL license; see.) 315 * use permitted, subject to terms of PostgreSQL license; see.)
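bch_bio_alloc_pages() above allocates one page per bvec and, on failure, walks backwards through the table freeing everything it already allocated before returning -ENOMEM. The same unwind-on-failure idiom in plain C, with malloc standing in for alloc_page:

#include <stdlib.h>

/* Allocate one buffer per slot; on failure free what was already allocated
 * and report the failure, just as bch_bio_alloc_pages() does with its bvec
 * table. */
static int alloc_all(void **bufs, size_t n, size_t size)
{
	for (size_t i = 0; i < n; i++) {
		bufs[i] = malloc(size);
		if (!bufs[i]) {
			while (i--)
				free(bufs[i]);
			return -1;   /* -ENOMEM in the kernel version */
		}
	}
	return 0;
}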
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ed5e8a412eb8..4df4c5c1cab2 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
558} 558}
559 559
560void bch_bio_map(struct bio *bio, void *base); 560void bch_bio_map(struct bio *bio, void *base);
561int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
561 562
562static inline sector_t bdev_sectors(struct block_device *bdev) 563static inline sector_t bdev_sectors(struct block_device *bdev)
563{ 564{
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a37884ca8b..51306a19ab03 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -18,17 +18,39 @@
18#include <trace/events/bcache.h> 18#include <trace/events/bcache.h>
19 19
20/* Rate limiting */ 20/* Rate limiting */
21 21static uint64_t __calc_target_rate(struct cached_dev *dc)
22static void __update_writeback_rate(struct cached_dev *dc)
23{ 22{
24 struct cache_set *c = dc->disk.c; 23 struct cache_set *c = dc->disk.c;
24
25 /*
26 * This is the size of the cache, minus the amount used for
27 * flash-only devices
28 */
25 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - 29 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
26 bcache_flash_devs_sectors_dirty(c); 30 bcache_flash_devs_sectors_dirty(c);
31
32 /*
33 * Unfortunately there is no control of global dirty data. If the
34 * user states that they want 10% dirty data in the cache, and has,
 35 * e.g., 5 backing volumes of equal size, we try to ensure each
36 * backing volume uses about 2% of the cache for dirty data.
37 */
38 uint32_t bdev_share =
39 div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
40 c->cached_dev_sectors);
41
27 uint64_t cache_dirty_target = 42 uint64_t cache_dirty_target =
28 div_u64(cache_sectors * dc->writeback_percent, 100); 43 div_u64(cache_sectors * dc->writeback_percent, 100);
29 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
30 c->cached_dev_sectors);
31 44
45 /* Ensure each backing dev gets at least one dirty share */
46 if (bdev_share < 1)
47 bdev_share = 1;
48
49 return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
50}
51
52static void __update_writeback_rate(struct cached_dev *dc)
53{
32 /* 54 /*
33 * PI controller: 55 * PI controller:
34 * Figures out the amount that should be written per second. 56 * Figures out the amount that should be written per second.
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
49 * This acts as a slow, long-term average that is not subject to 71 * This acts as a slow, long-term average that is not subject to
50 * variations in usage like the p term. 72 * variations in usage like the p term.
51 */ 73 */
74 int64_t target = __calc_target_rate(dc);
52 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); 75 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
53 int64_t error = dirty - target; 76 int64_t error = dirty - target;
54 int64_t proportional_scaled = 77 int64_t proportional_scaled =
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
116struct dirty_io { 139struct dirty_io {
117 struct closure cl; 140 struct closure cl;
118 struct cached_dev *dc; 141 struct cached_dev *dc;
142 uint16_t sequence;
119 struct bio bio; 143 struct bio bio;
120}; 144};
121 145
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
194{ 218{
195 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 219 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
196 struct keybuf_key *w = io->bio.bi_private; 220 struct keybuf_key *w = io->bio.bi_private;
221 struct cached_dev *dc = io->dc;
222
223 uint16_t next_sequence;
224
225 if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
226 /* Not our turn to write; wait for a write to complete */
227 closure_wait(&dc->writeback_ordering_wait, cl);
228
229 if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
230 /*
231 * Edge case-- it happened in indeterminate order
232 * relative to when we were added to wait list..
233 */
234 closure_wake_up(&dc->writeback_ordering_wait);
235 }
236
237 continue_at(cl, write_dirty, io->dc->writeback_write_wq);
238 return;
239 }
240
241 next_sequence = io->sequence + 1;
197 242
198 /* 243 /*
199 * IO errors are signalled using the dirty bit on the key. 244 * IO errors are signalled using the dirty bit on the key.
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
211 closure_bio_submit(&io->bio, cl); 256 closure_bio_submit(&io->bio, cl);
212 } 257 }
213 258
259 atomic_set(&dc->writeback_sequence_next, next_sequence);
260 closure_wake_up(&dc->writeback_ordering_wait);
261
214 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); 262 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
215} 263}
216 264
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
219 struct keybuf_key *w = bio->bi_private; 267 struct keybuf_key *w = bio->bi_private;
220 struct dirty_io *io = w->private; 268 struct dirty_io *io = w->private;
221 269
270 /* is_read = 1 */
222 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), 271 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
223 bio->bi_status, "reading dirty data from cache"); 272 bio->bi_status, 1,
273 "reading dirty data from cache");
224 274
225 dirty_endio(bio); 275 dirty_endio(bio);
226} 276}
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
237static void read_dirty(struct cached_dev *dc) 287static void read_dirty(struct cached_dev *dc)
238{ 288{
239 unsigned delay = 0; 289 unsigned delay = 0;
240 struct keybuf_key *w; 290 struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
291 size_t size;
292 int nk, i;
241 struct dirty_io *io; 293 struct dirty_io *io;
242 struct closure cl; 294 struct closure cl;
295 uint16_t sequence = 0;
243 296
297 BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
298 atomic_set(&dc->writeback_sequence_next, sequence);
244 closure_init_stack(&cl); 299 closure_init_stack(&cl);
245 300
246 /* 301 /*
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
248 * mempools. 303 * mempools.
249 */ 304 */
250 305
251 while (!kthread_should_stop()) { 306 next = bch_keybuf_next(&dc->writeback_keys);
252 307
253 w = bch_keybuf_next(&dc->writeback_keys); 308 while (!kthread_should_stop() && next) {
254 if (!w) 309 size = 0;
255 break; 310 nk = 0;
256 311
257 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 312 do {
258 313 BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
259 if (KEY_START(&w->key) != dc->last_read || 314
260 jiffies_to_msecs(delay) > 50) 315 /*
261 while (!kthread_should_stop() && delay) 316 * Don't combine too many operations, even if they
262 delay = schedule_timeout_interruptible(delay); 317 * are all small.
263 318 */
264 dc->last_read = KEY_OFFSET(&w->key); 319 if (nk >= MAX_WRITEBACKS_IN_PASS)
265 320 break;
266 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) 321
267 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 322 /*
268 GFP_KERNEL); 323 * If the current operation is very large, don't
269 if (!io) 324 * further combine operations.
270 goto err; 325 */
271 326 if (size >= MAX_WRITESIZE_IN_PASS)
272 w->private = io; 327 break;
273 io->dc = dc; 328
274 329 /*
275 dirty_init(w); 330 * Operations are only eligible to be combined
276 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); 331 * if they are contiguous.
277 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); 332 *
278 bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); 333 * TODO: add a heuristic willing to fire a
279 io->bio.bi_end_io = read_dirty_endio; 334 * certain amount of non-contiguous IO per pass,
280 335 * so that we can benefit from backing device
281 if (bio_alloc_pages(&io->bio, GFP_KERNEL)) 336 * command queueing.
282 goto err_free; 337 */
338 if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
339 &START_KEY(&next->key)))
340 break;
341
342 size += KEY_SIZE(&next->key);
343 keys[nk++] = next;
344 } while ((next = bch_keybuf_next(&dc->writeback_keys)));
345
346 /* Now we have gathered a set of 1..5 keys to write back. */
347 for (i = 0; i < nk; i++) {
348 w = keys[i];
349
350 io = kzalloc(sizeof(struct dirty_io) +
351 sizeof(struct bio_vec) *
352 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
353 GFP_KERNEL);
354 if (!io)
355 goto err;
356
357 w->private = io;
358 io->dc = dc;
359 io->sequence = sequence++;
360
361 dirty_init(w);
362 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
363 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
364 bio_set_dev(&io->bio,
365 PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
366 io->bio.bi_end_io = read_dirty_endio;
367
368 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
369 goto err_free;
370
371 trace_bcache_writeback(&w->key);
372
373 down(&dc->in_flight);
374
375 /* We've acquired a semaphore for the maximum
376 * simultaneous number of writebacks; from here
377 * everything happens asynchronously.
378 */
379 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
380 }
283 381
284 trace_bcache_writeback(&w->key); 382 delay = writeback_delay(dc, size);
285 383
286 down(&dc->in_flight); 384 /* If the control system would wait for at least half a
287 closure_call(&io->cl, read_dirty_submit, NULL, &cl); 385 * second, and there's been no reqs hitting the backing disk
 386 * for a while: use an alternate mode where we have at most
387 * one contiguous set of writebacks in flight at a time. If
388 * someone wants to do IO it will be quick, as it will only
389 * have to contend with one operation in flight, and we'll
390 * be round-tripping data to the backing disk as quickly as
391 * it can accept it.
392 */
393 if (delay >= HZ / 2) {
394 /* 3 means at least 1.5 seconds, up to 7.5 if we
395 * have slowed way down.
396 */
397 if (atomic_inc_return(&dc->backing_idle) >= 3) {
398 /* Wait for current I/Os to finish */
399 closure_sync(&cl);
400 /* And immediately launch a new set. */
401 delay = 0;
402 }
403 }
288 404
289 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 405 while (!kthread_should_stop() && delay) {
406 schedule_timeout_interruptible(delay);
407 delay = writeback_delay(dc, 0);
408 }
290 } 409 }
291 410
292 if (0) { 411 if (0) {
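The writeback changes above stamp each dirty_io with a sequence number when it is read and only let the write proceed once writeback_sequence_next matches it, so writes reach the backing device in dispatch (LBA) order even though the cache reads complete out of order. A single-threaded sketch of that gate; the real code parks the closure on writeback_ordering_wait and is re-run from a workqueue rather than returning false:

#include <stdbool.h>

/* Illustrative model of the ordering gate: each writeback I/O carries the
 * sequence number it was read with, and only the I/O whose number equals
 * next_sequence may submit its write. */
struct wb_io {
	unsigned sequence;
	bool write_submitted;
};

static unsigned next_sequence;

/* Returns true if the write was dispatched, false if the caller must wait
 * for its turn (in the kernel, by parking on writeback_ordering_wait). */
static bool try_write_dirty(struct wb_io *io)
{
	if (io->sequence != next_sequence)
		return false;

	/* ... submit the write to the backing device here ... */
	io->write_submitted = true;

	/* Let the next I/O in dispatch order proceed. */
	next_sequence = io->sequence + 1;
	return true;
}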
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..66f1c527fa24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,16 @@
5#define CUTOFF_WRITEBACK 40 5#define CUTOFF_WRITEBACK 40
6#define CUTOFF_WRITEBACK_SYNC 70 6#define CUTOFF_WRITEBACK_SYNC 70
7 7
8#define MAX_WRITEBACKS_IN_PASS 5
9#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
10
11/*
 12 * A shift of 14 (shares counted in 16384ths) gives each backing device a
 13 * reasonably fine-grained fraction of the share, and the arithmetic does
 14 * not blow up until individual backing devices approach a petabyte.
15 */
16#define WRITEBACK_SHARE_SHIFT 14
17
8static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) 18static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
9{ 19{
10 uint64_t i, ret = 0; 20 uint64_t i, ret = 0;
@@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
21 31
22 mutex_lock(&bch_register_lock); 32 mutex_lock(&bch_register_lock);
23 33
24 for (i = 0; i < c->nr_uuids; i++) { 34 for (i = 0; i < c->devices_max_used; i++) {
25 struct bcache_device *d = c->devices[i]; 35 struct bcache_device *d = c->devices[i];
26 36
27 if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) 37 if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
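WRITEBACK_SHARE_SHIFT defined above is used by __calc_target_rate() (in the writeback.c hunk earlier) to split the global dirty target between backing devices in proportion to their size, with 14-bit fixed point (16384ths) so the ratio survives integer division. A standalone version of just that arithmetic, ignoring the flash-only correction and using made-up sizes:

#include <stdint.h>
#include <stdio.h>

#define WRITEBACK_SHARE_SHIFT 14

/*
 * Per-device dirty target: the whole-cache dirty target scaled by this
 * device's fraction of all cached sectors, expressed in 16384ths.
 */
static uint64_t calc_target(uint64_t cache_sectors, unsigned writeback_percent,
			    uint64_t bdev_sectors, uint64_t cached_dev_sectors)
{
	uint64_t cache_dirty_target = cache_sectors * writeback_percent / 100;
	uint64_t bdev_share = (bdev_sectors << WRITEBACK_SHARE_SHIFT) /
			      cached_dev_sectors;

	if (bdev_share < 1)          /* every device gets at least one share */
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}

int main(void)
{
	/* 1 TiB cache, 10% dirty target, one of five equal 4 TiB backing devs
	 * (all sizes in 512-byte sectors). */
	uint64_t t = calc_target(2048ULL << 20, 10,
				 8192ULL << 20, 5 * (8192ULL << 20));
	printf("per-device dirty target: %llu sectors\n", (unsigned long long)t);
	return 0;
}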
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 554d60394c06..2ad429100d25 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
1446 bio_for_each_segment_all(bv, clone, i) { 1446 bio_for_each_segment_all(bv, clone, i) {
1447 BUG_ON(!bv->bv_page); 1447 BUG_ON(!bv->bv_page);
1448 mempool_free(bv->bv_page, cc->page_pool); 1448 mempool_free(bv->bv_page, cc->page_pool);
1449 bv->bv_page = NULL;
1450 } 1449 }
1451} 1450}
1452 1451
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f7810cc869ac..ef57c6d1c887 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work)
1475 activate_or_offline_path(pgpath); 1475 activate_or_offline_path(pgpath);
1476} 1476}
1477 1477
1478static int noretry_error(blk_status_t error)
1479{
1480 switch (error) {
1481 case BLK_STS_NOTSUPP:
1482 case BLK_STS_NOSPC:
1483 case BLK_STS_TARGET:
1484 case BLK_STS_NEXUS:
1485 case BLK_STS_MEDIUM:
1486 return 1;
1487 }
1488
1489 /* Anything else could be a path failure, so should be retried */
1490 return 0;
1491}
1492
1493static int multipath_end_io(struct dm_target *ti, struct request *clone, 1478static int multipath_end_io(struct dm_target *ti, struct request *clone,
1494 blk_status_t error, union map_info *map_context) 1479 blk_status_t error, union map_info *map_context)
1495{ 1480{
@@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1508 * request into dm core, which will remake a clone request and 1493 * request into dm core, which will remake a clone request and
1509 * clone bios for it and resubmit it later. 1494 * clone bios for it and resubmit it later.
1510 */ 1495 */
1511 if (error && !noretry_error(error)) { 1496 if (error && blk_path_error(error)) {
1512 struct multipath *m = ti->private; 1497 struct multipath *m = ti->private;
1513 1498
1514 r = DM_ENDIO_REQUEUE; 1499 r = DM_ENDIO_REQUEUE;
@@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1544 unsigned long flags; 1529 unsigned long flags;
1545 int r = DM_ENDIO_DONE; 1530 int r = DM_ENDIO_DONE;
1546 1531
1547 if (!*error || noretry_error(*error)) 1532 if (!*error || !blk_path_error(*error))
1548 goto done; 1533 goto done;
1549 1534
1550 if (pgpath) 1535 if (pgpath)
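dm-mpath's private noretry_error() is replaced by the block layer's blk_path_error() helper: statuses that describe a target-side condition will fail identically on every path, so only the remaining statuses are worth retrying elsewhere. A sketch of the equivalent classification, mirroring the status list from the removed switch (placeholder names; the real helper is provided by the block layer):

#include <stdbool.h>

/* Placeholder codes standing in for the blk_status_t values in the switch. */
enum sts {
	STS_OK, STS_NOTSUPP, STS_NOSPC, STS_TARGET, STS_NEXUS, STS_MEDIUM,
	STS_TRANSPORT, STS_IOERR,
};

/*
 * Equivalent of the classification dm-mpath now gets from blk_path_error():
 * target-side conditions (unsupported op, no space, medium error, ...) will
 * fail the same way on every path, so only the rest merit a retry on
 * another path.
 */
static bool path_error(enum sts error)
{
	switch (error) {
	case STS_NOTSUPP:
	case STS_NOSPC:
	case STS_TARGET:
	case STS_NEXUS:
	case STS_MEDIUM:
		return false;
	default:
		return true;
	}
}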
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d32f25489c2..b7d175e94a02 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
395 dm_complete_request(tio->orig, error); 395 dm_complete_request(tio->orig, error);
396} 396}
397 397
398static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 398static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
399{ 399{
400 blk_status_t r; 400 blk_status_t r;
401 401
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
404 404
405 clone->start_time = jiffies; 405 clone->start_time = jiffies;
406 r = blk_insert_cloned_request(clone->q, clone); 406 r = blk_insert_cloned_request(clone->q, clone);
407 if (r) 407 if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
408 /* must complete clone in terms of original request */ 408 /* must complete clone in terms of original request */
409 dm_complete_request(rq, r); 409 dm_complete_request(rq, r);
410 return r;
410} 411}
411 412
412static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 413static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
476 struct mapped_device *md = tio->md; 477 struct mapped_device *md = tio->md;
477 struct request *rq = tio->orig; 478 struct request *rq = tio->orig;
478 struct request *clone = NULL; 479 struct request *clone = NULL;
480 blk_status_t ret;
479 481
480 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 482 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
483check_again:
481 switch (r) { 484 switch (r) {
482 case DM_MAPIO_SUBMITTED: 485 case DM_MAPIO_SUBMITTED:
483 /* The target has taken the I/O to submit by itself later */ 486 /* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
492 /* The target has remapped the I/O so dispatch it */ 495 /* The target has remapped the I/O so dispatch it */
493 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 496 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
494 blk_rq_pos(rq)); 497 blk_rq_pos(rq));
495 dm_dispatch_clone_request(clone, rq); 498 ret = dm_dispatch_clone_request(clone, rq);
499 if (ret == BLK_STS_RESOURCE) {
500 blk_rq_unprep_clone(clone);
501 tio->ti->type->release_clone_rq(clone);
502 tio->clone = NULL;
503 if (!rq->q->mq_ops)
504 r = DM_MAPIO_DELAY_REQUEUE;
505 else
506 r = DM_MAPIO_REQUEUE;
507 goto check_again;
508 }
496 break; 509 break;
497 case DM_MAPIO_REQUEUE: 510 case DM_MAPIO_REQUEUE:
498 /* The target wants to requeue the I/O */ 511 /* The target wants to requeue the I/O */
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
713 return error; 726 return error;
714 } 727 }
715 728
716 elv_register_queue(md->queue);
717
718 return 0; 729 return 0;
719} 730}
720 731
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
812 } 823 }
813 dm_init_md_queue(md); 824 dm_init_md_queue(md);
814 825
815 /* backfill 'mq' sysfs registration normally done in blk_register_queue */
816 err = blk_mq_register_dev(disk_to_dev(md->disk), q);
817 if (err)
818 goto out_cleanup_queue;
819
820 return 0; 826 return 0;
821 827
822out_cleanup_queue:
823 blk_cleanup_queue(q);
824out_tag_set: 828out_tag_set:
825 blk_mq_free_tag_set(md->tag_set); 829 blk_mq_free_tag_set(md->tag_set);
826out_kfree_tag_set: 830out_kfree_tag_set:
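map_request() now inspects the status returned by dm_dispatch_clone_request(): BLK_STS_RESOURCE means the underlying queue was merely busy, so the clone is released and the original request is requeued (with a delay on the legacy request path) instead of being completed with an error. A compact, hypothetical sketch of that control-flow decision, with simplified names:

/* Illustrative outcome codes for the mapping step. */
enum map_result { MAP_SUBMITTED, MAP_REQUEUE, MAP_DELAY_REQUEUE, MAP_KILL };
enum dispatch_sts { DISPATCH_OK, DISPATCH_BUSY, DISPATCH_ERROR };

/*
 * Mirrors the new check in map_request(): a busy underlying queue is not an
 * error, it just means "try again later"; only real errors are passed back
 * to the original request.
 */
static enum map_result handle_dispatch(enum dispatch_sts sts, int is_blk_mq)
{
	switch (sts) {
	case DISPATCH_OK:
		return MAP_SUBMITTED;
	case DISPATCH_BUSY:
		/* release the clone, then requeue the original request */
		return is_blk_mq ? MAP_REQUEUE : MAP_DELAY_REQUEUE;
	case DISPATCH_ERROR:
	default:
		return MAP_KILL;   /* the error reaches the original request */
	}
}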
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de17b7193299..8c26bfc35335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
920 return -EINVAL; 920 return -EINVAL;
921 } 921 }
922 922
923 ti->max_io_len = (uint32_t) len; 923 /*
924 * BIO based queue uses its own splitting. When multipage bvecs
925 * is switched on, size of the incoming bio may be too big to
926 * be handled in some targets, such as crypt.
927 *
928 * When these targets are ready for the big bio, we can remove
929 * the limit.
930 */
931 ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
924 932
925 return 0; 933 return 0;
926} 934}
@@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
1753 goto bad; 1761 goto bad;
1754 md->dax_dev = dax_dev; 1762 md->dax_dev = dax_dev;
1755 1763
1756 add_disk(md->disk); 1764 add_disk_no_queue_reg(md->disk);
1757 format_dev_t(md->name, MKDEV(_major, minor)); 1765 format_dev_t(md->name, MKDEV(_major, minor));
1758 1766
1759 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 1767 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2013int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2021int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2014{ 2022{
2015 int r; 2023 int r;
2024 struct queue_limits limits;
2016 enum dm_queue_mode type = dm_get_md_type(md); 2025 enum dm_queue_mode type = dm_get_md_type(md);
2017 2026
2018 switch (type) { 2027 switch (type) {
@@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2049 break; 2058 break;
2050 } 2059 }
2051 2060
2061 r = dm_calculate_queue_limits(t, &limits);
2062 if (r) {
2063 DMERR("Cannot calculate initial queue limits");
2064 return r;
2065 }
2066 dm_table_set_restrictions(t, md->queue, &limits);
2067 blk_register_queue(md->disk);
2068
2052 return 0; 2069 return 0;
2053} 2070}
2054 2071
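dm_set_target_max_io_len() now clamps the target's max_io_len so that a single bio never exceeds BIO_MAX_PAGES worth of data, protecting targets like dm-crypt until they can handle larger multipage-bvec bios. The clamp itself is just a min(); a tiny sketch, assuming 4 KiB pages and the then-current BIO_MAX_PAGES of 256:

#include <stdint.h>

#define BIO_MAX_PAGES 256      /* kernel value at the time of this series */
#define PAGE_SIZE     4096     /* assumed page size for this sketch       */

/* Cap a target's max_io_len at one BIO_MAX_PAGES-sized bio. */
static uint32_t clamp_max_io_len(uint64_t len)
{
	uint64_t cap = (uint64_t)BIO_MAX_PAGES * PAGE_SIZE;   /* 1 MiB here */

	return (uint32_t)(len < cap ? len : cap);
}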
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a25fd43650ad..441e67e3a9d7 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,4 +1,7 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2
3ccflags-y += -I$(src)
4
2obj-$(CONFIG_NVME_CORE) += nvme-core.o 5obj-$(CONFIG_NVME_CORE) += nvme-core.o
3obj-$(CONFIG_BLK_DEV_NVME) += nvme.o 6obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
4obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o 7obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
6obj-$(CONFIG_NVME_FC) += nvme-fc.o 9obj-$(CONFIG_NVME_FC) += nvme-fc.o
7 10
8nvme-core-y := core.o 11nvme-core-y := core.o
12nvme-core-$(CONFIG_TRACING) += trace.o
9nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o 13nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
10nvme-core-$(CONFIG_NVM) += lightnvm.o 14nvme-core-$(CONFIG_NVM) += lightnvm.o
11 15
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 839650e0926a..e8104871cbbf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -29,6 +29,9 @@
29#include <linux/pm_qos.h> 29#include <linux/pm_qos.h>
30#include <asm/unaligned.h> 30#include <asm/unaligned.h>
31 31
32#define CREATE_TRACE_POINTS
33#include "trace.h"
34
32#include "nvme.h" 35#include "nvme.h"
33#include "fabrics.h" 36#include "fabrics.h"
34 37
@@ -65,9 +68,26 @@ static bool streams;
65module_param(streams, bool, 0644); 68module_param(streams, bool, 0644);
66MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); 69MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
67 70
71/*
72 * nvme_wq - hosts nvme related works that are not reset or delete
73 * nvme_reset_wq - hosts nvme reset works
74 * nvme_delete_wq - hosts nvme delete works
75 *
 76 * nvme_wq will host works such as scan, aen handling, fw activation,
 77 * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
 78 * runs reset works, which also flush works hosted on nvme_wq for
 79 * serialization purposes. nvme_delete_wq hosts controller deletion
80 * works which flush reset works for serialization.
81 */
68struct workqueue_struct *nvme_wq; 82struct workqueue_struct *nvme_wq;
69EXPORT_SYMBOL_GPL(nvme_wq); 83EXPORT_SYMBOL_GPL(nvme_wq);
70 84
85struct workqueue_struct *nvme_reset_wq;
86EXPORT_SYMBOL_GPL(nvme_reset_wq);
87
88struct workqueue_struct *nvme_delete_wq;
89EXPORT_SYMBOL_GPL(nvme_delete_wq);
90
71static DEFINE_IDA(nvme_subsystems_ida); 91static DEFINE_IDA(nvme_subsystems_ida);
72static LIST_HEAD(nvme_subsystems); 92static LIST_HEAD(nvme_subsystems);
73static DEFINE_MUTEX(nvme_subsystems_lock); 93static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
89{ 109{
90 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 110 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
91 return -EBUSY; 111 return -EBUSY;
92 if (!queue_work(nvme_wq, &ctrl->reset_work)) 112 if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
93 return -EBUSY; 113 return -EBUSY;
94 return 0; 114 return 0;
95} 115}
96EXPORT_SYMBOL_GPL(nvme_reset_ctrl); 116EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
97 117
98static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) 118int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
99{ 119{
100 int ret; 120 int ret;
101 121
@@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
104 flush_work(&ctrl->reset_work); 124 flush_work(&ctrl->reset_work);
105 return ret; 125 return ret;
106} 126}
127EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
107 128
108static void nvme_delete_ctrl_work(struct work_struct *work) 129static void nvme_delete_ctrl_work(struct work_struct *work)
109{ 130{
@@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
122{ 143{
123 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 144 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
124 return -EBUSY; 145 return -EBUSY;
125 if (!queue_work(nvme_wq, &ctrl->delete_work)) 146 if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
126 return -EBUSY; 147 return -EBUSY;
127 return 0; 148 return 0;
128} 149}
@@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req)
157 return BLK_STS_OK; 178 return BLK_STS_OK;
158 case NVME_SC_CAP_EXCEEDED: 179 case NVME_SC_CAP_EXCEEDED:
159 return BLK_STS_NOSPC; 180 return BLK_STS_NOSPC;
181 case NVME_SC_LBA_RANGE:
182 return BLK_STS_TARGET;
183 case NVME_SC_BAD_ATTRIBUTES:
160 case NVME_SC_ONCS_NOT_SUPPORTED: 184 case NVME_SC_ONCS_NOT_SUPPORTED:
185 case NVME_SC_INVALID_OPCODE:
186 case NVME_SC_INVALID_FIELD:
187 case NVME_SC_INVALID_NS:
161 return BLK_STS_NOTSUPP; 188 return BLK_STS_NOTSUPP;
162 case NVME_SC_WRITE_FAULT: 189 case NVME_SC_WRITE_FAULT:
163 case NVME_SC_READ_ERROR: 190 case NVME_SC_READ_ERROR:
164 case NVME_SC_UNWRITTEN_BLOCK: 191 case NVME_SC_UNWRITTEN_BLOCK:
165 case NVME_SC_ACCESS_DENIED: 192 case NVME_SC_ACCESS_DENIED:
166 case NVME_SC_READ_ONLY: 193 case NVME_SC_READ_ONLY:
194 case NVME_SC_COMPARE_FAILED:
167 return BLK_STS_MEDIUM; 195 return BLK_STS_MEDIUM;
168 case NVME_SC_GUARD_CHECK: 196 case NVME_SC_GUARD_CHECK:
169 case NVME_SC_APPTAG_CHECK: 197 case NVME_SC_APPTAG_CHECK:
@@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req)
190 218
191void nvme_complete_rq(struct request *req) 219void nvme_complete_rq(struct request *req)
192{ 220{
193 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { 221 blk_status_t status = nvme_error_status(req);
194 if (nvme_req_needs_failover(req)) { 222
223 trace_nvme_complete_rq(req);
224
225 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
226 if (nvme_req_needs_failover(req, status)) {
195 nvme_failover_req(req); 227 nvme_failover_req(req);
196 return; 228 return;
197 } 229 }
@@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req)
202 return; 234 return;
203 } 235 }
204 } 236 }
205 237 blk_mq_end_request(req, status);
206 blk_mq_end_request(req, nvme_error_status(req));
207} 238}
208EXPORT_SYMBOL_GPL(nvme_complete_rq); 239EXPORT_SYMBOL_GPL(nvme_complete_rq);
209 240
@@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
232 263
233 old_state = ctrl->state; 264 old_state = ctrl->state;
234 switch (new_state) { 265 switch (new_state) {
266 case NVME_CTRL_ADMIN_ONLY:
267 switch (old_state) {
268 case NVME_CTRL_RECONNECTING:
269 changed = true;
270 /* FALLTHRU */
271 default:
272 break;
273 }
274 break;
235 case NVME_CTRL_LIVE: 275 case NVME_CTRL_LIVE:
236 switch (old_state) { 276 switch (old_state) {
237 case NVME_CTRL_NEW: 277 case NVME_CTRL_NEW:
@@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
247 switch (old_state) { 287 switch (old_state) {
248 case NVME_CTRL_NEW: 288 case NVME_CTRL_NEW:
249 case NVME_CTRL_LIVE: 289 case NVME_CTRL_LIVE:
290 case NVME_CTRL_ADMIN_ONLY:
250 changed = true; 291 changed = true;
251 /* FALLTHRU */ 292 /* FALLTHRU */
252 default: 293 default:
@@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
266 case NVME_CTRL_DELETING: 307 case NVME_CTRL_DELETING:
267 switch (old_state) { 308 switch (old_state) {
268 case NVME_CTRL_LIVE: 309 case NVME_CTRL_LIVE:
310 case NVME_CTRL_ADMIN_ONLY:
269 case NVME_CTRL_RESETTING: 311 case NVME_CTRL_RESETTING:
270 case NVME_CTRL_RECONNECTING: 312 case NVME_CTRL_RECONNECTING:
271 changed = true; 313 changed = true;
@@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
591 } 633 }
592 634
593 cmd->common.command_id = req->tag; 635 cmd->common.command_id = req->tag;
636 if (ns)
637 trace_nvme_setup_nvm_cmd(req->q->id, cmd);
638 else
639 trace_nvme_setup_admin_cmd(cmd);
594 return ret; 640 return ret;
595} 641}
596EXPORT_SYMBOL_GPL(nvme_setup_cmd); 642EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
1217#ifdef CONFIG_NVME_MULTIPATH 1263#ifdef CONFIG_NVME_MULTIPATH
1218 /* should never be called due to GENHD_FL_HIDDEN */ 1264 /* should never be called due to GENHD_FL_HIDDEN */
1219 if (WARN_ON_ONCE(ns->head->disk)) 1265 if (WARN_ON_ONCE(ns->head->disk))
1220 return -ENXIO; 1266 goto fail;
1221#endif 1267#endif
1222 if (!kref_get_unless_zero(&ns->kref)) 1268 if (!kref_get_unless_zero(&ns->kref))
1223 return -ENXIO; 1269 goto fail;
1270 if (!try_module_get(ns->ctrl->ops->module))
1271 goto fail_put_ns;
1272
1224 return 0; 1273 return 0;
1274
1275fail_put_ns:
1276 nvme_put_ns(ns);
1277fail:
1278 return -ENXIO;
1225} 1279}
1226 1280
1227static void nvme_release(struct gendisk *disk, fmode_t mode) 1281static void nvme_release(struct gendisk *disk, fmode_t mode)
1228{ 1282{
1229 nvme_put_ns(disk->private_data); 1283 struct nvme_ns *ns = disk->private_data;
1284
1285 module_put(ns->ctrl->ops->module);
1286 nvme_put_ns(ns);
1230} 1287}
1231 1288
1232static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1289static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2052 NULL, 2109 NULL,
2053}; 2110};
2054 2111
2112static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2113{
2114 int count = 0;
2115 struct nvme_ctrl *ctrl;
2116
2117 mutex_lock(&subsys->lock);
2118 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2119 if (ctrl->state != NVME_CTRL_DELETING &&
2120 ctrl->state != NVME_CTRL_DEAD)
2121 count++;
2122 }
2123 mutex_unlock(&subsys->lock);
2124
2125 return count;
2126}
2127
2055static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2128static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2056{ 2129{
2057 struct nvme_subsystem *subsys, *found; 2130 struct nvme_subsystem *subsys, *found;
@@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2090 * Verify that the subsystem actually supports multiple 2163 * Verify that the subsystem actually supports multiple
2091 * controllers, else bail out. 2164 * controllers, else bail out.
2092 */ 2165 */
2093 if (!(id->cmic & (1 << 1))) { 2166 if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2094 dev_err(ctrl->device, 2167 dev_err(ctrl->device,
2095 "ignoring ctrl due to duplicate subnqn (%s).\n", 2168 "ignoring ctrl due to duplicate subnqn (%s).\n",
2096 found->subnqn); 2169 found->subnqn);
@@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2257 shutdown_timeout, 60); 2330 shutdown_timeout, 60);
2258 2331
2259 if (ctrl->shutdown_timeout != shutdown_timeout) 2332 if (ctrl->shutdown_timeout != shutdown_timeout)
2260 dev_warn(ctrl->device, 2333 dev_info(ctrl->device,
2261 "Shutdown timeout set to %u seconds\n", 2334 "Shutdown timeout set to %u seconds\n",
2262 ctrl->shutdown_timeout); 2335 ctrl->shutdown_timeout);
2263 } else 2336 } else
@@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
2341 struct nvme_ctrl *ctrl = 2414 struct nvme_ctrl *ctrl =
2342 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 2415 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2343 2416
2344 if (ctrl->state != NVME_CTRL_LIVE) 2417 switch (ctrl->state) {
2418 case NVME_CTRL_LIVE:
2419 case NVME_CTRL_ADMIN_ONLY:
2420 break;
2421 default:
2345 return -EWOULDBLOCK; 2422 return -EWOULDBLOCK;
2423 }
2424
2346 file->private_data = ctrl; 2425 file->private_data = ctrl;
2347 return 0; 2426 return 0;
2348} 2427}
@@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
2606 static const char *const state_name[] = { 2685 static const char *const state_name[] = {
2607 [NVME_CTRL_NEW] = "new", 2686 [NVME_CTRL_NEW] = "new",
2608 [NVME_CTRL_LIVE] = "live", 2687 [NVME_CTRL_LIVE] = "live",
2688 [NVME_CTRL_ADMIN_ONLY] = "only-admin",
2609 [NVME_CTRL_RESETTING] = "resetting", 2689 [NVME_CTRL_RESETTING] = "resetting",
2610 [NVME_CTRL_RECONNECTING]= "reconnecting", 2690 [NVME_CTRL_RECONNECTING]= "reconnecting",
2611 [NVME_CTRL_DELETING] = "deleting", 2691 [NVME_CTRL_DELETING] = "deleting",
@@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work)
3079 if (ctrl->state != NVME_CTRL_LIVE) 3159 if (ctrl->state != NVME_CTRL_LIVE)
3080 return; 3160 return;
3081 3161
3162 WARN_ON_ONCE(!ctrl->tagset);
3163
3082 if (nvme_identify_ctrl(ctrl, &id)) 3164 if (nvme_identify_ctrl(ctrl, &id))
3083 return; 3165 return;
3084 3166
@@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work)
3099void nvme_queue_scan(struct nvme_ctrl *ctrl) 3181void nvme_queue_scan(struct nvme_ctrl *ctrl)
3100{ 3182{
3101 /* 3183 /*
3102 * Do not queue new scan work when a controller is reset during 3184 * Only new queue scan work when admin and IO queues are both alive
3103 * removal.
3104 */ 3185 */
3105 if (ctrl->state == NVME_CTRL_LIVE) 3186 if (ctrl->state == NVME_CTRL_LIVE)
3106 queue_work(nvme_wq, &ctrl->scan_work); 3187 queue_work(nvme_wq, &ctrl->scan_work);
@@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3477 3558
3478int __init nvme_core_init(void) 3559int __init nvme_core_init(void)
3479{ 3560{
3480 int result; 3561 int result = -ENOMEM;
3481 3562
3482 nvme_wq = alloc_workqueue("nvme-wq", 3563 nvme_wq = alloc_workqueue("nvme-wq",
3483 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3564 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3484 if (!nvme_wq) 3565 if (!nvme_wq)
3485 return -ENOMEM; 3566 goto out;
3567
3568 nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
3569 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3570 if (!nvme_reset_wq)
3571 goto destroy_wq;
3572
3573 nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
3574 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3575 if (!nvme_delete_wq)
3576 goto destroy_reset_wq;
3486 3577
3487 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); 3578 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
3488 if (result < 0) 3579 if (result < 0)
3489 goto destroy_wq; 3580 goto destroy_delete_wq;
3490 3581
3491 nvme_class = class_create(THIS_MODULE, "nvme"); 3582 nvme_class = class_create(THIS_MODULE, "nvme");
3492 if (IS_ERR(nvme_class)) { 3583 if (IS_ERR(nvme_class)) {
@@ -3505,8 +3596,13 @@ destroy_class:
3505 class_destroy(nvme_class); 3596 class_destroy(nvme_class);
3506unregister_chrdev: 3597unregister_chrdev:
3507 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3598 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3599destroy_delete_wq:
3600 destroy_workqueue(nvme_delete_wq);
3601destroy_reset_wq:
3602 destroy_workqueue(nvme_reset_wq);
3508destroy_wq: 3603destroy_wq:
3509 destroy_workqueue(nvme_wq); 3604 destroy_workqueue(nvme_wq);
3605out:
3510 return result; 3606 return result;
3511} 3607}
3512 3608
@@ -3516,6 +3612,8 @@ void nvme_core_exit(void)
3516 class_destroy(nvme_subsys_class); 3612 class_destroy(nvme_subsys_class);
3517 class_destroy(nvme_class); 3613 class_destroy(nvme_class);
3518 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3614 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3615 destroy_workqueue(nvme_delete_wq);
3616 destroy_workqueue(nvme_reset_wq);
3519 destroy_workqueue(nvme_wq); 3617 destroy_workqueue(nvme_wq);
3520} 3618}
3521 3619
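
With nvme_reset_wq and nvme_delete_wq joining nvme_wq, nvme_core_init() grows the usual goto unwind ladder and nvme_core_exit() tears the queues down in reverse allocation order; keeping delete work on its own queue also lets the rdma ib_client removal path further down flush only nvme_delete_wq instead of everything queued on nvme_wq. A condensed sketch of the idiom with hypothetical names (kernel-style, not the literal driver code):

/*
 * Condensed sketch of the init/exit ladder added above; hypothetical names.
 * Each failure label undoes exactly the allocations that already succeeded,
 * and the exit path repeats the destructions in reverse allocation order.
 */
static struct workqueue_struct *example_wq, *example_reset_wq;

static int __init example_init(void)
{
	int ret = -ENOMEM;

	example_wq = alloc_workqueue("example-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!example_wq)
		goto out;

	example_reset_wq = alloc_workqueue("example-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!example_reset_wq)
		goto destroy_wq;

	return 0;

destroy_wq:
	destroy_workqueue(example_wq);
out:
	return ret;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_reset_wq);	/* reverse order of allocation */
	destroy_workqueue(example_wq);
}
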
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 894c2ccb3891..5dd4ceefed8f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
493 */ 493 */
494int nvmf_register_transport(struct nvmf_transport_ops *ops) 494int nvmf_register_transport(struct nvmf_transport_ops *ops)
495{ 495{
496 if (!ops->create_ctrl) 496 if (!ops->create_ctrl || !ops->module)
497 return -EINVAL; 497 return -EINVAL;
498 498
499 down_write(&nvmf_transports_rwsem); 499 down_write(&nvmf_transports_rwsem);
@@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
739 ret = -ENOMEM; 739 ret = -ENOMEM;
740 goto out; 740 goto out;
741 } 741 }
742 if (uuid_parse(p, &hostid)) { 742 ret = uuid_parse(p, &hostid);
743 if (ret) {
743 pr_err("Invalid hostid %s\n", p); 744 pr_err("Invalid hostid %s\n", p);
744 ret = -EINVAL; 745 ret = -EINVAL;
746 kfree(p);
745 goto out; 747 goto out;
746 } 748 }
749 kfree(p);
747 break; 750 break;
748 case NVMF_OPT_DUP_CONNECT: 751 case NVMF_OPT_DUP_CONNECT:
749 opts->duplicate_connect = true; 752 opts->duplicate_connect = true;
@@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
869 goto out_unlock; 872 goto out_unlock;
870 } 873 }
871 874
875 if (!try_module_get(ops->module)) {
876 ret = -EBUSY;
877 goto out_unlock;
878 }
879
872 ret = nvmf_check_required_opts(opts, ops->required_opts); 880 ret = nvmf_check_required_opts(opts, ops->required_opts);
873 if (ret) 881 if (ret)
874 goto out_unlock; 882 goto out_module_put;
875 ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | 883 ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
876 ops->allowed_opts | ops->required_opts); 884 ops->allowed_opts | ops->required_opts);
877 if (ret) 885 if (ret)
878 goto out_unlock; 886 goto out_module_put;
879 887
880 ctrl = ops->create_ctrl(dev, opts); 888 ctrl = ops->create_ctrl(dev, opts);
881 if (IS_ERR(ctrl)) { 889 if (IS_ERR(ctrl)) {
882 ret = PTR_ERR(ctrl); 890 ret = PTR_ERR(ctrl);
883 goto out_unlock; 891 goto out_module_put;
884 } 892 }
885 893
886 if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { 894 if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
887 dev_warn(ctrl->device, 895 dev_warn(ctrl->device,
888 "controller returned incorrect NQN: \"%s\".\n", 896 "controller returned incorrect NQN: \"%s\".\n",
889 ctrl->subsys->subnqn); 897 ctrl->subsys->subnqn);
898 module_put(ops->module);
890 up_read(&nvmf_transports_rwsem); 899 up_read(&nvmf_transports_rwsem);
891 nvme_delete_ctrl_sync(ctrl); 900 nvme_delete_ctrl_sync(ctrl);
892 return ERR_PTR(-EINVAL); 901 return ERR_PTR(-EINVAL);
893 } 902 }
894 903
904 module_put(ops->module);
895 up_read(&nvmf_transports_rwsem); 905 up_read(&nvmf_transports_rwsem);
896 return ctrl; 906 return ctrl;
897 907
908out_module_put:
909 module_put(ops->module);
898out_unlock: 910out_unlock:
899 up_read(&nvmf_transports_rwsem); 911 up_read(&nvmf_transports_rwsem);
900out_free_opts: 912out_free_opts:
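
Two independent fixes land in nvmf_parse_options()/nvmf_create_ctrl() above: the duplicated hostid token is now kfree()d on both the error and the success path, and the transport module is pinned with try_module_get() for exactly as long as the core calls through its ops (every exit path, including the bad-NQN one, drops the reference again). A hedged, kernel-style sketch of that second pattern with hypothetical names:

/*
 * Kernel-style sketch (hypothetical names, not the literal driver code):
 * hold a reference on the module that owns an ops table for exactly as long
 * as we call through it, and drop it on every exit path.
 */
struct example_ctrl;

struct example_ops {
	struct module *module;
	struct example_ctrl *(*create_ctrl)(void);
};

static struct example_ctrl *example_create(struct example_ops *ops)
{
	struct example_ctrl *ctrl;

	if (!try_module_get(ops->module))	/* transport may be unloading */
		return ERR_PTR(-EBUSY);

	ctrl = ops->create_ctrl();		/* safe to call: module is pinned */

	module_put(ops->module);		/* dropped on success and on error */
	return ctrl;
}
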
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 9ba614953607..25b19f722f5b 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
108 * fabric implementation of NVMe fabrics. 108 * fabric implementation of NVMe fabrics.
109 * @entry: Used by the fabrics library to add the new 109 * @entry: Used by the fabrics library to add the new
110 * registration entry to its linked-list internal tree. 110 * registration entry to its linked-list internal tree.
111 * @module: Transport module reference
111 * @name: Name of the NVMe fabric driver implementation. 112 * @name: Name of the NVMe fabric driver implementation.
112 * @required_opts: sysfs command-line options that must be specified 113 * @required_opts: sysfs command-line options that must be specified
113 * when adding a new NVMe controller. 114 * when adding a new NVMe controller.
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
126 */ 127 */
127struct nvmf_transport_ops { 128struct nvmf_transport_ops {
128 struct list_head entry; 129 struct list_head entry;
130 struct module *module;
129 const char *name; 131 const char *name;
130 int required_opts; 132 int required_opts;
131 int allowed_opts; 133 int allowed_opts;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 794e66e4aa20..99bf51c7e513 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
2921 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); 2921 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
2922 nvme_fc_free_queue(&ctrl->queues[0]); 2922 nvme_fc_free_queue(&ctrl->queues[0]);
2923 2923
2924 /* re-enable the admin_q so anything new can fast fail */
2925 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
2926
2924 nvme_fc_ctlr_inactive_on_rport(ctrl); 2927 nvme_fc_ctlr_inactive_on_rport(ctrl);
2925} 2928}
2926 2929
@@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
2935 * waiting for io to terminate 2938 * waiting for io to terminate
2936 */ 2939 */
2937 nvme_fc_delete_association(ctrl); 2940 nvme_fc_delete_association(ctrl);
2941
2942 /* resume the io queues so that things will fast fail */
2943 nvme_start_queues(nctrl);
2938} 2944}
2939 2945
2940static void 2946static void
@@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
3380 3386
3381static struct nvmf_transport_ops nvme_fc_transport = { 3387static struct nvmf_transport_ops nvme_fc_transport = {
3382 .name = "fc", 3388 .name = "fc",
3389 .module = THIS_MODULE,
3383 .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, 3390 .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
3384 .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, 3391 .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
3385 .create_ctrl = nvme_fc_create_ctrl, 3392 .create_ctrl = nvme_fc_create_ctrl,
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba3d7f3349e5..50ef71ee3d86 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -31,27 +31,10 @@
31 31
32enum nvme_nvm_admin_opcode { 32enum nvme_nvm_admin_opcode {
33 nvme_nvm_admin_identity = 0xe2, 33 nvme_nvm_admin_identity = 0xe2,
34 nvme_nvm_admin_get_l2p_tbl = 0xea,
35 nvme_nvm_admin_get_bb_tbl = 0xf2, 34 nvme_nvm_admin_get_bb_tbl = 0xf2,
36 nvme_nvm_admin_set_bb_tbl = 0xf1, 35 nvme_nvm_admin_set_bb_tbl = 0xf1,
37}; 36};
38 37
39struct nvme_nvm_hb_rw {
40 __u8 opcode;
41 __u8 flags;
42 __u16 command_id;
43 __le32 nsid;
44 __u64 rsvd2;
45 __le64 metadata;
46 __le64 prp1;
47 __le64 prp2;
48 __le64 spba;
49 __le16 length;
50 __le16 control;
51 __le32 dsmgmt;
52 __le64 slba;
53};
54
55struct nvme_nvm_ph_rw { 38struct nvme_nvm_ph_rw {
56 __u8 opcode; 39 __u8 opcode;
57 __u8 flags; 40 __u8 flags;
@@ -80,19 +63,6 @@ struct nvme_nvm_identity {
80 __u32 rsvd11[5]; 63 __u32 rsvd11[5];
81}; 64};
82 65
83struct nvme_nvm_l2ptbl {
84 __u8 opcode;
85 __u8 flags;
86 __u16 command_id;
87 __le32 nsid;
88 __le32 cdw2[4];
89 __le64 prp1;
90 __le64 prp2;
91 __le64 slba;
92 __le32 nlb;
93 __le16 cdw14[6];
94};
95
96struct nvme_nvm_getbbtbl { 66struct nvme_nvm_getbbtbl {
97 __u8 opcode; 67 __u8 opcode;
98 __u8 flags; 68 __u8 flags;
@@ -139,9 +109,7 @@ struct nvme_nvm_command {
139 union { 109 union {
140 struct nvme_common_command common; 110 struct nvme_common_command common;
141 struct nvme_nvm_identity identity; 111 struct nvme_nvm_identity identity;
142 struct nvme_nvm_hb_rw hb_rw;
143 struct nvme_nvm_ph_rw ph_rw; 112 struct nvme_nvm_ph_rw ph_rw;
144 struct nvme_nvm_l2ptbl l2p;
145 struct nvme_nvm_getbbtbl get_bb; 113 struct nvme_nvm_getbbtbl get_bb;
146 struct nvme_nvm_setbbtbl set_bb; 114 struct nvme_nvm_setbbtbl set_bb;
147 struct nvme_nvm_erase_blk erase; 115 struct nvme_nvm_erase_blk erase;
@@ -167,7 +135,7 @@ struct nvme_nvm_id_group {
167 __u8 num_lun; 135 __u8 num_lun;
168 __u8 num_pln; 136 __u8 num_pln;
169 __u8 rsvd1; 137 __u8 rsvd1;
170 __le16 num_blk; 138 __le16 num_chk;
171 __le16 num_pg; 139 __le16 num_pg;
172 __le16 fpg_sz; 140 __le16 fpg_sz;
173 __le16 csecs; 141 __le16 csecs;
@@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl {
234static inline void _nvme_nvm_check_size(void) 202static inline void _nvme_nvm_check_size(void)
235{ 203{
236 BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); 204 BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
237 BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64);
238 BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); 205 BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
239 BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); 206 BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
240 BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); 207 BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
241 BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
242 BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); 208 BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
243 BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); 209 BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
244 BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); 210 BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
@@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void)
249static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) 215static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
250{ 216{
251 struct nvme_nvm_id_group *src; 217 struct nvme_nvm_id_group *src;
252 struct nvm_id_group *dst; 218 struct nvm_id_group *grp;
219 int sec_per_pg, sec_per_pl, pg_per_blk;
253 220
254 if (nvme_nvm_id->cgrps != 1) 221 if (nvme_nvm_id->cgrps != 1)
255 return -EINVAL; 222 return -EINVAL;
256 223
257 src = &nvme_nvm_id->groups[0]; 224 src = &nvme_nvm_id->groups[0];
258 dst = &nvm_id->grp; 225 grp = &nvm_id->grp;
259 226
260 dst->mtype = src->mtype; 227 grp->mtype = src->mtype;
261 dst->fmtype = src->fmtype; 228 grp->fmtype = src->fmtype;
262 dst->num_ch = src->num_ch; 229
263 dst->num_lun = src->num_lun; 230 grp->num_ch = src->num_ch;
264 dst->num_pln = src->num_pln; 231 grp->num_lun = src->num_lun;
265 232
266 dst->num_pg = le16_to_cpu(src->num_pg); 233 grp->num_chk = le16_to_cpu(src->num_chk);
267 dst->num_blk = le16_to_cpu(src->num_blk); 234 grp->csecs = le16_to_cpu(src->csecs);
268 dst->fpg_sz = le16_to_cpu(src->fpg_sz); 235 grp->sos = le16_to_cpu(src->sos);
269 dst->csecs = le16_to_cpu(src->csecs); 236
270 dst->sos = le16_to_cpu(src->sos); 237 pg_per_blk = le16_to_cpu(src->num_pg);
271 238 sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
272 dst->trdt = le32_to_cpu(src->trdt); 239 sec_per_pl = sec_per_pg * src->num_pln;
273 dst->trdm = le32_to_cpu(src->trdm); 240 grp->clba = sec_per_pl * pg_per_blk;
274 dst->tprt = le32_to_cpu(src->tprt); 241 grp->ws_per_chk = pg_per_blk;
275 dst->tprm = le32_to_cpu(src->tprm);
276 dst->tbet = le32_to_cpu(src->tbet);
277 dst->tbem = le32_to_cpu(src->tbem);
278 dst->mpos = le32_to_cpu(src->mpos);
279 dst->mccap = le32_to_cpu(src->mccap);
280
281 dst->cpar = le16_to_cpu(src->cpar);
282
283 if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
284 memcpy(dst->lptbl.id, src->lptbl.id, 8);
285 dst->lptbl.mlc.num_pairs =
286 le16_to_cpu(src->lptbl.mlc.num_pairs);
287
288 if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
289 pr_err("nvm: number of MLC pairs not supported\n");
290 return -EINVAL;
291 }
292 242
293 memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, 243 grp->mpos = le32_to_cpu(src->mpos);
294 dst->lptbl.mlc.num_pairs); 244 grp->cpar = le16_to_cpu(src->cpar);
245 grp->mccap = le32_to_cpu(src->mccap);
246
247 grp->ws_opt = grp->ws_min = sec_per_pg;
248 grp->ws_seq = NVM_IO_SNGL_ACCESS;
249
250 if (grp->mpos & 0x020202) {
251 grp->ws_seq = NVM_IO_DUAL_ACCESS;
252 grp->ws_opt <<= 1;
253 } else if (grp->mpos & 0x040404) {
254 grp->ws_seq = NVM_IO_QUAD_ACCESS;
255 grp->ws_opt <<= 2;
295 } 256 }
296 257
258 grp->trdt = le32_to_cpu(src->trdt);
259 grp->trdm = le32_to_cpu(src->trdm);
260 grp->tprt = le32_to_cpu(src->tprt);
261 grp->tprm = le32_to_cpu(src->tprm);
262 grp->tbet = le32_to_cpu(src->tbet);
263 grp->tbem = le32_to_cpu(src->tbem);
264
265 /* 1.2 compatibility */
266 grp->num_pln = src->num_pln;
267 grp->num_pg = le16_to_cpu(src->num_pg);
268 grp->fpg_sz = le16_to_cpu(src->fpg_sz);
269
297 return 0; 270 return 0;
298} 271}
299 272
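
init_grps() now derives the chunk-oriented geometry (clba, ws_min/ws_opt, ws_per_chk) from the 1.2 identify fields instead of copying every field through, keeping num_pln/num_pg/fpg_sz only for 1.2 compatibility. A standalone arithmetic sketch with made-up but plausible 1.2 values shows how the derived numbers fall out:

/* Standalone arithmetic sketch of the derivation above, using made-up but
 * plausible 1.2 geometry values; compile and run as a normal C program. */
#include <stdio.h>

int main(void)
{
	unsigned csecs   = 4096;	/* sector size reported by the device */
	unsigned fpg_sz  = 16384;	/* flash page size in bytes */
	unsigned num_pln = 4;		/* planes per LUN */
	unsigned num_pg  = 512;		/* pages per block */
	unsigned mpos    = 0x020202;	/* multi-plane: dual-plane access bits */

	unsigned sec_per_pg = fpg_sz / csecs;		/* 4 */
	unsigned sec_per_pl = sec_per_pg * num_pln;	/* 16 */
	unsigned clba       = sec_per_pl * num_pg;	/* 8192 sectors per chunk */
	unsigned ws_per_chk = num_pg;			/* 512 */
	unsigned ws_min     = sec_per_pg;		/* 4 */
	unsigned ws_opt     = sec_per_pg;

	if (mpos & 0x020202)		/* dual-plane programming supported */
		ws_opt <<= 1;		/* 8 */
	else if (mpos & 0x040404)	/* quad-plane */
		ws_opt <<= 2;

	printf("clba=%u ws_per_chk=%u ws_min=%u ws_opt=%u\n",
	       clba, ws_per_chk, ws_min, ws_opt);
	return 0;
}
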
@@ -332,62 +305,6 @@ out:
332 return ret; 305 return ret;
333} 306}
334 307
335static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
336 nvm_l2p_update_fn *update_l2p, void *priv)
337{
338 struct nvme_ns *ns = nvmdev->q->queuedata;
339 struct nvme_nvm_command c = {};
340 u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
341 u32 nlb_pr_rq = len / sizeof(u64);
342 u64 cmd_slba = slba;
343 void *entries;
344 int ret = 0;
345
346 c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
347 c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
348 entries = kmalloc(len, GFP_KERNEL);
349 if (!entries)
350 return -ENOMEM;
351
352 while (nlb) {
353 u32 cmd_nlb = min(nlb_pr_rq, nlb);
354 u64 elba = slba + cmd_nlb;
355
356 c.l2p.slba = cpu_to_le64(cmd_slba);
357 c.l2p.nlb = cpu_to_le32(cmd_nlb);
358
359 ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
360 (struct nvme_command *)&c, entries, len);
361 if (ret) {
362 dev_err(ns->ctrl->device,
363 "L2P table transfer failed (%d)\n", ret);
364 ret = -EIO;
365 goto out;
366 }
367
368 if (unlikely(elba > nvmdev->total_secs)) {
369 pr_err("nvm: L2P data from device is out of bounds!\n");
370 ret = -EINVAL;
371 goto out;
372 }
373
374 /* Transform physical address to target address space */
375 nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
376
377 if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
378 ret = -EINTR;
379 goto out;
380 }
381
382 cmd_slba += cmd_nlb;
383 nlb -= cmd_nlb;
384 }
385
386out:
387 kfree(entries);
388 return ret;
389}
390
391static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, 308static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
392 u8 *blks) 309 u8 *blks)
393{ 310{
@@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
397 struct nvme_ctrl *ctrl = ns->ctrl; 314 struct nvme_ctrl *ctrl = ns->ctrl;
398 struct nvme_nvm_command c = {}; 315 struct nvme_nvm_command c = {};
399 struct nvme_nvm_bb_tbl *bb_tbl; 316 struct nvme_nvm_bb_tbl *bb_tbl;
400 int nr_blks = geo->blks_per_lun * geo->plane_mode; 317 int nr_blks = geo->nr_chks * geo->plane_mode;
401 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; 318 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
402 int ret = 0; 319 int ret = 0;
403 320
@@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
438 goto out; 355 goto out;
439 } 356 }
440 357
441 memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); 358 memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
442out: 359out:
443 kfree(bb_tbl); 360 kfree(bb_tbl);
444 return ret; 361 return ret;
@@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
474 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); 391 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
475 c->ph_rw.control = cpu_to_le16(rqd->flags); 392 c->ph_rw.control = cpu_to_le16(rqd->flags);
476 c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); 393 c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
477
478 if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
479 c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
480 rqd->bio->bi_iter.bi_sector));
481} 394}
482 395
483static void nvme_nvm_end_io(struct request *rq, blk_status_t status) 396static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
@@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr,
597static struct nvm_dev_ops nvme_nvm_dev_ops = { 510static struct nvm_dev_ops nvme_nvm_dev_ops = {
598 .identity = nvme_nvm_identity, 511 .identity = nvme_nvm_identity,
599 512
600 .get_l2p_tbl = nvme_nvm_get_l2p_tbl,
601
602 .get_bb_tbl = nvme_nvm_get_bb_tbl, 513 .get_bb_tbl = nvme_nvm_get_bb_tbl,
603 .set_bb_tbl = nvme_nvm_set_bb_tbl, 514 .set_bb_tbl = nvme_nvm_set_bb_tbl,
604 515
@@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
883 } else if (strcmp(attr->name, "num_planes") == 0) { 794 } else if (strcmp(attr->name, "num_planes") == 0) {
884 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); 795 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
885 } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ 796 } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
886 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); 797 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
887 } else if (strcmp(attr->name, "num_pages") == 0) { 798 } else if (strcmp(attr->name, "num_pages") == 0) {
888 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); 799 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
889 } else if (strcmp(attr->name, "page_size") == 0) { 800 } else if (strcmp(attr->name, "page_size") == 0) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1218a9fca846..3b211d9e58b8 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
33 kblockd_schedule_work(&ns->head->requeue_work); 33 kblockd_schedule_work(&ns->head->requeue_work);
34} 34}
35 35
36bool nvme_req_needs_failover(struct request *req) 36bool nvme_req_needs_failover(struct request *req, blk_status_t error)
37{ 37{
38 if (!(req->cmd_flags & REQ_NVME_MPATH)) 38 if (!(req->cmd_flags & REQ_NVME_MPATH))
39 return false; 39 return false;
40 40 return blk_path_error(error);
41 switch (nvme_req(req)->status & 0x7ff) {
42 /*
43 * Generic command status:
44 */
45 case NVME_SC_INVALID_OPCODE:
46 case NVME_SC_INVALID_FIELD:
47 case NVME_SC_INVALID_NS:
48 case NVME_SC_LBA_RANGE:
49 case NVME_SC_CAP_EXCEEDED:
50 case NVME_SC_RESERVATION_CONFLICT:
51 return false;
52
53 /*
54 * I/O command set specific error. Unfortunately these values are
55 * reused for fabrics commands, but those should never get here.
56 */
57 case NVME_SC_BAD_ATTRIBUTES:
58 case NVME_SC_INVALID_PI:
59 case NVME_SC_READ_ONLY:
60 case NVME_SC_ONCS_NOT_SUPPORTED:
61 WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
62 nvme_fabrics_command);
63 return false;
64
65 /*
66 * Media and Data Integrity Errors:
67 */
68 case NVME_SC_WRITE_FAULT:
69 case NVME_SC_READ_ERROR:
70 case NVME_SC_GUARD_CHECK:
71 case NVME_SC_APPTAG_CHECK:
72 case NVME_SC_REFTAG_CHECK:
73 case NVME_SC_COMPARE_FAILED:
74 case NVME_SC_ACCESS_DENIED:
75 case NVME_SC_UNWRITTEN_BLOCK:
76 return false;
77 }
78
79 /* Everything else could be a path failure, so should be retried */
80 return true;
81} 41}
82 42
83void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 43void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
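
The per-status switch in nvme_req_needs_failover() collapses into a single blk_path_error() check on the blk_status_t that nvme_complete_rq() already computed via nvme_error_status(). blk_path_error() is introduced elsewhere in this series (include/linux/blk_types.h); reconstructed from memory and therefore only a sketch rather than an authoritative quote, it amounts to:

/* Rough reconstruction of blk_path_error() as used above; treat this as a
 * sketch, not the definitive header contents. */
static inline bool blk_path_error(blk_status_t error)
{
	switch (error) {
	case BLK_STS_NOTSUPP:
	case BLK_STS_NOSPC:
	case BLK_STS_TARGET:
	case BLK_STS_NEXUS:
	case BLK_STS_MEDIUM:
	case BLK_STS_PROTECTION:
		return false;
	}

	/* anything else could be a path failure, so retry on another path */
	return true;
}
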
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a00eabd06427..8e4550fa08f8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -32,6 +32,8 @@ extern unsigned int admin_timeout;
32#define NVME_KATO_GRACE 10 32#define NVME_KATO_GRACE 10
33 33
34extern struct workqueue_struct *nvme_wq; 34extern struct workqueue_struct *nvme_wq;
35extern struct workqueue_struct *nvme_reset_wq;
36extern struct workqueue_struct *nvme_delete_wq;
35 37
36enum { 38enum {
37 NVME_NS_LBA = 0, 39 NVME_NS_LBA = 0,
@@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
119enum nvme_ctrl_state { 121enum nvme_ctrl_state {
120 NVME_CTRL_NEW, 122 NVME_CTRL_NEW,
121 NVME_CTRL_LIVE, 123 NVME_CTRL_LIVE,
124 NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
122 NVME_CTRL_RESETTING, 125 NVME_CTRL_RESETTING,
123 NVME_CTRL_RECONNECTING, 126 NVME_CTRL_RECONNECTING,
124 NVME_CTRL_DELETING, 127 NVME_CTRL_DELETING,
@@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
393void nvme_start_keep_alive(struct nvme_ctrl *ctrl); 396void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
394void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 397void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
395int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 398int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
399int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
396int nvme_delete_ctrl(struct nvme_ctrl *ctrl); 400int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
397int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); 401int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
398 402
@@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
401 405
402#ifdef CONFIG_NVME_MULTIPATH 406#ifdef CONFIG_NVME_MULTIPATH
403void nvme_failover_req(struct request *req); 407void nvme_failover_req(struct request *req);
404bool nvme_req_needs_failover(struct request *req); 408bool nvme_req_needs_failover(struct request *req, blk_status_t error);
405void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); 409void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
406int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); 410int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
407void nvme_mpath_add_disk(struct nvme_ns_head *head); 411void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
430static inline void nvme_failover_req(struct request *req) 434static inline void nvme_failover_req(struct request *req)
431{ 435{
432} 436}
433static inline bool nvme_req_needs_failover(struct request *req) 437static inline bool nvme_req_needs_failover(struct request *req,
438 blk_status_t error)
434{ 439{
435 return false; 440 return false;
436} 441}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4276ebfff22b..6fe7af00a1f4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
75 * Represents an NVM Express device. Each nvme_dev is a PCI function. 75 * Represents an NVM Express device. Each nvme_dev is a PCI function.
76 */ 76 */
77struct nvme_dev { 77struct nvme_dev {
78 struct nvme_queue **queues; 78 struct nvme_queue *queues;
79 struct blk_mq_tag_set tagset; 79 struct blk_mq_tag_set tagset;
80 struct blk_mq_tag_set admin_tagset; 80 struct blk_mq_tag_set admin_tagset;
81 u32 __iomem *dbs; 81 u32 __iomem *dbs;
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
365 unsigned int hctx_idx) 365 unsigned int hctx_idx)
366{ 366{
367 struct nvme_dev *dev = data; 367 struct nvme_dev *dev = data;
368 struct nvme_queue *nvmeq = dev->queues[0]; 368 struct nvme_queue *nvmeq = &dev->queues[0];
369 369
370 WARN_ON(hctx_idx != 0); 370 WARN_ON(hctx_idx != 0);
371 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); 371 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
387 unsigned int hctx_idx) 387 unsigned int hctx_idx)
388{ 388{
389 struct nvme_dev *dev = data; 389 struct nvme_dev *dev = data;
390 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 390 struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
391 391
392 if (!nvmeq->tags) 392 if (!nvmeq->tags)
393 nvmeq->tags = &dev->tagset.tags[hctx_idx]; 393 nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
403 struct nvme_dev *dev = set->driver_data; 403 struct nvme_dev *dev = set->driver_data;
404 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 404 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
405 int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; 405 int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
406 struct nvme_queue *nvmeq = dev->queues[queue_idx]; 406 struct nvme_queue *nvmeq = &dev->queues[queue_idx];
407 407
408 BUG_ON(!nvmeq); 408 BUG_ON(!nvmeq);
409 iod->nvmeq = nvmeq; 409 iod->nvmeq = nvmeq;
@@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1044static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) 1044static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
1045{ 1045{
1046 struct nvme_dev *dev = to_nvme_dev(ctrl); 1046 struct nvme_dev *dev = to_nvme_dev(ctrl);
1047 struct nvme_queue *nvmeq = dev->queues[0]; 1047 struct nvme_queue *nvmeq = &dev->queues[0];
1048 struct nvme_command c; 1048 struct nvme_command c;
1049 1049
1050 memset(&c, 0, sizeof(c)); 1050 memset(&c, 0, sizeof(c));
@@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
1138 */ 1138 */
1139 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); 1139 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
1140 1140
1141 /* If there is a reset ongoing, we shouldn't reset again. */ 1141 /* If there is a reset/reinit ongoing, we shouldn't reset again. */
1142 if (dev->ctrl.state == NVME_CTRL_RESETTING) 1142 switch (dev->ctrl.state) {
1143 case NVME_CTRL_RESETTING:
1144 case NVME_CTRL_RECONNECTING:
1143 return false; 1145 return false;
1146 default:
1147 break;
1148 }
1144 1149
1145 /* We shouldn't reset unless the controller is on fatal error state 1150 /* We shouldn't reset unless the controller is on fatal error state
1146 * _or_ if we lost the communication with it. 1151 * _or_ if we lost the communication with it.
@@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
1280 if (nvmeq->sq_cmds) 1285 if (nvmeq->sq_cmds)
1281 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1286 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1282 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1287 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1283 kfree(nvmeq);
1284} 1288}
1285 1289
1286static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1290static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1288 int i; 1292 int i;
1289 1293
1290 for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { 1294 for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
1291 struct nvme_queue *nvmeq = dev->queues[i];
1292 dev->ctrl.queue_count--; 1295 dev->ctrl.queue_count--;
1293 dev->queues[i] = NULL; 1296 nvme_free_queue(&dev->queues[i]);
1294 nvme_free_queue(nvmeq);
1295 } 1297 }
1296} 1298}
1297 1299
@@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1323 1325
1324static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) 1326static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
1325{ 1327{
1326 struct nvme_queue *nvmeq = dev->queues[0]; 1328 struct nvme_queue *nvmeq = &dev->queues[0];
1327
1328 if (!nvmeq)
1329 return;
1330 if (nvme_suspend_queue(nvmeq))
1331 return;
1332 1329
1333 if (shutdown) 1330 if (shutdown)
1334 nvme_shutdown_ctrl(&dev->ctrl); 1331 nvme_shutdown_ctrl(&dev->ctrl);
@@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1367static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1364static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1368 int qid, int depth) 1365 int qid, int depth)
1369{ 1366{
1370 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { 1367 if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1371 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), 1368 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
1372 dev->ctrl.page_size); 1369 dev->ctrl.page_size);
1373 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; 1370 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
@@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1382 return 0; 1379 return 0;
1383} 1380}
1384 1381
1385static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1382static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
1386 int depth, int node) 1383 int depth, int node)
1387{ 1384{
1388 struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, 1385 struct nvme_queue *nvmeq = &dev->queues[qid];
1389 node); 1386
1390 if (!nvmeq) 1387 if (dev->ctrl.queue_count > qid)
1391 return NULL; 1388 return 0;
1392 1389
1393 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), 1390 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
1394 &nvmeq->cq_dma_addr, GFP_KERNEL); 1391 &nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1407 nvmeq->q_depth = depth; 1404 nvmeq->q_depth = depth;
1408 nvmeq->qid = qid; 1405 nvmeq->qid = qid;
1409 nvmeq->cq_vector = -1; 1406 nvmeq->cq_vector = -1;
1410 dev->queues[qid] = nvmeq;
1411 dev->ctrl.queue_count++; 1407 dev->ctrl.queue_count++;
1412 1408
1413 return nvmeq; 1409 return 0;
1414 1410
1415 free_cqdma: 1411 free_cqdma:
1416 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1412 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1417 nvmeq->cq_dma_addr); 1413 nvmeq->cq_dma_addr);
1418 free_nvmeq: 1414 free_nvmeq:
1419 kfree(nvmeq); 1415 return -ENOMEM;
1420 return NULL;
1421} 1416}
1422 1417
1423static int queue_request_irq(struct nvme_queue *nvmeq) 1418static int queue_request_irq(struct nvme_queue *nvmeq)
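
The pci.c hunks above and below replace the array of nvme_queue pointers with one contiguous array of struct nvme_queue inside struct nvme_dev: nvme_alloc_queue() now returns 0/-ENOMEM instead of a pointer, nvme_free_queue() no longer kfree()s anything, and every dev->queues[qid] access becomes &dev->queues[qid]. A standalone toy program contrasting the two layouts (hypothetical struct; the real one is struct nvme_queue):

/* Standalone toy illustrating the data-layout change: an array of pointers
 * with one allocation per element, versus one contiguous array of structs. */
#include <stdio.h>
#include <stdlib.h>

struct toy_queue { int qid; int depth; };

int main(void)
{
	int nr = 4, i;

	/* before: nr + 1 allocations, and a pointer chase on every access */
	struct toy_queue **ptrs = calloc(nr, sizeof(*ptrs));
	for (i = 0; i < nr; i++)
		ptrs[i] = calloc(1, sizeof(**ptrs));

	/* after: one allocation up front, queues addressed as &queues[i] */
	struct toy_queue *queues = calloc(nr, sizeof(*queues));
	queues[2].qid = 2;
	printf("qid of queue 2: %d\n", (&queues[2])->qid);

	for (i = 0; i < nr; i++)
		free(ptrs[i]);
	free(ptrs);
	free(queues);
	return 0;
}
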
@@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
1590 if (result < 0) 1585 if (result < 0)
1591 return result; 1586 return result;
1592 1587
1593 nvmeq = dev->queues[0]; 1588 result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
1594 if (!nvmeq) { 1589 dev_to_node(dev->dev));
1595 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 1590 if (result)
1596 dev_to_node(dev->dev)); 1591 return result;
1597 if (!nvmeq)
1598 return -ENOMEM;
1599 }
1600 1592
1593 nvmeq = &dev->queues[0];
1601 aqa = nvmeq->q_depth - 1; 1594 aqa = nvmeq->q_depth - 1;
1602 aqa |= aqa << 16; 1595 aqa |= aqa << 16;
1603 1596
@@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
1627 1620
1628 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 1621 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
1629 /* vector == qid - 1, match nvme_create_queue */ 1622 /* vector == qid - 1, match nvme_create_queue */
1630 if (!nvme_alloc_queue(dev, i, dev->q_depth, 1623 if (nvme_alloc_queue(dev, i, dev->q_depth,
1631 pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { 1624 pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
1632 ret = -ENOMEM; 1625 ret = -ENOMEM;
1633 break; 1626 break;
@@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
1636 1629
1637 max = min(dev->max_qid, dev->ctrl.queue_count - 1); 1630 max = min(dev->max_qid, dev->ctrl.queue_count - 1);
1638 for (i = dev->online_queues; i <= max; i++) { 1631 for (i = dev->online_queues; i <= max; i++) {
1639 ret = nvme_create_queue(dev->queues[i], i); 1632 ret = nvme_create_queue(&dev->queues[i], i);
1640 if (ret) 1633 if (ret)
1641 break; 1634 break;
1642 } 1635 }
1643 1636
1644 /* 1637 /*
1645 * Ignore failing Create SQ/CQ commands, we can continue with less 1638 * Ignore failing Create SQ/CQ commands, we can continue with less
1646 * than the desired aount of queues, and even a controller without 1639 * than the desired amount of queues, and even a controller without
1647 * I/O queues an still be used to issue admin commands. This might 1640 * I/O queues can still be used to issue admin commands. This might
1648 * be useful to upgrade a buggy firmware for example. 1641 * be useful to upgrade a buggy firmware for example.
1649 */ 1642 */
1650 return ret >= 0 ? 0 : ret; 1643 return ret >= 0 ? 0 : ret;
@@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev,
1661} 1654}
1662static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); 1655static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
1663 1656
1664static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 1657static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
1665{ 1658{
1666 u64 szu, size, offset; 1659 u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
1660
1661 return 1ULL << (12 + 4 * szu);
1662}
1663
1664static u32 nvme_cmb_size(struct nvme_dev *dev)
1665{
1666 return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
1667}
1668
1669static void nvme_map_cmb(struct nvme_dev *dev)
1670{
1671 u64 size, offset;
1667 resource_size_t bar_size; 1672 resource_size_t bar_size;
1668 struct pci_dev *pdev = to_pci_dev(dev->dev); 1673 struct pci_dev *pdev = to_pci_dev(dev->dev);
1669 void __iomem *cmb;
1670 int bar; 1674 int bar;
1671 1675
1672 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); 1676 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
1673 if (!(NVME_CMB_SZ(dev->cmbsz))) 1677 if (!dev->cmbsz)
1674 return NULL; 1678 return;
1675 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 1679 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1676 1680
1677 if (!use_cmb_sqes) 1681 if (!use_cmb_sqes)
1678 return NULL; 1682 return;
1679 1683
1680 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 1684 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
1681 size = szu * NVME_CMB_SZ(dev->cmbsz); 1685 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1682 offset = szu * NVME_CMB_OFST(dev->cmbloc);
1683 bar = NVME_CMB_BIR(dev->cmbloc); 1686 bar = NVME_CMB_BIR(dev->cmbloc);
1684 bar_size = pci_resource_len(pdev, bar); 1687 bar_size = pci_resource_len(pdev, bar);
1685 1688
1686 if (offset > bar_size) 1689 if (offset > bar_size)
1687 return NULL; 1690 return;
1688 1691
1689 /* 1692 /*
1690 * Controllers may support a CMB size larger than their BAR, 1693 * Controllers may support a CMB size larger than their BAR,
@@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
1694 if (size > bar_size - offset) 1697 if (size > bar_size - offset)
1695 size = bar_size - offset; 1698 size = bar_size - offset;
1696 1699
1697 cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); 1700 dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
1698 if (!cmb) 1701 if (!dev->cmb)
1699 return NULL; 1702 return;
1700
1701 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; 1703 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
1702 dev->cmb_size = size; 1704 dev->cmb_size = size;
1703 return cmb; 1705
1706 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
1707 &dev_attr_cmb.attr, NULL))
1708 dev_warn(dev->ctrl.device,
1709 "failed to add sysfs attribute for CMB\n");
1704} 1710}
1705 1711
1706static inline void nvme_release_cmb(struct nvme_dev *dev) 1712static inline void nvme_release_cmb(struct nvme_dev *dev)
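
nvme_map_cmb() now decodes the CMB registers through two small helpers: CMBSZ.SZU selects a unit of 2^(12 + 4*SZU) bytes, CMBSZ.SZ counts how many of those units the controller exposes, and CMBLOC.OFST is expressed in the same unit. A standalone arithmetic example with made-up register fields:

/* Standalone arithmetic example for the CMB decoding above; the register
 * values are made up, the formulas mirror nvme_cmb_size_unit()/nvme_cmb_size(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned szu  = 2;	/* CMBSZ.SZU: size unit selector */
	unsigned sz   = 16;	/* CMBSZ.SZ: number of units */
	unsigned ofst = 4;	/* CMBLOC.OFST: offset in the same unit */

	uint64_t unit   = 1ULL << (12 + 4 * szu);	/* 2^20 = 1 MiB */
	uint64_t size   = unit * sz;			/* 16 MiB of CMB */
	uint64_t offset = unit * ofst;			/* 4 MiB into the BAR */

	printf("unit=%llu size=%llu offset=%llu\n",
	       (unsigned long long)unit,
	       (unsigned long long)size,
	       (unsigned long long)offset);
	return 0;
}
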
@@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
1768 dma_addr_t descs_dma; 1774 dma_addr_t descs_dma;
1769 int i = 0; 1775 int i = 0;
1770 void **bufs; 1776 void **bufs;
1771 u64 size = 0, tmp; 1777 u64 size, tmp;
1772 1778
1773 tmp = (preferred + chunk_size - 1); 1779 tmp = (preferred + chunk_size - 1);
1774 do_div(tmp, chunk_size); 1780 do_div(tmp, chunk_size);
@@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
1851 u64 preferred = (u64)dev->ctrl.hmpre * 4096; 1857 u64 preferred = (u64)dev->ctrl.hmpre * 4096;
1852 u64 min = (u64)dev->ctrl.hmmin * 4096; 1858 u64 min = (u64)dev->ctrl.hmmin * 4096;
1853 u32 enable_bits = NVME_HOST_MEM_ENABLE; 1859 u32 enable_bits = NVME_HOST_MEM_ENABLE;
1854 int ret = 0; 1860 int ret;
1855 1861
1856 preferred = min(preferred, max); 1862 preferred = min(preferred, max);
1857 if (min > max) { 1863 if (min > max) {
@@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
1892 1898
1893static int nvme_setup_io_queues(struct nvme_dev *dev) 1899static int nvme_setup_io_queues(struct nvme_dev *dev)
1894{ 1900{
1895 struct nvme_queue *adminq = dev->queues[0]; 1901 struct nvme_queue *adminq = &dev->queues[0];
1896 struct pci_dev *pdev = to_pci_dev(dev->dev); 1902 struct pci_dev *pdev = to_pci_dev(dev->dev);
1897 int result, nr_io_queues; 1903 int result, nr_io_queues;
1898 unsigned long size; 1904 unsigned long size;
@@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1905 if (nr_io_queues == 0) 1911 if (nr_io_queues == 0)
1906 return 0; 1912 return 0;
1907 1913
1908 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 1914 if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1909 result = nvme_cmb_qdepth(dev, nr_io_queues, 1915 result = nvme_cmb_qdepth(dev, nr_io_queues,
1910 sizeof(struct nvme_command)); 1916 sizeof(struct nvme_command));
1911 if (result > 0) 1917 if (result > 0)
@@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2005 return 0; 2011 return 0;
2006} 2012}
2007 2013
2008static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) 2014static void nvme_disable_io_queues(struct nvme_dev *dev)
2009{ 2015{
2010 int pass; 2016 int pass, queues = dev->online_queues - 1;
2011 unsigned long timeout; 2017 unsigned long timeout;
2012 u8 opcode = nvme_admin_delete_sq; 2018 u8 opcode = nvme_admin_delete_sq;
2013 2019
@@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
2018 retry: 2024 retry:
2019 timeout = ADMIN_TIMEOUT; 2025 timeout = ADMIN_TIMEOUT;
2020 for (; i > 0; i--, sent++) 2026 for (; i > 0; i--, sent++)
2021 if (nvme_delete_queue(dev->queues[i], opcode)) 2027 if (nvme_delete_queue(&dev->queues[i], opcode))
2022 break; 2028 break;
2023 2029
2024 while (sent--) { 2030 while (sent--) {
@@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
2033} 2039}
2034 2040
2035/* 2041/*
2036 * Return: error value if an error occurred setting up the queues or calling 2042 * return error value only when tagset allocation failed
2037 * Identify Device. 0 if these succeeded, even if adding some of the
2038 * namespaces failed. At the moment, these failures are silent. TBD which
2039 * failures should be reported.
2040 */ 2043 */
2041static int nvme_dev_add(struct nvme_dev *dev) 2044static int nvme_dev_add(struct nvme_dev *dev)
2042{ 2045{
2046 int ret;
2047
2043 if (!dev->ctrl.tagset) { 2048 if (!dev->ctrl.tagset) {
2044 dev->tagset.ops = &nvme_mq_ops; 2049 dev->tagset.ops = &nvme_mq_ops;
2045 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2050 dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
2055 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2060 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2056 dev->tagset.driver_data = dev; 2061 dev->tagset.driver_data = dev;
2057 2062
2058 if (blk_mq_alloc_tag_set(&dev->tagset)) 2063 ret = blk_mq_alloc_tag_set(&dev->tagset);
2059 return 0; 2064 if (ret) {
2065 dev_warn(dev->ctrl.device,
2066 "IO queues tagset allocation failed %d\n", ret);
2067 return ret;
2068 }
2060 dev->ctrl.tagset = &dev->tagset; 2069 dev->ctrl.tagset = &dev->tagset;
2061 2070
2062 nvme_dbbuf_set(dev); 2071 nvme_dbbuf_set(dev);
@@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
2122 "set queue depth=%u\n", dev->q_depth); 2131 "set queue depth=%u\n", dev->q_depth);
2123 } 2132 }
2124 2133
2125 /* 2134 nvme_map_cmb(dev);
2126 * CMBs can currently only exist on >=1.2 PCIe devices. We only
2127 * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
2128 * has no name we can pass NULL as final argument to
2129 * sysfs_add_file_to_group.
2130 */
2131
2132 if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
2133 dev->cmb = nvme_map_cmb(dev);
2134 if (dev->cmb) {
2135 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
2136 &dev_attr_cmb.attr, NULL))
2137 dev_warn(dev->ctrl.device,
2138 "failed to add sysfs attribute for CMB\n");
2139 }
2140 }
2141 2135
2142 pci_enable_pcie_error_reporting(pdev); 2136 pci_enable_pcie_error_reporting(pdev);
2143 pci_save_state(pdev); 2137 pci_save_state(pdev);
@@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
2170 2164
2171static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 2165static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2172{ 2166{
2173 int i, queues; 2167 int i;
2174 bool dead = true; 2168 bool dead = true;
2175 struct pci_dev *pdev = to_pci_dev(dev->dev); 2169 struct pci_dev *pdev = to_pci_dev(dev->dev);
2176 2170
@@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2205 } 2199 }
2206 nvme_stop_queues(&dev->ctrl); 2200 nvme_stop_queues(&dev->ctrl);
2207 2201
2208 queues = dev->online_queues - 1; 2202 if (!dead) {
2209 for (i = dev->ctrl.queue_count - 1; i > 0; i--) 2203 nvme_disable_io_queues(dev);
2210 nvme_suspend_queue(dev->queues[i]);
2211
2212 if (dead) {
2213 /* A device might become IO incapable very soon during
2214 * probe, before the admin queue is configured. Thus,
2215 * queue_count can be 0 here.
2216 */
2217 if (dev->ctrl.queue_count)
2218 nvme_suspend_queue(dev->queues[0]);
2219 } else {
2220 nvme_disable_io_queues(dev, queues);
2221 nvme_disable_admin_queue(dev, shutdown); 2204 nvme_disable_admin_queue(dev, shutdown);
2222 } 2205 }
2206 for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
2207 nvme_suspend_queue(&dev->queues[i]);
2208
2223 nvme_pci_disable(dev); 2209 nvme_pci_disable(dev);
2224 2210
2225 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); 2211 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work)
2289 container_of(work, struct nvme_dev, ctrl.reset_work); 2275 container_of(work, struct nvme_dev, ctrl.reset_work);
2290 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); 2276 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2291 int result = -ENODEV; 2277 int result = -ENODEV;
2278 enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
2292 2279
2293 if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) 2280 if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
2294 goto out; 2281 goto out;
@@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work)
2300 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) 2287 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2301 nvme_dev_disable(dev, false); 2288 nvme_dev_disable(dev, false);
2302 2289
2290 /*
2291 * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
2292 * initializing procedure here.
2293 */
2294 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
2295 dev_warn(dev->ctrl.device,
2296 "failed to mark controller RECONNECTING\n");
2297 goto out;
2298 }
2299
2303 result = nvme_pci_enable(dev); 2300 result = nvme_pci_enable(dev);
2304 if (result) 2301 if (result)
2305 goto out; 2302 goto out;
@@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work)
2352 dev_warn(dev->ctrl.device, "IO queues not created\n"); 2349 dev_warn(dev->ctrl.device, "IO queues not created\n");
2353 nvme_kill_queues(&dev->ctrl); 2350 nvme_kill_queues(&dev->ctrl);
2354 nvme_remove_namespaces(&dev->ctrl); 2351 nvme_remove_namespaces(&dev->ctrl);
2352 new_state = NVME_CTRL_ADMIN_ONLY;
2355 } else { 2353 } else {
2356 nvme_start_queues(&dev->ctrl); 2354 nvme_start_queues(&dev->ctrl);
2357 nvme_wait_freeze(&dev->ctrl); 2355 nvme_wait_freeze(&dev->ctrl);
2358 nvme_dev_add(dev); 2356 /* hit this only when allocate tagset fails */
2357 if (nvme_dev_add(dev))
2358 new_state = NVME_CTRL_ADMIN_ONLY;
2359 nvme_unfreeze(&dev->ctrl); 2359 nvme_unfreeze(&dev->ctrl);
2360 } 2360 }
2361 2361
2362 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { 2362 /*
2363 dev_warn(dev->ctrl.device, "failed to mark controller live\n"); 2363 * If only admin queue live, keep it to do further investigation or
2364 * recovery.
2365 */
2366 if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
2367 dev_warn(dev->ctrl.device,
2368 "failed to mark controller state %d\n", new_state);
2364 goto out; 2369 goto out;
2365 } 2370 }
2366 2371
@@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2468 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 2473 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
2469 if (!dev) 2474 if (!dev)
2470 return -ENOMEM; 2475 return -ENOMEM;
2471 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), 2476
2472 GFP_KERNEL, node); 2477 dev->queues = kcalloc_node(num_possible_cpus() + 1,
2478 sizeof(struct nvme_queue), GFP_KERNEL, node);
2473 if (!dev->queues) 2479 if (!dev->queues)
2474 goto free; 2480 goto free;
2475 2481
@@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2496 if (result) 2502 if (result)
2497 goto release_pools; 2503 goto release_pools;
2498 2504
2499 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
2500 dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); 2505 dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
2501 2506
2502 queue_work(nvme_wq, &dev->ctrl.reset_work); 2507 nvme_reset_ctrl(&dev->ctrl);
2508
2503 return 0; 2509 return 0;
2504 2510
2505 release_pools: 2511 release_pools:
@@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev)
2523static void nvme_reset_done(struct pci_dev *pdev) 2529static void nvme_reset_done(struct pci_dev *pdev)
2524{ 2530{
2525 struct nvme_dev *dev = pci_get_drvdata(pdev); 2531 struct nvme_dev *dev = pci_get_drvdata(pdev);
2526 nvme_reset_ctrl(&dev->ctrl); 2532 nvme_reset_ctrl_sync(&dev->ctrl);
2527} 2533}
2528 2534
2529static void nvme_shutdown(struct pci_dev *pdev) 2535static void nvme_shutdown(struct pci_dev *pdev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2a0bba7f50cf..2bc059f7d73c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -66,7 +66,6 @@ struct nvme_rdma_request {
66 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; 66 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
67 u32 num_sge; 67 u32 num_sge;
68 int nents; 68 int nents;
69 bool inline_data;
70 struct ib_reg_wr reg_wr; 69 struct ib_reg_wr reg_wr;
71 struct ib_cqe reg_cqe; 70 struct ib_cqe reg_cqe;
72 struct nvme_rdma_queue *queue; 71 struct nvme_rdma_queue *queue;
@@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1092 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); 1091 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
1093 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; 1092 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1094 1093
1095 req->inline_data = true;
1096 req->num_sge++; 1094 req->num_sge++;
1097 return 0; 1095 return 0;
1098} 1096}
@@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1164 int count, ret; 1162 int count, ret;
1165 1163
1166 req->num_sge = 1; 1164 req->num_sge = 1;
1167 req->inline_data = false;
1168 refcount_set(&req->ref, 2); /* send and recv completions */ 1165 refcount_set(&req->ref, 2); /* send and recv completions */
1169 1166
1170 c->common.flags |= NVME_CMD_SGL_METABUF; 1167 c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2018,6 +2015,7 @@ out_free_ctrl:
2018 2015
2019static struct nvmf_transport_ops nvme_rdma_transport = { 2016static struct nvmf_transport_ops nvme_rdma_transport = {
2020 .name = "rdma", 2017 .name = "rdma",
2018 .module = THIS_MODULE,
2021 .required_opts = NVMF_OPT_TRADDR, 2019 .required_opts = NVMF_OPT_TRADDR,
2022 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2020 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2023 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, 2021 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2040 } 2038 }
2041 mutex_unlock(&nvme_rdma_ctrl_mutex); 2039 mutex_unlock(&nvme_rdma_ctrl_mutex);
2042 2040
2043 flush_workqueue(nvme_wq); 2041 flush_workqueue(nvme_delete_wq);
2044} 2042}
2045 2043
2046static struct ib_client nvme_rdma_ib_client = { 2044static struct ib_client nvme_rdma_ib_client = {
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
new file mode 100644
index 000000000000..41944bbef835
--- /dev/null
+++ b/drivers/nvme/host/trace.c
@@ -0,0 +1,130 @@
1/*
2 * NVM Express device driver tracepoints
3 * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <asm/unaligned.h>
16#include "trace.h"
17
18static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
19{
20 const char *ret = trace_seq_buffer_ptr(p);
21 u16 sqid = get_unaligned_le16(cdw10);
22 u16 qsize = get_unaligned_le16(cdw10 + 2);
23 u16 sq_flags = get_unaligned_le16(cdw10 + 4);
24 u16 cqid = get_unaligned_le16(cdw10 + 6);
25
26
27 trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u",
28 sqid, qsize, sq_flags, cqid);
29 trace_seq_putc(p, 0);
30
31 return ret;
32}
33
34static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
35{
36 const char *ret = trace_seq_buffer_ptr(p);
37 u16 cqid = get_unaligned_le16(cdw10);
38 u16 qsize = get_unaligned_le16(cdw10 + 2);
39 u16 cq_flags = get_unaligned_le16(cdw10 + 4);
40 u16 irq_vector = get_unaligned_le16(cdw10 + 6);
41
42 trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u",
43 cqid, qsize, cq_flags, irq_vector);
44 trace_seq_putc(p, 0);
45
46 return ret;
47}
48
49static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
50{
51 const char *ret = trace_seq_buffer_ptr(p);
52 u8 cns = cdw10[0];
53 u16 ctrlid = get_unaligned_le16(cdw10 + 2);
54
55 trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid);
56 trace_seq_putc(p, 0);
57
58 return ret;
59}
60
61
62
63static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
64{
65 const char *ret = trace_seq_buffer_ptr(p);
66 u64 slba = get_unaligned_le64(cdw10);
67 u16 length = get_unaligned_le16(cdw10 + 8);
68 u16 control = get_unaligned_le16(cdw10 + 10);
69 u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
70 u32 reftag = get_unaligned_le32(cdw10 + 16);
71
72 trace_seq_printf(p,
73 "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
74 slba, length, control, dsmgmt, reftag);
75 trace_seq_putc(p, 0);
76
77 return ret;
78}
79
80static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
81{
82 const char *ret = trace_seq_buffer_ptr(p);
83
84 trace_seq_printf(p, "nr=%u, attributes=%u",
85 get_unaligned_le32(cdw10),
86 get_unaligned_le32(cdw10 + 4));
87 trace_seq_putc(p, 0);
88
89 return ret;
90}
91
92static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
93{
94 const char *ret = trace_seq_buffer_ptr(p);
95
96 trace_seq_printf(p, "cdw10=%*ph", 24, cdw10);
97 trace_seq_putc(p, 0);
98
99 return ret;
100}
101
102const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
103 u8 opcode, u8 *cdw10)
104{
105 switch (opcode) {
106 case nvme_admin_create_sq:
107 return nvme_trace_create_sq(p, cdw10);
108 case nvme_admin_create_cq:
109 return nvme_trace_create_cq(p, cdw10);
110 case nvme_admin_identify:
111 return nvme_trace_admin_identify(p, cdw10);
112 default:
113 return nvme_trace_common(p, cdw10);
114 }
115}
116
117const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
118 u8 opcode, u8 *cdw10)
119{
120 switch (opcode) {
121 case nvme_cmd_read:
122 case nvme_cmd_write:
123 case nvme_cmd_write_zeroes:
124 return nvme_trace_read_write(p, cdw10);
125 case nvme_cmd_dsm:
126 return nvme_trace_dsm(p, cdw10);
127 default:
128 return nvme_trace_common(p, cdw10);
129 }
130}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
new file mode 100644
index 000000000000..ea91fccd1bc0
--- /dev/null
+++ b/drivers/nvme/host/trace.h
@@ -0,0 +1,165 @@
1/*
2 * NVM Express device driver tracepoints
3 * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#undef TRACE_SYSTEM
16#define TRACE_SYSTEM nvme
17
18#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
19#define _TRACE_NVME_H
20
21#include <linux/nvme.h>
22#include <linux/tracepoint.h>
23#include <linux/trace_seq.h>
24
25#include "nvme.h"
26
27#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
28#define show_admin_opcode_name(val) \
29 __print_symbolic(val, \
30 nvme_admin_opcode_name(nvme_admin_delete_sq), \
31 nvme_admin_opcode_name(nvme_admin_create_sq), \
32 nvme_admin_opcode_name(nvme_admin_get_log_page), \
33 nvme_admin_opcode_name(nvme_admin_delete_cq), \
34 nvme_admin_opcode_name(nvme_admin_create_cq), \
35 nvme_admin_opcode_name(nvme_admin_identify), \
36 nvme_admin_opcode_name(nvme_admin_abort_cmd), \
37 nvme_admin_opcode_name(nvme_admin_set_features), \
38 nvme_admin_opcode_name(nvme_admin_get_features), \
39 nvme_admin_opcode_name(nvme_admin_async_event), \
40 nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
41 nvme_admin_opcode_name(nvme_admin_activate_fw), \
42 nvme_admin_opcode_name(nvme_admin_download_fw), \
43 nvme_admin_opcode_name(nvme_admin_ns_attach), \
44 nvme_admin_opcode_name(nvme_admin_keep_alive), \
45 nvme_admin_opcode_name(nvme_admin_directive_send), \
46 nvme_admin_opcode_name(nvme_admin_directive_recv), \
47 nvme_admin_opcode_name(nvme_admin_dbbuf), \
48 nvme_admin_opcode_name(nvme_admin_format_nvm), \
49 nvme_admin_opcode_name(nvme_admin_security_send), \
50 nvme_admin_opcode_name(nvme_admin_security_recv), \
51 nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
52
53const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
54 u8 *cdw10);
55#define __parse_nvme_admin_cmd(opcode, cdw10) \
56 nvme_trace_parse_admin_cmd(p, opcode, cdw10)
57
58#define nvme_opcode_name(opcode) { opcode, #opcode }
59#define show_opcode_name(val) \
60 __print_symbolic(val, \
61 nvme_opcode_name(nvme_cmd_flush), \
62 nvme_opcode_name(nvme_cmd_write), \
63 nvme_opcode_name(nvme_cmd_read), \
64 nvme_opcode_name(nvme_cmd_write_uncor), \
65 nvme_opcode_name(nvme_cmd_compare), \
66 nvme_opcode_name(nvme_cmd_write_zeroes), \
67 nvme_opcode_name(nvme_cmd_dsm), \
68 nvme_opcode_name(nvme_cmd_resv_register), \
69 nvme_opcode_name(nvme_cmd_resv_report), \
70 nvme_opcode_name(nvme_cmd_resv_acquire), \
71 nvme_opcode_name(nvme_cmd_resv_release))
72
73const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
74 u8 *cdw10);
75#define __parse_nvme_cmd(opcode, cdw10) \
76 nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
77
78TRACE_EVENT(nvme_setup_admin_cmd,
79 TP_PROTO(struct nvme_command *cmd),
80 TP_ARGS(cmd),
81 TP_STRUCT__entry(
82 __field(u8, opcode)
83 __field(u8, flags)
84 __field(u16, cid)
85 __field(u64, metadata)
86 __array(u8, cdw10, 24)
87 ),
88 TP_fast_assign(
89 __entry->opcode = cmd->common.opcode;
90 __entry->flags = cmd->common.flags;
91 __entry->cid = cmd->common.command_id;
92 __entry->metadata = le64_to_cpu(cmd->common.metadata);
93 memcpy(__entry->cdw10, cmd->common.cdw10,
94 sizeof(__entry->cdw10));
95 ),
96 TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
97 __entry->cid, __entry->flags, __entry->metadata,
98 show_admin_opcode_name(__entry->opcode),
99 __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
100);
101
102
103TRACE_EVENT(nvme_setup_nvm_cmd,
104 TP_PROTO(int qid, struct nvme_command *cmd),
105 TP_ARGS(qid, cmd),
106 TP_STRUCT__entry(
107 __field(int, qid)
108 __field(u8, opcode)
109 __field(u8, flags)
110 __field(u16, cid)
111 __field(u32, nsid)
112 __field(u64, metadata)
113 __array(u8, cdw10, 24)
114 ),
115 TP_fast_assign(
116 __entry->qid = qid;
117 __entry->opcode = cmd->common.opcode;
118 __entry->flags = cmd->common.flags;
119 __entry->cid = cmd->common.command_id;
120 __entry->nsid = le32_to_cpu(cmd->common.nsid);
121 __entry->metadata = le64_to_cpu(cmd->common.metadata);
122 memcpy(__entry->cdw10, cmd->common.cdw10,
123 sizeof(__entry->cdw10));
124 ),
125 TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
126 __entry->qid, __entry->nsid, __entry->cid,
127 __entry->flags, __entry->metadata,
128 show_opcode_name(__entry->opcode),
129 __parse_nvme_cmd(__entry->opcode, __entry->cdw10))
130);
131
132TRACE_EVENT(nvme_complete_rq,
133 TP_PROTO(struct request *req),
134 TP_ARGS(req),
135 TP_STRUCT__entry(
136 __field(int, qid)
137 __field(int, cid)
138 __field(u64, result)
139 __field(u8, retries)
140 __field(u8, flags)
141 __field(u16, status)
142 ),
143 TP_fast_assign(
144 __entry->qid = req->q->id;
145 __entry->cid = req->tag;
146 __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
147 __entry->retries = nvme_req(req)->retries;
148 __entry->flags = nvme_req(req)->flags;
149 __entry->status = nvme_req(req)->status;
150 ),
151 TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
152 __entry->cid, __entry->qid, __entry->result,
153 __entry->retries, __entry->flags, __entry->status)
154
155);
156
157#endif /* _TRACE_NVME_H */
158
159#undef TRACE_INCLUDE_PATH
160#define TRACE_INCLUDE_PATH .
161#undef TRACE_INCLUDE_FILE
162#define TRACE_INCLUDE_FILE trace
163
164/* This part must be outside protection */
165#include <trace/define_trace.h>
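Editor's note: the TRACE_EVENT() definitions above generate trace_nvme_setup_admin_cmd(), trace_nvme_setup_nvm_cmd() and trace_nvme_complete_rq() wrappers intended to be called from the command setup and completion paths of the host core. A minimal sketch of such call sites follows; the example_* functions and the is_admin parameter are illustrative only and are not the actual core.c hunks (which fall outside this section).

#include <linux/blkdev.h>
#include "trace.h"

/* Illustrative only: emit the setup event while a command is being built. */
static void example_nvme_trace_setup(struct request *req,
				     struct nvme_command *cmd, bool is_admin)
{
	if (is_admin)
		trace_nvme_setup_admin_cmd(cmd);
	else
		trace_nvme_setup_nvm_cmd(req->q->id, cmd);
}

/* Illustrative only: emit the completion event from the rq completion path. */
static void example_nvme_trace_complete(struct request *req)
{
	trace_nvme_complete_rq(req);
}

Once built in, the events appear under the "nvme" trace system in tracefs and can be enabled per event like any other tracepoint.
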
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 03e4ab65fe77..5f4f8b16685f 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -29,6 +29,7 @@ config NVME_TARGET_RDMA
29 tristate "NVMe over Fabrics RDMA target support" 29 tristate "NVMe over Fabrics RDMA target support"
30 depends on INFINIBAND 30 depends on INFINIBAND
31 depends on NVME_TARGET 31 depends on NVME_TARGET
32 select SGL_ALLOC
32 help 33 help
33 This enables the NVMe RDMA target support, which allows exporting NVMe 34 This enables the NVMe RDMA target support, which allows exporting NVMe
34 devices over RDMA. 35 devices over RDMA.
@@ -39,6 +40,7 @@ config NVME_TARGET_FC
39 tristate "NVMe over Fabrics FC target driver" 40 tristate "NVMe over Fabrics FC target driver"
40 depends on NVME_TARGET 41 depends on NVME_TARGET
41 depends on HAS_DMA 42 depends on HAS_DMA
43 select SGL_ALLOC
42 help 44 help
43 This enables the NVMe FC target support, which allows exporting NVMe 45 This enables the NVMe FC target support, which allows exporting NVMe
44 devices over FC. 46 devices over FC.
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b54748ad5f48..0bd737117a80 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
512 req->sg_cnt = 0; 512 req->sg_cnt = 0;
513 req->transfer_len = 0; 513 req->transfer_len = 0;
514 req->rsp->status = 0; 514 req->rsp->status = 0;
515 req->ns = NULL;
515 516
516 /* no support for fused commands yet */ 517 /* no support for fused commands yet */
517 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { 518 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
557void nvmet_req_uninit(struct nvmet_req *req) 558void nvmet_req_uninit(struct nvmet_req *req)
558{ 559{
559 percpu_ref_put(&req->sq->ref); 560 percpu_ref_put(&req->sq->ref);
561 if (req->ns)
562 nvmet_put_namespace(req->ns);
560} 563}
561EXPORT_SYMBOL_GPL(nvmet_req_uninit); 564EXPORT_SYMBOL_GPL(nvmet_req_uninit);
562 565
@@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
830 /* Don't accept keep-alive timeout for discovery controllers */ 833 /* Don't accept keep-alive timeout for discovery controllers */
831 if (kato) { 834 if (kato) {
832 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 835 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
833 goto out_free_sqs; 836 goto out_remove_ida;
834 } 837 }
835 838
836 /* 839 /*
@@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
860 *ctrlp = ctrl; 863 *ctrlp = ctrl;
861 return 0; 864 return 0;
862 865
866out_remove_ida:
867 ida_simple_remove(&cntlid_ida, ctrl->cntlid);
863out_free_sqs: 868out_free_sqs:
864 kfree(ctrl->sqs); 869 kfree(ctrl->sqs);
865out_free_cqs: 870out_free_cqs:
@@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref)
877 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); 882 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
878 struct nvmet_subsys *subsys = ctrl->subsys; 883 struct nvmet_subsys *subsys = ctrl->subsys;
879 884
880 nvmet_stop_keep_alive_timer(ctrl);
881
882 mutex_lock(&subsys->lock); 885 mutex_lock(&subsys->lock);
883 list_del(&ctrl->subsys_entry); 886 list_del(&ctrl->subsys_entry);
884 mutex_unlock(&subsys->lock); 887 mutex_unlock(&subsys->lock);
885 888
889 nvmet_stop_keep_alive_timer(ctrl);
890
886 flush_work(&ctrl->async_event_work); 891 flush_work(&ctrl->async_event_work);
887 cancel_work_sync(&ctrl->fatal_err_work); 892 cancel_work_sync(&ctrl->fatal_err_work);
888 893
889 ida_simple_remove(&cntlid_ida, ctrl->cntlid); 894 ida_simple_remove(&cntlid_ida, ctrl->cntlid);
890 nvmet_subsys_put(subsys);
891 895
892 kfree(ctrl->sqs); 896 kfree(ctrl->sqs);
893 kfree(ctrl->cqs); 897 kfree(ctrl->cqs);
894 kfree(ctrl); 898 kfree(ctrl);
899
900 nvmet_subsys_put(subsys);
895} 901}
896 902
897void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) 903void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index db3bf6b8bf9e..19e9e42ae943 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
225 goto out_ctrl_put; 225 goto out_ctrl_put;
226 } 226 }
227 227
228 pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); 228 pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
229 229
230out: 230out:
231 kfree(d); 231 kfree(d);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 5fd86039e353..9b39a6cb1935 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1697,31 +1697,12 @@ static int
1697nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) 1697nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1698{ 1698{
1699 struct scatterlist *sg; 1699 struct scatterlist *sg;
1700 struct page *page;
1701 unsigned int nent; 1700 unsigned int nent;
1702 u32 page_len, length;
1703 int i = 0;
1704 1701
1705 length = fod->req.transfer_len; 1702 sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent);
1706 nent = DIV_ROUND_UP(length, PAGE_SIZE);
1707 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
1708 if (!sg) 1703 if (!sg)
1709 goto out; 1704 goto out;
1710 1705
1711 sg_init_table(sg, nent);
1712
1713 while (length) {
1714 page_len = min_t(u32, length, PAGE_SIZE);
1715
1716 page = alloc_page(GFP_KERNEL);
1717 if (!page)
1718 goto out_free_pages;
1719
1720 sg_set_page(&sg[i], page, page_len, 0);
1721 length -= page_len;
1722 i++;
1723 }
1724
1725 fod->data_sg = sg; 1706 fod->data_sg = sg;
1726 fod->data_sg_cnt = nent; 1707 fod->data_sg_cnt = nent;
1727 fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, 1708 fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
@@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1731 1712
1732 return 0; 1713 return 0;
1733 1714
1734out_free_pages:
1735 while (i > 0) {
1736 i--;
1737 __free_page(sg_page(&sg[i]));
1738 }
1739 kfree(sg);
1740 fod->data_sg = NULL;
1741 fod->data_sg_cnt = 0;
1742out: 1715out:
1743 return NVME_SC_INTERNAL; 1716 return NVME_SC_INTERNAL;
1744} 1717}
@@ -1746,18 +1719,13 @@ out:
1746static void 1719static void
1747nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) 1720nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1748{ 1721{
1749 struct scatterlist *sg;
1750 int count;
1751
1752 if (!fod->data_sg || !fod->data_sg_cnt) 1722 if (!fod->data_sg || !fod->data_sg_cnt)
1753 return; 1723 return;
1754 1724
1755 fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, 1725 fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
1756 ((fod->io_dir == NVMET_FCP_WRITE) ? 1726 ((fod->io_dir == NVMET_FCP_WRITE) ?
1757 DMA_FROM_DEVICE : DMA_TO_DEVICE)); 1727 DMA_FROM_DEVICE : DMA_TO_DEVICE));
1758 for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) 1728 sgl_free(fod->data_sg);
1759 __free_page(sg_page(sg));
1760 kfree(fod->data_sg);
1761 fod->data_sg = NULL; 1729 fod->data_sg = NULL;
1762 fod->data_sg_cnt = 0; 1730 fod->data_sg_cnt = 0;
1763} 1731}
@@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
2522 list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { 2490 list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
2523 if ((tgtport->fc_target_port.node_name == traddr.nn) && 2491 if ((tgtport->fc_target_port.node_name == traddr.nn) &&
2524 (tgtport->fc_target_port.port_name == traddr.pn)) { 2492 (tgtport->fc_target_port.port_name == traddr.pn)) {
2525 /* a FC port can only be 1 nvmet port id */ 2493 tgtport->port = port;
2526 if (!tgtport->port) { 2494 ret = 0;
2527 tgtport->port = port;
2528 port->priv = tgtport;
2529 nvmet_fc_tgtport_get(tgtport);
2530 ret = 0;
2531 } else
2532 ret = -EALREADY;
2533 break; 2495 break;
2534 } 2496 }
2535 } 2497 }
@@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
2540static void 2502static void
2541nvmet_fc_remove_port(struct nvmet_port *port) 2503nvmet_fc_remove_port(struct nvmet_port *port)
2542{ 2504{
2543 struct nvmet_fc_tgtport *tgtport = port->priv; 2505 /* nothing to do */
2544 unsigned long flags;
2545 bool matched = false;
2546
2547 spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
2548 if (tgtport->port == port) {
2549 matched = true;
2550 tgtport->port = NULL;
2551 }
2552 spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
2553
2554 if (matched)
2555 nvmet_fc_tgtport_put(tgtport);
2556} 2506}
2557 2507
2558static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { 2508static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
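Editor's note: the hunks above drop the open-coded page-by-page scatterlist construction in favour of the sgl_alloc()/sgl_free() helpers from lib/scatterlist (pulled in by the SGL_ALLOC select added to the Kconfig earlier in this series). A minimal sketch of the resulting allocate/map/unmap/free pattern; the example_* wrappers are hypothetical and error handling is trimmed.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* Sketch only: build a page-backed SGL covering @len bytes and map it. */
static int example_map_buffer(struct device *dev, u32 len,
			      struct scatterlist **sgp, unsigned int *nentp)
{
	struct scatterlist *sg;
	unsigned int nent;

	sg = sgl_alloc(len, GFP_KERNEL, &nent);	/* replaces the alloc_page() loop */
	if (!sg)
		return -ENOMEM;

	if (!dma_map_sg(dev, sg, nent, DMA_FROM_DEVICE)) {
		sgl_free(sg);			/* frees the pages and the table */
		return -EIO;
	}

	*sgp = sg;
	*nentp = nent;
	return 0;
}

/* Sketch only: undo the above. */
static void example_unmap_buffer(struct device *dev, struct scatterlist *sg,
				 unsigned int nent)
{
	dma_unmap_sg(dev, sg, nent, DMA_FROM_DEVICE);
	sgl_free(sg);
}
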
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 6a018a0bd6ce..34712def81b1 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -204,6 +204,10 @@ struct fcloop_lport {
204 struct completion unreg_done; 204 struct completion unreg_done;
205}; 205};
206 206
207struct fcloop_lport_priv {
208 struct fcloop_lport *lport;
209};
210
207struct fcloop_rport { 211struct fcloop_rport {
208 struct nvme_fc_remote_port *remoteport; 212 struct nvme_fc_remote_port *remoteport;
209 struct nvmet_fc_target_port *targetport; 213 struct nvmet_fc_target_port *targetport;
@@ -238,21 +242,32 @@ struct fcloop_lsreq {
238 int status; 242 int status;
239}; 243};
240 244
245enum {
246 INI_IO_START = 0,
247 INI_IO_ACTIVE = 1,
248 INI_IO_ABORTED = 2,
249 INI_IO_COMPLETED = 3,
250};
251
241struct fcloop_fcpreq { 252struct fcloop_fcpreq {
242 struct fcloop_tport *tport; 253 struct fcloop_tport *tport;
243 struct nvmefc_fcp_req *fcpreq; 254 struct nvmefc_fcp_req *fcpreq;
244 spinlock_t reqlock; 255 spinlock_t reqlock;
245 u16 status; 256 u16 status;
257 u32 inistate;
246 bool active; 258 bool active;
247 bool aborted; 259 bool aborted;
248 struct work_struct work; 260 struct kref ref;
261 struct work_struct fcp_rcv_work;
262 struct work_struct abort_rcv_work;
263 struct work_struct tio_done_work;
249 struct nvmefc_tgt_fcp_req tgt_fcp_req; 264 struct nvmefc_tgt_fcp_req tgt_fcp_req;
250}; 265};
251 266
252struct fcloop_ini_fcpreq { 267struct fcloop_ini_fcpreq {
253 struct nvmefc_fcp_req *fcpreq; 268 struct nvmefc_fcp_req *fcpreq;
254 struct fcloop_fcpreq *tfcp_req; 269 struct fcloop_fcpreq *tfcp_req;
255 struct work_struct iniwork; 270 spinlock_t inilock;
256}; 271};
257 272
258static inline struct fcloop_lsreq * 273static inline struct fcloop_lsreq *
@@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
343 return 0; 358 return 0;
344} 359}
345 360
346/*
347 * FCP IO operation done by initiator abort.
348 * call back up initiator "done" flows.
349 */
350static void 361static void
351fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) 362fcloop_tfcp_req_free(struct kref *ref)
352{ 363{
353 struct fcloop_ini_fcpreq *inireq = 364 struct fcloop_fcpreq *tfcp_req =
354 container_of(work, struct fcloop_ini_fcpreq, iniwork); 365 container_of(ref, struct fcloop_fcpreq, ref);
366
367 kfree(tfcp_req);
368}
369
370static void
371fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
372{
373 kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
374}
375
376static int
377fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
378{
379 return kref_get_unless_zero(&tfcp_req->ref);
380}
381
382static void
383fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
384 struct fcloop_fcpreq *tfcp_req, int status)
385{
386 struct fcloop_ini_fcpreq *inireq = NULL;
387
388 if (fcpreq) {
389 inireq = fcpreq->private;
390 spin_lock(&inireq->inilock);
391 inireq->tfcp_req = NULL;
392 spin_unlock(&inireq->inilock);
393
394 fcpreq->status = status;
395 fcpreq->done(fcpreq);
396 }
397
398 /* release original io reference on tgt struct */
399 fcloop_tfcp_req_put(tfcp_req);
400}
401
402static void
403fcloop_fcp_recv_work(struct work_struct *work)
404{
405 struct fcloop_fcpreq *tfcp_req =
406 container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
407 struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
408 int ret = 0;
409 bool aborted = false;
410
411 spin_lock(&tfcp_req->reqlock);
412 switch (tfcp_req->inistate) {
413 case INI_IO_START:
414 tfcp_req->inistate = INI_IO_ACTIVE;
415 break;
416 case INI_IO_ABORTED:
417 aborted = true;
418 break;
419 default:
420 spin_unlock(&tfcp_req->reqlock);
421 WARN_ON(1);
422 return;
423 }
424 spin_unlock(&tfcp_req->reqlock);
425
426 if (unlikely(aborted))
427 ret = -ECANCELED;
428 else
429 ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
430 &tfcp_req->tgt_fcp_req,
431 fcpreq->cmdaddr, fcpreq->cmdlen);
432 if (ret)
433 fcloop_call_host_done(fcpreq, tfcp_req, ret);
434
435 return;
436}
437
438static void
439fcloop_fcp_abort_recv_work(struct work_struct *work)
440{
441 struct fcloop_fcpreq *tfcp_req =
442 container_of(work, struct fcloop_fcpreq, abort_rcv_work);
443 struct nvmefc_fcp_req *fcpreq;
444 bool completed = false;
445
446 spin_lock(&tfcp_req->reqlock);
447 fcpreq = tfcp_req->fcpreq;
448 switch (tfcp_req->inistate) {
449 case INI_IO_ABORTED:
450 break;
451 case INI_IO_COMPLETED:
452 completed = true;
453 break;
454 default:
455 spin_unlock(&tfcp_req->reqlock);
456 WARN_ON(1);
457 return;
458 }
459 spin_unlock(&tfcp_req->reqlock);
460
461 if (unlikely(completed)) {
462 /* remove reference taken in original abort downcall */
463 fcloop_tfcp_req_put(tfcp_req);
464 return;
465 }
355 466
356 inireq->fcpreq->done(inireq->fcpreq); 467 if (tfcp_req->tport->targetport)
468 nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
469 &tfcp_req->tgt_fcp_req);
470
471 spin_lock(&tfcp_req->reqlock);
472 tfcp_req->fcpreq = NULL;
473 spin_unlock(&tfcp_req->reqlock);
474
475 fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
476 /* call_host_done releases reference for abort downcall */
357} 477}
358 478
359/* 479/*
@@ -364,20 +484,15 @@ static void
364fcloop_tgt_fcprqst_done_work(struct work_struct *work) 484fcloop_tgt_fcprqst_done_work(struct work_struct *work)
365{ 485{
366 struct fcloop_fcpreq *tfcp_req = 486 struct fcloop_fcpreq *tfcp_req =
367 container_of(work, struct fcloop_fcpreq, work); 487 container_of(work, struct fcloop_fcpreq, tio_done_work);
368 struct fcloop_tport *tport = tfcp_req->tport;
369 struct nvmefc_fcp_req *fcpreq; 488 struct nvmefc_fcp_req *fcpreq;
370 489
371 spin_lock(&tfcp_req->reqlock); 490 spin_lock(&tfcp_req->reqlock);
372 fcpreq = tfcp_req->fcpreq; 491 fcpreq = tfcp_req->fcpreq;
492 tfcp_req->inistate = INI_IO_COMPLETED;
373 spin_unlock(&tfcp_req->reqlock); 493 spin_unlock(&tfcp_req->reqlock);
374 494
375 if (tport->remoteport && fcpreq) { 495 fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
376 fcpreq->status = tfcp_req->status;
377 fcpreq->done(fcpreq);
378 }
379
380 kfree(tfcp_req);
381} 496}
382 497
383 498
@@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
390 struct fcloop_rport *rport = remoteport->private; 505 struct fcloop_rport *rport = remoteport->private;
391 struct fcloop_ini_fcpreq *inireq = fcpreq->private; 506 struct fcloop_ini_fcpreq *inireq = fcpreq->private;
392 struct fcloop_fcpreq *tfcp_req; 507 struct fcloop_fcpreq *tfcp_req;
393 int ret = 0;
394 508
395 if (!rport->targetport) 509 if (!rport->targetport)
396 return -ECONNREFUSED; 510 return -ECONNREFUSED;
@@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
401 515
402 inireq->fcpreq = fcpreq; 516 inireq->fcpreq = fcpreq;
403 inireq->tfcp_req = tfcp_req; 517 inireq->tfcp_req = tfcp_req;
404 INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); 518 spin_lock_init(&inireq->inilock);
519
405 tfcp_req->fcpreq = fcpreq; 520 tfcp_req->fcpreq = fcpreq;
406 tfcp_req->tport = rport->targetport->private; 521 tfcp_req->tport = rport->targetport->private;
522 tfcp_req->inistate = INI_IO_START;
407 spin_lock_init(&tfcp_req->reqlock); 523 spin_lock_init(&tfcp_req->reqlock);
408 INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); 524 INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
525 INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
526 INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
527 kref_init(&tfcp_req->ref);
409 528
410 ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, 529 schedule_work(&tfcp_req->fcp_rcv_work);
411 fcpreq->cmdaddr, fcpreq->cmdlen);
412 530
413 return ret; 531 return 0;
414} 532}
415 533
416static void 534static void
@@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
589{ 707{
590 struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); 708 struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
591 709
592 schedule_work(&tfcp_req->work); 710 schedule_work(&tfcp_req->tio_done_work);
593} 711}
594 712
595static void 713static void
@@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
605 void *hw_queue_handle, 723 void *hw_queue_handle,
606 struct nvmefc_fcp_req *fcpreq) 724 struct nvmefc_fcp_req *fcpreq)
607{ 725{
608 struct fcloop_rport *rport = remoteport->private;
609 struct fcloop_ini_fcpreq *inireq = fcpreq->private; 726 struct fcloop_ini_fcpreq *inireq = fcpreq->private;
610 struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; 727 struct fcloop_fcpreq *tfcp_req;
728 bool abortio = true;
729
730 spin_lock(&inireq->inilock);
731 tfcp_req = inireq->tfcp_req;
732 if (tfcp_req)
733 fcloop_tfcp_req_get(tfcp_req);
734 spin_unlock(&inireq->inilock);
611 735
612 if (!tfcp_req) 736 if (!tfcp_req)
613 /* abort has already been called */ 737 /* abort has already been called */
614 return; 738 return;
615 739
616 if (rport->targetport)
617 nvmet_fc_rcv_fcp_abort(rport->targetport,
618 &tfcp_req->tgt_fcp_req);
619
620 /* break initiator/target relationship for io */ 740 /* break initiator/target relationship for io */
621 spin_lock(&tfcp_req->reqlock); 741 spin_lock(&tfcp_req->reqlock);
622 inireq->tfcp_req = NULL; 742 switch (tfcp_req->inistate) {
623 tfcp_req->fcpreq = NULL; 743 case INI_IO_START:
744 case INI_IO_ACTIVE:
745 tfcp_req->inistate = INI_IO_ABORTED;
746 break;
747 case INI_IO_COMPLETED:
748 abortio = false;
749 break;
750 default:
751 spin_unlock(&tfcp_req->reqlock);
752 WARN_ON(1);
753 return;
754 }
624 spin_unlock(&tfcp_req->reqlock); 755 spin_unlock(&tfcp_req->reqlock);
625 756
626 /* post the aborted io completion */ 757 if (abortio)
627 fcpreq->status = -ECANCELED; 758 /* leave the reference while the work item is scheduled */
628 schedule_work(&inireq->iniwork); 759 WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
760 else {
761 /*
762 * as the io has already had the done callback made,
763 * nothing more to do. So release the reference taken above
764 */
765 fcloop_tfcp_req_put(tfcp_req);
766 }
629} 767}
630 768
631static void 769static void
@@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
657static void 795static void
658fcloop_localport_delete(struct nvme_fc_local_port *localport) 796fcloop_localport_delete(struct nvme_fc_local_port *localport)
659{ 797{
660 struct fcloop_lport *lport = localport->private; 798 struct fcloop_lport_priv *lport_priv = localport->private;
799 struct fcloop_lport *lport = lport_priv->lport;
661 800
662 /* release any threads waiting for the unreg to complete */ 801 /* release any threads waiting for the unreg to complete */
663 complete(&lport->unreg_done); 802 complete(&lport->unreg_done);
@@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = {
697 .max_dif_sgl_segments = FCLOOP_SGL_SEGS, 836 .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
698 .dma_boundary = FCLOOP_DMABOUND_4G, 837 .dma_boundary = FCLOOP_DMABOUND_4G,
699 /* sizes of additional private data for data structures */ 838 /* sizes of additional private data for data structures */
700 .local_priv_sz = sizeof(struct fcloop_lport), 839 .local_priv_sz = sizeof(struct fcloop_lport_priv),
701 .remote_priv_sz = sizeof(struct fcloop_rport), 840 .remote_priv_sz = sizeof(struct fcloop_rport),
702 .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), 841 .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
703 .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), 842 .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
@@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = {
714 .max_dif_sgl_segments = FCLOOP_SGL_SEGS, 853 .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
715 .dma_boundary = FCLOOP_DMABOUND_4G, 854 .dma_boundary = FCLOOP_DMABOUND_4G,
716 /* optional features */ 855 /* optional features */
717 .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | 856 .target_features = 0,
718 NVMET_FCTGTFEAT_OPDONE_IN_ISR,
719 /* sizes of additional private data for data structures */ 857 /* sizes of additional private data for data structures */
720 .target_priv_sz = sizeof(struct fcloop_tport), 858 .target_priv_sz = sizeof(struct fcloop_tport),
721}; 859};
@@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
728 struct fcloop_ctrl_options *opts; 866 struct fcloop_ctrl_options *opts;
729 struct nvme_fc_local_port *localport; 867 struct nvme_fc_local_port *localport;
730 struct fcloop_lport *lport; 868 struct fcloop_lport *lport;
731 int ret; 869 struct fcloop_lport_priv *lport_priv;
870 unsigned long flags;
871 int ret = -ENOMEM;
872
873 lport = kzalloc(sizeof(*lport), GFP_KERNEL);
874 if (!lport)
875 return -ENOMEM;
732 876
733 opts = kzalloc(sizeof(*opts), GFP_KERNEL); 877 opts = kzalloc(sizeof(*opts), GFP_KERNEL);
734 if (!opts) 878 if (!opts)
735 return -ENOMEM; 879 goto out_free_lport;
736 880
737 ret = fcloop_parse_options(opts, buf); 881 ret = fcloop_parse_options(opts, buf);
738 if (ret) 882 if (ret)
@@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
752 896
753 ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); 897 ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
754 if (!ret) { 898 if (!ret) {
755 unsigned long flags;
756
757 /* success */ 899 /* success */
758 lport = localport->private; 900 lport_priv = localport->private;
901 lport_priv->lport = lport;
902
759 lport->localport = localport; 903 lport->localport = localport;
760 INIT_LIST_HEAD(&lport->lport_list); 904 INIT_LIST_HEAD(&lport->lport_list);
761 905
762 spin_lock_irqsave(&fcloop_lock, flags); 906 spin_lock_irqsave(&fcloop_lock, flags);
763 list_add_tail(&lport->lport_list, &fcloop_lports); 907 list_add_tail(&lport->lport_list, &fcloop_lports);
764 spin_unlock_irqrestore(&fcloop_lock, flags); 908 spin_unlock_irqrestore(&fcloop_lock, flags);
765
766 /* mark all of the input buffer consumed */
767 ret = count;
768 } 909 }
769 910
770out_free_opts: 911out_free_opts:
771 kfree(opts); 912 kfree(opts);
913out_free_lport:
914 /* free only if we're going to fail */
915 if (ret)
916 kfree(lport);
917
772 return ret ? ret : count; 918 return ret ? ret : count;
773} 919}
774 920
@@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
790 936
791 wait_for_completion(&lport->unreg_done); 937 wait_for_completion(&lport->unreg_done);
792 938
939 kfree(lport);
940
793 return ret; 941 return ret;
794} 942}
795 943
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 1e21b286f299..7991ec3a17db 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
686 686
687static struct nvmf_transport_ops nvme_loop_transport = { 687static struct nvmf_transport_ops nvme_loop_transport = {
688 .name = "loop", 688 .name = "loop",
689 .module = THIS_MODULE,
689 .create_ctrl = nvme_loop_create_ctrl, 690 .create_ctrl = nvme_loop_create_ctrl,
690}; 691};
691 692
@@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void)
716 nvme_delete_ctrl(&ctrl->ctrl); 717 nvme_delete_ctrl(&ctrl->ctrl);
717 mutex_unlock(&nvme_loop_ctrl_mutex); 718 mutex_unlock(&nvme_loop_ctrl_mutex);
718 719
719 flush_workqueue(nvme_wq); 720 flush_workqueue(nvme_delete_wq);
720} 721}
721 722
722module_init(nvme_loop_init_module); 723module_init(nvme_loop_init_module);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 49912909c298..978e169c11bf 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
185 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); 185 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
186} 186}
187 187
188static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
189{
190 struct scatterlist *sg;
191 int count;
192
193 if (!sgl || !nents)
194 return;
195
196 for_each_sg(sgl, sg, nents, count)
197 __free_page(sg_page(sg));
198 kfree(sgl);
199}
200
201static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
202 u32 length)
203{
204 struct scatterlist *sg;
205 struct page *page;
206 unsigned int nent;
207 int i = 0;
208
209 nent = DIV_ROUND_UP(length, PAGE_SIZE);
210 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
211 if (!sg)
212 goto out;
213
214 sg_init_table(sg, nent);
215
216 while (length) {
217 u32 page_len = min_t(u32, length, PAGE_SIZE);
218
219 page = alloc_page(GFP_KERNEL);
220 if (!page)
221 goto out_free_pages;
222
223 sg_set_page(&sg[i], page, page_len, 0);
224 length -= page_len;
225 i++;
226 }
227 *sgl = sg;
228 *nents = nent;
229 return 0;
230
231out_free_pages:
232 while (i > 0) {
233 i--;
234 __free_page(sg_page(&sg[i]));
235 }
236 kfree(sg);
237out:
238 return NVME_SC_INTERNAL;
239}
240
241static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, 188static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
242 struct nvmet_rdma_cmd *c, bool admin) 189 struct nvmet_rdma_cmd *c, bool admin)
243{ 190{
@@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
484 } 431 }
485 432
486 if (rsp->req.sg != &rsp->cmd->inline_sg) 433 if (rsp->req.sg != &rsp->cmd->inline_sg)
487 nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); 434 sgl_free(rsp->req.sg);
488 435
489 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) 436 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
490 nvmet_rdma_process_wr_wait_list(queue); 437 nvmet_rdma_process_wr_wait_list(queue);
@@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
621 u32 len = get_unaligned_le24(sgl->length); 568 u32 len = get_unaligned_le24(sgl->length);
622 u32 key = get_unaligned_le32(sgl->key); 569 u32 key = get_unaligned_le32(sgl->key);
623 int ret; 570 int ret;
624 u16 status;
625 571
626 /* no data command? */ 572 /* no data command? */
627 if (!len) 573 if (!len)
628 return 0; 574 return 0;
629 575
630 status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, 576 rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
631 len); 577 if (!rsp->req.sg)
632 if (status) 578 return NVME_SC_INTERNAL;
633 return status;
634 579
635 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, 580 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
636 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, 581 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
@@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
976 921
977static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) 922static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
978{ 923{
979 pr_info("freeing queue %d\n", queue->idx); 924 pr_debug("freeing queue %d\n", queue->idx);
980 925
981 nvmet_sq_destroy(&queue->nvme_sq); 926 nvmet_sq_destroy(&queue->nvme_sq);
982 927
@@ -1558,25 +1503,9 @@ err_ib_client:
1558 1503
1559static void __exit nvmet_rdma_exit(void) 1504static void __exit nvmet_rdma_exit(void)
1560{ 1505{
1561 struct nvmet_rdma_queue *queue;
1562
1563 nvmet_unregister_transport(&nvmet_rdma_ops); 1506 nvmet_unregister_transport(&nvmet_rdma_ops);
1564
1565 flush_scheduled_work();
1566
1567 mutex_lock(&nvmet_rdma_queue_mutex);
1568 while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
1569 struct nvmet_rdma_queue, queue_list))) {
1570 list_del_init(&queue->queue_list);
1571
1572 mutex_unlock(&nvmet_rdma_queue_mutex);
1573 __nvmet_rdma_queue_disconnect(queue);
1574 mutex_lock(&nvmet_rdma_queue_mutex);
1575 }
1576 mutex_unlock(&nvmet_rdma_queue_mutex);
1577
1578 flush_scheduled_work();
1579 ib_unregister_client(&nvmet_rdma_ib_client); 1507 ib_unregister_client(&nvmet_rdma_ib_client);
1508 WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1580 ida_destroy(&nvmet_rdma_queue_ida); 1509 ida_destroy(&nvmet_rdma_queue_ida);
1581} 1510}
1582 1511
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
index e2bc99980f75..4c44d7bed01a 100644
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -5,6 +5,7 @@ menuconfig TARGET_CORE
5 select CONFIGFS_FS 5 select CONFIGFS_FS
6 select CRC_T10DIF 6 select CRC_T10DIF
7 select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. 7 select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
8 select SGL_ALLOC
8 default n 9 default n
9 help 10 help
10 Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled 11 Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 58caacd54a3b..c03a78ee26cd 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2300,13 +2300,7 @@ queue_full:
2300 2300
2301void target_free_sgl(struct scatterlist *sgl, int nents) 2301void target_free_sgl(struct scatterlist *sgl, int nents)
2302{ 2302{
2303 struct scatterlist *sg; 2303 sgl_free_n_order(sgl, nents, 0);
2304 int count;
2305
2306 for_each_sg(sgl, sg, nents, count)
2307 __free_page(sg_page(sg));
2308
2309 kfree(sgl);
2310} 2304}
2311EXPORT_SYMBOL(target_free_sgl); 2305EXPORT_SYMBOL(target_free_sgl);
2312 2306
@@ -2414,42 +2408,10 @@ int
2414target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, 2408target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
2415 bool zero_page, bool chainable) 2409 bool zero_page, bool chainable)
2416{ 2410{
2417 struct scatterlist *sg; 2411 gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0);
2418 struct page *page;
2419 gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
2420 unsigned int nalloc, nent;
2421 int i = 0;
2422
2423 nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE);
2424 if (chainable)
2425 nalloc++;
2426 sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
2427 if (!sg)
2428 return -ENOMEM;
2429 2412
2430 sg_init_table(sg, nalloc); 2413 *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents);
2431 2414 return *sgl ? 0 : -ENOMEM;
2432 while (length) {
2433 u32 page_len = min_t(u32, length, PAGE_SIZE);
2434 page = alloc_page(GFP_KERNEL | zero_flag);
2435 if (!page)
2436 goto out;
2437
2438 sg_set_page(&sg[i], page, page_len, 0);
2439 length -= page_len;
2440 i++;
2441 }
2442 *sgl = sg;
2443 *nents = nent;
2444 return 0;
2445
2446out:
2447 while (i > 0) {
2448 i--;
2449 __free_page(sg_page(&sg[i]));
2450 }
2451 kfree(sg);
2452 return -ENOMEM;
2453} 2415}
2454EXPORT_SYMBOL(target_alloc_sgl); 2416EXPORT_SYMBOL(target_alloc_sgl);
2455 2417
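Editor's note: target_alloc_sgl()/target_free_sgl() now wrap sgl_alloc_order() and sgl_free_n_order(), which additionally take an allocation order and, on the alloc side, a chainable flag that reserves a slot for sg chaining. A small sketch of the same call pattern outside the target code; the example_* wrappers are hypothetical.

#include <linux/scatterlist.h>

/* Sketch only: a zeroed, chainable SGL built from order-0 (single) pages. */
static struct scatterlist *example_alloc(u32 len, unsigned int *nents)
{
	return sgl_alloc_order(len, 0, true, GFP_KERNEL | __GFP_ZERO, nents);
}

static void example_free(struct scatterlist *sg, int nents)
{
	/* frees up to @nents entries worth of order-0 pages, then the table */
	sgl_free_n_order(sg, nents, 0);
}
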
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5982c8a71f02..75610d23d197 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
411 411
412static u64 bio_end_offset(struct bio *bio) 412static u64 bio_end_offset(struct bio *bio)
413{ 413{
414 struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; 414 struct bio_vec *last = bio_last_bvec_all(bio);
415 415
416 return page_offset(last->bv_page) + last->bv_len + last->bv_offset; 416 return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
417} 417}
@@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
563 /* we need the actual starting offset of this extent in the file */ 563 /* we need the actual starting offset of this extent in the file */
564 read_lock(&em_tree->lock); 564 read_lock(&em_tree->lock);
565 em = lookup_extent_mapping(em_tree, 565 em = lookup_extent_mapping(em_tree,
566 page_offset(bio->bi_io_vec->bv_page), 566 page_offset(bio_first_page_all(bio)),
567 PAGE_SIZE); 567 PAGE_SIZE);
568 read_unlock(&em_tree->lock); 568 read_unlock(&em_tree->lock);
569 if (!em) 569 if (!em)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 012d63870b99..d43360b33ef6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2257,7 +2257,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2257 return 0; 2257 return 0;
2258} 2258}
2259 2259
2260bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2260bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2261 struct io_failure_record *failrec, int failed_mirror) 2261 struct io_failure_record *failrec, int failed_mirror)
2262{ 2262{
2263 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2263 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2281,7 +2281,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
2281 * a) deliver good data to the caller 2281 * a) deliver good data to the caller
2282 * b) correct the bad sectors on disk 2282 * b) correct the bad sectors on disk
2283 */ 2283 */
2284 if (failed_bio->bi_vcnt > 1) { 2284 if (failed_bio_pages > 1) {
2285 /* 2285 /*
2286 * to fulfill b), we need to know the exact failing sectors, as 2286 * to fulfill b), we need to know the exact failing sectors, as
2287 * we don't want to rewrite any more than the failed ones. thus, 2287 * we don't want to rewrite any more than the failed ones. thus,
@@ -2374,6 +2374,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2374 int read_mode = 0; 2374 int read_mode = 0;
2375 blk_status_t status; 2375 blk_status_t status;
2376 int ret; 2376 int ret;
2377 unsigned failed_bio_pages = bio_pages_all(failed_bio);
2377 2378
2378 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2379 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2379 2380
@@ -2381,13 +2382,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2381 if (ret) 2382 if (ret)
2382 return ret; 2383 return ret;
2383 2384
2384 if (!btrfs_check_repairable(inode, failed_bio, failrec, 2385 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
2385 failed_mirror)) { 2386 failed_mirror)) {
2386 free_io_failure(failure_tree, tree, failrec); 2387 free_io_failure(failure_tree, tree, failrec);
2387 return -EIO; 2388 return -EIO;
2388 } 2389 }
2389 2390
2390 if (failed_bio->bi_vcnt > 1) 2391 if (failed_bio_pages > 1)
2391 read_mode |= REQ_FAILFAST_DEV; 2392 read_mode |= REQ_FAILFAST_DEV;
2392 2393
2393 phy_offset >>= inode->i_sb->s_blocksize_bits; 2394 phy_offset >>= inode->i_sb->s_blocksize_bits;
@@ -2724,7 +2725,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2724 unsigned long bio_flags) 2725 unsigned long bio_flags)
2725{ 2726{
2726 blk_status_t ret = 0; 2727 blk_status_t ret = 0;
2727 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2728 struct bio_vec *bvec = bio_last_bvec_all(bio);
2728 struct page *page = bvec->bv_page; 2729 struct page *page = bvec->bv_page;
2729 struct extent_io_tree *tree = bio->bi_private; 2730 struct extent_io_tree *tree = bio->bi_private;
2730 u64 start; 2731 u64 start;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 93dcae0c3183..20854d63c75b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -540,7 +540,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
540 u64 end); 540 u64 end);
541int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 541int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
542 struct io_failure_record **failrec_ret); 542 struct io_failure_record **failrec_ret);
543bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 543bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
544 struct io_failure_record *failrec, int fail_mirror); 544 struct io_failure_record *failrec, int fail_mirror);
545struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 545struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
546 struct io_failure_record *failrec, 546 struct io_failure_record *failrec,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1a7f3cb5be9..cb1e2d201434 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8015,6 +8015,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
8015 int segs; 8015 int segs;
8016 int ret; 8016 int ret;
8017 blk_status_t status; 8017 blk_status_t status;
8018 struct bio_vec bvec;
8018 8019
8019 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 8020 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
8020 8021
@@ -8030,8 +8031,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
8030 } 8031 }
8031 8032
8032 segs = bio_segments(failed_bio); 8033 segs = bio_segments(failed_bio);
8034 bio_get_first_bvec(failed_bio, &bvec);
8033 if (segs > 1 || 8035 if (segs > 1 ||
8034 (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) 8036 (bvec.bv_len > btrfs_inode_sectorsize(inode)))
8035 read_mode |= REQ_FAILFAST_DEV; 8037 read_mode |= REQ_FAILFAST_DEV;
8036 8038
8037 isector = start - btrfs_io_bio(failed_bio)->logical; 8039 isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -8074,7 +8076,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
8074 ASSERT(bio->bi_vcnt == 1); 8076 ASSERT(bio->bi_vcnt == 1);
8075 io_tree = &BTRFS_I(inode)->io_tree; 8077 io_tree = &BTRFS_I(inode)->io_tree;
8076 failure_tree = &BTRFS_I(inode)->io_failure_tree; 8078 failure_tree = &BTRFS_I(inode)->io_failure_tree;
8077 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); 8079 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
8078 8080
8079 done->uptodate = 1; 8081 done->uptodate = 1;
8080 ASSERT(!bio_flagged(bio, BIO_CLONED)); 8082 ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -8164,7 +8166,7 @@ static void btrfs_retry_endio(struct bio *bio)
8164 uptodate = 1; 8166 uptodate = 1;
8165 8167
8166 ASSERT(bio->bi_vcnt == 1); 8168 ASSERT(bio->bi_vcnt == 1);
8167 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); 8169 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
8168 8170
8169 io_tree = &BTRFS_I(inode)->io_tree; 8171 io_tree = &BTRFS_I(inode)->io_tree;
8170 failure_tree = &BTRFS_I(inode)->io_failure_tree; 8172 failure_tree = &BTRFS_I(inode)->io_failure_tree;
diff --git a/fs/buffer.c b/fs/buffer.c
index 0736a6a2e2f0..8b26295a56fe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3014,7 +3014,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
3014void guard_bio_eod(int op, struct bio *bio) 3014void guard_bio_eod(int op, struct bio *bio)
3015{ 3015{
3016 sector_t maxsector; 3016 sector_t maxsector;
3017 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; 3017 struct bio_vec *bvec = bio_last_bvec_all(bio);
3018 unsigned truncated_bytes; 3018 unsigned truncated_bytes;
3019 struct hd_struct *part; 3019 struct hd_struct *part;
3020 3020
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 516fa0d3ff9c..455f086cce3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio)
56 int i; 56 int i;
57 57
58#ifdef CONFIG_F2FS_FAULT_INJECTION 58#ifdef CONFIG_F2FS_FAULT_INJECTION
59 if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { 59 if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
60 f2fs_show_injection_info(FAULT_IO); 60 f2fs_show_injection_info(FAULT_IO);
61 bio->bi_status = BLK_STS_IOERR; 61 bio->bi_status = BLK_STS_IOERR;
62 } 62 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cea4836385b7..d4d04fee568a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
126 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list 126 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
127 * @inode: inode to be moved 127 * @inode: inode to be moved
128 * @wb: target bdi_writeback 128 * @wb: target bdi_writeback
129 * @head: one of @wb->b_{dirty|io|more_io} 129 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
130 * 130 *
131 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. 131 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
132 * Returns %true if @inode is the first occupant of the !dirty_time IO 132 * Returns %true if @inode is the first occupant of the !dirty_time IO
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 23d29b39f71e..d0eb659fa733 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
300 bv->bv_len = iter.bi_bvec_done; 300 bv->bv_len = iter.bi_bvec_done;
301} 301}
302 302
303static inline unsigned bio_pages_all(struct bio *bio)
304{
305 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
306 return bio->bi_vcnt;
307}
308
309static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
310{
311 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
312 return bio->bi_io_vec;
313}
314
315static inline struct page *bio_first_page_all(struct bio *bio)
316{
317 return bio_first_bvec_all(bio)->bv_page;
318}
319
320static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
321{
322 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
323 return &bio->bi_io_vec[bio->bi_vcnt - 1];
324}
325
303enum bip_flags { 326enum bip_flags {
304 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ 327 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
305 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ 328 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
@@ -477,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
477#endif 500#endif
478 501
479extern void bio_copy_data(struct bio *dst, struct bio *src); 502extern void bio_copy_data(struct bio *dst, struct bio *src);
480extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
481extern void bio_free_pages(struct bio *bio); 503extern void bio_free_pages(struct bio *bio);
482 504
483extern struct bio *bio_copy_user_iov(struct request_queue *, 505extern struct bio *bio_copy_user_iov(struct request_queue *,
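Editor's note: these accessors back the conversions in the btrfs, f2fs and fs/buffer.c hunks above, and they WARN if used on a cloned bio, where bi_io_vec/bi_vcnt no longer describe the data still to be transferred. A sketch of the old-to-new mapping; the example function itself is hypothetical, with the old open-coded accesses shown in the trailing comments.

#include <linux/bio.h>

/* Sketch only: check for a bio carrying exactly one sector-sized page. */
static bool example_single_sector_bio(struct bio *bio, unsigned int sectorsize)
{
	struct page *first = bio_first_page_all(bio);	/* bio->bi_io_vec->bv_page */
	struct bio_vec *last = bio_last_bvec_all(bio);	/* &bio->bi_io_vec[bio->bi_vcnt - 1] */

	return bio_pages_all(bio) == 1 &&		/* bio->bi_vcnt */
	       last->bv_len == sectorsize &&
	       first == last->bv_page;
}
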
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e9825ff57b15..69bea82ebeb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
660static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, 660static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
661 struct blkg_rwstat *from) 661 struct blkg_rwstat *from)
662{ 662{
663 struct blkg_rwstat v = blkg_rwstat_read(from); 663 u64 sum[BLKG_RWSTAT_NR];
664 int i; 664 int i;
665 665
666 for (i = 0; i < BLKG_RWSTAT_NR; i++) 666 for (i = 0; i < BLKG_RWSTAT_NR; i++)
667 atomic64_add(atomic64_read(&v.aux_cnt[i]) + 667 sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
668 atomic64_read(&from->aux_cnt[i]), 668
669 for (i = 0; i < BLKG_RWSTAT_NR; i++)
670 atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
669 &to->aux_cnt[i]); 671 &to->aux_cnt[i]);
670} 672}
671 673
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c9a5c862e2..8efcf49796a3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
51 unsigned int queue_num; 51 unsigned int queue_num;
52 52
53 atomic_t nr_active; 53 atomic_t nr_active;
54 unsigned int nr_expired;
54 55
55 struct hlist_node cpuhp_dead; 56 struct hlist_node cpuhp_dead;
56 struct kobject kobj; 57 struct kobject kobj;
@@ -65,7 +66,7 @@ struct blk_mq_hw_ctx {
65#endif 66#endif
66 67
67 /* Must be the last member - see also blk_mq_hw_ctx_size(). */ 68 /* Must be the last member - see also blk_mq_hw_ctx_size(). */
68 struct srcu_struct queue_rq_srcu[0]; 69 struct srcu_struct srcu[0];
69}; 70};
70 71
71struct blk_mq_tag_set { 72struct blk_mq_tag_set {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e7d8bd776d2..c5d3db0d83f8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t;
39 39
40#define BLK_STS_AGAIN ((__force blk_status_t)12) 40#define BLK_STS_AGAIN ((__force blk_status_t)12)
41 41
42/**
43 * blk_path_error - returns true if error may be path related
44 * @error: status the request was completed with
45 *
46 * Description:
47 * This classifies block error status into non-retryable errors and ones
48 * that may be successful if retried on a failover path.
49 *
50 * Return:
51 * %false - retrying failover path will not help
52 * %true - may succeed if retried
53 */
54static inline bool blk_path_error(blk_status_t error)
55{
56 switch (error) {
57 case BLK_STS_NOTSUPP:
58 case BLK_STS_NOSPC:
59 case BLK_STS_TARGET:
60 case BLK_STS_NEXUS:
61 case BLK_STS_MEDIUM:
62 case BLK_STS_PROTECTION:
63 return false;
64 }
65
66 /* Anything else could be a path failure, so should be retried */
67 return true;
68}
69
42struct blk_issue_stat { 70struct blk_issue_stat {
43 u64 stat; 71 u64 stat;
44}; 72};
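Editor's note: blk_path_error() gives stacked multipath consumers a single place to classify a completion status. A sketch of how a failover decision might consult it; the surrounding function is hypothetical.

#include <linux/blk_types.h>

/* Sketch only: decide whether retrying on another path can possibly help. */
static bool example_should_failover(blk_status_t error)
{
	if (!error)
		return false;		/* request completed fine */
	if (!blk_path_error(error))
		return false;		/* e.g. BLK_STS_MEDIUM: no path will help */
	return true;			/* path-related; worth a retry elsewhere */
}
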
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0ce8a372d506..4f3df807cf8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
27#include <linux/percpu-refcount.h> 27#include <linux/percpu-refcount.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/blkzoned.h> 29#include <linux/blkzoned.h>
30#include <linux/seqlock.h>
31#include <linux/u64_stats_sync.h>
30 32
31struct module; 33struct module;
32struct scsi_ioctl_command; 34struct scsi_ioctl_command;
@@ -121,6 +123,12 @@ typedef __u32 __bitwise req_flags_t;
121/* Look at ->special_vec for the actual data payload instead of the 123/* Look at ->special_vec for the actual data payload instead of the
122 bio chain. */ 124 bio chain. */
123#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) 125#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
126/* The per-zone write lock is held for this request */
127#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
128/* timeout is expired */
129#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20))
130/* already slept for hybrid poll */
131#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21))
124 132
125/* flags that prevent us from merging requests: */ 133/* flags that prevent us from merging requests: */
126#define RQF_NOMERGE_FLAGS \ 134#define RQF_NOMERGE_FLAGS \
@@ -133,12 +141,6 @@ typedef __u32 __bitwise req_flags_t;
133 * especially blk_mq_rq_ctx_init() to take care of the added fields. 141 * especially blk_mq_rq_ctx_init() to take care of the added fields.
134 */ 142 */
135struct request { 143struct request {
136 struct list_head queuelist;
137 union {
138 struct __call_single_data csd;
139 u64 fifo_time;
140 };
141
142 struct request_queue *q; 144 struct request_queue *q;
143 struct blk_mq_ctx *mq_ctx; 145 struct blk_mq_ctx *mq_ctx;
144 146
@@ -148,8 +150,6 @@ struct request {
148 150
149 int internal_tag; 151 int internal_tag;
150 152
151 unsigned long atomic_flags;
152
153 /* the following two fields are internal, NEVER access directly */ 153 /* the following two fields are internal, NEVER access directly */
154 unsigned int __data_len; /* total data len */ 154 unsigned int __data_len; /* total data len */
155 int tag; 155 int tag;
@@ -158,6 +158,8 @@ struct request {
158 struct bio *bio; 158 struct bio *bio;
159 struct bio *biotail; 159 struct bio *biotail;
160 160
161 struct list_head queuelist;
162
161 /* 163 /*
162 * The hash is used inside the scheduler, and killed once the 164 * The hash is used inside the scheduler, and killed once the
163 * request reaches the dispatch list. The ipi_list is only used 165 * request reaches the dispatch list. The ipi_list is only used
@@ -205,19 +207,16 @@ struct request {
205 struct hd_struct *part; 207 struct hd_struct *part;
206 unsigned long start_time; 208 unsigned long start_time;
207 struct blk_issue_stat issue_stat; 209 struct blk_issue_stat issue_stat;
208#ifdef CONFIG_BLK_CGROUP
209 struct request_list *rl; /* rl this rq is alloced from */
210 unsigned long long start_time_ns;
211 unsigned long long io_start_time_ns; /* when passed to hardware */
212#endif
213 /* Number of scatter-gather DMA addr+len pairs after 210 /* Number of scatter-gather DMA addr+len pairs after
214 * physical address coalescing is performed. 211 * physical address coalescing is performed.
215 */ 212 */
216 unsigned short nr_phys_segments; 213 unsigned short nr_phys_segments;
214
217#if defined(CONFIG_BLK_DEV_INTEGRITY) 215#if defined(CONFIG_BLK_DEV_INTEGRITY)
218 unsigned short nr_integrity_segments; 216 unsigned short nr_integrity_segments;
219#endif 217#endif
220 218
219 unsigned short write_hint;
221 unsigned short ioprio; 220 unsigned short ioprio;
222 221
223 unsigned int timeout; 222 unsigned int timeout;
@@ -226,11 +225,37 @@ struct request {
226 225
227 unsigned int extra_len; /* length of alignment and padding */ 226 unsigned int extra_len; /* length of alignment and padding */
228 227
229 unsigned short write_hint; 228 /*
229 * On blk-mq, the lower bits of ->gstate (generation number and
230 * state) carry the MQ_RQ_* state value and the upper bits the
231 * generation number which is monotonically incremented and used to
232 * distinguish the reuse instances.
233 *
234 * ->gstate_seq allows updates to ->gstate and other fields
235 * (currently ->deadline) during request start to be read
236 * atomically from the timeout path, so that it can operate on a
237 * coherent set of information.
238 */
239 seqcount_t gstate_seq;
240 u64 gstate;
241
242 /*
243 * ->aborted_gstate is used by the timeout to claim a specific
244 * recycle instance of this request. See blk_mq_timeout_work().
245 */
246 struct u64_stats_sync aborted_gstate_sync;
247 u64 aborted_gstate;
248
249 /* access through blk_rq_set_deadline, blk_rq_deadline */
250 unsigned long __deadline;
230 251
231 unsigned long deadline;
232 struct list_head timeout_list; 252 struct list_head timeout_list;
233 253
254 union {
255 struct __call_single_data csd;
256 u64 fifo_time;
257 };
258
234 /* 259 /*
235 * completion callback. 260 * completion callback.
236 */ 261 */
@@ -239,6 +264,12 @@ struct request {
239 264
240 /* for bidi */ 265 /* for bidi */
241 struct request *next_rq; 266 struct request *next_rq;
267
268#ifdef CONFIG_BLK_CGROUP
269 struct request_list *rl; /* rl this rq is alloced from */
270 unsigned long long start_time_ns;
271 unsigned long long io_start_time_ns; /* when passed to hardware */
272#endif
242}; 273};
243 274
244static inline bool blk_op_is_scsi(unsigned int op) 275static inline bool blk_op_is_scsi(unsigned int op)
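The ->gstate comment above describes a single u64 carrying the blk-mq request state in its low bits and a monotonically increasing generation number in the remaining bits, so the timeout path can tell request reuse apart from a genuine timeout. Purely as an illustration of that encoding (the MY_* names are hypothetical; the real constants are block-layer internals and should not be open-coded by drivers):

#define MY_RQ_STATE_BITS	2
#define MY_RQ_STATE_MASK	((1ULL << MY_RQ_STATE_BITS) - 1)

static inline unsigned int my_rq_state(u64 gstate)
{
	return gstate & MY_RQ_STATE_MASK;	/* one of the MQ_RQ_* values */
}

static inline u64 my_rq_generation(u64 gstate)
{
	return gstate >> MY_RQ_STATE_BITS;	/* bumped on every reuse */
}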
@@ -564,6 +595,22 @@ struct request_queue {
564 struct queue_limits limits; 595 struct queue_limits limits;
565 596
566 /* 597 /*
598 * Zoned block device information for request dispatch control.
599 * nr_zones is the total number of zones of the device. This is always
600 * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
601 * bits which indicates if a zone is conventional (bit clear) or
602 * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
603 * bits which indicates if a zone is write locked, that is, if a write
604 * request targeting the zone was dispatched. All three fields are
605 * initialized by the low level device driver (e.g. scsi/sd.c).
606 * Stacking drivers (device mappers) may or may not initialize
607 * these fields.
608 */
609 unsigned int nr_zones;
610 unsigned long *seq_zones_bitmap;
611 unsigned long *seq_zones_wlock;
612
613 /*
567 * sg stuff 614 * sg stuff
568 */ 615 */
569 unsigned int sg_timeout; 616 unsigned int sg_timeout;
@@ -807,6 +854,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
807 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; 854 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
808} 855}
809 856
857static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
858{
859 return q->nr_zones;
860}
861
862static inline unsigned int blk_queue_zone_no(struct request_queue *q,
863 sector_t sector)
864{
865 if (!blk_queue_is_zoned(q))
866 return 0;
867 return sector >> ilog2(q->limits.chunk_sectors);
868}
869
870static inline bool blk_queue_zone_is_seq(struct request_queue *q,
871 sector_t sector)
872{
873 if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
874 return false;
875 return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
876}
877
810static inline bool rq_is_sync(struct request *rq) 878static inline bool rq_is_sync(struct request *rq)
811{ 879{
812 return op_is_sync(rq->cmd_flags); 880 return op_is_sync(rq->cmd_flags);
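With the new request_queue zone fields, drivers and schedulers can map a sector to its zone and check the zone model without touching the bitmaps directly. A small sketch (my_sector_needs_serial_write() is a hypothetical helper; the blk_queue_* calls are the ones added above):

static bool my_sector_needs_serial_write(struct request_queue *q,
					 sector_t sector)
{
	if (!blk_queue_is_zoned(q))
		return false;	/* regular device, nr_zones is 0 */

	pr_debug("sector %llu falls in zone %u of %u\n",
		 (unsigned long long)sector,
		 blk_queue_zone_no(q, sector), blk_queue_nr_zones(q));

	/* only sequential-write zones need write serialization */
	return blk_queue_zone_is_seq(q, sector);
}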
@@ -1046,6 +1114,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
1046 return blk_rq_cur_bytes(rq) >> 9; 1114 return blk_rq_cur_bytes(rq) >> 9;
1047} 1115}
1048 1116
1117static inline unsigned int blk_rq_zone_no(struct request *rq)
1118{
1119 return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
1120}
1121
1122static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
1123{
1124 return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
1125}
1126
1049/* 1127/*
1050 * Some commands like WRITE SAME have a payload or data transfer size which 1128 * Some commands like WRITE SAME have a payload or data transfer size which
1051 * is different from the size of the request. Any driver that supports such 1129 * is different from the size of the request. Any driver that supports such
@@ -1595,7 +1673,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
1595 1673
1596 if (q) 1674 if (q)
1597 return blk_queue_zone_sectors(q); 1675 return blk_queue_zone_sectors(q);
1676 return 0;
1677}
1678
1679static inline unsigned int bdev_nr_zones(struct block_device *bdev)
1680{
1681 struct request_queue *q = bdev_get_queue(bdev);
1598 1682
1683 if (q)
1684 return blk_queue_nr_zones(q);
1599 return 0; 1685 return 0;
1600} 1686}
1601 1687
@@ -1731,8 +1817,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
1731 1817
1732int kblockd_schedule_work(struct work_struct *work); 1818int kblockd_schedule_work(struct work_struct *work);
1733int kblockd_schedule_work_on(int cpu, struct work_struct *work); 1819int kblockd_schedule_work_on(int cpu, struct work_struct *work);
1734int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
1735int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1736int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); 1820int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1737 1821
1738#ifdef CONFIG_BLK_CGROUP 1822#ifdef CONFIG_BLK_CGROUP
@@ -1971,6 +2055,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1971extern int bdev_read_page(struct block_device *, sector_t, struct page *); 2055extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1972extern int bdev_write_page(struct block_device *, sector_t, struct page *, 2056extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1973 struct writeback_control *); 2057 struct writeback_control *);
2058
2059#ifdef CONFIG_BLK_DEV_ZONED
2060bool blk_req_needs_zone_write_lock(struct request *rq);
2061void __blk_req_zone_write_lock(struct request *rq);
2062void __blk_req_zone_write_unlock(struct request *rq);
2063
2064static inline void blk_req_zone_write_lock(struct request *rq)
2065{
2066 if (blk_req_needs_zone_write_lock(rq))
2067 __blk_req_zone_write_lock(rq);
2068}
2069
2070static inline void blk_req_zone_write_unlock(struct request *rq)
2071{
2072 if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
2073 __blk_req_zone_write_unlock(rq);
2074}
2075
2076static inline bool blk_req_zone_is_write_locked(struct request *rq)
2077{
2078 return rq->q->seq_zones_wlock &&
2079 test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
2080}
2081
2082static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
2083{
2084 if (!blk_req_needs_zone_write_lock(rq))
2085 return true;
2086 return !blk_req_zone_is_write_locked(rq);
2087}
2088#else
2089static inline bool blk_req_needs_zone_write_lock(struct request *rq)
2090{
2091 return false;
2092}
2093
2094static inline void blk_req_zone_write_lock(struct request *rq)
2095{
2096}
2097
2098static inline void blk_req_zone_write_unlock(struct request *rq)
2099{
2100}
2101static inline bool blk_req_zone_is_write_locked(struct request *rq)
2102{
2103 return false;
2104}
2105
2106static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
2107{
2108 return true;
2109}
2110#endif /* CONFIG_BLK_DEV_ZONED */
2111
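Taken together, these helpers let an I/O scheduler serialize writes per sequential zone, which is what the deadline/mq-deadline SMR support in this series does. A sketch of the dispatch-side usage, with my_* names standing in for the scheduler's own hooks:

/* Called when considering @rq for dispatch. */
static struct request *my_try_dispatch_write(struct request *rq)
{
	if (!blk_req_can_dispatch_to_zone(rq))
		return NULL;		/* zone already write-locked, pick another rq */

	blk_req_zone_write_lock(rq);	/* no-op unless the target zone is sequential */
	return rq;
}

/* Called on completion or requeue of a dispatched write. */
static void my_write_done(struct request *rq)
{
	blk_req_zone_write_unlock(rq);	/* no-op unless RQF_ZONE_WRITE_LOCKED is set */
}

With CONFIG_BLK_DEV_ZONED disabled, the stubs above make both calls free, so the scheduler code needs no #ifdefs.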
1974#else /* CONFIG_BLOCK */ 2112#else /* CONFIG_BLOCK */
1975 2113
1976struct block_device; 2114struct block_device;
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ec8a4d7af6bd..fe7a22dd133b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv,
125 ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ 125 ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
126 bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) 126 bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
127 127
128/* for iterating one bio from start to end */
129#define BVEC_ITER_ALL_INIT (struct bvec_iter) \
130{ \
131 .bi_sector = 0, \
132 .bi_size = UINT_MAX, \
133 .bi_idx = 0, \
134 .bi_bvec_done = 0, \
135}
136
128#endif /* __LINUX_BVEC_ITER_H */ 137#endif /* __LINUX_BVEC_ITER_H */
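BVEC_ITER_ALL_INIT produces an iterator that starts at the beginning of a bvec table with an effectively unbounded size, so a bio can be walked from its first segment even after bi_iter has been advanced. A sketch, where the explicit bi_vcnt bound is this example's own guard against the unbounded bi_size and my_dump_bio_segments() is hypothetical:

static void my_dump_bio_segments(struct bio *bio)
{
	struct bvec_iter iter = BVEC_ITER_ALL_INIT;
	struct bio_vec bv;
	unsigned int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		bv = bvec_iter_bvec(bio->bi_io_vec, iter);
		pr_debug("bvec %u: offset %u len %u\n", i, bv.bv_offset, bv.bv_len);
		bvec_iter_advance(bio->bi_io_vec, &iter, bv.bv_len);
	}
}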
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 3d794b3dc532..6d9e230dffd2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
198extern void elv_requeue_request(struct request_queue *, struct request *); 198extern void elv_requeue_request(struct request_queue *, struct request *);
199extern struct request *elv_former_request(struct request_queue *, struct request *); 199extern struct request *elv_former_request(struct request_queue *, struct request *);
200extern struct request *elv_latter_request(struct request_queue *, struct request *); 200extern struct request *elv_latter_request(struct request_queue *, struct request *);
201extern int elv_register_queue(struct request_queue *q);
202extern void elv_unregister_queue(struct request_queue *q);
203extern int elv_may_queue(struct request_queue *, unsigned int); 201extern int elv_may_queue(struct request_queue *, unsigned int);
204extern void elv_completed_request(struct request_queue *, struct request *); 202extern void elv_completed_request(struct request_queue *, struct request *);
205extern int elv_set_request(struct request_queue *q, struct request *rq, 203extern int elv_set_request(struct request_queue *q, struct request *rq,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5144ebe046c9..5e3531027b51 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk)
395{ 395{
396 device_add_disk(NULL, disk); 396 device_add_disk(NULL, disk);
397} 397}
398extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
399static inline void add_disk_no_queue_reg(struct gendisk *disk)
400{
401 device_add_disk_no_queue_reg(NULL, disk);
402}
398 403
399extern void del_gendisk(struct gendisk *gp); 404extern void del_gendisk(struct gendisk *gp);
400extern struct gendisk *get_gendisk(dev_t dev, int *partno); 405extern struct gendisk *get_gendisk(dev_t dev, int *partno);
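add_disk_no_queue_reg() exists for stacking drivers such as dm that only know how their queue must be configured after the disk has been created. A sketch of the deferred flow, assuming a hypothetical my_setup_queue() step and using the existing blk_register_queue() for the final registration:

static int my_create_disk(struct gendisk *disk)
{
	/* make the disk known, but keep its queue out of sysfs for now */
	add_disk_no_queue_reg(disk);

	if (my_setup_queue(disk->queue))
		return -EINVAL;		/* e.g. mq vs legacy / elevator setup failed */

	/* the queue is now fully formed; register it like add_disk() would have */
	return blk_register_queue(disk);
}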
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2d1d9de06728..7f4b60abdf27 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -50,10 +50,7 @@ struct nvm_id;
50struct nvm_dev; 50struct nvm_dev;
51struct nvm_tgt_dev; 51struct nvm_tgt_dev;
52 52
53typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
54typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); 53typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
55typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
56 nvm_l2p_update_fn *, void *);
57typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); 54typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
58typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); 55typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
59typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); 56typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
@@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
66 63
67struct nvm_dev_ops { 64struct nvm_dev_ops {
68 nvm_id_fn *identity; 65 nvm_id_fn *identity;
69 nvm_get_l2p_tbl_fn *get_l2p_tbl;
70 nvm_op_bb_tbl_fn *get_bb_tbl; 66 nvm_op_bb_tbl_fn *get_bb_tbl;
71 nvm_op_set_bb_fn *set_bb_tbl; 67 nvm_op_set_bb_fn *set_bb_tbl;
72 68
@@ -112,8 +108,6 @@ enum {
112 NVM_RSP_WARN_HIGHECC = 0x4700, 108 NVM_RSP_WARN_HIGHECC = 0x4700,
113 109
114 /* Device opcodes */ 110 /* Device opcodes */
115 NVM_OP_HBREAD = 0x02,
116 NVM_OP_HBWRITE = 0x81,
117 NVM_OP_PWRITE = 0x91, 111 NVM_OP_PWRITE = 0x91,
118 NVM_OP_PREAD = 0x92, 112 NVM_OP_PREAD = 0x92,
119 NVM_OP_ERASE = 0x90, 113 NVM_OP_ERASE = 0x90,
@@ -165,12 +159,16 @@ struct nvm_id_group {
165 u8 fmtype; 159 u8 fmtype;
166 u8 num_ch; 160 u8 num_ch;
167 u8 num_lun; 161 u8 num_lun;
168 u8 num_pln; 162 u16 num_chk;
169 u16 num_blk; 163 u16 clba;
170 u16 num_pg;
171 u16 fpg_sz;
172 u16 csecs; 164 u16 csecs;
173 u16 sos; 165 u16 sos;
166
167 u16 ws_min;
168 u16 ws_opt;
169 u16 ws_seq;
170 u16 ws_per_chk;
171
174 u32 trdt; 172 u32 trdt;
175 u32 trdm; 173 u32 trdm;
176 u32 tprt; 174 u32 tprt;
@@ -181,7 +179,10 @@ struct nvm_id_group {
181 u32 mccap; 179 u32 mccap;
182 u16 cpar; 180 u16 cpar;
183 181
184 struct nvm_id_lp_tbl lptbl; 182 /* 1.2 compatibility */
183 u8 num_pln;
184 u16 num_pg;
185 u16 fpg_sz;
185}; 186};
186 187
187struct nvm_addr_format { 188struct nvm_addr_format {
@@ -217,6 +218,10 @@ struct nvm_target {
217 218
218#define ADDR_EMPTY (~0ULL) 219#define ADDR_EMPTY (~0ULL)
219 220
221#define NVM_TARGET_DEFAULT_OP (101)
222#define NVM_TARGET_MIN_OP (3)
223#define NVM_TARGET_MAX_OP (80)
224
220#define NVM_VERSION_MAJOR 1 225#define NVM_VERSION_MAJOR 1
221#define NVM_VERSION_MINOR 0 226#define NVM_VERSION_MINOR 0
222#define NVM_VERSION_PATCH 0 227#define NVM_VERSION_PATCH 0
@@ -239,7 +244,6 @@ struct nvm_rq {
239 void *meta_list; 244 void *meta_list;
240 dma_addr_t dma_meta_list; 245 dma_addr_t dma_meta_list;
241 246
242 struct completion *wait;
243 nvm_end_io_fn *end_io; 247 nvm_end_io_fn *end_io;
244 248
245 uint8_t opcode; 249 uint8_t opcode;
@@ -268,31 +272,38 @@ enum {
268 NVM_BLK_ST_BAD = 0x8, /* Bad block */ 272 NVM_BLK_ST_BAD = 0x8, /* Bad block */
269}; 273};
270 274
275
271/* Device generic information */ 276/* Device generic information */
272struct nvm_geo { 277struct nvm_geo {
278 /* generic geometry */
273 int nr_chnls; 279 int nr_chnls;
274 int nr_luns; 280 int all_luns; /* across channels */
275 int luns_per_chnl; /* -1 if channels are not symmetric */ 281 int nr_luns; /* per channel */
276 int nr_planes; 282 int nr_chks; /* per lun */
277 int sec_per_pg; /* only sectors for a single page */ 283
278 int pgs_per_blk;
279 int blks_per_lun;
280 int fpg_size;
281 int pfpg_size; /* size of buffer if all pages are to be read */
282 int sec_size; 284 int sec_size;
283 int oob_size; 285 int oob_size;
284 int mccap; 286 int mccap;
285 struct nvm_addr_format ppaf;
286 287
287 /* Calculated/Cached values. These do not reflect the actual usable 288 int sec_per_chk;
288 * blocks at run-time. 289 int sec_per_lun;
289 */ 290
291 int ws_min;
292 int ws_opt;
293 int ws_seq;
294 int ws_per_chk;
295
290 int max_rq_size; 296 int max_rq_size;
291 int plane_mode; /* drive device in single, double or quad mode */
292 297
298 int op;
299
300 struct nvm_addr_format ppaf;
301
302 /* Legacy 1.2 specific geometry */
303 int plane_mode; /* drive device in single, double or quad mode */
304 int nr_planes;
305 int sec_per_pg; /* only sectors for a single page */
293 int sec_per_pl; /* all sectors across planes */ 306 int sec_per_pl; /* all sectors across planes */
294 int sec_per_blk;
295 int sec_per_lun;
296}; 307};
297 308
298/* sub-device structure */ 309/* sub-device structure */
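The reworked nvm_geo now expresses the 2.0-style generic geometry (channels, LUNs, chunks, sectors per chunk), keeping the 1.2 plane/page fields only for compatibility. As a rough sketch of how the generic fields compose, with the arithmetic treated as this example's assumption rather than a documented formula:

static unsigned long my_geo_total_secs(const struct nvm_geo *geo)
{
	/* all LUNs across channels, chunks per LUN, sectors per chunk */
	return (unsigned long)geo->all_luns * geo->nr_chks * geo->sec_per_chk;
}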
@@ -320,10 +331,6 @@ struct nvm_dev {
320 /* Device information */ 331 /* Device information */
321 struct nvm_geo geo; 332 struct nvm_geo geo;
322 333
323 /* lower page table */
324 int lps_per_blk;
325 int *lptbl;
326
327 unsigned long total_secs; 334 unsigned long total_secs;
328 335
329 unsigned long *lun_map; 336 unsigned long *lun_map;
@@ -346,36 +353,6 @@ struct nvm_dev {
346 struct list_head targets; 353 struct list_head targets;
347}; 354};
348 355
349static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
350 u64 pba)
351{
352 struct ppa_addr l;
353 int secs, pgs, blks, luns;
354 sector_t ppa = pba;
355
356 l.ppa = 0;
357
358 div_u64_rem(ppa, geo->sec_per_pg, &secs);
359 l.g.sec = secs;
360
361 sector_div(ppa, geo->sec_per_pg);
362 div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
363 l.g.pg = pgs;
364
365 sector_div(ppa, geo->pgs_per_blk);
366 div_u64_rem(ppa, geo->blks_per_lun, &blks);
367 l.g.blk = blks;
368
369 sector_div(ppa, geo->blks_per_lun);
370 div_u64_rem(ppa, geo->luns_per_chnl, &luns);
371 l.g.lun = luns;
372
373 sector_div(ppa, geo->luns_per_chnl);
374 l.g.ch = ppa;
375
376 return l;
377}
378
379static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, 356static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev,
380 struct ppa_addr r) 357 struct ppa_addr r)
381{ 358{
@@ -418,25 +395,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev,
418 return l; 395 return l;
419} 396}
420 397
421static inline int ppa_empty(struct ppa_addr ppa_addr)
422{
423 return (ppa_addr.ppa == ADDR_EMPTY);
424}
425
426static inline void ppa_set_empty(struct ppa_addr *ppa_addr)
427{
428 ppa_addr->ppa = ADDR_EMPTY;
429}
430
431static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
432{
433 if (ppa_empty(ppa1) || ppa_empty(ppa2))
434 return 0;
435
436 return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) &&
437 (ppa1.g.blk == ppa2.g.blk));
438}
439
440typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); 398typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
441typedef sector_t (nvm_tgt_capacity_fn)(void *); 399typedef sector_t (nvm_tgt_capacity_fn)(void *);
442typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, 400typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
@@ -481,17 +439,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
481extern int nvm_max_phys_sects(struct nvm_tgt_dev *); 439extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
482extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); 440extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
483extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); 441extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
484extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
485extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
486 void *);
487extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
488extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
489extern void nvm_end_io(struct nvm_rq *); 442extern void nvm_end_io(struct nvm_rq *);
490extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); 443extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
491extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); 444extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
492 445
493extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
494
495#else /* CONFIG_NVM */ 446#else /* CONFIG_NVM */
496struct nvm_dev_ops; 447struct nvm_dev_ops;
497 448
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index aea87f0d917b..4112e2bd747f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -124,14 +124,20 @@ enum {
124 124
125#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) 125#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7)
126#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) 126#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff)
127#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) 127
128#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) 128enum {
129 129 NVME_CMBSZ_SQS = 1 << 0,
130#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) 130 NVME_CMBSZ_CQS = 1 << 1,
131#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) 131 NVME_CMBSZ_LISTS = 1 << 2,
132#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) 132 NVME_CMBSZ_RDS = 1 << 3,
133#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) 133 NVME_CMBSZ_WDS = 1 << 4,
134#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) 134
135 NVME_CMBSZ_SZ_SHIFT = 12,
136 NVME_CMBSZ_SZ_MASK = 0xfffff,
137
138 NVME_CMBSZ_SZU_SHIFT = 8,
139 NVME_CMBSZ_SZU_MASK = 0xf,
140};
135 141
136/* 142/*
137 * Submission and Completion Queue Entry Sizes for the NVM command set. 143 * Submission and Completion Queue Entry Sizes for the NVM command set.
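The CMBSZ bit macros are replaced by enum constants, leaving the shift/mask arithmetic to the caller. A sketch of computing the Controller Memory Buffer size in bytes from a raw CMBSZ value; the 4KiB base unit scaled by a factor of 16 per SZU step follows the NVMe specification, and my_cmb_size_bytes() itself is hypothetical:

static u64 my_cmb_size_bytes(u32 cmbsz)
{
	u32 sz  = (cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
	u32 szu = (cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	/* SZU 0 means 4KiB units; each increment multiplies the unit by 16 */
	return (u64)sz << (12 + 4 * szu);
}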
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b7c83254c566..22b2131bcdcd 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -276,6 +276,17 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
276 unsigned int n_pages, unsigned int offset, 276 unsigned int n_pages, unsigned int offset,
277 unsigned long size, gfp_t gfp_mask); 277 unsigned long size, gfp_t gfp_mask);
278 278
279#ifdef CONFIG_SGL_ALLOC
280struct scatterlist *sgl_alloc_order(unsigned long long length,
281 unsigned int order, bool chainable,
282 gfp_t gfp, unsigned int *nent_p);
283struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
284 unsigned int *nent_p);
285void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
286void sgl_free_order(struct scatterlist *sgl, int order);
287void sgl_free(struct scatterlist *sgl);
288#endif /* CONFIG_SGL_ALLOC */
289
279size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, 290size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
280 size_t buflen, off_t skip, bool to_buffer); 291 size_t buflen, off_t skip, bool to_buffer);
281 292
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index 42d1a434af29..f9a1be7fc696 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple {
75 __u32 lun_end; 75 __u32 lun_end;
76}; 76};
77 77
78struct nvm_ioctl_create_extended {
79 __u16 lun_begin;
80 __u16 lun_end;
81 __u16 op;
82 __u16 rsv;
83};
84
78enum { 85enum {
79 NVM_CONFIG_TYPE_SIMPLE = 0, 86 NVM_CONFIG_TYPE_SIMPLE = 0,
87 NVM_CONFIG_TYPE_EXTENDED = 1,
80}; 88};
81 89
82struct nvm_ioctl_create_conf { 90struct nvm_ioctl_create_conf {
83 __u32 type; 91 __u32 type;
84 union { 92 union {
85 struct nvm_ioctl_create_simple s; 93 struct nvm_ioctl_create_simple s;
94 struct nvm_ioctl_create_extended e;
86 }; 95 };
87}; 96};
88 97
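User space can now pass an explicit over-provisioning ratio (and a 16-bit LUN range) when creating a target by selecting the extended configuration type. A sketch of filling the new union member; the surrounding struct nvm_ioctl_create setup and the create ioctl call on the LightNVM control device are omitted:

struct nvm_ioctl_create_conf conf = {
	.type = NVM_CONFIG_TYPE_EXTENDED,
	.e = {
		.lun_begin = 0,
		.lun_end   = 127,
		.op        = 11,	/* 11% over-provisioning */
	},
};

Out-of-range ratios are expected to be rejected against the NVM_TARGET_MIN_OP/NVM_TARGET_MAX_OP bounds added in linux/lightnvm.h.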
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
39 } 39 }
40} 40}
41 41
42static cpumask_var_t *alloc_node_to_present_cpumask(void) 42static cpumask_var_t *alloc_node_to_possible_cpumask(void)
43{ 43{
44 cpumask_var_t *masks; 44 cpumask_var_t *masks;
45 int node; 45 int node;
@@ -62,7 +62,7 @@ out_unwind:
62 return NULL; 62 return NULL;
63} 63}
64 64
65static void free_node_to_present_cpumask(cpumask_var_t *masks) 65static void free_node_to_possible_cpumask(cpumask_var_t *masks)
66{ 66{
67 int node; 67 int node;
68 68
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
71 kfree(masks); 71 kfree(masks);
72} 72}
73 73
74static void build_node_to_present_cpumask(cpumask_var_t *masks) 74static void build_node_to_possible_cpumask(cpumask_var_t *masks)
75{ 75{
76 int cpu; 76 int cpu;
77 77
78 for_each_present_cpu(cpu) 78 for_each_possible_cpu(cpu)
79 cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); 79 cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
80} 80}
81 81
82static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, 82static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
83 const struct cpumask *mask, nodemask_t *nodemsk) 83 const struct cpumask *mask, nodemask_t *nodemsk)
84{ 84{
85 int n, nodes = 0; 85 int n, nodes = 0;
86 86
87 /* Calculate the number of nodes in the supplied affinity mask */ 87 /* Calculate the number of nodes in the supplied affinity mask */
88 for_each_node(n) { 88 for_each_node(n) {
89 if (cpumask_intersects(mask, node_to_present_cpumask[n])) { 89 if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
90 node_set(n, *nodemsk); 90 node_set(n, *nodemsk);
91 nodes++; 91 nodes++;
92 } 92 }
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
109 int last_affv = affv + affd->pre_vectors; 109 int last_affv = affv + affd->pre_vectors;
110 nodemask_t nodemsk = NODE_MASK_NONE; 110 nodemask_t nodemsk = NODE_MASK_NONE;
111 struct cpumask *masks; 111 struct cpumask *masks;
112 cpumask_var_t nmsk, *node_to_present_cpumask; 112 cpumask_var_t nmsk, *node_to_possible_cpumask;
113 113
114 /* 114 /*
115 * If there aren't any vectors left after applying the pre/post 115 * If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
125 if (!masks) 125 if (!masks)
126 goto out; 126 goto out;
127 127
128 node_to_present_cpumask = alloc_node_to_present_cpumask(); 128 node_to_possible_cpumask = alloc_node_to_possible_cpumask();
129 if (!node_to_present_cpumask) 129 if (!node_to_possible_cpumask)
130 goto out; 130 goto out;
131 131
132 /* Fill out vectors at the beginning that don't need affinity */ 132 /* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
135 135
136 /* Stabilize the cpumasks */ 136 /* Stabilize the cpumasks */
137 get_online_cpus(); 137 get_online_cpus();
138 build_node_to_present_cpumask(node_to_present_cpumask); 138 build_node_to_possible_cpumask(node_to_possible_cpumask);
139 nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, 139 nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
140 &nodemsk); 140 &nodemsk);
141 141
142 /* 142 /*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
146 if (affv <= nodes) { 146 if (affv <= nodes) {
147 for_each_node_mask(n, nodemsk) { 147 for_each_node_mask(n, nodemsk) {
148 cpumask_copy(masks + curvec, 148 cpumask_copy(masks + curvec,
149 node_to_present_cpumask[n]); 149 node_to_possible_cpumask[n]);
150 if (++curvec == last_affv) 150 if (++curvec == last_affv)
151 break; 151 break;
152 } 152 }
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
160 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; 160 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
161 161
162 /* Get the cpus on this node which are in the mask */ 162 /* Get the cpus on this node which are in the mask */
163 cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); 163 cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
164 164
165 /* Calculate the number of cpus per vector */ 165 /* Calculate the number of cpus per vector */
166 ncpus = cpumask_weight(nmsk); 166 ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ done:
192 /* Fill out vectors at the end that don't need affinity */ 192 /* Fill out vectors at the end that don't need affinity */
193 for (; curvec < nvecs; curvec++) 193 for (; curvec < nvecs; curvec++)
194 cpumask_copy(masks + curvec, irq_default_affinity); 194 cpumask_copy(masks + curvec, irq_default_affinity);
195 free_node_to_present_cpumask(node_to_present_cpumask); 195 free_node_to_possible_cpumask(node_to_possible_cpumask);
196out: 196out:
197 free_cpumask_var(nmsk); 197 free_cpumask_var(nmsk);
198 return masks; 198 return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
214 return 0; 214 return 0;
215 215
216 get_online_cpus(); 216 get_online_cpus();
217 ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; 217 ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
218 put_online_cpus(); 218 put_online_cpus();
219 return ret; 219 return ret;
220} 220}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a46be1261c09..11b4282c2d20 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb)
240static void hib_end_io(struct bio *bio) 240static void hib_end_io(struct bio *bio)
241{ 241{
242 struct hib_bio_batch *hb = bio->bi_private; 242 struct hib_bio_batch *hb = bio->bi_private;
243 struct page *page = bio->bi_io_vec[0].bv_page; 243 struct page *page = bio_first_page_all(bio);
244 244
245 if (bio->bi_status) { 245 if (bio->bi_status) {
246 pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", 246 pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
diff --git a/lib/Kconfig b/lib/Kconfig
index c5e84fbcb30b..4dd5c11366f9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -409,6 +409,10 @@ config HAS_DMA
409 depends on !NO_DMA 409 depends on !NO_DMA
410 default y 410 default y
411 411
412config SGL_ALLOC
413 bool
414 default n
415
412config DMA_NOOP_OPS 416config DMA_NOOP_OPS
413 bool 417 bool
414 depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) 418 depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 80aa8d5463fa..42b5ca0acf93 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -462,7 +462,7 @@ static void sbq_wake_up(struct sbitmap_queue *sbq)
462 */ 462 */
463 atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); 463 atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
464 sbq_index_atomic_inc(&sbq->wake_index); 464 sbq_index_atomic_inc(&sbq->wake_index);
465 wake_up(&ws->wait); 465 wake_up_nr(&ws->wait, wake_batch);
466 } 466 }
467} 467}
468 468
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 7c1c55f7daaa..53728d391d3a 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -474,6 +474,133 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
474} 474}
475EXPORT_SYMBOL(sg_alloc_table_from_pages); 475EXPORT_SYMBOL(sg_alloc_table_from_pages);
476 476
477#ifdef CONFIG_SGL_ALLOC
478
479/**
480 * sgl_alloc_order - allocate a scatterlist and its pages
481 * @length: Length in bytes of the scatterlist. Must be at least one
482 * @order: Second argument for alloc_pages()
483 * @chainable: Whether or not to allocate an extra element in the scatterlist
484 * for scatterlist chaining purposes
485 * @gfp: Memory allocation flags
486 * @nent_p: [out] Number of entries in the scatterlist that have pages
487 *
488 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
489 */
490struct scatterlist *sgl_alloc_order(unsigned long long length,
491 unsigned int order, bool chainable,
492 gfp_t gfp, unsigned int *nent_p)
493{
494 struct scatterlist *sgl, *sg;
495 struct page *page;
496 unsigned int nent, nalloc;
497 u32 elem_len;
498
499 nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order);
500 /* Check for integer overflow */
501 if (length > (nent << (PAGE_SHIFT + order)))
502 return NULL;
503 nalloc = nent;
504 if (chainable) {
505 /* Check for integer overflow */
506 if (nalloc + 1 < nalloc)
507 return NULL;
508 nalloc++;
509 }
510 sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
511 (gfp & ~GFP_DMA) | __GFP_ZERO);
512 if (!sgl)
513 return NULL;
514
515 sg_init_table(sgl, nalloc);
516 sg = sgl;
517 while (length) {
518 elem_len = min_t(u64, length, PAGE_SIZE << order);
519 page = alloc_pages(gfp, order);
520 if (!page) {
521 sgl_free(sgl);
522 return NULL;
523 }
524
525 sg_set_page(sg, page, elem_len, 0);
526 length -= elem_len;
527 sg = sg_next(sg);
528 }
529 WARN_ONCE(length, "length = %lld\n", length);
530 if (nent_p)
531 *nent_p = nent;
532 return sgl;
533}
534EXPORT_SYMBOL(sgl_alloc_order);
535
536/**
537 * sgl_alloc - allocate a scatterlist and its pages
538 * @length: Length in bytes of the scatterlist
539 * @gfp: Memory allocation flags
540 * @nent_p: [out] Number of entries in the scatterlist
541 *
542 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
543 */
544struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
545 unsigned int *nent_p)
546{
547 return sgl_alloc_order(length, 0, false, gfp, nent_p);
548}
549EXPORT_SYMBOL(sgl_alloc);
550
551/**
552 * sgl_free_n_order - free a scatterlist and its pages
553 * @sgl: Scatterlist with one or more elements
554 * @nents: Maximum number of elements to free
555 * @order: Second argument for __free_pages()
556 *
557 * Notes:
558 * - If several scatterlists have been chained and each chain element is
559 * freed separately then it's essential to set nents correctly to avoid that a
560 * page would get freed twice.
561 * - All pages in a chained scatterlist can be freed at once by setting @nents
562 * to a high number.
563 */
564void sgl_free_n_order(struct scatterlist *sgl, int nents, int order)
565{
566 struct scatterlist *sg;
567 struct page *page;
568 int i;
569
570 for_each_sg(sgl, sg, nents, i) {
571 if (!sg)
572 break;
573 page = sg_page(sg);
574 if (page)
575 __free_pages(page, order);
576 }
577 kfree(sgl);
578}
579EXPORT_SYMBOL(sgl_free_n_order);
580
581/**
582 * sgl_free_order - free a scatterlist and its pages
583 * @sgl: Scatterlist with one or more elements
584 * @order: Second argument for __free_pages()
585 */
586void sgl_free_order(struct scatterlist *sgl, int order)
587{
588 sgl_free_n_order(sgl, INT_MAX, order);
589}
590EXPORT_SYMBOL(sgl_free_order);
591
592/**
593 * sgl_free - free a scatterlist and its pages
594 * @sgl: Scatterlist with one or more elements
595 */
596void sgl_free(struct scatterlist *sgl)
597{
598 sgl_free_order(sgl, 0);
599}
600EXPORT_SYMBOL(sgl_free);
601
602#endif /* CONFIG_SGL_ALLOC */
603
477void __sg_page_iter_start(struct sg_page_iter *piter, 604void __sg_page_iter_start(struct sg_page_iter *piter,
478 struct scatterlist *sglist, unsigned int nents, 605 struct scatterlist *sglist, unsigned int nents,
479 unsigned long pgoffset) 606 unsigned long pgoffset)
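The sgl_alloc*/sgl_free* family gives drivers (whose Kconfig entries select SGL_ALLOC) a common way to allocate a scatterlist together with its backing pages and to free both again. A minimal usage sketch with hypothetical my_* wrappers:

static struct scatterlist *my_alloc_bounce(size_t len, unsigned int *nents)
{
	/* order-0 pages, no chaining; returns NULL on allocation failure */
	return sgl_alloc(len, GFP_KERNEL, nents);
}

static void my_free_bounce(struct scatterlist *sgl)
{
	/* releases the pages and then the scatterlist array itself */
	sgl_free(sgl);
}

Chained scatterlists freed piecewise should go through sgl_free_n_order() with an accurate nents, per the note above, so no page is freed twice.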
diff --git a/mm/page_io.c b/mm/page_io.c
index e93f1a4cacd7..b41cf9644585 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -50,7 +50,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
50 50
51void end_swap_bio_write(struct bio *bio) 51void end_swap_bio_write(struct bio *bio)
52{ 52{
53 struct page *page = bio->bi_io_vec[0].bv_page; 53 struct page *page = bio_first_page_all(bio);
54 54
55 if (bio->bi_status) { 55 if (bio->bi_status) {
56 SetPageError(page); 56 SetPageError(page);
@@ -122,7 +122,7 @@ static void swap_slot_free_notify(struct page *page)
122 122
123static void end_swap_bio_read(struct bio *bio) 123static void end_swap_bio_read(struct bio *bio)
124{ 124{
125 struct page *page = bio->bi_io_vec[0].bv_page; 125 struct page *page = bio_first_page_all(bio);
126 struct task_struct *waiter = bio->bi_private; 126 struct task_struct *waiter = bio->bi_private;
127 127
128 if (bio->bi_status) { 128 if (bio->bi_status) {