Diffstat (limited to 'block/blk-core.c')
-rw-r--r--  block/blk-core.c | 605
1 file changed, 324 insertions, 281 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 2cba5ef97b2b..2d053b584410 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,8 +26,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -50,27 +48,26 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
 static void drive_stat_acct(struct request *rq, int new_io)
 {
 	struct hd_struct *part;
 	int rw = rq_data_dir(rq);
+	int cpu;
 
 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;
 
-	part = get_part(rq->rq_disk, rq->sector);
+	cpu = part_stat_lock();
+	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
+
 	if (!new_io)
-		__all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
+		part_stat_inc(cpu, part, merges[rw]);
 	else {
-		disk_round_stats(rq->rq_disk);
-		rq->rq_disk->in_flight++;
-		if (part) {
-			part_round_stats(part);
-			part->in_flight++;
-		}
+		part_round_stats(cpu, part);
+		part_inc_in_flight(part);
 	}
+
+	part_stat_unlock();
 }
 
 void blk_queue_congestion_threshold(struct request_queue *q)
@@ -113,7 +110,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	memset(rq, 0, sizeof(*rq));
 
 	INIT_LIST_HEAD(&rq->queuelist);
-	INIT_LIST_HEAD(&rq->donelist);
+	INIT_LIST_HEAD(&rq->timeout_list);
+	rq->cpu = -1;
 	rq->q = q;
 	rq->sector = rq->hard_sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
@@ -308,7 +306,7 @@ void blk_unplug_timeout(unsigned long data)
 	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
-	kblockd_schedule_work(&q->unplug_work);
+	kblockd_schedule_work(q, &q->unplug_work);
 }
 
 void blk_unplug(struct request_queue *q)
@@ -325,6 +323,21 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+	/*
+	 * one level of recursion is ok and is much faster than kicking
+	 * the unplug handling
+	 */
+	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+		q->request_fn(q);
+		queue_flag_clear(QUEUE_FLAG_REENTER, q);
+	} else {
+		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+		kblockd_schedule_work(q, &q->unplug_work);
+	}
+}
+
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q: The &struct request_queue in question
@@ -339,18 +352,7 @@ void blk_start_queue(struct request_queue *q)
 	WARN_ON(!irqs_disabled());
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-
-	/*
-	 * one level of recursion is ok and is much faster than kicking
-	 * the unplug handling
-	 */
-	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-		q->request_fn(q);
-		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		blk_plug_device(q);
-		kblockd_schedule_work(&q->unplug_work);
-	}
+	blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -408,15 +410,8 @@ void __blk_run_queue(struct request_queue *q)
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
 	 */
-	if (!elv_queue_empty(q)) {
-		if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-			q->request_fn(q);
-			queue_flag_clear(QUEUE_FLAG_REENTER, q);
-		} else {
-			blk_plug_device(q);
-			kblockd_schedule_work(&q->unplug_work);
-		}
-	}
+	if (!elv_queue_empty(q))
+		blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -441,6 +436,14 @@ void blk_put_queue(struct request_queue *q)
 
 void blk_cleanup_queue(struct request_queue *q)
 {
+	/*
+	 * We know we have process context here, so we can be a little
+	 * cautious and ensure that pending block actions on this device
+	 * are done before moving on. Going into this function, we should
+	 * not have processes doing IO to this device.
+	 */
+	blk_sync_queue(q);
+
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);
@@ -496,6 +499,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	}
 
 	init_timer(&q->unplug_timer);
+	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+	INIT_LIST_HEAD(&q->timeout_list);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
@@ -531,7 +536,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
  * request queue; this lock will be taken also from interrupt context, so irq
  * disabling is needed for it.
  *
- * Function returns a pointer to the initialized request queue, or NULL if
+ * Function returns a pointer to the initialized request queue, or %NULL if
  * it didn't succeed.
  *
  * Note:
@@ -569,7 +574,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	q->request_fn = rfn;
 	q->prep_rq_fn = NULL;
 	q->unplug_fn = generic_unplug_device;
-	q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
+	q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
+			  1 << QUEUE_FLAG_STACKABLE);
 	q->queue_lock = lock;
 
 	blk_queue_segment_boundary(q, 0xffffffff);
@@ -624,10 +630,6 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
 
 	blk_rq_init(q, rq);
 
-	/*
-	 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
-	 * see bio.h and blkdev.h
-	 */
 	rq->cmd_flags = rw | REQ_ALLOCED;
 
 	if (priv) {
@@ -888,9 +890,11 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_start_queueing(struct request_queue *q)
 {
-	if (!blk_queue_plugged(q))
+	if (!blk_queue_plugged(q)) {
+		if (unlikely(blk_queue_stopped(q)))
+			return;
 		q->request_fn(q);
-	else
+	} else
 		__generic_unplug_device(q);
 }
 EXPORT_SYMBOL(blk_start_queueing);
@@ -907,6 +911,8 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
+	blk_delete_timer(rq);
+	blk_clear_rq_complete(rq);
 	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
 
 	if (blk_rq_tagged(rq))
@@ -917,7 +923,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL(blk_requeue_request);
 
 /**
- * blk_insert_request - insert a special request in to a request queue
+ * blk_insert_request - insert a special request into a request queue
  * @q: request queue where request should be inserted
  * @rq: request to be inserted
  * @at_head: insert request at head or tail of queue
@@ -927,8 +933,8 @@ EXPORT_SYMBOL(blk_requeue_request);
  * Many block devices need to execute commands asynchronously, so they don't
  * block the whole kernel from preemption during request execution. This is
  * accomplished normally by inserting aritficial requests tagged as
- * REQ_SPECIAL in to the corresponding request queue, and letting them be
- * scheduled for actual execution by the request queue.
+ * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
+ * be scheduled for actual execution by the request queue.
  *
  * We have the option of inserting the head or the tail of the queue.
  * Typically we use the tail for new ioctls and so forth. We use the head
@@ -982,8 +988,22 @@ static inline void add_request(struct request_queue *q, struct request *req)
 	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
 }
 
-/*
- * disk_round_stats() - Round off the performance stats on a struct
+static void part_round_stats_single(int cpu, struct hd_struct *part,
+				    unsigned long now)
+{
+	if (now == part->stamp)
+		return;
+
+	if (part->in_flight) {
+		__part_stat_add(cpu, part, time_in_queue,
+				part->in_flight * (now - part->stamp));
+		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+	}
+	part->stamp = now;
+}
+
+/**
+ * part_round_stats() - Round off the performance stats on a struct
  * disk_stats.
  *
  * The average IO queue length and utilisation statistics are maintained
@@ -997,36 +1017,15 @@ static inline void add_request(struct request_queue *q, struct request *req)
  * /proc/diskstats. This accounts immediately for all queue usage up to
  * the current jiffies and restarts the counters again.
  */
-void disk_round_stats(struct gendisk *disk)
+void part_round_stats(int cpu, struct hd_struct *part)
 {
 	unsigned long now = jiffies;
 
-	if (now == disk->stamp)
-		return;
-
-	if (disk->in_flight) {
-		__disk_stat_add(disk, time_in_queue,
-				disk->in_flight * (now - disk->stamp));
-		__disk_stat_add(disk, io_ticks, (now - disk->stamp));
-	}
-	disk->stamp = now;
-}
-EXPORT_SYMBOL_GPL(disk_round_stats);
-
-void part_round_stats(struct hd_struct *part)
-{
-	unsigned long now = jiffies;
-
-	if (now == part->stamp)
-		return;
-
-	if (part->in_flight) {
-		__part_stat_add(part, time_in_queue,
-				part->in_flight * (now - part->stamp));
-		__part_stat_add(part, io_ticks, (now - part->stamp));
-	}
-	part->stamp = now;
+	if (part->partno)
+		part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
+	part_round_stats_single(cpu, part, now);
 }
+EXPORT_SYMBOL_GPL(part_round_stats);
 
 /*
  * queue lock must be held
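
As an aside (not part of the patch): the two hunks above replace the old per-disk disk_round_stats()/in_flight bookkeeping with per-partition, per-CPU statistics. A minimal sketch of the new calling convention, mirroring how drive_stat_acct() earlier in this diff uses it; the helper name my_account_merge() is hypothetical:

static void my_account_merge(struct request *rq)
{
	struct hd_struct *part;
	int cpu;

	/* part_stat_lock() returns the current CPU and pairs with part_stat_unlock() */
	cpu = part_stat_lock();
	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);

	/* fold the elapsed time into io_ticks/time_in_queue, then count the merge */
	part_round_stats(cpu, part);
	part_stat_inc(cpu, part, merges[rq_data_dir(rq)]);

	part_stat_unlock();
}
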
@@ -1070,6 +1069,7 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
+	req->cpu = bio->bi_comp_cpu;
 	req->cmd_type = REQ_TYPE_FS;
 
 	/*
@@ -1081,7 +1081,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	/*
 	 * REQ_BARRIER implies no merging, but lets make it explicit
 	 */
-	if (unlikely(bio_barrier(bio)))
+	if (unlikely(bio_discard(bio))) {
+		req->cmd_flags |= REQ_DISCARD;
+		if (bio_barrier(bio))
+			req->cmd_flags |= REQ_SOFTBARRIER;
+		req->q->prepare_discard_fn(req->q, req);
+	} else if (unlikely(bio_barrier(bio)))
 		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
 	if (bio_sync(bio))
@@ -1099,7 +1104,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, err;
+	int el_ret, nr_sectors, barrier, discard, err;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1114,7 +1119,14 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	blk_queue_bounce(q, &bio);
 
 	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && bio_has_data(bio) &&
+	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
+	discard = bio_discard(bio);
+	if (unlikely(discard) && !q->prepare_discard_fn) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
@@ -1138,6 +1150,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		req->biotail = bio;
 		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 		req->ioprio = ioprio_best(req->ioprio, prio);
+		if (!blk_rq_cpu_valid(req))
+			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
 		if (!attempt_back_merge(q, req))
 			elv_merged_request(q, req, el_ret);
@@ -1165,6 +1179,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		req->sector = req->hard_sector = bio->bi_sector;
 		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 		req->ioprio = ioprio_best(req->ioprio, prio);
+		if (!blk_rq_cpu_valid(req))
+			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
 		if (!attempt_front_merge(q, req))
 			elv_merged_request(q, req, el_ret);
@@ -1200,13 +1216,15 @@ get_rq:
 	init_request_from_bio(req, bio);
 
 	spin_lock_irq(q->queue_lock);
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
+	    bio_flagged(bio, BIO_CPU_AFFINE))
+		req->cpu = blk_cpu_to_group(smp_processor_id());
 	if (elv_queue_empty(q))
 		blk_plug_device(q);
 	add_request(q, req);
 out:
 	if (sync)
 		__generic_unplug_device(q);
-
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 
@@ -1260,8 +1278,9 @@ __setup("fail_make_request=", setup_fail_make_request);
 
 static int should_fail_request(struct bio *bio)
 {
-	if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
-	    (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
+	struct hd_struct *part = bio->bi_bdev->bd_part;
+
+	if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
 		return should_fail(&fail_make_request, bio->bi_size);
 
 	return 0;
@@ -1314,7 +1333,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 }
 
 /**
- * generic_make_request: hand a buffer to its device driver for I/O
+ * generic_make_request - hand a buffer to its device driver for I/O
  * @bio: The bio describing the location in memory and on the device.
  *
  * generic_make_request() is used to make I/O requests of block
@@ -1409,7 +1428,8 @@ end_io:
 
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
-		if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+		if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
+		    (bio_discard(bio) && !q->prepare_discard_fn)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
@@ -1471,13 +1491,13 @@ void generic_make_request(struct bio *bio)
 EXPORT_SYMBOL(generic_make_request);
 
 /**
- * submit_bio: submit a bio to the block device layer for I/O
+ * submit_bio - submit a bio to the block device layer for I/O
  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
  * @bio: The &struct bio which describes the I/O
  *
  * submit_bio() is very similar in purpose to generic_make_request(), and
  * uses that function to do most of the work. Both are fairly rough
- * interfaces, @bio must be presetup and ready for I/O.
+ * interfaces; @bio must be presetup and ready for I/O.
  *
  */
 void submit_bio(int rw, struct bio *bio)
@@ -1490,11 +1510,7 @@ void submit_bio(int rw, struct bio *bio)
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio)) {
-
-		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
-
+	if (bio_has_data(bio)) {
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
@@ -1517,9 +1533,90 @@ void submit_bio(int rw, struct bio *bio)
 EXPORT_SYMBOL(submit_bio);
 
 /**
+ * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * @q: the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ * @rq may have been made based on weaker limitations of upper-level queues
+ * in request stacking drivers, and it may violate the limitation of @q.
+ * Since the block layer and the underlying device driver trust @rq
+ * after it is inserted to @q, it should be checked against @q before
+ * the insertion using this generic function.
+ *
+ * This function should also be useful for request stacking drivers
+ * in some cases below, so export this fuction.
+ * Request stacking drivers like request-based dm may change the queue
+ * limits while requests are in the queue (e.g. dm's table swapping).
+ * Such request stacking drivers should check those requests agaist
+ * the new queue limits again when they dispatch those requests,
+ * although such checkings are also done against the old queue limits
+ * when submitting requests.
+ */
+int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+{
+	if (rq->nr_sectors > q->max_sectors ||
+	    rq->data_len > q->max_hw_sectors << 9) {
+		printk(KERN_ERR "%s: over max size limit.\n", __func__);
+		return -EIO;
+	}
+
+	/*
+	 * queue's settings related to segment counting like q->bounce_pfn
+	 * may differ from that of other stacking queues.
+	 * Recalculate it to check the request correctly on this queue's
+	 * limitation.
+	 */
+	blk_recalc_rq_segments(rq);
+	if (rq->nr_phys_segments > q->max_phys_segments ||
+	    rq->nr_phys_segments > q->max_hw_segments) {
+		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_rq_check_limits);
+
+/**
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @q: the queue to submit the request
+ * @rq: the request being queued
+ */
+int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+{
+	unsigned long flags;
+
+	if (blk_rq_check_limits(q, rq))
+		return -EIO;
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+	if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
+	    should_fail(&fail_make_request, blk_rq_bytes(rq)))
+		return -EIO;
+#endif
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/*
+	 * Submitting request must be dequeued before calling this function
+	 * because it will be linked to another request_queue
+	 */
+	BUG_ON(blk_queued_rq(rq));
+
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+
+/**
  * __end_that_request_first - end I/O on a request
  * @req: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
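
As an aside (not part of the patch): the kernel-doc above spells out how a request stacking driver such as request-based dm is expected to use these new exports. A hedged sketch of the submission path on the lower device's queue; the function name and the way the clone is obtained are hypothetical:

static int my_dispatch_clone(struct request_queue *lower_q, struct request *clone)
{
	int ret;

	/*
	 * The clone must already be dequeued from the stacking queue;
	 * blk_insert_cloned_request() re-checks it against lower_q's
	 * limits (via blk_rq_check_limits()) before queueing it there.
	 */
	ret = blk_insert_cloned_request(lower_q, clone);
	if (ret)
		return ret;	/* e.g. -EIO: fail the original request */

	return 0;
}
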
@@ -1527,8 +1624,8 @@ EXPORT_SYMBOL(submit_bio);
  * for the next range of segments (if any) in the cluster.
  *
  * Return:
- * 0 - we are done with this request, call end_that_request_last()
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request, call end_that_request_last()
+ * %1 - still buffers pending for this request
  **/
 static int __end_that_request_first(struct request *req, int error,
 				    int nr_bytes)
@@ -1539,7 +1636,7 @@ static int __end_that_request_first(struct request *req, int error,
 	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
 
 	/*
-	 * for a REQ_BLOCK_PC request, we want to carry any eventual
+	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
 	 * sense key with us all the way through
 	 */
 	if (!blk_pc_request(req))
@@ -1552,11 +1649,14 @@ static int __end_that_request_first(struct request *req, int error,
 	}
 
 	if (blk_fs_request(req) && req->rq_disk) {
-		struct hd_struct *part = get_part(req->rq_disk, req->sector);
 		const int rw = rq_data_dir(req);
+		struct hd_struct *part;
+		int cpu;
 
-		all_stat_add(req->rq_disk, part, sectors[rw],
-			     nr_bytes >> 9, req->sector);
+		cpu = part_stat_lock();
+		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
+		part_stat_unlock();
 	}
 
 	total_bytes = bio_nbytes = 0;
@@ -1641,88 +1741,14 @@ static int __end_that_request_first(struct request *req, int error,
 }
 
 /*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-	struct list_head *cpu_list, local_list;
-
-	local_irq_disable();
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_replace_init(cpu_list, &local_list);
-	local_irq_enable();
-
-	while (!list_empty(&local_list)) {
-		struct request *rq;
-
-		rq = list_entry(local_list.next, struct request, donelist);
-		list_del_init(&rq->donelist);
-		rq->q->softirq_done_fn(rq);
-	}
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
-				    unsigned long action, void *hcpu)
-{
-	/*
-	 * If a CPU goes away, splice its entries to the current CPU
-	 * and trigger a run of the softirq
-	 */
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		int cpu = (unsigned long) hcpu;
-
-		local_irq_disable();
-		list_splice_init(&per_cpu(blk_cpu_done, cpu),
-				 &__get_cpu_var(blk_cpu_done));
-		raise_softirq_irqoff(BLOCK_SOFTIRQ);
-		local_irq_enable();
-	}
-
-	return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
-	.notifier_call	= blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions,
- * unless the driver actually implements this in its completion callback
- * through requeueing. The actual completion happens out-of-order,
- * through a softirq handler. The user must have registered a completion
- * callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
-	struct list_head *cpu_list;
-	unsigned long flags;
-
-	BUG_ON(!req->q->softirq_done_fn);
-
-	local_irq_save(flags);
-
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_add_tail(&req->donelist, cpu_list);
-	raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
-/*
  * queue lock must be held
  */
 static void end_that_request_last(struct request *req, int error)
 {
 	struct gendisk *disk = req->rq_disk;
 
+	blk_delete_timer(req);
+
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
@@ -1740,16 +1766,18 @@ static void end_that_request_last(struct request *req, int error)
 	if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
-		struct hd_struct *part = get_part(disk, req->sector);
-
-		__all_stat_inc(disk, part, ios[rw], req->sector);
-		__all_stat_add(disk, part, ticks[rw], duration, req->sector);
-		disk_round_stats(disk);
-		disk->in_flight--;
-		if (part) {
-			part_round_stats(part);
-			part->in_flight--;
-		}
+		struct hd_struct *part;
+		int cpu;
+
+		cpu = part_stat_lock();
+		part = disk_map_sector_rcu(disk, req->sector);
+
+		part_stat_inc(cpu, part, ios[rw]);
+		part_stat_add(cpu, part, ticks[rw], duration);
+		part_round_stats(cpu, part);
+		part_dec_in_flight(part);
+
+		part_stat_unlock();
 	}
 
 	if (req->end_io)
@@ -1762,17 +1790,6 @@ static void end_that_request_last(struct request *req, int error)
 	}
 }
 
-static inline void __end_request(struct request *rq, int uptodate,
-				 unsigned int nr_bytes)
-{
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_request(rq, error, nr_bytes);
-}
-
 /**
  * blk_rq_bytes - Returns bytes left to complete in the entire request
  * @rq: the request being processed
@@ -1803,74 +1820,57 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 
 /**
- * end_queued_request - end all I/O on a queued request
- * @rq: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
- *
- * Description:
- * Ends all I/O on a request, and removes it from the block layer queues.
- * Not suitable for normal IO completion, unless the driver still has
- * the request attached to the block layer.
- *
- **/
-void end_queued_request(struct request *rq, int uptodate)
-{
-	__end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_queued_request);
-
-/**
- * end_dequeued_request - end all I/O on a dequeued request
- * @rq: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
- *
- * Description:
- * Ends all I/O on a request. The request must already have been
- * dequeued using blkdev_dequeue_request(), as is normally the case
- * for most drivers.
- *
- **/
-void end_dequeued_request(struct request *rq, int uptodate)
-{
-	__end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_dequeued_request);
-
-
-/**
  * end_request - end I/O on the current segment of the request
  * @req: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
+ * @uptodate: error value or %0/%1 uptodate flag
  *
  * Description:
  * Ends I/O on the current segment of a request. If that is the only
  * remaining segment, the request is also completed and freed.
  *
- * This is a remnant of how older block drivers handled IO completions.
- * Modern drivers typically end IO on the full request in one go, unless
+ * This is a remnant of how older block drivers handled I/O completions.
+ * Modern drivers typically end I/O on the full request in one go, unless
  * they have a residual value to account for. For that case this function
  * isn't really useful, unless the residual just happens to be the
  * full current segment. In other words, don't use this function in new
- * code. Either use end_request_completely(), or the
- * end_that_request_chunk() (along with end_that_request_last()) for
- * partial completions.
- *
+ * code. Use blk_end_request() or __blk_end_request() to end a request.
  **/
 void end_request(struct request *req, int uptodate)
 {
-	__end_request(req, uptodate, req->hard_cur_sectors << 9);
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_request(req, error, req->hard_cur_sectors << 9);
 }
 EXPORT_SYMBOL(end_request);
 
+static int end_that_request_data(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
+{
+	if (rq->bio) {
+		if (__end_that_request_first(rq, error, nr_bytes))
+			return 1;
+
+		/* Bidi request must be completed as a whole */
+		if (blk_bidi_rq(rq) &&
+		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
+			return 1;
+	}
+
+	return 0;
+}
+
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
@@ -1878,8 +1878,8 @@ EXPORT_SYMBOL(end_request);
  * If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- * 0 - we are done with this request
- * 1 - this request is not freed yet, it still has pending buffers.
+ * %0 - we are done with this request
+ * %1 - this request is not freed yet, it still has pending buffers.
  **/
 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 		      unsigned int bidi_bytes,
@@ -1888,15 +1888,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
+	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
+		return 1;
 
 	/* Special feature for tricky drivers */
 	if (drv_callback && drv_callback(rq))
@@ -1914,7 +1907,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 /**
  * blk_end_request - Helper function for drivers to complete the request.
  * @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -1922,8 +1915,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
  * If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
  **/
 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
@@ -1934,22 +1927,20 @@ EXPORT_SYMBOL_GPL(blk_end_request);
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
  * Must be called with queue lock held unlike blk_end_request().
  *
  * Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-	}
+	if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
+		return 1;
 
 	add_disk_randomness(rq->rq_disk);
 
@@ -1962,7 +1953,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 /**
  * blk_end_bidi_request - Helper function for drivers to complete bidi request.
  * @rq: the bidi request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
@@ -1970,8 +1961,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
  * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
  *
  * Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
  **/
 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 			 unsigned int bidi_bytes)
@@ -1981,13 +1972,43 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
+ * blk_update_request - Special helper function for request stacking drivers
+ * @rq: the request being processed
+ * @error: %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete @rq
+ *
+ * Description:
+ * Ends I/O on a number of bytes attached to @rq, but doesn't complete
+ * the request structure even if @rq doesn't have leftover.
+ * If @rq has leftover, sets it up for the next range of segments.
+ *
+ * This special helper function is only for request stacking drivers
+ * (e.g. request-based dm) so that they can handle partial completion.
+ * Actual device drivers should use blk_end_request instead.
+ */
+void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	if (!end_that_request_data(rq, error, nr_bytes, 0)) {
+		/*
+		 * These members are not updated in end_that_request_data()
+		 * when all bios are completed.
+		 * Update them so that the request stacking driver can find
+		 * how many bytes remain in the request later.
+		 */
+		rq->nr_sectors = rq->hard_nr_sectors = 0;
+		rq->current_nr_sectors = rq->hard_cur_sectors = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
+/**
  * blk_end_request_callback - Special helper function for tricky drivers
  * @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
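
As an aside (not part of the patch): a brief sketch of the partial-completion case blk_update_request() is documented for, as a request stacking driver might use it when the lower device reports progress on a clone (the function and parameter names are hypothetical):

static void my_clone_progress(struct request *orig, unsigned int bytes_done)
{
	/*
	 * Account bytes_done on the original request without freeing it;
	 * rq->nr_sectors and friends are updated so the stacking driver
	 * can later see how much of the request is still outstanding.
	 */
	blk_update_request(orig, 0, bytes_done);
}
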
@@ -2000,10 +2021,10 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
  * Don't use this interface in other places anymore.
  *
  * Return:
- * 0 - we are done with this request
- * 1 - this request is not freed yet.
+ * %0 - we are done with this request
+ * %1 - this request is not freed yet.
  *     this request still has pending buffers or
  *     the driver doesn't want to finish this request yet.
  **/
 int blk_end_request_callback(struct request *rq, int error,
 			     unsigned int nr_bytes,
@@ -2016,15 +2037,17 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
+	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
-	rq->nr_phys_segments = bio_phys_segments(q, bio);
-	rq->nr_hw_segments = bio_hw_segments(q, bio);
+	if (bio_has_data(bio)) {
+		rq->nr_phys_segments = bio_phys_segments(q, bio);
+		rq->buffer = bio_data(bio);
+	}
 	rq->current_nr_sectors = bio_cur_sectors(bio);
 	rq->hard_cur_sectors = rq->current_nr_sectors;
 	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-	rq->buffer = bio_data(bio);
 	rq->data_len = bio->bi_size;
 
 	rq->bio = rq->biotail = bio;
@@ -2033,7 +2056,35 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 	rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-int kblockd_schedule_work(struct work_struct *work)
+/**
+ * blk_lld_busy - Check if underlying low-level drivers of a device are busy
+ * @q : the queue of the device being checked
+ *
+ * Description:
+ * Check if underlying low-level drivers of a device are busy.
+ * If the drivers want to export their busy state, they must set own
+ * exporting function using blk_queue_lld_busy() first.
+ *
+ * Basically, this function is used only by request stacking drivers
+ * to stop dispatching requests to underlying devices when underlying
+ * devices are busy. This behavior helps more I/O merging on the queue
+ * of the request stacking driver and prevents I/O throughput regression
+ * on burst I/O load.
+ *
+ * Return:
+ * 0 - Not busy (The request stacking driver should dispatch request)
+ * 1 - Busy (The request stacking driver should stop dispatching request)
+ */
+int blk_lld_busy(struct request_queue *q)
+{
+	if (q->lld_busy_fn)
+		return q->lld_busy_fn(q);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_lld_busy);
+
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
 }
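
As an aside (not part of the patch): the blk_lld_busy() kernel-doc above describes a dispatch-time check for stacking drivers. A hedged sketch, assuming the lower driver has registered a callback with blk_queue_lld_busy(); the function name is hypothetical:

static int my_may_dispatch(struct request_queue *lower_q)
{
	/*
	 * If the underlying device says it is busy, keep requests on the
	 * stacking queue so they can still be merged there.
	 */
	if (blk_lld_busy(lower_q))
		return 0;

	return 1;
}
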
@@ -2047,8 +2098,6 @@ EXPORT_SYMBOL(kblockd_flush_work);
 
 int __init blk_dev_init(void)
 {
-	int i;
-
 	kblockd_workqueue = create_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
@@ -2059,12 +2108,6 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("blkdev_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-	for_each_possible_cpu(i)
-		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
-	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
-	register_hotcpu_notifier(&blk_cpu_notifier);
-
 	return 0;
 }
 
2070 2113