author     Linus Torvalds <torvalds@linux-foundation.org>  2011-11-04 20:06:58 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-11-04 20:06:58 -0400
commit     b4fdcb02f1e39c27058a885905bd0277370ba441
tree       fd4cfd1994f21f44afe5e7904681fb5ac09f81b8 /block
parent     044595d4e448305fbaec472eb7d22636d24e7d8c
parent     6dd9ad7df2019b1e33a372a501907db293ebcd0d
Merge branch 'for-3.2/core' of git://git.kernel.dk/linux-block
* 'for-3.2/core' of git://git.kernel.dk/linux-block: (29 commits)
block: don't call blk_drain_queue() if elevator is not up
blk-throttle: use queue_is_locked() instead of lockdep_is_held()
blk-throttle: Take blkcg->lock while traversing blkcg->policy_list
blk-throttle: Free up policy node associated with deleted rule
block: warn if tag is greater than real_max_depth.
block: make gendisk hold a reference to its queue
blk-flush: move the queue kick into
blk-flush: fix invalid BUG_ON in blk_insert_flush
block: Remove the control of complete cpu from bio.
block: fix a typo in the blk-cgroup.h file
block: initialize the bounce pool if high memory may be added later
block: fix request_queue lifetime handling by making blk_queue_cleanup() properly shutdown
block: drop @tsk from attempt_plug_merge() and explain sync rules
block: make get_request[_wait]() fail if queue is dead
block: reorganize throtl_get_tg() and blk_throtl_bio()
block: reorganize queue draining
block: drop unnecessary blk_get/put_queue() in scsi_cmd_ioctl() and blk_get_tg()
block: pass around REQ_* flags instead of broken down booleans during request alloc/free
block: move blk_throtl prototypes to block/blk.h
block: fix genhd refcounting in blkio_policy_parse_and_set()
...
Fix up trivial conflicts due to "mddev_t" -> "struct mddev" conversion
and making the request functions be of type "void" instead of "int" in
- drivers/md/{faulty.c,linear.c,md.c,md.h,multipath.c,raid0.c,raid1.c,raid10.c,raid5.c}
- drivers/staging/zram/zram_drv.c
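The "int" to "void" conversion noted above only changes the prototype of bio-handling entry points such as the md and zram request functions; a minimal sketch of what such a hook looks like after this merge (the function name is hypothetical, not taken from this diff):

	/*
	 * Hypothetical driver hook, shown only to illustrate the prototype
	 * change: make_request-style functions no longer return an int.
	 */
	static void example_make_request(struct request_queue *q, struct bio *bio)
	{
		/* a real driver would remap or queue the bio; complete it here */
		bio_endio(bio, 0);
	}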
Diffstat (limited to 'block')
 block/blk-cgroup.c   | 111
 block/blk-cgroup.h   |   2
 block/blk-core.c     | 461
 block/blk-flush.c    |   3
 block/blk-sysfs.c    |   7
 block/blk-tag.c      |   6
 block/blk-throttle.c | 106
 block/blk.h          |  20
 block/elevator.c     |  39
 block/genhd.c        |   8
 block/scsi_ioctl.c   |   3
 11 files changed, 439 insertions(+), 327 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b596e54ddd71..8f630cec906e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -768,25 +768,14 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, | |||
768 | return disk_total; | 768 | return disk_total; |
769 | } | 769 | } |
770 | 770 | ||
771 | static int blkio_check_dev_num(dev_t dev) | ||
772 | { | ||
773 | int part = 0; | ||
774 | struct gendisk *disk; | ||
775 | |||
776 | disk = get_gendisk(dev, &part); | ||
777 | if (!disk || part) | ||
778 | return -ENODEV; | ||
779 | |||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | static int blkio_policy_parse_and_set(char *buf, | 771 | static int blkio_policy_parse_and_set(char *buf, |
784 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) | 772 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) |
785 | { | 773 | { |
774 | struct gendisk *disk = NULL; | ||
786 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | 775 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; |
787 | int ret; | ||
788 | unsigned long major, minor; | 776 | unsigned long major, minor; |
789 | int i = 0; | 777 | int i = 0, ret = -EINVAL; |
778 | int part; | ||
790 | dev_t dev; | 779 | dev_t dev; |
791 | u64 temp; | 780 | u64 temp; |
792 | 781 | ||
@@ -804,37 +793,36 @@ static int blkio_policy_parse_and_set(char *buf, | |||
804 | } | 793 | } |
805 | 794 | ||
806 | if (i != 2) | 795 | if (i != 2) |
807 | return -EINVAL; | 796 | goto out; |
808 | 797 | ||
809 | p = strsep(&s[0], ":"); | 798 | p = strsep(&s[0], ":"); |
810 | if (p != NULL) | 799 | if (p != NULL) |
811 | major_s = p; | 800 | major_s = p; |
812 | else | 801 | else |
813 | return -EINVAL; | 802 | goto out; |
814 | 803 | ||
815 | minor_s = s[0]; | 804 | minor_s = s[0]; |
816 | if (!minor_s) | 805 | if (!minor_s) |
817 | return -EINVAL; | 806 | goto out; |
818 | 807 | ||
819 | ret = strict_strtoul(major_s, 10, &major); | 808 | if (strict_strtoul(major_s, 10, &major)) |
820 | if (ret) | 809 | goto out; |
821 | return -EINVAL; | ||
822 | 810 | ||
823 | ret = strict_strtoul(minor_s, 10, &minor); | 811 | if (strict_strtoul(minor_s, 10, &minor)) |
824 | if (ret) | 812 | goto out; |
825 | return -EINVAL; | ||
826 | 813 | ||
827 | dev = MKDEV(major, minor); | 814 | dev = MKDEV(major, minor); |
828 | 815 | ||
829 | ret = strict_strtoull(s[1], 10, &temp); | 816 | if (strict_strtoull(s[1], 10, &temp)) |
830 | if (ret) | 817 | goto out; |
831 | return -EINVAL; | ||
832 | 818 | ||
833 | /* For rule removal, do not check for device presence. */ | 819 | /* For rule removal, do not check for device presence. */ |
834 | if (temp) { | 820 | if (temp) { |
835 | ret = blkio_check_dev_num(dev); | 821 | disk = get_gendisk(dev, &part); |
836 | if (ret) | 822 | if (!disk || part) { |
837 | return ret; | 823 | ret = -ENODEV; |
824 | goto out; | ||
825 | } | ||
838 | } | 826 | } |
839 | 827 | ||
840 | newpn->dev = dev; | 828 | newpn->dev = dev; |
@@ -843,7 +831,7 @@ static int blkio_policy_parse_and_set(char *buf, | |||
843 | case BLKIO_POLICY_PROP: | 831 | case BLKIO_POLICY_PROP: |
844 | if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || | 832 | if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || |
845 | temp > BLKIO_WEIGHT_MAX) | 833 | temp > BLKIO_WEIGHT_MAX) |
846 | return -EINVAL; | 834 | goto out; |
847 | 835 | ||
848 | newpn->plid = plid; | 836 | newpn->plid = plid; |
849 | newpn->fileid = fileid; | 837 | newpn->fileid = fileid; |
@@ -860,7 +848,7 @@ static int blkio_policy_parse_and_set(char *buf, | |||
860 | case BLKIO_THROTL_read_iops_device: | 848 | case BLKIO_THROTL_read_iops_device: |
861 | case BLKIO_THROTL_write_iops_device: | 849 | case BLKIO_THROTL_write_iops_device: |
862 | if (temp > THROTL_IOPS_MAX) | 850 | if (temp > THROTL_IOPS_MAX) |
863 | return -EINVAL; | 851 | goto out; |
864 | 852 | ||
865 | newpn->plid = plid; | 853 | newpn->plid = plid; |
866 | newpn->fileid = fileid; | 854 | newpn->fileid = fileid; |
@@ -871,68 +859,96 @@ static int blkio_policy_parse_and_set(char *buf, | |||
871 | default: | 859 | default: |
872 | BUG(); | 860 | BUG(); |
873 | } | 861 | } |
874 | 862 | ret = 0; | |
875 | return 0; | 863 | out: |
864 | put_disk(disk); | ||
865 | return ret; | ||
876 | } | 866 | } |
877 | 867 | ||
878 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | 868 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
879 | dev_t dev) | 869 | dev_t dev) |
880 | { | 870 | { |
881 | struct blkio_policy_node *pn; | 871 | struct blkio_policy_node *pn; |
872 | unsigned long flags; | ||
873 | unsigned int weight; | ||
874 | |||
875 | spin_lock_irqsave(&blkcg->lock, flags); | ||
882 | 876 | ||
883 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, | 877 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, |
884 | BLKIO_PROP_weight_device); | 878 | BLKIO_PROP_weight_device); |
885 | if (pn) | 879 | if (pn) |
886 | return pn->val.weight; | 880 | weight = pn->val.weight; |
887 | else | 881 | else |
888 | return blkcg->weight; | 882 | weight = blkcg->weight; |
883 | |||
884 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
885 | |||
886 | return weight; | ||
889 | } | 887 | } |
890 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | 888 | EXPORT_SYMBOL_GPL(blkcg_get_weight); |
891 | 889 | ||
892 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) | 890 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) |
893 | { | 891 | { |
894 | struct blkio_policy_node *pn; | 892 | struct blkio_policy_node *pn; |
893 | unsigned long flags; | ||
894 | uint64_t bps = -1; | ||
895 | 895 | ||
896 | spin_lock_irqsave(&blkcg->lock, flags); | ||
896 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | 897 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
897 | BLKIO_THROTL_read_bps_device); | 898 | BLKIO_THROTL_read_bps_device); |
898 | if (pn) | 899 | if (pn) |
899 | return pn->val.bps; | 900 | bps = pn->val.bps; |
900 | else | 901 | spin_unlock_irqrestore(&blkcg->lock, flags); |
901 | return -1; | 902 | |
903 | return bps; | ||
902 | } | 904 | } |
903 | 905 | ||
904 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) | 906 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) |
905 | { | 907 | { |
906 | struct blkio_policy_node *pn; | 908 | struct blkio_policy_node *pn; |
909 | unsigned long flags; | ||
910 | uint64_t bps = -1; | ||
911 | |||
912 | spin_lock_irqsave(&blkcg->lock, flags); | ||
907 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | 913 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
908 | BLKIO_THROTL_write_bps_device); | 914 | BLKIO_THROTL_write_bps_device); |
909 | if (pn) | 915 | if (pn) |
910 | return pn->val.bps; | 916 | bps = pn->val.bps; |
911 | else | 917 | spin_unlock_irqrestore(&blkcg->lock, flags); |
912 | return -1; | 918 | |
919 | return bps; | ||
913 | } | 920 | } |
914 | 921 | ||
915 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) | 922 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) |
916 | { | 923 | { |
917 | struct blkio_policy_node *pn; | 924 | struct blkio_policy_node *pn; |
925 | unsigned long flags; | ||
926 | unsigned int iops = -1; | ||
918 | 927 | ||
928 | spin_lock_irqsave(&blkcg->lock, flags); | ||
919 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | 929 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
920 | BLKIO_THROTL_read_iops_device); | 930 | BLKIO_THROTL_read_iops_device); |
921 | if (pn) | 931 | if (pn) |
922 | return pn->val.iops; | 932 | iops = pn->val.iops; |
923 | else | 933 | spin_unlock_irqrestore(&blkcg->lock, flags); |
924 | return -1; | 934 | |
935 | return iops; | ||
925 | } | 936 | } |
926 | 937 | ||
927 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) | 938 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) |
928 | { | 939 | { |
929 | struct blkio_policy_node *pn; | 940 | struct blkio_policy_node *pn; |
941 | unsigned long flags; | ||
942 | unsigned int iops = -1; | ||
943 | |||
944 | spin_lock_irqsave(&blkcg->lock, flags); | ||
930 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | 945 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
931 | BLKIO_THROTL_write_iops_device); | 946 | BLKIO_THROTL_write_iops_device); |
932 | if (pn) | 947 | if (pn) |
933 | return pn->val.iops; | 948 | iops = pn->val.iops; |
934 | else | 949 | spin_unlock_irqrestore(&blkcg->lock, flags); |
935 | return -1; | 950 | |
951 | return iops; | ||
936 | } | 952 | } |
937 | 953 | ||
938 | /* Checks whether user asked for deleting a policy rule */ | 954 | /* Checks whether user asked for deleting a policy rule */ |
@@ -1085,6 +1101,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, | |||
1085 | 1101 | ||
1086 | if (blkio_delete_rule_command(newpn)) { | 1102 | if (blkio_delete_rule_command(newpn)) { |
1087 | blkio_policy_delete_node(pn); | 1103 | blkio_policy_delete_node(pn); |
1104 | kfree(pn); | ||
1088 | spin_unlock_irq(&blkcg->lock); | 1105 | spin_unlock_irq(&blkcg->lock); |
1089 | goto update_io_group; | 1106 | goto update_io_group; |
1090 | } | 1107 | } |
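The blk-cgroup.c hunks above converge on one cleanup pattern: look up the gendisk once, route every failure through a single exit label, and drop the reference exactly once. A distilled sketch under that assumption (the function name is hypothetical; put_disk() tolerates a NULL argument, which the unconditional call at the "out" label relies on):

	static int example_check_dev(dev_t dev)
	{
		struct gendisk *disk = NULL;
		int part, ret = -ENODEV;

		disk = get_gendisk(dev, &part);
		if (!disk || part)
			goto out;	/* not an existing whole device */

		ret = 0;		/* device present, the rule may be applied */
	out:
		put_disk(disk);		/* safe even when disk is NULL */
		return ret;
	}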
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a71d2904ffb9..6f3ace7e792f 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -188,7 +188,7 @@ struct blkio_policy_node { | |||
188 | union { | 188 | union { |
189 | unsigned int weight; | 189 | unsigned int weight; |
190 | /* | 190 | /* |
191 | * Rate read/write in terms of byptes per second | 191 | * Rate read/write in terms of bytes per second |
192 | * Whether this rate represents read or write is determined | 192 | * Whether this rate represents read or write is determined |
193 | * by file type "fileid". | 193 | * by file type "fileid". |
194 | */ | 194 | */ |
diff --git a/block/blk-core.c b/block/blk-core.c
index d34433ae7917..f43c8a5840ae 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | 30 | #include <linux/list_sort.h> |
31 | #include <linux/delay.h> | ||
31 | 32 | ||
32 | #define CREATE_TRACE_POINTS | 33 | #define CREATE_TRACE_POINTS |
33 | #include <trace/events/block.h> | 34 | #include <trace/events/block.h> |
@@ -38,8 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); | |||
38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); | 40 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); |
40 | 41 | ||
41 | static int __make_request(struct request_queue *q, struct bio *bio); | ||
42 | |||
43 | /* | 42 | /* |
44 | * For the allocated request tables | 43 | * For the allocated request tables |
45 | */ | 44 | */ |
@@ -347,30 +346,80 @@ void blk_put_queue(struct request_queue *q) | |||
347 | } | 346 | } |
348 | EXPORT_SYMBOL(blk_put_queue); | 347 | EXPORT_SYMBOL(blk_put_queue); |
349 | 348 | ||
350 | /* | 349 | /** |
351 | * Note: If a driver supplied the queue lock, it is disconnected | 350 | * blk_drain_queue - drain requests from request_queue |
352 | * by this function. The actual state of the lock doesn't matter | 351 | * @q: queue to drain |
353 | * here as the request_queue isn't accessible after this point | 352 | * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV |
354 | * (QUEUE_FLAG_DEAD is set) and no other requests will be queued. | 353 | * |
354 | * Drain requests from @q. If @drain_all is set, all requests are drained. | ||
355 | * If not, only ELVPRIV requests are drained. The caller is responsible | ||
356 | * for ensuring that no new requests which need to be drained are queued. | ||
357 | */ | ||
358 | void blk_drain_queue(struct request_queue *q, bool drain_all) | ||
359 | { | ||
360 | while (true) { | ||
361 | int nr_rqs; | ||
362 | |||
363 | spin_lock_irq(q->queue_lock); | ||
364 | |||
365 | elv_drain_elevator(q); | ||
366 | if (drain_all) | ||
367 | blk_throtl_drain(q); | ||
368 | |||
369 | __blk_run_queue(q); | ||
370 | |||
371 | if (drain_all) | ||
372 | nr_rqs = q->rq.count[0] + q->rq.count[1]; | ||
373 | else | ||
374 | nr_rqs = q->rq.elvpriv; | ||
375 | |||
376 | spin_unlock_irq(q->queue_lock); | ||
377 | |||
378 | if (!nr_rqs) | ||
379 | break; | ||
380 | msleep(10); | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /** | ||
385 | * blk_cleanup_queue - shutdown a request queue | ||
386 | * @q: request queue to shutdown | ||
387 | * | ||
388 | * Mark @q DEAD, drain all pending requests, destroy and put it. All | ||
389 | * future requests will be failed immediately with -ENODEV. | ||
355 | */ | 390 | */ |
356 | void blk_cleanup_queue(struct request_queue *q) | 391 | void blk_cleanup_queue(struct request_queue *q) |
357 | { | 392 | { |
358 | /* | 393 | spinlock_t *lock = q->queue_lock; |
359 | * We know we have process context here, so we can be a little | ||
360 | * cautious and ensure that pending block actions on this device | ||
361 | * are done before moving on. Going into this function, we should | ||
362 | * not have processes doing IO to this device. | ||
363 | */ | ||
364 | blk_sync_queue(q); | ||
365 | 394 | ||
366 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | 395 | /* mark @q DEAD, no new request or merges will be allowed afterwards */ |
367 | mutex_lock(&q->sysfs_lock); | 396 | mutex_lock(&q->sysfs_lock); |
368 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | 397 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); |
369 | mutex_unlock(&q->sysfs_lock); | 398 | |
399 | spin_lock_irq(lock); | ||
400 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); | ||
401 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); | ||
402 | queue_flag_set(QUEUE_FLAG_DEAD, q); | ||
370 | 403 | ||
371 | if (q->queue_lock != &q->__queue_lock) | 404 | if (q->queue_lock != &q->__queue_lock) |
372 | q->queue_lock = &q->__queue_lock; | 405 | q->queue_lock = &q->__queue_lock; |
373 | 406 | ||
407 | spin_unlock_irq(lock); | ||
408 | mutex_unlock(&q->sysfs_lock); | ||
409 | |||
410 | /* | ||
411 | * Drain all requests queued before DEAD marking. The caller might | ||
412 | * be trying to tear down @q before its elevator is initialized, in | ||
413 | * which case we don't want to call into draining. | ||
414 | */ | ||
415 | if (q->elevator) | ||
416 | blk_drain_queue(q, true); | ||
417 | |||
418 | /* @q won't process any more request, flush async actions */ | ||
419 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | ||
420 | blk_sync_queue(q); | ||
421 | |||
422 | /* @q is and will stay empty, shutdown and put */ | ||
374 | blk_put_queue(q); | 423 | blk_put_queue(q); |
375 | } | 424 | } |
376 | EXPORT_SYMBOL(blk_cleanup_queue); | 425 | EXPORT_SYMBOL(blk_cleanup_queue); |
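From a driver's point of view, the reworked blk_cleanup_queue() above means the queue is marked dead and fully drained by the time the call returns. A hedged sketch of a typical removal path under that assumption (the structure and function names are hypothetical, not from this diff):

	struct example_dev {			/* hypothetical driver state */
		struct gendisk *disk;
		struct request_queue *queue;
	};

	static void example_remove(struct example_dev *dev)
	{
		del_gendisk(dev->disk);		/* stop new I/O submitters */
		blk_cleanup_queue(dev->queue);	/* mark DEAD, drain, put the queue */
		put_disk(dev->disk);
		kfree(dev);			/* no in-flight requests remain */
	}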
@@ -541,7 +590,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, | |||
541 | /* | 590 | /* |
542 | * This also sets hw/phys segments, boundary and size | 591 | * This also sets hw/phys segments, boundary and size |
543 | */ | 592 | */ |
544 | blk_queue_make_request(q, __make_request); | 593 | blk_queue_make_request(q, blk_queue_bio); |
545 | 594 | ||
546 | q->sg_reserved_size = INT_MAX; | 595 | q->sg_reserved_size = INT_MAX; |
547 | 596 | ||
@@ -576,7 +625,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) | |||
576 | } | 625 | } |
577 | 626 | ||
578 | static struct request * | 627 | static struct request * |
579 | blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) | 628 | blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) |
580 | { | 629 | { |
581 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | 630 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); |
582 | 631 | ||
@@ -587,12 +636,10 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) | |||
587 | 636 | ||
588 | rq->cmd_flags = flags | REQ_ALLOCED; | 637 | rq->cmd_flags = flags | REQ_ALLOCED; |
589 | 638 | ||
590 | if (priv) { | 639 | if ((flags & REQ_ELVPRIV) && |
591 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { | 640 | unlikely(elv_set_request(q, rq, gfp_mask))) { |
592 | mempool_free(rq, q->rq.rq_pool); | 641 | mempool_free(rq, q->rq.rq_pool); |
593 | return NULL; | 642 | return NULL; |
594 | } | ||
595 | rq->cmd_flags |= REQ_ELVPRIV; | ||
596 | } | 643 | } |
597 | 644 | ||
598 | return rq; | 645 | return rq; |
@@ -651,12 +698,13 @@ static void __freed_request(struct request_queue *q, int sync) | |||
651 | * A request has just been released. Account for it, update the full and | 698 | * A request has just been released. Account for it, update the full and |
652 | * congestion status, wake up any waiters. Called under q->queue_lock. | 699 | * congestion status, wake up any waiters. Called under q->queue_lock. |
653 | */ | 700 | */ |
654 | static void freed_request(struct request_queue *q, int sync, int priv) | 701 | static void freed_request(struct request_queue *q, unsigned int flags) |
655 | { | 702 | { |
656 | struct request_list *rl = &q->rq; | 703 | struct request_list *rl = &q->rq; |
704 | int sync = rw_is_sync(flags); | ||
657 | 705 | ||
658 | rl->count[sync]--; | 706 | rl->count[sync]--; |
659 | if (priv) | 707 | if (flags & REQ_ELVPRIV) |
660 | rl->elvpriv--; | 708 | rl->elvpriv--; |
661 | 709 | ||
662 | __freed_request(q, sync); | 710 | __freed_request(q, sync); |
@@ -684,10 +732,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio) | |||
684 | return true; | 732 | return true; |
685 | } | 733 | } |
686 | 734 | ||
687 | /* | 735 | /** |
688 | * Get a free request, queue_lock must be held. | 736 | * get_request - get a free request |
689 | * Returns NULL on failure, with queue_lock held. | 737 | * @q: request_queue to allocate request from |
690 | * Returns !NULL on success, with queue_lock *not held*. | 738 | * @rw_flags: RW and SYNC flags |
739 | * @bio: bio to allocate request for (can be %NULL) | ||
740 | * @gfp_mask: allocation mask | ||
741 | * | ||
742 | * Get a free request from @q. This function may fail under memory | ||
743 | * pressure or if @q is dead. | ||
744 | * | ||
745 | * Must be callled with @q->queue_lock held and, | ||
746 | * Returns %NULL on failure, with @q->queue_lock held. | ||
747 | * Returns !%NULL on success, with @q->queue_lock *not held*. | ||
691 | */ | 748 | */ |
692 | static struct request *get_request(struct request_queue *q, int rw_flags, | 749 | static struct request *get_request(struct request_queue *q, int rw_flags, |
693 | struct bio *bio, gfp_t gfp_mask) | 750 | struct bio *bio, gfp_t gfp_mask) |
@@ -696,7 +753,10 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
696 | struct request_list *rl = &q->rq; | 753 | struct request_list *rl = &q->rq; |
697 | struct io_context *ioc = NULL; | 754 | struct io_context *ioc = NULL; |
698 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 755 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
699 | int may_queue, priv = 0; | 756 | int may_queue; |
757 | |||
758 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | ||
759 | return NULL; | ||
700 | 760 | ||
701 | may_queue = elv_may_queue(q, rw_flags); | 761 | may_queue = elv_may_queue(q, rw_flags); |
702 | if (may_queue == ELV_MQUEUE_NO) | 762 | if (may_queue == ELV_MQUEUE_NO) |
@@ -740,17 +800,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
740 | rl->count[is_sync]++; | 800 | rl->count[is_sync]++; |
741 | rl->starved[is_sync] = 0; | 801 | rl->starved[is_sync] = 0; |
742 | 802 | ||
743 | if (blk_rq_should_init_elevator(bio)) { | 803 | if (blk_rq_should_init_elevator(bio) && |
744 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 804 | !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { |
745 | if (priv) | 805 | rw_flags |= REQ_ELVPRIV; |
746 | rl->elvpriv++; | 806 | rl->elvpriv++; |
747 | } | 807 | } |
748 | 808 | ||
749 | if (blk_queue_io_stat(q)) | 809 | if (blk_queue_io_stat(q)) |
750 | rw_flags |= REQ_IO_STAT; | 810 | rw_flags |= REQ_IO_STAT; |
751 | spin_unlock_irq(q->queue_lock); | 811 | spin_unlock_irq(q->queue_lock); |
752 | 812 | ||
753 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); | 813 | rq = blk_alloc_request(q, rw_flags, gfp_mask); |
754 | if (unlikely(!rq)) { | 814 | if (unlikely(!rq)) { |
755 | /* | 815 | /* |
756 | * Allocation failed presumably due to memory. Undo anything | 816 | * Allocation failed presumably due to memory. Undo anything |
@@ -760,7 +820,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
760 | * wait queue, but this is pretty rare. | 820 | * wait queue, but this is pretty rare. |
761 | */ | 821 | */ |
762 | spin_lock_irq(q->queue_lock); | 822 | spin_lock_irq(q->queue_lock); |
763 | freed_request(q, is_sync, priv); | 823 | freed_request(q, rw_flags); |
764 | 824 | ||
765 | /* | 825 | /* |
766 | * in the very unlikely event that allocation failed and no | 826 | * in the very unlikely event that allocation failed and no |
@@ -790,11 +850,18 @@ out: | |||
790 | return rq; | 850 | return rq; |
791 | } | 851 | } |
792 | 852 | ||
793 | /* | 853 | /** |
794 | * No available requests for this queue, wait for some requests to become | 854 | * get_request_wait - get a free request with retry |
795 | * available. | 855 | * @q: request_queue to allocate request from |
856 | * @rw_flags: RW and SYNC flags | ||
857 | * @bio: bio to allocate request for (can be %NULL) | ||
858 | * | ||
859 | * Get a free request from @q. This function keeps retrying under memory | ||
860 | * pressure and fails iff @q is dead. | ||
796 | * | 861 | * |
797 | * Called with q->queue_lock held, and returns with it unlocked. | 862 | * Must be callled with @q->queue_lock held and, |
863 | * Returns %NULL on failure, with @q->queue_lock held. | ||
864 | * Returns !%NULL on success, with @q->queue_lock *not held*. | ||
798 | */ | 865 | */ |
799 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, | 866 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, |
800 | struct bio *bio) | 867 | struct bio *bio) |
@@ -808,6 +875,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, | |||
808 | struct io_context *ioc; | 875 | struct io_context *ioc; |
809 | struct request_list *rl = &q->rq; | 876 | struct request_list *rl = &q->rq; |
810 | 877 | ||
878 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | ||
879 | return NULL; | ||
880 | |||
811 | prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, | 881 | prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, |
812 | TASK_UNINTERRUPTIBLE); | 882 | TASK_UNINTERRUPTIBLE); |
813 | 883 | ||
@@ -838,19 +908,15 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | |||
838 | { | 908 | { |
839 | struct request *rq; | 909 | struct request *rq; |
840 | 910 | ||
841 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | ||
842 | return NULL; | ||
843 | |||
844 | BUG_ON(rw != READ && rw != WRITE); | 911 | BUG_ON(rw != READ && rw != WRITE); |
845 | 912 | ||
846 | spin_lock_irq(q->queue_lock); | 913 | spin_lock_irq(q->queue_lock); |
847 | if (gfp_mask & __GFP_WAIT) { | 914 | if (gfp_mask & __GFP_WAIT) |
848 | rq = get_request_wait(q, rw, NULL); | 915 | rq = get_request_wait(q, rw, NULL); |
849 | } else { | 916 | else |
850 | rq = get_request(q, rw, NULL, gfp_mask); | 917 | rq = get_request(q, rw, NULL, gfp_mask); |
851 | if (!rq) | 918 | if (!rq) |
852 | spin_unlock_irq(q->queue_lock); | 919 | spin_unlock_irq(q->queue_lock); |
853 | } | ||
854 | /* q->queue_lock is unlocked at this point */ | 920 | /* q->queue_lock is unlocked at this point */ |
855 | 921 | ||
856 | return rq; | 922 | return rq; |
@@ -1052,14 +1118,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
1052 | * it didn't come out of our reserved rq pools | 1118 | * it didn't come out of our reserved rq pools |
1053 | */ | 1119 | */ |
1054 | if (req->cmd_flags & REQ_ALLOCED) { | 1120 | if (req->cmd_flags & REQ_ALLOCED) { |
1055 | int is_sync = rq_is_sync(req) != 0; | 1121 | unsigned int flags = req->cmd_flags; |
1056 | int priv = req->cmd_flags & REQ_ELVPRIV; | ||
1057 | 1122 | ||
1058 | BUG_ON(!list_empty(&req->queuelist)); | 1123 | BUG_ON(!list_empty(&req->queuelist)); |
1059 | BUG_ON(!hlist_unhashed(&req->hash)); | 1124 | BUG_ON(!hlist_unhashed(&req->hash)); |
1060 | 1125 | ||
1061 | blk_free_request(q, req); | 1126 | blk_free_request(q, req); |
1062 | freed_request(q, is_sync, priv); | 1127 | freed_request(q, flags); |
1063 | } | 1128 | } |
1064 | } | 1129 | } |
1065 | EXPORT_SYMBOL_GPL(__blk_put_request); | 1130 | EXPORT_SYMBOL_GPL(__blk_put_request); |
@@ -1161,18 +1226,32 @@ static bool bio_attempt_front_merge(struct request_queue *q, | |||
1161 | return true; | 1226 | return true; |
1162 | } | 1227 | } |
1163 | 1228 | ||
1164 | /* | 1229 | /** |
1165 | * Attempts to merge with the plugged list in the current process. Returns | 1230 | * attempt_plug_merge - try to merge with %current's plugged list |
1166 | * true if merge was successful, otherwise false. | 1231 | * @q: request_queue new bio is being queued at |
1232 | * @bio: new bio being queued | ||
1233 | * @request_count: out parameter for number of traversed plugged requests | ||
1234 | * | ||
1235 | * Determine whether @bio being queued on @q can be merged with a request | ||
1236 | * on %current's plugged list. Returns %true if merge was successful, | ||
1237 | * otherwise %false. | ||
1238 | * | ||
1239 | * This function is called without @q->queue_lock; however, elevator is | ||
1240 | * accessed iff there already are requests on the plugged list which in | ||
1241 | * turn guarantees validity of the elevator. | ||
1242 | * | ||
1243 | * Note that, on successful merge, elevator operation | ||
1244 | * elevator_bio_merged_fn() will be called without queue lock. Elevator | ||
1245 | * must be ready for this. | ||
1167 | */ | 1246 | */ |
1168 | static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, | 1247 | static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, |
1169 | struct bio *bio, unsigned int *request_count) | 1248 | unsigned int *request_count) |
1170 | { | 1249 | { |
1171 | struct blk_plug *plug; | 1250 | struct blk_plug *plug; |
1172 | struct request *rq; | 1251 | struct request *rq; |
1173 | bool ret = false; | 1252 | bool ret = false; |
1174 | 1253 | ||
1175 | plug = tsk->plug; | 1254 | plug = current->plug; |
1176 | if (!plug) | 1255 | if (!plug) |
1177 | goto out; | 1256 | goto out; |
1178 | *request_count = 0; | 1257 | *request_count = 0; |
@@ -1202,7 +1281,6 @@ out: | |||
1202 | 1281 | ||
1203 | void init_request_from_bio(struct request *req, struct bio *bio) | 1282 | void init_request_from_bio(struct request *req, struct bio *bio) |
1204 | { | 1283 | { |
1205 | req->cpu = bio->bi_comp_cpu; | ||
1206 | req->cmd_type = REQ_TYPE_FS; | 1284 | req->cmd_type = REQ_TYPE_FS; |
1207 | 1285 | ||
1208 | req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; | 1286 | req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; |
@@ -1215,7 +1293,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) | |||
1215 | blk_rq_bio_prep(req->q, req, bio); | 1293 | blk_rq_bio_prep(req->q, req, bio); |
1216 | } | 1294 | } |
1217 | 1295 | ||
1218 | static int __make_request(struct request_queue *q, struct bio *bio) | 1296 | void blk_queue_bio(struct request_queue *q, struct bio *bio) |
1219 | { | 1297 | { |
1220 | const bool sync = !!(bio->bi_rw & REQ_SYNC); | 1298 | const bool sync = !!(bio->bi_rw & REQ_SYNC); |
1221 | struct blk_plug *plug; | 1299 | struct blk_plug *plug; |
@@ -1240,8 +1318,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1240 | * Check if we can merge with the plugged list before grabbing | 1318 | * Check if we can merge with the plugged list before grabbing |
1241 | * any locks. | 1319 | * any locks. |
1242 | */ | 1320 | */ |
1243 | if (attempt_plug_merge(current, q, bio, &request_count)) | 1321 | if (attempt_plug_merge(q, bio, &request_count)) |
1244 | goto out; | 1322 | return; |
1245 | 1323 | ||
1246 | spin_lock_irq(q->queue_lock); | 1324 | spin_lock_irq(q->queue_lock); |
1247 | 1325 | ||
@@ -1275,6 +1353,10 @@ get_rq: | |||
1275 | * Returns with the queue unlocked. | 1353 | * Returns with the queue unlocked. |
1276 | */ | 1354 | */ |
1277 | req = get_request_wait(q, rw_flags, bio); | 1355 | req = get_request_wait(q, rw_flags, bio); |
1356 | if (unlikely(!req)) { | ||
1357 | bio_endio(bio, -ENODEV); /* @q is dead */ | ||
1358 | goto out_unlock; | ||
1359 | } | ||
1278 | 1360 | ||
1279 | /* | 1361 | /* |
1280 | * After dropping the lock and possibly sleeping here, our request | 1362 | * After dropping the lock and possibly sleeping here, our request |
@@ -1284,8 +1366,7 @@ get_rq: | |||
1284 | */ | 1366 | */ |
1285 | init_request_from_bio(req, bio); | 1367 | init_request_from_bio(req, bio); |
1286 | 1368 | ||
1287 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || | 1369 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) |
1288 | bio_flagged(bio, BIO_CPU_AFFINE)) | ||
1289 | req->cpu = raw_smp_processor_id(); | 1370 | req->cpu = raw_smp_processor_id(); |
1290 | 1371 | ||
1291 | plug = current->plug; | 1372 | plug = current->plug; |
@@ -1316,9 +1397,8 @@ get_rq: | |||
1316 | out_unlock: | 1397 | out_unlock: |
1317 | spin_unlock_irq(q->queue_lock); | 1398 | spin_unlock_irq(q->queue_lock); |
1318 | } | 1399 | } |
1319 | out: | ||
1320 | return 0; | ||
1321 | } | 1400 | } |
1401 | EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ | ||
1322 | 1402 | ||
1323 | /* | 1403 | /* |
1324 | * If bio->bi_dev is a partition, remap the location | 1404 | * If bio->bi_dev is a partition, remap the location |
@@ -1417,165 +1497,135 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) | |||
1417 | return 0; | 1497 | return 0; |
1418 | } | 1498 | } |
1419 | 1499 | ||
1420 | /** | 1500 | static noinline_for_stack bool |
1421 | * generic_make_request - hand a buffer to its device driver for I/O | 1501 | generic_make_request_checks(struct bio *bio) |
1422 | * @bio: The bio describing the location in memory and on the device. | ||
1423 | * | ||
1424 | * generic_make_request() is used to make I/O requests of block | ||
1425 | * devices. It is passed a &struct bio, which describes the I/O that needs | ||
1426 | * to be done. | ||
1427 | * | ||
1428 | * generic_make_request() does not return any status. The | ||
1429 | * success/failure status of the request, along with notification of | ||
1430 | * completion, is delivered asynchronously through the bio->bi_end_io | ||
1431 | * function described (one day) else where. | ||
1432 | * | ||
1433 | * The caller of generic_make_request must make sure that bi_io_vec | ||
1434 | * are set to describe the memory buffer, and that bi_dev and bi_sector are | ||
1435 | * set to describe the device address, and the | ||
1436 | * bi_end_io and optionally bi_private are set to describe how | ||
1437 | * completion notification should be signaled. | ||
1438 | * | ||
1439 | * generic_make_request and the drivers it calls may use bi_next if this | ||
1440 | * bio happens to be merged with someone else, and may change bi_dev and | ||
1441 | * bi_sector for remaps as it sees fit. So the values of these fields | ||
1442 | * should NOT be depended on after the call to generic_make_request. | ||
1443 | */ | ||
1444 | static inline void __generic_make_request(struct bio *bio) | ||
1445 | { | 1502 | { |
1446 | struct request_queue *q; | 1503 | struct request_queue *q; |
1447 | sector_t old_sector; | 1504 | int nr_sectors = bio_sectors(bio); |
1448 | int ret, nr_sectors = bio_sectors(bio); | ||
1449 | dev_t old_dev; | ||
1450 | int err = -EIO; | 1505 | int err = -EIO; |
1506 | char b[BDEVNAME_SIZE]; | ||
1507 | struct hd_struct *part; | ||
1451 | 1508 | ||
1452 | might_sleep(); | 1509 | might_sleep(); |
1453 | 1510 | ||
1454 | if (bio_check_eod(bio, nr_sectors)) | 1511 | if (bio_check_eod(bio, nr_sectors)) |
1455 | goto end_io; | 1512 | goto end_io; |
1456 | 1513 | ||
1457 | /* | 1514 | q = bdev_get_queue(bio->bi_bdev); |
1458 | * Resolve the mapping until finished. (drivers are | 1515 | if (unlikely(!q)) { |
1459 | * still free to implement/resolve their own stacking | 1516 | printk(KERN_ERR |
1460 | * by explicitly returning 0) | 1517 | "generic_make_request: Trying to access " |
1461 | * | 1518 | "nonexistent block-device %s (%Lu)\n", |
1462 | * NOTE: we don't repeat the blk_size check for each new device. | 1519 | bdevname(bio->bi_bdev, b), |
1463 | * Stacking drivers are expected to know what they are doing. | 1520 | (long long) bio->bi_sector); |
1464 | */ | 1521 | goto end_io; |
1465 | old_sector = -1; | 1522 | } |
1466 | old_dev = 0; | ||
1467 | do { | ||
1468 | char b[BDEVNAME_SIZE]; | ||
1469 | struct hd_struct *part; | ||
1470 | |||
1471 | q = bdev_get_queue(bio->bi_bdev); | ||
1472 | if (unlikely(!q)) { | ||
1473 | printk(KERN_ERR | ||
1474 | "generic_make_request: Trying to access " | ||
1475 | "nonexistent block-device %s (%Lu)\n", | ||
1476 | bdevname(bio->bi_bdev, b), | ||
1477 | (long long) bio->bi_sector); | ||
1478 | goto end_io; | ||
1479 | } | ||
1480 | |||
1481 | if (unlikely(!(bio->bi_rw & REQ_DISCARD) && | ||
1482 | nr_sectors > queue_max_hw_sectors(q))) { | ||
1483 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", | ||
1484 | bdevname(bio->bi_bdev, b), | ||
1485 | bio_sectors(bio), | ||
1486 | queue_max_hw_sectors(q)); | ||
1487 | goto end_io; | ||
1488 | } | ||
1489 | |||
1490 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | ||
1491 | goto end_io; | ||
1492 | |||
1493 | part = bio->bi_bdev->bd_part; | ||
1494 | if (should_fail_request(part, bio->bi_size) || | ||
1495 | should_fail_request(&part_to_disk(part)->part0, | ||
1496 | bio->bi_size)) | ||
1497 | goto end_io; | ||
1498 | |||
1499 | /* | ||
1500 | * If this device has partitions, remap block n | ||
1501 | * of partition p to block n+start(p) of the disk. | ||
1502 | */ | ||
1503 | blk_partition_remap(bio); | ||
1504 | 1523 | ||
1505 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) | 1524 | if (unlikely(!(bio->bi_rw & REQ_DISCARD) && |
1506 | goto end_io; | 1525 | nr_sectors > queue_max_hw_sectors(q))) { |
1526 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", | ||
1527 | bdevname(bio->bi_bdev, b), | ||
1528 | bio_sectors(bio), | ||
1529 | queue_max_hw_sectors(q)); | ||
1530 | goto end_io; | ||
1531 | } | ||
1507 | 1532 | ||
1508 | if (old_sector != -1) | 1533 | part = bio->bi_bdev->bd_part; |
1509 | trace_block_bio_remap(q, bio, old_dev, old_sector); | 1534 | if (should_fail_request(part, bio->bi_size) || |
1535 | should_fail_request(&part_to_disk(part)->part0, | ||
1536 | bio->bi_size)) | ||
1537 | goto end_io; | ||
1510 | 1538 | ||
1511 | old_sector = bio->bi_sector; | 1539 | /* |
1512 | old_dev = bio->bi_bdev->bd_dev; | 1540 | * If this device has partitions, remap block n |
1541 | * of partition p to block n+start(p) of the disk. | ||
1542 | */ | ||
1543 | blk_partition_remap(bio); | ||
1513 | 1544 | ||
1514 | if (bio_check_eod(bio, nr_sectors)) | 1545 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) |
1515 | goto end_io; | 1546 | goto end_io; |
1516 | 1547 | ||
1517 | /* | 1548 | if (bio_check_eod(bio, nr_sectors)) |
1518 | * Filter flush bio's early so that make_request based | 1549 | goto end_io; |
1519 | * drivers without flush support don't have to worry | ||
1520 | * about them. | ||
1521 | */ | ||
1522 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | ||
1523 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | ||
1524 | if (!nr_sectors) { | ||
1525 | err = 0; | ||
1526 | goto end_io; | ||
1527 | } | ||
1528 | } | ||
1529 | 1550 | ||
1530 | if ((bio->bi_rw & REQ_DISCARD) && | 1551 | /* |
1531 | (!blk_queue_discard(q) || | 1552 | * Filter flush bio's early so that make_request based |
1532 | ((bio->bi_rw & REQ_SECURE) && | 1553 | * drivers without flush support don't have to worry |
1533 | !blk_queue_secdiscard(q)))) { | 1554 | * about them. |
1534 | err = -EOPNOTSUPP; | 1555 | */ |
1556 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | ||
1557 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | ||
1558 | if (!nr_sectors) { | ||
1559 | err = 0; | ||
1535 | goto end_io; | 1560 | goto end_io; |
1536 | } | 1561 | } |
1562 | } | ||
1537 | 1563 | ||
1538 | if (blk_throtl_bio(q, &bio)) | 1564 | if ((bio->bi_rw & REQ_DISCARD) && |
1539 | goto end_io; | 1565 | (!blk_queue_discard(q) || |
1540 | 1566 | ((bio->bi_rw & REQ_SECURE) && | |
1541 | /* | 1567 | !blk_queue_secdiscard(q)))) { |
1542 | * If bio = NULL, bio has been throttled and will be submitted | 1568 | err = -EOPNOTSUPP; |
1543 | * later. | 1569 | goto end_io; |
1544 | */ | 1570 | } |
1545 | if (!bio) | ||
1546 | break; | ||
1547 | |||
1548 | trace_block_bio_queue(q, bio); | ||
1549 | 1571 | ||
1550 | ret = q->make_request_fn(q, bio); | 1572 | if (blk_throtl_bio(q, bio)) |
1551 | } while (ret); | 1573 | return false; /* throttled, will be resubmitted later */ |
1552 | 1574 | ||
1553 | return; | 1575 | trace_block_bio_queue(q, bio); |
1576 | return true; | ||
1554 | 1577 | ||
1555 | end_io: | 1578 | end_io: |
1556 | bio_endio(bio, err); | 1579 | bio_endio(bio, err); |
1580 | return false; | ||
1557 | } | 1581 | } |
1558 | 1582 | ||
1559 | /* | 1583 | /** |
1560 | * We only want one ->make_request_fn to be active at a time, | 1584 | * generic_make_request - hand a buffer to its device driver for I/O |
1561 | * else stack usage with stacked devices could be a problem. | 1585 | * @bio: The bio describing the location in memory and on the device. |
1562 | * So use current->bio_list to keep a list of requests | 1586 | * |
1563 | * submited by a make_request_fn function. | 1587 | * generic_make_request() is used to make I/O requests of block |
1564 | * current->bio_list is also used as a flag to say if | 1588 | * devices. It is passed a &struct bio, which describes the I/O that needs |
1565 | * generic_make_request is currently active in this task or not. | 1589 | * to be done. |
1566 | * If it is NULL, then no make_request is active. If it is non-NULL, | 1590 | * |
1567 | * then a make_request is active, and new requests should be added | 1591 | * generic_make_request() does not return any status. The |
1568 | * at the tail | 1592 | * success/failure status of the request, along with notification of |
1593 | * completion, is delivered asynchronously through the bio->bi_end_io | ||
1594 | * function described (one day) else where. | ||
1595 | * | ||
1596 | * The caller of generic_make_request must make sure that bi_io_vec | ||
1597 | * are set to describe the memory buffer, and that bi_dev and bi_sector are | ||
1598 | * set to describe the device address, and the | ||
1599 | * bi_end_io and optionally bi_private are set to describe how | ||
1600 | * completion notification should be signaled. | ||
1601 | * | ||
1602 | * generic_make_request and the drivers it calls may use bi_next if this | ||
1603 | * bio happens to be merged with someone else, and may resubmit the bio to | ||
1604 | * a lower device by calling into generic_make_request recursively, which | ||
1605 | * means the bio should NOT be touched after the call to ->make_request_fn. | ||
1569 | */ | 1606 | */ |
1570 | void generic_make_request(struct bio *bio) | 1607 | void generic_make_request(struct bio *bio) |
1571 | { | 1608 | { |
1572 | struct bio_list bio_list_on_stack; | 1609 | struct bio_list bio_list_on_stack; |
1573 | 1610 | ||
1611 | if (!generic_make_request_checks(bio)) | ||
1612 | return; | ||
1613 | |||
1614 | /* | ||
1615 | * We only want one ->make_request_fn to be active at a time, else | ||
1616 | * stack usage with stacked devices could be a problem. So use | ||
1617 | * current->bio_list to keep a list of requests submited by a | ||
1618 | * make_request_fn function. current->bio_list is also used as a | ||
1619 | * flag to say if generic_make_request is currently active in this | ||
1620 | * task or not. If it is NULL, then no make_request is active. If | ||
1621 | * it is non-NULL, then a make_request is active, and new requests | ||
1622 | * should be added at the tail | ||
1623 | */ | ||
1574 | if (current->bio_list) { | 1624 | if (current->bio_list) { |
1575 | /* make_request is active */ | ||
1576 | bio_list_add(current->bio_list, bio); | 1625 | bio_list_add(current->bio_list, bio); |
1577 | return; | 1626 | return; |
1578 | } | 1627 | } |
1628 | |||
1579 | /* following loop may be a bit non-obvious, and so deserves some | 1629 | /* following loop may be a bit non-obvious, and so deserves some |
1580 | * explanation. | 1630 | * explanation. |
1581 | * Before entering the loop, bio->bi_next is NULL (as all callers | 1631 | * Before entering the loop, bio->bi_next is NULL (as all callers |
@@ -1583,22 +1633,21 @@ void generic_make_request(struct bio *bio) | |||
1583 | * We pretend that we have just taken it off a longer list, so | 1633 | * We pretend that we have just taken it off a longer list, so |
1584 | * we assign bio_list to a pointer to the bio_list_on_stack, | 1634 | * we assign bio_list to a pointer to the bio_list_on_stack, |
1585 | * thus initialising the bio_list of new bios to be | 1635 | * thus initialising the bio_list of new bios to be |
1586 | * added. __generic_make_request may indeed add some more bios | 1636 | * added. ->make_request() may indeed add some more bios |
1587 | * through a recursive call to generic_make_request. If it | 1637 | * through a recursive call to generic_make_request. If it |
1588 | * did, we find a non-NULL value in bio_list and re-enter the loop | 1638 | * did, we find a non-NULL value in bio_list and re-enter the loop |
1589 | * from the top. In this case we really did just take the bio | 1639 | * from the top. In this case we really did just take the bio |
1590 | * of the top of the list (no pretending) and so remove it from | 1640 | * of the top of the list (no pretending) and so remove it from |
1591 | * bio_list, and call into __generic_make_request again. | 1641 | * bio_list, and call into ->make_request() again. |
1592 | * | ||
1593 | * The loop was structured like this to make only one call to | ||
1594 | * __generic_make_request (which is important as it is large and | ||
1595 | * inlined) and to keep the structure simple. | ||
1596 | */ | 1642 | */ |
1597 | BUG_ON(bio->bi_next); | 1643 | BUG_ON(bio->bi_next); |
1598 | bio_list_init(&bio_list_on_stack); | 1644 | bio_list_init(&bio_list_on_stack); |
1599 | current->bio_list = &bio_list_on_stack; | 1645 | current->bio_list = &bio_list_on_stack; |
1600 | do { | 1646 | do { |
1601 | __generic_make_request(bio); | 1647 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
1648 | |||
1649 | q->make_request_fn(q, bio); | ||
1650 | |||
1602 | bio = bio_list_pop(current->bio_list); | 1651 | bio = bio_list_pop(current->bio_list); |
1603 | } while (bio); | 1652 | } while (bio); |
1604 | current->bio_list = NULL; /* deactivate */ | 1653 | current->bio_list = NULL; /* deactivate */ |
@@ -1725,6 +1774,8 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | |||
1725 | where = ELEVATOR_INSERT_FLUSH; | 1774 | where = ELEVATOR_INSERT_FLUSH; |
1726 | 1775 | ||
1727 | add_acct_request(q, rq, where); | 1776 | add_acct_request(q, rq, where); |
1777 | if (where == ELEVATOR_INSERT_FLUSH) | ||
1778 | __blk_run_queue(q); | ||
1728 | spin_unlock_irqrestore(q->queue_lock, flags); | 1779 | spin_unlock_irqrestore(q->queue_lock, flags); |
1729 | 1780 | ||
1730 | return 0; | 1781 | return 0; |
@@ -2628,6 +2679,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work); | |||
2628 | 2679 | ||
2629 | #define PLUG_MAGIC 0x91827364 | 2680 | #define PLUG_MAGIC 0x91827364 |
2630 | 2681 | ||
2682 | /** | ||
2683 | * blk_start_plug - initialize blk_plug and track it inside the task_struct | ||
2684 | * @plug: The &struct blk_plug that needs to be initialized | ||
2685 | * | ||
2686 | * Description: | ||
2687 | * Tracking blk_plug inside the task_struct will help with auto-flushing the | ||
2688 | * pending I/O should the task end up blocking between blk_start_plug() and | ||
2689 | * blk_finish_plug(). This is important from a performance perspective, but | ||
2690 | * also ensures that we don't deadlock. For instance, if the task is blocking | ||
2691 | * for a memory allocation, memory reclaim could end up wanting to free a | ||
2692 | * page belonging to that request that is currently residing in our private | ||
2693 | * plug. By flushing the pending I/O when the process goes to sleep, we avoid | ||
2694 | * this kind of deadlock. | ||
2695 | */ | ||
2631 | void blk_start_plug(struct blk_plug *plug) | 2696 | void blk_start_plug(struct blk_plug *plug) |
2632 | { | 2697 | { |
2633 | struct task_struct *tsk = current; | 2698 | struct task_struct *tsk = current; |
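One caller-visible consequence of the blk-core.c changes above: the dead-queue check now lives in get_request[_wait]() itself, so blk_get_request() returns NULL once a queue has been marked dead, even for __GFP_WAIT allocations. A minimal illustration of the check a call site needs (the helper name is hypothetical):

	/* Hypothetical call site, shown only to illustrate the failure mode. */
	static int example_send_rq(struct request_queue *q)
	{
		struct request *rq;

		rq = blk_get_request(q, READ, GFP_KERNEL);
		if (!rq)
			return -ENODEV;	/* queue is dead or being torn down */

		blk_put_request(rq);	/* a real caller would set it up and issue it */
		return 0;
	}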
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 491eb30a242d..720ad607ff91 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -320,7 +320,7 @@ void blk_insert_flush(struct request *rq) | |||
320 | return; | 320 | return; |
321 | } | 321 | } |
322 | 322 | ||
323 | BUG_ON(!rq->bio || rq->bio != rq->biotail); | 323 | BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ |
324 | 324 | ||
325 | /* | 325 | /* |
326 | * If there's data but flush is not necessary, the request can be | 326 | * If there's data but flush is not necessary, the request can be |
@@ -330,7 +330,6 @@ void blk_insert_flush(struct request *rq) | |||
330 | if ((policy & REQ_FSEQ_DATA) && | 330 | if ((policy & REQ_FSEQ_DATA) && |
331 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | 331 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { |
332 | list_add_tail(&rq->queuelist, &q->queue_head); | 332 | list_add_tail(&rq->queuelist, &q->queue_head); |
333 | blk_run_queue_async(q); | ||
334 | return; | 333 | return; |
335 | } | 334 | } |
336 | 335 | ||
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 60fda88c57f0..e7f9f657f105 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -457,11 +457,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, | |||
457 | } | 457 | } |
458 | 458 | ||
459 | /** | 459 | /** |
460 | * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed | 460 | * blk_release_queue: - release a &struct request_queue when it is no longer needed |
461 | * @kobj: the kobj belonging of the request queue to be released | 461 | * @kobj: the kobj belonging to the request queue to be released |
462 | * | 462 | * |
463 | * Description: | 463 | * Description: |
464 | * blk_cleanup_queue is the pair to blk_init_queue() or | 464 | * blk_release_queue is the pair to blk_init_queue() or |
465 | * blk_queue_make_request(). It should be called when a request queue is | 465 | * blk_queue_make_request(). It should be called when a request queue is |
466 | * being released; typically when a block device is being de-registered. | 466 | * being released; typically when a block device is being de-registered. |
467 | * Currently, its primary task it to free all the &struct request | 467 | * Currently, its primary task it to free all the &struct request |
@@ -490,6 +490,7 @@ static void blk_release_queue(struct kobject *kobj) | |||
490 | if (q->queue_tags) | 490 | if (q->queue_tags) |
491 | __blk_queue_free_tags(q); | 491 | __blk_queue_free_tags(q); |
492 | 492 | ||
493 | blk_throtl_release(q); | ||
493 | blk_trace_shutdown(q); | 494 | blk_trace_shutdown(q); |
494 | 495 | ||
495 | bdi_destroy(&q->backing_dev_info); | 496 | bdi_destroy(&q->backing_dev_info); |
diff --git a/block/blk-tag.c b/block/blk-tag.c
index ece65fc4c79b..e74d6d13838f 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -286,12 +286,14 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) | |||
286 | 286 | ||
287 | BUG_ON(tag == -1); | 287 | BUG_ON(tag == -1); |
288 | 288 | ||
289 | if (unlikely(tag >= bqt->real_max_depth)) | 289 | if (unlikely(tag >= bqt->max_depth)) { |
290 | /* | 290 | /* |
291 | * This can happen after tag depth has been reduced. | 291 | * This can happen after tag depth has been reduced. |
292 | * FIXME: how about a warning or info message here? | 292 | * But tag shouldn't be larger than real_max_depth. |
293 | */ | 293 | */ |
294 | WARN_ON(tag >= bqt->real_max_depth); | ||
294 | return; | 295 | return; |
296 | } | ||
295 | 297 | ||
296 | list_del_init(&rq->queuelist); | 298 | list_del_init(&rq->queuelist); |
297 | rq->cmd_flags &= ~REQ_QUEUED; | 299 | rq->cmd_flags &= ~REQ_QUEUED; |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a19f58c6fc3a..4553245d9317 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
11 | #include <linux/blktrace_api.h> | 11 | #include <linux/blktrace_api.h> |
12 | #include "blk-cgroup.h" | 12 | #include "blk-cgroup.h" |
13 | #include "blk.h" | ||
13 | 14 | ||
14 | /* Max dispatch from a group in 1 round */ | 15 | /* Max dispatch from a group in 1 round */ |
15 | static int throtl_grp_quantum = 8; | 16 | static int throtl_grp_quantum = 8; |
@@ -302,16 +303,16 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | |||
302 | return tg; | 303 | return tg; |
303 | } | 304 | } |
304 | 305 | ||
305 | /* | ||
306 | * This function returns with queue lock unlocked in case of error, like | ||
307 | * request queue is no more | ||
308 | */ | ||
309 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | 306 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) |
310 | { | 307 | { |
311 | struct throtl_grp *tg = NULL, *__tg = NULL; | 308 | struct throtl_grp *tg = NULL, *__tg = NULL; |
312 | struct blkio_cgroup *blkcg; | 309 | struct blkio_cgroup *blkcg; |
313 | struct request_queue *q = td->queue; | 310 | struct request_queue *q = td->queue; |
314 | 311 | ||
312 | /* no throttling for dead queue */ | ||
313 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | ||
314 | return NULL; | ||
315 | |||
315 | rcu_read_lock(); | 316 | rcu_read_lock(); |
316 | blkcg = task_blkio_cgroup(current); | 317 | blkcg = task_blkio_cgroup(current); |
317 | tg = throtl_find_tg(td, blkcg); | 318 | tg = throtl_find_tg(td, blkcg); |
@@ -323,32 +324,22 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | |||
323 | /* | 324 | /* |
324 | * Need to allocate a group. Allocation of group also needs allocation | 325 | * Need to allocate a group. Allocation of group also needs allocation |
325 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | 326 | * of per cpu stats which in-turn takes a mutex() and can block. Hence |
326 | * we need to drop rcu lock and queue_lock before we call alloc | 327 | * we need to drop rcu lock and queue_lock before we call alloc. |
327 | * | ||
328 | * Take the request queue reference to make sure queue does not | ||
329 | * go away once we return from allocation. | ||
330 | */ | 328 | */ |
331 | blk_get_queue(q); | ||
332 | rcu_read_unlock(); | 329 | rcu_read_unlock(); |
333 | spin_unlock_irq(q->queue_lock); | 330 | spin_unlock_irq(q->queue_lock); |
334 | 331 | ||
335 | tg = throtl_alloc_tg(td); | 332 | tg = throtl_alloc_tg(td); |
336 | /* | ||
337 | * We might have slept in group allocation. Make sure queue is not | ||
338 | * dead | ||
339 | */ | ||
340 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
341 | blk_put_queue(q); | ||
342 | if (tg) | ||
343 | kfree(tg); | ||
344 | |||
345 | return ERR_PTR(-ENODEV); | ||
346 | } | ||
347 | blk_put_queue(q); | ||
348 | 333 | ||
349 | /* Group allocated and queue is still alive. take the lock */ | 334 | /* Group allocated and queue is still alive. take the lock */ |
350 | spin_lock_irq(q->queue_lock); | 335 | spin_lock_irq(q->queue_lock); |
351 | 336 | ||
337 | /* Make sure @q is still alive */ | ||
338 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
339 | kfree(tg); | ||
340 | return NULL; | ||
341 | } | ||
342 | |||
352 | /* | 343 | /* |
353 | * Initialize the new group. After sleeping, read the blkcg again. | 344 | * Initialize the new group. After sleeping, read the blkcg again. |
354 | */ | 345 | */ |
@@ -1014,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td) | |||
1014 | } | 1005 | } |
1015 | } | 1006 | } |
1016 | 1007 | ||
1017 | static void throtl_td_free(struct throtl_data *td) | ||
1018 | { | ||
1019 | kfree(td); | ||
1020 | } | ||
1021 | |||
1022 | /* | 1008 | /* |
1023 | * Blk cgroup controller notification saying that blkio_group object is being | 1009 | * Blk cgroup controller notification saying that blkio_group object is being |
1024 | * delinked as associated cgroup object is going away. That also means that | 1010 | * delinked as associated cgroup object is going away. That also means that |
@@ -1123,17 +1109,17 @@ static struct blkio_policy_type blkio_policy_throtl = { | |||
1123 | .plid = BLKIO_POLICY_THROTL, | 1109 | .plid = BLKIO_POLICY_THROTL, |
1124 | }; | 1110 | }; |
1125 | 1111 | ||
1126 | int blk_throtl_bio(struct request_queue *q, struct bio **biop) | 1112 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
1127 | { | 1113 | { |
1128 | struct throtl_data *td = q->td; | 1114 | struct throtl_data *td = q->td; |
1129 | struct throtl_grp *tg; | 1115 | struct throtl_grp *tg; |
1130 | struct bio *bio = *biop; | ||
1131 | bool rw = bio_data_dir(bio), update_disptime = true; | 1116 | bool rw = bio_data_dir(bio), update_disptime = true; |
1132 | struct blkio_cgroup *blkcg; | 1117 | struct blkio_cgroup *blkcg; |
1118 | bool throttled = false; | ||
1133 | 1119 | ||
1134 | if (bio->bi_rw & REQ_THROTTLED) { | 1120 | if (bio->bi_rw & REQ_THROTTLED) { |
1135 | bio->bi_rw &= ~REQ_THROTTLED; | 1121 | bio->bi_rw &= ~REQ_THROTTLED; |
1136 | return 0; | 1122 | goto out; |
1137 | } | 1123 | } |
1138 | 1124 | ||
1139 | /* | 1125 | /* |
@@ -1152,7 +1138,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
1152 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | 1138 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, |
1153 | rw, rw_is_sync(bio->bi_rw)); | 1139 | rw, rw_is_sync(bio->bi_rw)); |
1154 | rcu_read_unlock(); | 1140 | rcu_read_unlock(); |
1155 | return 0; | 1141 | goto out; |
1156 | } | 1142 | } |
1157 | } | 1143 | } |
1158 | rcu_read_unlock(); | 1144 | rcu_read_unlock(); |
@@ -1161,18 +1147,10 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
1161 | * Either group has not been allocated yet or it is not an unlimited | 1147 | * Either group has not been allocated yet or it is not an unlimited |
1162 | * IO group | 1148 | * IO group |
1163 | */ | 1149 | */ |
1164 | |||
1165 | spin_lock_irq(q->queue_lock); | 1150 | spin_lock_irq(q->queue_lock); |
1166 | tg = throtl_get_tg(td); | 1151 | tg = throtl_get_tg(td); |
1167 | 1152 | if (unlikely(!tg)) | |
1168 | if (IS_ERR(tg)) { | 1153 | goto out_unlock; |
1169 | if (PTR_ERR(tg) == -ENODEV) { | ||
1170 | /* | ||
1171 | * Queue is gone. No queue lock held here. | ||
1172 | */ | ||
1173 | return -ENODEV; | ||
1174 | } | ||
1175 | } | ||
1176 | 1154 | ||
1177 | if (tg->nr_queued[rw]) { | 1155 | if (tg->nr_queued[rw]) { |
1178 | /* | 1156 | /* |
@@ -1200,7 +1178,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
1200 | * So keep on trimming slice even if bio is not queued. | 1178 | * So keep on trimming slice even if bio is not queued. |
1201 | */ | 1179 | */ |
1202 | throtl_trim_slice(td, tg, rw); | 1180 | throtl_trim_slice(td, tg, rw); |
1203 | goto out; | 1181 | goto out_unlock; |
1204 | } | 1182 | } |
1205 | 1183 | ||
1206 | queue_bio: | 1184 | queue_bio: |
@@ -1212,16 +1190,52 @@ queue_bio: | |||
1212 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | 1190 | tg->nr_queued[READ], tg->nr_queued[WRITE]); |
1213 | 1191 | ||
1214 | throtl_add_bio_tg(q->td, tg, bio); | 1192 | throtl_add_bio_tg(q->td, tg, bio); |
1215 | *biop = NULL; | 1193 | throttled = true; |
1216 | 1194 | ||
1217 | if (update_disptime) { | 1195 | if (update_disptime) { |
1218 | tg_update_disptime(td, tg); | 1196 | tg_update_disptime(td, tg); |
1219 | throtl_schedule_next_dispatch(td); | 1197 | throtl_schedule_next_dispatch(td); |
1220 | } | 1198 | } |
1221 | 1199 | ||
1200 | out_unlock: | ||
1201 | spin_unlock_irq(q->queue_lock); | ||
1222 | out: | 1202 | out: |
1203 | return throttled; | ||
1204 | } | ||
1205 | |||
1206 | /** | ||
1207 | * blk_throtl_drain - drain throttled bios | ||
1208 | * @q: request_queue to drain throttled bios for | ||
1209 | * | ||
1210 | * Dispatch all currently throttled bios on @q through ->make_request_fn(). | ||
1211 | */ | ||
1212 | void blk_throtl_drain(struct request_queue *q) | ||
1213 | __releases(q->queue_lock) __acquires(q->queue_lock) | ||
1214 | { | ||
1215 | struct throtl_data *td = q->td; | ||
1216 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
1217 | struct throtl_grp *tg; | ||
1218 | struct bio_list bl; | ||
1219 | struct bio *bio; | ||
1220 | |||
1221 | WARN_ON_ONCE(!queue_is_locked(q)); | ||
1222 | |||
1223 | bio_list_init(&bl); | ||
1224 | |||
1225 | while ((tg = throtl_rb_first(st))) { | ||
1226 | throtl_dequeue_tg(td, tg); | ||
1227 | |||
1228 | while ((bio = bio_list_peek(&tg->bio_lists[READ]))) | ||
1229 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1230 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | ||
1231 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1232 | } | ||
1223 | spin_unlock_irq(q->queue_lock); | 1233 | spin_unlock_irq(q->queue_lock); |
1224 | return 0; | 1234 | |
1235 | while ((bio = bio_list_pop(&bl))) | ||
1236 | generic_make_request(bio); | ||
1237 | |||
1238 | spin_lock_irq(q->queue_lock); | ||
1225 | } | 1239 | } |
1226 | 1240 | ||
1227 | int blk_throtl_init(struct request_queue *q) | 1241 | int blk_throtl_init(struct request_queue *q) |
@@ -1296,7 +1310,11 @@ void blk_throtl_exit(struct request_queue *q) | |||
1296 | * it. | 1310 | * it. |
1297 | */ | 1311 | */ |
1298 | throtl_shutdown_wq(q); | 1312 | throtl_shutdown_wq(q); |
1299 | throtl_td_free(td); | 1313 | } |
1314 | |||
1315 | void blk_throtl_release(struct request_queue *q) | ||
1316 | { | ||
1317 | kfree(q->td); | ||
1300 | } | 1318 | } |
1301 | 1319 | ||
1302 | static int __init throtl_init(void) | 1320 | static int __init throtl_init(void) |
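The remaining blk-throttle.c hunks split teardown in two (and drop the throtl_td_free() wrapper, which was just a kfree()): blk_throtl_exit() now only shuts down the throttle workqueue, while the actual freeing of q->td moves to the new blk_throtl_release(). Condensed from the new column:

	void blk_throtl_exit(struct request_queue *q)
	{
		/* ... existing shutdown/flush logic, unchanged ... */
		throtl_shutdown_wq(q);
	}			/* q->td is no longer freed here */

	void blk_throtl_release(struct request_queue *q)
	{
		kfree(q->td);
	}

Deferring the kfree() matches the new request_queue lifetime model in this series ("block: fix request_queue lifetime handling ..." in the merge log): blk_throtl_exit() runs at cleanup time while the queue can still be reached, so freeing q->td there would leave a window for a late bio to dereference freed throttle data; the release hook runs only when the last queue reference is dropped.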
diff --git a/block/blk.h b/block/blk.h index 20b900a377c9..3f6551b3c92d 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -15,6 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | |||
15 | struct bio *bio); | 15 | struct bio *bio); |
16 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, | 16 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, |
17 | struct bio *bio); | 17 | struct bio *bio); |
18 | void blk_drain_queue(struct request_queue *q, bool drain_all); | ||
18 | void blk_dequeue_request(struct request *rq); | 19 | void blk_dequeue_request(struct request *rq); |
19 | void __blk_queue_free_tags(struct request_queue *q); | 20 | void __blk_queue_free_tags(struct request_queue *q); |
20 | bool __blk_end_bidi_request(struct request *rq, int error, | 21 | bool __blk_end_bidi_request(struct request *rq, int error, |
@@ -188,4 +189,21 @@ static inline int blk_do_io_stat(struct request *rq) | |||
188 | (rq->cmd_flags & REQ_DISCARD)); | 189 | (rq->cmd_flags & REQ_DISCARD)); |
189 | } | 190 | } |
190 | 191 | ||
191 | #endif | 192 | #ifdef CONFIG_BLK_DEV_THROTTLING |
193 | extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); | ||
194 | extern void blk_throtl_drain(struct request_queue *q); | ||
195 | extern int blk_throtl_init(struct request_queue *q); | ||
196 | extern void blk_throtl_exit(struct request_queue *q); | ||
197 | extern void blk_throtl_release(struct request_queue *q); | ||
198 | #else /* CONFIG_BLK_DEV_THROTTLING */ | ||
199 | static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | ||
200 | { | ||
201 | return false; | ||
202 | } | ||
203 | static inline void blk_throtl_drain(struct request_queue *q) { } | ||
204 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } | ||
205 | static inline void blk_throtl_exit(struct request_queue *q) { } | ||
206 | static inline void blk_throtl_release(struct request_queue *q) { } | ||
207 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
208 | |||
209 | #endif /* BLK_INTERNAL_H */ | ||
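blk.h now carries the blk_throtl_* prototypes (moved here from include/linux/blkdev.h, per "block: move blk_throtl prototypes to block/blk.h" in the merge log) together with inline no-op stubs for !CONFIG_BLK_DEV_THROTTLING, plus the new blk_drain_queue() declaration. The stubs let block-layer code call into throttling unconditionally; for example, a minimal hypothetical helper (not part of the patch) compiles the same in both configurations:

	static void drain_throttled(struct request_queue *q)
	{
		spin_lock_irq(q->queue_lock);
		blk_throtl_drain(q);	/* no-op when throttling is compiled out */
		spin_unlock_irq(q->queue_lock);
	}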
diff --git a/block/elevator.c b/block/elevator.c index a3b64bc71d88..66343d6917d0 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/compiler.h> | 33 | #include <linux/compiler.h> |
34 | #include <linux/delay.h> | ||
35 | #include <linux/blktrace_api.h> | 34 | #include <linux/blktrace_api.h> |
36 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
37 | #include <linux/uaccess.h> | 36 | #include <linux/uaccess.h> |
@@ -182,7 +181,7 @@ static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, | |||
182 | eq->elevator_data = data; | 181 | eq->elevator_data = data; |
183 | } | 182 | } |
184 | 183 | ||
185 | static char chosen_elevator[16]; | 184 | static char chosen_elevator[ELV_NAME_MAX]; |
186 | 185 | ||
187 | static int __init elevator_setup(char *str) | 186 | static int __init elevator_setup(char *str) |
188 | { | 187 | { |
@@ -606,43 +605,35 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) | |||
606 | void elv_drain_elevator(struct request_queue *q) | 605 | void elv_drain_elevator(struct request_queue *q) |
607 | { | 606 | { |
608 | static int printed; | 607 | static int printed; |
608 | |||
609 | lockdep_assert_held(q->queue_lock); | ||
610 | |||
609 | while (q->elevator->ops->elevator_dispatch_fn(q, 1)) | 611 | while (q->elevator->ops->elevator_dispatch_fn(q, 1)) |
610 | ; | 612 | ; |
611 | if (q->nr_sorted == 0) | 613 | if (q->nr_sorted && printed++ < 10) { |
612 | return; | ||
613 | if (printed++ < 10) { | ||
614 | printk(KERN_ERR "%s: forced dispatching is broken " | 614 | printk(KERN_ERR "%s: forced dispatching is broken " |
615 | "(nr_sorted=%u), please report this\n", | 615 | "(nr_sorted=%u), please report this\n", |
616 | q->elevator->elevator_type->elevator_name, q->nr_sorted); | 616 | q->elevator->elevator_type->elevator_name, q->nr_sorted); |
617 | } | 617 | } |
618 | } | 618 | } |
619 | 619 | ||
620 | /* | ||
621 | * Call with queue lock held, interrupts disabled | ||
622 | */ | ||
623 | void elv_quiesce_start(struct request_queue *q) | 620 | void elv_quiesce_start(struct request_queue *q) |
624 | { | 621 | { |
625 | if (!q->elevator) | 622 | if (!q->elevator) |
626 | return; | 623 | return; |
627 | 624 | ||
625 | spin_lock_irq(q->queue_lock); | ||
628 | queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); | 626 | queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); |
627 | spin_unlock_irq(q->queue_lock); | ||
629 | 628 | ||
630 | /* | 629 | blk_drain_queue(q, false); |
631 | * make sure we don't have any requests in flight | ||
632 | */ | ||
633 | elv_drain_elevator(q); | ||
634 | while (q->rq.elvpriv) { | ||
635 | __blk_run_queue(q); | ||
636 | spin_unlock_irq(q->queue_lock); | ||
637 | msleep(10); | ||
638 | spin_lock_irq(q->queue_lock); | ||
639 | elv_drain_elevator(q); | ||
640 | } | ||
641 | } | 630 | } |
642 | 631 | ||
643 | void elv_quiesce_end(struct request_queue *q) | 632 | void elv_quiesce_end(struct request_queue *q) |
644 | { | 633 | { |
634 | spin_lock_irq(q->queue_lock); | ||
645 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | 635 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); |
636 | spin_unlock_irq(q->queue_lock); | ||
646 | } | 637 | } |
647 | 638 | ||
648 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) | 639 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
@@ -972,7 +963,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
972 | /* | 963 | /* |
973 | * Turn on BYPASS and drain all requests w/ elevator private data | 964 | * Turn on BYPASS and drain all requests w/ elevator private data |
974 | */ | 965 | */ |
975 | spin_lock_irq(q->queue_lock); | ||
976 | elv_quiesce_start(q); | 966 | elv_quiesce_start(q); |
977 | 967 | ||
978 | /* | 968 | /* |
@@ -983,8 +973,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
983 | /* | 973 | /* |
984 | * attach and start new elevator | 974 | * attach and start new elevator |
985 | */ | 975 | */ |
976 | spin_lock_irq(q->queue_lock); | ||
986 | elevator_attach(q, e, data); | 977 | elevator_attach(q, e, data); |
987 | |||
988 | spin_unlock_irq(q->queue_lock); | 978 | spin_unlock_irq(q->queue_lock); |
989 | 979 | ||
990 | if (old_elevator->registered) { | 980 | if (old_elevator->registered) { |
@@ -999,9 +989,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
999 | * finally exit old elevator and turn off BYPASS. | 989 | * finally exit old elevator and turn off BYPASS. |
1000 | */ | 990 | */ |
1001 | elevator_exit(old_elevator); | 991 | elevator_exit(old_elevator); |
1002 | spin_lock_irq(q->queue_lock); | ||
1003 | elv_quiesce_end(q); | 992 | elv_quiesce_end(q); |
1004 | spin_unlock_irq(q->queue_lock); | ||
1005 | 993 | ||
1006 | blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); | 994 | blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); |
1007 | 995 | ||
@@ -1015,10 +1003,7 @@ fail_register: | |||
1015 | elevator_exit(e); | 1003 | elevator_exit(e); |
1016 | q->elevator = old_elevator; | 1004 | q->elevator = old_elevator; |
1017 | elv_register_queue(q); | 1005 | elv_register_queue(q); |
1018 | 1006 | elv_quiesce_end(q); | |
1019 | spin_lock_irq(q->queue_lock); | ||
1020 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | ||
1021 | spin_unlock_irq(q->queue_lock); | ||
1022 | 1007 | ||
1023 | return err; | 1008 | return err; |
1024 | } | 1009 | } |
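The elevator.c changes fold the locking into the quiesce helpers and delegate request draining to the new blk_drain_queue(): elv_quiesce_start()/elv_quiesce_end() now take queue_lock themselves, so elevator_switch() no longer wraps them in spin_lock_irq()/spin_unlock_irq(), and the open-coded "run queue, msleep(10), retry" loop (and the <linux/delay.h> include it needed) is gone. Reassembled from the new column:

	void elv_quiesce_start(struct request_queue *q)
	{
		if (!q->elevator)
			return;

		spin_lock_irq(q->queue_lock);
		queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
		spin_unlock_irq(q->queue_lock);

		blk_drain_queue(q, false);	/* replaces the msleep() polling loop */
	}

	void elv_quiesce_end(struct request_queue *q)
	{
		spin_lock_irq(q->queue_lock);
		queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
		spin_unlock_irq(q->queue_lock);
	}

elv_drain_elevator() also gains a lockdep_assert_held(q->queue_lock), the fail_register path of elevator_switch() now calls elv_quiesce_end() instead of clearing QUEUE_FLAG_ELVSWITCH by hand, and chosen_elevator is sized with ELV_NAME_MAX rather than a bare 16.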
diff --git a/block/genhd.c b/block/genhd.c index 94855a9717de..024fc3944fb5 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -612,6 +612,12 @@ void add_disk(struct gendisk *disk) | |||
612 | register_disk(disk); | 612 | register_disk(disk); |
613 | blk_register_queue(disk); | 613 | blk_register_queue(disk); |
614 | 614 | ||
615 | /* | ||
616 | * Take an extra ref on queue which will be put on disk_release() | ||
617 | * so that it sticks around as long as @disk is there. | ||
618 | */ | ||
619 | WARN_ON_ONCE(blk_get_queue(disk->queue)); | ||
620 | |||
615 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | 621 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
616 | "bdi"); | 622 | "bdi"); |
617 | WARN_ON(retval); | 623 | WARN_ON(retval); |
@@ -1166,6 +1172,8 @@ static void disk_release(struct device *dev) | |||
1166 | disk_replace_part_tbl(disk, NULL); | 1172 | disk_replace_part_tbl(disk, NULL); |
1167 | free_part_stats(&disk->part0); | 1173 | free_part_stats(&disk->part0); |
1168 | free_part_info(&disk->part0); | 1174 | free_part_info(&disk->part0); |
1175 | if (disk->queue) | ||
1176 | blk_put_queue(disk->queue); | ||
1169 | kfree(disk); | 1177 | kfree(disk); |
1170 | } | 1178 | } |
1171 | struct class block_class = { | 1179 | struct class block_class = { |
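In genhd.c, add_disk() now pins the request_queue for the lifetime of the gendisk and disk_release() drops that reference, so the queue cannot be freed while the disk is still reachable. Condensed from the two hunks above:

	/* add_disk(): pin the queue for as long as the disk exists */
	WARN_ON_ONCE(blk_get_queue(disk->queue));

	/* disk_release(): drop it when the last disk reference goes away */
	if (disk->queue)
		blk_put_queue(disk->queue);
	kfree(disk);

blk_get_queue() fails only when the queue is already marked dead, which should never be the case while add_disk() is registering it, hence the WARN_ON_ONCE() rather than real error handling.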
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4f4230b79bb6..fbdf0d802ec4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -565,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod | |||
565 | { | 565 | { |
566 | int err; | 566 | int err; |
567 | 567 | ||
568 | if (!q || blk_get_queue(q)) | 568 | if (!q) |
569 | return -ENXIO; | 569 | return -ENXIO; |
570 | 570 | ||
571 | switch (cmd) { | 571 | switch (cmd) { |
@@ -686,7 +686,6 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod | |||
686 | err = -ENOTTY; | 686 | err = -ENOTTY; |
687 | } | 687 | } |
688 | 688 | ||
689 | blk_put_queue(q); | ||
690 | return err; | 689 | return err; |
691 | } | 690 | } |
692 | EXPORT_SYMBOL(scsi_cmd_ioctl); | 691 | EXPORT_SYMBOL(scsi_cmd_ioctl); |
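With the gendisk now holding a queue reference (genhd.c hunks above), scsi_cmd_ioctl() no longer needs to take and drop its own reference around every ioctl; a caller that reached this function through a disk already has a live queue. The entry check therefore reduces to a plain NULL test:

	if (!q)		/* queue lifetime is pinned by the gendisk */
		return -ENXIO;
	/* ... dispatch on cmd; no blk_put_queue() on the way out ... */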