path: root/block
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |  10
-rw-r--r--  block/Makefile           |   1
-rw-r--r--  block/blk-cgroup.c       |  37
-rw-r--r--  block/blk-core.c         |  88
-rw-r--r--  block/blk-exec.c         |   7
-rw-r--r--  block/blk-flush.c        |  24
-rw-r--r--  block/blk-ioc.c          |  40
-rw-r--r--  block/blk-lib.c          |   5
-rw-r--r--  block/blk-map.c          |   7
-rw-r--r--  block/blk-softirq.c      |  19
-rw-r--r--  block/blk-sysfs.c        |  20
-rw-r--r--  block/blk-throttle.c     |  12
-rw-r--r--  block/blk-timeout.c      |   5
-rw-r--r--  block/blk.h              |   2
-rw-r--r--  block/bsg-lib.c          | 298
-rw-r--r--  block/bsg.c              |  18
-rw-r--r--  block/cfq-iosched.c      | 173
-rw-r--r--  block/compat_ioctl.c     |  14
-rw-r--r--  block/deadline-iosched.c |   4
-rw-r--r--  block/elevator.c         |   7
-rw-r--r--  block/genhd.c            |  71
21 files changed, 627 insertions, 235 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 60be1e0455d..e97934eecec 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -65,6 +65,16 @@ config BLK_DEV_BSG
65 65
66 If unsure, say Y. 66 If unsure, say Y.
67 67
68config BLK_DEV_BSGLIB
69 bool "Block layer SG support v4 helper lib"
70 default n
71 select BLK_DEV_BSG
72 help
73 Subsystems will normally enable this if needed. Users will not
74 normally need to manually enable this.
75
76 If unsure, say N.
77
68config BLK_DEV_INTEGRITY 78config BLK_DEV_INTEGRITY
69 bool "Block layer data integrity support" 79 bool "Block layer data integrity support"
70 ---help--- 80 ---help---
diff --git a/block/Makefile b/block/Makefile
index 0fec4b3fab5..514c6e4f427 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 12obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
12obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o 13obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
13obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 14obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bcaf16ee6ad..b596e54ddd7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -785,10 +785,10 @@ static int blkio_policy_parse_and_set(char *buf,
785{ 785{
786 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 786 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
787 int ret; 787 int ret;
788 unsigned long major, minor, temp; 788 unsigned long major, minor;
789 int i = 0; 789 int i = 0;
790 dev_t dev; 790 dev_t dev;
791 u64 bps, iops; 791 u64 temp;
792 792
793 memset(s, 0, sizeof(s)); 793 memset(s, 0, sizeof(s));
794 794
@@ -826,20 +826,23 @@ static int blkio_policy_parse_and_set(char *buf,
826 826
827 dev = MKDEV(major, minor); 827 dev = MKDEV(major, minor);
828 828
829 ret = blkio_check_dev_num(dev); 829 ret = strict_strtoull(s[1], 10, &temp);
830 if (ret) 830 if (ret)
831 return ret; 831 return -EINVAL;
832 832
833 newpn->dev = dev; 833 /* For rule removal, do not check for device presence. */
834 if (temp) {
835 ret = blkio_check_dev_num(dev);
836 if (ret)
837 return ret;
838 }
834 839
835 if (s[1] == NULL) 840 newpn->dev = dev;
836 return -EINVAL;
837 841
838 switch (plid) { 842 switch (plid) {
839 case BLKIO_POLICY_PROP: 843 case BLKIO_POLICY_PROP:
840 ret = strict_strtoul(s[1], 10, &temp); 844 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
841 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 845 temp > BLKIO_WEIGHT_MAX)
842 temp > BLKIO_WEIGHT_MAX)
843 return -EINVAL; 846 return -EINVAL;
844 847
845 newpn->plid = plid; 848 newpn->plid = plid;
@@ -850,26 +853,18 @@ static int blkio_policy_parse_and_set(char *buf,
850 switch(fileid) { 853 switch(fileid) {
851 case BLKIO_THROTL_read_bps_device: 854 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device: 855 case BLKIO_THROTL_write_bps_device:
853 ret = strict_strtoull(s[1], 10, &bps);
854 if (ret)
855 return -EINVAL;
856
857 newpn->plid = plid; 856 newpn->plid = plid;
858 newpn->fileid = fileid; 857 newpn->fileid = fileid;
859 newpn->val.bps = bps; 858 newpn->val.bps = temp;
860 break; 859 break;
861 case BLKIO_THROTL_read_iops_device: 860 case BLKIO_THROTL_read_iops_device:
862 case BLKIO_THROTL_write_iops_device: 861 case BLKIO_THROTL_write_iops_device:
863 ret = strict_strtoull(s[1], 10, &iops); 862 if (temp > THROTL_IOPS_MAX)
864 if (ret)
865 return -EINVAL;
866
867 if (iops > THROTL_IOPS_MAX)
868 return -EINVAL; 863 return -EINVAL;
869 864
870 newpn->plid = plid; 865 newpn->plid = plid;
871 newpn->fileid = fileid; 866 newpn->fileid = fileid;
872 newpn->val.iops = (unsigned int)iops; 867 newpn->val.iops = (unsigned int)temp;
873 break; 868 break;
874 } 869 }
875 break; 870 break;
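
The net effect of the blk-cgroup.c hunks above is that the rule value is now parsed once up front, and a value of zero (rule removal) skips the device-presence check. A minimal, illustrative sketch of that parse order in plain userspace C follows; parse_throttle_rule() and the device_exists() stub are hypothetical stand-ins for the kernel's strict_strtoull()/blkio_check_dev_num() path, not part of the patch.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int device_exists(unsigned major, unsigned minor)
{
	(void)major;
	(void)minor;
	return 1;		/* stub: pretend the device node is present */
}

/* Parse "major:minor value"; a value of 0 means "remove this rule". */
static int parse_throttle_rule(const char *buf, unsigned long long *val)
{
	unsigned major, minor;
	char valstr[32];

	if (sscanf(buf, "%u:%u %31s", &major, &minor, valstr) != 3)
		return -1;

	errno = 0;
	*val = strtoull(valstr, NULL, 10);
	if (errno)
		return -1;

	/* Rule removal: skip the device-presence check, as in the hunk above. */
	if (*val && !device_exists(major, minor))
		return -1;

	return 0;
}

int main(void)
{
	unsigned long long v = 0;
	int ret;

	/* "8:16 0" clears a rule, so no device lookup is required. */
	ret = parse_throttle_rule("8:16 0", &v);
	printf("parse \"8:16 0\" -> %d (value %llu)\n", ret, v);
	return 0;
}
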
diff --git a/block/blk-core.c b/block/blk-core.c
index d2f8f4049ab..8fc4ae28a19 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -348,9 +348,10 @@ void blk_put_queue(struct request_queue *q)
348EXPORT_SYMBOL(blk_put_queue); 348EXPORT_SYMBOL(blk_put_queue);
349 349
350/* 350/*
351 * Note: If a driver supplied the queue lock, it should not zap that lock 351 * Note: If a driver supplied the queue lock, it is disconnected
352 * unexpectedly as some queue cleanup components like elevator_exit() and 352 * by this function. The actual state of the lock doesn't matter
353 * blk_throtl_exit() need queue lock. 353 * here as the request_queue isn't accessible after this point
354 * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
354 */ 355 */
355void blk_cleanup_queue(struct request_queue *q) 356void blk_cleanup_queue(struct request_queue *q)
356{ 357{
@@ -367,10 +368,8 @@ void blk_cleanup_queue(struct request_queue *q)
367 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 368 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
368 mutex_unlock(&q->sysfs_lock); 369 mutex_unlock(&q->sysfs_lock);
369 370
370 if (q->elevator) 371 if (q->queue_lock != &q->__queue_lock)
371 elevator_exit(q->elevator); 372 q->queue_lock = &q->__queue_lock;
372
373 blk_throtl_exit(q);
374 373
375 blk_put_queue(q); 374 blk_put_queue(q);
376} 375}
@@ -419,6 +418,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
419 q->backing_dev_info.state = 0; 418 q->backing_dev_info.state = 0;
420 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 419 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
421 q->backing_dev_info.name = "block"; 420 q->backing_dev_info.name = "block";
421 q->node = node_id;
422 422
423 err = bdi_init(&q->backing_dev_info); 423 err = bdi_init(&q->backing_dev_info);
424 if (err) { 424 if (err) {
@@ -503,7 +503,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
503 if (!uninit_q) 503 if (!uninit_q)
504 return NULL; 504 return NULL;
505 505
506 q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); 506 q = blk_init_allocated_queue(uninit_q, rfn, lock);
507 if (!q) 507 if (!q)
508 blk_cleanup_queue(uninit_q); 508 blk_cleanup_queue(uninit_q);
509 509
@@ -515,18 +515,9 @@ struct request_queue *
515blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 515blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
516 spinlock_t *lock) 516 spinlock_t *lock)
517{ 517{
518 return blk_init_allocated_queue_node(q, rfn, lock, -1);
519}
520EXPORT_SYMBOL(blk_init_allocated_queue);
521
522struct request_queue *
523blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
524 spinlock_t *lock, int node_id)
525{
526 if (!q) 518 if (!q)
527 return NULL; 519 return NULL;
528 520
529 q->node = node_id;
530 if (blk_init_free_list(q)) 521 if (blk_init_free_list(q))
531 return NULL; 522 return NULL;
532 523
@@ -556,7 +547,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
556 547
557 return NULL; 548 return NULL;
558} 549}
559EXPORT_SYMBOL(blk_init_allocated_queue_node); 550EXPORT_SYMBOL(blk_init_allocated_queue);
560 551
561int blk_get_queue(struct request_queue *q) 552int blk_get_queue(struct request_queue *q)
562{ 553{
@@ -839,6 +830,9 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
839{ 830{
840 struct request *rq; 831 struct request *rq;
841 832
833 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
834 return NULL;
835
842 BUG_ON(rw != READ && rw != WRITE); 836 BUG_ON(rw != READ && rw != WRITE);
843 837
844 spin_lock_irq(q->queue_lock); 838 spin_lock_irq(q->queue_lock);
@@ -1164,7 +1158,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1164 * true if merge was successful, otherwise false. 1158 * true if merge was successful, otherwise false.
1165 */ 1159 */
1166static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, 1160static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1167 struct bio *bio) 1161 struct bio *bio, unsigned int *request_count)
1168{ 1162{
1169 struct blk_plug *plug; 1163 struct blk_plug *plug;
1170 struct request *rq; 1164 struct request *rq;
@@ -1173,10 +1167,13 @@ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1173 plug = tsk->plug; 1167 plug = tsk->plug;
1174 if (!plug) 1168 if (!plug)
1175 goto out; 1169 goto out;
1170 *request_count = 0;
1176 1171
1177 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1172 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1178 int el_ret; 1173 int el_ret;
1179 1174
1175 (*request_count)++;
1176
1180 if (rq->q != q) 1177 if (rq->q != q)
1181 continue; 1178 continue;
1182 1179
@@ -1216,6 +1213,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1216 struct blk_plug *plug; 1213 struct blk_plug *plug;
1217 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; 1214 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1218 struct request *req; 1215 struct request *req;
1216 unsigned int request_count = 0;
1219 1217
1220 /* 1218 /*
1221 * low level driver can indicate that it wants pages above a 1219 * low level driver can indicate that it wants pages above a
@@ -1234,7 +1232,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1234 * Check if we can merge with the plugged list before grabbing 1232 * Check if we can merge with the plugged list before grabbing
1235 * any locks. 1233 * any locks.
1236 */ 1234 */
1237 if (attempt_plug_merge(current, q, bio)) 1235 if (attempt_plug_merge(current, q, bio, &request_count))
1238 goto out; 1236 goto out;
1239 1237
1240 spin_lock_irq(q->queue_lock); 1238 spin_lock_irq(q->queue_lock);
@@ -1279,10 +1277,8 @@ get_rq:
1279 init_request_from_bio(req, bio); 1277 init_request_from_bio(req, bio);
1280 1278
1281 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1279 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1282 bio_flagged(bio, BIO_CPU_AFFINE)) { 1280 bio_flagged(bio, BIO_CPU_AFFINE))
1283 req->cpu = blk_cpu_to_group(get_cpu()); 1281 req->cpu = raw_smp_processor_id();
1284 put_cpu();
1285 }
1286 1282
1287 plug = current->plug; 1283 plug = current->plug;
1288 if (plug) { 1284 if (plug) {
@@ -1301,6 +1297,8 @@ get_rq:
1301 if (__rq->q != q) 1297 if (__rq->q != q)
1302 plug->should_sort = 1; 1298 plug->should_sort = 1;
1303 } 1299 }
1300 if (request_count >= BLK_MAX_REQUEST_COUNT)
1301 blk_flush_plug_list(plug, false);
1304 list_add_tail(&req->queuelist, &plug->list); 1302 list_add_tail(&req->queuelist, &plug->list);
1305 drive_stat_acct(req, 1); 1303 drive_stat_acct(req, 1);
1306 } else { 1304 } else {
@@ -1357,29 +1355,27 @@ static int __init setup_fail_make_request(char *str)
1357} 1355}
1358__setup("fail_make_request=", setup_fail_make_request); 1356__setup("fail_make_request=", setup_fail_make_request);
1359 1357
1360static int should_fail_request(struct bio *bio) 1358static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
1361{ 1359{
1362 struct hd_struct *part = bio->bi_bdev->bd_part; 1360 return part->make_it_fail && should_fail(&fail_make_request, bytes);
1363
1364 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1365 return should_fail(&fail_make_request, bio->bi_size);
1366
1367 return 0;
1368} 1361}
1369 1362
1370static int __init fail_make_request_debugfs(void) 1363static int __init fail_make_request_debugfs(void)
1371{ 1364{
1372 return init_fault_attr_dentries(&fail_make_request, 1365 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1373 "fail_make_request"); 1366 NULL, &fail_make_request);
1367
1368 return IS_ERR(dir) ? PTR_ERR(dir) : 0;
1374} 1369}
1375 1370
1376late_initcall(fail_make_request_debugfs); 1371late_initcall(fail_make_request_debugfs);
1377 1372
1378#else /* CONFIG_FAIL_MAKE_REQUEST */ 1373#else /* CONFIG_FAIL_MAKE_REQUEST */
1379 1374
1380static inline int should_fail_request(struct bio *bio) 1375static inline bool should_fail_request(struct hd_struct *part,
1376 unsigned int bytes)
1381{ 1377{
1382 return 0; 1378 return false;
1383} 1379}
1384 1380
1385#endif /* CONFIG_FAIL_MAKE_REQUEST */ 1381#endif /* CONFIG_FAIL_MAKE_REQUEST */
@@ -1462,6 +1458,7 @@ static inline void __generic_make_request(struct bio *bio)
1462 old_dev = 0; 1458 old_dev = 0;
1463 do { 1459 do {
1464 char b[BDEVNAME_SIZE]; 1460 char b[BDEVNAME_SIZE];
1461 struct hd_struct *part;
1465 1462
1466 q = bdev_get_queue(bio->bi_bdev); 1463 q = bdev_get_queue(bio->bi_bdev);
1467 if (unlikely(!q)) { 1464 if (unlikely(!q)) {
@@ -1485,7 +1482,10 @@ static inline void __generic_make_request(struct bio *bio)
1485 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 1482 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1486 goto end_io; 1483 goto end_io;
1487 1484
1488 if (should_fail_request(bio)) 1485 part = bio->bi_bdev->bd_part;
1486 if (should_fail_request(part, bio->bi_size) ||
1487 should_fail_request(&part_to_disk(part)->part0,
1488 bio->bi_size))
1489 goto end_io; 1489 goto end_io;
1490 1490
1491 /* 1491 /*
@@ -1696,15 +1696,14 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1696int blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1696int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1697{ 1697{
1698 unsigned long flags; 1698 unsigned long flags;
1699 int where = ELEVATOR_INSERT_BACK;
1699 1700
1700 if (blk_rq_check_limits(q, rq)) 1701 if (blk_rq_check_limits(q, rq))
1701 return -EIO; 1702 return -EIO;
1702 1703
1703#ifdef CONFIG_FAIL_MAKE_REQUEST 1704 if (rq->rq_disk &&
1704 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && 1705 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
1705 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1706 return -EIO; 1706 return -EIO;
1707#endif
1708 1707
1709 spin_lock_irqsave(q->queue_lock, flags); 1708 spin_lock_irqsave(q->queue_lock, flags);
1710 1709
@@ -1714,7 +1713,12 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1714 */ 1713 */
1715 BUG_ON(blk_queued_rq(rq)); 1714 BUG_ON(blk_queued_rq(rq));
1716 1715
1717 add_acct_request(q, rq, ELEVATOR_INSERT_BACK); 1716 if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
1717 where = ELEVATOR_INSERT_FLUSH;
1718
1719 add_acct_request(q, rq, where);
1720 if (where == ELEVATOR_INSERT_FLUSH)
1721 __blk_run_queue(q);
1718 spin_unlock_irqrestore(q->queue_lock, flags); 1722 spin_unlock_irqrestore(q->queue_lock, flags);
1719 1723
1720 return 0; 1724 return 0;
@@ -2271,7 +2275,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
2271 * %false - we are done with this request 2275 * %false - we are done with this request
2272 * %true - still buffers pending for this request 2276 * %true - still buffers pending for this request
2273 **/ 2277 **/
2274static bool __blk_end_bidi_request(struct request *rq, int error, 2278bool __blk_end_bidi_request(struct request *rq, int error,
2275 unsigned int nr_bytes, unsigned int bidi_bytes) 2279 unsigned int nr_bytes, unsigned int bidi_bytes)
2276{ 2280{
2277 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2281 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8a0e7ec056e..a1ebceb332f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,6 +50,13 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
50{ 50{
51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52 52
53 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
54 rq->errors = -ENXIO;
55 if (rq->end_io)
56 rq->end_io(rq, rq->errors);
57 return;
58 }
59
53 rq->rq_disk = bd_disk; 60 rq->rq_disk = bd_disk;
54 rq->end_io = done; 61 rq->end_io = done;
55 WARN_ON(irqs_disabled()); 62 WARN_ON(irqs_disabled());
diff --git a/block/blk-flush.c b/block/blk-flush.c
index bb21e4c36f7..720ad607ff9 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
95{ 95{
96 unsigned int policy = 0; 96 unsigned int policy = 0;
97 97
98 if (blk_rq_sectors(rq))
99 policy |= REQ_FSEQ_DATA;
100
98 if (fflags & REQ_FLUSH) { 101 if (fflags & REQ_FLUSH) {
99 if (rq->cmd_flags & REQ_FLUSH) 102 if (rq->cmd_flags & REQ_FLUSH)
100 policy |= REQ_FSEQ_PREFLUSH; 103 policy |= REQ_FSEQ_PREFLUSH;
101 if (blk_rq_sectors(rq))
102 policy |= REQ_FSEQ_DATA;
103 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) 104 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
104 policy |= REQ_FSEQ_POSTFLUSH; 105 policy |= REQ_FSEQ_POSTFLUSH;
105 } 106 }
@@ -122,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq)
122 123
123 /* make @rq a normal request */ 124 /* make @rq a normal request */
124 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 125 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
125 rq->end_io = NULL; 126 rq->end_io = rq->flush.saved_end_io;
126} 127}
127 128
128/** 129/**
@@ -300,9 +301,6 @@ void blk_insert_flush(struct request *rq)
300 unsigned int fflags = q->flush_flags; /* may change, cache */ 301 unsigned int fflags = q->flush_flags; /* may change, cache */
301 unsigned int policy = blk_flush_policy(fflags, rq); 302 unsigned int policy = blk_flush_policy(fflags, rq);
302 303
303 BUG_ON(rq->end_io);
304 BUG_ON(!rq->bio || rq->bio != rq->biotail);
305
306 /* 304 /*
307 * @policy now records what operations need to be done. Adjust 305 * @policy now records what operations need to be done. Adjust
308 * REQ_FLUSH and FUA for the driver. 306 * REQ_FLUSH and FUA for the driver.
@@ -312,6 +310,19 @@ void blk_insert_flush(struct request *rq)
312 rq->cmd_flags &= ~REQ_FUA; 310 rq->cmd_flags &= ~REQ_FUA;
313 311
314 /* 312 /*
313 * An empty flush handed down from a stacking driver may
314 * translate into nothing if the underlying device does not
315 * advertise a write-back cache. In this case, simply
316 * complete the request.
317 */
318 if (!policy) {
319 __blk_end_bidi_request(rq, 0, 0, 0);
320 return;
321 }
322
323 BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
324
325 /*
315 * If there's data but flush is not necessary, the request can be 326 * If there's data but flush is not necessary, the request can be
316 * processed directly without going through flush machinery. Queue 327 * processed directly without going through flush machinery. Queue
317 * for normal execution. 328 * for normal execution.
@@ -329,6 +340,7 @@ void blk_insert_flush(struct request *rq)
329 memset(&rq->flush, 0, sizeof(rq->flush)); 340 memset(&rq->flush, 0, sizeof(rq->flush));
330 INIT_LIST_HEAD(&rq->flush.list); 341 INIT_LIST_HEAD(&rq->flush.list);
331 rq->cmd_flags |= REQ_FLUSH_SEQ; 342 rq->cmd_flags |= REQ_FLUSH_SEQ;
343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
332 rq->end_io = flush_data_end_io; 344 rq->end_io = flush_data_end_io;
333 345
334 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 346 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
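
For reference, the policy computed by blk_flush_policy() above is what drives the new empty-flush short cut: a request with no data, no preflush and no postflush requirement ends up with policy == 0 and is completed immediately in blk_insert_flush(). The standalone sketch below restates that decision with mock flag values; the names are illustrative, not the kernel's REQ_*/REQ_FSEQ_* encoding.

#include <stdio.h>

#define RQ_HAS_DATA  (1u << 0)	/* blk_rq_sectors(rq) != 0 */
#define RQ_FLUSH     (1u << 1)	/* rq->cmd_flags & REQ_FLUSH */
#define RQ_FUA       (1u << 2)	/* rq->cmd_flags & REQ_FUA */

#define Q_FLUSH      (1u << 0)	/* queue advertises a write-back cache */
#define Q_FUA        (1u << 1)	/* queue supports FUA natively */

#define FSEQ_DATA      (1u << 0)
#define FSEQ_PREFLUSH  (1u << 1)
#define FSEQ_POSTFLUSH (1u << 2)

static unsigned flush_policy(unsigned qflags, unsigned rqflags)
{
	unsigned policy = 0;

	if (rqflags & RQ_HAS_DATA)
		policy |= FSEQ_DATA;
	if (qflags & Q_FLUSH) {
		if (rqflags & RQ_FLUSH)
			policy |= FSEQ_PREFLUSH;
		if (!(qflags & Q_FUA) && (rqflags & RQ_FUA))
			policy |= FSEQ_POSTFLUSH;
	}
	return policy;
}

int main(void)
{
	/* Empty flush against a queue with no write-back cache: policy == 0,
	 * so blk_insert_flush() now simply completes the request. */
	printf("no-cache queue, empty flush : policy = %u\n",
	       flush_policy(0, RQ_FLUSH));
	/* Same request against a write-back cache needs a preflush step. */
	printf("write-back queue, empty flush: policy = %u\n",
	       flush_policy(Q_FLUSH, RQ_FLUSH));
	return 0;
}
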
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 342eae9b0d3..6f9bbd97865 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -82,26 +82,26 @@ void exit_io_context(struct task_struct *task)
82 82
83struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 83struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
84{ 84{
85 struct io_context *ret; 85 struct io_context *ioc;
86 86
87 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 87 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
88 if (ret) { 88 if (ioc) {
89 atomic_long_set(&ret->refcount, 1); 89 atomic_long_set(&ioc->refcount, 1);
90 atomic_set(&ret->nr_tasks, 1); 90 atomic_set(&ioc->nr_tasks, 1);
91 spin_lock_init(&ret->lock); 91 spin_lock_init(&ioc->lock);
92 ret->ioprio_changed = 0; 92 ioc->ioprio_changed = 0;
93 ret->ioprio = 0; 93 ioc->ioprio = 0;
94 ret->last_waited = 0; /* doesn't matter... */ 94 ioc->last_waited = 0; /* doesn't matter... */
95 ret->nr_batch_requests = 0; /* because this is 0 */ 95 ioc->nr_batch_requests = 0; /* because this is 0 */
96 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 96 INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
97 INIT_HLIST_HEAD(&ret->cic_list); 97 INIT_HLIST_HEAD(&ioc->cic_list);
98 ret->ioc_data = NULL; 98 ioc->ioc_data = NULL;
99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100 ret->cgroup_changed = 0; 100 ioc->cgroup_changed = 0;
101#endif 101#endif
102 } 102 }
103 103
104 return ret; 104 return ioc;
105} 105}
106 106
107/* 107/*
@@ -139,19 +139,19 @@ struct io_context *current_io_context(gfp_t gfp_flags, int node)
139 */ 139 */
140struct io_context *get_io_context(gfp_t gfp_flags, int node) 140struct io_context *get_io_context(gfp_t gfp_flags, int node)
141{ 141{
142 struct io_context *ret = NULL; 142 struct io_context *ioc = NULL;
143 143
144 /* 144 /*
145 * Check for unlikely race with exiting task. ioc ref count is 145 * Check for unlikely race with exiting task. ioc ref count is
146 * zero when ioc is being detached. 146 * zero when ioc is being detached.
147 */ 147 */
148 do { 148 do {
149 ret = current_io_context(gfp_flags, node); 149 ioc = current_io_context(gfp_flags, node);
150 if (unlikely(!ret)) 150 if (unlikely(!ioc))
151 break; 151 break;
152 } while (!atomic_long_inc_not_zero(&ret->refcount)); 152 } while (!atomic_long_inc_not_zero(&ioc->refcount));
153 153
154 return ret; 154 return ioc;
155} 155}
156EXPORT_SYMBOL(get_io_context); 156EXPORT_SYMBOL(get_io_context);
157 157
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 78e627e2581..2b461b496a7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -59,7 +59,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
59 * granularity 59 * granularity
60 */ 60 */
61 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 61 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
62 if (q->limits.discard_granularity) { 62 if (unlikely(!max_discard_sectors)) {
63 /* Avoid infinite loop below. Being cautious never hurts. */
64 return -EOPNOTSUPP;
65 } else if (q->limits.discard_granularity) {
63 unsigned int disc_sects = q->limits.discard_granularity >> 9; 66 unsigned int disc_sects = q->limits.discard_granularity >> 9;
64 67
65 max_discard_sectors &= ~(disc_sects - 1); 68 max_discard_sectors &= ~(disc_sects - 1);
diff --git a/block/blk-map.c b/block/blk-map.c
index e663ac2d8e6..164cd005970 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -204,10 +204,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
204 if (!iov[i].iov_len) 204 if (!iov[i].iov_len)
205 return -EINVAL; 205 return -EINVAL;
206 206
207 if (uaddr & queue_dma_alignment(q)) { 207 /*
208 * Keep going so we check length of all segments
209 */
210 if (uaddr & queue_dma_alignment(q))
208 unaligned = 1; 211 unaligned = 1;
209 break;
210 }
211 } 212 }
212 213
213 if (unaligned || (q->dma_pad_mask & len) || map_data) 214 if (unaligned || (q->dma_pad_mask & len) || map_data)
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c2160222..1366a89d8e6 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,24 +103,35 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
103 103
104void __blk_complete_request(struct request *req) 104void __blk_complete_request(struct request *req)
105{ 105{
106 int ccpu, cpu, group_cpu = NR_CPUS;
106 struct request_queue *q = req->q; 107 struct request_queue *q = req->q;
107 unsigned long flags; 108 unsigned long flags;
108 int ccpu, cpu, group_cpu;
109 109
110 BUG_ON(!q->softirq_done_fn); 110 BUG_ON(!q->softirq_done_fn);
111 111
112 local_irq_save(flags); 112 local_irq_save(flags);
113 cpu = smp_processor_id(); 113 cpu = smp_processor_id();
114 group_cpu = blk_cpu_to_group(cpu);
115 114
116 /* 115 /*
117 * Select completion CPU 116 * Select completion CPU
118 */ 117 */
119 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) 118 if (req->cpu != -1) {
120 ccpu = req->cpu; 119 ccpu = req->cpu;
121 else 120 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
121 ccpu = blk_cpu_to_group(ccpu);
122 group_cpu = blk_cpu_to_group(cpu);
123 }
124 } else
122 ccpu = cpu; 125 ccpu = cpu;
123 126
127 /*
128 * If current CPU and requested CPU are in the same group, running
129 * softirq in current CPU. One might concern this is just like
130 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
131 * running in interrupt handler, and currently I/O controller doesn't
132 * support multiple interrupts, so current CPU is unique actually. This
133 * avoids IPI sending from current CPU to the first CPU of a group.
134 */
124 if (ccpu == cpu || ccpu == group_cpu) { 135 if (ccpu == cpu || ccpu == group_cpu) {
125 struct list_head *list; 136 struct list_head *list;
126do_local: 137do_local:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d935bd859c8..60fda88c57f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
244static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) 244static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
245{ 245{
246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); 246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
247 bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
247 248
248 return queue_var_show(set, page); 249 return queue_var_show(set << force, page);
249} 250}
250 251
251static ssize_t 252static ssize_t
@@ -257,10 +258,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
257 258
258 ret = queue_var_store(&val, page, count); 259 ret = queue_var_store(&val, page, count);
259 spin_lock_irq(q->queue_lock); 260 spin_lock_irq(q->queue_lock);
260 if (val) 261 if (val == 2) {
261 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 262 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
262 else 263 queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
263 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 264 } else if (val == 1) {
265 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
266 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
267 } else if (val == 0) {
268 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
269 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
270 }
264 spin_unlock_irq(q->queue_lock); 271 spin_unlock_irq(q->queue_lock);
265#endif 272#endif
266 return ret; 273 return ret;
@@ -472,6 +479,11 @@ static void blk_release_queue(struct kobject *kobj)
472 479
473 blk_sync_queue(q); 480 blk_sync_queue(q);
474 481
482 if (q->elevator)
483 elevator_exit(q->elevator);
484
485 blk_throtl_exit(q);
486
475 if (rl->rq_pool) 487 if (rl->rq_pool)
476 mempool_destroy(rl->rq_pool); 488 mempool_destroy(rl->rq_pool);
477 489
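
The blk-sysfs.c change above maps the three values userspace can write to /sys/block/<dev>/queue/rq_affinity onto the two queue flags, and the show path reports them back as set << force. A tiny standalone C sketch of that mapping follows; the fake_queue struct and its two booleans are illustrative stand-ins for QUEUE_FLAG_SAME_COMP and QUEUE_FLAG_SAME_FORCE.

#include <stdio.h>
#include <stdbool.h>

struct fake_queue {
	bool same_comp;		/* complete on the submitting CPU's group */
	bool same_force;	/* force completion on the exact submitting CPU */
};

static void rq_affinity_store(struct fake_queue *q, unsigned long val)
{
	q->same_comp  = (val == 1 || val == 2);
	q->same_force = (val == 2);
}

static unsigned long rq_affinity_show(const struct fake_queue *q)
{
	/* same "set << force" readback as queue_rq_affinity_show() */
	return (unsigned long)q->same_comp << q->same_force;
}

int main(void)
{
	struct fake_queue q = { false, false };
	unsigned long val;

	for (val = 0; val <= 2; val++) {
		rq_affinity_store(&q, val);
		printf("write %lu -> read back %lu\n", val, rq_affinity_show(&q));
	}
	return 0;
}
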
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3689f833afd..a19f58c6fc3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -142,9 +142,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
142 return NULL; 142 return NULL;
143} 143}
144 144
145static inline int total_nr_queued(struct throtl_data *td) 145static inline unsigned int total_nr_queued(struct throtl_data *td)
146{ 146{
147 return (td->nr_queued[0] + td->nr_queued[1]); 147 return td->nr_queued[0] + td->nr_queued[1];
148} 148}
149 149
150static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) 150static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
@@ -746,7 +746,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
746static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 746static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
747{ 747{
748 bool rw = bio_data_dir(bio); 748 bool rw = bio_data_dir(bio);
749 bool sync = bio->bi_rw & REQ_SYNC; 749 bool sync = rw_is_sync(bio->bi_rw);
750 750
751 /* Charge the bio to the group */ 751 /* Charge the bio to the group */
752 tg->bytes_disp[rw] += bio->bi_size; 752 tg->bytes_disp[rw] += bio->bi_size;
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)
927 927
928 bio_list_init(&bio_list_on_stack); 928 bio_list_init(&bio_list_on_stack);
929 929
930 throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", 930 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
931 total_nr_queued(td), td->nr_queued[READ], 931 total_nr_queued(td), td->nr_queued[READ],
932 td->nr_queued[WRITE]); 932 td->nr_queued[WRITE]);
933 933
@@ -970,7 +970,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
970 struct delayed_work *dwork = &td->throtl_work; 970 struct delayed_work *dwork = &td->throtl_work;
971 971
972 /* schedule work if limits changed even if no bio is queued */ 972 /* schedule work if limits changed even if no bio is queued */
973 if (total_nr_queued(td) > 0 || td->limits_changed) { 973 if (total_nr_queued(td) || td->limits_changed) {
974 /* 974 /*
975 * We might have a work scheduled to be executed in future. 975 * We might have a work scheduled to be executed in future.
976 * Cancel that and schedule a new one. 976 * Cancel that and schedule a new one.
@@ -1150,7 +1150,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1150 1150
1151 if (tg_no_rule_group(tg, rw)) { 1151 if (tg_no_rule_group(tg, rw)) {
1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, 1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1153 rw, bio->bi_rw & REQ_SYNC); 1153 rw, rw_is_sync(bio->bi_rw));
1154 rcu_read_unlock(); 1154 rcu_read_unlock();
1155 return 0; 1155 return 0;
1156 } 1156 }
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4f0c06c7a33..78035488895 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -28,7 +28,10 @@ int blk_should_fake_timeout(struct request_queue *q)
28 28
29static int __init fail_io_timeout_debugfs(void) 29static int __init fail_io_timeout_debugfs(void)
30{ 30{
31 return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); 31 struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
32 NULL, &fail_io_timeout);
33
34 return IS_ERR(dir) ? PTR_ERR(dir) : 0;
32} 35}
33 36
34late_initcall(fail_io_timeout_debugfs); 37late_initcall(fail_io_timeout_debugfs);
diff --git a/block/blk.h b/block/blk.h
index d6586287adc..20b900a377c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
17 struct bio *bio); 17 struct bio *bio);
18void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
19void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
20bool __blk_end_bidi_request(struct request *rq, int error,
21 unsigned int nr_bytes, unsigned int bidi_bytes);
20 22
21void blk_rq_timed_out_timer(unsigned long data); 23void blk_rq_timed_out_timer(unsigned long data);
22void blk_delete_timer(struct request *); 24void blk_delete_timer(struct request *);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
new file mode 100644
index 00000000000..6690e6e4103
--- /dev/null
+++ b/block/bsg-lib.c
@@ -0,0 +1,298 @@
1/*
2 * BSG helper library
3 *
4 * Copyright (C) 2008 James Smart, Emulex Corporation
5 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
6 * Copyright (C) 2011 Mike Christie
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 */
23#include <linux/slab.h>
24#include <linux/blkdev.h>
25#include <linux/delay.h>
26#include <linux/scatterlist.h>
27#include <linux/bsg-lib.h>
28#include <linux/module.h>
29#include <scsi/scsi_cmnd.h>
30
31/**
32 * bsg_destroy_job - routine to teardown/delete a bsg job
33 * @job: bsg_job that is to be torn down
34 */
35static void bsg_destroy_job(struct bsg_job *job)
36{
37 put_device(job->dev); /* release reference for the request */
38
39 kfree(job->request_payload.sg_list);
40 kfree(job->reply_payload.sg_list);
41 kfree(job);
42}
43
44/**
45 * bsg_job_done - completion routine for bsg requests
46 * @job: bsg_job that is complete
47 * @result: job reply result
48 * @reply_payload_rcv_len: length of payload recvd
49 *
50 * The LLD should call this when the bsg job has completed.
51 */
52void bsg_job_done(struct bsg_job *job, int result,
53 unsigned int reply_payload_rcv_len)
54{
55 struct request *req = job->req;
56 struct request *rsp = req->next_rq;
57 int err;
58
59 err = job->req->errors = result;
60 if (err < 0)
61 /* we're only returning the result field in the reply */
62 job->req->sense_len = sizeof(u32);
63 else
64 job->req->sense_len = job->reply_len;
65 /* we assume all request payload was transferred, residual == 0 */
66 req->resid_len = 0;
67
68 if (rsp) {
69 WARN_ON(reply_payload_rcv_len > rsp->resid_len);
70
71 /* set reply (bidi) residual */
72 rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
73 }
74 blk_complete_request(req);
75}
76EXPORT_SYMBOL_GPL(bsg_job_done);
77
78/**
79 * bsg_softirq_done - softirq done routine for destroying the bsg requests
80 * @rq: BSG request that holds the job to be destroyed
81 */
82static void bsg_softirq_done(struct request *rq)
83{
84 struct bsg_job *job = rq->special;
85
86 blk_end_request_all(rq, rq->errors);
87 bsg_destroy_job(job);
88}
89
90static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
91{
92 size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
93
94 BUG_ON(!req->nr_phys_segments);
95
96 buf->sg_list = kzalloc(sz, GFP_KERNEL);
97 if (!buf->sg_list)
98 return -ENOMEM;
99 sg_init_table(buf->sg_list, req->nr_phys_segments);
100 buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
101 buf->payload_len = blk_rq_bytes(req);
102 return 0;
103}
104
105/**
106 * bsg_create_job - create the bsg_job structure for the bsg request
107 * @dev: device that is being sent the bsg request
108 * @req: BSG request that needs a job structure
109 */
110static int bsg_create_job(struct device *dev, struct request *req)
111{
112 struct request *rsp = req->next_rq;
113 struct request_queue *q = req->q;
114 struct bsg_job *job;
115 int ret;
116
117 BUG_ON(req->special);
118
119 job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
120 if (!job)
121 return -ENOMEM;
122
123 req->special = job;
124 job->req = req;
125 if (q->bsg_job_size)
126 job->dd_data = (void *)&job[1];
127 job->request = req->cmd;
128 job->request_len = req->cmd_len;
129 job->reply = req->sense;
130 job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer
131 * allocated */
132 if (req->bio) {
133 ret = bsg_map_buffer(&job->request_payload, req);
134 if (ret)
135 goto failjob_rls_job;
136 }
137 if (rsp && rsp->bio) {
138 ret = bsg_map_buffer(&job->reply_payload, rsp);
139 if (ret)
140 goto failjob_rls_rqst_payload;
141 }
142 job->dev = dev;
143 /* take a reference for the request */
144 get_device(job->dev);
145 return 0;
146
147failjob_rls_rqst_payload:
148 kfree(job->request_payload.sg_list);
149failjob_rls_job:
150 kfree(job);
151 return -ENOMEM;
152}
153
154/*
155 * bsg_goose_queue - restart queue in case it was stopped
156 * @q: request q to be restarted
157 */
158void bsg_goose_queue(struct request_queue *q)
159{
160 if (!q)
161 return;
162
163 blk_run_queue_async(q);
164}
165EXPORT_SYMBOL_GPL(bsg_goose_queue);
166
167/**
168 * bsg_request_fn - generic handler for bsg requests
169 * @q: request queue to manage
170 *
171 * On error the create_bsg_job function should return a -Exyz error value
172 * that will be set to the req->errors.
173 *
174 * Drivers/subsys should pass this to the queue init function.
175 */
176void bsg_request_fn(struct request_queue *q)
177{
178 struct device *dev = q->queuedata;
179 struct request *req;
180 struct bsg_job *job;
181 int ret;
182
183 if (!get_device(dev))
184 return;
185
186 while (1) {
187 req = blk_fetch_request(q);
188 if (!req)
189 break;
190 spin_unlock_irq(q->queue_lock);
191
192 ret = bsg_create_job(dev, req);
193 if (ret) {
194 req->errors = ret;
195 blk_end_request_all(req, ret);
196 spin_lock_irq(q->queue_lock);
197 continue;
198 }
199
200 job = req->special;
201 ret = q->bsg_job_fn(job);
202 spin_lock_irq(q->queue_lock);
203 if (ret)
204 break;
205 }
206
207 spin_unlock_irq(q->queue_lock);
208 put_device(dev);
209 spin_lock_irq(q->queue_lock);
210}
211EXPORT_SYMBOL_GPL(bsg_request_fn);
212
213/**
214 * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
215 * @dev: device to attach bsg device to
216 * @q: request queue setup by caller
217 * @name: device to give bsg device
218 * @job_fn: bsg job handler
219 * @dd_job_size: size of LLD data needed for each job
220 *
 221 * The caller should have set up the request queue with bsg_request_fn
222 * as the request_fn.
223 */
224int bsg_setup_queue(struct device *dev, struct request_queue *q,
225 char *name, bsg_job_fn *job_fn, int dd_job_size)
226{
227 int ret;
228
229 q->queuedata = dev;
230 q->bsg_job_size = dd_job_size;
231 q->bsg_job_fn = job_fn;
232 queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
233 blk_queue_softirq_done(q, bsg_softirq_done);
234 blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
235
236 ret = bsg_register_queue(q, dev, name, NULL);
237 if (ret) {
238 printk(KERN_ERR "%s: bsg interface failed to "
239 "initialize - register queue\n", dev->kobj.name);
240 return ret;
241 }
242
243 return 0;
244}
245EXPORT_SYMBOL_GPL(bsg_setup_queue);
246
247/**
248 * bsg_remove_queue - Deletes the bsg dev from the q
249 * @q: the request_queue that is to be torn down.
250 *
251 * Notes:
 252 * Before unregistering the queue, empty any requests that are blocked
253 */
254void bsg_remove_queue(struct request_queue *q)
255{
256 struct request *req; /* block request */
257 int counts; /* totals for request_list count and starved */
258
259 if (!q)
260 return;
261
262 /* Stop taking in new requests */
263 spin_lock_irq(q->queue_lock);
264 blk_stop_queue(q);
265
266 /* drain all requests in the queue */
267 while (1) {
268 /* need the lock to fetch a request
 268 * this may fetch the same request as the previous pass
270 */
271 req = blk_fetch_request(q);
272 /* save requests in use and starved */
273 counts = q->rq.count[0] + q->rq.count[1] +
274 q->rq.starved[0] + q->rq.starved[1];
275 spin_unlock_irq(q->queue_lock);
276 /* any requests still outstanding? */
277 if (counts == 0)
278 break;
279
 280 /* This may be the same req as the previous iteration;
 281 * always call blk_end_request_all() after a fetch. The
 282 * request must be ended here because blk_fetch_request()
 283 * above already started it.
284 */
285 if (req) {
286 /* return -ENXIO to indicate that this queue is
287 * going away
288 */
289 req->errors = -ENXIO;
290 blk_end_request_all(req, -ENXIO);
291 }
292
293 msleep(200); /* allow bsg to possibly finish */
294 spin_lock_irq(q->queue_lock);
295 }
296 bsg_unregister_queue(q);
297}
298EXPORT_SYMBOL_GPL(bsg_remove_queue);
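
Taken together, the new helpers are meant to be wired up by an LLD at queue-init time: allocate a queue with bsg_request_fn as the request_fn, register it with bsg_setup_queue(), and complete each job from the driver's handler with bsg_job_done(). Below is a hedged sketch of that wiring for a hypothetical transport driver; my_bsg_dispatch(), my_setup_bsg(), my_bsg_lock and MY_DD_SIZE are made-up names for illustration, not part of the patch.

#include <linux/blkdev.h>
#include <linux/bsg-lib.h>
#include <linux/spinlock.h>

#define MY_DD_SIZE 64			/* per-job LLD private data size */

static DEFINE_SPINLOCK(my_bsg_lock);

static int my_bsg_dispatch(struct bsg_job *job)
{
	/* Hand job->request / job->request_payload to the hardware here;
	 * once the hardware completes, report the outcome: */
	bsg_job_done(job, 0 /* result */, job->reply_payload.payload_len);
	return 0;			/* non-zero stops bsg_request_fn() */
}

static int my_setup_bsg(struct device *dev, char *name)
{
	struct request_queue *q;
	int ret;

	/* bsg_request_fn() is the request_fn; bsg_setup_queue() stores
	 * @dev, the job handler and the per-job size in the queue. */
	q = blk_init_queue(bsg_request_fn, &my_bsg_lock);
	if (!q)
		return -ENOMEM;

	ret = bsg_setup_queue(dev, q, name, my_bsg_dispatch, MY_DD_SIZE);
	if (ret)
		blk_cleanup_queue(q);
	return ret;
}
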
diff --git a/block/bsg.c b/block/bsg.c
index 0c8b64a1648..702f1316bb8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -182,7 +182,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
182 return -ENOMEM; 182 return -ENOMEM;
183 } 183 }
184 184
185 if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request, 185 if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request,
186 hdr->request_len)) 186 hdr->request_len))
187 return -EFAULT; 187 return -EFAULT;
188 188
@@ -249,7 +249,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
249 struct request *rq, *next_rq = NULL; 249 struct request *rq, *next_rq = NULL;
250 int ret, rw; 250 int ret, rw;
251 unsigned int dxfer_len; 251 unsigned int dxfer_len;
252 void *dxferp = NULL; 252 void __user *dxferp = NULL;
253 struct bsg_class_device *bcd = &q->bsg_dev; 253 struct bsg_class_device *bcd = &q->bsg_dev;
254 254
255 /* if the LLD has been removed then the bsg_unregister_queue will 255 /* if the LLD has been removed then the bsg_unregister_queue will
@@ -291,7 +291,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
291 rq->next_rq = next_rq; 291 rq->next_rq = next_rq;
292 next_rq->cmd_type = rq->cmd_type; 292 next_rq->cmd_type = rq->cmd_type;
293 293
294 dxferp = (void*)(unsigned long)hdr->din_xferp; 294 dxferp = (void __user *)(unsigned long)hdr->din_xferp;
295 ret = blk_rq_map_user(q, next_rq, NULL, dxferp, 295 ret = blk_rq_map_user(q, next_rq, NULL, dxferp,
296 hdr->din_xfer_len, GFP_KERNEL); 296 hdr->din_xfer_len, GFP_KERNEL);
297 if (ret) 297 if (ret)
@@ -300,10 +300,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
300 300
301 if (hdr->dout_xfer_len) { 301 if (hdr->dout_xfer_len) {
302 dxfer_len = hdr->dout_xfer_len; 302 dxfer_len = hdr->dout_xfer_len;
303 dxferp = (void*)(unsigned long)hdr->dout_xferp; 303 dxferp = (void __user *)(unsigned long)hdr->dout_xferp;
304 } else if (hdr->din_xfer_len) { 304 } else if (hdr->din_xfer_len) {
305 dxfer_len = hdr->din_xfer_len; 305 dxfer_len = hdr->din_xfer_len;
306 dxferp = (void*)(unsigned long)hdr->din_xferp; 306 dxferp = (void __user *)(unsigned long)hdr->din_xferp;
307 } else 307 } else
308 dxfer_len = 0; 308 dxfer_len = 0;
309 309
@@ -445,7 +445,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
445 int len = min_t(unsigned int, hdr->max_response_len, 445 int len = min_t(unsigned int, hdr->max_response_len,
446 rq->sense_len); 446 rq->sense_len);
447 447
448 ret = copy_to_user((void*)(unsigned long)hdr->response, 448 ret = copy_to_user((void __user *)(unsigned long)hdr->response,
449 rq->sense, len); 449 rq->sense, len);
450 if (!ret) 450 if (!ret)
451 hdr->response_len = len; 451 hdr->response_len = len;
@@ -606,7 +606,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
606 ret = __bsg_read(buf, count, bd, NULL, &bytes_read); 606 ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
607 *ppos = bytes_read; 607 *ppos = bytes_read;
608 608
609 if (!bytes_read || (bytes_read && err_block_err(ret))) 609 if (!bytes_read || err_block_err(ret))
610 bytes_read = ret; 610 bytes_read = ret;
611 611
612 return bytes_read; 612 return bytes_read;
@@ -686,7 +686,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
686 /* 686 /*
687 * return bytes written on non-fatal errors 687 * return bytes written on non-fatal errors
688 */ 688 */
689 if (!bytes_written || (bytes_written && err_block_err(ret))) 689 if (!bytes_written || err_block_err(ret))
690 bytes_written = ret; 690 bytes_written = ret;
691 691
692 dprintk("%s: returning %Zd\n", bd->name, bytes_written); 692 dprintk("%s: returning %Zd\n", bd->name, bytes_written);
@@ -878,7 +878,7 @@ static unsigned int bsg_poll(struct file *file, poll_table *wait)
878 spin_lock_irq(&bd->lock); 878 spin_lock_irq(&bd->lock);
879 if (!list_empty(&bd->done_list)) 879 if (!list_empty(&bd->done_list))
880 mask |= POLLIN | POLLRDNORM; 880 mask |= POLLIN | POLLRDNORM;
881 if (bd->queued_cmds >= bd->max_queue) 881 if (bd->queued_cmds < bd->max_queue)
882 mask |= POLLOUT; 882 mask |= POLLOUT;
883 spin_unlock_irq(&bd->lock); 883 spin_unlock_irq(&bd->lock);
884 884
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ae21919f15e..4c12869fcf7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -87,9 +87,10 @@ struct cfq_rb_root {
87 unsigned count; 87 unsigned count;
88 unsigned total_weight; 88 unsigned total_weight;
89 u64 min_vdisktime; 89 u64 min_vdisktime;
90 struct cfq_ttime ttime;
90}; 91};
91#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 92#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
92 .count = 0, .min_vdisktime = 0, } 93 .ttime = {.last_end_request = jiffies,},}
93 94
94/* 95/*
95 * Per process-grouping structure 96 * Per process-grouping structure
@@ -129,14 +130,14 @@ struct cfq_queue {
129 unsigned long slice_end; 130 unsigned long slice_end;
130 long slice_resid; 131 long slice_resid;
131 132
132 /* pending metadata requests */ 133 /* pending priority requests */
133 int meta_pending; 134 int prio_pending;
134 /* number of requests that are on the dispatch list or inside driver */ 135 /* number of requests that are on the dispatch list or inside driver */
135 int dispatched; 136 int dispatched;
136 137
137 /* io prio of this group */ 138 /* io prio of this group */
138 unsigned short ioprio, org_ioprio; 139 unsigned short ioprio, org_ioprio;
139 unsigned short ioprio_class, org_ioprio_class; 140 unsigned short ioprio_class;
140 141
141 pid_t pid; 142 pid_t pid;
142 143
@@ -212,6 +213,7 @@ struct cfq_group {
212#endif 213#endif
213 /* number of requests that are on the dispatch list or inside driver */ 214 /* number of requests that are on the dispatch list or inside driver */
214 int dispatched; 215 int dispatched;
216 struct cfq_ttime ttime;
215}; 217};
216 218
217/* 219/*
@@ -393,6 +395,18 @@ CFQ_CFQQ_FNS(wait_busy);
393 j++, st = i < IDLE_WORKLOAD ? \ 395 j++, st = i < IDLE_WORKLOAD ? \
394 &cfqg->service_trees[i][j]: NULL) \ 396 &cfqg->service_trees[i][j]: NULL) \
395 397
398static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
399 struct cfq_ttime *ttime, bool group_idle)
400{
401 unsigned long slice;
402 if (!sample_valid(ttime->ttime_samples))
403 return false;
404 if (group_idle)
405 slice = cfqd->cfq_group_idle;
406 else
407 slice = cfqd->cfq_slice_idle;
408 return ttime->ttime_mean > slice;
409}
396 410
397static inline bool iops_mode(struct cfq_data *cfqd) 411static inline bool iops_mode(struct cfq_data *cfqd)
398{ 412{
@@ -670,8 +684,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
670 if (rq_is_sync(rq1) != rq_is_sync(rq2)) 684 if (rq_is_sync(rq1) != rq_is_sync(rq2))
671 return rq_is_sync(rq1) ? rq1 : rq2; 685 return rq_is_sync(rq1) ? rq1 : rq2;
672 686
673 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) 687 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
674 return rq1->cmd_flags & REQ_META ? rq1 : rq2; 688 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
675 689
676 s1 = blk_rq_pos(rq1); 690 s1 = blk_rq_pos(rq1);
677 s2 = blk_rq_pos(rq2); 691 s2 = blk_rq_pos(rq2);
@@ -1005,8 +1019,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1005 return NULL; 1019 return NULL;
1006} 1020}
1007 1021
1008void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1022static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1009 unsigned int weight) 1023 unsigned int weight)
1010{ 1024{
1011 struct cfq_group *cfqg = cfqg_of_blkg(blkg); 1025 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1012 cfqg->new_weight = weight; 1026 cfqg->new_weight = weight;
@@ -1059,6 +1073,8 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1059 *st = CFQ_RB_ROOT; 1073 *st = CFQ_RB_ROOT;
1060 RB_CLEAR_NODE(&cfqg->rb_node); 1074 RB_CLEAR_NODE(&cfqg->rb_node);
1061 1075
1076 cfqg->ttime.last_end_request = jiffies;
1077
1062 /* 1078 /*
1063 * Take the initial reference that will be released on destroy 1079 * Take the initial reference that will be released on destroy
1064 * This can be thought of a joint reference by cgroup and 1080 * This can be thought of a joint reference by cgroup and
@@ -1198,6 +1214,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1198 1214
1199 hlist_del_init(&cfqg->cfqd_node); 1215 hlist_del_init(&cfqg->cfqd_node);
1200 1216
1217 BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
1218 cfqd->nr_blkcg_linked_grps--;
1219
1201 /* 1220 /*
1202 * Put the reference taken at the time of creation so that when all 1221 * Put the reference taken at the time of creation so that when all
1203 * queues are gone, group can be destroyed. 1222 * queues are gone, group can be destroyed.
@@ -1235,7 +1254,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1235 * it should not be NULL as even if elevator was exiting, cgroup deltion 1254 * it should not be NULL as even if elevator was exiting, cgroup deltion
1236 * path got to it first. 1255 * path got to it first.
1237 */ 1256 */
1238void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1257static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1239{ 1258{
1240 unsigned long flags; 1259 unsigned long flags;
1241 struct cfq_data *cfqd = key; 1260 struct cfq_data *cfqd = key;
@@ -1502,16 +1521,11 @@ static void cfq_add_rq_rb(struct request *rq)
1502{ 1521{
1503 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1522 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1504 struct cfq_data *cfqd = cfqq->cfqd; 1523 struct cfq_data *cfqd = cfqq->cfqd;
1505 struct request *__alias, *prev; 1524 struct request *prev;
1506 1525
1507 cfqq->queued[rq_is_sync(rq)]++; 1526 cfqq->queued[rq_is_sync(rq)]++;
1508 1527
1509 /* 1528 elv_rb_add(&cfqq->sort_list, rq);
1510 * looks a little odd, but the first insert might return an alias.
1511 * if that happens, put the alias on the dispatch list
1512 */
1513 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1514 cfq_dispatch_insert(cfqd->queue, __alias);
1515 1529
1516 if (!cfq_cfqq_on_rr(cfqq)) 1530 if (!cfq_cfqq_on_rr(cfqq))
1517 cfq_add_cfqq_rr(cfqd, cfqq); 1531 cfq_add_cfqq_rr(cfqd, cfqq);
@@ -1598,9 +1612,9 @@ static void cfq_remove_request(struct request *rq)
1598 cfqq->cfqd->rq_queued--; 1612 cfqq->cfqd->rq_queued--;
1599 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1613 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1600 rq_data_dir(rq), rq_is_sync(rq)); 1614 rq_data_dir(rq), rq_is_sync(rq));
1601 if (rq->cmd_flags & REQ_META) { 1615 if (rq->cmd_flags & REQ_PRIO) {
1602 WARN_ON(!cfqq->meta_pending); 1616 WARN_ON(!cfqq->prio_pending);
1603 cfqq->meta_pending--; 1617 cfqq->prio_pending--;
1604 } 1618 }
1605} 1619}
1606 1620
@@ -1969,7 +1983,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1969 * Otherwise, we do only if they are the last ones 1983 * Otherwise, we do only if they are the last ones
1970 * in their service tree. 1984 * in their service tree.
1971 */ 1985 */
1972 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1986 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
1987 !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
1973 return true; 1988 return true;
1974 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1989 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1975 service_tree->count); 1990 service_tree->count);
@@ -2022,10 +2037,10 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2022 * slice, then don't idle. This avoids overrunning the allotted 2037 * slice, then don't idle. This avoids overrunning the allotted
2023 * time slice. 2038 * time slice.
2024 */ 2039 */
2025 if (sample_valid(cic->ttime_samples) && 2040 if (sample_valid(cic->ttime.ttime_samples) &&
2026 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 2041 (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2027 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", 2042 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2028 cic->ttime_mean); 2043 cic->ttime.ttime_mean);
2029 return; 2044 return;
2030 } 2045 }
2031 2046
@@ -2381,8 +2396,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2381 * this group, wait for requests to complete. 2396 * this group, wait for requests to complete.
2382 */ 2397 */
2383check_group_idle: 2398check_group_idle:
2384 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 2399 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2385 && cfqq->cfqg->dispatched) { 2400 cfqq->cfqg->dispatched &&
2401 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2386 cfqq = NULL; 2402 cfqq = NULL;
2387 goto keep_queue; 2403 goto keep_queue;
2388 } 2404 }
@@ -2833,7 +2849,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2833 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2849 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2834 cfqd->queue->node); 2850 cfqd->queue->node);
2835 if (cic) { 2851 if (cic) {
2836 cic->last_end_request = jiffies; 2852 cic->ttime.last_end_request = jiffies;
2837 INIT_LIST_HEAD(&cic->queue_list); 2853 INIT_LIST_HEAD(&cic->queue_list);
2838 INIT_HLIST_NODE(&cic->cic_list); 2854 INIT_HLIST_NODE(&cic->cic_list);
2839 cic->dtor = cfq_free_io_context; 2855 cic->dtor = cfq_free_io_context;
@@ -2883,7 +2899,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2883 * elevate the priority of this queue 2899 * elevate the priority of this queue
2884 */ 2900 */
2885 cfqq->org_ioprio = cfqq->ioprio; 2901 cfqq->org_ioprio = cfqq->ioprio;
2886 cfqq->org_ioprio_class = cfqq->ioprio_class;
2887 cfq_clear_cfqq_prio_changed(cfqq); 2902 cfq_clear_cfqq_prio_changed(cfqq);
2888} 2903}
2889 2904
@@ -3169,7 +3184,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3169 } 3184 }
3170 } 3185 }
3171 3186
3172 if (ret) 3187 if (ret && ret != -EEXIST)
3173 printk(KERN_ERR "cfq: cic link failed!\n"); 3188 printk(KERN_ERR "cfq: cic link failed!\n");
3174 3189
3175 return ret; 3190 return ret;
@@ -3185,6 +3200,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3185{ 3200{
3186 struct io_context *ioc = NULL; 3201 struct io_context *ioc = NULL;
3187 struct cfq_io_context *cic; 3202 struct cfq_io_context *cic;
3203 int ret;
3188 3204
3189 might_sleep_if(gfp_mask & __GFP_WAIT); 3205 might_sleep_if(gfp_mask & __GFP_WAIT);
3190 3206
@@ -3192,6 +3208,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3192 if (!ioc) 3208 if (!ioc)
3193 return NULL; 3209 return NULL;
3194 3210
3211retry:
3195 cic = cfq_cic_lookup(cfqd, ioc); 3212 cic = cfq_cic_lookup(cfqd, ioc);
3196 if (cic) 3213 if (cic)
3197 goto out; 3214 goto out;
@@ -3200,7 +3217,12 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3200 if (cic == NULL) 3217 if (cic == NULL)
3201 goto err; 3218 goto err;
3202 3219
3203 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3220 ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
3221 if (ret == -EEXIST) {
3222 /* someone has linked cic to ioc already */
3223 cfq_cic_free(cic);
3224 goto retry;
3225 } else if (ret)
3204 goto err_free; 3226 goto err_free;
3205 3227
3206out: 3228out:
@@ -3221,14 +3243,28 @@ err:
3221} 3243}
3222 3244
3223static void 3245static void
3224cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3246__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3225{ 3247{
3226 unsigned long elapsed = jiffies - cic->last_end_request; 3248 unsigned long elapsed = jiffies - ttime->last_end_request;
3227 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3249 elapsed = min(elapsed, 2UL * slice_idle);
3228 3250
3229 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3251 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3230 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3252 ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3231 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3253 ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3254}
3255
3256static void
3257cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3258 struct cfq_io_context *cic)
3259{
3260 if (cfq_cfqq_sync(cfqq)) {
3261 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3262 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3263 cfqd->cfq_slice_idle);
3264 }
3265#ifdef CONFIG_CFQ_GROUP_IOSCHED
3266 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3267#endif
3232} 3268}
3233 3269
3234static void 3270static void
@@ -3277,8 +3313,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3277 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3313 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3278 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3314 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3279 enable_idle = 0; 3315 enable_idle = 0;
3280 else if (sample_valid(cic->ttime_samples)) { 3316 else if (sample_valid(cic->ttime.ttime_samples)) {
3281 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3317 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3282 enable_idle = 0; 3318 enable_idle = 0;
3283 else 3319 else
3284 enable_idle = 1; 3320 enable_idle = 1;
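
Both hunks above lean on the same statistic: __cfq_update_io_thinktime() keeps an exponentially weighted moving average of the gap between a context's last completion and its next request, clamped to twice the idle window, and cfq_update_idle_window() then compares ttime_mean against cfq_slice_idle to decide whether idling is worthwhile. Each update keeps 7/8 of the old value and folds in 1/8 of the new sample, scaled by 256 to retain precision in integer arithmetic. A standalone illustration of that fixed-point smoothing (plain numbers, no jiffies):

#include <stdio.h>

int main(void)
{
        unsigned long samples = 0, total = 0, mean = 0;
        unsigned long elapsed[] = { 8, 8, 8, 80, 8, 8 };        /* observed think times */

        for (unsigned int i = 0; i < sizeof(elapsed) / sizeof(elapsed[0]); i++) {
                /* same update rules as the hunk above */
                samples = (7 * samples + 256) / 8;
                total = (7 * total + 256 * elapsed[i]) / 8;
                mean = (total + 128) / samples;
                printf("sample %u: mean = %lu\n", i, mean);
        }
        return 0;
}

The single 80-tick outlier lifts the mean temporarily but decays over the following samples instead of dominating the estimate.
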
@@ -3343,7 +3379,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3343 * So both queues are sync. Let the new request get disk time if 3379 * So both queues are sync. Let the new request get disk time if
3344 * it's a metadata request and the current queue is doing regular IO. 3380 * it's a metadata request and the current queue is doing regular IO.
3345 */ 3381 */
3346 if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) 3382 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3347 return true; 3383 return true;
3348 3384
3349 /* 3385 /*
@@ -3410,10 +3446,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3410 struct cfq_io_context *cic = RQ_CIC(rq); 3446 struct cfq_io_context *cic = RQ_CIC(rq);
3411 3447
3412 cfqd->rq_queued++; 3448 cfqd->rq_queued++;
3413 if (rq->cmd_flags & REQ_META) 3449 if (rq->cmd_flags & REQ_PRIO)
3414 cfqq->meta_pending++; 3450 cfqq->prio_pending++;
3415 3451
3416 cfq_update_io_thinktime(cfqd, cic); 3452 cfq_update_io_thinktime(cfqd, cfqq, cic);
3417 cfq_update_io_seektime(cfqd, cfqq, rq); 3453 cfq_update_io_seektime(cfqd, cfqq, rq);
3418 cfq_update_idle_window(cfqd, cfqq, cic); 3454 cfq_update_idle_window(cfqd, cfqq, cic);
3419 3455
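
With this hunk and the cfq_should_preempt() change above, CFQ stops keying any special treatment off REQ_META by itself; it is the new REQ_PRIO flag that now counts toward prio_pending and preemption. A filesystem that still wants its metadata reads expedited would therefore tag them with both flags. A hedged sketch of such a submitter (hypothetical helper, not from this patch; bh is assumed locked and mapped, as submit_bh() requires):

#include <linux/blk_types.h>
#include <linux/buffer_head.h>

static void submit_meta_read(struct buffer_head *bh)
{
        /* end_buffer_read_sync() drops the reference taken here */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(READ | REQ_META | REQ_PRIO, bh);
}
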
@@ -3520,12 +3556,16 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3520 if (cfqq->cfqg->nr_cfqq > 1) 3556 if (cfqq->cfqg->nr_cfqq > 1)
3521 return false; 3557 return false;
3522 3558
3559 /* the only queue in the group, but think time is big */
3560 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3561 return false;
3562
3523 if (cfq_slice_used(cfqq)) 3563 if (cfq_slice_used(cfqq))
3524 return true; 3564 return true;
3525 3565
3526 /* if slice left is less than think time, wait busy */ 3566 /* if slice left is less than think time, wait busy */
3527 if (cic && sample_valid(cic->ttime_samples) 3567 if (cic && sample_valid(cic->ttime.ttime_samples)
3528 && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3568 && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3529 return true; 3569 return true;
3530 3570
3531 /* 3571 /*
@@ -3566,11 +3606,24 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3566 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3606 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3567 3607
3568 if (sync) { 3608 if (sync) {
3569 RQ_CIC(rq)->last_end_request = now; 3609 struct cfq_rb_root *service_tree;
3610
3611 RQ_CIC(rq)->ttime.last_end_request = now;
3612
3613 if (cfq_cfqq_on_rr(cfqq))
3614 service_tree = cfqq->service_tree;
3615 else
3616 service_tree = service_tree_for(cfqq->cfqg,
3617 cfqq_prio(cfqq), cfqq_type(cfqq));
3618 service_tree->ttime.last_end_request = now;
3570 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3619 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3571 cfqd->last_delayed_sync = now; 3620 cfqd->last_delayed_sync = now;
3572 } 3621 }
3573 3622
3623#ifdef CONFIG_CFQ_GROUP_IOSCHED
3624 cfqq->cfqg->ttime.last_end_request = now;
3625#endif
3626
3574 /* 3627 /*
3575 * If this is the active queue, check if it needs to be expired, 3628 * If this is the active queue, check if it needs to be expired,
3576 * or if we want to idle in case it has no pending requests. 3629 * or if we want to idle in case it has no pending requests.
@@ -3616,30 +3669,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3616 cfq_schedule_dispatch(cfqd); 3669 cfq_schedule_dispatch(cfqd);
3617} 3670}
3618 3671
3619/*
3620 * we temporarily boost lower priority queues if they are holding fs exclusive
3621 * resources. they are boosted to normal prio (CLASS_BE/4)
3622 */
3623static void cfq_prio_boost(struct cfq_queue *cfqq)
3624{
3625 if (has_fs_excl()) {
3626 /*
3627 * boost idle prio on transactions that would lock out other
3628 * users of the filesystem
3629 */
3630 if (cfq_class_idle(cfqq))
3631 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3632 if (cfqq->ioprio > IOPRIO_NORM)
3633 cfqq->ioprio = IOPRIO_NORM;
3634 } else {
3635 /*
3636 * unboost the queue (if needed)
3637 */
3638 cfqq->ioprio_class = cfqq->org_ioprio_class;
3639 cfqq->ioprio = cfqq->org_ioprio;
3640 }
3641}
3642
3643static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3672static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3644{ 3673{
3645 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3674 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
@@ -3670,7 +3699,6 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3670 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3699 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3671 if (cfqq) { 3700 if (cfqq) {
3672 cfq_init_prio_data(cfqq, cic->ioc); 3701 cfq_init_prio_data(cfqq, cic->ioc);
3673 cfq_prio_boost(cfqq);
3674 3702
3675 return __cfq_may_queue(cfqq); 3703 return __cfq_may_queue(cfqq);
3676 } 3704 }
@@ -4015,6 +4043,11 @@ static void *cfq_init_queue(struct request_queue *q)
4015 4043
4016 if (blkio_alloc_blkg_stats(&cfqg->blkg)) { 4044 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4017 kfree(cfqg); 4045 kfree(cfqg);
4046
4047 spin_lock(&cic_index_lock);
4048 ida_remove(&cic_index_ida, cfqd->cic_index);
4049 spin_unlock(&cic_index_lock);
4050
4018 kfree(cfqd); 4051 kfree(cfqd);
4019 return NULL; 4052 return NULL;
4020 } 4053 }
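
The new unwind plugs a leak on this error path: cfqd->cic_index was allocated from cic_index_ida earlier in cfq_init_queue(), so a failure in blkio_alloc_blkg_stats() must give the index back before freeing cfqd. As a reminder of how that allocator pairs up, a sketch using the ida_* API of this era (not the exact cfq helpers):

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static int alloc_index(struct ida *ida, spinlock_t *lock)
{
        int index, error;

        do {
                if (!ida_pre_get(ida, GFP_KERNEL))
                        return -ENOMEM;

                spin_lock(lock);
                error = ida_get_new(ida, &index);
                spin_unlock(lock);
        } while (error == -EAGAIN);

        return error ? error : index;
}

/* ...and the release that every later failure path has to reach. */
static void free_index(struct ida *ida, spinlock_t *lock, int index)
{
        spin_lock(lock);
        ida_remove(ida, index);
        spin_unlock(lock);
}
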
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index cc3eb78e333..7b725020823 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -208,19 +208,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
208#define BLKBSZSET_32 _IOW(0x12, 113, int) 208#define BLKBSZSET_32 _IOW(0x12, 113, int)
209#define BLKGETSIZE64_32 _IOR(0x12, 114, int) 209#define BLKGETSIZE64_32 _IOR(0x12, 114, int)
210 210
211struct compat_floppy_struct {
212 compat_uint_t size;
213 compat_uint_t sect;
214 compat_uint_t head;
215 compat_uint_t track;
216 compat_uint_t stretch;
217 unsigned char gap;
218 unsigned char rate;
219 unsigned char spec1;
220 unsigned char fmt_gap;
221 const compat_caddr_t name;
222};
223
224struct compat_floppy_drive_params { 211struct compat_floppy_drive_params {
225 char cmos; 212 char cmos;
226 compat_ulong_t max_dtr; 213 compat_ulong_t max_dtr;
@@ -288,7 +275,6 @@ struct compat_floppy_write_errors {
288 275
289#define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct) 276#define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct)
290#define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct) 277#define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct)
291#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct)
292#define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params) 278#define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params)
293#define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params) 279#define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params)
294#define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct) 280#define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)
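
What remains in this file shows why such compat mirrors exist at all: a 32-bit userland lays the ioctl argument out with 32-bit longs and pointers, so the compat layer re-declares the structure with fixed-width compat_* types, and because _IOR()/_IOW() fold sizeof() of that argument into the command number, a layout that differs between ABIs yields a different 32-bit command value which the handler has to recognise and translate. A hedged sketch of the general pattern (hypothetical struct and command, not from this file):

struct compat_foo_params {
        compat_uint_t   flags;          /* 32 bits on both ABIs */
        compat_ulong_t  max_len;        /* 32-bit long, not the native 64-bit one */
        compat_caddr_t  buf;            /* 32-bit user pointer */
};

/* sizeof(struct compat_foo_params) is encoded into the number, so this
 * differs from a native FOOGET defined with the native struct. */
#define FOOGET32 _IOR('f', 0x01, struct compat_foo_params)
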
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 5139c0ea186..c644137d9cd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -77,10 +77,8 @@ static void
77deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) 77deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
78{ 78{
79 struct rb_root *root = deadline_rb_root(dd, rq); 79 struct rb_root *root = deadline_rb_root(dd, rq);
80 struct request *__alias;
81 80
82 while (unlikely(__alias = elv_rb_add(root, rq))) 81 elv_rb_add(root, rq);
83 deadline_move_request(dd, __alias);
84} 82}
85 83
86static inline void 84static inline void
diff --git a/block/elevator.c b/block/elevator.c
index b0b38ce0dcb..a3b64bc71d8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -353,7 +353,7 @@ static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
353 * RB-tree support functions for inserting/lookup/removal of requests 353 * RB-tree support functions for inserting/lookup/removal of requests
354 * in a sorted RB tree. 354 * in a sorted RB tree.
355 */ 355 */
356struct request *elv_rb_add(struct rb_root *root, struct request *rq) 356void elv_rb_add(struct rb_root *root, struct request *rq)
357{ 357{
358 struct rb_node **p = &root->rb_node; 358 struct rb_node **p = &root->rb_node;
359 struct rb_node *parent = NULL; 359 struct rb_node *parent = NULL;
@@ -365,15 +365,12 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq)
365 365
366 if (blk_rq_pos(rq) < blk_rq_pos(__rq)) 366 if (blk_rq_pos(rq) < blk_rq_pos(__rq))
367 p = &(*p)->rb_left; 367 p = &(*p)->rb_left;
368 else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) 368 else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
369 p = &(*p)->rb_right; 369 p = &(*p)->rb_right;
370 else
371 return __rq;
372 } 370 }
373 371
374 rb_link_node(&rq->rb_node, parent, p); 372 rb_link_node(&rq->rb_node, parent, p);
375 rb_insert_color(&rq->rb_node, root); 373 rb_insert_color(&rq->rb_node, root);
376 return NULL;
377} 374}
378EXPORT_SYMBOL(elv_rb_add); 375EXPORT_SYMBOL(elv_rb_add);
379 376
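
elv_rb_add() can no longer report an alias: requests whose sector keys collide are simply chained to the right of the existing node, which is why deadline_add_rq_rb() above loses its alias-handling loop. The whole change is in the comparison. A self-contained sketch of the duplicate-tolerant insert using the same rbtree primitives (illustrative node type, not struct request):

#include <linux/types.h>
#include <linux/rbtree.h>

struct node {
        struct rb_node  rb;
        sector_t        key;
};

/* Equal keys go right, so insertion always succeeds and equal-keyed
 * nodes sit next to each other in an in-order walk. */
static void dup_rb_add(struct rb_root *root, struct node *n)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*p) {
                struct node *cur = rb_entry(*p, struct node, rb);

                parent = *p;
                if (n->key < cur->key)
                        p = &(*p)->rb_left;
                else                    /* >= : duplicates allowed */
                        p = &(*p)->rb_right;
        }

        rb_link_node(&n->rb, parent, p);
        rb_insert_color(&n->rb, root);
}
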
diff --git a/block/genhd.c b/block/genhd.c
index 3608289c8ec..d3834710b95 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -602,7 +602,7 @@ void add_disk(struct gendisk *disk)
602 disk->major = MAJOR(devt); 602 disk->major = MAJOR(devt);
603 disk->first_minor = MINOR(devt); 603 disk->first_minor = MINOR(devt);
604 604
605 /* Register BDI before referencing it from bdev */ 605 /* Register BDI before referencing it from bdev */
606 bdi = &disk->queue->backing_dev_info; 606 bdi = &disk->queue->backing_dev_info;
607 bdi_register_dev(bdi, disk_devt(disk)); 607 bdi_register_dev(bdi, disk_devt(disk));
608 608
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
611 register_disk(disk); 611 register_disk(disk);
612 blk_register_queue(disk); 612 blk_register_queue(disk);
613 613
614 /*
615 * Take an extra ref on queue which will be put on disk_release()
616 * so that it sticks around as long as @disk is there.
617 */
618 WARN_ON_ONCE(blk_get_queue(disk->queue));
619
614 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 620 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
615 "bdi"); 621 "bdi");
616 WARN_ON(retval); 622 WARN_ON(retval);
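
add_disk() now pins the request_queue for as long as the gendisk exists; the matching blk_put_queue() appears in the disk_release() hunk further down. That keeps late users of disk->queue, such as sysfs readers, from racing with the queue being freed, and blk_get_queue() is only expected to fail if the queue were already being torn down, hence the WARN_ON_ONCE rather than an error path. A sketch of the kind of access the extra reference protects (hypothetical attribute, not from the tree):

#include <linux/genhd.h>
#include <linux/blkdev.h>

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        /* safe because add_disk() holds a queue reference until disk_release() */
        return sprintf(buf, "%d\n", queue_logical_block_size(disk->queue));
}
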
@@ -1018,14 +1024,6 @@ static const struct attribute_group *disk_attr_groups[] = {
1018 NULL 1024 NULL
1019}; 1025};
1020 1026
1021static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
1022{
1023 struct disk_part_tbl *ptbl =
1024 container_of(head, struct disk_part_tbl, rcu_head);
1025
1026 kfree(ptbl);
1027}
1028
1029/** 1027/**
1030 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way 1028 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1031 * @disk: disk to replace part_tbl for 1029 * @disk: disk to replace part_tbl for
@@ -1046,7 +1044,7 @@ static void disk_replace_part_tbl(struct gendisk *disk,
1046 1044
1047 if (old_ptbl) { 1045 if (old_ptbl) {
1048 rcu_assign_pointer(old_ptbl->last_lookup, NULL); 1046 rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1049 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); 1047 kfree_rcu(old_ptbl, rcu_head);
1050 } 1048 }
1051} 1049}
1052 1050
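
Switching from call_rcu() plus a hand-written callback to kfree_rcu() is purely a simplification: the old partition table is still freed only after a grace period, the callback is just derived from the offset of the rcu_head member. The idiom only applies when the object embeds a struct rcu_head and is released with a plain kfree(). A generic sketch (not the disk_part_tbl definition):

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct cache_entry {
        int             key;
        struct rcu_head rcu;    /* kfree_rcu() needs an embedded rcu_head */
};

static void retire_entry(struct cache_entry *e)
{
        /* behaves like call_rcu() with a callback that just kfree()s e */
        kfree_rcu(e, rcu);
}
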
@@ -1103,8 +1101,26 @@ static void disk_release(struct device *dev)
1103 disk_replace_part_tbl(disk, NULL); 1101 disk_replace_part_tbl(disk, NULL);
1104 free_part_stats(&disk->part0); 1102 free_part_stats(&disk->part0);
1105 free_part_info(&disk->part0); 1103 free_part_info(&disk->part0);
1104 if (disk->queue)
1105 blk_put_queue(disk->queue);
1106 kfree(disk); 1106 kfree(disk);
1107} 1107}
1108
1109static int disk_uevent(struct device *dev, struct kobj_uevent_env *env)
1110{
1111 struct gendisk *disk = dev_to_disk(dev);
1112 struct disk_part_iter piter;
1113 struct hd_struct *part;
1114 int cnt = 0;
1115
1116 disk_part_iter_init(&piter, disk, 0);
1117 while((part = disk_part_iter_next(&piter)))
1118 cnt++;
1119 disk_part_iter_exit(&piter);
1120 add_uevent_var(env, "NPARTS=%u", cnt);
1121 return 0;
1122}
1123
1108struct class block_class = { 1124struct class block_class = {
1109 .name = "block", 1125 .name = "block",
1110}; 1126};
@@ -1123,6 +1139,7 @@ static struct device_type disk_type = {
1123 .groups = disk_attr_groups, 1139 .groups = disk_attr_groups,
1124 .release = disk_release, 1140 .release = disk_release,
1125 .devnode = block_devnode, 1141 .devnode = block_devnode,
1142 .uevent = disk_uevent,
1126}; 1143};
1127 1144
1128#ifdef CONFIG_PROC_FS 1145#ifdef CONFIG_PROC_FS
@@ -1148,23 +1165,23 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1148 "wsect wuse running use aveq" 1165 "wsect wuse running use aveq"
1149 "\n\n"); 1166 "\n\n");
1150 */ 1167 */
1151 1168
1152 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); 1169 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1153 while ((hd = disk_part_iter_next(&piter))) { 1170 while ((hd = disk_part_iter_next(&piter))) {
1154 cpu = part_stat_lock(); 1171 cpu = part_stat_lock();
1155 part_round_stats(cpu, hd); 1172 part_round_stats(cpu, hd);
1156 part_stat_unlock(); 1173 part_stat_unlock();
1157 seq_printf(seqf, "%4d %7d %s %lu %lu %llu " 1174 seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
1158 "%u %lu %lu %llu %u %u %u %u\n", 1175 "%u %lu %lu %lu %u %u %u %u\n",
1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)), 1176 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1160 disk_name(gp, hd->partno, buf), 1177 disk_name(gp, hd->partno, buf),
1161 part_stat_read(hd, ios[READ]), 1178 part_stat_read(hd, ios[READ]),
1162 part_stat_read(hd, merges[READ]), 1179 part_stat_read(hd, merges[READ]),
1163 (unsigned long long)part_stat_read(hd, sectors[READ]), 1180 part_stat_read(hd, sectors[READ]),
1164 jiffies_to_msecs(part_stat_read(hd, ticks[READ])), 1181 jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1165 part_stat_read(hd, ios[WRITE]), 1182 part_stat_read(hd, ios[WRITE]),
1166 part_stat_read(hd, merges[WRITE]), 1183 part_stat_read(hd, merges[WRITE]),
1167 (unsigned long long)part_stat_read(hd, sectors[WRITE]), 1184 part_stat_read(hd, sectors[WRITE]),
1168 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), 1185 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1169 part_in_flight(hd), 1186 part_in_flight(hd),
1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1187 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
@@ -1172,7 +1189,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1172 ); 1189 );
1173 } 1190 }
1174 disk_part_iter_exit(&piter); 1191 disk_part_iter_exit(&piter);
1175 1192
1176 return 0; 1193 return 0;
1177} 1194}
1178 1195
@@ -1500,30 +1517,32 @@ void disk_unblock_events(struct gendisk *disk)
1500} 1517}
1501 1518
1502/** 1519/**
1503 * disk_check_events - schedule immediate event checking 1520 * disk_flush_events - schedule immediate event checking and flushing
1504 * @disk: disk to check events for 1521 * @disk: disk to check and flush events for
1522 * @mask: events to flush
1505 * 1523 *
1506 * Schedule immediate event checking on @disk if not blocked. 1524 * Schedule immediate event checking on @disk if not blocked. Events in
1525 * @mask are scheduled to be cleared from the driver. Note that this
1526 * doesn't clear the events from @disk->ev.
1507 * 1527 *
1508 * CONTEXT: 1528 * CONTEXT:
1509 * Don't care. Safe to call from irq context. 1529 * If @mask is non-zero must be called with bdev->bd_mutex held.
1510 */ 1530 */
1511void disk_check_events(struct gendisk *disk) 1531void disk_flush_events(struct gendisk *disk, unsigned int mask)
1512{ 1532{
1513 struct disk_events *ev = disk->ev; 1533 struct disk_events *ev = disk->ev;
1514 unsigned long flags;
1515 1534
1516 if (!ev) 1535 if (!ev)
1517 return; 1536 return;
1518 1537
1519 spin_lock_irqsave(&ev->lock, flags); 1538 spin_lock_irq(&ev->lock);
1539 ev->clearing |= mask;
1520 if (!ev->block) { 1540 if (!ev->block) {
1521 cancel_delayed_work(&ev->dwork); 1541 cancel_delayed_work(&ev->dwork);
1522 queue_delayed_work(system_nrt_wq, &ev->dwork, 0); 1542 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1523 } 1543 }
1524 spin_unlock_irqrestore(&ev->lock, flags); 1544 spin_unlock_irq(&ev->lock);
1525} 1545}
1526EXPORT_SYMBOL_GPL(disk_check_events);
1527 1546
1528/** 1547/**
1529 * disk_clear_events - synchronously check, clear and return pending events 1548 * disk_clear_events - synchronously check, clear and return pending events
@@ -1713,7 +1732,7 @@ static int disk_events_set_dfl_poll_msecs(const char *val,
1713 mutex_lock(&disk_events_mutex); 1732 mutex_lock(&disk_events_mutex);
1714 1733
1715 list_for_each_entry(ev, &disk_events, node) 1734 list_for_each_entry(ev, &disk_events, node)
1716 disk_check_events(ev->disk); 1735 disk_flush_events(ev->disk, 0);
1717 1736
1718 mutex_unlock(&disk_events_mutex); 1737 mutex_unlock(&disk_events_mutex);
1719 1738
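
disk_check_events() is reworked into disk_flush_events(): besides kicking an immediate check, the caller can now pass a mask of events to be cleared at the driver on that check (without clearing them from disk->ev, per the new comment), the locking tightens so that a non-zero mask requires bdev->bd_mutex, and with spin_lock_irq() replacing spin_lock_irqsave() the function is no longer safe from irq context. A hedged sketch of a caller on a block-device revalidation path (hypothetical helper; DISK_EVENT_MEDIA_CHANGE is the standard event flag):

#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/mutex.h>

/* Discard a stale media-change indication once the device has been
 * revalidated; bd_mutex is held as the new comment requires. */
static void flush_stale_media_change(struct block_device *bdev)
{
        mutex_lock(&bdev->bd_mutex);
        disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
        mutex_unlock(&bdev->bd_mutex);
}
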