Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |   14
-rw-r--r--  block/Makefile           |    3
-rw-r--r--  block/blk-barrier.c      |  350
-rw-r--r--  block/blk-cgroup.c       |  997
-rw-r--r--  block/blk-cgroup.h       |  138
-rw-r--r--  block/blk-core.c         |  829
-rw-r--r--  block/blk-exec.c         |   15
-rw-r--r--  block/blk-flush.c        |  443
-rw-r--r--  block/blk-integrity.c    |  106
-rw-r--r--  block/blk-ioc.c          |   26
-rw-r--r--  block/blk-lib.c          |  134
-rw-r--r--  block/blk-map.c          |    8
-rw-r--r--  block/blk-merge.c        |   38
-rw-r--r--  block/blk-settings.c     |  107
-rw-r--r--  block/blk-sysfs.c        |   27
-rw-r--r--  block/blk-throttle.c     | 1312
-rw-r--r--  block/blk.h              |   45
-rw-r--r--  block/bsg.c              |   12
-rw-r--r--  block/cfq-iosched.c      |  624
-rw-r--r--  block/cfq.h              |    8
-rw-r--r--  block/compat_ioctl.c     |    5
-rw-r--r--  block/deadline-iosched.c |    9
-rw-r--r--  block/elevator.c         |  202
-rw-r--r--  block/genhd.c            |  608
-rw-r--r--  block/ioctl.c            |   23
-rw-r--r--  block/noop-iosched.c     |    8
-rw-r--r--  block/scsi_ioctl.c       |   34
27 files changed, 4553 insertions, 1572 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..60be1e0455da 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -2,7 +2,7 @@
 # Block layer core configuration
 #
 menuconfig BLOCK
-        bool "Enable the block layer" if EMBEDDED
+        bool "Enable the block layer" if EXPERT
         default y
         help
          Provide block layer support for the kernel.
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
         T10/SCSI Data Integrity Field or the T13/ATA External Path
         Protection. If in doubt, say N.
 
+config BLK_DEV_THROTTLING
+        bool "Block layer bio throttling support"
+        depends on BLK_CGROUP=y && EXPERIMENTAL
+        default n
+        ---help---
+        Block layer bio throttling support. It can be used to limit
+        the IO rate to a device. IO rate policies are per cgroup and
+        one needs to mount and use blkio cgroup controller for creating
+        cgroups and specifying per device IO rate policies.
+
+        See Documentation/cgroups/blkio-controller.txt for more information.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
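
The BLK_DEV_THROTTLING help text above points at the blkio cgroup controller for per-device IO rate policies. As a minimal illustration only (not part of this patch), a rule is installed from user space by writing a "major:minor value" string into one of the throttle files; the mount point and cgroup name below are assumptions, and blkio.throttle.read_bps_device is the file documented in Documentation/cgroups/blkio-controller.txt:

    #include <stdio.h>

    /* Limit reads on device 8:16 to 1 MiB/s for an existing cgroup "grp1".
     * The path is an assumption about where the blkio controller is mounted. */
    int main(void)
    {
            FILE *f = fopen("/sys/fs/cgroup/blkio/grp1/blkio.throttle.read_bps_device", "w");

            if (!f)
                    return 1;
            fprintf(f, "8:16 1048576\n");   /* "major:minor bytes_per_second" */
            return fclose(f) != 0;
    }
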
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..0fec4b3fab51 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,12 +3,13 @@
 #
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-                        blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                         blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                         blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_BLK_CGROUP)       += blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING)       += blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032f..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@
1 | /* | ||
2 | * Functions related to barrier IO handling | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/bio.h> | ||
7 | #include <linux/blkdev.h> | ||
8 | #include <linux/gfp.h> | ||
9 | |||
10 | #include "blk.h" | ||
11 | |||
12 | /** | ||
13 | * blk_queue_ordered - does this queue support ordered writes | ||
14 | * @q: the request queue | ||
15 | * @ordered: one of QUEUE_ORDERED_* | ||
16 | * | ||
17 | * Description: | ||
18 | * For journalled file systems, doing ordered writes on a commit | ||
19 | * block instead of explicitly doing wait_on_buffer (which is bad | ||
20 | * for performance) can be a big win. Block drivers supporting this | ||
21 | * feature should call this function and indicate so. | ||
22 | * | ||
23 | **/ | ||
24 | int blk_queue_ordered(struct request_queue *q, unsigned ordered) | ||
25 | { | ||
26 | if (ordered != QUEUE_ORDERED_NONE && | ||
27 | ordered != QUEUE_ORDERED_DRAIN && | ||
28 | ordered != QUEUE_ORDERED_DRAIN_FLUSH && | ||
29 | ordered != QUEUE_ORDERED_DRAIN_FUA && | ||
30 | ordered != QUEUE_ORDERED_TAG && | ||
31 | ordered != QUEUE_ORDERED_TAG_FLUSH && | ||
32 | ordered != QUEUE_ORDERED_TAG_FUA) { | ||
33 | printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); | ||
34 | return -EINVAL; | ||
35 | } | ||
36 | |||
37 | q->ordered = ordered; | ||
38 | q->next_ordered = ordered; | ||
39 | |||
40 | return 0; | ||
41 | } | ||
42 | EXPORT_SYMBOL(blk_queue_ordered); | ||
43 | |||
44 | /* | ||
45 | * Cache flushing for ordered writes handling | ||
46 | */ | ||
47 | unsigned blk_ordered_cur_seq(struct request_queue *q) | ||
48 | { | ||
49 | if (!q->ordseq) | ||
50 | return 0; | ||
51 | return 1 << ffz(q->ordseq); | ||
52 | } | ||
53 | |||
54 | unsigned blk_ordered_req_seq(struct request *rq) | ||
55 | { | ||
56 | struct request_queue *q = rq->q; | ||
57 | |||
58 | BUG_ON(q->ordseq == 0); | ||
59 | |||
60 | if (rq == &q->pre_flush_rq) | ||
61 | return QUEUE_ORDSEQ_PREFLUSH; | ||
62 | if (rq == &q->bar_rq) | ||
63 | return QUEUE_ORDSEQ_BAR; | ||
64 | if (rq == &q->post_flush_rq) | ||
65 | return QUEUE_ORDSEQ_POSTFLUSH; | ||
66 | |||
67 | /* | ||
68 | * !fs requests don't need to follow barrier ordering. Always | ||
69 | * put them at the front. This fixes the following deadlock. | ||
70 | * | ||
71 | * http://thread.gmane.org/gmane.linux.kernel/537473 | ||
72 | */ | ||
73 | if (rq->cmd_type != REQ_TYPE_FS) | ||
74 | return QUEUE_ORDSEQ_DRAIN; | ||
75 | |||
76 | if ((rq->cmd_flags & REQ_ORDERED_COLOR) == | ||
77 | (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) | ||
78 | return QUEUE_ORDSEQ_DRAIN; | ||
79 | else | ||
80 | return QUEUE_ORDSEQ_DONE; | ||
81 | } | ||
82 | |||
83 | bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) | ||
84 | { | ||
85 | struct request *rq; | ||
86 | |||
87 | if (error && !q->orderr) | ||
88 | q->orderr = error; | ||
89 | |||
90 | BUG_ON(q->ordseq & seq); | ||
91 | q->ordseq |= seq; | ||
92 | |||
93 | if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) | ||
94 | return false; | ||
95 | |||
96 | /* | ||
97 | * Okay, sequence complete. | ||
98 | */ | ||
99 | q->ordseq = 0; | ||
100 | rq = q->orig_bar_rq; | ||
101 | __blk_end_request_all(rq, q->orderr); | ||
102 | return true; | ||
103 | } | ||
104 | |||
105 | static void pre_flush_end_io(struct request *rq, int error) | ||
106 | { | ||
107 | elv_completed_request(rq->q, rq); | ||
108 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); | ||
109 | } | ||
110 | |||
111 | static void bar_end_io(struct request *rq, int error) | ||
112 | { | ||
113 | elv_completed_request(rq->q, rq); | ||
114 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); | ||
115 | } | ||
116 | |||
117 | static void post_flush_end_io(struct request *rq, int error) | ||
118 | { | ||
119 | elv_completed_request(rq->q, rq); | ||
120 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); | ||
121 | } | ||
122 | |||
123 | static void queue_flush(struct request_queue *q, unsigned which) | ||
124 | { | ||
125 | struct request *rq; | ||
126 | rq_end_io_fn *end_io; | ||
127 | |||
128 | if (which == QUEUE_ORDERED_DO_PREFLUSH) { | ||
129 | rq = &q->pre_flush_rq; | ||
130 | end_io = pre_flush_end_io; | ||
131 | } else { | ||
132 | rq = &q->post_flush_rq; | ||
133 | end_io = post_flush_end_io; | ||
134 | } | ||
135 | |||
136 | blk_rq_init(q, rq); | ||
137 | rq->cmd_type = REQ_TYPE_FS; | ||
138 | rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; | ||
139 | rq->rq_disk = q->orig_bar_rq->rq_disk; | ||
140 | rq->end_io = end_io; | ||
141 | |||
142 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | ||
143 | } | ||
144 | |||
145 | static inline bool start_ordered(struct request_queue *q, struct request **rqp) | ||
146 | { | ||
147 | struct request *rq = *rqp; | ||
148 | unsigned skip = 0; | ||
149 | |||
150 | q->orderr = 0; | ||
151 | q->ordered = q->next_ordered; | ||
152 | q->ordseq |= QUEUE_ORDSEQ_STARTED; | ||
153 | |||
154 | /* | ||
155 | * For an empty barrier, there's no actual BAR request, which | ||
156 | * in turn makes POSTFLUSH unnecessary. Mask them off. | ||
157 | */ | ||
158 | if (!blk_rq_sectors(rq)) { | ||
159 | q->ordered &= ~(QUEUE_ORDERED_DO_BAR | | ||
160 | QUEUE_ORDERED_DO_POSTFLUSH); | ||
161 | /* | ||
162 | * Empty barrier on a write-through device w/ ordered | ||
163 | * tag has no command to issue and without any command | ||
164 | * to issue, ordering by tag can't be used. Drain | ||
165 | * instead. | ||
166 | */ | ||
167 | if ((q->ordered & QUEUE_ORDERED_BY_TAG) && | ||
168 | !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { | ||
169 | q->ordered &= ~QUEUE_ORDERED_BY_TAG; | ||
170 | q->ordered |= QUEUE_ORDERED_BY_DRAIN; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | /* stash away the original request */ | ||
175 | blk_dequeue_request(rq); | ||
176 | q->orig_bar_rq = rq; | ||
177 | rq = NULL; | ||
178 | |||
179 | /* | ||
180 | * Queue ordered sequence. As we stack them at the head, we | ||
181 | * need to queue in reverse order. Note that we rely on that | ||
182 | * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs | ||
183 | * request gets inbetween ordered sequence. | ||
184 | */ | ||
185 | if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { | ||
186 | queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); | ||
187 | rq = &q->post_flush_rq; | ||
188 | } else | ||
189 | skip |= QUEUE_ORDSEQ_POSTFLUSH; | ||
190 | |||
191 | if (q->ordered & QUEUE_ORDERED_DO_BAR) { | ||
192 | rq = &q->bar_rq; | ||
193 | |||
194 | /* initialize proxy request and queue it */ | ||
195 | blk_rq_init(q, rq); | ||
196 | if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) | ||
197 | rq->cmd_flags |= REQ_WRITE; | ||
198 | if (q->ordered & QUEUE_ORDERED_DO_FUA) | ||
199 | rq->cmd_flags |= REQ_FUA; | ||
200 | init_request_from_bio(rq, q->orig_bar_rq->bio); | ||
201 | rq->end_io = bar_end_io; | ||
202 | |||
203 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | ||
204 | } else | ||
205 | skip |= QUEUE_ORDSEQ_BAR; | ||
206 | |||
207 | if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { | ||
208 | queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); | ||
209 | rq = &q->pre_flush_rq; | ||
210 | } else | ||
211 | skip |= QUEUE_ORDSEQ_PREFLUSH; | ||
212 | |||
213 | if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) | ||
214 | rq = NULL; | ||
215 | else | ||
216 | skip |= QUEUE_ORDSEQ_DRAIN; | ||
217 | |||
218 | *rqp = rq; | ||
219 | |||
220 | /* | ||
221 | * Complete skipped sequences. If whole sequence is complete, | ||
222 | * return false to tell elevator that this request is gone. | ||
223 | */ | ||
224 | return !blk_ordered_complete_seq(q, skip, 0); | ||
225 | } | ||
226 | |||
227 | bool blk_do_ordered(struct request_queue *q, struct request **rqp) | ||
228 | { | ||
229 | struct request *rq = *rqp; | ||
230 | const int is_barrier = rq->cmd_type == REQ_TYPE_FS && | ||
231 | (rq->cmd_flags & REQ_HARDBARRIER); | ||
232 | |||
233 | if (!q->ordseq) { | ||
234 | if (!is_barrier) | ||
235 | return true; | ||
236 | |||
237 | if (q->next_ordered != QUEUE_ORDERED_NONE) | ||
238 | return start_ordered(q, rqp); | ||
239 | else { | ||
240 | /* | ||
241 | * Queue ordering not supported. Terminate | ||
242 | * with prejudice. | ||
243 | */ | ||
244 | blk_dequeue_request(rq); | ||
245 | __blk_end_request_all(rq, -EOPNOTSUPP); | ||
246 | *rqp = NULL; | ||
247 | return false; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Ordered sequence in progress | ||
253 | */ | ||
254 | |||
255 | /* Special requests are not subject to ordering rules. */ | ||
256 | if (rq->cmd_type != REQ_TYPE_FS && | ||
257 | rq != &q->pre_flush_rq && rq != &q->post_flush_rq) | ||
258 | return true; | ||
259 | |||
260 | if (q->ordered & QUEUE_ORDERED_BY_TAG) { | ||
261 | /* Ordered by tag. Blocking the next barrier is enough. */ | ||
262 | if (is_barrier && rq != &q->bar_rq) | ||
263 | *rqp = NULL; | ||
264 | } else { | ||
265 | /* Ordered by draining. Wait for turn. */ | ||
266 | WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); | ||
267 | if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) | ||
268 | *rqp = NULL; | ||
269 | } | ||
270 | |||
271 | return true; | ||
272 | } | ||
273 | |||
274 | static void bio_end_empty_barrier(struct bio *bio, int err) | ||
275 | { | ||
276 | if (err) { | ||
277 | if (err == -EOPNOTSUPP) | ||
278 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
279 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
280 | } | ||
281 | if (bio->bi_private) | ||
282 | complete(bio->bi_private); | ||
283 | bio_put(bio); | ||
284 | } | ||
285 | |||
286 | /** | ||
287 | * blkdev_issue_flush - queue a flush | ||
288 | * @bdev: blockdev to issue flush for | ||
289 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
290 | * @error_sector: error sector | ||
291 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
292 | * | ||
293 | * Description: | ||
294 | * Issue a flush for the block device in question. Caller can supply | ||
295 | * room for storing the error offset in case of a flush error, if they | ||
296 | * wish to. If WAIT flag is not passed then caller may check only what | ||
297 | * request was pushed in some internal queue for later handling. | ||
298 | */ | ||
299 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | ||
300 | sector_t *error_sector, unsigned long flags) | ||
301 | { | ||
302 | DECLARE_COMPLETION_ONSTACK(wait); | ||
303 | struct request_queue *q; | ||
304 | struct bio *bio; | ||
305 | int ret = 0; | ||
306 | |||
307 | if (bdev->bd_disk == NULL) | ||
308 | return -ENXIO; | ||
309 | |||
310 | q = bdev_get_queue(bdev); | ||
311 | if (!q) | ||
312 | return -ENXIO; | ||
313 | |||
314 | /* | ||
315 | * some block devices may not have their queue correctly set up here | ||
316 | * (e.g. loop device without a backing file) and so issuing a flush | ||
317 | * here will panic. Ensure there is a request function before issuing | ||
318 | * the barrier. | ||
319 | */ | ||
320 | if (!q->make_request_fn) | ||
321 | return -ENXIO; | ||
322 | |||
323 | bio = bio_alloc(gfp_mask, 0); | ||
324 | bio->bi_end_io = bio_end_empty_barrier; | ||
325 | bio->bi_bdev = bdev; | ||
326 | if (test_bit(BLKDEV_WAIT, &flags)) | ||
327 | bio->bi_private = &wait; | ||
328 | |||
329 | bio_get(bio); | ||
330 | submit_bio(WRITE_BARRIER, bio); | ||
331 | if (test_bit(BLKDEV_WAIT, &flags)) { | ||
332 | wait_for_completion(&wait); | ||
333 | /* | ||
334 | * The driver must store the error location in ->bi_sector, if | ||
335 | * it supports it. For non-stacked drivers, this should be | ||
336 | * copied from blk_rq_pos(rq). | ||
337 | */ | ||
338 | if (error_sector) | ||
339 | *error_sector = bio->bi_sector; | ||
340 | } | ||
341 | |||
342 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
343 | ret = -EOPNOTSUPP; | ||
344 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
345 | ret = -EIO; | ||
346 | |||
347 | bio_put(bio); | ||
348 | return ret; | ||
349 | } | ||
350 | EXPORT_SYMBOL(blkdev_issue_flush); | ||
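
The kerneldoc above describes the blkdev_issue_flush() interface that disappears together with this file. For context, a caller of the old interface looked roughly like the hedged sketch below; BLKDEV_IFL_WAIT is assumed to be the flag mask from that era's blkdev.h which sets the BLKDEV_WAIT bit tested in the function body:

    #include <linux/blkdev.h>

    /* Hedged sketch of a pre-rework caller: flush the device's volatile
     * write cache and wait for completion before treating data as durable. */
    static int example_flush_cache(struct block_device *bdev)
    {
            sector_t error_sector;
            int ret;

            ret = blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
                                     BLKDEV_IFL_WAIT);
            if (ret == -EOPNOTSUPP)
                    ret = 0;        /* no cache to flush; treat as success */
            return ret;
    }
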
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..bcaf16ee6ad1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,18 +30,22 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 | 30 | ||
31 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, | 31 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, |
32 | struct cgroup *); | 32 | struct cgroup *); |
33 | static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, | 33 | static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); |
34 | struct task_struct *, bool); | 34 | static void blkiocg_attach_task(struct cgroup *, struct task_struct *); |
35 | static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, | ||
36 | struct cgroup *, struct task_struct *, bool); | ||
37 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); | 35 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); |
38 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); | 36 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); |
39 | 37 | ||
38 | /* for encoding cft->private value on file */ | ||
39 | #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
40 | /* What policy owns the file, proportional or throttle */ | ||
41 | #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) | ||
42 | #define BLKIOFILE_ATTR(val) ((val) & 0xffff) | ||
43 | |||
40 | struct cgroup_subsys blkio_subsys = { | 44 | struct cgroup_subsys blkio_subsys = { |
41 | .name = "blkio", | 45 | .name = "blkio", |
42 | .create = blkiocg_create, | 46 | .create = blkiocg_create, |
43 | .can_attach = blkiocg_can_attach, | 47 | .can_attach_task = blkiocg_can_attach_task, |
44 | .attach = blkiocg_attach, | 48 | .attach_task = blkiocg_attach_task, |
45 | .destroy = blkiocg_destroy, | 49 | .destroy = blkiocg_destroy, |
46 | .populate = blkiocg_populate, | 50 | .populate = blkiocg_populate, |
47 | #ifdef CONFIG_BLK_CGROUP | 51 | #ifdef CONFIG_BLK_CGROUP |
@@ -59,6 +63,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
59 | list_add(&pn->node, &blkcg->policy_list); | 63 | list_add(&pn->node, &blkcg->policy_list); |
60 | } | 64 | } |
61 | 65 | ||
66 | static inline bool cftype_blkg_same_policy(struct cftype *cft, | ||
67 | struct blkio_group *blkg) | ||
68 | { | ||
69 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
70 | |||
71 | if (blkg->plid == plid) | ||
72 | return 1; | ||
73 | |||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | /* Determines if policy node matches cgroup file being accessed */ | ||
78 | static inline bool pn_matches_cftype(struct cftype *cft, | ||
79 | struct blkio_policy_node *pn) | ||
80 | { | ||
81 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
82 | int fileid = BLKIOFILE_ATTR(cft->private); | ||
83 | |||
84 | return (plid == pn->plid && fileid == pn->fileid); | ||
85 | } | ||
86 | |||
62 | /* Must be called with blkcg->lock held */ | 87 | /* Must be called with blkcg->lock held */ |
63 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | 88 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) |
64 | { | 89 | { |
@@ -67,12 +92,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
67 | 92 | ||
68 | /* Must be called with blkcg->lock held */ | 93 | /* Must be called with blkcg->lock held */ |
69 | static struct blkio_policy_node * | 94 | static struct blkio_policy_node * |
70 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) | 95 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, |
96 | enum blkio_policy_id plid, int fileid) | ||
71 | { | 97 | { |
72 | struct blkio_policy_node *pn; | 98 | struct blkio_policy_node *pn; |
73 | 99 | ||
74 | list_for_each_entry(pn, &blkcg->policy_list, node) { | 100 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
75 | if (pn->dev == dev) | 101 | if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) |
76 | return pn; | 102 | return pn; |
77 | } | 103 | } |
78 | 104 | ||
@@ -86,6 +112,74 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86 | } | 112 | } |
87 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | 113 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); |
88 | 114 | ||
115 | struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) | ||
116 | { | ||
117 | return container_of(task_subsys_state(tsk, blkio_subsys_id), | ||
118 | struct blkio_cgroup, css); | ||
119 | } | ||
120 | EXPORT_SYMBOL_GPL(task_blkio_cgroup); | ||
121 | |||
122 | static inline void | ||
123 | blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) | ||
124 | { | ||
125 | struct blkio_policy_type *blkiop; | ||
126 | |||
127 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
128 | /* If this policy does not own the blkg, do not send updates */ | ||
129 | if (blkiop->plid != blkg->plid) | ||
130 | continue; | ||
131 | if (blkiop->ops.blkio_update_group_weight_fn) | ||
132 | blkiop->ops.blkio_update_group_weight_fn(blkg->key, | ||
133 | blkg, weight); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, | ||
138 | int fileid) | ||
139 | { | ||
140 | struct blkio_policy_type *blkiop; | ||
141 | |||
142 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
143 | |||
144 | /* If this policy does not own the blkg, do not send updates */ | ||
145 | if (blkiop->plid != blkg->plid) | ||
146 | continue; | ||
147 | |||
148 | if (fileid == BLKIO_THROTL_read_bps_device | ||
149 | && blkiop->ops.blkio_update_group_read_bps_fn) | ||
150 | blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, | ||
151 | blkg, bps); | ||
152 | |||
153 | if (fileid == BLKIO_THROTL_write_bps_device | ||
154 | && blkiop->ops.blkio_update_group_write_bps_fn) | ||
155 | blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, | ||
156 | blkg, bps); | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static inline void blkio_update_group_iops(struct blkio_group *blkg, | ||
161 | unsigned int iops, int fileid) | ||
162 | { | ||
163 | struct blkio_policy_type *blkiop; | ||
164 | |||
165 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
166 | |||
167 | /* If this policy does not own the blkg, do not send updates */ | ||
168 | if (blkiop->plid != blkg->plid) | ||
169 | continue; | ||
170 | |||
171 | if (fileid == BLKIO_THROTL_read_iops_device | ||
172 | && blkiop->ops.blkio_update_group_read_iops_fn) | ||
173 | blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, | ||
174 | blkg, iops); | ||
175 | |||
176 | if (fileid == BLKIO_THROTL_write_iops_device | ||
177 | && blkiop->ops.blkio_update_group_write_iops_fn) | ||
178 | blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, | ||
179 | blkg,iops); | ||
180 | } | ||
181 | } | ||
182 | |||
89 | /* | 183 | /* |
90 | * Add to the appropriate stat variable depending on the request type. | 184 | * Add to the appropriate stat variable depending on the request type. |
91 | * This should be called with the blkg->stats_lock held. | 185 | * This should be called with the blkg->stats_lock held. |
@@ -282,30 +376,47 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
282 | } | 376 | } |
283 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | 377 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); |
284 | 378 | ||
285 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) | 379 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, |
380 | unsigned long unaccounted_time) | ||
286 | { | 381 | { |
287 | unsigned long flags; | 382 | unsigned long flags; |
288 | 383 | ||
289 | spin_lock_irqsave(&blkg->stats_lock, flags); | 384 | spin_lock_irqsave(&blkg->stats_lock, flags); |
290 | blkg->stats.time += time; | 385 | blkg->stats.time += time; |
386 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
387 | blkg->stats.unaccounted_time += unaccounted_time; | ||
388 | #endif | ||
291 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 389 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
292 | } | 390 | } |
293 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | 391 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); |
294 | 392 | ||
393 | /* | ||
394 | * should be called under rcu read lock or queue lock to make sure blkg pointer | ||
395 | * is valid. | ||
396 | */ | ||
295 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 397 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
296 | uint64_t bytes, bool direction, bool sync) | 398 | uint64_t bytes, bool direction, bool sync) |
297 | { | 399 | { |
298 | struct blkio_group_stats *stats; | 400 | struct blkio_group_stats_cpu *stats_cpu; |
299 | unsigned long flags; | 401 | unsigned long flags; |
300 | 402 | ||
301 | spin_lock_irqsave(&blkg->stats_lock, flags); | 403 | /* |
302 | stats = &blkg->stats; | 404 | * Disabling interrupts to provide mutual exclusion between two |
303 | stats->sectors += bytes >> 9; | 405 | * writes on same cpu. It probably is not needed for 64bit. Not |
304 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, | 406 | * optimizing that case yet. |
305 | sync); | 407 | */ |
306 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, | 408 | local_irq_save(flags); |
307 | direction, sync); | 409 | |
308 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 410 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); |
411 | |||
412 | u64_stats_update_begin(&stats_cpu->syncp); | ||
413 | stats_cpu->sectors += bytes >> 9; | ||
414 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], | ||
415 | 1, direction, sync); | ||
416 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], | ||
417 | bytes, direction, sync); | ||
418 | u64_stats_update_end(&stats_cpu->syncp); | ||
419 | local_irq_restore(flags); | ||
309 | } | 420 | } |
310 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | 421 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); |
311 | 422 | ||
@@ -328,20 +439,47 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
328 | } | 439 | } |
329 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | 440 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); |
330 | 441 | ||
442 | /* Merged stats are per cpu. */ | ||
331 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | 443 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, |
332 | bool sync) | 444 | bool sync) |
333 | { | 445 | { |
446 | struct blkio_group_stats_cpu *stats_cpu; | ||
334 | unsigned long flags; | 447 | unsigned long flags; |
335 | 448 | ||
336 | spin_lock_irqsave(&blkg->stats_lock, flags); | 449 | /* |
337 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, | 450 | * Disabling interrupts to provide mutual exclusion between two |
338 | sync); | 451 | * writes on same cpu. It probably is not needed for 64bit. Not |
339 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 452 | * optimizing that case yet. |
453 | */ | ||
454 | local_irq_save(flags); | ||
455 | |||
456 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
457 | |||
458 | u64_stats_update_begin(&stats_cpu->syncp); | ||
459 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, | ||
460 | direction, sync); | ||
461 | u64_stats_update_end(&stats_cpu->syncp); | ||
462 | local_irq_restore(flags); | ||
340 | } | 463 | } |
341 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); | 464 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
342 | 465 | ||
466 | /* | ||
467 | * This function allocates the per cpu stats for blkio_group. Should be called | ||
468 | * from sleepable context as alloc_per_cpu() requires that. | ||
469 | */ | ||
470 | int blkio_alloc_blkg_stats(struct blkio_group *blkg) | ||
471 | { | ||
472 | /* Allocate memory for per cpu stats */ | ||
473 | blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); | ||
474 | if (!blkg->stats_cpu) | ||
475 | return -ENOMEM; | ||
476 | return 0; | ||
477 | } | ||
478 | EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); | ||
479 | |||
343 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 480 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
344 | struct blkio_group *blkg, void *key, dev_t dev) | 481 | struct blkio_group *blkg, void *key, dev_t dev, |
482 | enum blkio_policy_id plid) | ||
345 | { | 483 | { |
346 | unsigned long flags; | 484 | unsigned long flags; |
347 | 485 | ||
@@ -350,6 +488,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
350 | rcu_assign_pointer(blkg->key, key); | 488 | rcu_assign_pointer(blkg->key, key); |
351 | blkg->blkcg_id = css_id(&blkcg->css); | 489 | blkg->blkcg_id = css_id(&blkcg->css); |
352 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 490 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
491 | blkg->plid = plid; | ||
353 | spin_unlock_irqrestore(&blkcg->lock, flags); | 492 | spin_unlock_irqrestore(&blkcg->lock, flags); |
354 | /* Need to take css reference ? */ | 493 | /* Need to take css reference ? */ |
355 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | 494 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); |
@@ -408,49 +547,28 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408 | } | 547 | } |
409 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); | 548 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); |
410 | 549 | ||
411 | #define SHOW_FUNCTION(__VAR) \ | 550 | static void blkio_reset_stats_cpu(struct blkio_group *blkg) |
412 | static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | ||
413 | struct cftype *cftype) \ | ||
414 | { \ | ||
415 | struct blkio_cgroup *blkcg; \ | ||
416 | \ | ||
417 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | ||
418 | return (u64)blkcg->__VAR; \ | ||
419 | } | ||
420 | |||
421 | SHOW_FUNCTION(weight); | ||
422 | #undef SHOW_FUNCTION | ||
423 | |||
424 | static int | ||
425 | blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | ||
426 | { | 551 | { |
427 | struct blkio_cgroup *blkcg; | 552 | struct blkio_group_stats_cpu *stats_cpu; |
428 | struct blkio_group *blkg; | 553 | int i, j, k; |
429 | struct hlist_node *n; | 554 | /* |
430 | struct blkio_policy_type *blkiop; | 555 | * Note: On 64 bit arch this should not be an issue. This has the |
431 | struct blkio_policy_node *pn; | 556 | * possibility of returning some inconsistent value on 32bit arch |
432 | 557 | * as 64bit update on 32bit is non atomic. Taking care of this | |
433 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | 558 | * corner case makes code very complicated, like sending IPIs to |
434 | return -EINVAL; | 559 | * cpus, taking care of stats of offline cpus etc. |
435 | 560 | * | |
436 | blkcg = cgroup_to_blkio_cgroup(cgroup); | 561 | * reset stats is anyway more of a debug feature and this sounds a |
437 | spin_lock(&blkio_list_lock); | 562 | * corner case. So I am not complicating the code yet until and |
438 | spin_lock_irq(&blkcg->lock); | 563 | * unless this becomes a real issue. |
439 | blkcg->weight = (unsigned int)val; | 564 | */ |
440 | 565 | for_each_possible_cpu(i) { | |
441 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 566 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); |
442 | pn = blkio_policy_search_node(blkcg, blkg->dev); | 567 | stats_cpu->sectors = 0; |
443 | 568 | for(j = 0; j < BLKIO_STAT_CPU_NR; j++) | |
444 | if (pn) | 569 | for (k = 0; k < BLKIO_STAT_TOTAL; k++) |
445 | continue; | 570 | stats_cpu->stat_arr_cpu[j][k] = 0; |
446 | |||
447 | list_for_each_entry(blkiop, &blkio_list, list) | ||
448 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
449 | blkcg->weight); | ||
450 | } | 571 | } |
451 | spin_unlock_irq(&blkcg->lock); | ||
452 | spin_unlock(&blkio_list_lock); | ||
453 | return 0; | ||
454 | } | 572 | } |
455 | 573 | ||
456 | static int | 574 | static int |
@@ -497,7 +615,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
497 | } | 615 | } |
498 | #endif | 616 | #endif |
499 | spin_unlock(&blkg->stats_lock); | 617 | spin_unlock(&blkg->stats_lock); |
618 | |||
619 | /* Reset Per cpu stats which don't take blkg->stats_lock */ | ||
620 | blkio_reset_stats_cpu(blkg); | ||
500 | } | 621 | } |
622 | |||
501 | spin_unlock_irq(&blkcg->lock); | 623 | spin_unlock_irq(&blkcg->lock); |
502 | return 0; | 624 | return 0; |
503 | } | 625 | } |
@@ -543,6 +665,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
543 | return val; | 665 | return val; |
544 | } | 666 | } |
545 | 667 | ||
668 | |||
669 | static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, | ||
670 | enum stat_type_cpu type, enum stat_sub_type sub_type) | ||
671 | { | ||
672 | int cpu; | ||
673 | struct blkio_group_stats_cpu *stats_cpu; | ||
674 | u64 val = 0, tval; | ||
675 | |||
676 | for_each_possible_cpu(cpu) { | ||
677 | unsigned int start; | ||
678 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); | ||
679 | |||
680 | do { | ||
681 | start = u64_stats_fetch_begin(&stats_cpu->syncp); | ||
682 | if (type == BLKIO_STAT_CPU_SECTORS) | ||
683 | tval = stats_cpu->sectors; | ||
684 | else | ||
685 | tval = stats_cpu->stat_arr_cpu[type][sub_type]; | ||
686 | } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); | ||
687 | |||
688 | val += tval; | ||
689 | } | ||
690 | |||
691 | return val; | ||
692 | } | ||
693 | |||
694 | static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, | ||
695 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) | ||
696 | { | ||
697 | uint64_t disk_total, val; | ||
698 | char key_str[MAX_KEY_LEN]; | ||
699 | enum stat_sub_type sub_type; | ||
700 | |||
701 | if (type == BLKIO_STAT_CPU_SECTORS) { | ||
702 | val = blkio_read_stat_cpu(blkg, type, 0); | ||
703 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); | ||
704 | } | ||
705 | |||
706 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
707 | sub_type++) { | ||
708 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
709 | val = blkio_read_stat_cpu(blkg, type, sub_type); | ||
710 | cb->fill(cb, key_str, val); | ||
711 | } | ||
712 | |||
713 | disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + | ||
714 | blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); | ||
715 | |||
716 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
717 | cb->fill(cb, key_str, disk_total); | ||
718 | return disk_total; | ||
719 | } | ||
720 | |||
546 | /* This should be called with blkg->stats_lock held */ | 721 | /* This should be called with blkg->stats_lock held */ |
547 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | 722 | static uint64_t blkio_get_stat(struct blkio_group *blkg, |
548 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | 723 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) |
@@ -554,10 +729,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
554 | if (type == BLKIO_STAT_TIME) | 729 | if (type == BLKIO_STAT_TIME) |
555 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | 730 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
556 | blkg->stats.time, cb, dev); | 731 | blkg->stats.time, cb, dev); |
557 | if (type == BLKIO_STAT_SECTORS) | ||
558 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
559 | blkg->stats.sectors, cb, dev); | ||
560 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 732 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
733 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) | ||
734 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
735 | blkg->stats.unaccounted_time, cb, dev); | ||
561 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | 736 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { |
562 | uint64_t sum = blkg->stats.avg_queue_size_sum; | 737 | uint64_t sum = blkg->stats.avg_queue_size_sum; |
563 | uint64_t samples = blkg->stats.avg_queue_size_samples; | 738 | uint64_t samples = blkg->stats.avg_queue_size_samples; |
@@ -593,52 +768,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 | return disk_total; | 768 | return disk_total; |
594 | } | 769 | } |
595 | 770 | ||
596 | #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ | ||
597 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | ||
598 | struct cftype *cftype, struct cgroup_map_cb *cb) \ | ||
599 | { \ | ||
600 | struct blkio_cgroup *blkcg; \ | ||
601 | struct blkio_group *blkg; \ | ||
602 | struct hlist_node *n; \ | ||
603 | uint64_t cgroup_total = 0; \ | ||
604 | \ | ||
605 | if (!cgroup_lock_live_group(cgroup)) \ | ||
606 | return -ENODEV; \ | ||
607 | \ | ||
608 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | ||
609 | rcu_read_lock(); \ | ||
610 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ | ||
611 | if (blkg->dev) { \ | ||
612 | spin_lock_irq(&blkg->stats_lock); \ | ||
613 | cgroup_total += blkio_get_stat(blkg, cb, \ | ||
614 | blkg->dev, type); \ | ||
615 | spin_unlock_irq(&blkg->stats_lock); \ | ||
616 | } \ | ||
617 | } \ | ||
618 | if (show_total) \ | ||
619 | cb->fill(cb, "Total", cgroup_total); \ | ||
620 | rcu_read_unlock(); \ | ||
621 | cgroup_unlock(); \ | ||
622 | return 0; \ | ||
623 | } | ||
624 | |||
625 | SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); | ||
626 | SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); | ||
627 | SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); | ||
628 | SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); | ||
629 | SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); | ||
630 | SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); | ||
631 | SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); | ||
632 | SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); | ||
633 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
634 | SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); | ||
635 | SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); | ||
636 | SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); | ||
637 | SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); | ||
638 | SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); | ||
639 | #endif | ||
640 | #undef SHOW_FUNCTION_PER_GROUP | ||
641 | |||
642 | static int blkio_check_dev_num(dev_t dev) | 771 | static int blkio_check_dev_num(dev_t dev) |
643 | { | 772 | { |
644 | int part = 0; | 773 | int part = 0; |
@@ -652,13 +781,14 @@ static int blkio_check_dev_num(dev_t dev)
652 | } | 781 | } |
653 | 782 | ||
654 | static int blkio_policy_parse_and_set(char *buf, | 783 | static int blkio_policy_parse_and_set(char *buf, |
655 | struct blkio_policy_node *newpn) | 784 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) |
656 | { | 785 | { |
657 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | 786 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; |
658 | int ret; | 787 | int ret; |
659 | unsigned long major, minor, temp; | 788 | unsigned long major, minor, temp; |
660 | int i = 0; | 789 | int i = 0; |
661 | dev_t dev; | 790 | dev_t dev; |
791 | u64 bps, iops; | ||
662 | 792 | ||
663 | memset(s, 0, sizeof(s)); | 793 | memset(s, 0, sizeof(s)); |
664 | 794 | ||
@@ -705,12 +835,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 | if (s[1] == NULL) | 835 | if (s[1] == NULL) |
706 | return -EINVAL; | 836 | return -EINVAL; |
707 | 837 | ||
708 | ret = strict_strtoul(s[1], 10, &temp); | 838 | switch (plid) { |
709 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | 839 | case BLKIO_POLICY_PROP: |
710 | temp > BLKIO_WEIGHT_MAX) | 840 | ret = strict_strtoul(s[1], 10, &temp); |
711 | return -EINVAL; | 841 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || |
842 | temp > BLKIO_WEIGHT_MAX) | ||
843 | return -EINVAL; | ||
712 | 844 | ||
713 | newpn->weight = temp; | 845 | newpn->plid = plid; |
846 | newpn->fileid = fileid; | ||
847 | newpn->val.weight = temp; | ||
848 | break; | ||
849 | case BLKIO_POLICY_THROTL: | ||
850 | switch(fileid) { | ||
851 | case BLKIO_THROTL_read_bps_device: | ||
852 | case BLKIO_THROTL_write_bps_device: | ||
853 | ret = strict_strtoull(s[1], 10, &bps); | ||
854 | if (ret) | ||
855 | return -EINVAL; | ||
856 | |||
857 | newpn->plid = plid; | ||
858 | newpn->fileid = fileid; | ||
859 | newpn->val.bps = bps; | ||
860 | break; | ||
861 | case BLKIO_THROTL_read_iops_device: | ||
862 | case BLKIO_THROTL_write_iops_device: | ||
863 | ret = strict_strtoull(s[1], 10, &iops); | ||
864 | if (ret) | ||
865 | return -EINVAL; | ||
866 | |||
867 | if (iops > THROTL_IOPS_MAX) | ||
868 | return -EINVAL; | ||
869 | |||
870 | newpn->plid = plid; | ||
871 | newpn->fileid = fileid; | ||
872 | newpn->val.iops = (unsigned int)iops; | ||
873 | break; | ||
874 | } | ||
875 | break; | ||
876 | default: | ||
877 | BUG(); | ||
878 | } | ||
714 | 879 | ||
715 | return 0; | 880 | return 0; |
716 | } | 881 | } |
@@ -720,26 +885,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720 | { | 885 | { |
721 | struct blkio_policy_node *pn; | 886 | struct blkio_policy_node *pn; |
722 | 887 | ||
723 | pn = blkio_policy_search_node(blkcg, dev); | 888 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, |
889 | BLKIO_PROP_weight_device); | ||
724 | if (pn) | 890 | if (pn) |
725 | return pn->weight; | 891 | return pn->val.weight; |
726 | else | 892 | else |
727 | return blkcg->weight; | 893 | return blkcg->weight; |
728 | } | 894 | } |
729 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | 895 | EXPORT_SYMBOL_GPL(blkcg_get_weight); |
730 | 896 | ||
897 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
898 | { | ||
899 | struct blkio_policy_node *pn; | ||
900 | |||
901 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
902 | BLKIO_THROTL_read_bps_device); | ||
903 | if (pn) | ||
904 | return pn->val.bps; | ||
905 | else | ||
906 | return -1; | ||
907 | } | ||
908 | |||
909 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
910 | { | ||
911 | struct blkio_policy_node *pn; | ||
912 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
913 | BLKIO_THROTL_write_bps_device); | ||
914 | if (pn) | ||
915 | return pn->val.bps; | ||
916 | else | ||
917 | return -1; | ||
918 | } | ||
919 | |||
920 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
921 | { | ||
922 | struct blkio_policy_node *pn; | ||
923 | |||
924 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
925 | BLKIO_THROTL_read_iops_device); | ||
926 | if (pn) | ||
927 | return pn->val.iops; | ||
928 | else | ||
929 | return -1; | ||
930 | } | ||
931 | |||
932 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
933 | { | ||
934 | struct blkio_policy_node *pn; | ||
935 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
936 | BLKIO_THROTL_write_iops_device); | ||
937 | if (pn) | ||
938 | return pn->val.iops; | ||
939 | else | ||
940 | return -1; | ||
941 | } | ||
942 | |||
943 | /* Checks whether user asked for deleting a policy rule */ | ||
944 | static bool blkio_delete_rule_command(struct blkio_policy_node *pn) | ||
945 | { | ||
946 | switch(pn->plid) { | ||
947 | case BLKIO_POLICY_PROP: | ||
948 | if (pn->val.weight == 0) | ||
949 | return 1; | ||
950 | break; | ||
951 | case BLKIO_POLICY_THROTL: | ||
952 | switch(pn->fileid) { | ||
953 | case BLKIO_THROTL_read_bps_device: | ||
954 | case BLKIO_THROTL_write_bps_device: | ||
955 | if (pn->val.bps == 0) | ||
956 | return 1; | ||
957 | break; | ||
958 | case BLKIO_THROTL_read_iops_device: | ||
959 | case BLKIO_THROTL_write_iops_device: | ||
960 | if (pn->val.iops == 0) | ||
961 | return 1; | ||
962 | } | ||
963 | break; | ||
964 | default: | ||
965 | BUG(); | ||
966 | } | ||
967 | |||
968 | return 0; | ||
969 | } | ||
970 | |||
971 | static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, | ||
972 | struct blkio_policy_node *newpn) | ||
973 | { | ||
974 | switch(oldpn->plid) { | ||
975 | case BLKIO_POLICY_PROP: | ||
976 | oldpn->val.weight = newpn->val.weight; | ||
977 | break; | ||
978 | case BLKIO_POLICY_THROTL: | ||
979 | switch(newpn->fileid) { | ||
980 | case BLKIO_THROTL_read_bps_device: | ||
981 | case BLKIO_THROTL_write_bps_device: | ||
982 | oldpn->val.bps = newpn->val.bps; | ||
983 | break; | ||
984 | case BLKIO_THROTL_read_iops_device: | ||
985 | case BLKIO_THROTL_write_iops_device: | ||
986 | oldpn->val.iops = newpn->val.iops; | ||
987 | } | ||
988 | break; | ||
989 | default: | ||
990 | BUG(); | ||
991 | } | ||
992 | } | ||
993 | |||
994 | /* | ||
995 | * Some rules/values in blkg have changed. Propagate those to respective | ||
996 | * policies. | ||
997 | */ | ||
998 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | ||
999 | struct blkio_group *blkg, struct blkio_policy_node *pn) | ||
1000 | { | ||
1001 | unsigned int weight, iops; | ||
1002 | u64 bps; | ||
1003 | |||
1004 | switch(pn->plid) { | ||
1005 | case BLKIO_POLICY_PROP: | ||
1006 | weight = pn->val.weight ? pn->val.weight : | ||
1007 | blkcg->weight; | ||
1008 | blkio_update_group_weight(blkg, weight); | ||
1009 | break; | ||
1010 | case BLKIO_POLICY_THROTL: | ||
1011 | switch(pn->fileid) { | ||
1012 | case BLKIO_THROTL_read_bps_device: | ||
1013 | case BLKIO_THROTL_write_bps_device: | ||
1014 | bps = pn->val.bps ? pn->val.bps : (-1); | ||
1015 | blkio_update_group_bps(blkg, bps, pn->fileid); | ||
1016 | break; | ||
1017 | case BLKIO_THROTL_read_iops_device: | ||
1018 | case BLKIO_THROTL_write_iops_device: | ||
1019 | iops = pn->val.iops ? pn->val.iops : (-1); | ||
1020 | blkio_update_group_iops(blkg, iops, pn->fileid); | ||
1021 | break; | ||
1022 | } | ||
1023 | break; | ||
1024 | default: | ||
1025 | BUG(); | ||
1026 | } | ||
1027 | } | ||
731 | 1028 | ||
732 | static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | 1029 | /* |
733 | const char *buffer) | 1030 | * A policy node rule has been updated. Propagate this update to all the |
1031 | * block groups which might be affected by this update. | ||
1032 | */ | ||
1033 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, | ||
1034 | struct blkio_policy_node *pn) | ||
1035 | { | ||
1036 | struct blkio_group *blkg; | ||
1037 | struct hlist_node *n; | ||
1038 | |||
1039 | spin_lock(&blkio_list_lock); | ||
1040 | spin_lock_irq(&blkcg->lock); | ||
1041 | |||
1042 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1043 | if (pn->dev != blkg->dev || pn->plid != blkg->plid) | ||
1044 | continue; | ||
1045 | blkio_update_blkg_policy(blkcg, blkg, pn); | ||
1046 | } | ||
1047 | |||
1048 | spin_unlock_irq(&blkcg->lock); | ||
1049 | spin_unlock(&blkio_list_lock); | ||
1050 | } | ||
1051 | |||
1052 | static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, | ||
1053 | const char *buffer) | ||
734 | { | 1054 | { |
735 | int ret = 0; | 1055 | int ret = 0; |
736 | char *buf; | 1056 | char *buf; |
737 | struct blkio_policy_node *newpn, *pn; | 1057 | struct blkio_policy_node *newpn, *pn; |
738 | struct blkio_cgroup *blkcg; | 1058 | struct blkio_cgroup *blkcg; |
739 | struct blkio_group *blkg; | ||
740 | int keep_newpn = 0; | 1059 | int keep_newpn = 0; |
741 | struct hlist_node *n; | 1060 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
742 | struct blkio_policy_type *blkiop; | 1061 | int fileid = BLKIOFILE_ATTR(cft->private); |
743 | 1062 | ||
744 | buf = kstrdup(buffer, GFP_KERNEL); | 1063 | buf = kstrdup(buffer, GFP_KERNEL); |
745 | if (!buf) | 1064 | if (!buf) |
@@ -751,7 +1070,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 | goto free_buf; | 1070 | goto free_buf; |
752 | } | 1071 | } |
753 | 1072 | ||
754 | ret = blkio_policy_parse_and_set(buf, newpn); | 1073 | ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); |
755 | if (ret) | 1074 | if (ret) |
756 | goto free_newpn; | 1075 | goto free_newpn; |
757 | 1076 | ||
@@ -759,9 +1078,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 | 1078 | ||
760 | spin_lock_irq(&blkcg->lock); | 1079 | spin_lock_irq(&blkcg->lock); |
761 | 1080 | ||
762 | pn = blkio_policy_search_node(blkcg, newpn->dev); | 1081 | pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); |
763 | if (!pn) { | 1082 | if (!pn) { |
764 | if (newpn->weight != 0) { | 1083 | if (!blkio_delete_rule_command(newpn)) { |
765 | blkio_policy_insert_node(blkcg, newpn); | 1084 | blkio_policy_insert_node(blkcg, newpn); |
766 | keep_newpn = 1; | 1085 | keep_newpn = 1; |
767 | } | 1086 | } |
@@ -769,33 +1088,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 | goto update_io_group; | 1088 | goto update_io_group; |
770 | } | 1089 | } |
771 | 1090 | ||
772 | if (newpn->weight == 0) { | 1091 | if (blkio_delete_rule_command(newpn)) { |
773 | /* weight == 0 means deleteing a specific weight */ | ||
774 | blkio_policy_delete_node(pn); | 1092 | blkio_policy_delete_node(pn); |
775 | spin_unlock_irq(&blkcg->lock); | 1093 | spin_unlock_irq(&blkcg->lock); |
776 | goto update_io_group; | 1094 | goto update_io_group; |
777 | } | 1095 | } |
778 | spin_unlock_irq(&blkcg->lock); | 1096 | spin_unlock_irq(&blkcg->lock); |
779 | 1097 | ||
780 | pn->weight = newpn->weight; | 1098 | blkio_update_policy_rule(pn, newpn); |
781 | 1099 | ||
782 | update_io_group: | 1100 | update_io_group: |
783 | /* update weight for each cfqg */ | 1101 | blkio_update_policy_node_blkg(blkcg, newpn); |
784 | spin_lock(&blkio_list_lock); | ||
785 | spin_lock_irq(&blkcg->lock); | ||
786 | |||
787 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
788 | if (newpn->dev == blkg->dev) { | ||
789 | list_for_each_entry(blkiop, &blkio_list, list) | ||
790 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
791 | newpn->weight ? | ||
792 | newpn->weight : | ||
793 | blkcg->weight); | ||
794 | } | ||
795 | } | ||
796 | |||
797 | spin_unlock_irq(&blkcg->lock); | ||
798 | spin_unlock(&blkio_list_lock); | ||
799 | 1102 | ||
800 | free_newpn: | 1103 | free_newpn: |
801 | if (!keep_newpn) | 1104 | if (!keep_newpn) |
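
The shared blkiocg_file_write() handler above, together with the blkiocg_file_read() and blkiocg_file_read_map() readers added in the next hunk, dispatches on the policy and attribute packed into cft->private by BLKIOFILE_PRIVATE(). The cftype table itself lies outside the hunks shown here; a hedged sketch of how one throttle entry would be wired up (field names modelled on the cgroup cftype of that era, not copied from this diff):

    /* Hypothetical blkio_files[] entry: encode (policy, attribute) into
     * ->private so the shared handlers can recover them with
     * BLKIOFILE_POLICY() and BLKIOFILE_ATTR(). */
    static struct cftype demo_throttle_read_bps = {
            .name = "throttle.read_bps_device",
            .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                         BLKIO_THROTL_read_bps_device),
            .read_seq_string = blkiocg_file_read,
            .write_string = blkiocg_file_write,
            .max_write_len = 256,
    };
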
@@ -805,23 +1108,264 @@ free_buf:
805 | return ret; | 1108 | return ret; |
806 | } | 1109 | } |
807 | 1110 | ||
808 | static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | 1111 | static void |
809 | struct seq_file *m) | 1112 | blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) |
810 | { | 1113 | { |
811 | struct blkio_cgroup *blkcg; | 1114 | switch(pn->plid) { |
812 | struct blkio_policy_node *pn; | 1115 | case BLKIO_POLICY_PROP: |
1116 | if (pn->fileid == BLKIO_PROP_weight_device) | ||
1117 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1118 | MINOR(pn->dev), pn->val.weight); | ||
1119 | break; | ||
1120 | case BLKIO_POLICY_THROTL: | ||
1121 | switch(pn->fileid) { | ||
1122 | case BLKIO_THROTL_read_bps_device: | ||
1123 | case BLKIO_THROTL_write_bps_device: | ||
1124 | seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), | ||
1125 | MINOR(pn->dev), pn->val.bps); | ||
1126 | break; | ||
1127 | case BLKIO_THROTL_read_iops_device: | ||
1128 | case BLKIO_THROTL_write_iops_device: | ||
1129 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1130 | MINOR(pn->dev), pn->val.iops); | ||
1131 | break; | ||
1132 | } | ||
1133 | break; | ||
1134 | default: | ||
1135 | BUG(); | ||
1136 | } | ||
1137 | } | ||
813 | 1138 | ||
814 | seq_printf(m, "dev\tweight\n"); | 1139 | /* cgroup files which read their data from policy nodes end up here */ |
1140 | static void blkio_read_policy_node_files(struct cftype *cft, | ||
1141 | struct blkio_cgroup *blkcg, struct seq_file *m) | ||
1142 | { | ||
1143 | struct blkio_policy_node *pn; | ||
815 | 1144 | ||
816 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
817 | if (!list_empty(&blkcg->policy_list)) { | 1145 | if (!list_empty(&blkcg->policy_list)) { |
818 | spin_lock_irq(&blkcg->lock); | 1146 | spin_lock_irq(&blkcg->lock); |
819 | list_for_each_entry(pn, &blkcg->policy_list, node) { | 1147 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
820 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | 1148 | if (!pn_matches_cftype(cft, pn)) |
821 | MINOR(pn->dev), pn->weight); | 1149 | continue; |
1150 | blkio_print_policy_node(m, pn); | ||
822 | } | 1151 | } |
823 | spin_unlock_irq(&blkcg->lock); | 1152 | spin_unlock_irq(&blkcg->lock); |
824 | } | 1153 | } |
1154 | } | ||
1155 | |||
1156 | static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, | ||
1157 | struct seq_file *m) | ||
1158 | { | ||
1159 | struct blkio_cgroup *blkcg; | ||
1160 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1161 | int name = BLKIOFILE_ATTR(cft->private); | ||
1162 | |||
1163 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1164 | |||
1165 | switch(plid) { | ||
1166 | case BLKIO_POLICY_PROP: | ||
1167 | switch(name) { | ||
1168 | case BLKIO_PROP_weight_device: | ||
1169 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1170 | return 0; | ||
1171 | default: | ||
1172 | BUG(); | ||
1173 | } | ||
1174 | break; | ||
1175 | case BLKIO_POLICY_THROTL: | ||
1176 | switch(name){ | ||
1177 | case BLKIO_THROTL_read_bps_device: | ||
1178 | case BLKIO_THROTL_write_bps_device: | ||
1179 | case BLKIO_THROTL_read_iops_device: | ||
1180 | case BLKIO_THROTL_write_iops_device: | ||
1181 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1182 | return 0; | ||
1183 | default: | ||
1184 | BUG(); | ||
1185 | } | ||
1186 | break; | ||
1187 | default: | ||
1188 | BUG(); | ||
1189 | } | ||
1190 | |||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | ||
1195 | struct cftype *cft, struct cgroup_map_cb *cb, | ||
1196 | enum stat_type type, bool show_total, bool pcpu) | ||
1197 | { | ||
1198 | struct blkio_group *blkg; | ||
1199 | struct hlist_node *n; | ||
1200 | uint64_t cgroup_total = 0; | ||
1201 | |||
1202 | rcu_read_lock(); | ||
1203 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1204 | if (blkg->dev) { | ||
1205 | if (!cftype_blkg_same_policy(cft, blkg)) | ||
1206 | continue; | ||
1207 | if (pcpu) | ||
1208 | cgroup_total += blkio_get_stat_cpu(blkg, cb, | ||
1209 | blkg->dev, type); | ||
1210 | else { | ||
1211 | spin_lock_irq(&blkg->stats_lock); | ||
1212 | cgroup_total += blkio_get_stat(blkg, cb, | ||
1213 | blkg->dev, type); | ||
1214 | spin_unlock_irq(&blkg->stats_lock); | ||
1215 | } | ||
1216 | } | ||
1217 | } | ||
1218 | if (show_total) | ||
1219 | cb->fill(cb, "Total", cgroup_total); | ||
1220 | rcu_read_unlock(); | ||
1221 | return 0; | ||
1222 | } | ||
1223 | |||
1224 | /* All map kind of cgroup file get serviced by this function */ | ||
1225 | static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | ||
1226 | struct cgroup_map_cb *cb) | ||
1227 | { | ||
1228 | struct blkio_cgroup *blkcg; | ||
1229 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1230 | int name = BLKIOFILE_ATTR(cft->private); | ||
1231 | |||
1232 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1233 | |||
1234 | switch(plid) { | ||
1235 | case BLKIO_POLICY_PROP: | ||
1236 | switch(name) { | ||
1237 | case BLKIO_PROP_time: | ||
1238 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1239 | BLKIO_STAT_TIME, 0, 0); | ||
1240 | case BLKIO_PROP_sectors: | ||
1241 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1242 | BLKIO_STAT_CPU_SECTORS, 0, 1); | ||
1243 | case BLKIO_PROP_io_service_bytes: | ||
1244 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1245 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1246 | case BLKIO_PROP_io_serviced: | ||
1247 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1248 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1249 | case BLKIO_PROP_io_service_time: | ||
1250 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1251 | BLKIO_STAT_SERVICE_TIME, 1, 0); | ||
1252 | case BLKIO_PROP_io_wait_time: | ||
1253 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1254 | BLKIO_STAT_WAIT_TIME, 1, 0); | ||
1255 | case BLKIO_PROP_io_merged: | ||
1256 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1257 | BLKIO_STAT_CPU_MERGED, 1, 1); | ||
1258 | case BLKIO_PROP_io_queued: | ||
1259 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1260 | BLKIO_STAT_QUEUED, 1, 0); | ||
1261 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1262 | case BLKIO_PROP_unaccounted_time: | ||
1263 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1264 | BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); | ||
1265 | case BLKIO_PROP_dequeue: | ||
1266 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1267 | BLKIO_STAT_DEQUEUE, 0, 0); | ||
1268 | case BLKIO_PROP_avg_queue_size: | ||
1269 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1270 | BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); | ||
1271 | case BLKIO_PROP_group_wait_time: | ||
1272 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1273 | BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); | ||
1274 | case BLKIO_PROP_idle_time: | ||
1275 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1276 | BLKIO_STAT_IDLE_TIME, 0, 0); | ||
1277 | case BLKIO_PROP_empty_time: | ||
1278 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1279 | BLKIO_STAT_EMPTY_TIME, 0, 0); | ||
1280 | #endif | ||
1281 | default: | ||
1282 | BUG(); | ||
1283 | } | ||
1284 | break; | ||
1285 | case BLKIO_POLICY_THROTL: | ||
1286 | switch(name){ | ||
1287 | case BLKIO_THROTL_io_service_bytes: | ||
1288 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1289 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1290 | case BLKIO_THROTL_io_serviced: | ||
1291 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1292 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1293 | default: | ||
1294 | BUG(); | ||
1295 | } | ||
1296 | break; | ||
1297 | default: | ||
1298 | BUG(); | ||
1299 | } | ||
1300 | |||
1301 | return 0; | ||
1302 | } | ||
1303 | |||
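The shared read/write handlers above demultiplex on cft->private via BLKIOFILE_POLICY() and BLKIOFILE_ATTR(). A minimal sketch of that encoding, assuming a 16-bit split (the actual macro definitions live elsewhere in blk-cgroup.c and are not part of this hunk):

	/* assumed layout: policy id in the high 16 bits, file attribute in the low 16 */
	#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
	#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
	#define BLKIOFILE_ATTR(val)		((val) & 0xffff)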
1304 | static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) | ||
1305 | { | ||
1306 | struct blkio_group *blkg; | ||
1307 | struct hlist_node *n; | ||
1308 | struct blkio_policy_node *pn; | ||
1309 | |||
1310 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | ||
1311 | return -EINVAL; | ||
1312 | |||
1313 | spin_lock(&blkio_list_lock); | ||
1314 | spin_lock_irq(&blkcg->lock); | ||
1315 | blkcg->weight = (unsigned int)val; | ||
1316 | |||
1317 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1318 | pn = blkio_policy_search_node(blkcg, blkg->dev, | ||
1319 | BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); | ||
1320 | if (pn) | ||
1321 | continue; | ||
1322 | |||
1323 | blkio_update_group_weight(blkg, blkcg->weight); | ||
1324 | } | ||
1325 | spin_unlock_irq(&blkcg->lock); | ||
1326 | spin_unlock(&blkio_list_lock); | ||
1327 | return 0; | ||
1328 | } | ||
1329 | |||
1330 | static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { | ||
1331 | struct blkio_cgroup *blkcg; | ||
1332 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1333 | int name = BLKIOFILE_ATTR(cft->private); | ||
1334 | |||
1335 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1336 | |||
1337 | switch(plid) { | ||
1338 | case BLKIO_POLICY_PROP: | ||
1339 | switch(name) { | ||
1340 | case BLKIO_PROP_weight: | ||
1341 | return (u64)blkcg->weight; | ||
1342 | } | ||
1343 | break; | ||
1344 | default: | ||
1345 | BUG(); | ||
1346 | } | ||
1347 | return 0; | ||
1348 | } | ||
1349 | |||
1350 | static int | ||
1351 | blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
1352 | { | ||
1353 | struct blkio_cgroup *blkcg; | ||
1354 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1355 | int name = BLKIOFILE_ATTR(cft->private); | ||
1356 | |||
1357 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1358 | |||
1359 | switch(plid) { | ||
1360 | case BLKIO_POLICY_PROP: | ||
1361 | switch(name) { | ||
1362 | case BLKIO_PROP_weight: | ||
1363 | return blkio_weight_write(blkcg, val); | ||
1364 | } | ||
1365 | break; | ||
1366 | default: | ||
1367 | BUG(); | ||
1368 | } | ||
825 | 1369 | ||
826 | return 0; | 1370 | return 0; |
827 | } | 1371 | } |
@@ -829,71 +1373,157 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | |||
829 | struct cftype blkio_files[] = { | 1373 | struct cftype blkio_files[] = { |
830 | { | 1374 | { |
831 | .name = "weight_device", | 1375 | .name = "weight_device", |
832 | .read_seq_string = blkiocg_weight_device_read, | 1376 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
833 | .write_string = blkiocg_weight_device_write, | 1377 | BLKIO_PROP_weight_device), |
1378 | .read_seq_string = blkiocg_file_read, | ||
1379 | .write_string = blkiocg_file_write, | ||
834 | .max_write_len = 256, | 1380 | .max_write_len = 256, |
835 | }, | 1381 | }, |
836 | { | 1382 | { |
837 | .name = "weight", | 1383 | .name = "weight", |
838 | .read_u64 = blkiocg_weight_read, | 1384 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
839 | .write_u64 = blkiocg_weight_write, | 1385 | BLKIO_PROP_weight), |
1386 | .read_u64 = blkiocg_file_read_u64, | ||
1387 | .write_u64 = blkiocg_file_write_u64, | ||
840 | }, | 1388 | }, |
841 | { | 1389 | { |
842 | .name = "time", | 1390 | .name = "time", |
843 | .read_map = blkiocg_time_read, | 1391 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1392 | BLKIO_PROP_time), | ||
1393 | .read_map = blkiocg_file_read_map, | ||
844 | }, | 1394 | }, |
845 | { | 1395 | { |
846 | .name = "sectors", | 1396 | .name = "sectors", |
847 | .read_map = blkiocg_sectors_read, | 1397 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1398 | BLKIO_PROP_sectors), | ||
1399 | .read_map = blkiocg_file_read_map, | ||
848 | }, | 1400 | }, |
849 | { | 1401 | { |
850 | .name = "io_service_bytes", | 1402 | .name = "io_service_bytes", |
851 | .read_map = blkiocg_io_service_bytes_read, | 1403 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1404 | BLKIO_PROP_io_service_bytes), | ||
1405 | .read_map = blkiocg_file_read_map, | ||
852 | }, | 1406 | }, |
853 | { | 1407 | { |
854 | .name = "io_serviced", | 1408 | .name = "io_serviced", |
855 | .read_map = blkiocg_io_serviced_read, | 1409 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1410 | BLKIO_PROP_io_serviced), | ||
1411 | .read_map = blkiocg_file_read_map, | ||
856 | }, | 1412 | }, |
857 | { | 1413 | { |
858 | .name = "io_service_time", | 1414 | .name = "io_service_time", |
859 | .read_map = blkiocg_io_service_time_read, | 1415 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1416 | BLKIO_PROP_io_service_time), | ||
1417 | .read_map = blkiocg_file_read_map, | ||
860 | }, | 1418 | }, |
861 | { | 1419 | { |
862 | .name = "io_wait_time", | 1420 | .name = "io_wait_time", |
863 | .read_map = blkiocg_io_wait_time_read, | 1421 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1422 | BLKIO_PROP_io_wait_time), | ||
1423 | .read_map = blkiocg_file_read_map, | ||
864 | }, | 1424 | }, |
865 | { | 1425 | { |
866 | .name = "io_merged", | 1426 | .name = "io_merged", |
867 | .read_map = blkiocg_io_merged_read, | 1427 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1428 | BLKIO_PROP_io_merged), | ||
1429 | .read_map = blkiocg_file_read_map, | ||
868 | }, | 1430 | }, |
869 | { | 1431 | { |
870 | .name = "io_queued", | 1432 | .name = "io_queued", |
871 | .read_map = blkiocg_io_queued_read, | 1433 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1434 | BLKIO_PROP_io_queued), | ||
1435 | .read_map = blkiocg_file_read_map, | ||
872 | }, | 1436 | }, |
873 | { | 1437 | { |
874 | .name = "reset_stats", | 1438 | .name = "reset_stats", |
875 | .write_u64 = blkiocg_reset_stats, | 1439 | .write_u64 = blkiocg_reset_stats, |
876 | }, | 1440 | }, |
1441 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
1442 | { | ||
1443 | .name = "throttle.read_bps_device", | ||
1444 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1445 | BLKIO_THROTL_read_bps_device), | ||
1446 | .read_seq_string = blkiocg_file_read, | ||
1447 | .write_string = blkiocg_file_write, | ||
1448 | .max_write_len = 256, | ||
1449 | }, | ||
1450 | |||
1451 | { | ||
1452 | .name = "throttle.write_bps_device", | ||
1453 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1454 | BLKIO_THROTL_write_bps_device), | ||
1455 | .read_seq_string = blkiocg_file_read, | ||
1456 | .write_string = blkiocg_file_write, | ||
1457 | .max_write_len = 256, | ||
1458 | }, | ||
1459 | |||
1460 | { | ||
1461 | .name = "throttle.read_iops_device", | ||
1462 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1463 | BLKIO_THROTL_read_iops_device), | ||
1464 | .read_seq_string = blkiocg_file_read, | ||
1465 | .write_string = blkiocg_file_write, | ||
1466 | .max_write_len = 256, | ||
1467 | }, | ||
1468 | |||
1469 | { | ||
1470 | .name = "throttle.write_iops_device", | ||
1471 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1472 | BLKIO_THROTL_write_iops_device), | ||
1473 | .read_seq_string = blkiocg_file_read, | ||
1474 | .write_string = blkiocg_file_write, | ||
1475 | .max_write_len = 256, | ||
1476 | }, | ||
1477 | { | ||
1478 | .name = "throttle.io_service_bytes", | ||
1479 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1480 | BLKIO_THROTL_io_service_bytes), | ||
1481 | .read_map = blkiocg_file_read_map, | ||
1482 | }, | ||
1483 | { | ||
1484 | .name = "throttle.io_serviced", | ||
1485 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1486 | BLKIO_THROTL_io_serviced), | ||
1487 | .read_map = blkiocg_file_read_map, | ||
1488 | }, | ||
1489 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
1490 | |||
877 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1491 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
878 | { | 1492 | { |
879 | .name = "avg_queue_size", | 1493 | .name = "avg_queue_size", |
880 | .read_map = blkiocg_avg_queue_size_read, | 1494 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1495 | BLKIO_PROP_avg_queue_size), | ||
1496 | .read_map = blkiocg_file_read_map, | ||
881 | }, | 1497 | }, |
882 | { | 1498 | { |
883 | .name = "group_wait_time", | 1499 | .name = "group_wait_time", |
884 | .read_map = blkiocg_group_wait_time_read, | 1500 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1501 | BLKIO_PROP_group_wait_time), | ||
1502 | .read_map = blkiocg_file_read_map, | ||
885 | }, | 1503 | }, |
886 | { | 1504 | { |
887 | .name = "idle_time", | 1505 | .name = "idle_time", |
888 | .read_map = blkiocg_idle_time_read, | 1506 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1507 | BLKIO_PROP_idle_time), | ||
1508 | .read_map = blkiocg_file_read_map, | ||
889 | }, | 1509 | }, |
890 | { | 1510 | { |
891 | .name = "empty_time", | 1511 | .name = "empty_time", |
892 | .read_map = blkiocg_empty_time_read, | 1512 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1513 | BLKIO_PROP_empty_time), | ||
1514 | .read_map = blkiocg_file_read_map, | ||
893 | }, | 1515 | }, |
894 | { | 1516 | { |
895 | .name = "dequeue", | 1517 | .name = "dequeue", |
896 | .read_map = blkiocg_dequeue_read, | 1518 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1519 | BLKIO_PROP_dequeue), | ||
1520 | .read_map = blkiocg_file_read_map, | ||
1521 | }, | ||
1522 | { | ||
1523 | .name = "unaccounted_time", | ||
1524 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1525 | BLKIO_PROP_unaccounted_time), | ||
1526 | .read_map = blkiocg_file_read_map, | ||
897 | }, | 1527 | }, |
898 | #endif | 1528 | #endif |
899 | }; | 1529 | }; |
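For illustration, a hedged userspace sketch of exercising the new throttle.* files; the blkio mount point and group name are assumptions, and the "<major>:<minor> <value>" rule format follows blkio-controller.txt:

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical path: blkio controller mounted at /cgroup/blkio, group "grp1" */
		FILE *f = fopen("/cgroup/blkio/grp1/blkio.throttle.read_bps_device", "w");

		if (!f)
			return 1;
		fprintf(f, "8:16 1048576\n");	/* limit reads on device 8:16 to 1 MB/s */
		return fclose(f) ? 1 : 0;
	}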
@@ -932,13 +1562,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
932 | /* | 1562 | /* |
933 | * This blkio_group is being unlinked as associated cgroup is | 1563 | * This blkio_group is being unlinked as associated cgroup is |
934 | * going away. Let all the IO controlling policies know about | 1564 | * going away. Let all the IO controlling policies know about |
935 | * this event. Currently this is static call to one io | 1565 | * this event. |
936 | * controlling policy. Once we have more policies in place, we | ||
937 | * need some dynamic registration of callback function. | ||
938 | */ | 1566 | */ |
939 | spin_lock(&blkio_list_lock); | 1567 | spin_lock(&blkio_list_lock); |
940 | list_for_each_entry(blkiop, &blkio_list, list) | 1568 | list_for_each_entry(blkiop, &blkio_list, list) { |
1569 | if (blkiop->plid != blkg->plid) | ||
1570 | continue; | ||
941 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | 1571 | blkiop->ops.blkio_unlink_group_fn(key, blkg); |
1572 | } | ||
942 | spin_unlock(&blkio_list_lock); | 1573 | spin_unlock(&blkio_list_lock); |
943 | } while (1); | 1574 | } while (1); |
944 | 1575 | ||
@@ -964,10 +1595,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
964 | goto done; | 1595 | goto done; |
965 | } | 1596 | } |
966 | 1597 | ||
967 | /* Currently we do not support hierarchy deeper than two level (0,1) */ | ||
968 | if (parent != cgroup->top_cgroup) | ||
969 | return ERR_PTR(-EPERM); | ||
970 | |||
971 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); | 1598 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); |
972 | if (!blkcg) | 1599 | if (!blkcg) |
973 | return ERR_PTR(-ENOMEM); | 1600 | return ERR_PTR(-ENOMEM); |
@@ -987,9 +1614,7 @@ done: | |||
987 | * of the main cic data structures. For now we allow a task to change | 1614 | * of the main cic data structures. For now we allow a task to change |
988 | * its cgroup only if it's the only owner of its ioc. | 1615 | * its cgroup only if it's the only owner of its ioc. |
989 | */ | 1616 | */ |
990 | static int blkiocg_can_attach(struct cgroup_subsys *subsys, | 1617 | static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
991 | struct cgroup *cgroup, struct task_struct *tsk, | ||
992 | bool threadgroup) | ||
993 | { | 1618 | { |
994 | struct io_context *ioc; | 1619 | struct io_context *ioc; |
995 | int ret = 0; | 1620 | int ret = 0; |
@@ -1004,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, | |||
1004 | return ret; | 1629 | return ret; |
1005 | } | 1630 | } |
1006 | 1631 | ||
1007 | static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, | 1632 | static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1008 | struct cgroup *prev, struct task_struct *tsk, | ||
1009 | bool threadgroup) | ||
1010 | { | 1633 | { |
1011 | struct io_context *ioc; | 1634 | struct io_context *ioc; |
1012 | 1635 | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..a71d2904ffb9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -14,6 +14,15 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | ||
18 | |||
19 | enum blkio_policy_id { | ||
20 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ | ||
21 | BLKIO_POLICY_THROTL, /* Throttling */ | ||
22 | }; | ||
23 | |||
24 | /* Max limits for throttle policy */ | ||
25 | #define THROTL_IOPS_MAX UINT_MAX | ||
17 | 26 | ||
18 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 27 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
19 | 28 | ||
@@ -28,20 +37,15 @@ enum stat_type { | |||
28 | * request completion for IOs done by this cgroup. This may not be | 37 | * request completion for IOs done by this cgroup. This may not be |
29 | * accurate when NCQ is turned on. */ | 38 | * accurate when NCQ is turned on. */ |
30 | BLKIO_STAT_SERVICE_TIME = 0, | 39 | BLKIO_STAT_SERVICE_TIME = 0, |
31 | /* Total bytes transferred */ | ||
32 | BLKIO_STAT_SERVICE_BYTES, | ||
33 | /* Total IOs serviced, post merge */ | ||
34 | BLKIO_STAT_SERVICED, | ||
35 | /* Total time spent waiting in scheduler queue in ns */ | 40 | /* Total time spent waiting in scheduler queue in ns */ |
36 | BLKIO_STAT_WAIT_TIME, | 41 | BLKIO_STAT_WAIT_TIME, |
37 | /* Number of IOs merged */ | ||
38 | BLKIO_STAT_MERGED, | ||
39 | /* Number of IOs queued up */ | 42 | /* Number of IOs queued up */ |
40 | BLKIO_STAT_QUEUED, | 43 | BLKIO_STAT_QUEUED, |
41 | /* All the single valued stats go below this */ | 44 | /* All the single valued stats go below this */ |
42 | BLKIO_STAT_TIME, | 45 | BLKIO_STAT_TIME, |
43 | BLKIO_STAT_SECTORS, | ||
44 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 46 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
47 | /* Time not charged to this cgroup */ | ||
48 | BLKIO_STAT_UNACCOUNTED_TIME, | ||
45 | BLKIO_STAT_AVG_QUEUE_SIZE, | 49 | BLKIO_STAT_AVG_QUEUE_SIZE, |
46 | BLKIO_STAT_IDLE_TIME, | 50 | BLKIO_STAT_IDLE_TIME, |
47 | BLKIO_STAT_EMPTY_TIME, | 51 | BLKIO_STAT_EMPTY_TIME, |
@@ -50,6 +54,18 @@ enum stat_type { | |||
50 | #endif | 54 | #endif |
51 | }; | 55 | }; |
52 | 56 | ||
57 | /* Per cpu stats */ | ||
58 | enum stat_type_cpu { | ||
59 | BLKIO_STAT_CPU_SECTORS, | ||
60 | /* Total bytes transferred */ | ||
61 | BLKIO_STAT_CPU_SERVICE_BYTES, | ||
62 | /* Total IOs serviced, post merge */ | ||
63 | BLKIO_STAT_CPU_SERVICED, | ||
64 | /* Number of IOs merged */ | ||
65 | BLKIO_STAT_CPU_MERGED, | ||
66 | BLKIO_STAT_CPU_NR | ||
67 | }; | ||
68 | |||
53 | enum stat_sub_type { | 69 | enum stat_sub_type { |
54 | BLKIO_STAT_READ = 0, | 70 | BLKIO_STAT_READ = 0, |
55 | BLKIO_STAT_WRITE, | 71 | BLKIO_STAT_WRITE, |
@@ -65,6 +81,36 @@ enum blkg_state_flags { | |||
65 | BLKG_empty, | 81 | BLKG_empty, |
66 | }; | 82 | }; |
67 | 83 | ||
84 | /* cgroup files owned by proportional weight policy */ | ||
85 | enum blkcg_file_name_prop { | ||
86 | BLKIO_PROP_weight = 1, | ||
87 | BLKIO_PROP_weight_device, | ||
88 | BLKIO_PROP_io_service_bytes, | ||
89 | BLKIO_PROP_io_serviced, | ||
90 | BLKIO_PROP_time, | ||
91 | BLKIO_PROP_sectors, | ||
92 | BLKIO_PROP_unaccounted_time, | ||
93 | BLKIO_PROP_io_service_time, | ||
94 | BLKIO_PROP_io_wait_time, | ||
95 | BLKIO_PROP_io_merged, | ||
96 | BLKIO_PROP_io_queued, | ||
97 | BLKIO_PROP_avg_queue_size, | ||
98 | BLKIO_PROP_group_wait_time, | ||
99 | BLKIO_PROP_idle_time, | ||
100 | BLKIO_PROP_empty_time, | ||
101 | BLKIO_PROP_dequeue, | ||
102 | }; | ||
103 | |||
104 | /* cgroup files owned by throttle policy */ | ||
105 | enum blkcg_file_name_throtl { | ||
106 | BLKIO_THROTL_read_bps_device, | ||
107 | BLKIO_THROTL_write_bps_device, | ||
108 | BLKIO_THROTL_read_iops_device, | ||
109 | BLKIO_THROTL_write_iops_device, | ||
110 | BLKIO_THROTL_io_service_bytes, | ||
111 | BLKIO_THROTL_io_serviced, | ||
112 | }; | ||
113 | |||
68 | struct blkio_cgroup { | 114 | struct blkio_cgroup { |
69 | struct cgroup_subsys_state css; | 115 | struct cgroup_subsys_state css; |
70 | unsigned int weight; | 116 | unsigned int weight; |
@@ -76,9 +122,11 @@ struct blkio_cgroup { | |||
76 | struct blkio_group_stats { | 122 | struct blkio_group_stats { |
77 | /* total disk time and nr sectors dispatched by this group */ | 123 | /* total disk time and nr sectors dispatched by this group */ |
78 | uint64_t time; | 124 | uint64_t time; |
79 | uint64_t sectors; | ||
80 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | 125 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; |
81 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 126 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
127 | /* Time not charged to this cgroup */ | ||
128 | uint64_t unaccounted_time; | ||
129 | |||
82 | /* Sum of number of IOs queued across all samples */ | 130 | /* Sum of number of IOs queued across all samples */ |
83 | uint64_t avg_queue_size_sum; | 131 | uint64_t avg_queue_size_sum; |
84 | /* Count of samples taken for average */ | 132 | /* Count of samples taken for average */ |
@@ -103,6 +151,13 @@ struct blkio_group_stats { | |||
103 | #endif | 151 | #endif |
104 | }; | 152 | }; |
105 | 153 | ||
154 | /* Per cpu blkio group stats */ | ||
155 | struct blkio_group_stats_cpu { | ||
156 | uint64_t sectors; | ||
157 | uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; | ||
158 | struct u64_stats_sync syncp; | ||
159 | }; | ||
160 | |||
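A hedged sketch of how such a per-cpu counter would typically be updated, assuming stats_cpu was allocated with alloc_percpu(); the helper name is illustrative and not taken from this patch:

	static void example_add_sectors(struct blkio_group *blkg, unsigned long sectors)
	{
		struct blkio_group_stats_cpu *stats_cpu;
		unsigned long flags;

		/* per-cpu counter: no stats_lock needed, readers rely on syncp */
		local_irq_save(flags);
		stats_cpu = this_cpu_ptr(blkg->stats_cpu);
		u64_stats_update_begin(&stats_cpu->syncp);
		stats_cpu->sectors += sectors;
		u64_stats_update_end(&stats_cpu->syncp);
		local_irq_restore(flags);
	}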
106 | struct blkio_group { | 161 | struct blkio_group { |
107 | /* An rcu protected unique identifier for the group */ | 162 | /* An rcu protected unique identifier for the group */ |
108 | void *key; | 163 | void *key; |
@@ -112,33 +167,73 @@ struct blkio_group { | |||
112 | char path[128]; | 167 | char path[128]; |
113 | /* The device MKDEV(major, minor), this group has been created for */ | 168 | /* The device MKDEV(major, minor), this group has been created for */ |
114 | dev_t dev; | 169 | dev_t dev; |
170 | /* policy which owns this blk group */ | ||
171 | enum blkio_policy_id plid; | ||
115 | 172 | ||
116 | /* Need to serialize the stats in the case of reset/update */ | 173 | /* Need to serialize the stats in the case of reset/update */ |
117 | spinlock_t stats_lock; | 174 | spinlock_t stats_lock; |
118 | struct blkio_group_stats stats; | 175 | struct blkio_group_stats stats; |
176 | /* Per cpu stats pointer */ | ||
177 | struct blkio_group_stats_cpu __percpu *stats_cpu; | ||
119 | }; | 178 | }; |
120 | 179 | ||
121 | struct blkio_policy_node { | 180 | struct blkio_policy_node { |
122 | struct list_head node; | 181 | struct list_head node; |
123 | dev_t dev; | 182 | dev_t dev; |
124 | unsigned int weight; | 183 | /* This node belongs to max bw policy or proportional weight policy */ |
184 | enum blkio_policy_id plid; | ||
185 | /* cgroup file to which this rule belongs to */ | ||
186 | int fileid; | ||
187 | |||
188 | union { | ||
189 | unsigned int weight; | ||
190 | /* | ||
191 | * Rate read/write in terms of bytes per second | ||
192 | * Whether this rate represents read or write is determined | ||
193 | * by file type "fileid". | ||
194 | */ | ||
195 | u64 bps; | ||
196 | unsigned int iops; | ||
197 | } val; | ||
125 | }; | 198 | }; |
126 | 199 | ||
127 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | 200 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
128 | dev_t dev); | 201 | dev_t dev); |
202 | extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, | ||
203 | dev_t dev); | ||
204 | extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, | ||
205 | dev_t dev); | ||
206 | extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, | ||
207 | dev_t dev); | ||
208 | extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, | ||
209 | dev_t dev); | ||
129 | 210 | ||
130 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | 211 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); |
131 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, | 212 | |
132 | unsigned int weight); | 213 | typedef void (blkio_update_group_weight_fn) (void *key, |
214 | struct blkio_group *blkg, unsigned int weight); | ||
215 | typedef void (blkio_update_group_read_bps_fn) (void * key, | ||
216 | struct blkio_group *blkg, u64 read_bps); | ||
217 | typedef void (blkio_update_group_write_bps_fn) (void *key, | ||
218 | struct blkio_group *blkg, u64 write_bps); | ||
219 | typedef void (blkio_update_group_read_iops_fn) (void *key, | ||
220 | struct blkio_group *blkg, unsigned int read_iops); | ||
221 | typedef void (blkio_update_group_write_iops_fn) (void *key, | ||
222 | struct blkio_group *blkg, unsigned int write_iops); | ||
133 | 223 | ||
134 | struct blkio_policy_ops { | 224 | struct blkio_policy_ops { |
135 | blkio_unlink_group_fn *blkio_unlink_group_fn; | 225 | blkio_unlink_group_fn *blkio_unlink_group_fn; |
136 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; | 226 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; |
227 | blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; | ||
228 | blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; | ||
229 | blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; | ||
230 | blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; | ||
137 | }; | 231 | }; |
138 | 232 | ||
139 | struct blkio_policy_type { | 233 | struct blkio_policy_type { |
140 | struct list_head list; | 234 | struct list_head list; |
141 | struct blkio_policy_ops ops; | 235 | struct blkio_policy_ops ops; |
236 | enum blkio_policy_id plid; | ||
142 | }; | 237 | }; |
143 | 238 | ||
144 | /* Blkio controller policy registration */ | 239 | /* Blkio controller policy registration */ |
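A hedged sketch of how a policy such as the new throttling code could plug into these ops; the throtl_* callback names are assumptions about blk-throttle.c, not quoted from this patch:

	static struct blkio_policy_type blkio_policy_throtl = {
		.ops = {
			.blkio_unlink_group_fn		  = throtl_unlink_blkio_group,
			.blkio_update_group_read_bps_fn	  = throtl_update_group_read_bps,
			.blkio_update_group_write_bps_fn  = throtl_update_group_write_bps,
			.blkio_update_group_read_iops_fn  = throtl_update_group_read_iops,
			.blkio_update_group_write_iops_fn = throtl_update_group_write_iops,
		},
		.plid = BLKIO_POLICY_THROTL,
	};

	static int __init example_throtl_init(void)
	{
		/* register with the blkio controller so per-device rules reach this policy */
		blkio_policy_register(&blkio_policy_throtl);
		return 0;
	}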
@@ -165,7 +260,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | |||
165 | 260 | ||
166 | #endif | 261 | #endif |
167 | 262 | ||
168 | #define BLKIO_WEIGHT_MIN 100 | 263 | #define BLKIO_WEIGHT_MIN 10 |
169 | #define BLKIO_WEIGHT_MAX 1000 | 264 | #define BLKIO_WEIGHT_MAX 1000 |
170 | #define BLKIO_WEIGHT_DEFAULT 500 | 265 | #define BLKIO_WEIGHT_DEFAULT 500 |
171 | 266 | ||
@@ -211,13 +306,17 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | |||
211 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 306 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
212 | extern struct blkio_cgroup blkio_root_cgroup; | 307 | extern struct blkio_cgroup blkio_root_cgroup; |
213 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); | 308 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); |
309 | extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); | ||
214 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 310 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
215 | struct blkio_group *blkg, void *key, dev_t dev); | 311 | struct blkio_group *blkg, void *key, dev_t dev, |
312 | enum blkio_policy_id plid); | ||
313 | extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); | ||
216 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | 314 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
217 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 315 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
218 | void *key); | 316 | void *key); |
219 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 317 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
220 | unsigned long time); | 318 | unsigned long time, |
319 | unsigned long unaccounted_time); | ||
221 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | 320 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, |
222 | bool direction, bool sync); | 321 | bool direction, bool sync); |
223 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | 322 | void blkiocg_update_completion_stats(struct blkio_group *blkg, |
@@ -232,9 +331,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | |||
232 | struct cgroup; | 331 | struct cgroup; |
233 | static inline struct blkio_cgroup * | 332 | static inline struct blkio_cgroup * |
234 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | 333 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } |
334 | static inline struct blkio_cgroup * | ||
335 | task_blkio_cgroup(struct task_struct *tsk) { return NULL; } | ||
235 | 336 | ||
236 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 337 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
237 | struct blkio_group *blkg, void *key, dev_t dev) {} | 338 | struct blkio_group *blkg, void *key, dev_t dev, |
339 | enum blkio_policy_id plid) {} | ||
340 | |||
341 | static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } | ||
238 | 342 | ||
239 | static inline int | 343 | static inline int |
240 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | 344 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
@@ -242,7 +346,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | |||
242 | static inline struct blkio_group * | 346 | static inline struct blkio_group * |
243 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | 347 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } |
244 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 348 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
245 | unsigned long time) {} | 349 | unsigned long time, |
350 | unsigned long unaccounted_time) | ||
351 | {} | ||
246 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 352 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
247 | uint64_t bytes, bool direction, bool sync) {} | 353 | uint64_t bytes, bool direction, bool sync) {} |
248 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | 354 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, |
diff --git a/block/blk-core.c b/block/blk-core.c index 32a1c123dfb3..d2f8f4049abd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -27,13 +27,14 @@ | |||
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | ||
30 | 31 | ||
31 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/block.h> | 33 | #include <trace/events/block.h> |
33 | 34 | ||
34 | #include "blk.h" | 35 | #include "blk.h" |
35 | 36 | ||
36 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); | 37 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); |
37 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); |
39 | 40 | ||
@@ -64,13 +65,27 @@ static void drive_stat_acct(struct request *rq, int new_io) | |||
64 | return; | 65 | return; |
65 | 66 | ||
66 | cpu = part_stat_lock(); | 67 | cpu = part_stat_lock(); |
67 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
68 | 68 | ||
69 | if (!new_io) | 69 | if (!new_io) { |
70 | part = rq->part; | ||
70 | part_stat_inc(cpu, part, merges[rw]); | 71 | part_stat_inc(cpu, part, merges[rw]); |
71 | else { | 72 | } else { |
73 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
74 | if (!hd_struct_try_get(part)) { | ||
75 | /* | ||
76 | * The partition is already being removed, | ||
77 | * the request will be accounted on the disk only | ||
78 | * | ||
79 | * We take a reference on disk->part0 although that | ||
80 | * partition will never be deleted, so we can treat | ||
81 | * it as any other partition. | ||
82 | */ | ||
83 | part = &rq->rq_disk->part0; | ||
84 | hd_struct_get(part); | ||
85 | } | ||
72 | part_round_stats(cpu, part); | 86 | part_round_stats(cpu, part); |
73 | part_inc_in_flight(part, rw); | 87 | part_inc_in_flight(part, rw); |
88 | rq->part = part; | ||
74 | } | 89 | } |
75 | 90 | ||
76 | part_stat_unlock(); | 91 | part_stat_unlock(); |
@@ -128,46 +143,36 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
128 | rq->ref_count = 1; | 143 | rq->ref_count = 1; |
129 | rq->start_time = jiffies; | 144 | rq->start_time = jiffies; |
130 | set_start_time_ns(rq); | 145 | set_start_time_ns(rq); |
146 | rq->part = NULL; | ||
131 | } | 147 | } |
132 | EXPORT_SYMBOL(blk_rq_init); | 148 | EXPORT_SYMBOL(blk_rq_init); |
133 | 149 | ||
134 | static void req_bio_endio(struct request *rq, struct bio *bio, | 150 | static void req_bio_endio(struct request *rq, struct bio *bio, |
135 | unsigned int nbytes, int error) | 151 | unsigned int nbytes, int error) |
136 | { | 152 | { |
137 | struct request_queue *q = rq->q; | 153 | if (error) |
138 | 154 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | |
139 | if (&q->bar_rq != rq) { | 155 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
140 | if (error) | 156 | error = -EIO; |
141 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
142 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
143 | error = -EIO; | ||
144 | |||
145 | if (unlikely(nbytes > bio->bi_size)) { | ||
146 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", | ||
147 | __func__, nbytes, bio->bi_size); | ||
148 | nbytes = bio->bi_size; | ||
149 | } | ||
150 | 157 | ||
151 | if (unlikely(rq->cmd_flags & REQ_QUIET)) | 158 | if (unlikely(nbytes > bio->bi_size)) { |
152 | set_bit(BIO_QUIET, &bio->bi_flags); | 159 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", |
160 | __func__, nbytes, bio->bi_size); | ||
161 | nbytes = bio->bi_size; | ||
162 | } | ||
153 | 163 | ||
154 | bio->bi_size -= nbytes; | 164 | if (unlikely(rq->cmd_flags & REQ_QUIET)) |
155 | bio->bi_sector += (nbytes >> 9); | 165 | set_bit(BIO_QUIET, &bio->bi_flags); |
156 | 166 | ||
157 | if (bio_integrity(bio)) | 167 | bio->bi_size -= nbytes; |
158 | bio_integrity_advance(bio, nbytes); | 168 | bio->bi_sector += (nbytes >> 9); |
159 | 169 | ||
160 | if (bio->bi_size == 0) | 170 | if (bio_integrity(bio)) |
161 | bio_endio(bio, error); | 171 | bio_integrity_advance(bio, nbytes); |
162 | } else { | ||
163 | 172 | ||
164 | /* | 173 | /* don't actually finish bio if it's part of flush sequence */ |
165 | * Okay, this is the barrier request in progress, just | 174 | if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) |
166 | * record the error; | 175 | bio_endio(bio, error); |
167 | */ | ||
168 | if (error && !q->orderr) | ||
169 | q->orderr = error; | ||
170 | } | ||
171 | } | 176 | } |
172 | 177 | ||
173 | void blk_dump_rq_flags(struct request *rq, char *msg) | 178 | void blk_dump_rq_flags(struct request *rq, char *msg) |
@@ -193,136 +198,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg) | |||
193 | } | 198 | } |
194 | EXPORT_SYMBOL(blk_dump_rq_flags); | 199 | EXPORT_SYMBOL(blk_dump_rq_flags); |
195 | 200 | ||
196 | /* | 201 | static void blk_delay_work(struct work_struct *work) |
197 | * "plug" the device if there are no outstanding requests: this will | ||
198 | * force the transfer to start only after we have put all the requests | ||
199 | * on the list. | ||
200 | * | ||
201 | * This is called with interrupts off and no requests on the queue and | ||
202 | * with the queue lock held. | ||
203 | */ | ||
204 | void blk_plug_device(struct request_queue *q) | ||
205 | { | 202 | { |
206 | WARN_ON(!irqs_disabled()); | 203 | struct request_queue *q; |
207 | |||
208 | /* | ||
209 | * don't plug a stopped queue, it must be paired with blk_start_queue() | ||
210 | * which will restart the queueing | ||
211 | */ | ||
212 | if (blk_queue_stopped(q)) | ||
213 | return; | ||
214 | 204 | ||
215 | if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { | 205 | q = container_of(work, struct request_queue, delay_work.work); |
216 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | 206 | spin_lock_irq(q->queue_lock); |
217 | trace_block_plug(q); | 207 | __blk_run_queue(q); |
218 | } | 208 | spin_unlock_irq(q->queue_lock); |
219 | } | 209 | } |
220 | EXPORT_SYMBOL(blk_plug_device); | ||
221 | 210 | ||
222 | /** | 211 | /** |
223 | * blk_plug_device_unlocked - plug a device without queue lock held | 212 | * blk_delay_queue - restart queueing after defined interval |
224 | * @q: The &struct request_queue to plug | 213 | * @q: The &struct request_queue in question |
214 | * @msecs: Delay in msecs | ||
225 | * | 215 | * |
226 | * Description: | 216 | * Description: |
227 | * Like @blk_plug_device(), but grabs the queue lock and disables | 217 | * Sometimes queueing needs to be postponed for a little while, to allow |
228 | * interrupts. | 218 | * resources to come back. This function will make sure that queueing is |
229 | **/ | 219 | * restarted around the specified time. |
230 | void blk_plug_device_unlocked(struct request_queue *q) | ||
231 | { | ||
232 | unsigned long flags; | ||
233 | |||
234 | spin_lock_irqsave(q->queue_lock, flags); | ||
235 | blk_plug_device(q); | ||
236 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
237 | } | ||
238 | EXPORT_SYMBOL(blk_plug_device_unlocked); | ||
239 | |||
240 | /* | ||
241 | * remove the queue from the plugged list, if present. called with | ||
242 | * queue lock held and interrupts disabled. | ||
243 | */ | ||
244 | int blk_remove_plug(struct request_queue *q) | ||
245 | { | ||
246 | WARN_ON(!irqs_disabled()); | ||
247 | |||
248 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) | ||
249 | return 0; | ||
250 | |||
251 | del_timer(&q->unplug_timer); | ||
252 | return 1; | ||
253 | } | ||
254 | EXPORT_SYMBOL(blk_remove_plug); | ||
255 | |||
256 | /* | ||
257 | * remove the plug and let it rip.. | ||
258 | */ | 220 | */ |
259 | void __generic_unplug_device(struct request_queue *q) | 221 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) |
260 | { | 222 | { |
261 | if (unlikely(blk_queue_stopped(q))) | 223 | queue_delayed_work(kblockd_workqueue, &q->delay_work, |
262 | return; | 224 | msecs_to_jiffies(msecs)); |
263 | if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) | ||
264 | return; | ||
265 | |||
266 | q->request_fn(q); | ||
267 | } | 225 | } |
268 | 226 | EXPORT_SYMBOL(blk_delay_queue); | |
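A hedged driver-side sketch of the intended use; the example_hw_* helpers are placeholders, not real APIs:

	static void example_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = blk_fetch_request(q)) != NULL) {
			if (!example_hw_can_queue(q->queuedata)) {
				/* out of resources: put the request back, retry in ~3ms */
				blk_requeue_request(q, rq);
				blk_delay_queue(q, 3);
				break;
			}
			example_hw_issue(q->queuedata, rq);
		}
	}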
269 | /** | ||
270 | * generic_unplug_device - fire a request queue | ||
271 | * @q: The &struct request_queue in question | ||
272 | * | ||
273 | * Description: | ||
274 | * Linux uses plugging to build bigger requests queues before letting | ||
275 | * the device have at them. If a queue is plugged, the I/O scheduler | ||
276 | * is still adding and merging requests on the queue. Once the queue | ||
277 | * gets unplugged, the request_fn defined for the queue is invoked and | ||
278 | * transfers started. | ||
279 | **/ | ||
280 | void generic_unplug_device(struct request_queue *q) | ||
281 | { | ||
282 | if (blk_queue_plugged(q)) { | ||
283 | spin_lock_irq(q->queue_lock); | ||
284 | __generic_unplug_device(q); | ||
285 | spin_unlock_irq(q->queue_lock); | ||
286 | } | ||
287 | } | ||
288 | EXPORT_SYMBOL(generic_unplug_device); | ||
289 | |||
290 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, | ||
291 | struct page *page) | ||
292 | { | ||
293 | struct request_queue *q = bdi->unplug_io_data; | ||
294 | |||
295 | blk_unplug(q); | ||
296 | } | ||
297 | |||
298 | void blk_unplug_work(struct work_struct *work) | ||
299 | { | ||
300 | struct request_queue *q = | ||
301 | container_of(work, struct request_queue, unplug_work); | ||
302 | |||
303 | trace_block_unplug_io(q); | ||
304 | q->unplug_fn(q); | ||
305 | } | ||
306 | |||
307 | void blk_unplug_timeout(unsigned long data) | ||
308 | { | ||
309 | struct request_queue *q = (struct request_queue *)data; | ||
310 | |||
311 | trace_block_unplug_timer(q); | ||
312 | kblockd_schedule_work(q, &q->unplug_work); | ||
313 | } | ||
314 | |||
315 | void blk_unplug(struct request_queue *q) | ||
316 | { | ||
317 | /* | ||
318 | * devices don't necessarily have an ->unplug_fn defined | ||
319 | */ | ||
320 | if (q->unplug_fn) { | ||
321 | trace_block_unplug_io(q); | ||
322 | q->unplug_fn(q); | ||
323 | } | ||
324 | } | ||
325 | EXPORT_SYMBOL(blk_unplug); | ||
326 | 227 | ||
327 | /** | 228 | /** |
328 | * blk_start_queue - restart a previously stopped queue | 229 | * blk_start_queue - restart a previously stopped queue |
@@ -358,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue); | |||
358 | **/ | 259 | **/ |
359 | void blk_stop_queue(struct request_queue *q) | 260 | void blk_stop_queue(struct request_queue *q) |
360 | { | 261 | { |
361 | blk_remove_plug(q); | 262 | __cancel_delayed_work(&q->delay_work); |
362 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | 263 | queue_flag_set(QUEUE_FLAG_STOPPED, q); |
363 | } | 264 | } |
364 | EXPORT_SYMBOL(blk_stop_queue); | 265 | EXPORT_SYMBOL(blk_stop_queue); |
@@ -376,12 +277,15 @@ EXPORT_SYMBOL(blk_stop_queue); | |||
376 | * that its ->make_request_fn will not re-add plugging prior to calling | 277 | * that its ->make_request_fn will not re-add plugging prior to calling |
377 | * this function. | 278 | * this function. |
378 | * | 279 | * |
280 | * This function does not cancel any asynchronous activity arising | ||
281 | * out of elevator or throttling code. That would require elevator_exit() | ||
282 | * and blk_throtl_exit() to be called with queue lock initialized. | ||
283 | * | ||
379 | */ | 284 | */ |
380 | void blk_sync_queue(struct request_queue *q) | 285 | void blk_sync_queue(struct request_queue *q) |
381 | { | 286 | { |
382 | del_timer_sync(&q->unplug_timer); | ||
383 | del_timer_sync(&q->timeout); | 287 | del_timer_sync(&q->timeout); |
384 | cancel_work_sync(&q->unplug_work); | 288 | cancel_delayed_work_sync(&q->delay_work); |
385 | } | 289 | } |
386 | EXPORT_SYMBOL(blk_sync_queue); | 290 | EXPORT_SYMBOL(blk_sync_queue); |
387 | 291 | ||
@@ -392,31 +296,32 @@ EXPORT_SYMBOL(blk_sync_queue); | |||
392 | * Description: | 296 | * Description: |
393 | * See @blk_run_queue. This variant must be called with the queue lock | 297 | * See @blk_run_queue. This variant must be called with the queue lock |
394 | * held and interrupts disabled. | 298 | * held and interrupts disabled. |
395 | * | ||
396 | */ | 299 | */ |
397 | void __blk_run_queue(struct request_queue *q) | 300 | void __blk_run_queue(struct request_queue *q) |
398 | { | 301 | { |
399 | blk_remove_plug(q); | ||
400 | |||
401 | if (unlikely(blk_queue_stopped(q))) | 302 | if (unlikely(blk_queue_stopped(q))) |
402 | return; | 303 | return; |
403 | 304 | ||
404 | if (elv_queue_empty(q)) | 305 | q->request_fn(q); |
405 | return; | 306 | } |
307 | EXPORT_SYMBOL(__blk_run_queue); | ||
406 | 308 | ||
407 | /* | 309 | /** |
408 | * Only recurse once to avoid overrunning the stack, let the unplug | 310 | * blk_run_queue_async - run a single device queue in workqueue context |
409 | * handling reinvoke the handler shortly if we already got there. | 311 | * @q: The queue to run |
410 | */ | 312 | * |
411 | if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { | 313 | * Description: |
412 | q->request_fn(q); | 314 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf |
413 | queue_flag_clear(QUEUE_FLAG_REENTER, q); | 315 | * of us. |
414 | } else { | 316 | */ |
415 | queue_flag_set(QUEUE_FLAG_PLUGGED, q); | 317 | void blk_run_queue_async(struct request_queue *q) |
416 | kblockd_schedule_work(q, &q->unplug_work); | 318 | { |
319 | if (likely(!blk_queue_stopped(q))) { | ||
320 | __cancel_delayed_work(&q->delay_work); | ||
321 | queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); | ||
417 | } | 322 | } |
418 | } | 323 | } |
419 | EXPORT_SYMBOL(__blk_run_queue); | 324 | EXPORT_SYMBOL(blk_run_queue_async); |
420 | 325 | ||
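A hedged completion-path sketch: from interrupt context a driver would typically kick the queue through kblockd rather than recurse into ->request_fn directly (the example_* wrapper is illustrative only):

	static void example_complete_irq(struct request_queue *q, struct request *rq)
	{
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);
		__blk_end_request_all(rq, 0);	/* finish the request */
		blk_run_queue_async(q);		/* restart dispatch from kblockd */
		spin_unlock_irqrestore(q->queue_lock, flags);
	}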
421 | /** | 326 | /** |
422 | * blk_run_queue - run a single device queue | 327 | * blk_run_queue - run a single device queue |
@@ -440,7 +345,13 @@ void blk_put_queue(struct request_queue *q) | |||
440 | { | 345 | { |
441 | kobject_put(&q->kobj); | 346 | kobject_put(&q->kobj); |
442 | } | 347 | } |
348 | EXPORT_SYMBOL(blk_put_queue); | ||
443 | 349 | ||
350 | /* | ||
351 | * Note: If a driver supplied the queue lock, it should not zap that lock | ||
352 | * unexpectedly as some queue cleanup components like elevator_exit() and | ||
353 | * blk_throtl_exit() need queue lock. | ||
354 | */ | ||
444 | void blk_cleanup_queue(struct request_queue *q) | 355 | void blk_cleanup_queue(struct request_queue *q) |
445 | { | 356 | { |
446 | /* | 357 | /* |
@@ -459,6 +370,8 @@ void blk_cleanup_queue(struct request_queue *q) | |||
459 | if (q->elevator) | 370 | if (q->elevator) |
460 | elevator_exit(q->elevator); | 371 | elevator_exit(q->elevator); |
461 | 372 | ||
373 | blk_throtl_exit(q); | ||
374 | |||
462 | blk_put_queue(q); | 375 | blk_put_queue(q); |
463 | } | 376 | } |
464 | EXPORT_SYMBOL(blk_cleanup_queue); | 377 | EXPORT_SYMBOL(blk_cleanup_queue); |
@@ -501,8 +414,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
501 | if (!q) | 414 | if (!q) |
502 | return NULL; | 415 | return NULL; |
503 | 416 | ||
504 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; | ||
505 | q->backing_dev_info.unplug_io_data = q; | ||
506 | q->backing_dev_info.ra_pages = | 417 | q->backing_dev_info.ra_pages = |
507 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 418 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
508 | q->backing_dev_info.state = 0; | 419 | q->backing_dev_info.state = 0; |
@@ -515,18 +426,31 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
515 | return NULL; | 426 | return NULL; |
516 | } | 427 | } |
517 | 428 | ||
429 | if (blk_throtl_init(q)) { | ||
430 | kmem_cache_free(blk_requestq_cachep, q); | ||
431 | return NULL; | ||
432 | } | ||
433 | |||
518 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | 434 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
519 | laptop_mode_timer_fn, (unsigned long) q); | 435 | laptop_mode_timer_fn, (unsigned long) q); |
520 | init_timer(&q->unplug_timer); | ||
521 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 436 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
522 | INIT_LIST_HEAD(&q->timeout_list); | 437 | INIT_LIST_HEAD(&q->timeout_list); |
523 | INIT_WORK(&q->unplug_work, blk_unplug_work); | 438 | INIT_LIST_HEAD(&q->flush_queue[0]); |
439 | INIT_LIST_HEAD(&q->flush_queue[1]); | ||
440 | INIT_LIST_HEAD(&q->flush_data_in_flight); | ||
441 | INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); | ||
524 | 442 | ||
525 | kobject_init(&q->kobj, &blk_queue_ktype); | 443 | kobject_init(&q->kobj, &blk_queue_ktype); |
526 | 444 | ||
527 | mutex_init(&q->sysfs_lock); | 445 | mutex_init(&q->sysfs_lock); |
528 | spin_lock_init(&q->__queue_lock); | 446 | spin_lock_init(&q->__queue_lock); |
529 | 447 | ||
448 | /* | ||
449 | * By default initialize queue_lock to internal lock and driver can | ||
450 | * override it later if need be. | ||
451 | */ | ||
452 | q->queue_lock = &q->__queue_lock; | ||
453 | |||
530 | return q; | 454 | return q; |
531 | } | 455 | } |
532 | EXPORT_SYMBOL(blk_alloc_queue_node); | 456 | EXPORT_SYMBOL(blk_alloc_queue_node); |
@@ -609,9 +533,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, | |||
609 | q->request_fn = rfn; | 533 | q->request_fn = rfn; |
610 | q->prep_rq_fn = NULL; | 534 | q->prep_rq_fn = NULL; |
611 | q->unprep_rq_fn = NULL; | 535 | q->unprep_rq_fn = NULL; |
612 | q->unplug_fn = generic_unplug_device; | ||
613 | q->queue_flags = QUEUE_FLAG_DEFAULT; | 536 | q->queue_flags = QUEUE_FLAG_DEFAULT; |
614 | q->queue_lock = lock; | 537 | |
538 | /* Override internal queue lock with supplied lock pointer */ | ||
539 | if (lock) | ||
540 | q->queue_lock = lock; | ||
615 | 541 | ||
616 | /* | 542 | /* |
617 | * This also sets hw/phys segments, boundary and size | 543 | * This also sets hw/phys segments, boundary and size |
@@ -641,6 +567,7 @@ int blk_get_queue(struct request_queue *q) | |||
641 | 567 | ||
642 | return 1; | 568 | return 1; |
643 | } | 569 | } |
570 | EXPORT_SYMBOL(blk_get_queue); | ||
644 | 571 | ||
645 | static inline void blk_free_request(struct request_queue *q, struct request *rq) | 572 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
646 | { | 573 | { |
@@ -740,6 +667,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) | |||
740 | } | 667 | } |
741 | 668 | ||
742 | /* | 669 | /* |
670 | * Determine if elevator data should be initialized when allocating the | ||
671 | * request associated with @bio. | ||
672 | */ | ||
673 | static bool blk_rq_should_init_elevator(struct bio *bio) | ||
674 | { | ||
675 | if (!bio) | ||
676 | return true; | ||
677 | |||
678 | /* | ||
679 | * Flush requests do not use the elevator so skip initialization. | ||
680 | * This allows a request to share the flush and elevator data. | ||
681 | */ | ||
682 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) | ||
683 | return false; | ||
684 | |||
685 | return true; | ||
686 | } | ||
687 | |||
688 | /* | ||
743 | * Get a free request, queue_lock must be held. | 689 | * Get a free request, queue_lock must be held. |
744 | * Returns NULL on failure, with queue_lock held. | 690 | * Returns NULL on failure, with queue_lock held. |
745 | * Returns !NULL on success, with queue_lock *not held*. | 691 | * Returns !NULL on success, with queue_lock *not held*. |
@@ -751,7 +697,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
751 | struct request_list *rl = &q->rq; | 697 | struct request_list *rl = &q->rq; |
752 | struct io_context *ioc = NULL; | 698 | struct io_context *ioc = NULL; |
753 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 699 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
754 | int may_queue, priv; | 700 | int may_queue, priv = 0; |
755 | 701 | ||
756 | may_queue = elv_may_queue(q, rw_flags); | 702 | may_queue = elv_may_queue(q, rw_flags); |
757 | if (may_queue == ELV_MQUEUE_NO) | 703 | if (may_queue == ELV_MQUEUE_NO) |
@@ -795,9 +741,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
795 | rl->count[is_sync]++; | 741 | rl->count[is_sync]++; |
796 | rl->starved[is_sync] = 0; | 742 | rl->starved[is_sync] = 0; |
797 | 743 | ||
798 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 744 | if (blk_rq_should_init_elevator(bio)) { |
799 | if (priv) | 745 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
800 | rl->elvpriv++; | 746 | if (priv) |
747 | rl->elvpriv++; | ||
748 | } | ||
801 | 749 | ||
802 | if (blk_queue_io_stat(q)) | 750 | if (blk_queue_io_stat(q)) |
803 | rw_flags |= REQ_IO_STAT; | 751 | rw_flags |= REQ_IO_STAT; |
@@ -844,8 +792,8 @@ out: | |||
844 | } | 792 | } |
845 | 793 | ||
846 | /* | 794 | /* |
847 | * No available requests for this queue, unplug the device and wait for some | 795 | * No available requests for this queue, wait for some requests to become |
848 | * requests to become available. | 796 | * available. |
849 | * | 797 | * |
850 | * Called with q->queue_lock held, and returns with it unlocked. | 798 | * Called with q->queue_lock held, and returns with it unlocked. |
851 | */ | 799 | */ |
@@ -866,7 +814,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, | |||
866 | 814 | ||
867 | trace_block_sleeprq(q, bio, rw_flags & 1); | 815 | trace_block_sleeprq(q, bio, rw_flags & 1); |
868 | 816 | ||
869 | __generic_unplug_device(q); | ||
870 | spin_unlock_irq(q->queue_lock); | 817 | spin_unlock_irq(q->queue_lock); |
871 | io_schedule(); | 818 | io_schedule(); |
872 | 819 | ||
@@ -988,6 +935,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) | |||
988 | } | 935 | } |
989 | EXPORT_SYMBOL(blk_requeue_request); | 936 | EXPORT_SYMBOL(blk_requeue_request); |
990 | 937 | ||
938 | static void add_acct_request(struct request_queue *q, struct request *rq, | ||
939 | int where) | ||
940 | { | ||
941 | drive_stat_acct(rq, 1); | ||
942 | __elv_add_request(q, rq, where); | ||
943 | } | ||
944 | |||
991 | /** | 945 | /** |
992 | * blk_insert_request - insert a special request into a request queue | 946 | * blk_insert_request - insert a special request into a request queue |
993 | * @q: request queue where request should be inserted | 947 | * @q: request queue where request should be inserted |
@@ -1030,29 +984,12 @@ void blk_insert_request(struct request_queue *q, struct request *rq, | |||
1030 | if (blk_rq_tagged(rq)) | 984 | if (blk_rq_tagged(rq)) |
1031 | blk_queue_end_tag(q, rq); | 985 | blk_queue_end_tag(q, rq); |
1032 | 986 | ||
1033 | drive_stat_acct(rq, 1); | 987 | add_acct_request(q, rq, where); |
1034 | __elv_add_request(q, rq, where, 0); | ||
1035 | __blk_run_queue(q); | 988 | __blk_run_queue(q); |
1036 | spin_unlock_irqrestore(q->queue_lock, flags); | 989 | spin_unlock_irqrestore(q->queue_lock, flags); |
1037 | } | 990 | } |
1038 | EXPORT_SYMBOL(blk_insert_request); | 991 | EXPORT_SYMBOL(blk_insert_request); |
1039 | 992 | ||
1040 | /* | ||
1041 | * add-request adds a request to the linked list. | ||
1042 | * queue lock is held and interrupts disabled, as we muck with the | ||
1043 | * request queue list. | ||
1044 | */ | ||
1045 | static inline void add_request(struct request_queue *q, struct request *req) | ||
1046 | { | ||
1047 | drive_stat_acct(req, 1); | ||
1048 | |||
1049 | /* | ||
1050 | * elevator indicated where it wants this request to be | ||
1051 | * inserted at elevator_merge time | ||
1052 | */ | ||
1053 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); | ||
1054 | } | ||
1055 | |||
1056 | static void part_round_stats_single(int cpu, struct hd_struct *part, | 993 | static void part_round_stats_single(int cpu, struct hd_struct *part, |
1057 | unsigned long now) | 994 | unsigned long now) |
1058 | { | 995 | { |
@@ -1168,6 +1105,96 @@ void blk_add_request_payload(struct request *rq, struct page *page, | |||
1168 | } | 1105 | } |
1169 | EXPORT_SYMBOL_GPL(blk_add_request_payload); | 1106 | EXPORT_SYMBOL_GPL(blk_add_request_payload); |
1170 | 1107 | ||
1108 | static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | ||
1109 | struct bio *bio) | ||
1110 | { | ||
1111 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1112 | |||
1113 | if (!ll_back_merge_fn(q, req, bio)) | ||
1114 | return false; | ||
1115 | |||
1116 | trace_block_bio_backmerge(q, bio); | ||
1117 | |||
1118 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1119 | blk_rq_set_mixed_merge(req); | ||
1120 | |||
1121 | req->biotail->bi_next = bio; | ||
1122 | req->biotail = bio; | ||
1123 | req->__data_len += bio->bi_size; | ||
1124 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1125 | |||
1126 | drive_stat_acct(req, 0); | ||
1127 | elv_bio_merged(q, req, bio); | ||
1128 | return true; | ||
1129 | } | ||
1130 | |||
1131 | static bool bio_attempt_front_merge(struct request_queue *q, | ||
1132 | struct request *req, struct bio *bio) | ||
1133 | { | ||
1134 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1135 | |||
1136 | if (!ll_front_merge_fn(q, req, bio)) | ||
1137 | return false; | ||
1138 | |||
1139 | trace_block_bio_frontmerge(q, bio); | ||
1140 | |||
1141 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1142 | blk_rq_set_mixed_merge(req); | ||
1143 | |||
1144 | bio->bi_next = req->bio; | ||
1145 | req->bio = bio; | ||
1146 | |||
1147 | /* | ||
1148 | * may not be valid. if the low level driver said | ||
1149 | * it didn't need a bounce buffer then it better | ||
1150 | * not touch req->buffer either... | ||
1151 | */ | ||
1152 | req->buffer = bio_data(bio); | ||
1153 | req->__sector = bio->bi_sector; | ||
1154 | req->__data_len += bio->bi_size; | ||
1155 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1156 | |||
1157 | drive_stat_acct(req, 0); | ||
1158 | elv_bio_merged(q, req, bio); | ||
1159 | return true; | ||
1160 | } | ||
1161 | |||
1162 | /* | ||
1163 | * Attempts to merge with the plugged list in the current process. Returns | ||
1164 | * true if merge was successful, otherwise false. | ||
1165 | */ | ||
1166 | static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, | ||
1167 | struct bio *bio) | ||
1168 | { | ||
1169 | struct blk_plug *plug; | ||
1170 | struct request *rq; | ||
1171 | bool ret = false; | ||
1172 | |||
1173 | plug = tsk->plug; | ||
1174 | if (!plug) | ||
1175 | goto out; | ||
1176 | |||
1177 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { | ||
1178 | int el_ret; | ||
1179 | |||
1180 | if (rq->q != q) | ||
1181 | continue; | ||
1182 | |||
1183 | el_ret = elv_try_merge(rq, bio); | ||
1184 | if (el_ret == ELEVATOR_BACK_MERGE) { | ||
1185 | ret = bio_attempt_back_merge(q, rq, bio); | ||
1186 | if (ret) | ||
1187 | break; | ||
1188 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1189 | ret = bio_attempt_front_merge(q, rq, bio); | ||
1190 | if (ret) | ||
1191 | break; | ||
1192 | } | ||
1193 | } | ||
1194 | out: | ||
1195 | return ret; | ||
1196 | } | ||
1197 | |||
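A hedged submitter-side sketch, assuming the on-stack plugging helpers blk_start_plug()/blk_finish_plug() introduced by the same series; bios issued inside the plug window become candidates for attempt_plug_merge() before any queue lock is taken:

	static void example_submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);
		for (i = 0; i < nr; i++)
			submit_bio(WRITE, bios[i]);	/* may merge with requests on plug->list */
		blk_finish_plug(&plug);			/* flush plugged requests to their queues */
	}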
1171 | void init_request_from_bio(struct request *req, struct bio *bio) | 1198 | void init_request_from_bio(struct request *req, struct bio *bio) |
1172 | { | 1199 | { |
1173 | req->cpu = bio->bi_comp_cpu; | 1200 | req->cpu = bio->bi_comp_cpu; |
@@ -1183,31 +1210,13 @@ void init_request_from_bio(struct request *req, struct bio *bio) | |||
1183 | blk_rq_bio_prep(req->q, req, bio); | 1210 | blk_rq_bio_prep(req->q, req, bio); |
1184 | } | 1211 | } |
1185 | 1212 | ||
1186 | /* | ||
1187 | * Only disabling plugging for non-rotational devices if it does tagging | ||
1188 | * as well, otherwise we do need the proper merging | ||
1189 | */ | ||
1190 | static inline bool queue_should_plug(struct request_queue *q) | ||
1191 | { | ||
1192 | return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); | ||
1193 | } | ||
1194 | |||
1195 | static int __make_request(struct request_queue *q, struct bio *bio) | 1213 | static int __make_request(struct request_queue *q, struct bio *bio) |
1196 | { | 1214 | { |
1197 | struct request *req; | ||
1198 | int el_ret; | ||
1199 | unsigned int bytes = bio->bi_size; | ||
1200 | const unsigned short prio = bio_prio(bio); | ||
1201 | const bool sync = !!(bio->bi_rw & REQ_SYNC); | 1215 | const bool sync = !!(bio->bi_rw & REQ_SYNC); |
1202 | const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); | 1216 | struct blk_plug *plug; |
1203 | const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; | 1217 | int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; |
1204 | int rw_flags; | 1218 | struct request *req; |
1205 | 1219 | ||
1206 | if ((bio->bi_rw & REQ_HARDBARRIER) && | ||
1207 | (q->next_ordered == QUEUE_ORDERED_NONE)) { | ||
1208 | bio_endio(bio, -EOPNOTSUPP); | ||
1209 | return 0; | ||
1210 | } | ||
1211 | /* | 1220 | /* |
1212 | * low level driver can indicate that it wants pages above a | 1221 | * low level driver can indicate that it wants pages above a |
1213 | * certain limit bounced to low memory (ie for highmem, or even | 1222 | * certain limit bounced to low memory (ie for highmem, or even |
@@ -1215,73 +1224,34 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1215 | */ | 1224 | */ |
1216 | blk_queue_bounce(q, &bio); | 1225 | blk_queue_bounce(q, &bio); |
1217 | 1226 | ||
1218 | spin_lock_irq(q->queue_lock); | 1227 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { |
1219 | 1228 | spin_lock_irq(q->queue_lock); | |
1220 | if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) | 1229 | where = ELEVATOR_INSERT_FLUSH; |
1221 | goto get_rq; | 1230 | goto get_rq; |
1231 | } | ||
1222 | 1232 | ||
1223 | el_ret = elv_merge(q, &req, bio); | 1233 | /* |
1224 | switch (el_ret) { | 1234 | * Check if we can merge with the plugged list before grabbing |
1225 | case ELEVATOR_BACK_MERGE: | 1235 | * any locks. |
1226 | BUG_ON(!rq_mergeable(req)); | 1236 | */ |
1227 | 1237 | if (attempt_plug_merge(current, q, bio)) | |
1228 | if (!ll_back_merge_fn(q, req, bio)) | ||
1229 | break; | ||
1230 | |||
1231 | trace_block_bio_backmerge(q, bio); | ||
1232 | |||
1233 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1234 | blk_rq_set_mixed_merge(req); | ||
1235 | |||
1236 | req->biotail->bi_next = bio; | ||
1237 | req->biotail = bio; | ||
1238 | req->__data_len += bytes; | ||
1239 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1240 | if (!blk_rq_cpu_valid(req)) | ||
1241 | req->cpu = bio->bi_comp_cpu; | ||
1242 | drive_stat_acct(req, 0); | ||
1243 | elv_bio_merged(q, req, bio); | ||
1244 | if (!attempt_back_merge(q, req)) | ||
1245 | elv_merged_request(q, req, el_ret); | ||
1246 | goto out; | 1238 | goto out; |
1247 | 1239 | ||
1248 | case ELEVATOR_FRONT_MERGE: | 1240 | spin_lock_irq(q->queue_lock); |
1249 | BUG_ON(!rq_mergeable(req)); | ||
1250 | |||
1251 | if (!ll_front_merge_fn(q, req, bio)) | ||
1252 | break; | ||
1253 | |||
1254 | trace_block_bio_frontmerge(q, bio); | ||
1255 | 1241 | ||
1256 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { | 1242 | el_ret = elv_merge(q, &req, bio); |
1257 | blk_rq_set_mixed_merge(req); | 1243 | if (el_ret == ELEVATOR_BACK_MERGE) { |
1258 | req->cmd_flags &= ~REQ_FAILFAST_MASK; | 1244 | if (bio_attempt_back_merge(q, req, bio)) { |
1259 | req->cmd_flags |= ff; | 1245 | if (!attempt_back_merge(q, req)) |
1246 | elv_merged_request(q, req, el_ret); | ||
1247 | goto out_unlock; | ||
1248 | } | ||
1249 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1250 | if (bio_attempt_front_merge(q, req, bio)) { | ||
1251 | if (!attempt_front_merge(q, req)) | ||
1252 | elv_merged_request(q, req, el_ret); | ||
1253 | goto out_unlock; | ||
1260 | } | 1254 | } |
1261 | |||
1262 | bio->bi_next = req->bio; | ||
1263 | req->bio = bio; | ||
1264 | |||
1265 | /* | ||
1266 | * may not be valid. if the low level driver said | ||
1267 | * it didn't need a bounce buffer then it better | ||
1268 | * not touch req->buffer either... | ||
1269 | */ | ||
1270 | req->buffer = bio_data(bio); | ||
1271 | req->__sector = bio->bi_sector; | ||
1272 | req->__data_len += bytes; | ||
1273 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1274 | if (!blk_rq_cpu_valid(req)) | ||
1275 | req->cpu = bio->bi_comp_cpu; | ||
1276 | drive_stat_acct(req, 0); | ||
1277 | elv_bio_merged(q, req, bio); | ||
1278 | if (!attempt_front_merge(q, req)) | ||
1279 | elv_merged_request(q, req, el_ret); | ||
1280 | goto out; | ||
1281 | |||
1282 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | ||
1283 | default: | ||
1284 | ; | ||
1285 | } | 1255 | } |
1286 | 1256 | ||
1287 | get_rq: | 1257 | get_rq: |
@@ -1308,17 +1278,39 @@ get_rq: | |||
1308 | */ | 1278 | */ |
1309 | init_request_from_bio(req, bio); | 1279 | init_request_from_bio(req, bio); |
1310 | 1280 | ||
1311 | spin_lock_irq(q->queue_lock); | ||
1312 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || | 1281 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || |
1313 | bio_flagged(bio, BIO_CPU_AFFINE)) | 1282 | bio_flagged(bio, BIO_CPU_AFFINE)) { |
1314 | req->cpu = blk_cpu_to_group(smp_processor_id()); | 1283 | req->cpu = blk_cpu_to_group(get_cpu()); |
1315 | if (queue_should_plug(q) && elv_queue_empty(q)) | 1284 | put_cpu(); |
1316 | blk_plug_device(q); | 1285 | } |
1317 | add_request(q, req); | 1286 | |
1287 | plug = current->plug; | ||
1288 | if (plug) { | ||
1289 | /* | ||
1290 | * If this is the first request added after a plug, fire | ||
1290 | * off a plug trace. If others have been added before, check | ||
1292 | * if we have multiple devices in this plug. If so, make a | ||
1293 | * note to sort the list before dispatch. | ||
1294 | */ | ||
1295 | if (list_empty(&plug->list)) | ||
1296 | trace_block_plug(q); | ||
1297 | else if (!plug->should_sort) { | ||
1298 | struct request *__rq; | ||
1299 | |||
1300 | __rq = list_entry_rq(plug->list.prev); | ||
1301 | if (__rq->q != q) | ||
1302 | plug->should_sort = 1; | ||
1303 | } | ||
1304 | list_add_tail(&req->queuelist, &plug->list); | ||
1305 | drive_stat_acct(req, 1); | ||
1306 | } else { | ||
1307 | spin_lock_irq(q->queue_lock); | ||
1308 | add_acct_request(q, req, where); | ||
1309 | __blk_run_queue(q); | ||
1310 | out_unlock: | ||
1311 | spin_unlock_irq(q->queue_lock); | ||
1312 | } | ||
1318 | out: | 1313 | out: |
1319 | if (unplug || !queue_should_plug(q)) | ||
1320 | __generic_unplug_device(q); | ||
1321 | spin_unlock_irq(q->queue_lock); | ||
1322 | return 0; | 1314 | return 0; |
1323 | } | 1315 | } |
1324 | 1316 | ||
@@ -1335,9 +1327,9 @@ static inline void blk_partition_remap(struct bio *bio) | |||
1335 | bio->bi_sector += p->start_sect; | 1327 | bio->bi_sector += p->start_sect; |
1336 | bio->bi_bdev = bdev->bd_contains; | 1328 | bio->bi_bdev = bdev->bd_contains; |
1337 | 1329 | ||
1338 | trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, | 1330 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, |
1339 | bdev->bd_dev, | 1331 | bdev->bd_dev, |
1340 | bio->bi_sector - p->start_sect); | 1332 | bio->bi_sector - p->start_sect); |
1341 | } | 1333 | } |
1342 | } | 1334 | } |
1343 | 1335 | ||
@@ -1350,7 +1342,7 @@ static void handle_bad_sector(struct bio *bio) | |||
1350 | bdevname(bio->bi_bdev, b), | 1342 | bdevname(bio->bi_bdev, b), |
1351 | bio->bi_rw, | 1343 | bio->bi_rw, |
1352 | (unsigned long long)bio->bi_sector + bio_sectors(bio), | 1344 | (unsigned long long)bio->bi_sector + bio_sectors(bio), |
1353 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); | 1345 | (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); |
1354 | 1346 | ||
1355 | set_bit(BIO_EOF, &bio->bi_flags); | 1347 | set_bit(BIO_EOF, &bio->bi_flags); |
1356 | } | 1348 | } |
@@ -1403,7 +1395,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) | |||
1403 | return 0; | 1395 | return 0; |
1404 | 1396 | ||
1405 | /* Test device or partition size, when known. */ | 1397 | /* Test device or partition size, when known. */ |
1406 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; | 1398 | maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; |
1407 | if (maxsector) { | 1399 | if (maxsector) { |
1408 | sector_t sector = bio->bi_sector; | 1400 | sector_t sector = bio->bi_sector; |
1409 | 1401 | ||
@@ -1506,7 +1498,7 @@ static inline void __generic_make_request(struct bio *bio) | |||
1506 | goto end_io; | 1498 | goto end_io; |
1507 | 1499 | ||
1508 | if (old_sector != -1) | 1500 | if (old_sector != -1) |
1509 | trace_block_remap(q, bio, old_dev, old_sector); | 1501 | trace_block_bio_remap(q, bio, old_dev, old_sector); |
1510 | 1502 | ||
1511 | old_sector = bio->bi_sector; | 1503 | old_sector = bio->bi_sector; |
1512 | old_dev = bio->bi_bdev->bd_dev; | 1504 | old_dev = bio->bi_bdev->bd_dev; |
@@ -1514,6 +1506,19 @@ static inline void __generic_make_request(struct bio *bio) | |||
1514 | if (bio_check_eod(bio, nr_sectors)) | 1506 | if (bio_check_eod(bio, nr_sectors)) |
1515 | goto end_io; | 1507 | goto end_io; |
1516 | 1508 | ||
1509 | /* | ||
1510 | * Filter flush bios early so that make_request-based | ||
1511 | * drivers without flush support don't have to worry | ||
1512 | * about them. | ||
1513 | */ | ||
1514 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | ||
1515 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | ||
1516 | if (!nr_sectors) { | ||
1517 | err = 0; | ||
1518 | goto end_io; | ||
1519 | } | ||
1520 | } | ||
1521 | |||
1517 | if ((bio->bi_rw & REQ_DISCARD) && | 1522 | if ((bio->bi_rw & REQ_DISCARD) && |
1518 | (!blk_queue_discard(q) || | 1523 | (!blk_queue_discard(q) || |
1519 | ((bio->bi_rw & REQ_SECURE) && | 1524 | ((bio->bi_rw & REQ_SECURE) && |
@@ -1522,6 +1527,16 @@ static inline void __generic_make_request(struct bio *bio) | |||
1522 | goto end_io; | 1527 | goto end_io; |
1523 | } | 1528 | } |
1524 | 1529 | ||
1530 | if (blk_throtl_bio(q, &bio)) | ||
1531 | goto end_io; | ||
1532 | |||
1533 | /* | ||
1534 | * If the bio is NULL, it has been throttled and will be submitted | ||
1535 | * later. | ||
1536 | */ | ||
1537 | if (!bio) | ||
1538 | break; | ||
1539 | |||
1525 | trace_block_bio_queue(q, bio); | 1540 | trace_block_bio_queue(q, bio); |
1526 | 1541 | ||
1527 | ret = q->make_request_fn(q, bio); | 1542 | ret = q->make_request_fn(q, bio); |
@@ -1612,11 +1627,12 @@ void submit_bio(int rw, struct bio *bio) | |||
1612 | 1627 | ||
1613 | if (unlikely(block_dump)) { | 1628 | if (unlikely(block_dump)) { |
1614 | char b[BDEVNAME_SIZE]; | 1629 | char b[BDEVNAME_SIZE]; |
1615 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", | 1630 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", |
1616 | current->comm, task_pid_nr(current), | 1631 | current->comm, task_pid_nr(current), |
1617 | (rw & WRITE) ? "WRITE" : "READ", | 1632 | (rw & WRITE) ? "WRITE" : "READ", |
1618 | (unsigned long long)bio->bi_sector, | 1633 | (unsigned long long)bio->bi_sector, |
1619 | bdevname(bio->bi_bdev, b)); | 1634 | bdevname(bio->bi_bdev, b), |
1635 | count); | ||
1620 | } | 1636 | } |
1621 | } | 1637 | } |
1622 | 1638 | ||
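With the sector count added to the block_dump message, a trace line now reads, for example (values illustrative): dd(4321): WRITE block 1048576 on sdb (8 sectors).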
@@ -1637,7 +1653,7 @@ EXPORT_SYMBOL(submit_bio); | |||
1637 | * the insertion using this generic function. | 1653 | * the insertion using this generic function. |
1638 | * | 1654 | * |
1639 | * This function should also be useful for request stacking drivers | 1655 | * This function should also be useful for request stacking drivers |
1640 | * in some cases below, so export this fuction. | 1656 | * in some cases below, so export this function. |
1641 | * Request stacking drivers like request-based dm may change the queue | 1657 | * Request stacking drivers like request-based dm may change the queue |
1642 | * limits while requests are in the queue (e.g. dm's table swapping). | 1658 | * limits while requests are in the queue (e.g. dm's table swapping). |
1643 | * Such request stacking drivers should check those requests against | 1659 | * Such request stacking drivers should check those requests against |
@@ -1698,9 +1714,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | |||
1698 | */ | 1714 | */ |
1699 | BUG_ON(blk_queued_rq(rq)); | 1715 | BUG_ON(blk_queued_rq(rq)); |
1700 | 1716 | ||
1701 | drive_stat_acct(rq, 1); | 1717 | add_acct_request(q, rq, ELEVATOR_INSERT_BACK); |
1702 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | ||
1703 | |||
1704 | spin_unlock_irqrestore(q->queue_lock, flags); | 1718 | spin_unlock_irqrestore(q->queue_lock, flags); |
1705 | 1719 | ||
1706 | return 0; | 1720 | return 0; |
@@ -1759,7 +1773,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
1759 | int cpu; | 1773 | int cpu; |
1760 | 1774 | ||
1761 | cpu = part_stat_lock(); | 1775 | cpu = part_stat_lock(); |
1762 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1776 | part = req->part; |
1763 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); | 1777 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); |
1764 | part_stat_unlock(); | 1778 | part_stat_unlock(); |
1765 | } | 1779 | } |
@@ -1768,24 +1782,25 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
1768 | static void blk_account_io_done(struct request *req) | 1782 | static void blk_account_io_done(struct request *req) |
1769 | { | 1783 | { |
1770 | /* | 1784 | /* |
1771 | * Account IO completion. bar_rq isn't accounted as a normal | 1785 | * Account IO completion. flush_rq isn't accounted as a |
1772 | * IO on queueing nor completion. Accounting the containing | 1786 | * normal IO on queueing nor completion. Accounting the |
1773 | * request is enough. | 1787 | * containing request is enough. |
1774 | */ | 1788 | */ |
1775 | if (blk_do_io_stat(req) && req != &req->q->bar_rq) { | 1789 | if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { |
1776 | unsigned long duration = jiffies - req->start_time; | 1790 | unsigned long duration = jiffies - req->start_time; |
1777 | const int rw = rq_data_dir(req); | 1791 | const int rw = rq_data_dir(req); |
1778 | struct hd_struct *part; | 1792 | struct hd_struct *part; |
1779 | int cpu; | 1793 | int cpu; |
1780 | 1794 | ||
1781 | cpu = part_stat_lock(); | 1795 | cpu = part_stat_lock(); |
1782 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1796 | part = req->part; |
1783 | 1797 | ||
1784 | part_stat_inc(cpu, part, ios[rw]); | 1798 | part_stat_inc(cpu, part, ios[rw]); |
1785 | part_stat_add(cpu, part, ticks[rw], duration); | 1799 | part_stat_add(cpu, part, ticks[rw], duration); |
1786 | part_round_stats(cpu, part); | 1800 | part_round_stats(cpu, part); |
1787 | part_dec_in_flight(part, rw); | 1801 | part_dec_in_flight(part, rw); |
1788 | 1802 | ||
1803 | hd_struct_put(part); | ||
1789 | part_stat_unlock(); | 1804 | part_stat_unlock(); |
1790 | } | 1805 | } |
1791 | } | 1806 | } |
@@ -2011,9 +2026,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2011 | 2026 | ||
2012 | if (error && req->cmd_type == REQ_TYPE_FS && | 2027 | if (error && req->cmd_type == REQ_TYPE_FS && |
2013 | !(req->cmd_flags & REQ_QUIET)) { | 2028 | !(req->cmd_flags & REQ_QUIET)) { |
2014 | printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", | 2029 | char *error_type; |
2015 | req->rq_disk ? req->rq_disk->disk_name : "?", | 2030 | |
2016 | (unsigned long long)blk_rq_pos(req)); | 2031 | switch (error) { |
2032 | case -ENOLINK: | ||
2033 | error_type = "recoverable transport"; | ||
2034 | break; | ||
2035 | case -EREMOTEIO: | ||
2036 | error_type = "critical target"; | ||
2037 | break; | ||
2038 | case -EBADE: | ||
2039 | error_type = "critical nexus"; | ||
2040 | break; | ||
2041 | case -EIO: | ||
2042 | default: | ||
2043 | error_type = "I/O"; | ||
2044 | break; | ||
2045 | } | ||
2046 | printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", | ||
2047 | error_type, req->rq_disk ? req->rq_disk->disk_name : "?", | ||
2048 | (unsigned long long)blk_rq_pos(req)); | ||
2017 | } | 2049 | } |
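With the error_type switch above, the log identifies the failure class instead of reporting everything as a plain I/O error; a request that failed with -EREMOTEIO, for instance, would be reported roughly as (device and sector illustrative): end_request: critical target error, dev sdb, sector 12345.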
2018 | 2050 | ||
2019 | blk_account_io_completion(req, nr_bytes); | 2051 | blk_account_io_completion(req, nr_bytes); |
@@ -2111,7 +2143,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2111 | * size, something has gone terribly wrong. | 2143 | * size, something has gone terribly wrong. |
2112 | */ | 2144 | */ |
2113 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { | 2145 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { |
2114 | printk(KERN_ERR "blk: request botched\n"); | 2146 | blk_dump_rq_flags(req, "request botched"); |
2115 | req->__data_len = blk_rq_cur_bytes(req); | 2147 | req->__data_len = blk_rq_cur_bytes(req); |
2116 | } | 2148 | } |
2117 | 2149 | ||
@@ -2497,9 +2529,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); | |||
2497 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) | 2529 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) |
2498 | { | 2530 | { |
2499 | dst->cpu = src->cpu; | 2531 | dst->cpu = src->cpu; |
2500 | dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); | 2532 | dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; |
2501 | if (src->cmd_flags & REQ_DISCARD) | ||
2502 | dst->cmd_flags |= REQ_DISCARD; | ||
2503 | dst->cmd_type = src->cmd_type; | 2533 | dst->cmd_type = src->cmd_type; |
2504 | dst->__sector = blk_rq_pos(src); | 2534 | dst->__sector = blk_rq_pos(src); |
2505 | dst->__data_len = blk_rq_bytes(src); | 2535 | dst->__data_len = blk_rq_bytes(src); |
@@ -2579,12 +2609,171 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) | |||
2579 | } | 2609 | } |
2580 | EXPORT_SYMBOL(kblockd_schedule_work); | 2610 | EXPORT_SYMBOL(kblockd_schedule_work); |
2581 | 2611 | ||
2612 | int kblockd_schedule_delayed_work(struct request_queue *q, | ||
2613 | struct delayed_work *dwork, unsigned long delay) | ||
2614 | { | ||
2615 | return queue_delayed_work(kblockd_workqueue, dwork, delay); | ||
2616 | } | ||
2617 | EXPORT_SYMBOL(kblockd_schedule_delayed_work); | ||
2618 | |||
2619 | #define PLUG_MAGIC 0x91827364 | ||
2620 | |||
2621 | void blk_start_plug(struct blk_plug *plug) | ||
2622 | { | ||
2623 | struct task_struct *tsk = current; | ||
2624 | |||
2625 | plug->magic = PLUG_MAGIC; | ||
2626 | INIT_LIST_HEAD(&plug->list); | ||
2627 | INIT_LIST_HEAD(&plug->cb_list); | ||
2628 | plug->should_sort = 0; | ||
2629 | |||
2630 | /* | ||
2631 | * If this is a nested plug, don't actually assign it. It will be | ||
2632 | * flushed on its own. | ||
2633 | */ | ||
2634 | if (!tsk->plug) { | ||
2635 | /* | ||
2636 | * Store ordering should not be needed here, since a potential | ||
2637 | * preempt will imply a full memory barrier | ||
2638 | */ | ||
2639 | tsk->plug = plug; | ||
2640 | } | ||
2641 | } | ||
2642 | EXPORT_SYMBOL(blk_start_plug); | ||
2643 | |||
2644 | static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
2645 | { | ||
2646 | struct request *rqa = container_of(a, struct request, queuelist); | ||
2647 | struct request *rqb = container_of(b, struct request, queuelist); | ||
2648 | |||
2649 | return !(rqa->q <= rqb->q); | ||
2650 | } | ||
2651 | |||
2652 | /* | ||
2653 | * If 'from_schedule' is true, then postpone the dispatch of requests | ||
2654 | * until a safe kblockd context. We do this to avoid accidentally | ||
2655 | * large stack usage in driver dispatch, in places where the original | ||
2656 | * plugger did not intend it. | ||
2657 | */ | ||
2658 | static void queue_unplugged(struct request_queue *q, unsigned int depth, | ||
2659 | bool from_schedule) | ||
2660 | __releases(q->queue_lock) | ||
2661 | { | ||
2662 | trace_block_unplug(q, depth, !from_schedule); | ||
2663 | |||
2664 | /* | ||
2665 | * If we are punting this to kblockd, then we can safely drop | ||
2666 | * the queue_lock before waking kblockd (which needs to take | ||
2667 | * this lock). | ||
2668 | */ | ||
2669 | if (from_schedule) { | ||
2670 | spin_unlock(q->queue_lock); | ||
2671 | blk_run_queue_async(q); | ||
2672 | } else { | ||
2673 | __blk_run_queue(q); | ||
2674 | spin_unlock(q->queue_lock); | ||
2675 | } | ||
2676 | |||
2677 | } | ||
2678 | |||
2679 | static void flush_plug_callbacks(struct blk_plug *plug) | ||
2680 | { | ||
2681 | LIST_HEAD(callbacks); | ||
2682 | |||
2683 | if (list_empty(&plug->cb_list)) | ||
2684 | return; | ||
2685 | |||
2686 | list_splice_init(&plug->cb_list, &callbacks); | ||
2687 | |||
2688 | while (!list_empty(&callbacks)) { | ||
2689 | struct blk_plug_cb *cb = list_first_entry(&callbacks, | ||
2690 | struct blk_plug_cb, | ||
2691 | list); | ||
2692 | list_del(&cb->list); | ||
2693 | cb->callback(cb); | ||
2694 | } | ||
2695 | } | ||
2696 | |||
2697 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | ||
2698 | { | ||
2699 | struct request_queue *q; | ||
2700 | unsigned long flags; | ||
2701 | struct request *rq; | ||
2702 | LIST_HEAD(list); | ||
2703 | unsigned int depth; | ||
2704 | |||
2705 | BUG_ON(plug->magic != PLUG_MAGIC); | ||
2706 | |||
2707 | flush_plug_callbacks(plug); | ||
2708 | if (list_empty(&plug->list)) | ||
2709 | return; | ||
2710 | |||
2711 | list_splice_init(&plug->list, &list); | ||
2712 | |||
2713 | if (plug->should_sort) { | ||
2714 | list_sort(NULL, &list, plug_rq_cmp); | ||
2715 | plug->should_sort = 0; | ||
2716 | } | ||
2717 | |||
2718 | q = NULL; | ||
2719 | depth = 0; | ||
2720 | |||
2721 | /* | ||
2722 | * Save and disable interrupts here, to avoid doing it for every | ||
2723 | * queue lock we have to take. | ||
2724 | */ | ||
2725 | local_irq_save(flags); | ||
2726 | while (!list_empty(&list)) { | ||
2727 | rq = list_entry_rq(list.next); | ||
2728 | list_del_init(&rq->queuelist); | ||
2729 | BUG_ON(!rq->q); | ||
2730 | if (rq->q != q) { | ||
2731 | /* | ||
2732 | * This drops the queue lock | ||
2733 | */ | ||
2734 | if (q) | ||
2735 | queue_unplugged(q, depth, from_schedule); | ||
2736 | q = rq->q; | ||
2737 | depth = 0; | ||
2738 | spin_lock(q->queue_lock); | ||
2739 | } | ||
2740 | /* | ||
2741 | * rq is already accounted, so use raw insert | ||
2742 | */ | ||
2743 | if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
2744 | __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); | ||
2745 | else | ||
2746 | __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); | ||
2747 | |||
2748 | depth++; | ||
2749 | } | ||
2750 | |||
2751 | /* | ||
2752 | * This drops the queue lock | ||
2753 | */ | ||
2754 | if (q) | ||
2755 | queue_unplugged(q, depth, from_schedule); | ||
2756 | |||
2757 | local_irq_restore(flags); | ||
2758 | } | ||
2759 | |||
2760 | void blk_finish_plug(struct blk_plug *plug) | ||
2761 | { | ||
2762 | blk_flush_plug_list(plug, false); | ||
2763 | |||
2764 | if (plug == current->plug) | ||
2765 | current->plug = NULL; | ||
2766 | } | ||
2767 | EXPORT_SYMBOL(blk_finish_plug); | ||
2768 | |||
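The plug API added above is driven entirely by the submitter. A minimal caller-side sketch of the intended usage follows; the helper name and the bio batch are illustrative assumptions, not part of this change:

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /* Sketch: queue a batch of bios behind an on-stack plug so they are
     * collected on current->plug, merged/sorted, and dispatched together
     * when the plug is finished. */
    static void submit_read_batch(struct bio **bios, int nr)
    {
    	struct blk_plug plug;
    	int i;

    	blk_start_plug(&plug);		/* requests now collect on current->plug */
    	for (i = 0; i < nr; i++)
    		submit_bio(READ, bios[i]);
    	blk_finish_plug(&plug);		/* flush the plug list to the queue(s) */
    }

Nesting is allowed: blk_start_plug() only assigns the outermost plug to current->plug, so requests issued inside a nested plug still collect on the outer one.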
2582 | int __init blk_dev_init(void) | 2769 | int __init blk_dev_init(void) |
2583 | { | 2770 | { |
2584 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * | 2771 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * |
2585 | sizeof(((struct request *)0)->cmd_flags)); | 2772 | sizeof(((struct request *)0)->cmd_flags)); |
2586 | 2773 | ||
2587 | kblockd_workqueue = create_workqueue("kblockd"); | 2774 | /* used for unplugging and affects IO latency/throughput - HIGHPRI */ |
2775 | kblockd_workqueue = alloc_workqueue("kblockd", | ||
2776 | WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | ||
2588 | if (!kblockd_workqueue) | 2777 | if (!kblockd_workqueue) |
2589 | panic("Failed to create kblockd\n"); | 2778 | panic("Failed to create kblockd\n"); |
2590 | 2779 | ||
diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..8a0e7ec056e7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -54,9 +54,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
54 | rq->end_io = done; | 54 | rq->end_io = done; |
55 | WARN_ON(irqs_disabled()); | 55 | WARN_ON(irqs_disabled()); |
56 | spin_lock_irq(q->queue_lock); | 56 | spin_lock_irq(q->queue_lock); |
57 | __elv_add_request(q, rq, where, 1); | 57 | __elv_add_request(q, rq, where); |
58 | __generic_unplug_device(q); | 58 | __blk_run_queue(q); |
59 | /* the queue is stopped so it won't be plugged+unplugged */ | 59 | /* the queue is stopped so it won't be run */ |
60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) | 60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) |
61 | q->request_fn(q); | 61 | q->request_fn(q); |
62 | spin_unlock_irq(q->queue_lock); | 62 | spin_unlock_irq(q->queue_lock); |
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
80 | DECLARE_COMPLETION_ONSTACK(wait); | 80 | DECLARE_COMPLETION_ONSTACK(wait); |
81 | char sense[SCSI_SENSE_BUFFERSIZE]; | 81 | char sense[SCSI_SENSE_BUFFERSIZE]; |
82 | int err = 0; | 82 | int err = 0; |
83 | unsigned long hang_check; | ||
83 | 84 | ||
84 | /* | 85 | /* |
85 | * we need an extra reference to the request, so we can look at | 86 | * we need an extra reference to the request, so we can look at |
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
95 | 96 | ||
96 | rq->end_io_data = &wait; | 97 | rq->end_io_data = &wait; |
97 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); | 98 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); |
98 | wait_for_completion(&wait); | 99 | |
100 | /* Prevent hang_check timer from firing at us during very long I/O */ | ||
101 | hang_check = sysctl_hung_task_timeout_secs; | ||
102 | if (hang_check) | ||
103 | while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); | ||
104 | else | ||
105 | wait_for_completion(&wait); | ||
99 | 106 | ||
100 | if (rq->errors) | 107 | if (rq->errors) |
101 | err = -EIO; | 108 | err = -EIO; |
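One detail worth noting in the blk_execute_rq() change above: sysctl_hung_task_timeout_secs is the hung-task watchdog threshold in seconds, so sleeping in chunks of hang_check * (HZ/2) jiffies, i.e. half that interval, keeps the waiting task from ever appearing blocked for a full detection window during a legitimately long command; when the sysctl is zero the watchdog is disabled and a plain wait_for_completion() is used.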
diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 000000000000..bb21e4c36f70 --- /dev/null +++ b/block/blk-flush.c | |||
@@ -0,0 +1,443 @@ | |||
1 | /* | ||
2 | * Functions to sequence FLUSH and FUA writes. | ||
3 | * | ||
4 | * Copyright (C) 2011 Max Planck Institute for Gravitational Physics | ||
5 | * Copyright (C) 2011 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three | ||
10 | * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request | ||
11 | * properties and hardware capability. | ||
12 | * | ||
13 | * If a request doesn't have data, only REQ_FLUSH makes sense, which | ||
14 | * indicates a simple flush request. If there is data, REQ_FLUSH indicates | ||
15 | * that the device cache should be flushed before the data is executed, and | ||
16 | * REQ_FUA means that the data must be on non-volatile media on request | ||
17 | * completion. | ||
18 | * | ||
19 | * If the device doesn't have writeback cache, FLUSH and FUA don't make any | ||
20 | * difference. The requests are either completed immediately if there's no | ||
21 | * data or executed as normal requests otherwise. | ||
22 | * | ||
23 | * If the device has writeback cache and supports FUA, REQ_FLUSH is | ||
24 | * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. | ||
25 | * | ||
26 | * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is | ||
27 | * translated to PREFLUSH and REQ_FUA to POSTFLUSH. | ||
28 | * | ||
29 | * The actual execution of flush is double buffered. Whenever a request | ||
30 | * needs to execute PRE or POSTFLUSH, it queues at | ||
31 | * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a | ||
32 | * flush is issued and the pending_idx is toggled. When the flush | ||
33 | * completes, all the requests which were pending are advanced to the next | ||
34 | * step. This allows arbitrary merging of different types of FLUSH/FUA | ||
35 | * requests. | ||
36 | * | ||
37 | * Currently, the following conditions are used to determine when to issue | ||
38 | * flush. | ||
39 | * | ||
40 | * C1. At any given time, only one flush shall be in progress. This makes | ||
41 | * double buffering sufficient. | ||
42 | * | ||
43 | * C2. Flush is deferred if any request is executing DATA of its sequence. | ||
44 | * This avoids issuing separate POSTFLUSHes for requests which shared | ||
45 | * PREFLUSH. | ||
46 | * | ||
47 | * C3. The second condition is ignored if there is a request which has | ||
48 | * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid | ||
49 | * starvation in the unlikely case where there are continuous stream of | ||
50 | * FUA (without FLUSH) requests. | ||
51 | * | ||
52 | * For devices which support FUA, it isn't clear whether C2 (and thus C3) | ||
53 | * is beneficial. | ||
54 | * | ||
55 | * Note that a sequenced FLUSH/FUA request with DATA is completed twice. | ||
56 | * Once while executing DATA and again after the whole sequence is | ||
57 | * complete. The first completion updates the contained bio but doesn't | ||
58 | * finish it so that the bio submitter is notified only after the whole | ||
59 | * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in | ||
60 | * req_bio_endio(). | ||
61 | * | ||
62 | * The above peculiarity requires that each FLUSH/FUA request has only one | ||
63 | * bio attached to it, which is guaranteed as they aren't allowed to be | ||
64 | * merged in the usual way. | ||
65 | */ | ||
66 | |||
67 | #include <linux/kernel.h> | ||
68 | #include <linux/module.h> | ||
69 | #include <linux/bio.h> | ||
70 | #include <linux/blkdev.h> | ||
71 | #include <linux/gfp.h> | ||
72 | |||
73 | #include "blk.h" | ||
74 | |||
75 | /* FLUSH/FUA sequences */ | ||
76 | enum { | ||
77 | REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ | ||
78 | REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ | ||
79 | REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ | ||
80 | REQ_FSEQ_DONE = (1 << 3), | ||
81 | |||
82 | REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | | ||
83 | REQ_FSEQ_POSTFLUSH, | ||
84 | |||
85 | /* | ||
86 | * If flush has been pending longer than the following timeout, | ||
87 | * it's issued even if flush_data requests are still in flight. | ||
88 | */ | ||
89 | FLUSH_PENDING_TIMEOUT = 5 * HZ, | ||
90 | }; | ||
91 | |||
92 | static bool blk_kick_flush(struct request_queue *q); | ||
93 | |||
94 | static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) | ||
95 | { | ||
96 | unsigned int policy = 0; | ||
97 | |||
98 | if (fflags & REQ_FLUSH) { | ||
99 | if (rq->cmd_flags & REQ_FLUSH) | ||
100 | policy |= REQ_FSEQ_PREFLUSH; | ||
101 | if (blk_rq_sectors(rq)) | ||
102 | policy |= REQ_FSEQ_DATA; | ||
103 | if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) | ||
104 | policy |= REQ_FSEQ_POSTFLUSH; | ||
105 | } | ||
106 | return policy; | ||
107 | } | ||
108 | |||
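To make blk_flush_policy() concrete: on a queue whose flush_flags advertise REQ_FLUSH but not REQ_FUA (writeback cache, no FUA support), an empty REQ_FLUSH bio resolves to REQ_FSEQ_PREFLUSH alone, a data write marked REQ_FLUSH|REQ_FUA resolves to PREFLUSH | DATA | POSTFLUSH, and a data write marked only REQ_FUA resolves to DATA | POSTFLUSH. If the queue advertises REQ_FUA as well, the POSTFLUSH step is dropped and REQ_FUA is passed down with the data, as described in the header comment.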
109 | static unsigned int blk_flush_cur_seq(struct request *rq) | ||
110 | { | ||
111 | return 1 << ffz(rq->flush.seq); | ||
112 | } | ||
113 | |||
114 | static void blk_flush_restore_request(struct request *rq) | ||
115 | { | ||
116 | /* | ||
117 | * After flush data completion, @rq->bio is %NULL but we need to | ||
118 | * complete the bio again. @rq->biotail is guaranteed to equal the | ||
119 | * original @rq->bio. Restore it. | ||
120 | */ | ||
121 | rq->bio = rq->biotail; | ||
122 | |||
123 | /* make @rq a normal request */ | ||
124 | rq->cmd_flags &= ~REQ_FLUSH_SEQ; | ||
125 | rq->end_io = NULL; | ||
126 | } | ||
127 | |||
128 | /** | ||
129 | * blk_flush_complete_seq - complete flush sequence | ||
130 | * @rq: FLUSH/FUA request being sequenced | ||
131 | * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) | ||
132 | * @error: whether an error occurred | ||
133 | * | ||
134 | * @rq just completed @seq part of its flush sequence, record the | ||
135 | * completion and trigger the next step. | ||
136 | * | ||
137 | * CONTEXT: | ||
138 | * spin_lock_irq(q->queue_lock) | ||
139 | * | ||
140 | * RETURNS: | ||
141 | * %true if requests were added to the dispatch queue, %false otherwise. | ||
142 | */ | ||
143 | static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | ||
144 | int error) | ||
145 | { | ||
146 | struct request_queue *q = rq->q; | ||
147 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | ||
148 | bool queued = false; | ||
149 | |||
150 | BUG_ON(rq->flush.seq & seq); | ||
151 | rq->flush.seq |= seq; | ||
152 | |||
153 | if (likely(!error)) | ||
154 | seq = blk_flush_cur_seq(rq); | ||
155 | else | ||
156 | seq = REQ_FSEQ_DONE; | ||
157 | |||
158 | switch (seq) { | ||
159 | case REQ_FSEQ_PREFLUSH: | ||
160 | case REQ_FSEQ_POSTFLUSH: | ||
161 | /* queue for flush */ | ||
162 | if (list_empty(pending)) | ||
163 | q->flush_pending_since = jiffies; | ||
164 | list_move_tail(&rq->flush.list, pending); | ||
165 | break; | ||
166 | |||
167 | case REQ_FSEQ_DATA: | ||
168 | list_move_tail(&rq->flush.list, &q->flush_data_in_flight); | ||
169 | list_add(&rq->queuelist, &q->queue_head); | ||
170 | queued = true; | ||
171 | break; | ||
172 | |||
173 | case REQ_FSEQ_DONE: | ||
174 | /* | ||
175 | * @rq was previously adjusted by blk_flush_issue() for | ||
176 | * flush sequencing and may already have gone through the | ||
177 | * flush data request completion path. Restore @rq for | ||
178 | * normal completion and end it. | ||
179 | */ | ||
180 | BUG_ON(!list_empty(&rq->queuelist)); | ||
181 | list_del_init(&rq->flush.list); | ||
182 | blk_flush_restore_request(rq); | ||
183 | __blk_end_request_all(rq, error); | ||
184 | break; | ||
185 | |||
186 | default: | ||
187 | BUG(); | ||
188 | } | ||
189 | |||
190 | return blk_kick_flush(q) | queued; | ||
191 | } | ||
192 | |||
193 | static void flush_end_io(struct request *flush_rq, int error) | ||
194 | { | ||
195 | struct request_queue *q = flush_rq->q; | ||
196 | struct list_head *running = &q->flush_queue[q->flush_running_idx]; | ||
197 | bool queued = false; | ||
198 | struct request *rq, *n; | ||
199 | |||
200 | BUG_ON(q->flush_pending_idx == q->flush_running_idx); | ||
201 | |||
202 | /* account completion of the flush request */ | ||
203 | q->flush_running_idx ^= 1; | ||
204 | elv_completed_request(q, flush_rq); | ||
205 | |||
206 | /* and push the waiting requests to the next stage */ | ||
207 | list_for_each_entry_safe(rq, n, running, flush.list) { | ||
208 | unsigned int seq = blk_flush_cur_seq(rq); | ||
209 | |||
210 | BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); | ||
211 | queued |= blk_flush_complete_seq(rq, seq, error); | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Kick the queue to avoid stall for two cases: | ||
216 | * 1. Moving a request silently to empty queue_head may stall the | ||
217 | * queue. | ||
218 | * 2. When a flush request is running on a non-queueable queue, the | ||
219 | * queue is held. Restart the queue after the flush request has | ||
220 | * finished to avoid a stall. | ||
221 | * This function is called from request completion path and calling | ||
222 | * directly into request_fn may confuse the driver. Always use | ||
223 | * kblockd. | ||
224 | */ | ||
225 | if (queued || q->flush_queue_delayed) | ||
226 | blk_run_queue_async(q); | ||
227 | q->flush_queue_delayed = 0; | ||
228 | } | ||
229 | |||
230 | /** | ||
231 | * blk_kick_flush - consider issuing flush request | ||
232 | * @q: request_queue being kicked | ||
233 | * | ||
234 | * Flush related states of @q have changed, consider issuing flush request. | ||
235 | * Please read the comment at the top of this file for more info. | ||
236 | * | ||
237 | * CONTEXT: | ||
238 | * spin_lock_irq(q->queue_lock) | ||
239 | * | ||
240 | * RETURNS: | ||
241 | * %true if flush was issued, %false otherwise. | ||
242 | */ | ||
243 | static bool blk_kick_flush(struct request_queue *q) | ||
244 | { | ||
245 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | ||
246 | struct request *first_rq = | ||
247 | list_first_entry(pending, struct request, flush.list); | ||
248 | |||
249 | /* C1 described at the top of this file */ | ||
250 | if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) | ||
251 | return false; | ||
252 | |||
253 | /* C2 and C3 */ | ||
254 | if (!list_empty(&q->flush_data_in_flight) && | ||
255 | time_before(jiffies, | ||
256 | q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) | ||
257 | return false; | ||
258 | |||
259 | /* | ||
260 | * Issue flush and toggle pending_idx. This makes pending_idx | ||
261 | * different from running_idx, which means flush is in flight. | ||
262 | */ | ||
263 | blk_rq_init(q, &q->flush_rq); | ||
264 | q->flush_rq.cmd_type = REQ_TYPE_FS; | ||
265 | q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; | ||
266 | q->flush_rq.rq_disk = first_rq->rq_disk; | ||
267 | q->flush_rq.end_io = flush_end_io; | ||
268 | |||
269 | q->flush_pending_idx ^= 1; | ||
270 | list_add_tail(&q->flush_rq.queuelist, &q->queue_head); | ||
271 | return true; | ||
272 | } | ||
273 | |||
274 | static void flush_data_end_io(struct request *rq, int error) | ||
275 | { | ||
276 | struct request_queue *q = rq->q; | ||
277 | |||
278 | /* | ||
279 | * After populating an empty queue, kick it to avoid stall. Read | ||
280 | * the comment in flush_end_io(). | ||
281 | */ | ||
282 | if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) | ||
283 | blk_run_queue_async(q); | ||
284 | } | ||
285 | |||
286 | /** | ||
287 | * blk_insert_flush - insert a new FLUSH/FUA request | ||
288 | * @rq: request to insert | ||
289 | * | ||
290 | * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. | ||
291 | * @rq is being submitted. Analyze what needs to be done and put it on the | ||
292 | * right queue. | ||
293 | * | ||
294 | * CONTEXT: | ||
295 | * spin_lock_irq(q->queue_lock) | ||
296 | */ | ||
297 | void blk_insert_flush(struct request *rq) | ||
298 | { | ||
299 | struct request_queue *q = rq->q; | ||
300 | unsigned int fflags = q->flush_flags; /* may change, cache */ | ||
301 | unsigned int policy = blk_flush_policy(fflags, rq); | ||
302 | |||
303 | BUG_ON(rq->end_io); | ||
304 | BUG_ON(!rq->bio || rq->bio != rq->biotail); | ||
305 | |||
306 | /* | ||
307 | * @policy now records what operations need to be done. Adjust | ||
308 | * REQ_FLUSH and FUA for the driver. | ||
309 | */ | ||
310 | rq->cmd_flags &= ~REQ_FLUSH; | ||
311 | if (!(fflags & REQ_FUA)) | ||
312 | rq->cmd_flags &= ~REQ_FUA; | ||
313 | |||
314 | /* | ||
315 | * If there's data but flush is not necessary, the request can be | ||
316 | * processed directly without going through flush machinery. Queue | ||
317 | * for normal execution. | ||
318 | */ | ||
319 | if ((policy & REQ_FSEQ_DATA) && | ||
320 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | ||
321 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
322 | return; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * @rq should go through flush machinery. Mark it part of flush | ||
327 | * sequence and submit for further processing. | ||
328 | */ | ||
329 | memset(&rq->flush, 0, sizeof(rq->flush)); | ||
330 | INIT_LIST_HEAD(&rq->flush.list); | ||
331 | rq->cmd_flags |= REQ_FLUSH_SEQ; | ||
332 | rq->end_io = flush_data_end_io; | ||
333 | |||
334 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * blk_abort_flushes - @q is being aborted, abort flush requests | ||
339 | * @q: request_queue being aborted | ||
340 | * | ||
341 | * To be called from elv_abort_queue(). @q is being aborted. Prepare all | ||
342 | * FLUSH/FUA requests for abortion. | ||
343 | * | ||
344 | * CONTEXT: | ||
345 | * spin_lock_irq(q->queue_lock) | ||
346 | */ | ||
347 | void blk_abort_flushes(struct request_queue *q) | ||
348 | { | ||
349 | struct request *rq, *n; | ||
350 | int i; | ||
351 | |||
352 | /* | ||
353 | * Requests in flight for data are already owned by the dispatch | ||
354 | * queue or the device driver. Just restore for normal completion. | ||
355 | */ | ||
356 | list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { | ||
357 | list_del_init(&rq->flush.list); | ||
358 | blk_flush_restore_request(rq); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * We need to give away requests on flush queues. Restore for | ||
363 | * normal completion and put them on the dispatch queue. | ||
364 | */ | ||
365 | for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { | ||
366 | list_for_each_entry_safe(rq, n, &q->flush_queue[i], | ||
367 | flush.list) { | ||
368 | list_del_init(&rq->flush.list); | ||
369 | blk_flush_restore_request(rq); | ||
370 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
371 | } | ||
372 | } | ||
373 | } | ||
374 | |||
375 | static void bio_end_flush(struct bio *bio, int err) | ||
376 | { | ||
377 | if (err) | ||
378 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
379 | if (bio->bi_private) | ||
380 | complete(bio->bi_private); | ||
381 | bio_put(bio); | ||
382 | } | ||
383 | |||
384 | /** | ||
385 | * blkdev_issue_flush - queue a flush | ||
386 | * @bdev: blockdev to issue flush for | ||
387 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
388 | * @error_sector: error sector | ||
389 | * | ||
390 | * Description: | ||
391 | * Issue a flush for the block device in question. Caller can supply | ||
392 | * room for storing the error offset in case of a flush error, if they | ||
393 | * wish to. The function submits an empty flush bio and waits for it | ||
394 | * to complete before returning. | ||
395 | */ | ||
396 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | ||
397 | sector_t *error_sector) | ||
398 | { | ||
399 | DECLARE_COMPLETION_ONSTACK(wait); | ||
400 | struct request_queue *q; | ||
401 | struct bio *bio; | ||
402 | int ret = 0; | ||
403 | |||
404 | if (bdev->bd_disk == NULL) | ||
405 | return -ENXIO; | ||
406 | |||
407 | q = bdev_get_queue(bdev); | ||
408 | if (!q) | ||
409 | return -ENXIO; | ||
410 | |||
411 | /* | ||
412 | * some block devices may not have their queue correctly set up here | ||
413 | * (e.g. loop device without a backing file) and so issuing a flush | ||
414 | * here will panic. Ensure there is a request function before issuing | ||
415 | * the flush. | ||
416 | */ | ||
417 | if (!q->make_request_fn) | ||
418 | return -ENXIO; | ||
419 | |||
420 | bio = bio_alloc(gfp_mask, 0); | ||
421 | bio->bi_end_io = bio_end_flush; | ||
422 | bio->bi_bdev = bdev; | ||
423 | bio->bi_private = &wait; | ||
424 | |||
425 | bio_get(bio); | ||
426 | submit_bio(WRITE_FLUSH, bio); | ||
427 | wait_for_completion(&wait); | ||
428 | |||
429 | /* | ||
430 | * The driver must store the error location in ->bi_sector, if | ||
431 | * it supports it. For non-stacked drivers, this should be | ||
432 | * copied from blk_rq_pos(rq). | ||
433 | */ | ||
434 | if (error_sector) | ||
435 | *error_sector = bio->bi_sector; | ||
436 | |||
437 | if (!bio_flagged(bio, BIO_UPTODATE)) | ||
438 | ret = -EIO; | ||
439 | |||
440 | bio_put(bio); | ||
441 | return ret; | ||
442 | } | ||
443 | EXPORT_SYMBOL(blkdev_issue_flush); | ||
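For reference, a typical caller of the export above looks like the sketch below; the wrapper name and the decision to ignore the error sector are assumptions, not part of this change:

    #include <linux/blkdev.h>
    #include <linux/gfp.h>

    /* Sketch: flush a device's volatile write cache, e.g. from a
     * filesystem's fsync path; returns 0 on success or a negative errno
     * (-EIO, -ENXIO) from the helper. */
    static int flush_backing_dev(struct block_device *bdev)
    {
    	/* Pass NULL since we do not care where a failed flush stopped. */
    	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
    }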
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..129b9e209a3b 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -30,26 +30,41 @@ | |||
30 | 30 | ||
31 | static struct kmem_cache *integrity_cachep; | 31 | static struct kmem_cache *integrity_cachep; |
32 | 32 | ||
33 | static const char *bi_unsupported_name = "unsupported"; | ||
34 | |||
33 | /** | 35 | /** |
34 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements | 36 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements |
35 | * @rq: request with integrity metadata attached | 37 | * @q: request queue |
38 | * @bio: bio with integrity metadata attached | ||
36 | * | 39 | * |
37 | * Description: Returns the number of elements required in a | 40 | * Description: Returns the number of elements required in a |
38 | * scatterlist corresponding to the integrity metadata in a request. | 41 | * scatterlist corresponding to the integrity metadata in a bio. |
39 | */ | 42 | */ |
40 | int blk_rq_count_integrity_sg(struct request *rq) | 43 | int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) |
41 | { | 44 | { |
42 | struct bio_vec *iv, *ivprv; | 45 | struct bio_vec *iv, *ivprv = NULL; |
43 | struct req_iterator iter; | 46 | unsigned int segments = 0; |
44 | unsigned int segments; | 47 | unsigned int seg_size = 0; |
48 | unsigned int i = 0; | ||
49 | |||
50 | bio_for_each_integrity_vec(iv, bio, i) { | ||
51 | |||
52 | if (ivprv) { | ||
53 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | ||
54 | goto new_segment; | ||
45 | 55 | ||
46 | ivprv = NULL; | 56 | if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) |
47 | segments = 0; | 57 | goto new_segment; |
48 | 58 | ||
49 | rq_for_each_integrity_segment(iv, rq, iter) { | 59 | if (seg_size + iv->bv_len > queue_max_segment_size(q)) |
60 | goto new_segment; | ||
50 | 61 | ||
51 | if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | 62 | seg_size += iv->bv_len; |
63 | } else { | ||
64 | new_segment: | ||
52 | segments++; | 65 | segments++; |
66 | seg_size = iv->bv_len; | ||
67 | } | ||
53 | 68 | ||
54 | ivprv = iv; | 69 | ivprv = iv; |
55 | } | 70 | } |
@@ -60,30 +75,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); | |||
60 | 75 | ||
61 | /** | 76 | /** |
62 | * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist | 77 | * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist |
63 | * @rq: request with integrity metadata attached | 78 | * @q: request queue |
79 | * @bio: bio with integrity metadata attached | ||
64 | * @sglist: target scatterlist | 80 | * @sglist: target scatterlist |
65 | * | 81 | * |
66 | * Description: Map the integrity vectors in request into a | 82 | * Description: Map the integrity vectors in request into a |
67 | * scatterlist. The scatterlist must be big enough to hold all | 83 | * scatterlist. The scatterlist must be big enough to hold all |
68 | * elements. I.e. sized using blk_rq_count_integrity_sg(). | 84 | * elements. I.e. sized using blk_rq_count_integrity_sg(). |
69 | */ | 85 | */ |
70 | int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) | 86 | int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, |
87 | struct scatterlist *sglist) | ||
71 | { | 88 | { |
72 | struct bio_vec *iv, *ivprv; | 89 | struct bio_vec *iv, *ivprv = NULL; |
73 | struct req_iterator iter; | 90 | struct scatterlist *sg = NULL; |
74 | struct scatterlist *sg; | 91 | unsigned int segments = 0; |
75 | unsigned int segments; | 92 | unsigned int i = 0; |
76 | |||
77 | ivprv = NULL; | ||
78 | sg = NULL; | ||
79 | segments = 0; | ||
80 | 93 | ||
81 | rq_for_each_integrity_segment(iv, rq, iter) { | 94 | bio_for_each_integrity_vec(iv, bio, i) { |
82 | 95 | ||
83 | if (ivprv) { | 96 | if (ivprv) { |
84 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | 97 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) |
85 | goto new_segment; | 98 | goto new_segment; |
86 | 99 | ||
100 | if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) | ||
101 | goto new_segment; | ||
102 | |||
103 | if (sg->length + iv->bv_len > queue_max_segment_size(q)) | ||
104 | goto new_segment; | ||
105 | |||
87 | sg->length += iv->bv_len; | 106 | sg->length += iv->bv_len; |
88 | } else { | 107 | } else { |
89 | new_segment: | 108 | new_segment: |
@@ -162,6 +181,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) | |||
162 | } | 181 | } |
163 | EXPORT_SYMBOL(blk_integrity_compare); | 182 | EXPORT_SYMBOL(blk_integrity_compare); |
164 | 183 | ||
184 | int blk_integrity_merge_rq(struct request_queue *q, struct request *req, | ||
185 | struct request *next) | ||
186 | { | ||
187 | if (blk_integrity_rq(req) != blk_integrity_rq(next)) | ||
188 | return -1; | ||
189 | |||
190 | if (req->nr_integrity_segments + next->nr_integrity_segments > | ||
191 | q->limits.max_integrity_segments) | ||
192 | return -1; | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | EXPORT_SYMBOL(blk_integrity_merge_rq); | ||
197 | |||
198 | int blk_integrity_merge_bio(struct request_queue *q, struct request *req, | ||
199 | struct bio *bio) | ||
200 | { | ||
201 | int nr_integrity_segs; | ||
202 | struct bio *next = bio->bi_next; | ||
203 | |||
204 | bio->bi_next = NULL; | ||
205 | nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); | ||
206 | bio->bi_next = next; | ||
207 | |||
208 | if (req->nr_integrity_segments + nr_integrity_segs > | ||
209 | q->limits.max_integrity_segments) | ||
210 | return -1; | ||
211 | |||
212 | req->nr_integrity_segments += nr_integrity_segs; | ||
213 | |||
214 | return 0; | ||
215 | } | ||
216 | EXPORT_SYMBOL(blk_integrity_merge_bio); | ||
217 | |||
165 | struct integrity_sysfs_entry { | 218 | struct integrity_sysfs_entry { |
166 | struct attribute attr; | 219 | struct attribute attr; |
167 | ssize_t (*show)(struct blk_integrity *, char *); | 220 | ssize_t (*show)(struct blk_integrity *, char *); |
@@ -307,6 +360,14 @@ static struct kobj_type integrity_ktype = { | |||
307 | .release = blk_integrity_release, | 360 | .release = blk_integrity_release, |
308 | }; | 361 | }; |
309 | 362 | ||
363 | bool blk_integrity_is_initialized(struct gendisk *disk) | ||
364 | { | ||
365 | struct blk_integrity *bi = blk_get_integrity(disk); | ||
366 | |||
367 | return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); | ||
368 | } | ||
369 | EXPORT_SYMBOL(blk_integrity_is_initialized); | ||
370 | |||
310 | /** | 371 | /** |
311 | * blk_integrity_register - Register a gendisk as being integrity-capable | 372 | * blk_integrity_register - Register a gendisk as being integrity-capable |
312 | * @disk: struct gendisk pointer to make integrity-aware | 373 | * @disk: struct gendisk pointer to make integrity-aware |
@@ -356,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) | |||
356 | bi->get_tag_fn = template->get_tag_fn; | 417 | bi->get_tag_fn = template->get_tag_fn; |
357 | bi->tag_size = template->tag_size; | 418 | bi->tag_size = template->tag_size; |
358 | } else | 419 | } else |
359 | bi->name = "unsupported"; | 420 | bi->name = bi_unsupported_name; |
360 | 421 | ||
361 | return 0; | 422 | return 0; |
362 | } | 423 | } |
@@ -381,7 +442,6 @@ void blk_integrity_unregister(struct gendisk *disk) | |||
381 | kobject_uevent(&bi->kobj, KOBJ_REMOVE); | 442 | kobject_uevent(&bi->kobj, KOBJ_REMOVE); |
382 | kobject_del(&bi->kobj); | 443 | kobject_del(&bi->kobj); |
383 | kobject_put(&bi->kobj); | 444 | kobject_put(&bi->kobj); |
384 | kmem_cache_free(integrity_cachep, bi); | ||
385 | disk->integrity = NULL; | 445 | disk->integrity = NULL; |
386 | } | 446 | } |
387 | EXPORT_SYMBOL(blk_integrity_unregister); | 447 | EXPORT_SYMBOL(blk_integrity_unregister); |
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c55c406..342eae9b0d3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
@@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc) | |||
21 | if (!hlist_empty(&ioc->cic_list)) { | 21 | if (!hlist_empty(&ioc->cic_list)) { |
22 | struct cfq_io_context *cic; | 22 | struct cfq_io_context *cic; |
23 | 23 | ||
24 | cic = list_entry(ioc->cic_list.first, struct cfq_io_context, | 24 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
25 | cic_list); | 25 | cic_list); |
26 | cic->dtor(ioc); | 26 | cic->dtor(ioc); |
27 | } | 27 | } |
@@ -57,14 +57,14 @@ static void cfq_exit(struct io_context *ioc) | |||
57 | if (!hlist_empty(&ioc->cic_list)) { | 57 | if (!hlist_empty(&ioc->cic_list)) { |
58 | struct cfq_io_context *cic; | 58 | struct cfq_io_context *cic; |
59 | 59 | ||
60 | cic = list_entry(ioc->cic_list.first, struct cfq_io_context, | 60 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
61 | cic_list); | 61 | cic_list); |
62 | cic->exit(ioc); | 62 | cic->exit(ioc); |
63 | } | 63 | } |
64 | rcu_read_unlock(); | 64 | rcu_read_unlock(); |
65 | } | 65 | } |
66 | 66 | ||
67 | /* Called by the exitting task */ | 67 | /* Called by the exiting task */ |
68 | void exit_io_context(struct task_struct *task) | 68 | void exit_io_context(struct task_struct *task) |
69 | { | 69 | { |
70 | struct io_context *ioc; | 70 | struct io_context *ioc; |
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task) | |||
74 | task->io_context = NULL; | 74 | task->io_context = NULL; |
75 | task_unlock(task); | 75 | task_unlock(task); |
76 | 76 | ||
77 | if (atomic_dec_and_test(&ioc->nr_tasks)) { | 77 | if (atomic_dec_and_test(&ioc->nr_tasks)) |
78 | cfq_exit(ioc); | 78 | cfq_exit(ioc); |
79 | 79 | ||
80 | } | ||
81 | put_io_context(ioc); | 80 | put_io_context(ioc); |
82 | } | 81 | } |
83 | 82 | ||
@@ -97,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) | |||
97 | INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); | 96 | INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); |
98 | INIT_HLIST_HEAD(&ret->cic_list); | 97 | INIT_HLIST_HEAD(&ret->cic_list); |
99 | ret->ioc_data = NULL; | 98 | ret->ioc_data = NULL; |
99 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | ||
100 | ret->cgroup_changed = 0; | ||
101 | #endif | ||
100 | } | 102 | } |
101 | 103 | ||
102 | return ret; | 104 | return ret; |
@@ -153,20 +155,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) | |||
153 | } | 155 | } |
154 | EXPORT_SYMBOL(get_io_context); | 156 | EXPORT_SYMBOL(get_io_context); |
155 | 157 | ||
156 | void copy_io_context(struct io_context **pdst, struct io_context **psrc) | ||
157 | { | ||
158 | struct io_context *src = *psrc; | ||
159 | struct io_context *dst = *pdst; | ||
160 | |||
161 | if (src) { | ||
162 | BUG_ON(atomic_long_read(&src->refcount) == 0); | ||
163 | atomic_long_inc(&src->refcount); | ||
164 | put_io_context(dst); | ||
165 | *pdst = src; | ||
166 | } | ||
167 | } | ||
168 | EXPORT_SYMBOL(copy_io_context); | ||
169 | |||
170 | static int __init blk_ioc_init(void) | 158 | static int __init blk_ioc_init(void) |
171 | { | 159 | { |
172 | iocontext_cachep = kmem_cache_create("blkdev_ioc", | 160 | iocontext_cachep = kmem_cache_create("blkdev_ioc", |
diff --git a/block/blk-lib.c b/block/blk-lib.c index c392029a104e..78e627e2581d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -9,17 +9,20 @@ | |||
9 | 9 | ||
10 | #include "blk.h" | 10 | #include "blk.h" |
11 | 11 | ||
12 | static void blkdev_discard_end_io(struct bio *bio, int err) | 12 | struct bio_batch { |
13 | { | 13 | atomic_t done; |
14 | if (err) { | 14 | unsigned long flags; |
15 | if (err == -EOPNOTSUPP) | 15 | struct completion *wait; |
16 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 16 | }; |
17 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
18 | } | ||
19 | 17 | ||
20 | if (bio->bi_private) | 18 | static void bio_batch_end_io(struct bio *bio, int err) |
21 | complete(bio->bi_private); | 19 | { |
20 | struct bio_batch *bb = bio->bi_private; | ||
22 | 21 | ||
22 | if (err && (err != -EOPNOTSUPP)) | ||
23 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
24 | if (atomic_dec_and_test(&bb->done)) | ||
25 | complete(bb->wait); | ||
23 | bio_put(bio); | 26 | bio_put(bio); |
24 | } | 27 | } |
25 | 28 | ||
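The done counter is deliberately biased: the submitter initialises it to 1, takes one extra reference per bio it issues, and only drops its own reference after the submission loop (see blkdev_issue_discard() below). That way an early completion can never reach zero and signal the waiter while bios are still being allocated and submitted.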
@@ -39,9 +42,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
39 | { | 42 | { |
40 | DECLARE_COMPLETION_ONSTACK(wait); | 43 | DECLARE_COMPLETION_ONSTACK(wait); |
41 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
42 | int type = flags & BLKDEV_IFL_BARRIER ? | 45 | int type = REQ_WRITE | REQ_DISCARD; |
43 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
44 | unsigned int max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
47 | struct bio_batch bb; | ||
45 | struct bio *bio; | 48 | struct bio *bio; |
46 | int ret = 0; | 49 | int ret = 0; |
47 | 50 | ||
@@ -62,13 +65,17 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
62 | max_discard_sectors &= ~(disc_sects - 1); | 65 | max_discard_sectors &= ~(disc_sects - 1); |
63 | } | 66 | } |
64 | 67 | ||
65 | if (flags & BLKDEV_IFL_SECURE) { | 68 | if (flags & BLKDEV_DISCARD_SECURE) { |
66 | if (!blk_queue_secdiscard(q)) | 69 | if (!blk_queue_secdiscard(q)) |
67 | return -EOPNOTSUPP; | 70 | return -EOPNOTSUPP; |
68 | type |= DISCARD_SECURE; | 71 | type |= REQ_SECURE; |
69 | } | 72 | } |
70 | 73 | ||
71 | while (nr_sects && !ret) { | 74 | atomic_set(&bb.done, 1); |
75 | bb.flags = 1 << BIO_UPTODATE; | ||
76 | bb.wait = &wait; | ||
77 | |||
78 | while (nr_sects) { | ||
72 | bio = bio_alloc(gfp_mask, 1); | 79 | bio = bio_alloc(gfp_mask, 1); |
73 | if (!bio) { | 80 | if (!bio) { |
74 | ret = -ENOMEM; | 81 | ret = -ENOMEM; |
@@ -76,10 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
76 | } | 83 | } |
77 | 84 | ||
78 | bio->bi_sector = sector; | 85 | bio->bi_sector = sector; |
79 | bio->bi_end_io = blkdev_discard_end_io; | 86 | bio->bi_end_io = bio_batch_end_io; |
80 | bio->bi_bdev = bdev; | 87 | bio->bi_bdev = bdev; |
81 | if (flags & BLKDEV_IFL_WAIT) | 88 | bio->bi_private = &bb; |
82 | bio->bi_private = &wait; | ||
83 | 89 | ||
84 | if (nr_sects > max_discard_sectors) { | 90 | if (nr_sects > max_discard_sectors) { |
85 | bio->bi_size = max_discard_sectors << 9; | 91 | bio->bi_size = max_discard_sectors << 9; |
@@ -90,85 +96,45 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
90 | nr_sects = 0; | 96 | nr_sects = 0; |
91 | } | 97 | } |
92 | 98 | ||
93 | bio_get(bio); | 99 | atomic_inc(&bb.done); |
94 | submit_bio(type, bio); | 100 | submit_bio(type, bio); |
101 | } | ||
95 | 102 | ||
96 | if (flags & BLKDEV_IFL_WAIT) | 103 | /* Wait for bios in-flight */ |
97 | wait_for_completion(&wait); | 104 | if (!atomic_dec_and_test(&bb.done)) |
105 | wait_for_completion(&wait); | ||
98 | 106 | ||
99 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 107 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
100 | ret = -EOPNOTSUPP; | 108 | ret = -EIO; |
101 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
102 | ret = -EIO; | ||
103 | bio_put(bio); | ||
104 | } | ||
105 | 109 | ||
106 | return ret; | 110 | return ret; |
107 | } | 111 | } |
108 | EXPORT_SYMBOL(blkdev_issue_discard); | 112 | EXPORT_SYMBOL(blkdev_issue_discard); |
109 | 113 | ||
110 | struct bio_batch | ||
111 | { | ||
112 | atomic_t done; | ||
113 | unsigned long flags; | ||
114 | struct completion *wait; | ||
115 | bio_end_io_t *end_io; | ||
116 | }; | ||
117 | |||
118 | static void bio_batch_end_io(struct bio *bio, int err) | ||
119 | { | ||
120 | struct bio_batch *bb = bio->bi_private; | ||
121 | |||
122 | if (err) { | ||
123 | if (err == -EOPNOTSUPP) | ||
124 | set_bit(BIO_EOPNOTSUPP, &bb->flags); | ||
125 | else | ||
126 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
127 | } | ||
128 | if (bb) { | ||
129 | if (bb->end_io) | ||
130 | bb->end_io(bio, err); | ||
131 | atomic_inc(&bb->done); | ||
132 | complete(bb->wait); | ||
133 | } | ||
134 | bio_put(bio); | ||
135 | } | ||
136 | |||
137 | /** | 114 | /** |
138 | * blkdev_issue_zeroout generate number of zero filed write bios | 115 | * blkdev_issue_zeroout - generate number of zero-filled write bios |
139 | * @bdev: blockdev to issue | 116 | * @bdev: blockdev to issue |
140 | * @sector: start sector | 117 | * @sector: start sector |
141 | * @nr_sects: number of sectors to write | 118 | * @nr_sects: number of sectors to write |
142 | * @gfp_mask: memory allocation flags (for bio_alloc) | 119 | * @gfp_mask: memory allocation flags (for bio_alloc) |
143 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
144 | * | 120 | * |
145 | * Description: | 121 | * Description: |
146 | * Generate and issue number of bios with zero-filled pages. | 122 | * Generate and issue number of bios with zero-filled pages. |
147 | * Send barrier at the beginning and at the end if requested. This guarantie | ||
148 | * correct request ordering. Empty barrier allow us to avoid post queue flush. | ||
149 | */ | 123 | */ |
150 | 124 | ||
151 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 125 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
152 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | 126 | sector_t nr_sects, gfp_t gfp_mask) |
153 | { | 127 | { |
154 | int ret; | 128 | int ret; |
155 | struct bio *bio; | 129 | struct bio *bio; |
156 | struct bio_batch bb; | 130 | struct bio_batch bb; |
157 | unsigned int sz, issued = 0; | 131 | unsigned int sz; |
158 | DECLARE_COMPLETION_ONSTACK(wait); | 132 | DECLARE_COMPLETION_ONSTACK(wait); |
159 | 133 | ||
160 | atomic_set(&bb.done, 0); | 134 | atomic_set(&bb.done, 1); |
161 | bb.flags = 1 << BIO_UPTODATE; | 135 | bb.flags = 1 << BIO_UPTODATE; |
162 | bb.wait = &wait; | 136 | bb.wait = &wait; |
163 | bb.end_io = NULL; | ||
164 | 137 | ||
165 | if (flags & BLKDEV_IFL_BARRIER) { | ||
166 | /* issue async barrier before the data */ | ||
167 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); | ||
168 | if (ret) | ||
169 | return ret; | ||
170 | } | ||
171 | submit: | ||
172 | ret = 0; | 138 | ret = 0; |
173 | while (nr_sects != 0) { | 139 | while (nr_sects != 0) { |
174 | bio = bio_alloc(gfp_mask, | 140 | bio = bio_alloc(gfp_mask, |
@@ -181,14 +147,10 @@ submit: | |||
181 | bio->bi_sector = sector; | 147 | bio->bi_sector = sector; |
182 | bio->bi_bdev = bdev; | 148 | bio->bi_bdev = bdev; |
183 | bio->bi_end_io = bio_batch_end_io; | 149 | bio->bi_end_io = bio_batch_end_io; |
184 | if (flags & BLKDEV_IFL_WAIT) | 150 | bio->bi_private = &bb; |
185 | bio->bi_private = &bb; | ||
186 | 151 | ||
187 | while (nr_sects != 0) { | 152 | while (nr_sects != 0) { |
188 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); | 153 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); |
189 | if (sz == 0) | ||
190 | /* bio has maximum size possible */ | ||
191 | break; | ||
192 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); | 154 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); |
193 | nr_sects -= ret >> 9; | 155 | nr_sects -= ret >> 9; |
194 | sector += ret >> 9; | 156 | sector += ret >> 9; |
@@ -196,36 +158,18 @@ submit: | |||
196 | break; | 158 | break; |
197 | } | 159 | } |
198 | ret = 0; | 160 | ret = 0; |
199 | issued++; | 161 | atomic_inc(&bb.done); |
200 | submit_bio(WRITE, bio); | 162 | submit_bio(WRITE, bio); |
201 | } | 163 | } |
202 | /* | ||
203 | * When all data bios are in flight. Send final barrier if requeted. | ||
204 | */ | ||
205 | if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) | ||
206 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, | ||
207 | flags & BLKDEV_IFL_WAIT); | ||
208 | |||
209 | 164 | ||
210 | if (flags & BLKDEV_IFL_WAIT) | 165 | /* Wait for bios in-flight */ |
211 | /* Wait for bios in-flight */ | 166 | if (!atomic_dec_and_test(&bb.done)) |
212 | while ( issued != atomic_read(&bb.done)) | 167 | wait_for_completion(&wait); |
213 | wait_for_completion(&wait); | ||
214 | 168 | ||
215 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 169 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
216 | /* One of bios in the batch was completed with error.*/ | 170 | /* One of bios in the batch was completed with error.*/ |
217 | ret = -EIO; | 171 | ret = -EIO; |
218 | 172 | ||
219 | if (ret) | ||
220 | goto out; | ||
221 | |||
222 | if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { | ||
223 | ret = -EOPNOTSUPP; | ||
224 | goto out; | ||
225 | } | ||
226 | if (nr_sects != 0) | ||
227 | goto submit; | ||
228 | out: | ||
229 | return ret; | 173 | return ret; |
230 | } | 174 | } |
231 | EXPORT_SYMBOL(blkdev_issue_zeroout); | 175 | EXPORT_SYMBOL(blkdev_issue_zeroout); |
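The blk-lib.c changes above replace the per-bio bio_get()/wait loop with a single counted completion: bb.done starts at 1 for the submitter itself, every submitted bio adds one, and whoever drops the count to zero wakes the waiter, so the submitter only sleeps if bios are still in flight once it has finished issuing. A minimal sketch of that counting pattern on its own (names are illustrative, not part of blk-lib.c):

    #include <linux/atomic.h>
    #include <linux/completion.h>

    struct batch {
            atomic_t done;          /* 1 for the submitter + 1 per item in flight */
            struct completion *wait;
    };

    /* called from each item's completion path */
    static void batch_item_done(struct batch *b)
    {
            if (atomic_dec_and_test(&b->done))
                    complete(b->wait);      /* last one out wakes the waiter */
    }

    static void batch_submit(int nr_items)
    {
            DECLARE_COMPLETION_ONSTACK(wait);
            struct batch b = { .wait = &wait };
            int i;

            atomic_set(&b.done, 1);         /* the submitter's own reference */
            for (i = 0; i < nr_items; i++) {
                    atomic_inc(&b.done);    /* one per submitted item */
                    /* submit item i; its completion calls batch_item_done(&b) */
            }
            /* drop our reference; sleep only if items are still outstanding */
            if (!atomic_dec_and_test(&b.done))
                    wait_for_completion(&wait);
    }

Starting the count at 1 is what lets the early-exit error path in the discard/zeroout loops (bio_alloc() failure, then break) fall straight through to the final decrement without any special casing.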
diff --git a/block/blk-map.c b/block/blk-map.c index ade0a08c9099..e663ac2d8e68 100644 --- a/block/blk-map.c +++ b/block/blk-map.c | |||
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, | |||
54 | * direct dma. else, set up kernel bounce buffers | 54 | * direct dma. else, set up kernel bounce buffers |
55 | */ | 55 | */ |
56 | uaddr = (unsigned long) ubuf; | 56 | uaddr = (unsigned long) ubuf; |
57 | if (blk_rq_aligned(q, ubuf, len) && !map_data) | 57 | if (blk_rq_aligned(q, uaddr, len) && !map_data) |
58 | bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); | 58 | bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); |
59 | else | 59 | else |
60 | bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); | 60 | bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); |
@@ -201,6 +201,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, | |||
201 | for (i = 0; i < iov_count; i++) { | 201 | for (i = 0; i < iov_count; i++) { |
202 | unsigned long uaddr = (unsigned long)iov[i].iov_base; | 202 | unsigned long uaddr = (unsigned long)iov[i].iov_base; |
203 | 203 | ||
204 | if (!iov[i].iov_len) | ||
205 | return -EINVAL; | ||
206 | |||
204 | if (uaddr & queue_dma_alignment(q)) { | 207 | if (uaddr & queue_dma_alignment(q)) { |
205 | unaligned = 1; | 208 | unaligned = 1; |
206 | break; | 209 | break; |
@@ -288,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | |||
288 | unsigned int len, gfp_t gfp_mask) | 291 | unsigned int len, gfp_t gfp_mask) |
289 | { | 292 | { |
290 | int reading = rq_data_dir(rq) == READ; | 293 | int reading = rq_data_dir(rq) == READ; |
294 | unsigned long addr = (unsigned long) kbuf; | ||
291 | int do_copy = 0; | 295 | int do_copy = 0; |
292 | struct bio *bio; | 296 | struct bio *bio; |
293 | int ret; | 297 | int ret; |
@@ -297,7 +301,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | |||
297 | if (!len || !kbuf) | 301 | if (!len || !kbuf) |
298 | return -EINVAL; | 302 | return -EINVAL; |
299 | 303 | ||
300 | do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); | 304 | do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); |
301 | if (do_copy) | 305 | if (do_copy) |
302 | bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); | 306 | bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); |
303 | else | 307 | else |
diff --git a/block/blk-merge.c b/block/blk-merge.c index eafc94f68d79..cfcc37cb222b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, | |||
21 | return 0; | 21 | return 0; |
22 | 22 | ||
23 | fbio = bio; | 23 | fbio = bio; |
24 | cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | 24 | cluster = blk_queue_cluster(q); |
25 | seg_size = 0; | 25 | seg_size = 0; |
26 | nr_phys_segs = 0; | 26 | nr_phys_segs = 0; |
27 | for_each_bio(bio) { | 27 | for_each_bio(bio) { |
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); | |||
87 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | 87 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, |
88 | struct bio *nxt) | 88 | struct bio *nxt) |
89 | { | 89 | { |
90 | if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) | 90 | if (!blk_queue_cluster(q)) |
91 | return 0; | 91 | return 0; |
92 | 92 | ||
93 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > | 93 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > |
@@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
123 | int nsegs, cluster; | 123 | int nsegs, cluster; |
124 | 124 | ||
125 | nsegs = 0; | 125 | nsegs = 0; |
126 | cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | 126 | cluster = blk_queue_cluster(q); |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * for each bio in rq | 129 | * for each bio in rq |
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, | |||
205 | { | 205 | { |
206 | int nr_phys_segs = bio_phys_segments(q, bio); | 206 | int nr_phys_segs = bio_phys_segments(q, bio); |
207 | 207 | ||
208 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { | 208 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) |
209 | req->cmd_flags |= REQ_NOMERGE; | 209 | goto no_merge; |
210 | if (req == q->last_merge) | 210 | |
211 | q->last_merge = NULL; | 211 | if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) |
212 | return 0; | 212 | goto no_merge; |
213 | } | ||
214 | 213 | ||
215 | /* | 214 | /* |
216 | * This will form the start of a new hw segment. Bump both | 215 | * This will form the start of a new hw segment. Bump both |
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, | |||
218 | */ | 217 | */ |
219 | req->nr_phys_segments += nr_phys_segs; | 218 | req->nr_phys_segments += nr_phys_segs; |
220 | return 1; | 219 | return 1; |
220 | |||
221 | no_merge: | ||
222 | req->cmd_flags |= REQ_NOMERGE; | ||
223 | if (req == q->last_merge) | ||
224 | q->last_merge = NULL; | ||
225 | return 0; | ||
221 | } | 226 | } |
222 | 227 | ||
223 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 228 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | |||
301 | if (total_phys_segments > queue_max_segments(q)) | 306 | if (total_phys_segments > queue_max_segments(q)) |
302 | return 0; | 307 | return 0; |
303 | 308 | ||
309 | if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) | ||
310 | return 0; | ||
311 | |||
304 | /* Merge is OK... */ | 312 | /* Merge is OK... */ |
305 | req->nr_phys_segments = total_phys_segments; | 313 | req->nr_phys_segments = total_phys_segments; |
306 | return 1; | 314 | return 1; |
@@ -343,11 +351,12 @@ static void blk_account_io_merge(struct request *req) | |||
343 | int cpu; | 351 | int cpu; |
344 | 352 | ||
345 | cpu = part_stat_lock(); | 353 | cpu = part_stat_lock(); |
346 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 354 | part = req->part; |
347 | 355 | ||
348 | part_round_stats(cpu, part); | 356 | part_round_stats(cpu, part); |
349 | part_dec_in_flight(part, rq_data_dir(req)); | 357 | part_dec_in_flight(part, rq_data_dir(req)); |
350 | 358 | ||
359 | hd_struct_put(part); | ||
351 | part_stat_unlock(); | 360 | part_stat_unlock(); |
352 | } | 361 | } |
353 | } | 362 | } |
@@ -384,9 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, | |||
384 | || next->special) | 393 | || next->special) |
385 | return 0; | 394 | return 0; |
386 | 395 | ||
387 | if (blk_integrity_rq(req) != blk_integrity_rq(next)) | ||
388 | return 0; | ||
389 | |||
390 | /* | 396 | /* |
391 | * If we are allowed to merge, then append bio list | 397 | * If we are allowed to merge, then append bio list |
392 | * from next to rq and release next. merge_requests_fn | 398 | * from next to rq and release next. merge_requests_fn |
@@ -459,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) | |||
459 | 465 | ||
460 | return 0; | 466 | return 0; |
461 | } | 467 | } |
468 | |||
469 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
470 | struct request *next) | ||
471 | { | ||
472 | return attempt_merge(q, rq, next); | ||
473 | } | ||
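The ll_new_hw_segment() rework above funnels both refusal reasons, segment-count overflow and the new integrity-merge check, through one no_merge label, so the REQ_NOMERGE and last_merge bookkeeping is written exactly once. Stripped of the block-layer specifics, the shape is (the boolean parameters are placeholders for the real checks, not kernel APIs):

    static int try_extend_request(struct request_queue *q, struct request *req,
                                  bool over_segment_limit, bool integrity_mismatch)
    {
            if (over_segment_limit)
                    goto no_merge;
            if (integrity_mismatch)
                    goto no_merge;

            /* success: the caller goes on to account the extra physical segments */
            return 1;

    no_merge:
            /* stop offering this request as a merge candidate */
            req->cmd_flags |= REQ_NOMERGE;
            if (req == q->last_merge)
                    q->last_merge = NULL;
            return 0;
    }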
diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..fa1eb0449a05 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); | |||
111 | void blk_set_default_limits(struct queue_limits *lim) | 111 | void blk_set_default_limits(struct queue_limits *lim) |
112 | { | 112 | { |
113 | lim->max_segments = BLK_MAX_SEGMENTS; | 113 | lim->max_segments = BLK_MAX_SEGMENTS; |
114 | lim->max_integrity_segments = 0; | ||
114 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 115 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; |
115 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; | 116 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; |
116 | lim->max_sectors = BLK_DEF_MAX_SECTORS; | 117 | lim->max_sectors = BLK_DEF_MAX_SECTORS; |
@@ -119,13 +120,13 @@ void blk_set_default_limits(struct queue_limits *lim) | |||
119 | lim->discard_granularity = 0; | 120 | lim->discard_granularity = 0; |
120 | lim->discard_alignment = 0; | 121 | lim->discard_alignment = 0; |
121 | lim->discard_misaligned = 0; | 122 | lim->discard_misaligned = 0; |
122 | lim->discard_zeroes_data = -1; | 123 | lim->discard_zeroes_data = 1; |
123 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; | 124 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; |
124 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); | 125 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); |
125 | lim->alignment_offset = 0; | 126 | lim->alignment_offset = 0; |
126 | lim->io_opt = 0; | 127 | lim->io_opt = 0; |
127 | lim->misaligned = 0; | 128 | lim->misaligned = 0; |
128 | lim->no_cluster = 0; | 129 | lim->cluster = 1; |
129 | } | 130 | } |
130 | EXPORT_SYMBOL(blk_set_default_limits); | 131 | EXPORT_SYMBOL(blk_set_default_limits); |
131 | 132 | ||
@@ -163,23 +164,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | |||
163 | blk_queue_congestion_threshold(q); | 164 | blk_queue_congestion_threshold(q); |
164 | q->nr_batching = BLK_BATCH_REQ; | 165 | q->nr_batching = BLK_BATCH_REQ; |
165 | 166 | ||
166 | q->unplug_thresh = 4; /* hmm */ | ||
167 | q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ | ||
168 | if (q->unplug_delay == 0) | ||
169 | q->unplug_delay = 1; | ||
170 | |||
171 | q->unplug_timer.function = blk_unplug_timeout; | ||
172 | q->unplug_timer.data = (unsigned long)q; | ||
173 | |||
174 | blk_set_default_limits(&q->limits); | 167 | blk_set_default_limits(&q->limits); |
175 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); | 168 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); |
176 | 169 | q->limits.discard_zeroes_data = 0; | |
177 | /* | ||
178 | * If the caller didn't supply a lock, fall back to our embedded | ||
179 | * per-queue locks | ||
180 | */ | ||
181 | if (!q->queue_lock) | ||
182 | q->queue_lock = &q->__queue_lock; | ||
183 | 170 | ||
184 | /* | 171 | /* |
185 | * by default assume old behaviour and bounce for any highmem page | 172 | * by default assume old behaviour and bounce for any highmem page |
@@ -213,7 +200,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) | |||
213 | */ | 200 | */ |
214 | if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) | 201 | if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) |
215 | dma = 1; | 202 | dma = 1; |
216 | q->limits.bounce_pfn = max_low_pfn; | 203 | q->limits.bounce_pfn = max(max_low_pfn, b_pfn); |
217 | #else | 204 | #else |
218 | if (b_pfn < blk_max_low_pfn) | 205 | if (b_pfn < blk_max_low_pfn) |
219 | dma = 1; | 206 | dma = 1; |
@@ -228,8 +215,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) | |||
228 | EXPORT_SYMBOL(blk_queue_bounce_limit); | 215 | EXPORT_SYMBOL(blk_queue_bounce_limit); |
229 | 216 | ||
230 | /** | 217 | /** |
231 | * blk_queue_max_hw_sectors - set max sectors for a request for this queue | 218 | * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request |
232 | * @q: the request queue for the device | 219 | * @limits: the queue limits |
233 | * @max_hw_sectors: max hardware sectors in the usual 512b unit | 220 | * @max_hw_sectors: max hardware sectors in the usual 512b unit |
234 | * | 221 | * |
235 | * Description: | 222 | * Description: |
@@ -243,7 +230,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); | |||
243 | * per-device basis in /sys/block/<device>/queue/max_sectors_kb. | 230 | * per-device basis in /sys/block/<device>/queue/max_sectors_kb. |
244 | * The soft limit can not exceed max_hw_sectors. | 231 | * The soft limit can not exceed max_hw_sectors. |
245 | **/ | 232 | **/ |
246 | void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) | 233 | void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) |
247 | { | 234 | { |
248 | if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { | 235 | if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { |
249 | max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); | 236 | max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); |
@@ -251,9 +238,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto | |||
251 | __func__, max_hw_sectors); | 238 | __func__, max_hw_sectors); |
252 | } | 239 | } |
253 | 240 | ||
254 | q->limits.max_hw_sectors = max_hw_sectors; | 241 | limits->max_hw_sectors = max_hw_sectors; |
255 | q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, | 242 | limits->max_sectors = min_t(unsigned int, max_hw_sectors, |
256 | BLK_DEF_MAX_SECTORS); | 243 | BLK_DEF_MAX_SECTORS); |
244 | } | ||
245 | EXPORT_SYMBOL(blk_limits_max_hw_sectors); | ||
246 | |||
247 | /** | ||
248 | * blk_queue_max_hw_sectors - set max sectors for a request for this queue | ||
249 | * @q: the request queue for the device | ||
250 | * @max_hw_sectors: max hardware sectors in the usual 512b unit | ||
251 | * | ||
252 | * Description: | ||
253 | * See description for blk_limits_max_hw_sectors(). | ||
254 | **/ | ||
255 | void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) | ||
256 | { | ||
257 | blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); | ||
257 | } | 258 | } |
258 | EXPORT_SYMBOL(blk_queue_max_hw_sectors); | 259 | EXPORT_SYMBOL(blk_queue_max_hw_sectors); |
259 | 260 | ||
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); | |||
343 | * hardware can operate on without reverting to read-modify-write | 344 | * hardware can operate on without reverting to read-modify-write |
344 | * operations. | 345 | * operations. |
345 | */ | 346 | */ |
346 | void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) | 347 | void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) |
347 | { | 348 | { |
348 | q->limits.physical_block_size = size; | 349 | q->limits.physical_block_size = size; |
349 | 350 | ||
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) | |||
455 | } | 456 | } |
456 | EXPORT_SYMBOL(blk_queue_io_opt); | 457 | EXPORT_SYMBOL(blk_queue_io_opt); |
457 | 458 | ||
458 | /* | ||
459 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
460 | */ | ||
461 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
462 | |||
463 | /** | 459 | /** |
464 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers | 460 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers |
465 | * @t: the stacking driver (top) | 461 | * @t: the stacking driver (top) |
@@ -468,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); | |||
468 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) | 464 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) |
469 | { | 465 | { |
470 | blk_stack_limits(&t->limits, &b->limits, 0); | 466 | blk_stack_limits(&t->limits, &b->limits, 0); |
471 | |||
472 | if (!t->queue_lock) | ||
473 | WARN_ON_ONCE(1); | ||
474 | else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { | ||
475 | unsigned long flags; | ||
476 | spin_lock_irqsave(t->queue_lock, flags); | ||
477 | queue_flag_clear(QUEUE_FLAG_CLUSTER, t); | ||
478 | spin_unlock_irqrestore(t->queue_lock, flags); | ||
479 | } | ||
480 | } | 467 | } |
481 | EXPORT_SYMBOL(blk_queue_stack_limits); | 468 | EXPORT_SYMBOL(blk_queue_stack_limits); |
482 | 469 | ||
@@ -514,6 +501,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
514 | b->seg_boundary_mask); | 501 | b->seg_boundary_mask); |
515 | 502 | ||
516 | t->max_segments = min_not_zero(t->max_segments, b->max_segments); | 503 | t->max_segments = min_not_zero(t->max_segments, b->max_segments); |
504 | t->max_integrity_segments = min_not_zero(t->max_integrity_segments, | ||
505 | b->max_integrity_segments); | ||
517 | 506 | ||
518 | t->max_segment_size = min_not_zero(t->max_segment_size, | 507 | t->max_segment_size = min_not_zero(t->max_segment_size, |
519 | b->max_segment_size); | 508 | b->max_segment_size); |
@@ -547,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
547 | t->io_min = max(t->io_min, b->io_min); | 536 | t->io_min = max(t->io_min, b->io_min); |
548 | t->io_opt = lcm(t->io_opt, b->io_opt); | 537 | t->io_opt = lcm(t->io_opt, b->io_opt); |
549 | 538 | ||
550 | t->no_cluster |= b->no_cluster; | 539 | t->cluster &= b->cluster; |
551 | t->discard_zeroes_data &= b->discard_zeroes_data; | 540 | t->discard_zeroes_data &= b->discard_zeroes_data; |
552 | 541 | ||
553 | /* Physical block size a multiple of the logical block size? */ | 542 | /* Physical block size a multiple of the logical block size? */ |
@@ -643,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, | |||
643 | sector_t offset) | 632 | sector_t offset) |
644 | { | 633 | { |
645 | struct request_queue *t = disk->queue; | 634 | struct request_queue *t = disk->queue; |
646 | struct request_queue *b = bdev_get_queue(bdev); | ||
647 | 635 | ||
648 | if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { | 636 | if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { |
649 | char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; | 637 | char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; |
@@ -654,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, | |||
654 | printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", | 642 | printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", |
655 | top, bottom); | 643 | top, bottom); |
656 | } | 644 | } |
657 | |||
658 | if (!t->queue_lock) | ||
659 | WARN_ON_ONCE(1); | ||
660 | else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { | ||
661 | unsigned long flags; | ||
662 | |||
663 | spin_lock_irqsave(t->queue_lock, flags); | ||
664 | if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) | ||
665 | queue_flag_clear(QUEUE_FLAG_CLUSTER, t); | ||
666 | spin_unlock_irqrestore(t->queue_lock, flags); | ||
667 | } | ||
668 | } | 645 | } |
669 | EXPORT_SYMBOL(disk_stack_limits); | 646 | EXPORT_SYMBOL(disk_stack_limits); |
670 | 647 | ||
@@ -794,6 +771,32 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) | |||
794 | } | 771 | } |
795 | EXPORT_SYMBOL(blk_queue_update_dma_alignment); | 772 | EXPORT_SYMBOL(blk_queue_update_dma_alignment); |
796 | 773 | ||
774 | /** | ||
775 | * blk_queue_flush - configure queue's cache flush capability | ||
776 | * @q: the request queue for the device | ||
777 | * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA | ||
778 | * | ||
779 | * Tell block layer cache flush capability of @q. If it supports | ||
780 | * flushing, REQ_FLUSH should be set. If it supports bypassing | ||
781 | * write cache for individual writes, REQ_FUA should be set. | ||
782 | */ | ||
783 | void blk_queue_flush(struct request_queue *q, unsigned int flush) | ||
784 | { | ||
785 | WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); | ||
786 | |||
787 | if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) | ||
788 | flush &= ~REQ_FUA; | ||
789 | |||
790 | q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); | ||
791 | } | ||
792 | EXPORT_SYMBOL_GPL(blk_queue_flush); | ||
793 | |||
794 | void blk_queue_flush_queueable(struct request_queue *q, bool queueable) | ||
795 | { | ||
796 | q->flush_not_queueable = !queueable; | ||
797 | } | ||
798 | EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); | ||
799 | |||
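From a driver's point of view the new blk_queue_flush() helper maps directly onto what the hardware advertises; a hedged sketch of the three common configurations (the function name and capability flags below are illustrative, not a real driver):

    static void mydrv_setup_cache(struct request_queue *q,
                                  bool volatile_write_cache, bool supports_fua)
    {
            if (volatile_write_cache && supports_fua)
                    blk_queue_flush(q, REQ_FLUSH | REQ_FUA);  /* cache + FUA writes */
            else if (volatile_write_cache)
                    blk_queue_flush(q, REQ_FLUSH);            /* cache, flush only */
            else
                    blk_queue_flush(q, 0);                    /* write-through: nothing to flush */
    }

Passing REQ_FUA without REQ_FLUSH is rejected (the WARN_ON_ONCE above strips the FUA bit), which matches the rule that FUA only makes sense on top of a flushable cache.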
797 | static int __init blk_settings_init(void) | 800 | static int __init blk_settings_init(void) |
798 | { | 801 | { |
799 | blk_max_low_pfn = max_low_pfn - 1; | 802 | blk_max_low_pfn = max_low_pfn - 1; |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0749b89c6885..d935bd859c87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
66 | 66 | ||
67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | 67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { |
68 | blk_set_queue_full(q, BLK_RW_SYNC); | 68 | blk_set_queue_full(q, BLK_RW_SYNC); |
69 | } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { | 69 | } else { |
70 | blk_clear_queue_full(q, BLK_RW_SYNC); | 70 | blk_clear_queue_full(q, BLK_RW_SYNC); |
71 | wake_up(&rl->wait[BLK_RW_SYNC]); | 71 | wake_up(&rl->wait[BLK_RW_SYNC]); |
72 | } | 72 | } |
73 | 73 | ||
74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | 74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { |
75 | blk_set_queue_full(q, BLK_RW_ASYNC); | 75 | blk_set_queue_full(q, BLK_RW_ASYNC); |
76 | } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { | 76 | } else { |
77 | blk_clear_queue_full(q, BLK_RW_ASYNC); | 77 | blk_clear_queue_full(q, BLK_RW_ASYNC); |
78 | wake_up(&rl->wait[BLK_RW_ASYNC]); | 78 | wake_up(&rl->wait[BLK_RW_ASYNC]); |
79 | } | 79 | } |
@@ -112,9 +112,14 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) | |||
112 | return queue_var_show(queue_max_segments(q), (page)); | 112 | return queue_var_show(queue_max_segments(q), (page)); |
113 | } | 113 | } |
114 | 114 | ||
115 | static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) | ||
116 | { | ||
117 | return queue_var_show(q->limits.max_integrity_segments, (page)); | ||
118 | } | ||
119 | |||
115 | static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) | 120 | static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) |
116 | { | 121 | { |
117 | if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) | 122 | if (blk_queue_cluster(q)) |
118 | return queue_var_show(queue_max_segment_size(q), (page)); | 123 | return queue_var_show(queue_max_segment_size(q), (page)); |
119 | 124 | ||
120 | return queue_var_show(PAGE_CACHE_SIZE, (page)); | 125 | return queue_var_show(PAGE_CACHE_SIZE, (page)); |
@@ -147,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag | |||
147 | 152 | ||
148 | static ssize_t queue_discard_max_show(struct request_queue *q, char *page) | 153 | static ssize_t queue_discard_max_show(struct request_queue *q, char *page) |
149 | { | 154 | { |
150 | return queue_var_show(q->limits.max_discard_sectors << 9, page); | 155 | return sprintf(page, "%llu\n", |
156 | (unsigned long long)q->limits.max_discard_sectors << 9); | ||
151 | } | 157 | } |
152 | 158 | ||
153 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) | 159 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) |
@@ -288,6 +294,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { | |||
288 | .show = queue_max_segments_show, | 294 | .show = queue_max_segments_show, |
289 | }; | 295 | }; |
290 | 296 | ||
297 | static struct queue_sysfs_entry queue_max_integrity_segments_entry = { | ||
298 | .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, | ||
299 | .show = queue_max_integrity_segments_show, | ||
300 | }; | ||
301 | |||
291 | static struct queue_sysfs_entry queue_max_segment_size_entry = { | 302 | static struct queue_sysfs_entry queue_max_segment_size_entry = { |
292 | .attr = {.name = "max_segment_size", .mode = S_IRUGO }, | 303 | .attr = {.name = "max_segment_size", .mode = S_IRUGO }, |
293 | .show = queue_max_segment_size_show, | 304 | .show = queue_max_segment_size_show, |
@@ -375,6 +386,7 @@ static struct attribute *default_attrs[] = { | |||
375 | &queue_max_hw_sectors_entry.attr, | 386 | &queue_max_hw_sectors_entry.attr, |
376 | &queue_max_sectors_entry.attr, | 387 | &queue_max_sectors_entry.attr, |
377 | &queue_max_segments_entry.attr, | 388 | &queue_max_segments_entry.attr, |
389 | &queue_max_integrity_segments_entry.attr, | ||
378 | &queue_max_segment_size_entry.attr, | 390 | &queue_max_segment_size_entry.attr, |
379 | &queue_iosched_entry.attr, | 391 | &queue_iosched_entry.attr, |
380 | &queue_hw_sector_size_entry.attr, | 392 | &queue_hw_sector_size_entry.attr, |
@@ -487,7 +499,6 @@ int blk_register_queue(struct gendisk *disk) | |||
487 | { | 499 | { |
488 | int ret; | 500 | int ret; |
489 | struct device *dev = disk_to_dev(disk); | 501 | struct device *dev = disk_to_dev(disk); |
490 | |||
491 | struct request_queue *q = disk->queue; | 502 | struct request_queue *q = disk->queue; |
492 | 503 | ||
493 | if (WARN_ON(!q)) | 504 | if (WARN_ON(!q)) |
@@ -498,8 +509,10 @@ int blk_register_queue(struct gendisk *disk) | |||
498 | return ret; | 509 | return ret; |
499 | 510 | ||
500 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); | 511 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); |
501 | if (ret < 0) | 512 | if (ret < 0) { |
513 | blk_trace_remove_sysfs(dev); | ||
502 | return ret; | 514 | return ret; |
515 | } | ||
503 | 516 | ||
504 | kobject_uevent(&q->kobj, KOBJ_ADD); | 517 | kobject_uevent(&q->kobj, KOBJ_ADD); |
505 | 518 | ||
@@ -510,7 +523,7 @@ int blk_register_queue(struct gendisk *disk) | |||
510 | if (ret) { | 523 | if (ret) { |
511 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 524 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
512 | kobject_del(&q->kobj); | 525 | kobject_del(&q->kobj); |
513 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 526 | blk_trace_remove_sysfs(dev); |
514 | kobject_put(&dev->kobj); | 527 | kobject_put(&dev->kobj); |
515 | return ret; | 528 | return ret; |
516 | } | 529 | } |
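Adding a read-only queue attribute, as done above for max_integrity_segments, follows the same three-step recipe every time: a _show() helper, a struct queue_sysfs_entry with mode S_IRUGO and no .store, and a slot in default_attrs[]. A condensed sketch for a hypothetical attribute (the name foo and the limits field are made up for illustration):

    static ssize_t queue_foo_show(struct request_queue *q, char *page)
    {
            return queue_var_show(q->limits.foo, page);     /* hypothetical field */
    }

    static struct queue_sysfs_entry queue_foo_entry = {
            .attr = { .name = "foo", .mode = S_IRUGO },
            .show = queue_foo_show,
    };

    /* ...and &queue_foo_entry.attr goes into default_attrs[], making it
     * visible as /sys/block/<dev>/queue/foo */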
diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..3689f833afdc --- /dev/null +++ b/block/blk-throttle.c | |||
@@ -0,0 +1,1312 @@ | |||
1 | /* | ||
2 | * Interface for controlling IO bandwidth on a request queue | ||
3 | * | ||
4 | * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/blktrace_api.h> | ||
12 | #include "blk-cgroup.h" | ||
13 | |||
14 | /* Max dispatch from a group in 1 round */ | ||
15 | static int throtl_grp_quantum = 8; | ||
16 | |||
17 | /* Total max dispatch from all groups in one round */ | ||
18 | static int throtl_quantum = 32; | ||
19 | |||
20 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | ||
21 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | ||
22 | |||
23 | /* A workqueue to queue throttle related work */ | ||
24 | static struct workqueue_struct *kthrotld_workqueue; | ||
25 | static void throtl_schedule_delayed_work(struct throtl_data *td, | ||
26 | unsigned long delay); | ||
27 | |||
28 | struct throtl_rb_root { | ||
29 | struct rb_root rb; | ||
30 | struct rb_node *left; | ||
31 | unsigned int count; | ||
32 | unsigned long min_disptime; | ||
33 | }; | ||
34 | |||
35 | #define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ | ||
36 | .count = 0, .min_disptime = 0} | ||
37 | |||
38 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | ||
39 | |||
40 | struct throtl_grp { | ||
41 | /* List of throtl groups on the request queue */ | ||
42 | struct hlist_node tg_node; | ||
43 | |||
44 | /* active throtl group service_tree member */ | ||
45 | struct rb_node rb_node; | ||
46 | |||
47 | /* | ||
48 | * Dispatch time in jiffies. This is the estimated time when group | ||
49 | * will unthrottle and is ready to dispatch more bio. It is used as | ||
50 | * key to sort active groups in service tree. | ||
51 | */ | ||
52 | unsigned long disptime; | ||
53 | |||
54 | struct blkio_group blkg; | ||
55 | atomic_t ref; | ||
56 | unsigned int flags; | ||
57 | |||
58 | /* Two lists for READ and WRITE */ | ||
59 | struct bio_list bio_lists[2]; | ||
60 | |||
61 | /* Number of queued bios on READ and WRITE lists */ | ||
62 | unsigned int nr_queued[2]; | ||
63 | |||
64 | /* bytes per second rate limits */ | ||
65 | uint64_t bps[2]; | ||
66 | |||
67 | /* IOPS limits */ | ||
68 | unsigned int iops[2]; | ||
69 | |||
70 | /* Number of bytes dispatched in current slice */ | ||
71 | uint64_t bytes_disp[2]; | ||
72 | /* Number of bios dispatched in current slice */ | ||
73 | unsigned int io_disp[2]; | ||
74 | |||
75 | /* When did we start a new slice */ | ||
76 | unsigned long slice_start[2]; | ||
77 | unsigned long slice_end[2]; | ||
78 | |||
79 | /* Some throttle limits got updated for the group */ | ||
80 | int limits_changed; | ||
81 | |||
82 | struct rcu_head rcu_head; | ||
83 | }; | ||
84 | |||
85 | struct throtl_data | ||
86 | { | ||
87 | /* List of throtl groups */ | ||
88 | struct hlist_head tg_list; | ||
89 | |||
90 | /* service tree for active throtl groups */ | ||
91 | struct throtl_rb_root tg_service_tree; | ||
92 | |||
93 | struct throtl_grp *root_tg; | ||
94 | struct request_queue *queue; | ||
95 | |||
96 | /* Total Number of queued bios on READ and WRITE lists */ | ||
97 | unsigned int nr_queued[2]; | ||
98 | |||
99 | /* | ||
100 | * number of total undestroyed groups | ||
101 | */ | ||
102 | unsigned int nr_undestroyed_grps; | ||
103 | |||
104 | /* Work for dispatching throttled bios */ | ||
105 | struct delayed_work throtl_work; | ||
106 | |||
107 | int limits_changed; | ||
108 | }; | ||
109 | |||
110 | enum tg_state_flags { | ||
111 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | ||
112 | }; | ||
113 | |||
114 | #define THROTL_TG_FNS(name) \ | ||
115 | static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ | ||
116 | { \ | ||
117 | (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ | ||
118 | } \ | ||
119 | static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ | ||
120 | { \ | ||
121 | (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ | ||
122 | } \ | ||
123 | static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | ||
124 | { \ | ||
125 | return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ | ||
126 | } | ||
127 | |||
128 | THROTL_TG_FNS(on_rr); | ||
129 | |||
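For readability, the single invocation THROTL_TG_FNS(on_rr) above stamps out three tiny helpers that set, clear and test the on_rr bit in tg->flags; the expansion is roughly:

    static inline void throtl_mark_tg_on_rr(struct throtl_grp *tg)
    {
            tg->flags |= (1 << THROTL_TG_FLAG_on_rr);
    }

    static inline void throtl_clear_tg_on_rr(struct throtl_grp *tg)
    {
            tg->flags &= ~(1 << THROTL_TG_FLAG_on_rr);
    }

    static inline int throtl_tg_on_rr(const struct throtl_grp *tg)
    {
            return (tg->flags & (1 << THROTL_TG_FLAG_on_rr)) != 0;
    }

These are what throtl_enqueue_tg()/throtl_dequeue_tg() use later in the file to keep a group from being put on the service tree twice.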
130 | #define throtl_log_tg(td, tg, fmt, args...) \ | ||
131 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ | ||
132 | blkg_path(&(tg)->blkg), ##args); \ | ||
133 | |||
134 | #define throtl_log(td, fmt, args...) \ | ||
135 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | ||
136 | |||
137 | static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | ||
138 | { | ||
139 | if (blkg) | ||
140 | return container_of(blkg, struct throtl_grp, blkg); | ||
141 | |||
142 | return NULL; | ||
143 | } | ||
144 | |||
145 | static inline int total_nr_queued(struct throtl_data *td) | ||
146 | { | ||
147 | return (td->nr_queued[0] + td->nr_queued[1]); | ||
148 | } | ||
149 | |||
150 | static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | ||
151 | { | ||
152 | atomic_inc(&tg->ref); | ||
153 | return tg; | ||
154 | } | ||
155 | |||
156 | static void throtl_free_tg(struct rcu_head *head) | ||
157 | { | ||
158 | struct throtl_grp *tg; | ||
159 | |||
160 | tg = container_of(head, struct throtl_grp, rcu_head); | ||
161 | free_percpu(tg->blkg.stats_cpu); | ||
162 | kfree(tg); | ||
163 | } | ||
164 | |||
165 | static void throtl_put_tg(struct throtl_grp *tg) | ||
166 | { | ||
167 | BUG_ON(atomic_read(&tg->ref) <= 0); | ||
168 | if (!atomic_dec_and_test(&tg->ref)) | ||
169 | return; | ||
170 | |||
171 | /* | ||
172 | * A group is freed in rcu manner. But having an rcu lock does not | ||
173 | * mean that one can access all the fields of blkg and assume these | ||
174 | * are valid. For example, don't try to follow throtl_data and | ||
175 | * request queue links. | ||
176 | * | ||
177 | * Having a reference to blkg under an rcu allows access to only | ||
178 | * values local to groups like group stats and group rate limits | ||
179 | */ | ||
180 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
181 | } | ||
182 | |||
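The refcount and call_rcu() pairing above follows the usual pattern: look a group up under rcu_read_lock(), take a reference if it must stay around past the read-side section, and let the final throtl_put_tg() defer the actual kfree(). An illustrative use of the pair (not a path this file takes verbatim):

    static void use_group_briefly(struct throtl_data *td, struct blkio_cgroup *blkcg)
    {
            struct throtl_grp *tg;

            rcu_read_lock();
            tg = throtl_find_tg(td, blkcg);         /* may return NULL */
            if (tg)
                    throtl_ref_get_tg(tg);          /* pin it beyond the RCU section */
            rcu_read_unlock();

            if (!tg)
                    return;
            /* group-local fields (bps, iops, stats) stay valid while we hold the ref */
            throtl_put_tg(tg);                      /* last put frees via call_rcu() */
    }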
183 | static void throtl_init_group(struct throtl_grp *tg) | ||
184 | { | ||
185 | INIT_HLIST_NODE(&tg->tg_node); | ||
186 | RB_CLEAR_NODE(&tg->rb_node); | ||
187 | bio_list_init(&tg->bio_lists[0]); | ||
188 | bio_list_init(&tg->bio_lists[1]); | ||
189 | tg->limits_changed = false; | ||
190 | |||
191 | /* Practically unlimited BW */ | ||
192 | tg->bps[0] = tg->bps[1] = -1; | ||
193 | tg->iops[0] = tg->iops[1] = -1; | ||
194 | |||
195 | /* | ||
196 | * Take the initial reference that will be released on destroy | ||
197 | * This can be thought of a joint reference by cgroup and | ||
198 | * request queue which will be dropped by either request queue | ||
199 | * exit or cgroup deletion path depending on who is exiting first. | ||
200 | */ | ||
201 | atomic_set(&tg->ref, 1); | ||
202 | } | ||
203 | |||
204 | /* Should be called with rcu read lock held (needed for blkcg) */ | ||
205 | static void | ||
206 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
207 | { | ||
208 | hlist_add_head(&tg->tg_node, &td->tg_list); | ||
209 | td->nr_undestroyed_grps++; | ||
210 | } | ||
211 | |||
212 | static void | ||
213 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
214 | { | ||
215 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
216 | unsigned int major, minor; | ||
217 | |||
218 | if (!tg || tg->blkg.dev) | ||
219 | return; | ||
220 | |||
221 | /* | ||
222 | * Fill in device details for a group which might not have been | ||
223 | * filled at group creation time as queue was being instantiated | ||
224 | * and driver had not attached a device yet | ||
225 | */ | ||
226 | if (bdi->dev && dev_name(bdi->dev)) { | ||
227 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
228 | tg->blkg.dev = MKDEV(major, minor); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Should be called without queue lock held. Here queue lock will be | ||
234 | * taken rarely. It will be taken only once during the lifetime of a group | ||
235 | * if need be | ||
236 | */ | ||
237 | static void | ||
238 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
239 | { | ||
240 | if (!tg || tg->blkg.dev) | ||
241 | return; | ||
242 | |||
243 | spin_lock_irq(td->queue->queue_lock); | ||
244 | __throtl_tg_fill_dev_details(td, tg); | ||
245 | spin_unlock_irq(td->queue->queue_lock); | ||
246 | } | ||
247 | |||
248 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
249 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
250 | { | ||
251 | __throtl_tg_fill_dev_details(td, tg); | ||
252 | |||
253 | /* Add group onto cgroup list */ | ||
254 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
255 | tg->blkg.dev, BLKIO_POLICY_THROTL); | ||
256 | |||
257 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | ||
258 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | ||
259 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | ||
260 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
261 | |||
262 | throtl_add_group_to_td_list(td, tg); | ||
263 | } | ||
264 | |||
265 | /* Should be called without queue lock and outside of rcu period */ | ||
266 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
267 | { | ||
268 | struct throtl_grp *tg = NULL; | ||
269 | int ret; | ||
270 | |||
271 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
272 | if (!tg) | ||
273 | return NULL; | ||
274 | |||
275 | ret = blkio_alloc_blkg_stats(&tg->blkg); | ||
276 | |||
277 | if (ret) { | ||
278 | kfree(tg); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | throtl_init_group(tg); | ||
283 | return tg; | ||
284 | } | ||
285 | |||
286 | static struct | ||
287 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | ||
288 | { | ||
289 | struct throtl_grp *tg = NULL; | ||
290 | void *key = td; | ||
291 | |||
292 | /* | ||
293 | * This is the common case when there are no blkio cgroups. | ||
294 | * Avoid lookup in this case | ||
295 | */ | ||
296 | if (blkcg == &blkio_root_cgroup) | ||
297 | tg = td->root_tg; | ||
298 | else | ||
299 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
300 | |||
301 | __throtl_tg_fill_dev_details(td, tg); | ||
302 | return tg; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * This function returns with the queue lock unlocked in case of error, | ||
307 | * e.g. when the request queue is no longer alive | ||
308 | */ | ||
309 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | ||
310 | { | ||
311 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
312 | struct blkio_cgroup *blkcg; | ||
313 | struct request_queue *q = td->queue; | ||
314 | |||
315 | rcu_read_lock(); | ||
316 | blkcg = task_blkio_cgroup(current); | ||
317 | tg = throtl_find_tg(td, blkcg); | ||
318 | if (tg) { | ||
319 | rcu_read_unlock(); | ||
320 | return tg; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Need to allocate a group. Allocation of group also needs allocation | ||
325 | * of per cpu stats, which in turn takes a mutex and can block. Hence | ||
326 | * we need to drop rcu lock and queue_lock before we call alloc | ||
327 | * | ||
328 | * Take the request queue reference to make sure queue does not | ||
329 | * go away once we return from allocation. | ||
330 | */ | ||
331 | blk_get_queue(q); | ||
332 | rcu_read_unlock(); | ||
333 | spin_unlock_irq(q->queue_lock); | ||
334 | |||
335 | tg = throtl_alloc_tg(td); | ||
336 | /* | ||
337 | * We might have slept in group allocation. Make sure queue is not | ||
338 | * dead | ||
339 | */ | ||
340 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
341 | blk_put_queue(q); | ||
342 | if (tg) | ||
343 | kfree(tg); | ||
344 | |||
345 | return ERR_PTR(-ENODEV); | ||
346 | } | ||
347 | blk_put_queue(q); | ||
348 | |||
349 | /* Group allocated and queue is still alive. take the lock */ | ||
350 | spin_lock_irq(q->queue_lock); | ||
351 | |||
352 | /* | ||
353 | * Initialize the new group. After sleeping, read the blkcg again. | ||
354 | */ | ||
355 | rcu_read_lock(); | ||
356 | blkcg = task_blkio_cgroup(current); | ||
357 | |||
358 | /* | ||
359 | * If some other thread already allocated the group while we were | ||
360 | * not holding queue lock, free up the group | ||
361 | */ | ||
362 | __tg = throtl_find_tg(td, blkcg); | ||
363 | |||
364 | if (__tg) { | ||
365 | kfree(tg); | ||
366 | rcu_read_unlock(); | ||
367 | return __tg; | ||
368 | } | ||
369 | |||
370 | /* Group allocation failed. Account the IO to root group */ | ||
371 | if (!tg) { | ||
372 | tg = td->root_tg; | ||
373 | return tg; | ||
374 | } | ||
375 | |||
376 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
377 | rcu_read_unlock(); | ||
378 | return tg; | ||
379 | } | ||
380 | |||
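throtl_get_tg() is the classic "allocate outside the lock, re-check under the lock" dance: the queue lock and the RCU read lock are dropped because allocating the per-cpu stats can sleep, and after re-taking the lock the lookup is repeated so that a racing allocation wins and ours is discarded. The same race handling in a self-contained toy form (a lazily allocated singleton; only generic locking/allocation primitives are used, nothing throttle-specific):

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(obj_lock);
    static struct my_obj {
            int payload;
    } *the_obj;

    static struct my_obj *get_or_create_obj(void)
    {
            struct my_obj *obj, *winner;

            spin_lock_irq(&obj_lock);
            obj = the_obj;                  /* fast path: already created */
            spin_unlock_irq(&obj_lock);
            if (obj)
                    return obj;

            obj = kzalloc(sizeof(*obj), GFP_KERNEL);        /* may sleep */

            spin_lock_irq(&obj_lock);
            winner = the_obj;               /* did somebody beat us to it? */
            if (!winner && obj)
                    the_obj = winner = obj; /* we won: publish our allocation */
            spin_unlock_irq(&obj_lock);

            if (winner != obj)
                    kfree(obj);             /* we lost the race; drop our copy */
            return winner;                  /* NULL only if allocation failed */
    }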
381 | static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) | ||
382 | { | ||
383 | /* Service tree is empty */ | ||
384 | if (!root->count) | ||
385 | return NULL; | ||
386 | |||
387 | if (!root->left) | ||
388 | root->left = rb_first(&root->rb); | ||
389 | |||
390 | if (root->left) | ||
391 | return rb_entry_tg(root->left); | ||
392 | |||
393 | return NULL; | ||
394 | } | ||
395 | |||
396 | static void rb_erase_init(struct rb_node *n, struct rb_root *root) | ||
397 | { | ||
398 | rb_erase(n, root); | ||
399 | RB_CLEAR_NODE(n); | ||
400 | } | ||
401 | |||
402 | static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) | ||
403 | { | ||
404 | if (root->left == n) | ||
405 | root->left = NULL; | ||
406 | rb_erase_init(n, &root->rb); | ||
407 | --root->count; | ||
408 | } | ||
409 | |||
410 | static void update_min_dispatch_time(struct throtl_rb_root *st) | ||
411 | { | ||
412 | struct throtl_grp *tg; | ||
413 | |||
414 | tg = throtl_rb_first(st); | ||
415 | if (!tg) | ||
416 | return; | ||
417 | |||
418 | st->min_disptime = tg->disptime; | ||
419 | } | ||
420 | |||
421 | static void | ||
422 | tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | ||
423 | { | ||
424 | struct rb_node **node = &st->rb.rb_node; | ||
425 | struct rb_node *parent = NULL; | ||
426 | struct throtl_grp *__tg; | ||
427 | unsigned long key = tg->disptime; | ||
428 | int left = 1; | ||
429 | |||
430 | while (*node != NULL) { | ||
431 | parent = *node; | ||
432 | __tg = rb_entry_tg(parent); | ||
433 | |||
434 | if (time_before(key, __tg->disptime)) | ||
435 | node = &parent->rb_left; | ||
436 | else { | ||
437 | node = &parent->rb_right; | ||
438 | left = 0; | ||
439 | } | ||
440 | } | ||
441 | |||
442 | if (left) | ||
443 | st->left = &tg->rb_node; | ||
444 | |||
445 | rb_link_node(&tg->rb_node, parent, node); | ||
446 | rb_insert_color(&tg->rb_node, &st->rb); | ||
447 | } | ||
448 | |||
449 | static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
450 | { | ||
451 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
452 | |||
453 | tg_service_tree_add(st, tg); | ||
454 | throtl_mark_tg_on_rr(tg); | ||
455 | st->count++; | ||
456 | } | ||
457 | |||
458 | static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
459 | { | ||
460 | if (!throtl_tg_on_rr(tg)) | ||
461 | __throtl_enqueue_tg(td, tg); | ||
462 | } | ||
463 | |||
464 | static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
465 | { | ||
466 | throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); | ||
467 | throtl_clear_tg_on_rr(tg); | ||
468 | } | ||
469 | |||
470 | static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
471 | { | ||
472 | if (throtl_tg_on_rr(tg)) | ||
473 | __throtl_dequeue_tg(td, tg); | ||
474 | } | ||
475 | |||
476 | static void throtl_schedule_next_dispatch(struct throtl_data *td) | ||
477 | { | ||
478 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
479 | |||
480 | /* | ||
481 | * If there are more bios pending, schedule more work. | ||
482 | */ | ||
483 | if (!total_nr_queued(td)) | ||
484 | return; | ||
485 | |||
486 | BUG_ON(!st->count); | ||
487 | |||
488 | update_min_dispatch_time(st); | ||
489 | |||
490 | if (time_before_eq(st->min_disptime, jiffies)) | ||
491 | throtl_schedule_delayed_work(td, 0); | ||
492 | else | ||
493 | throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); | ||
494 | } | ||
495 | |||
496 | static inline void | ||
497 | throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
498 | { | ||
499 | tg->bytes_disp[rw] = 0; | ||
500 | tg->io_disp[rw] = 0; | ||
501 | tg->slice_start[rw] = jiffies; | ||
502 | tg->slice_end[rw] = jiffies + throtl_slice; | ||
503 | throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", | ||
504 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | ||
505 | tg->slice_end[rw], jiffies); | ||
506 | } | ||
507 | |||
508 | static inline void throtl_set_slice_end(struct throtl_data *td, | ||
509 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | ||
510 | { | ||
511 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | ||
512 | } | ||
513 | |||
514 | static inline void throtl_extend_slice(struct throtl_data *td, | ||
515 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | ||
516 | { | ||
517 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | ||
518 | throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", | ||
519 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | ||
520 | tg->slice_end[rw], jiffies); | ||
521 | } | ||
522 | |||
523 | /* Determine if previously allocated or extended slice is complete or not */ | ||
524 | static bool | ||
525 | throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
526 | { | ||
527 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) | ||
528 | return 0; | ||
529 | |||
530 | return 1; | ||
531 | } | ||
532 | |||
533 | /* Trim the used slices and adjust slice start accordingly */ | ||
534 | static inline void | ||
535 | throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
536 | { | ||
537 | unsigned long nr_slices, time_elapsed, io_trim; | ||
538 | u64 bytes_trim, tmp; | ||
539 | |||
540 | BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); | ||
541 | |||
542 | /* | ||
543 | * If bps is unlimited (-1), then the time slice doesn't get | ||
544 | * renewed. Don't try to trim the slice if it has been used up. A new | ||
545 | * slice will start when appropriate. | ||
546 | */ | ||
547 | if (throtl_slice_used(td, tg, rw)) | ||
548 | return; | ||
549 | |||
550 | /* | ||
551 | * A bio has been dispatched. Also adjust slice_end. It might happen | ||
552 | * that initially cgroup limit was very low resulting in high | ||
553 | * slice_end, but later limit was bumped up and bio was dispatched | ||
554 | * sooner, then we need to reduce slice_end. A high bogus slice_end | ||
555 | * is bad because it does not allow new slice to start. | ||
556 | */ | ||
557 | |||
558 | throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); | ||
559 | |||
560 | time_elapsed = jiffies - tg->slice_start[rw]; | ||
561 | |||
562 | nr_slices = time_elapsed / throtl_slice; | ||
563 | |||
564 | if (!nr_slices) | ||
565 | return; | ||
566 | tmp = tg->bps[rw] * throtl_slice * nr_slices; | ||
567 | do_div(tmp, HZ); | ||
568 | bytes_trim = tmp; | ||
569 | |||
570 | io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; | ||
571 | |||
572 | if (!bytes_trim && !io_trim) | ||
573 | return; | ||
574 | |||
575 | if (tg->bytes_disp[rw] >= bytes_trim) | ||
576 | tg->bytes_disp[rw] -= bytes_trim; | ||
577 | else | ||
578 | tg->bytes_disp[rw] = 0; | ||
579 | |||
580 | if (tg->io_disp[rw] >= io_trim) | ||
581 | tg->io_disp[rw] -= io_trim; | ||
582 | else | ||
583 | tg->io_disp[rw] = 0; | ||
584 | |||
585 | tg->slice_start[rw] += nr_slices * throtl_slice; | ||
586 | |||
587 | throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" | ||
588 | " start=%lu end=%lu jiffies=%lu", | ||
589 | rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | ||
590 | tg->slice_start[rw], tg->slice_end[rw], jiffies); | ||
591 | } | ||
592 | |||
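To make the trim arithmetic concrete (round numbers chosen purely for illustration): with throtl_slice = HZ/10 (100 ms), bps[rw] = 10,485,760 bytes/s (10 MB/s), iops[rw] = 100 and 300 ms elapsed since slice_start:

    nr_slices  = 300 ms / 100 ms                 = 3
    bytes_trim = 10,485,760 * (HZ/10) * 3 / HZ   = 3,145,728 bytes
    io_trim    = 100 * (HZ/10) * 3 / HZ          = 30 ios
    slice_start advances by 3 * 100 ms           = 300 ms

so the group keeps only the credit or debt accumulated inside the current 100 ms window instead of banking idle time indefinitely.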
593 | static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | ||
594 | struct bio *bio, unsigned long *wait) | ||
595 | { | ||
596 | bool rw = bio_data_dir(bio); | ||
597 | unsigned int io_allowed; | ||
598 | unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | ||
599 | u64 tmp; | ||
600 | |||
601 | jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | ||
602 | |||
603 | /* Slice has just started. Consider one slice interval */ | ||
604 | if (!jiffy_elapsed) | ||
605 | jiffy_elapsed_rnd = throtl_slice; | ||
606 | |||
607 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | ||
608 | |||
609 | /* | ||
610 | * jiffy_elapsed_rnd should not be a big value: the minimum iops is 1, | ||
611 | * so at most the elapsed jiffies are equivalent to 1 second; we will | ||
612 | * allow dispatch after 1 second and by then the slice should have | ||
613 | * been trimmed. | ||
614 | */ | ||
615 | |||
616 | tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; | ||
617 | do_div(tmp, HZ); | ||
618 | |||
619 | if (tmp > UINT_MAX) | ||
620 | io_allowed = UINT_MAX; | ||
621 | else | ||
622 | io_allowed = tmp; | ||
623 | |||
624 | if (tg->io_disp[rw] + 1 <= io_allowed) { | ||
625 | if (wait) | ||
626 | *wait = 0; | ||
627 | return 1; | ||
628 | } | ||
629 | |||
630 | /* Calc approx time to dispatch */ | ||
631 | jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; | ||
632 | |||
633 | if (jiffy_wait > jiffy_elapsed) | ||
634 | jiffy_wait = jiffy_wait - jiffy_elapsed; | ||
635 | else | ||
636 | jiffy_wait = 1; | ||
637 | |||
638 | if (wait) | ||
639 | *wait = jiffy_wait; | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | ||
644 | struct bio *bio, unsigned long *wait) | ||
645 | { | ||
646 | bool rw = bio_data_dir(bio); | ||
647 | u64 bytes_allowed, extra_bytes, tmp; | ||
648 | unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | ||
649 | |||
650 | jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | ||
651 | |||
652 | /* Slice has just started. Consider one slice interval */ | ||
653 | if (!jiffy_elapsed) | ||
654 | jiffy_elapsed_rnd = throtl_slice; | ||
655 | |||
656 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | ||
657 | |||
658 | tmp = tg->bps[rw] * jiffy_elapsed_rnd; | ||
659 | do_div(tmp, HZ); | ||
660 | bytes_allowed = tmp; | ||
661 | |||
662 | if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { | ||
663 | if (wait) | ||
664 | *wait = 0; | ||
665 | return 1; | ||
666 | } | ||
667 | |||
668 | /* Calc approx time to dispatch */ | ||
669 | extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; | ||
670 | jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); | ||
671 | |||
672 | if (!jiffy_wait) | ||
673 | jiffy_wait = 1; | ||
674 | |||
675 | /* | ||
676 | * This wait time is without taking into consideration the rounding | ||
677 | * up we did. Add that time also. | ||
678 | */ | ||
679 | jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); | ||
680 | if (wait) | ||
681 | *wait = jiffy_wait; | ||
682 | return 0; | ||
683 | } | ||
684 | |||
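A worked example of the wait computation (illustrative numbers): bps[rw] = 1,048,576 bytes/s (1 MB/s), the slice has just started (jiffy_elapsed = 0, so jiffy_elapsed_rnd = throtl_slice = 100 ms), bytes_disp = 0, and a 1 MB bio arrives:

    bytes_allowed = 1,048,576 * (HZ/10) / HZ             = ~104,857 bytes
    0 + 1,048,576 > 104,857                              -> cannot dispatch yet
    extra_bytes   = 1,048,576 - 104,857                  = 943,719 bytes
    jiffy_wait    = 943,719 * HZ / 1,048,576             = ~0.9 s
                  + (jiffy_elapsed_rnd - jiffy_elapsed)  = ~1.0 s total

i.e. a 1 MB bio against a 1 MB/s limit is told to wait roughly one second, and the caller (tg_may_dispatch(), below) extends the slice to cover that wait. The iops check above works the same way, just counting bios instead of bytes.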
685 | static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { | ||
686 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) | ||
687 | return 1; | ||
688 | return 0; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * Returns whether one can dispatch a bio or not. Also returns approx number | ||
693 | * of jiffies to wait before this bio is within the IO rate and can be dispatched | ||
694 | */ | ||
695 | static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | ||
696 | struct bio *bio, unsigned long *wait) | ||
697 | { | ||
698 | bool rw = bio_data_dir(bio); | ||
699 | unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; | ||
700 | |||
701 | /* | ||
702 | * Currently the whole state machine of the group depends on the first | ||
703 | * bio queued in the group bio list. So one should not be calling | ||
704 | * this function with a different bio if there are other bios | ||
705 | * queued. | ||
706 | */ | ||
707 | BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); | ||
708 | |||
709 | /* If tg->bps = -1, then BW is unlimited */ | ||
710 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | ||
711 | if (wait) | ||
712 | *wait = 0; | ||
713 | return 1; | ||
714 | } | ||
715 | |||
716 | /* | ||
717 | * If previous slice expired, start a new one otherwise renew/extend | ||
718 | * existing slice to make sure it is at least throtl_slice interval | ||
719 | * long since now. | ||
720 | */ | ||
721 | if (throtl_slice_used(td, tg, rw)) | ||
722 | throtl_start_new_slice(td, tg, rw); | ||
723 | else { | ||
724 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) | ||
725 | throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); | ||
726 | } | ||
727 | |||
728 | if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) | ||
729 | && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { | ||
730 | if (wait) | ||
731 | *wait = 0; | ||
732 | return 1; | ||
733 | } | ||
734 | |||
735 | max_wait = max(bps_wait, iops_wait); | ||
736 | |||
737 | if (wait) | ||
738 | *wait = max_wait; | ||
739 | |||
740 | if (time_before(tg->slice_end[rw], jiffies + max_wait)) | ||
741 | throtl_extend_slice(td, tg, rw, jiffies + max_wait); | ||
742 | |||
743 | return 0; | ||
744 | } | ||
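Continuing with assumed numbers: if the bps check reports a 312-jiffy wait and the iops check reports 40 jiffies, the bio must wait max(312, 40) = 312 jiffies, and the slice end is extended to at least jiffies + 312 so the slice is still live when the dispatch timer eventually fires.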
745 | |||
746 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | ||
747 | { | ||
748 | bool rw = bio_data_dir(bio); | ||
749 | bool sync = bio->bi_rw & REQ_SYNC; | ||
750 | |||
751 | /* Charge the bio to the group */ | ||
752 | tg->bytes_disp[rw] += bio->bi_size; | ||
753 | tg->io_disp[rw]++; | ||
754 | |||
755 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | ||
756 | } | ||
757 | |||
758 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | ||
759 | struct bio *bio) | ||
760 | { | ||
761 | bool rw = bio_data_dir(bio); | ||
762 | |||
763 | bio_list_add(&tg->bio_lists[rw], bio); | ||
764 | /* Take a bio reference on tg */ | ||
765 | throtl_ref_get_tg(tg); | ||
766 | tg->nr_queued[rw]++; | ||
767 | td->nr_queued[rw]++; | ||
768 | throtl_enqueue_tg(td, tg); | ||
769 | } | ||
770 | |||
771 | static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) | ||
772 | { | ||
773 | unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; | ||
774 | struct bio *bio; | ||
775 | |||
776 | if ((bio = bio_list_peek(&tg->bio_lists[READ]))) | ||
777 | tg_may_dispatch(td, tg, bio, &read_wait); | ||
778 | |||
779 | if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | ||
780 | tg_may_dispatch(td, tg, bio, &write_wait); | ||
781 | |||
782 | min_wait = min(read_wait, write_wait); | ||
783 | disptime = jiffies + min_wait; | ||
784 | |||
785 | /* Update dispatch time */ | ||
786 | throtl_dequeue_tg(td, tg); | ||
787 | tg->disptime = disptime; | ||
788 | throtl_enqueue_tg(td, tg); | ||
789 | } | ||
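As an illustration with assumed waits: if the bio at the head of the read list still needs 120 jiffies and the head of the write list needs 30, the group's disptime becomes jiffies + 30 and the group is re-queued on the service tree keyed by that time, so the soonest-eligible group sorts to the front.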
790 | |||
791 | static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | ||
792 | bool rw, struct bio_list *bl) | ||
793 | { | ||
794 | struct bio *bio; | ||
795 | |||
796 | bio = bio_list_pop(&tg->bio_lists[rw]); | ||
797 | tg->nr_queued[rw]--; | ||
798 | /* Drop bio reference on tg */ | ||
799 | throtl_put_tg(tg); | ||
800 | |||
801 | BUG_ON(td->nr_queued[rw] <= 0); | ||
802 | td->nr_queued[rw]--; | ||
803 | |||
804 | throtl_charge_bio(tg, bio); | ||
805 | bio_list_add(bl, bio); | ||
806 | bio->bi_rw |= REQ_THROTTLED; | ||
807 | |||
808 | throtl_trim_slice(td, tg, rw); | ||
809 | } | ||
810 | |||
811 | static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | ||
812 | struct bio_list *bl) | ||
813 | { | ||
814 | unsigned int nr_reads = 0, nr_writes = 0; | ||
815 | unsigned int max_nr_reads = throtl_grp_quantum*3/4; | ||
816 | unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; | ||
817 | struct bio *bio; | ||
818 | |||
819 | /* Try to dispatch 75% READS and 25% WRITES */ | ||
820 | |||
821 | while ((bio = bio_list_peek(&tg->bio_lists[READ])) | ||
822 | && tg_may_dispatch(td, tg, bio, NULL)) { | ||
823 | |||
824 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | ||
825 | nr_reads++; | ||
826 | |||
827 | if (nr_reads >= max_nr_reads) | ||
828 | break; | ||
829 | } | ||
830 | |||
831 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) | ||
832 | && tg_may_dispatch(td, tg, bio, NULL)) { | ||
833 | |||
834 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | ||
835 | nr_writes++; | ||
836 | |||
837 | if (nr_writes >= max_nr_writes) | ||
838 | break; | ||
839 | } | ||
840 | |||
841 | return nr_reads + nr_writes; | ||
842 | } | ||
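Assuming throtl_grp_quantum is 8 (its definition is earlier in this file and the value here is an assumption), max_nr_reads works out to 8 * 3 / 4 = 6 and max_nr_writes to 8 - 6 = 2, so a single visit to a group moves at most six reads and two writes that are still within their rate limits.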
843 | |||
844 | static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | ||
845 | { | ||
846 | unsigned int nr_disp = 0; | ||
847 | struct throtl_grp *tg; | ||
848 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
849 | |||
850 | while (1) { | ||
851 | tg = throtl_rb_first(st); | ||
852 | |||
853 | if (!tg) | ||
854 | break; | ||
855 | |||
856 | if (time_before(jiffies, tg->disptime)) | ||
857 | break; | ||
858 | |||
859 | throtl_dequeue_tg(td, tg); | ||
860 | |||
861 | nr_disp += throtl_dispatch_tg(td, tg, bl); | ||
862 | |||
863 | if (tg->nr_queued[0] || tg->nr_queued[1]) { | ||
864 | tg_update_disptime(td, tg); | ||
865 | throtl_enqueue_tg(td, tg); | ||
866 | } | ||
867 | |||
868 | if (nr_disp >= throtl_quantum) | ||
869 | break; | ||
870 | } | ||
871 | |||
872 | return nr_disp; | ||
873 | } | ||
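Assuming throtl_quantum is 32 (again defined earlier in the file and taken as an assumption here), one dispatch round stops after 32 bios even if more groups have become eligible, leaving the remainder for the next run of the worker.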
874 | |||
875 | static void throtl_process_limit_change(struct throtl_data *td) | ||
876 | { | ||
877 | struct throtl_grp *tg; | ||
878 | struct hlist_node *pos, *n; | ||
879 | |||
880 | if (!td->limits_changed) | ||
881 | return; | ||
882 | |||
883 | xchg(&td->limits_changed, false); | ||
884 | |||
885 | throtl_log(td, "limits changed"); | ||
886 | |||
887 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | ||
888 | if (!tg->limits_changed) | ||
889 | continue; | ||
890 | |||
891 | if (!xchg(&tg->limits_changed, false)) | ||
892 | continue; | ||
893 | |||
894 | throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | ||
895 | " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], | ||
896 | tg->iops[READ], tg->iops[WRITE]); | ||
897 | |||
898 | /* | ||
899 | * Restart the slices for both READ and WRITE. It | ||
900 | * might happen that a group's limits are dropped | ||
901 | * suddenly and we don't want to account recently | ||
902 | * dispatched IO at the new low rate | ||
903 | */ | ||
904 | throtl_start_new_slice(td, tg, 0); | ||
905 | throtl_start_new_slice(td, tg, 1); | ||
906 | |||
907 | if (throtl_tg_on_rr(tg)) | ||
908 | tg_update_disptime(td, tg); | ||
909 | } | ||
910 | } | ||
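A concrete case for restarting the slices (numbers assumed): a group that has just dispatched 50 MB in its current slice and is then dropped from an unlimited rate to 1 MB/s would, without the restart, have those 50 MB charged against the new 1 MB/s budget and stall newly queued IO for close to a minute; starting a fresh slice forgets that old dispatch history.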
911 | |||
912 | /* Dispatch throttled bios. Should be called without queue lock held. */ | ||
913 | static int throtl_dispatch(struct request_queue *q) | ||
914 | { | ||
915 | struct throtl_data *td = q->td; | ||
916 | unsigned int nr_disp = 0; | ||
917 | struct bio_list bio_list_on_stack; | ||
918 | struct bio *bio; | ||
919 | struct blk_plug plug; | ||
920 | |||
921 | spin_lock_irq(q->queue_lock); | ||
922 | |||
923 | throtl_process_limit_change(td); | ||
924 | |||
925 | if (!total_nr_queued(td)) | ||
926 | goto out; | ||
927 | |||
928 | bio_list_init(&bio_list_on_stack); | ||
929 | |||
930 | throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", | ||
931 | total_nr_queued(td), td->nr_queued[READ], | ||
932 | td->nr_queued[WRITE]); | ||
933 | |||
934 | nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); | ||
935 | |||
936 | if (nr_disp) | ||
937 | throtl_log(td, "bios disp=%u", nr_disp); | ||
938 | |||
939 | throtl_schedule_next_dispatch(td); | ||
940 | out: | ||
941 | spin_unlock_irq(q->queue_lock); | ||
942 | |||
943 | /* | ||
944 | * If we dispatched some requests, unplug the queue to ensure | ||
945 | * immediate dispatch. | ||
946 | */ | ||
947 | if (nr_disp) { | ||
948 | blk_start_plug(&plug); | ||
949 | while((bio = bio_list_pop(&bio_list_on_stack))) | ||
950 | generic_make_request(bio); | ||
951 | blk_finish_plug(&plug); | ||
952 | } | ||
953 | return nr_disp; | ||
954 | } | ||
955 | |||
956 | void blk_throtl_work(struct work_struct *work) | ||
957 | { | ||
958 | struct throtl_data *td = container_of(work, struct throtl_data, | ||
959 | throtl_work.work); | ||
960 | struct request_queue *q = td->queue; | ||
961 | |||
962 | throtl_dispatch(q); | ||
963 | } | ||
964 | |||
965 | /* Call with queue lock held */ | ||
966 | static void | ||
967 | throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | ||
968 | { | ||
969 | |||
970 | struct delayed_work *dwork = &td->throtl_work; | ||
971 | |||
972 | /* schedule work if limits changed even if no bio is queued */ | ||
973 | if (total_nr_queued(td) > 0 || td->limits_changed) { | ||
974 | /* | ||
975 | * We might have a work scheduled to be executed in future. | ||
976 | * Cancel that and schedule a new one. | ||
977 | */ | ||
978 | __cancel_delayed_work(dwork); | ||
979 | queue_delayed_work(kthrotld_workqueue, dwork, delay); | ||
980 | throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | ||
981 | delay, jiffies); | ||
982 | } | ||
983 | } | ||
984 | |||
985 | static void | ||
986 | throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
987 | { | ||
988 | /* Something is wrong if we are trying to remove the same group twice */ | ||
989 | BUG_ON(hlist_unhashed(&tg->tg_node)); | ||
990 | |||
991 | hlist_del_init(&tg->tg_node); | ||
992 | |||
993 | /* | ||
994 | * Put the reference taken at the time of creation so that when all | ||
995 | * queues are gone, group can be destroyed. | ||
996 | */ | ||
997 | throtl_put_tg(tg); | ||
998 | td->nr_undestroyed_grps--; | ||
999 | } | ||
1000 | |||
1001 | static void throtl_release_tgs(struct throtl_data *td) | ||
1002 | { | ||
1003 | struct hlist_node *pos, *n; | ||
1004 | struct throtl_grp *tg; | ||
1005 | |||
1006 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | ||
1007 | /* | ||
1008 | * If the cgroup removal path got to the blkio_group first and removed | ||
1009 | * it from the cgroup list, then it will take care of destroying | ||
1010 | * the group as well. | ||
1011 | */ | ||
1012 | if (!blkiocg_del_blkio_group(&tg->blkg)) | ||
1013 | throtl_destroy_tg(td, tg); | ||
1014 | } | ||
1015 | } | ||
1016 | |||
1017 | static void throtl_td_free(struct throtl_data *td) | ||
1018 | { | ||
1019 | kfree(td); | ||
1020 | } | ||
1021 | |||
1022 | /* | ||
1023 | * Blk cgroup controller notification saying that blkio_group object is being | ||
1024 | * delinked because the associated cgroup object is going away. That also means | ||
1025 | * that no new IO will come to this group. So get rid of this group as soon as | ||
1026 | * any pending IO in the group is finished. | ||
1027 | * | ||
1028 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
1029 | * pointer. That means "key" is a valid throtl_data pointer as long as we | ||
1030 | * hold the rcu read lock. | ||
1031 | * | ||
1032 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
1033 | * it should not be NULL because, even if the queue was going away, the cgroup | ||
1034 | * deletion path got to it first. | ||
1035 | */ | ||
1036 | void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
1037 | { | ||
1038 | unsigned long flags; | ||
1039 | struct throtl_data *td = key; | ||
1040 | |||
1041 | spin_lock_irqsave(td->queue->queue_lock, flags); | ||
1042 | throtl_destroy_tg(td, tg_of_blkg(blkg)); | ||
1043 | spin_unlock_irqrestore(td->queue->queue_lock, flags); | ||
1044 | } | ||
1045 | |||
1046 | static void throtl_update_blkio_group_common(struct throtl_data *td, | ||
1047 | struct throtl_grp *tg) | ||
1048 | { | ||
1049 | xchg(&tg->limits_changed, true); | ||
1050 | xchg(&td->limits_changed, true); | ||
1051 | /* Schedule a work now to process the limit change */ | ||
1052 | throtl_schedule_delayed_work(td, 0); | ||
1053 | } | ||
1054 | |||
1055 | /* | ||
1056 | * For all update functions, key should be a valid pointer because these | ||
1057 | * update functions are called under blkcg_lock, which means blkg is | ||
1058 | * valid and in turn key is valid. The queue exit path cannot race because | ||
1059 | * of blkcg_lock. | ||
1060 | * | ||
1061 | * Cannot take the queue lock in update functions, as taking the queue lock | ||
1062 | * under blkcg_lock is not allowed. Other paths take blkcg_lock under queue_lock. | ||
1063 | */ | ||
1064 | static void throtl_update_blkio_group_read_bps(void *key, | ||
1065 | struct blkio_group *blkg, u64 read_bps) | ||
1066 | { | ||
1067 | struct throtl_data *td = key; | ||
1068 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1069 | |||
1070 | tg->bps[READ] = read_bps; | ||
1071 | throtl_update_blkio_group_common(td, tg); | ||
1072 | } | ||
1073 | |||
1074 | static void throtl_update_blkio_group_write_bps(void *key, | ||
1075 | struct blkio_group *blkg, u64 write_bps) | ||
1076 | { | ||
1077 | struct throtl_data *td = key; | ||
1078 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1079 | |||
1080 | tg->bps[WRITE] = write_bps; | ||
1081 | throtl_update_blkio_group_common(td, tg); | ||
1082 | } | ||
1083 | |||
1084 | static void throtl_update_blkio_group_read_iops(void *key, | ||
1085 | struct blkio_group *blkg, unsigned int read_iops) | ||
1086 | { | ||
1087 | struct throtl_data *td = key; | ||
1088 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1089 | |||
1090 | tg->iops[READ] = read_iops; | ||
1091 | throtl_update_blkio_group_common(td, tg); | ||
1092 | } | ||
1093 | |||
1094 | static void throtl_update_blkio_group_write_iops(void *key, | ||
1095 | struct blkio_group *blkg, unsigned int write_iops) | ||
1096 | { | ||
1097 | struct throtl_data *td = key; | ||
1098 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1099 | |||
1100 | tg->iops[WRITE] = write_iops; | ||
1101 | throtl_update_blkio_group_common(td, tg); | ||
1102 | } | ||
1103 | |||
1104 | static void throtl_shutdown_wq(struct request_queue *q) | ||
1105 | { | ||
1106 | struct throtl_data *td = q->td; | ||
1107 | |||
1108 | cancel_delayed_work_sync(&td->throtl_work); | ||
1109 | } | ||
1110 | |||
1111 | static struct blkio_policy_type blkio_policy_throtl = { | ||
1112 | .ops = { | ||
1113 | .blkio_unlink_group_fn = throtl_unlink_blkio_group, | ||
1114 | .blkio_update_group_read_bps_fn = | ||
1115 | throtl_update_blkio_group_read_bps, | ||
1116 | .blkio_update_group_write_bps_fn = | ||
1117 | throtl_update_blkio_group_write_bps, | ||
1118 | .blkio_update_group_read_iops_fn = | ||
1119 | throtl_update_blkio_group_read_iops, | ||
1120 | .blkio_update_group_write_iops_fn = | ||
1121 | throtl_update_blkio_group_write_iops, | ||
1122 | }, | ||
1123 | .plid = BLKIO_POLICY_THROTL, | ||
1124 | }; | ||
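These update callbacks fire when one of the per-device limit files of the blkio cgroup controller is written. The following userspace snippet is a hypothetical illustration of what would end up invoking throtl_update_blkio_group_read_bps() for device 8:16; the cgroup mount point and group name are assumptions.

/* Hypothetical illustration: cap reads on device 8:16 at 1 MB/s.
 * The /cgroup/blkio mount point and the "grp1" group are assumed. */
#include <stdio.h>

int main(void)
{
        const char *path =
                "/cgroup/blkio/grp1/blkio.throttle.read_bps_device";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "8:16 1048576\n");   /* "major:minor bytes_per_second" */
        return fclose(f) ? 1 : 0;
}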
1125 | |||
1126 | int blk_throtl_bio(struct request_queue *q, struct bio **biop) | ||
1127 | { | ||
1128 | struct throtl_data *td = q->td; | ||
1129 | struct throtl_grp *tg; | ||
1130 | struct bio *bio = *biop; | ||
1131 | bool rw = bio_data_dir(bio), update_disptime = true; | ||
1132 | struct blkio_cgroup *blkcg; | ||
1133 | |||
1134 | if (bio->bi_rw & REQ_THROTTLED) { | ||
1135 | bio->bi_rw &= ~REQ_THROTTLED; | ||
1136 | return 0; | ||
1137 | } | ||
1138 | |||
1139 | /* | ||
1140 | * A throtl_grp pointer retrieved under rcu can be used to access | ||
1141 | * basic fields like stats and io rates. If a group has no rules, | ||
1142 | * just update the dispatch stats in a lockless manner and return. | ||
1143 | */ | ||
1144 | |||
1145 | rcu_read_lock(); | ||
1146 | blkcg = task_blkio_cgroup(current); | ||
1147 | tg = throtl_find_tg(td, blkcg); | ||
1148 | if (tg) { | ||
1149 | throtl_tg_fill_dev_details(td, tg); | ||
1150 | |||
1151 | if (tg_no_rule_group(tg, rw)) { | ||
1152 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | ||
1153 | rw, bio->bi_rw & REQ_SYNC); | ||
1154 | rcu_read_unlock(); | ||
1155 | return 0; | ||
1156 | } | ||
1157 | } | ||
1158 | rcu_read_unlock(); | ||
1159 | |||
1160 | /* | ||
1161 | * Either group has not been allocated yet or it is not an unlimited | ||
1162 | * IO group | ||
1163 | */ | ||
1164 | |||
1165 | spin_lock_irq(q->queue_lock); | ||
1166 | tg = throtl_get_tg(td); | ||
1167 | |||
1168 | if (IS_ERR(tg)) { | ||
1169 | if (PTR_ERR(tg) == -ENODEV) { | ||
1170 | /* | ||
1171 | * Queue is gone. No queue lock held here. | ||
1172 | */ | ||
1173 | return -ENODEV; | ||
1174 | } | ||
1175 | } | ||
1176 | |||
1177 | if (tg->nr_queued[rw]) { | ||
1178 | /* | ||
1179 | * There is already another bio queued in same dir. No | ||
1180 | * need to update dispatch time. | ||
1181 | */ | ||
1182 | update_disptime = false; | ||
1183 | goto queue_bio; | ||
1184 | |||
1185 | } | ||
1186 | |||
1187 | /* Bio is within the rate limit of the group */ | ||
1188 | if (tg_may_dispatch(td, tg, bio, NULL)) { | ||
1189 | throtl_charge_bio(tg, bio); | ||
1190 | |||
1191 | /* | ||
1192 | * We need to trim slice even when bios are not being queued | ||
1193 | * otherwise it might happen that a bio is not queued for | ||
1194 | * a long time and slice keeps on extending and trim is not | ||
1195 | * called for a long time. Now if limits are reduced suddenly | ||
1196 | * we take into account all the IO dispatched so far at the new | ||
1197 | * low rate and newly queued IO gets a really long dispatch | ||
1198 | * time. | ||
1199 | * | ||
1200 | * So keep on trimming slice even if bio is not queued. | ||
1201 | */ | ||
1202 | throtl_trim_slice(td, tg, rw); | ||
1203 | goto out; | ||
1204 | } | ||
1205 | |||
1206 | queue_bio: | ||
1207 | throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" | ||
1208 | " iodisp=%u iops=%u queued=%d/%d", | ||
1209 | rw == READ ? 'R' : 'W', | ||
1210 | tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | ||
1211 | tg->io_disp[rw], tg->iops[rw], | ||
1212 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | ||
1213 | |||
1214 | throtl_add_bio_tg(q->td, tg, bio); | ||
1215 | *biop = NULL; | ||
1216 | |||
1217 | if (update_disptime) { | ||
1218 | tg_update_disptime(td, tg); | ||
1219 | throtl_schedule_next_dispatch(td); | ||
1220 | } | ||
1221 | |||
1222 | out: | ||
1223 | spin_unlock_irq(q->queue_lock); | ||
1224 | return 0; | ||
1225 | } | ||
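For callers, the contract sketched by this function is: a return of -ENODEV means the queue is going away (and the queue lock has already been dropped); a return of 0 with *biop left intact means the bio is within its limits and should continue down the normal submission path; a return of 0 with *biop set to NULL means the bio has been queued in the group and will be resubmitted later by the dispatch worker.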
1226 | |||
1227 | int blk_throtl_init(struct request_queue *q) | ||
1228 | { | ||
1229 | struct throtl_data *td; | ||
1230 | struct throtl_grp *tg; | ||
1231 | |||
1232 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | ||
1233 | if (!td) | ||
1234 | return -ENOMEM; | ||
1235 | |||
1236 | INIT_HLIST_HEAD(&td->tg_list); | ||
1237 | td->tg_service_tree = THROTL_RB_ROOT; | ||
1238 | td->limits_changed = false; | ||
1239 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | ||
1240 | |||
1241 | /* Alloc and init root group. */ | ||
1242 | td->queue = q; | ||
1243 | tg = throtl_alloc_tg(td); | ||
1244 | |||
1245 | if (!tg) { | ||
1246 | kfree(td); | ||
1247 | return -ENOMEM; | ||
1248 | } | ||
1249 | |||
1250 | td->root_tg = tg; | ||
1251 | |||
1252 | rcu_read_lock(); | ||
1253 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); | ||
1254 | rcu_read_unlock(); | ||
1255 | |||
1256 | /* Attach throtl data to request queue */ | ||
1257 | q->td = td; | ||
1258 | return 0; | ||
1259 | } | ||
1260 | |||
1261 | void blk_throtl_exit(struct request_queue *q) | ||
1262 | { | ||
1263 | struct throtl_data *td = q->td; | ||
1264 | bool wait = false; | ||
1265 | |||
1266 | BUG_ON(!td); | ||
1267 | |||
1268 | throtl_shutdown_wq(q); | ||
1269 | |||
1270 | spin_lock_irq(q->queue_lock); | ||
1271 | throtl_release_tgs(td); | ||
1272 | |||
1273 | /* If there are other groups */ | ||
1274 | if (td->nr_undestroyed_grps > 0) | ||
1275 | wait = true; | ||
1276 | |||
1277 | spin_unlock_irq(q->queue_lock); | ||
1278 | |||
1279 | /* | ||
1280 | * Wait for tg->blkg->key accessors to exit their grace periods. | ||
1281 | * Do this wait only if there are other undestroyed groups out | ||
1282 | * there (other than root group). This can happen if cgroup deletion | ||
1283 | * path claimed the responsibility of cleaning up a group before | ||
1284 | * queue cleanup code gets to the group. | ||
1285 | * | ||
1286 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
1287 | * which create/delete request queue hundreds of times during scan/boot | ||
1288 | * and synchronize_rcu() can take significant time and slow down boot. | ||
1289 | */ | ||
1290 | if (wait) | ||
1291 | synchronize_rcu(); | ||
1292 | |||
1293 | /* | ||
1294 | * Just to be safe: if somebody updated limits through the cgroup after | ||
1295 | * the previous flush and another work item got queued, cancel | ||
1296 | * it. | ||
1297 | */ | ||
1298 | throtl_shutdown_wq(q); | ||
1299 | throtl_td_free(td); | ||
1300 | } | ||
1301 | |||
1302 | static int __init throtl_init(void) | ||
1303 | { | ||
1304 | kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); | ||
1305 | if (!kthrotld_workqueue) | ||
1306 | panic("Failed to create kthrotld\n"); | ||
1307 | |||
1308 | blkio_policy_register(&blkio_policy_throtl); | ||
1309 | return 0; | ||
1310 | } | ||
1311 | |||
1312 | module_init(throtl_init); | ||
diff --git a/block/blk.h b/block/blk.h index d6b911ac002c..d6586287adc9 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, | |||
18 | void blk_dequeue_request(struct request *rq); | 18 | void blk_dequeue_request(struct request *rq); |
19 | void __blk_queue_free_tags(struct request_queue *q); | 19 | void __blk_queue_free_tags(struct request_queue *q); |
20 | 20 | ||
21 | void blk_unplug_work(struct work_struct *work); | ||
22 | void blk_unplug_timeout(unsigned long data); | ||
23 | void blk_rq_timed_out_timer(unsigned long data); | 21 | void blk_rq_timed_out_timer(unsigned long data); |
24 | void blk_delete_timer(struct request *); | 22 | void blk_delete_timer(struct request *); |
25 | void blk_add_timer(struct request *); | 23 | void blk_add_timer(struct request *); |
@@ -34,7 +32,7 @@ enum rq_atomic_flags { | |||
34 | 32 | ||
35 | /* | 33 | /* |
36 | * EH timer and IO completion will both attempt to 'grab' the request, make | 34 | * EH timer and IO completion will both attempt to 'grab' the request, make |
37 | * sure that only one of them suceeds | 35 | * sure that only one of them succeeds |
38 | */ | 36 | */ |
39 | static inline int blk_mark_rq_complete(struct request *rq) | 37 | static inline int blk_mark_rq_complete(struct request *rq) |
40 | { | 38 | { |
@@ -51,18 +49,41 @@ static inline void blk_clear_rq_complete(struct request *rq) | |||
51 | */ | 49 | */ |
52 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) | 50 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) |
53 | 51 | ||
52 | void blk_insert_flush(struct request *rq); | ||
53 | void blk_abort_flushes(struct request_queue *q); | ||
54 | |||
54 | static inline struct request *__elv_next_request(struct request_queue *q) | 55 | static inline struct request *__elv_next_request(struct request_queue *q) |
55 | { | 56 | { |
56 | struct request *rq; | 57 | struct request *rq; |
57 | 58 | ||
58 | while (1) { | 59 | while (1) { |
59 | while (!list_empty(&q->queue_head)) { | 60 | if (!list_empty(&q->queue_head)) { |
60 | rq = list_entry_rq(q->queue_head.next); | 61 | rq = list_entry_rq(q->queue_head.next); |
61 | if (blk_do_ordered(q, &rq)) | 62 | return rq; |
62 | return rq; | ||
63 | } | 63 | } |
64 | 64 | ||
65 | if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) | 65 | /* |
66 | * If a flush request is running and flush requests aren't queueable | ||
67 | * in the drive, we can hold the queue until the flush request is | ||
68 | * finished. Even if we don't do this, the driver can't dispatch the | ||
69 | * next requests and will requeue them. Doing this can also improve | ||
70 | * throughput. For example, with requests flush1, write1, flush2: | ||
71 | * flush1 is dispatched, then the queue is held and write1 isn't | ||
72 | * inserted into the queue. After flush1 finishes, flush2 is | ||
73 | * dispatched. Since the disk cache is already clean, flush2 | ||
74 | * finishes very quickly, so it looks as if flush2 was folded | ||
75 | * into flush1. | ||
76 | * Since the queue is held, a flag is set to indicate that the queue | ||
77 | * should be restarted later. Please see flush_end_io() for | ||
78 | * details. | ||
79 | */ | ||
80 | if (q->flush_pending_idx != q->flush_running_idx && | ||
81 | !queue_flush_queueable(q)) { | ||
82 | q->flush_queue_delayed = 1; | ||
83 | return NULL; | ||
84 | } | ||
85 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || | ||
86 | !q->elevator->ops->elevator_dispatch_fn(q, 0)) | ||
66 | return NULL; | 87 | return NULL; |
67 | } | 88 | } |
68 | } | 89 | } |
@@ -103,6 +124,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
103 | struct bio *bio); | 124 | struct bio *bio); |
104 | int attempt_back_merge(struct request_queue *q, struct request *rq); | 125 | int attempt_back_merge(struct request_queue *q, struct request *rq); |
105 | int attempt_front_merge(struct request_queue *q, struct request *rq); | 126 | int attempt_front_merge(struct request_queue *q, struct request *rq); |
127 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
128 | struct request *next); | ||
106 | void blk_recalc_rq_segments(struct request *rq); | 129 | void blk_recalc_rq_segments(struct request *rq); |
107 | void blk_rq_set_mixed_merge(struct request *rq); | 130 | void blk_rq_set_mixed_merge(struct request *rq); |
108 | 131 | ||
@@ -132,14 +155,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) | |||
132 | return q->nr_congestion_off; | 155 | return q->nr_congestion_off; |
133 | } | 156 | } |
134 | 157 | ||
135 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | ||
136 | |||
137 | #define rq_for_each_integrity_segment(bvl, _rq, _iter) \ | ||
138 | __rq_for_each_bio(_iter.bio, _rq) \ | ||
139 | bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) | ||
140 | |||
141 | #endif /* BLK_DEV_INTEGRITY */ | ||
142 | |||
143 | static inline int blk_cpu_to_group(int cpu) | 158 | static inline int blk_cpu_to_group(int cpu) |
144 | { | 159 | { |
145 | int group = NR_CPUS; | 160 | int group = NR_CPUS; |
diff --git a/block/bsg.c b/block/bsg.c index 0c00870553a3..0c8b64a16484 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/uio.h> | 20 | #include <linux/uio.h> |
21 | #include <linux/idr.h> | 21 | #include <linux/idr.h> |
22 | #include <linux/bsg.h> | 22 | #include <linux/bsg.h> |
23 | #include <linux/smp_lock.h> | ||
24 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
25 | 24 | ||
26 | #include <scsi/scsi.h> | 25 | #include <scsi/scsi.h> |
@@ -251,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, | |||
251 | int ret, rw; | 250 | int ret, rw; |
252 | unsigned int dxfer_len; | 251 | unsigned int dxfer_len; |
253 | void *dxferp = NULL; | 252 | void *dxferp = NULL; |
253 | struct bsg_class_device *bcd = &q->bsg_dev; | ||
254 | |||
255 | /* if the LLD has been removed then the bsg_unregister_queue will | ||
256 | * eventually be called and the class_dev was freed, so we can no | ||
257 | * longer use this request_queue. Return no such address. | ||
258 | */ | ||
259 | if (!bcd->class_dev) | ||
260 | return ERR_PTR(-ENXIO); | ||
254 | 261 | ||
255 | dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, | 262 | dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, |
256 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, | 263 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, |
@@ -843,9 +850,7 @@ static int bsg_open(struct inode *inode, struct file *file) | |||
843 | { | 850 | { |
844 | struct bsg_device *bd; | 851 | struct bsg_device *bd; |
845 | 852 | ||
846 | lock_kernel(); | ||
847 | bd = bsg_get_device(inode, file); | 853 | bd = bsg_get_device(inode, file); |
848 | unlock_kernel(); | ||
849 | 854 | ||
850 | if (IS_ERR(bd)) | 855 | if (IS_ERR(bd)) |
851 | return PTR_ERR(bd); | 856 | return PTR_ERR(bd); |
@@ -968,6 +973,7 @@ static const struct file_operations bsg_fops = { | |||
968 | .release = bsg_release, | 973 | .release = bsg_release, |
969 | .unlocked_ioctl = bsg_ioctl, | 974 | .unlocked_ioctl = bsg_ioctl, |
970 | .owner = THIS_MODULE, | 975 | .owner = THIS_MODULE, |
976 | .llseek = default_llseek, | ||
971 | }; | 977 | }; |
972 | 978 | ||
973 | void bsg_unregister_queue(struct request_queue *q) | 979 | void bsg_unregister_queue(struct request_queue *q) |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9eba291eb6fd..ae21919f15e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; | |||
54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | 54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) |
55 | 55 | ||
56 | #define RQ_CIC(rq) \ | 56 | #define RQ_CIC(rq) \ |
57 | ((struct cfq_io_context *) (rq)->elevator_private) | 57 | ((struct cfq_io_context *) (rq)->elevator_private[0]) |
58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) | 58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) |
59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) | 59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) |
60 | 60 | ||
61 | static struct kmem_cache *cfq_pool; | 61 | static struct kmem_cache *cfq_pool; |
62 | static struct kmem_cache *cfq_ioc_pool; | 62 | static struct kmem_cache *cfq_ioc_pool; |
@@ -87,7 +87,6 @@ struct cfq_rb_root { | |||
87 | unsigned count; | 87 | unsigned count; |
88 | unsigned total_weight; | 88 | unsigned total_weight; |
89 | u64 min_vdisktime; | 89 | u64 min_vdisktime; |
90 | struct rb_node *active; | ||
91 | }; | 90 | }; |
92 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ | 91 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ |
93 | .count = 0, .min_vdisktime = 0, } | 92 | .count = 0, .min_vdisktime = 0, } |
@@ -97,7 +96,7 @@ struct cfq_rb_root { | |||
97 | */ | 96 | */ |
98 | struct cfq_queue { | 97 | struct cfq_queue { |
99 | /* reference count */ | 98 | /* reference count */ |
100 | atomic_t ref; | 99 | int ref; |
101 | /* various state flags, see below */ | 100 | /* various state flags, see below */ |
102 | unsigned int flags; | 101 | unsigned int flags; |
103 | /* parent cfq_data */ | 102 | /* parent cfq_data */ |
@@ -147,7 +146,6 @@ struct cfq_queue { | |||
147 | struct cfq_rb_root *service_tree; | 146 | struct cfq_rb_root *service_tree; |
148 | struct cfq_queue *new_cfqq; | 147 | struct cfq_queue *new_cfqq; |
149 | struct cfq_group *cfqg; | 148 | struct cfq_group *cfqg; |
150 | struct cfq_group *orig_cfqg; | ||
151 | /* Number of sectors dispatched from queue in single dispatch round */ | 149 | /* Number of sectors dispatched from queue in single dispatch round */ |
152 | unsigned long nr_sectors; | 150 | unsigned long nr_sectors; |
153 | }; | 151 | }; |
@@ -160,6 +158,7 @@ enum wl_prio_t { | |||
160 | BE_WORKLOAD = 0, | 158 | BE_WORKLOAD = 0, |
161 | RT_WORKLOAD = 1, | 159 | RT_WORKLOAD = 1, |
162 | IDLE_WORKLOAD = 2, | 160 | IDLE_WORKLOAD = 2, |
161 | CFQ_PRIO_NR, | ||
163 | }; | 162 | }; |
164 | 163 | ||
165 | /* | 164 | /* |
@@ -179,15 +178,25 @@ struct cfq_group { | |||
179 | /* group service_tree key */ | 178 | /* group service_tree key */ |
180 | u64 vdisktime; | 179 | u64 vdisktime; |
181 | unsigned int weight; | 180 | unsigned int weight; |
182 | bool on_st; | 181 | unsigned int new_weight; |
182 | bool needs_update; | ||
183 | 183 | ||
184 | /* number of cfqq currently on this group */ | 184 | /* number of cfqq currently on this group */ |
185 | int nr_cfqq; | 185 | int nr_cfqq; |
186 | 186 | ||
187 | /* Per group busy queus average. Useful for workload slice calc. */ | ||
188 | unsigned int busy_queues_avg[2]; | ||
189 | /* | 187 | /* |
190 | * rr lists of queues with requests, onle rr for each priority class. | 188 | * Per group busy queues average. Useful for workload slice calc. We |
189 | * create the array for each prio class but at run time it is used | ||
190 | * only for RT and BE class and slot for IDLE class remains unused. | ||
191 | * This is primarily done to avoid confusion and a gcc warning. | ||
192 | */ | ||
193 | unsigned int busy_queues_avg[CFQ_PRIO_NR]; | ||
194 | /* | ||
195 | * rr lists of queues with requests. We maintain service trees for | ||
196 | * RT and BE classes. These trees are subdivided in subclasses | ||
197 | * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE | ||
198 | * class there is no subclassification and all the cfq queues go on | ||
199 | * a single tree service_tree_idle. | ||
191 | * Counts are embedded in the cfq_rb_root | 200 | * Counts are embedded in the cfq_rb_root |
192 | */ | 201 | */ |
193 | struct cfq_rb_root service_trees[2][3]; | 202 | struct cfq_rb_root service_trees[2][3]; |
@@ -199,7 +208,7 @@ struct cfq_group { | |||
199 | struct blkio_group blkg; | 208 | struct blkio_group blkg; |
200 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 209 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
201 | struct hlist_node cfqd_node; | 210 | struct hlist_node cfqd_node; |
202 | atomic_t ref; | 211 | int ref; |
203 | #endif | 212 | #endif |
204 | /* number of requests that are on the dispatch list or inside driver */ | 213 | /* number of requests that are on the dispatch list or inside driver */ |
205 | int dispatched; | 214 | int dispatched; |
@@ -221,7 +230,6 @@ struct cfq_data { | |||
221 | enum wl_type_t serving_type; | 230 | enum wl_type_t serving_type; |
222 | unsigned long workload_expires; | 231 | unsigned long workload_expires; |
223 | struct cfq_group *serving_group; | 232 | struct cfq_group *serving_group; |
224 | bool noidle_tree_requires_idle; | ||
225 | 233 | ||
226 | /* | 234 | /* |
227 | * Each priority tree is sorted by next_request position. These | 235 | * Each priority tree is sorted by next_request position. These |
@@ -231,6 +239,7 @@ struct cfq_data { | |||
231 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; | 239 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; |
232 | 240 | ||
233 | unsigned int busy_queues; | 241 | unsigned int busy_queues; |
242 | unsigned int busy_sync_queues; | ||
234 | 243 | ||
235 | int rq_in_driver; | 244 | int rq_in_driver; |
236 | int rq_in_flight[2]; | 245 | int rq_in_flight[2]; |
@@ -278,7 +287,6 @@ struct cfq_data { | |||
278 | unsigned int cfq_slice_idle; | 287 | unsigned int cfq_slice_idle; |
279 | unsigned int cfq_group_idle; | 288 | unsigned int cfq_group_idle; |
280 | unsigned int cfq_latency; | 289 | unsigned int cfq_latency; |
281 | unsigned int cfq_group_isolation; | ||
282 | 290 | ||
283 | unsigned int cic_index; | 291 | unsigned int cic_index; |
284 | struct list_head cic_list; | 292 | struct list_head cic_list; |
@@ -292,7 +300,9 @@ struct cfq_data { | |||
292 | 300 | ||
293 | /* List of cfq groups being managed on this device*/ | 301 | /* List of cfq groups being managed on this device*/ |
294 | struct hlist_head cfqg_list; | 302 | struct hlist_head cfqg_list; |
295 | struct rcu_head rcu; | 303 | |
304 | /* Number of groups which are on blkcg->blkg_list */ | ||
305 | unsigned int nr_blkcg_linked_grps; | ||
296 | }; | 306 | }; |
297 | 307 | ||
298 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 308 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
@@ -359,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy); | |||
359 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 369 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
360 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 370 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
361 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 371 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
362 | blkg_path(&(cfqq)->cfqg->blkg), ##args); | 372 | blkg_path(&(cfqq)->cfqg->blkg), ##args) |
363 | 373 | ||
364 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ | 374 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ |
365 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ | 375 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ |
366 | blkg_path(&(cfqg)->blkg), ##args); \ | 376 | blkg_path(&(cfqg)->blkg), ##args) \ |
367 | 377 | ||
368 | #else | 378 | #else |
369 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 379 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
370 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) | 380 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) |
371 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); | 381 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) |
372 | #endif | 382 | #endif |
373 | #define cfq_log(cfqd, fmt, args...) \ | 383 | #define cfq_log(cfqd, fmt, args...) \ |
374 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | 384 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) |
@@ -494,13 +504,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) | |||
494 | } | 504 | } |
495 | } | 505 | } |
496 | 506 | ||
497 | static int cfq_queue_empty(struct request_queue *q) | ||
498 | { | ||
499 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
500 | |||
501 | return !cfqd->rq_queued; | ||
502 | } | ||
503 | |||
504 | /* | 507 | /* |
505 | * Scale schedule slice based on io priority. Use the sync time slice only | 508 | * Scale schedule slice based on io priority. Use the sync time slice only |
506 | * if a queue is marked sync and has sync io queued. A sync queue with async | 509 | * if a queue is marked sync and has sync io queued. A sync queue with async |
@@ -551,20 +554,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) | |||
551 | 554 | ||
552 | static void update_min_vdisktime(struct cfq_rb_root *st) | 555 | static void update_min_vdisktime(struct cfq_rb_root *st) |
553 | { | 556 | { |
554 | u64 vdisktime = st->min_vdisktime; | ||
555 | struct cfq_group *cfqg; | 557 | struct cfq_group *cfqg; |
556 | 558 | ||
557 | if (st->active) { | ||
558 | cfqg = rb_entry_cfqg(st->active); | ||
559 | vdisktime = cfqg->vdisktime; | ||
560 | } | ||
561 | |||
562 | if (st->left) { | 559 | if (st->left) { |
563 | cfqg = rb_entry_cfqg(st->left); | 560 | cfqg = rb_entry_cfqg(st->left); |
564 | vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); | 561 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, |
562 | cfqg->vdisktime); | ||
565 | } | 563 | } |
566 | |||
567 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); | ||
568 | } | 564 | } |
569 | 565 | ||
570 | /* | 566 | /* |
@@ -596,8 +592,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
596 | return cfq_target_latency * cfqg->weight / st->total_weight; | 592 | return cfq_target_latency * cfqg->weight / st->total_weight; |
597 | } | 593 | } |
598 | 594 | ||
599 | static inline void | 595 | static inline unsigned |
600 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 596 | cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
601 | { | 597 | { |
602 | unsigned slice = cfq_prio_to_slice(cfqd, cfqq); | 598 | unsigned slice = cfq_prio_to_slice(cfqd, cfqq); |
603 | if (cfqd->cfq_latency) { | 599 | if (cfqd->cfq_latency) { |
@@ -623,6 +619,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
623 | low_slice); | 619 | low_slice); |
624 | } | 620 | } |
625 | } | 621 | } |
622 | return slice; | ||
623 | } | ||
624 | |||
625 | static inline void | ||
626 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
627 | { | ||
628 | unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
629 | |||
626 | cfqq->slice_start = jiffies; | 630 | cfqq->slice_start = jiffies; |
627 | cfqq->slice_end = jiffies + slice; | 631 | cfqq->slice_end = jiffies + slice; |
628 | cfqq->allocated_slice = slice; | 632 | cfqq->allocated_slice = slice; |
@@ -637,11 +641,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
637 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) | 641 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) |
638 | { | 642 | { |
639 | if (cfq_cfqq_slice_new(cfqq)) | 643 | if (cfq_cfqq_slice_new(cfqq)) |
640 | return 0; | 644 | return false; |
641 | if (time_before(jiffies, cfqq->slice_end)) | 645 | if (time_before(jiffies, cfqq->slice_end)) |
642 | return 0; | 646 | return false; |
643 | 647 | ||
644 | return 1; | 648 | return true; |
645 | } | 649 | } |
646 | 650 | ||
647 | /* | 651 | /* |
@@ -663,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, | |||
663 | if (rq2 == NULL) | 667 | if (rq2 == NULL) |
664 | return rq1; | 668 | return rq1; |
665 | 669 | ||
666 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) | 670 | if (rq_is_sync(rq1) != rq_is_sync(rq2)) |
667 | return rq1; | 671 | return rq_is_sync(rq1) ? rq1 : rq2; |
668 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) | 672 | |
669 | return rq2; | 673 | if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) |
670 | if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) | 674 | return rq1->cmd_flags & REQ_META ? rq1 : rq2; |
671 | return rq1; | ||
672 | else if ((rq2->cmd_flags & REQ_META) && | ||
673 | !(rq1->cmd_flags & REQ_META)) | ||
674 | return rq2; | ||
675 | 675 | ||
676 | s1 = blk_rq_pos(rq1); | 676 | s1 = blk_rq_pos(rq1); |
677 | s2 = blk_rq_pos(rq2); | 677 | s2 = blk_rq_pos(rq2); |
@@ -853,20 +853,40 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
853 | } | 853 | } |
854 | 854 | ||
855 | static void | 855 | static void |
856 | cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | 856 | cfq_update_group_weight(struct cfq_group *cfqg) |
857 | { | ||
858 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
859 | if (cfqg->needs_update) { | ||
860 | cfqg->weight = cfqg->new_weight; | ||
861 | cfqg->needs_update = false; | ||
862 | } | ||
863 | } | ||
864 | |||
865 | static void | ||
866 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
867 | { | ||
868 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
869 | |||
870 | cfq_update_group_weight(cfqg); | ||
871 | __cfq_group_service_tree_add(st, cfqg); | ||
872 | st->total_weight += cfqg->weight; | ||
873 | } | ||
874 | |||
875 | static void | ||
876 | cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
857 | { | 877 | { |
858 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 878 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
859 | struct cfq_group *__cfqg; | 879 | struct cfq_group *__cfqg; |
860 | struct rb_node *n; | 880 | struct rb_node *n; |
861 | 881 | ||
862 | cfqg->nr_cfqq++; | 882 | cfqg->nr_cfqq++; |
863 | if (cfqg->on_st) | 883 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
864 | return; | 884 | return; |
865 | 885 | ||
866 | /* | 886 | /* |
867 | * Currently put the group at the end. Later implement something | 887 | * Currently put the group at the end. Later implement something |
868 | * so that groups get lesser vtime based on their weights, so that | 888 | * so that groups get lesser vtime based on their weights, so that |
869 | * if group does not loose all if it was not continously backlogged. | 889 | * if group does not loose all if it was not continuously backlogged. |
870 | */ | 890 | */ |
871 | n = rb_last(&st->rb); | 891 | n = rb_last(&st->rb); |
872 | if (n) { | 892 | if (n) { |
@@ -874,20 +894,22 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
874 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; | 894 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; |
875 | } else | 895 | } else |
876 | cfqg->vdisktime = st->min_vdisktime; | 896 | cfqg->vdisktime = st->min_vdisktime; |
897 | cfq_group_service_tree_add(st, cfqg); | ||
898 | } | ||
877 | 899 | ||
878 | __cfq_group_service_tree_add(st, cfqg); | 900 | static void |
879 | cfqg->on_st = true; | 901 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) |
880 | st->total_weight += cfqg->weight; | 902 | { |
903 | st->total_weight -= cfqg->weight; | ||
904 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
905 | cfq_rb_erase(&cfqg->rb_node, st); | ||
881 | } | 906 | } |
882 | 907 | ||
883 | static void | 908 | static void |
884 | cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | 909 | cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) |
885 | { | 910 | { |
886 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 911 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
887 | 912 | ||
888 | if (st->active == &cfqg->rb_node) | ||
889 | st->active = NULL; | ||
890 | |||
891 | BUG_ON(cfqg->nr_cfqq < 1); | 913 | BUG_ON(cfqg->nr_cfqq < 1); |
892 | cfqg->nr_cfqq--; | 914 | cfqg->nr_cfqq--; |
893 | 915 | ||
@@ -896,15 +918,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
896 | return; | 918 | return; |
897 | 919 | ||
898 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 920 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
899 | cfqg->on_st = false; | 921 | cfq_group_service_tree_del(st, cfqg); |
900 | st->total_weight -= cfqg->weight; | ||
901 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
902 | cfq_rb_erase(&cfqg->rb_node, st); | ||
903 | cfqg->saved_workload_slice = 0; | 922 | cfqg->saved_workload_slice = 0; |
904 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); | 923 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
905 | } | 924 | } |
906 | 925 | ||
907 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | 926 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, |
927 | unsigned int *unaccounted_time) | ||
908 | { | 928 | { |
909 | unsigned int slice_used; | 929 | unsigned int slice_used; |
910 | 930 | ||
@@ -923,8 +943,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | |||
923 | 1); | 943 | 1); |
924 | } else { | 944 | } else { |
925 | slice_used = jiffies - cfqq->slice_start; | 945 | slice_used = jiffies - cfqq->slice_start; |
926 | if (slice_used > cfqq->allocated_slice) | 946 | if (slice_used > cfqq->allocated_slice) { |
947 | *unaccounted_time = slice_used - cfqq->allocated_slice; | ||
927 | slice_used = cfqq->allocated_slice; | 948 | slice_used = cfqq->allocated_slice; |
949 | } | ||
950 | if (time_after(cfqq->slice_start, cfqq->dispatch_start)) | ||
951 | *unaccounted_time += cfqq->slice_start - | ||
952 | cfqq->dispatch_start; | ||
928 | } | 953 | } |
929 | 954 | ||
930 | return slice_used; | 955 | return slice_used; |
@@ -934,12 +959,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
934 | struct cfq_queue *cfqq) | 959 | struct cfq_queue *cfqq) |
935 | { | 960 | { |
936 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 961 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
937 | unsigned int used_sl, charge; | 962 | unsigned int used_sl, charge, unaccounted_sl = 0; |
938 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | 963 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) |
939 | - cfqg->service_tree_idle.count; | 964 | - cfqg->service_tree_idle.count; |
940 | 965 | ||
941 | BUG_ON(nr_sync < 0); | 966 | BUG_ON(nr_sync < 0); |
942 | used_sl = charge = cfq_cfqq_slice_usage(cfqq); | 967 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); |
943 | 968 | ||
944 | if (iops_mode(cfqd)) | 969 | if (iops_mode(cfqd)) |
945 | charge = cfqq->slice_dispatch; | 970 | charge = cfqq->slice_dispatch; |
@@ -947,9 +972,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
947 | charge = cfqq->allocated_slice; | 972 | charge = cfqq->allocated_slice; |
948 | 973 | ||
949 | /* Can't update vdisktime while group is on service tree */ | 974 | /* Can't update vdisktime while group is on service tree */ |
950 | cfq_rb_erase(&cfqg->rb_node, st); | 975 | cfq_group_service_tree_del(st, cfqg); |
951 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); | 976 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); |
952 | __cfq_group_service_tree_add(st, cfqg); | 977 | /* If a new weight was requested, update now, off tree */ |
978 | cfq_group_service_tree_add(st, cfqg); | ||
953 | 979 | ||
954 | /* This group is being expired. Save the context */ | 980 | /* This group is being expired. Save the context */ |
955 | if (time_after(cfqd->workload_expires, jiffies)) { | 981 | if (time_after(cfqd->workload_expires, jiffies)) { |
@@ -962,10 +988,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
962 | 988 | ||
963 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | 989 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, |
964 | st->min_vdisktime); | 990 | st->min_vdisktime); |
965 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" | 991 | cfq_log_cfqq(cfqq->cfqd, cfqq, |
966 | " sect=%u", used_sl, cfqq->slice_dispatch, charge, | 992 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", |
967 | iops_mode(cfqd), cfqq->nr_sectors); | 993 | used_sl, cfqq->slice_dispatch, charge, |
968 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); | 994 | iops_mode(cfqd), cfqq->nr_sectors); |
995 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, | ||
996 | unaccounted_sl); | ||
969 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); | 997 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); |
970 | } | 998 | } |
971 | 999 | ||
@@ -977,35 +1005,55 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) | |||
977 | return NULL; | 1005 | return NULL; |
978 | } | 1006 | } |
979 | 1007 | ||
980 | void | 1008 | void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, |
981 | cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) | 1009 | unsigned int weight) |
982 | { | 1010 | { |
983 | cfqg_of_blkg(blkg)->weight = weight; | 1011 | struct cfq_group *cfqg = cfqg_of_blkg(blkg); |
1012 | cfqg->new_weight = weight; | ||
1013 | cfqg->needs_update = true; | ||
984 | } | 1014 | } |
985 | 1015 | ||
986 | static struct cfq_group * | 1016 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, |
987 | cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | 1017 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) |
988 | { | 1018 | { |
989 | struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); | ||
990 | struct cfq_group *cfqg = NULL; | ||
991 | void *key = cfqd; | ||
992 | int i, j; | ||
993 | struct cfq_rb_root *st; | ||
994 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | 1019 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; |
995 | unsigned int major, minor; | 1020 | unsigned int major, minor; |
996 | 1021 | ||
997 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | 1022 | /* |
998 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | 1023 | * Add group onto cgroup list. It might happen that bdi->dev is |
1024 | * not initialized yet. Initialize this new group without major | ||
1025 | * and minor info and this info will be filled in once a new thread | ||
1026 | * comes for IO. | ||
1027 | */ | ||
1028 | if (bdi->dev) { | ||
999 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1029 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1000 | cfqg->blkg.dev = MKDEV(major, minor); | 1030 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1001 | goto done; | 1031 | (void *)cfqd, MKDEV(major, minor)); |
1002 | } | 1032 | } else |
1003 | if (cfqg || !create) | 1033 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1004 | goto done; | 1034 | (void *)cfqd, 0); |
1035 | |||
1036 | cfqd->nr_blkcg_linked_grps++; | ||
1037 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1038 | |||
1039 | /* Add group on cfqd list */ | ||
1040 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * Should be called from sleepable context. No request queue lock as per | ||
1045 | * cpu stats are allocated dynamically and alloc_percpu needs to be called | ||
1046 | * from sleepable context. | ||
1047 | */ | ||
1048 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) | ||
1049 | { | ||
1050 | struct cfq_group *cfqg = NULL; | ||
1051 | int i, j, ret; | ||
1052 | struct cfq_rb_root *st; | ||
1005 | 1053 | ||
1006 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | 1054 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); |
1007 | if (!cfqg) | 1055 | if (!cfqg) |
1008 | goto done; | 1056 | return NULL; |
1009 | 1057 | ||
1010 | for_each_cfqg_st(cfqg, i, j, st) | 1058 | for_each_cfqg_st(cfqg, i, j, st) |
1011 | *st = CFQ_RB_ROOT; | 1059 | *st = CFQ_RB_ROOT; |
@@ -1017,52 +1065,103 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
1017 | * elevator which will be dropped by either elevator exit | 1065 | * elevator which will be dropped by either elevator exit |
1018 | * or cgroup deletion path depending on who is exiting first. | 1066 | * or cgroup deletion path depending on who is exiting first. |
1019 | */ | 1067 | */ |
1020 | atomic_set(&cfqg->ref, 1); | 1068 | cfqg->ref = 1; |
1069 | |||
1070 | ret = blkio_alloc_blkg_stats(&cfqg->blkg); | ||
1071 | if (ret) { | ||
1072 | kfree(cfqg); | ||
1073 | return NULL; | ||
1074 | } | ||
1075 | |||
1076 | return cfqg; | ||
1077 | } | ||
1078 | |||
1079 | static struct cfq_group * | ||
1080 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) | ||
1081 | { | ||
1082 | struct cfq_group *cfqg = NULL; | ||
1083 | void *key = cfqd; | ||
1084 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1085 | unsigned int major, minor; | ||
1021 | 1086 | ||
1022 | /* | 1087 | /* |
1023 | * Add group onto cgroup list. It might happen that bdi->dev is | 1088 | * This is the common case when there are no blkio cgroups. |
1024 | * not initiliazed yet. Initialize this new group without major | 1089 | * Avoid lookup in this case |
1025 | * and minor info and this info will be filled in once a new thread | ||
1026 | * comes for IO. See code above. | ||
1027 | */ | 1090 | */ |
1028 | if (bdi->dev) { | 1091 | if (blkcg == &blkio_root_cgroup) |
1029 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1092 | cfqg = &cfqd->root_group; |
1030 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 1093 | else |
1031 | MKDEV(major, minor)); | 1094 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); |
1032 | } else | ||
1033 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | ||
1034 | 0); | ||
1035 | |||
1036 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1037 | 1095 | ||
1038 | /* Add group on cfqd list */ | 1096 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { |
1039 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 1097 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1098 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1099 | } | ||
1040 | 1100 | ||
1041 | done: | ||
1042 | return cfqg; | 1101 | return cfqg; |
1043 | } | 1102 | } |
1044 | 1103 | ||
1045 | /* | 1104 | /* |
1046 | * Search for the cfq group current task belongs to. If create = 1, then also | 1105 | * Search for the cfq group current task belongs to. request_queue lock must |
1047 | * create the cfq group if it does not exist. request_queue lock must be held. | 1106 | * be held. |
1048 | */ | 1107 | */ |
1049 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1108 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1050 | { | 1109 | { |
1051 | struct cgroup *cgroup; | 1110 | struct blkio_cgroup *blkcg; |
1052 | struct cfq_group *cfqg = NULL; | 1111 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; |
1112 | struct request_queue *q = cfqd->queue; | ||
1113 | |||
1114 | rcu_read_lock(); | ||
1115 | blkcg = task_blkio_cgroup(current); | ||
1116 | cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1117 | if (cfqg) { | ||
1118 | rcu_read_unlock(); | ||
1119 | return cfqg; | ||
1120 | } | ||
1121 | |||
1122 | /* | ||
1123 | * Need to allocate a group. Allocation of group also needs allocation | ||
1124 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
1125 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
1126 | * | ||
1127 | * Not taking any queue reference here and assuming that queue is | ||
1128 | * around by the time we return. CFQ queue allocation code does | ||
1129 | * the same. It might be racy though. | ||
1130 | */ | ||
1131 | |||
1132 | rcu_read_unlock(); | ||
1133 | spin_unlock_irq(q->queue_lock); | ||
1134 | |||
1135 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1136 | |||
1137 | spin_lock_irq(q->queue_lock); | ||
1053 | 1138 | ||
1054 | rcu_read_lock(); | 1139 | rcu_read_lock(); |
1055 | cgroup = task_cgroup(current, blkio_subsys_id); | 1140 | blkcg = task_blkio_cgroup(current); |
1056 | cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); | 1141 | |
1057 | if (!cfqg && create) | 1142 | /* |
1143 | * If some other thread already allocated the group while we were | ||
1144 | * not holding queue lock, free up the group | ||
1145 | */ | ||
1146 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1147 | |||
1148 | if (__cfqg) { | ||
1149 | kfree(cfqg); | ||
1150 | rcu_read_unlock(); | ||
1151 | return __cfqg; | ||
1152 | } | ||
1153 | |||
1154 | if (!cfqg) | ||
1058 | cfqg = &cfqd->root_group; | 1155 | cfqg = &cfqd->root_group; |
1156 | |||
1157 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1059 | rcu_read_unlock(); | 1158 | rcu_read_unlock(); |
1060 | return cfqg; | 1159 | return cfqg; |
1061 | } | 1160 | } |
1062 | 1161 | ||
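The rework of cfq_find_cfqg()/cfq_get_cfqg() above splits lookup from allocation: allocating the per-cpu stats can block, so the code drops both the RCU read lock and queue_lock around the allocation, retakes them, and frees its copy if another thread won the race. A rough user-space sketch of that pattern follows; a pthread mutex stands in for queue_lock, and struct group, find_group() and get_group() are illustrative names rather than kernel APIs.

/*
 * "Drop the lock, allocate, re-check" as in cfq_get_cfqg() above.
 * Toy user-space version; not kernel code.
 */
#include <pthread.h>
#include <stdlib.h>

struct group {
    int id;
    struct group *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *groups;            /* protected by lock */

static struct group *find_group(int id) /* caller holds lock */
{
    struct group *g;

    for (g = groups; g; g = g->next)
        if (g->id == id)
            return g;
    return NULL;
}

static struct group *get_group(int id)
{
    struct group *g, *winner;

    pthread_mutex_lock(&lock);
    g = find_group(id);
    if (g) {
        pthread_mutex_unlock(&lock);
        return g;
    }

    /* Allocation may block, so do it without the lock held. */
    pthread_mutex_unlock(&lock);
    g = calloc(1, sizeof(*g));

    pthread_mutex_lock(&lock);
    /* Someone else may have raced in and created the group. */
    winner = find_group(id);
    if (winner) {
        free(g);
        g = winner;
    } else if (g) {
        g->id = id;
        g->next = groups;
        groups = g;
    }
    pthread_mutex_unlock(&lock);
    return g;                   /* may be NULL if the allocation failed */
}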
1063 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | 1162 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) |
1064 | { | 1163 | { |
1065 | atomic_inc(&cfqg->ref); | 1164 | cfqg->ref++; |
1066 | return cfqg; | 1165 | return cfqg; |
1067 | } | 1166 | } |
1068 | 1167 | ||
@@ -1074,7 +1173,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | |||
1074 | 1173 | ||
1075 | cfqq->cfqg = cfqg; | 1174 | cfqq->cfqg = cfqg; |
1076 | /* cfqq reference on cfqg */ | 1175 | /* cfqq reference on cfqg */ |
1077 | atomic_inc(&cfqq->cfqg->ref); | 1176 | cfqq->cfqg->ref++; |
1078 | } | 1177 | } |
1079 | 1178 | ||
1080 | static void cfq_put_cfqg(struct cfq_group *cfqg) | 1179 | static void cfq_put_cfqg(struct cfq_group *cfqg) |
@@ -1082,11 +1181,13 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) | |||
1082 | struct cfq_rb_root *st; | 1181 | struct cfq_rb_root *st; |
1083 | int i, j; | 1182 | int i, j; |
1084 | 1183 | ||
1085 | BUG_ON(atomic_read(&cfqg->ref) <= 0); | 1184 | BUG_ON(cfqg->ref <= 0); |
1086 | if (!atomic_dec_and_test(&cfqg->ref)) | 1185 | cfqg->ref--; |
1186 | if (cfqg->ref) | ||
1087 | return; | 1187 | return; |
1088 | for_each_cfqg_st(cfqg, i, j, st) | 1188 | for_each_cfqg_st(cfqg, i, j, st) |
1089 | BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); | 1189 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); |
1190 | free_percpu(cfqg->blkg.stats_cpu); | ||
1090 | kfree(cfqg); | 1191 | kfree(cfqg); |
1091 | } | 1192 | } |
1092 | 1193 | ||
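The cfq_ref_get_cfqg()/cfq_put_cfqg() hunks above are part of a wider conversion of cfqg->ref and cfqq->ref from atomic_t to a plain int; that is only safe because every get and put now happens with queue_lock held, making the atomic read-modify-write redundant. A minimal sketch of the same idea under an ordinary mutex (illustrative names, not the kernel structures):

/*
 * Lock-protected plain-integer refcount, mirroring the atomic_t -> int
 * conversion above.  The single mutex plays the role of queue_lock.
 */
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct obj {
    int ref;                    /* protected by obj_lock */
};

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

static void obj_get(struct obj *o)      /* caller holds obj_lock */
{
    o->ref++;
}

static void obj_put(struct obj *o)      /* caller holds obj_lock */
{
    assert(o->ref > 0);
    if (--o->ref)
        return;
    free(o);                    /* last reference dropped */
}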
@@ -1145,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) | |||
1145 | } | 1246 | } |
1146 | 1247 | ||
1147 | #else /* GROUP_IOSCHED */ | 1248 | #else /* GROUP_IOSCHED */ |
1148 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1249 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1149 | { | 1250 | { |
1150 | return &cfqd->root_group; | 1251 | return &cfqd->root_group; |
1151 | } | 1252 | } |
@@ -1179,33 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1179 | struct cfq_rb_root *service_tree; | 1280 | struct cfq_rb_root *service_tree; |
1180 | int left; | 1281 | int left; |
1181 | int new_cfqq = 1; | 1282 | int new_cfqq = 1; |
1182 | int group_changed = 0; | ||
1183 | |||
1184 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
1185 | if (!cfqd->cfq_group_isolation | ||
1186 | && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD | ||
1187 | && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { | ||
1188 | /* Move this cfq to root group */ | ||
1189 | cfq_log_cfqq(cfqd, cfqq, "moving to root group"); | ||
1190 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1191 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1192 | cfqq->orig_cfqg = cfqq->cfqg; | ||
1193 | cfqq->cfqg = &cfqd->root_group; | ||
1194 | atomic_inc(&cfqd->root_group.ref); | ||
1195 | group_changed = 1; | ||
1196 | } else if (!cfqd->cfq_group_isolation | ||
1197 | && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { | ||
1198 | /* cfqq is sequential now needs to go to its original group */ | ||
1199 | BUG_ON(cfqq->cfqg != &cfqd->root_group); | ||
1200 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1201 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1202 | cfq_put_cfqg(cfqq->cfqg); | ||
1203 | cfqq->cfqg = cfqq->orig_cfqg; | ||
1204 | cfqq->orig_cfqg = NULL; | ||
1205 | group_changed = 1; | ||
1206 | cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); | ||
1207 | } | ||
1208 | #endif | ||
1209 | 1283 | ||
1210 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 1284 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), |
1211 | cfqq_type(cfqq)); | 1285 | cfqq_type(cfqq)); |
@@ -1276,9 +1350,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1276 | rb_link_node(&cfqq->rb_node, parent, p); | 1350 | rb_link_node(&cfqq->rb_node, parent, p); |
1277 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); | 1351 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); |
1278 | service_tree->count++; | 1352 | service_tree->count++; |
1279 | if ((add_front || !new_cfqq) && !group_changed) | 1353 | if (add_front || !new_cfqq) |
1280 | return; | 1354 | return; |
1281 | cfq_group_service_tree_add(cfqd, cfqq->cfqg); | 1355 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); |
1282 | } | 1356 | } |
1283 | 1357 | ||
1284 | static struct cfq_queue * | 1358 | static struct cfq_queue * |
@@ -1366,6 +1440,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1366 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 1440 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
1367 | cfq_mark_cfqq_on_rr(cfqq); | 1441 | cfq_mark_cfqq_on_rr(cfqq); |
1368 | cfqd->busy_queues++; | 1442 | cfqd->busy_queues++; |
1443 | if (cfq_cfqq_sync(cfqq)) | ||
1444 | cfqd->busy_sync_queues++; | ||
1369 | 1445 | ||
1370 | cfq_resort_rr_list(cfqd, cfqq); | 1446 | cfq_resort_rr_list(cfqd, cfqq); |
1371 | } | 1447 | } |
@@ -1389,9 +1465,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1389 | cfqq->p_root = NULL; | 1465 | cfqq->p_root = NULL; |
1390 | } | 1466 | } |
1391 | 1467 | ||
1392 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | 1468 | cfq_group_notify_queue_del(cfqd, cfqq->cfqg); |
1393 | BUG_ON(!cfqd->busy_queues); | 1469 | BUG_ON(!cfqd->busy_queues); |
1394 | cfqd->busy_queues--; | 1470 | cfqd->busy_queues--; |
1471 | if (cfq_cfqq_sync(cfqq)) | ||
1472 | cfqd->busy_sync_queues--; | ||
1395 | } | 1473 | } |
1396 | 1474 | ||
1397 | /* | 1475 | /* |
@@ -1663,8 +1741,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1663 | /* | 1741 | /* |
1664 | * store what was left of this slice, if the queue idled/timed out | 1742 | * store what was left of this slice, if the queue idled/timed out |
1665 | */ | 1743 | */ |
1666 | if (timed_out && !cfq_cfqq_slice_new(cfqq)) { | 1744 | if (timed_out) { |
1667 | cfqq->slice_resid = cfqq->slice_end - jiffies; | 1745 | if (cfq_cfqq_slice_new(cfqq)) |
1746 | cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
1747 | else | ||
1748 | cfqq->slice_resid = cfqq->slice_end - jiffies; | ||
1668 | cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); | 1749 | cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); |
1669 | } | 1750 | } |
1670 | 1751 | ||
@@ -1678,9 +1759,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1678 | if (cfqq == cfqd->active_queue) | 1759 | if (cfqq == cfqd->active_queue) |
1679 | cfqd->active_queue = NULL; | 1760 | cfqd->active_queue = NULL; |
1680 | 1761 | ||
1681 | if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) | ||
1682 | cfqd->grp_service_tree.active = NULL; | ||
1683 | |||
1684 | if (cfqd->active_cic) { | 1762 | if (cfqd->active_cic) { |
1685 | put_io_context(cfqd->active_cic->ioc); | 1763 | put_io_context(cfqd->active_cic->ioc); |
1686 | cfqd->active_cic = NULL; | 1764 | cfqd->active_cic = NULL; |
@@ -1892,10 +1970,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1892 | * in their service tree. | 1970 | * in their service tree. |
1893 | */ | 1971 | */ |
1894 | if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) | 1972 | if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) |
1895 | return 1; | 1973 | return true; |
1896 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", | 1974 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", |
1897 | service_tree->count); | 1975 | service_tree->count); |
1898 | return 0; | 1976 | return false; |
1899 | } | 1977 | } |
1900 | 1978 | ||
1901 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) | 1979 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) |
@@ -1946,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
1946 | */ | 2024 | */ |
1947 | if (sample_valid(cic->ttime_samples) && | 2025 | if (sample_valid(cic->ttime_samples) && |
1948 | (cfqq->slice_end - jiffies < cic->ttime_mean)) { | 2026 | (cfqq->slice_end - jiffies < cic->ttime_mean)) { |
1949 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", | 2027 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", |
1950 | cic->ttime_mean); | 2028 | cic->ttime_mean); |
1951 | return; | 2029 | return; |
1952 | } | 2030 | } |
1953 | 2031 | ||
@@ -2020,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2020 | 2098 | ||
2021 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); | 2099 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); |
2022 | 2100 | ||
2023 | return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); | 2101 | return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); |
2024 | } | 2102 | } |
2025 | 2103 | ||
2026 | /* | 2104 | /* |
@@ -2031,7 +2109,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) | |||
2031 | int process_refs, io_refs; | 2109 | int process_refs, io_refs; |
2032 | 2110 | ||
2033 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; | 2111 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; |
2034 | process_refs = atomic_read(&cfqq->ref) - io_refs; | 2112 | process_refs = cfqq->ref - io_refs; |
2035 | BUG_ON(process_refs < 0); | 2113 | BUG_ON(process_refs < 0); |
2036 | return process_refs; | 2114 | return process_refs; |
2037 | } | 2115 | } |
@@ -2071,10 +2149,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | |||
2071 | */ | 2149 | */ |
2072 | if (new_process_refs >= process_refs) { | 2150 | if (new_process_refs >= process_refs) { |
2073 | cfqq->new_cfqq = new_cfqq; | 2151 | cfqq->new_cfqq = new_cfqq; |
2074 | atomic_add(process_refs, &new_cfqq->ref); | 2152 | new_cfqq->ref += process_refs; |
2075 | } else { | 2153 | } else { |
2076 | new_cfqq->new_cfqq = cfqq; | 2154 | new_cfqq->new_cfqq = cfqq; |
2077 | atomic_add(new_process_refs, &cfqq->ref); | 2155 | cfqq->ref += new_process_refs; |
2078 | } | 2156 | } |
2079 | } | 2157 | } |
2080 | 2158 | ||
@@ -2107,12 +2185,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2107 | unsigned count; | 2185 | unsigned count; |
2108 | struct cfq_rb_root *st; | 2186 | struct cfq_rb_root *st; |
2109 | unsigned group_slice; | 2187 | unsigned group_slice; |
2110 | 2188 | enum wl_prio_t original_prio = cfqd->serving_prio; | |
2111 | if (!cfqg) { | ||
2112 | cfqd->serving_prio = IDLE_WORKLOAD; | ||
2113 | cfqd->workload_expires = jiffies + 1; | ||
2114 | return; | ||
2115 | } | ||
2116 | 2189 | ||
2117 | /* Choose next priority. RT > BE > IDLE */ | 2190 | /* Choose next priority. RT > BE > IDLE */ |
2118 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) | 2191 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) |
@@ -2125,6 +2198,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2125 | return; | 2198 | return; |
2126 | } | 2199 | } |
2127 | 2200 | ||
2201 | if (original_prio != cfqd->serving_prio) | ||
2202 | goto new_workload; | ||
2203 | |||
2128 | /* | 2204 | /* |
2129 | * For RT and BE, we have to choose also the type | 2205 | * For RT and BE, we have to choose also the type |
2130 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload | 2206 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload |
@@ -2139,6 +2215,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2139 | if (count && !time_after(jiffies, cfqd->workload_expires)) | 2215 | if (count && !time_after(jiffies, cfqd->workload_expires)) |
2140 | return; | 2216 | return; |
2141 | 2217 | ||
2218 | new_workload: | ||
2142 | /* otherwise select new workload type */ | 2219 | /* otherwise select new workload type */ |
2143 | cfqd->serving_type = | 2220 | cfqd->serving_type = |
2144 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); | 2221 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); |
@@ -2180,7 +2257,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2180 | slice = max_t(unsigned, slice, CFQ_MIN_TT); | 2257 | slice = max_t(unsigned, slice, CFQ_MIN_TT); |
2181 | cfq_log(cfqd, "workload slice:%d", slice); | 2258 | cfq_log(cfqd, "workload slice:%d", slice); |
2182 | cfqd->workload_expires = jiffies + slice; | 2259 | cfqd->workload_expires = jiffies + slice; |
2183 | cfqd->noidle_tree_requires_idle = false; | ||
2184 | } | 2260 | } |
2185 | 2261 | ||
2186 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | 2262 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) |
@@ -2191,7 +2267,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | |||
2191 | if (RB_EMPTY_ROOT(&st->rb)) | 2267 | if (RB_EMPTY_ROOT(&st->rb)) |
2192 | return NULL; | 2268 | return NULL; |
2193 | cfqg = cfq_rb_first_group(st); | 2269 | cfqg = cfq_rb_first_group(st); |
2194 | st->active = &cfqg->rb_node; | ||
2195 | update_min_vdisktime(st); | 2270 | update_min_vdisktime(st); |
2196 | return cfqg; | 2271 | return cfqg; |
2197 | } | 2272 | } |
@@ -2285,6 +2360,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) | |||
2285 | goto keep_queue; | 2360 | goto keep_queue; |
2286 | } | 2361 | } |
2287 | 2362 | ||
2363 | /* | ||
2364 | * This is a deep seek queue, but the device is much faster than | ||
2365 | * the queue can deliver, don't idle | ||
2366 | **/ | ||
2367 | if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && | ||
2368 | (cfq_cfqq_slice_new(cfqq) || | ||
2369 | (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { | ||
2370 | cfq_clear_cfqq_deep(cfqq); | ||
2371 | cfq_clear_cfqq_idle_window(cfqq); | ||
2372 | } | ||
2373 | |||
2288 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | 2374 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { |
2289 | cfqq = NULL; | 2375 | cfqq = NULL; |
2290 | goto keep_queue; | 2376 | goto keep_queue; |
@@ -2359,12 +2445,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, | |||
2359 | { | 2445 | { |
2360 | /* the queue hasn't finished any request, can't estimate */ | 2446 | /* the queue hasn't finished any request, can't estimate */ |
2361 | if (cfq_cfqq_slice_new(cfqq)) | 2447 | if (cfq_cfqq_slice_new(cfqq)) |
2362 | return 1; | 2448 | return true; |
2363 | if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, | 2449 | if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, |
2364 | cfqq->slice_end)) | 2450 | cfqq->slice_end)) |
2365 | return 1; | 2451 | return true; |
2366 | 2452 | ||
2367 | return 0; | 2453 | return false; |
2368 | } | 2454 | } |
2369 | 2455 | ||
2370 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 2456 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
@@ -2391,6 +2477,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2391 | * Does this cfqq already have too much IO in flight? | 2477 | * Does this cfqq already have too much IO in flight? |
2392 | */ | 2478 | */ |
2393 | if (cfqq->dispatched >= max_dispatch) { | 2479 | if (cfqq->dispatched >= max_dispatch) { |
2480 | bool promote_sync = false; | ||
2394 | /* | 2481 | /* |
2395 | * idle queue must always only have a single IO in flight | 2482 | * idle queue must always only have a single IO in flight |
2396 | */ | 2483 | */ |
@@ -2398,15 +2485,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2398 | return false; | 2485 | return false; |
2399 | 2486 | ||
2400 | /* | 2487 | /* |
2488 | * If there is only one sync queue | ||
2489 | * we can ignore async queue here and give the sync | ||
2490 | * queue no dispatch limit. The reason is a sync queue can | ||
2491 | * preempt async queue, limiting the sync queue doesn't make | ||
2492 | * sense. This is useful for aiostress test. | ||
2493 | */ | ||
2494 | if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) | ||
2495 | promote_sync = true; | ||
2496 | |||
2497 | /* | ||
2401 | * We have other queues, don't allow more IO from this one | 2498 | * We have other queues, don't allow more IO from this one |
2402 | */ | 2499 | */ |
2403 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) | 2500 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && |
2501 | !promote_sync) | ||
2404 | return false; | 2502 | return false; |
2405 | 2503 | ||
2406 | /* | 2504 | /* |
2407 | * Sole queue user, no limit | 2505 | * Sole queue user, no limit |
2408 | */ | 2506 | */ |
2409 | if (cfqd->busy_queues == 1) | 2507 | if (cfqd->busy_queues == 1 || promote_sync) |
2410 | max_dispatch = -1; | 2508 | max_dispatch = -1; |
2411 | else | 2509 | else |
2412 | /* | 2510 | /* |
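The new promote_sync handling above lifts the dispatch cap when the only busy sync queue reaches its limit: a sync queue can preempt async queues anyway, so capping it just wastes bandwidth (the comment cites the aiostress test). Below is a condensed decision helper with the same shape; the names and the precomputed slice_used_soon flag are illustrative, and the real function goes on to rescale max_dispatch rather than returning a plain yes/no.

/* Condensed, illustrative version of the dispatch-limit decision above. */
#include <stdbool.h>

static bool may_dispatch(bool is_sync, int busy_queues, int busy_sync_queues,
                         int dispatched, int max_dispatch,
                         bool slice_used_soon)
{
    bool promote_sync = false;

    if (dispatched < max_dispatch)
        return true;

    /* Sole sync queue: async queues cannot starve it, so drop the cap. */
    if (is_sync && busy_sync_queues == 1)
        promote_sync = true;

    /* Other queues want service and this one is nearly out of slice. */
    if (busy_queues > 1 && slice_used_soon && !promote_sync)
        return false;

    /* Sole queue user (or promoted sync queue): effectively no limit. */
    return busy_queues == 1 || promote_sync;
}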
@@ -2528,18 +2626,18 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) | |||
2528 | static void cfq_put_queue(struct cfq_queue *cfqq) | 2626 | static void cfq_put_queue(struct cfq_queue *cfqq) |
2529 | { | 2627 | { |
2530 | struct cfq_data *cfqd = cfqq->cfqd; | 2628 | struct cfq_data *cfqd = cfqq->cfqd; |
2531 | struct cfq_group *cfqg, *orig_cfqg; | 2629 | struct cfq_group *cfqg; |
2532 | 2630 | ||
2533 | BUG_ON(atomic_read(&cfqq->ref) <= 0); | 2631 | BUG_ON(cfqq->ref <= 0); |
2534 | 2632 | ||
2535 | if (!atomic_dec_and_test(&cfqq->ref)) | 2633 | cfqq->ref--; |
2634 | if (cfqq->ref) | ||
2536 | return; | 2635 | return; |
2537 | 2636 | ||
2538 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); | 2637 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); |
2539 | BUG_ON(rb_first(&cfqq->sort_list)); | 2638 | BUG_ON(rb_first(&cfqq->sort_list)); |
2540 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); | 2639 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); |
2541 | cfqg = cfqq->cfqg; | 2640 | cfqg = cfqq->cfqg; |
2542 | orig_cfqg = cfqq->orig_cfqg; | ||
2543 | 2641 | ||
2544 | if (unlikely(cfqd->active_queue == cfqq)) { | 2642 | if (unlikely(cfqd->active_queue == cfqq)) { |
2545 | __cfq_slice_expired(cfqd, cfqq, 0); | 2643 | __cfq_slice_expired(cfqd, cfqq, 0); |
@@ -2549,33 +2647,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2549 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 2647 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
2550 | kmem_cache_free(cfq_pool, cfqq); | 2648 | kmem_cache_free(cfq_pool, cfqq); |
2551 | cfq_put_cfqg(cfqg); | 2649 | cfq_put_cfqg(cfqg); |
2552 | if (orig_cfqg) | ||
2553 | cfq_put_cfqg(orig_cfqg); | ||
2554 | } | 2650 | } |
2555 | 2651 | ||
2556 | /* | 2652 | /* |
2557 | * Must always be called with the rcu_read_lock() held | 2653 | * Call func for each cic attached to this ioc. |
2558 | */ | 2654 | */ |
2559 | static void | 2655 | static void |
2560 | __call_for_each_cic(struct io_context *ioc, | 2656 | call_for_each_cic(struct io_context *ioc, |
2561 | void (*func)(struct io_context *, struct cfq_io_context *)) | 2657 | void (*func)(struct io_context *, struct cfq_io_context *)) |
2562 | { | 2658 | { |
2563 | struct cfq_io_context *cic; | 2659 | struct cfq_io_context *cic; |
2564 | struct hlist_node *n; | 2660 | struct hlist_node *n; |
2565 | 2661 | ||
2662 | rcu_read_lock(); | ||
2663 | |||
2566 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) | 2664 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) |
2567 | func(ioc, cic); | 2665 | func(ioc, cic); |
2568 | } | ||
2569 | 2666 | ||
2570 | /* | ||
2571 | * Call func for each cic attached to this ioc. | ||
2572 | */ | ||
2573 | static void | ||
2574 | call_for_each_cic(struct io_context *ioc, | ||
2575 | void (*func)(struct io_context *, struct cfq_io_context *)) | ||
2576 | { | ||
2577 | rcu_read_lock(); | ||
2578 | __call_for_each_cic(ioc, func); | ||
2579 | rcu_read_unlock(); | 2667 | rcu_read_unlock(); |
2580 | } | 2668 | } |
2581 | 2669 | ||
@@ -2636,7 +2724,7 @@ static void cfq_free_io_context(struct io_context *ioc) | |||
2636 | * should be ok to iterate over the known list, we will see all cic's | 2724 | * should be ok to iterate over the known list, we will see all cic's |
2637 | * since no new ones are added. | 2725 | * since no new ones are added. |
2638 | */ | 2726 | */ |
2639 | __call_for_each_cic(ioc, cic_free_func); | 2727 | call_for_each_cic(ioc, cic_free_func); |
2640 | } | 2728 | } |
2641 | 2729 | ||
2642 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | 2730 | static void cfq_put_cooperator(struct cfq_queue *cfqq) |
@@ -2685,8 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, | |||
2685 | smp_wmb(); | 2773 | smp_wmb(); |
2686 | cic->key = cfqd_dead_key(cfqd); | 2774 | cic->key = cfqd_dead_key(cfqd); |
2687 | 2775 | ||
2688 | if (ioc->ioc_data == cic) | 2776 | rcu_read_lock(); |
2777 | if (rcu_dereference(ioc->ioc_data) == cic) { | ||
2778 | rcu_read_unlock(); | ||
2779 | spin_lock(&ioc->lock); | ||
2689 | rcu_assign_pointer(ioc->ioc_data, NULL); | 2780 | rcu_assign_pointer(ioc->ioc_data, NULL); |
2781 | spin_unlock(&ioc->lock); | ||
2782 | } else | ||
2783 | rcu_read_unlock(); | ||
2690 | 2784 | ||
2691 | if (cic->cfqq[BLK_RW_ASYNC]) { | 2785 | if (cic->cfqq[BLK_RW_ASYNC]) { |
2692 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); | 2786 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); |
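The __cfq_exit_single_io_context() hunk above tightens how the ioc->ioc_data one-hit cache is invalidated: the pointer is compared under rcu_read_lock() and, only if it still caches this cic, cleared under ioc->lock. The user-space analogue below uses a reader-writer lock in place of RCU and adds a re-check under the owner lock; for a pure lookup cache that re-check is optional (clearing a fresher entry is only a missed optimization), which is why the kernel hunk can skip it. All names here are made up.

/* Check the cached pointer on the read side, clear it under the lock. */
#include <pthread.h>
#include <stddef.h>

struct entry { int dummy; };

static pthread_rwlock_t rcu_stand_in = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache;             /* one-hit lookup cache */

static void invalidate_cache(struct entry *e)
{
    pthread_rwlock_rdlock(&rcu_stand_in);
    if (cache != e) {
        /* Fast path: cache points elsewhere, nothing to do. */
        pthread_rwlock_unlock(&rcu_stand_in);
        return;
    }
    pthread_rwlock_unlock(&rcu_stand_in);

    /* Slow path: take the owner's lock before writing the pointer. */
    pthread_mutex_lock(&owner_lock);
    if (cache == e)                     /* re-check; it may have changed */
        cache = NULL;
    pthread_mutex_unlock(&owner_lock);
}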
@@ -2835,7 +2929,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
2835 | RB_CLEAR_NODE(&cfqq->p_node); | 2929 | RB_CLEAR_NODE(&cfqq->p_node); |
2836 | INIT_LIST_HEAD(&cfqq->fifo); | 2930 | INIT_LIST_HEAD(&cfqq->fifo); |
2837 | 2931 | ||
2838 | atomic_set(&cfqq->ref, 0); | 2932 | cfqq->ref = 0; |
2839 | cfqq->cfqd = cfqd; | 2933 | cfqq->cfqd = cfqd; |
2840 | 2934 | ||
2841 | cfq_mark_cfqq_prio_changed(cfqq); | 2935 | cfq_mark_cfqq_prio_changed(cfqq); |
@@ -2892,7 +2986,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, | |||
2892 | struct cfq_group *cfqg; | 2986 | struct cfq_group *cfqg; |
2893 | 2987 | ||
2894 | retry: | 2988 | retry: |
2895 | cfqg = cfq_get_cfqg(cfqd, 1); | 2989 | cfqg = cfq_get_cfqg(cfqd); |
2896 | cic = cfq_cic_lookup(cfqd, ioc); | 2990 | cic = cfq_cic_lookup(cfqd, ioc); |
2897 | /* cic always exists here */ | 2991 | /* cic always exists here */ |
2898 | cfqq = cic_to_cfqq(cic, is_sync); | 2992 | cfqq = cic_to_cfqq(cic, is_sync); |
@@ -2971,11 +3065,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, | |||
2971 | * pin the queue now that it's allocated, scheduler exit will prune it | 3065 | * pin the queue now that it's allocated, scheduler exit will prune it |
2972 | */ | 3066 | */ |
2973 | if (!is_sync && !(*async_cfqq)) { | 3067 | if (!is_sync && !(*async_cfqq)) { |
2974 | atomic_inc(&cfqq->ref); | 3068 | cfqq->ref++; |
2975 | *async_cfqq = cfqq; | 3069 | *async_cfqq = cfqq; |
2976 | } | 3070 | } |
2977 | 3071 | ||
2978 | atomic_inc(&cfqq->ref); | 3072 | cfqq->ref++; |
2979 | return cfqq; | 3073 | return cfqq; |
2980 | } | 3074 | } |
2981 | 3075 | ||
@@ -2993,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, | |||
2993 | 3087 | ||
2994 | spin_lock_irqsave(&ioc->lock, flags); | 3088 | spin_lock_irqsave(&ioc->lock, flags); |
2995 | 3089 | ||
2996 | BUG_ON(ioc->ioc_data == cic); | 3090 | BUG_ON(rcu_dereference_check(ioc->ioc_data, |
3091 | lockdep_is_held(&ioc->lock)) == cic); | ||
2997 | 3092 | ||
2998 | radix_tree_delete(&ioc->radix_root, cfqd->cic_index); | 3093 | radix_tree_delete(&ioc->radix_root, cfqd->cic_index); |
2999 | hlist_del_rcu(&cic->cic_list); | 3094 | hlist_del_rcu(&cic->cic_list); |
@@ -3177,7 +3272,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3177 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) | 3272 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) |
3178 | cfq_mark_cfqq_deep(cfqq); | 3273 | cfq_mark_cfqq_deep(cfqq); |
3179 | 3274 | ||
3180 | if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | 3275 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) |
3276 | enable_idle = 0; | ||
3277 | else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | ||
3181 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | 3278 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) |
3182 | enable_idle = 0; | 3279 | enable_idle = 0; |
3183 | else if (sample_valid(cic->ttime_samples)) { | 3280 | else if (sample_valid(cic->ttime_samples)) { |
@@ -3255,6 +3352,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3255 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) | 3352 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) |
3256 | return true; | 3353 | return true; |
3257 | 3354 | ||
3355 | /* An idle queue should not be idle now for some reason */ | ||
3356 | if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) | ||
3357 | return true; | ||
3358 | |||
3258 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) | 3359 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) |
3259 | return false; | 3360 | return false; |
3260 | 3361 | ||
@@ -3274,10 +3375,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3274 | */ | 3375 | */ |
3275 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 3376 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
3276 | { | 3377 | { |
3378 | struct cfq_queue *old_cfqq = cfqd->active_queue; | ||
3379 | |||
3277 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | 3380 | cfq_log_cfqq(cfqd, cfqq, "preempt"); |
3278 | cfq_slice_expired(cfqd, 1); | 3381 | cfq_slice_expired(cfqd, 1); |
3279 | 3382 | ||
3280 | /* | 3383 | /* |
3384 | * workload type is changed, don't save slice, otherwise preempt | ||
3385 | * doesn't happen | ||
3386 | */ | ||
3387 | if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) | ||
3388 | cfqq->cfqg->saved_workload_slice = 0; | ||
3389 | |||
3390 | /* | ||
3281 | * Put the new queue at the front of the current list, | 3391 | * Put the new queue at the front of the current list, |
3282 | * so we know that it will be selected next. | 3392 | * so we know that it will be selected next. |
3283 | */ | 3393 | */ |
@@ -3402,6 +3512,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
3402 | { | 3512 | { |
3403 | struct cfq_io_context *cic = cfqd->active_cic; | 3513 | struct cfq_io_context *cic = cfqd->active_cic; |
3404 | 3514 | ||
3515 | /* If the queue already has requests, don't wait */ | ||
3516 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3517 | return false; | ||
3518 | |||
3405 | /* If there are other queues in the group, don't wait */ | 3519 | /* If there are other queues in the group, don't wait */ |
3406 | if (cfqq->cfqg->nr_cfqq > 1) | 3520 | if (cfqq->cfqg->nr_cfqq > 1) |
3407 | return false; | 3521 | return false; |
@@ -3494,17 +3608,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3494 | cfq_slice_expired(cfqd, 1); | 3608 | cfq_slice_expired(cfqd, 1); |
3495 | else if (sync && cfqq_empty && | 3609 | else if (sync && cfqq_empty && |
3496 | !cfq_close_cooperator(cfqd, cfqq)) { | 3610 | !cfq_close_cooperator(cfqd, cfqq)) { |
3497 | cfqd->noidle_tree_requires_idle |= | 3611 | cfq_arm_slice_timer(cfqd); |
3498 | !(rq->cmd_flags & REQ_NOIDLE); | ||
3499 | /* | ||
3500 | * Idling is enabled for SYNC_WORKLOAD. | ||
3501 | * SYNC_NOIDLE_WORKLOAD idles at the end of the tree | ||
3502 | * only if we processed at least one !REQ_NOIDLE request | ||
3503 | */ | ||
3504 | if (cfqd->serving_type == SYNC_WORKLOAD | ||
3505 | || cfqd->noidle_tree_requires_idle | ||
3506 | || cfqq->cfqg->nr_cfqq == 1) | ||
3507 | cfq_arm_slice_timer(cfqd); | ||
3508 | } | 3612 | } |
3509 | } | 3613 | } |
3510 | 3614 | ||
@@ -3589,12 +3693,12 @@ static void cfq_put_request(struct request *rq) | |||
3589 | 3693 | ||
3590 | put_io_context(RQ_CIC(rq)->ioc); | 3694 | put_io_context(RQ_CIC(rq)->ioc); |
3591 | 3695 | ||
3592 | rq->elevator_private = NULL; | 3696 | rq->elevator_private[0] = NULL; |
3593 | rq->elevator_private2 = NULL; | 3697 | rq->elevator_private[1] = NULL; |
3594 | 3698 | ||
3595 | /* Put down rq reference on cfqg */ | 3699 | /* Put down rq reference on cfqg */ |
3596 | cfq_put_cfqg(RQ_CFQG(rq)); | 3700 | cfq_put_cfqg(RQ_CFQG(rq)); |
3597 | rq->elevator_private3 = NULL; | 3701 | rq->elevator_private[2] = NULL; |
3598 | 3702 | ||
3599 | cfq_put_queue(cfqq); | 3703 | cfq_put_queue(cfqq); |
3600 | } | 3704 | } |
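The cfq_put_request() hunk above is one piece of a broader change that folds rq->elevator_private, elevator_private2 and elevator_private3 into a three-slot elevator_private[] array (CFQ stores the cic, cfqq and cfqg there). A trivial before/after sketch with an illustrative struct, not the real struct request:

/* Folding three named per-request pointers into one indexed array. */
#include <stddef.h>

struct req_old {
    void *elevator_private;
    void *elevator_private2;
    void *elevator_private3;
};

struct req {
    void *elevator_private[3];  /* cic, cfqq, cfqg in CFQ's case */
};

static void req_clear_private(struct req *rq)
{
    for (int i = 0; i < 3; i++) /* one loop instead of three stores */
        rq->elevator_private[i] = NULL;
}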
@@ -3681,19 +3785,15 @@ new_queue: | |||
3681 | } | 3785 | } |
3682 | 3786 | ||
3683 | cfqq->allocated[rw]++; | 3787 | cfqq->allocated[rw]++; |
3684 | atomic_inc(&cfqq->ref); | ||
3685 | 3788 | ||
3789 | cfqq->ref++; | ||
3790 | rq->elevator_private[0] = cic; | ||
3791 | rq->elevator_private[1] = cfqq; | ||
3792 | rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3686 | spin_unlock_irqrestore(q->queue_lock, flags); | 3793 | spin_unlock_irqrestore(q->queue_lock, flags); |
3687 | |||
3688 | rq->elevator_private = cic; | ||
3689 | rq->elevator_private2 = cfqq; | ||
3690 | rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3691 | return 0; | 3794 | return 0; |
3692 | 3795 | ||
3693 | queue_fail: | 3796 | queue_fail: |
3694 | if (cic) | ||
3695 | put_io_context(cic->ioc); | ||
3696 | |||
3697 | cfq_schedule_dispatch(cfqd); | 3797 | cfq_schedule_dispatch(cfqd); |
3698 | spin_unlock_irqrestore(q->queue_lock, flags); | 3798 | spin_unlock_irqrestore(q->queue_lock, flags); |
3699 | cfq_log(cfqd, "set_request fail"); | 3799 | cfq_log(cfqd, "set_request fail"); |
@@ -3788,15 +3888,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) | |||
3788 | cfq_put_queue(cfqd->async_idle_cfqq); | 3888 | cfq_put_queue(cfqd->async_idle_cfqq); |
3789 | } | 3889 | } |
3790 | 3890 | ||
3791 | static void cfq_cfqd_free(struct rcu_head *head) | ||
3792 | { | ||
3793 | kfree(container_of(head, struct cfq_data, rcu)); | ||
3794 | } | ||
3795 | |||
3796 | static void cfq_exit_queue(struct elevator_queue *e) | 3891 | static void cfq_exit_queue(struct elevator_queue *e) |
3797 | { | 3892 | { |
3798 | struct cfq_data *cfqd = e->elevator_data; | 3893 | struct cfq_data *cfqd = e->elevator_data; |
3799 | struct request_queue *q = cfqd->queue; | 3894 | struct request_queue *q = cfqd->queue; |
3895 | bool wait = false; | ||
3800 | 3896 | ||
3801 | cfq_shutdown_timer_wq(cfqd); | 3897 | cfq_shutdown_timer_wq(cfqd); |
3802 | 3898 | ||
@@ -3815,7 +3911,13 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3815 | 3911 | ||
3816 | cfq_put_async_queues(cfqd); | 3912 | cfq_put_async_queues(cfqd); |
3817 | cfq_release_cfq_groups(cfqd); | 3913 | cfq_release_cfq_groups(cfqd); |
3818 | cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); | 3914 | |
3915 | /* | ||
3916 | * If there are groups which we could not unlink from blkcg list, | ||
3917 | * wait for a rcu period for them to be freed. | ||
3918 | */ | ||
3919 | if (cfqd->nr_blkcg_linked_grps) | ||
3920 | wait = true; | ||
3819 | 3921 | ||
3820 | spin_unlock_irq(q->queue_lock); | 3922 | spin_unlock_irq(q->queue_lock); |
3821 | 3923 | ||
@@ -3825,8 +3927,25 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3825 | ida_remove(&cic_index_ida, cfqd->cic_index); | 3927 | ida_remove(&cic_index_ida, cfqd->cic_index); |
3826 | spin_unlock(&cic_index_lock); | 3928 | spin_unlock(&cic_index_lock); |
3827 | 3929 | ||
3828 | /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ | 3930 | /* |
3829 | call_rcu(&cfqd->rcu, cfq_cfqd_free); | 3931 | * Wait for cfqg->blkg->key accessors to exit their grace periods. |
3932 | * Do this wait only if there are other unlinked groups out | ||
3933 | * there. This can happen if cgroup deletion path claimed the | ||
3934 | * responsibility of cleaning up a group before queue cleanup code | ||
3935 | * get to the group. | ||
3936 | * | ||
3937 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
3938 | * which create/delete request queue hundreds of times during scan/boot | ||
3939 | * and synchronize_rcu() can take significant time and slow down boot. | ||
3940 | */ | ||
3941 | if (wait) | ||
3942 | synchronize_rcu(); | ||
3943 | |||
3944 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3945 | /* Free up per cpu stats for root group */ | ||
3946 | free_percpu(cfqd->root_group.blkg.stats_cpu); | ||
3947 | #endif | ||
3948 | kfree(cfqd); | ||
3830 | } | 3949 | } |
3831 | 3950 | ||
3832 | static int cfq_alloc_cic_index(void) | 3951 | static int cfq_alloc_cic_index(void) |
@@ -3859,9 +3978,17 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3859 | return NULL; | 3978 | return NULL; |
3860 | 3979 | ||
3861 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); | 3980 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
3862 | if (!cfqd) | 3981 | if (!cfqd) { |
3982 | spin_lock(&cic_index_lock); | ||
3983 | ida_remove(&cic_index_ida, i); | ||
3984 | spin_unlock(&cic_index_lock); | ||
3863 | return NULL; | 3985 | return NULL; |
3986 | } | ||
3864 | 3987 | ||
3988 | /* | ||
3989 | * Don't need take queue_lock in the routine, since we are | ||
3990 | * initializing the ioscheduler, and nobody is using cfqd | ||
3991 | */ | ||
3865 | cfqd->cic_index = i; | 3992 | cfqd->cic_index = i; |
3866 | 3993 | ||
3867 | /* Init root service tree */ | 3994 | /* Init root service tree */ |
@@ -3878,14 +4005,29 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3878 | 4005 | ||
3879 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4006 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3880 | /* | 4007 | /* |
3881 | * Take a reference to root group which we never drop. This is just | 4008 | * Set root group reference to 2. One reference will be dropped when |
3882 | * to make sure that cfq_put_cfqg() does not try to kfree root group | 4009 | * all groups on cfqd->cfqg_list are being deleted during queue exit. |
4010 | * Other reference will remain there as we don't want to delete this | ||
4011 | * group as it is statically allocated and gets destroyed when | ||
4012 | * throtl_data goes away. | ||
3883 | */ | 4013 | */ |
3884 | atomic_set(&cfqg->ref, 1); | 4014 | cfqg->ref = 2; |
4015 | |||
4016 | if (blkio_alloc_blkg_stats(&cfqg->blkg)) { | ||
4017 | kfree(cfqg); | ||
4018 | kfree(cfqd); | ||
4019 | return NULL; | ||
4020 | } | ||
4021 | |||
3885 | rcu_read_lock(); | 4022 | rcu_read_lock(); |
4023 | |||
3886 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, | 4024 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, |
3887 | (void *)cfqd, 0); | 4025 | (void *)cfqd, 0); |
3888 | rcu_read_unlock(); | 4026 | rcu_read_unlock(); |
4027 | cfqd->nr_blkcg_linked_grps++; | ||
4028 | |||
4029 | /* Add group on cfqd->cfqg_list */ | ||
4030 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
3889 | #endif | 4031 | #endif |
3890 | /* | 4032 | /* |
3891 | * Not strictly needed (since RB_ROOT just clears the node and we | 4033 | * Not strictly needed (since RB_ROOT just clears the node and we |
@@ -3901,7 +4043,7 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3901 | * will not attempt to free it. | 4043 | * will not attempt to free it. |
3902 | */ | 4044 | */ |
3903 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | 4045 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); |
3904 | atomic_inc(&cfqd->oom_cfqq.ref); | 4046 | cfqd->oom_cfqq.ref++; |
3905 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); | 4047 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); |
3906 | 4048 | ||
3907 | INIT_LIST_HEAD(&cfqd->cic_list); | 4049 | INIT_LIST_HEAD(&cfqd->cic_list); |
@@ -3925,7 +4067,6 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3925 | cfqd->cfq_slice_idle = cfq_slice_idle; | 4067 | cfqd->cfq_slice_idle = cfq_slice_idle; |
3926 | cfqd->cfq_group_idle = cfq_group_idle; | 4068 | cfqd->cfq_group_idle = cfq_group_idle; |
3927 | cfqd->cfq_latency = 1; | 4069 | cfqd->cfq_latency = 1; |
3928 | cfqd->cfq_group_isolation = 0; | ||
3929 | cfqd->hw_tag = -1; | 4070 | cfqd->hw_tag = -1; |
3930 | /* | 4071 | /* |
3931 | * we optimistically start assuming sync ops weren't delayed in last | 4072 | * we optimistically start assuming sync ops weren't delayed in last |
@@ -4001,7 +4142,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | |||
4001 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | 4142 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); |
4002 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | 4143 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); |
4003 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | 4144 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); |
4004 | SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); | ||
4005 | #undef SHOW_FUNCTION | 4145 | #undef SHOW_FUNCTION |
4006 | 4146 | ||
4007 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | 4147 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
@@ -4035,7 +4175,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | |||
4035 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | 4175 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, |
4036 | UINT_MAX, 0); | 4176 | UINT_MAX, 0); |
4037 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | 4177 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); |
4038 | STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); | ||
4039 | #undef STORE_FUNCTION | 4178 | #undef STORE_FUNCTION |
4040 | 4179 | ||
4041 | #define CFQ_ATTR(name) \ | 4180 | #define CFQ_ATTR(name) \ |
@@ -4053,7 +4192,6 @@ static struct elv_fs_entry cfq_attrs[] = { | |||
4053 | CFQ_ATTR(slice_idle), | 4192 | CFQ_ATTR(slice_idle), |
4054 | CFQ_ATTR(group_idle), | 4193 | CFQ_ATTR(group_idle), |
4055 | CFQ_ATTR(low_latency), | 4194 | CFQ_ATTR(low_latency), |
4056 | CFQ_ATTR(group_isolation), | ||
4057 | __ATTR_NULL | 4195 | __ATTR_NULL |
4058 | }; | 4196 | }; |
4059 | 4197 | ||
@@ -4068,7 +4206,6 @@ static struct elevator_type iosched_cfq = { | |||
4068 | .elevator_add_req_fn = cfq_insert_request, | 4206 | .elevator_add_req_fn = cfq_insert_request, |
4069 | .elevator_activate_req_fn = cfq_activate_request, | 4207 | .elevator_activate_req_fn = cfq_activate_request, |
4070 | .elevator_deactivate_req_fn = cfq_deactivate_request, | 4208 | .elevator_deactivate_req_fn = cfq_deactivate_request, |
4071 | .elevator_queue_empty_fn = cfq_queue_empty, | ||
4072 | .elevator_completed_req_fn = cfq_completed_request, | 4209 | .elevator_completed_req_fn = cfq_completed_request, |
4073 | .elevator_former_req_fn = elv_rb_former_request, | 4210 | .elevator_former_req_fn = elv_rb_former_request, |
4074 | .elevator_latter_req_fn = elv_rb_latter_request, | 4211 | .elevator_latter_req_fn = elv_rb_latter_request, |
@@ -4090,6 +4227,7 @@ static struct blkio_policy_type blkio_policy_cfq = { | |||
4090 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, | 4227 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, |
4091 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, | 4228 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, |
4092 | }, | 4229 | }, |
4230 | .plid = BLKIO_POLICY_PROP, | ||
4093 | }; | 4231 | }; |
4094 | #else | 4232 | #else |
4095 | static struct blkio_policy_type blkio_policy_cfq; | 4233 | static struct blkio_policy_type blkio_policy_cfq; |
diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..2a155927e37c 100644 --- a/block/cfq.h +++ b/block/cfq.h | |||
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
16 | } | 16 | } |
17 | 17 | ||
18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
19 | unsigned long time) | 19 | unsigned long time, unsigned long unaccounted_time) |
20 | { | 20 | { |
21 | blkiocg_update_timeslice_used(blkg, time); | 21 | blkiocg_update_timeslice_used(blkg, time, unaccounted_time); |
22 | } | 22 | } |
23 | 23 | ||
24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) | 24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) |
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, | |||
69 | 69 | ||
70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
71 | struct blkio_group *blkg, void *key, dev_t dev) { | 71 | struct blkio_group *blkg, void *key, dev_t dev) { |
72 | blkiocg_add_blkio_group(blkcg, blkg, key, dev); | 72 | blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); |
73 | } | 73 | } |
74 | 74 | ||
75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) | 75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) |
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
85 | unsigned long dequeue) {} | 85 | unsigned long dequeue) {} |
86 | 86 | ||
87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
88 | unsigned long time) {} | 88 | unsigned long time, unsigned long unaccounted_time) {} |
89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | 89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} |
90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, | 90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, |
91 | bool direction, bool sync) {} | 91 | bool direction, bool sync) {} |
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b74dc0..cc3eb78e333a 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/hdreg.h> | 8 | #include <linux/hdreg.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/types.h> | 11 | #include <linux/types.h> |
13 | #include <linux/uaccess.h> | 12 | #include <linux/uaccess.h> |
14 | 13 | ||
@@ -744,13 +743,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
744 | bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; | 743 | bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; |
745 | return 0; | 744 | return 0; |
746 | case BLKGETSIZE: | 745 | case BLKGETSIZE: |
747 | size = bdev->bd_inode->i_size; | 746 | size = i_size_read(bdev->bd_inode); |
748 | if ((size >> 9) > ~0UL) | 747 | if ((size >> 9) > ~0UL) |
749 | return -EFBIG; | 748 | return -EFBIG; |
750 | return compat_put_ulong(arg, size >> 9); | 749 | return compat_put_ulong(arg, size >> 9); |
751 | 750 | ||
752 | case BLKGETSIZE64_32: | 751 | case BLKGETSIZE64_32: |
753 | return compat_put_u64(arg, bdev->bd_inode->i_size); | 752 | return compat_put_u64(arg, i_size_read(bdev->bd_inode)); |
754 | 753 | ||
755 | case BLKTRACESETUP32: | 754 | case BLKTRACESETUP32: |
756 | case BLKTRACESTART: /* compatible */ | 755 | case BLKTRACESTART: /* compatible */ |
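The compat_ioctl.c hunk above replaces direct reads of bdev->bd_inode->i_size with i_size_read(), which on 32-bit SMP kernels reads the size under a sequence counter so a concurrent i_size_write() can never be observed half-done. As a rough user-space analogue of the underlying concern (torn 64-bit loads), the C11 sketch below just makes the size an atomic 64-bit object; the struct and helpers are invented for illustration, and on targets without native 64-bit atomics the compiler falls back to a lock, roughly the cost the kernel's seqcount approach avoids.

/* Tear-free 64-bit size reads via C11 atomics (user-space sketch). */
#include <stdatomic.h>
#include <stdint.h>

struct sized_object {
    _Atomic uint64_t size;      /* e.g. a device or inode size in bytes */
};

static uint64_t object_size_read(struct sized_object *o)
{
    /* Atomic load: never observes a half-updated 64-bit value. */
    return atomic_load_explicit(&o->size, memory_order_relaxed);
}

static void object_size_write(struct sized_object *o, uint64_t size)
{
    atomic_store_explicit(&o->size, size, memory_order_relaxed);
}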
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbca7b23..5139c0ea1864 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -326,14 +326,6 @@ dispatch_request: | |||
326 | return 1; | 326 | return 1; |
327 | } | 327 | } |
328 | 328 | ||
329 | static int deadline_queue_empty(struct request_queue *q) | ||
330 | { | ||
331 | struct deadline_data *dd = q->elevator->elevator_data; | ||
332 | |||
333 | return list_empty(&dd->fifo_list[WRITE]) | ||
334 | && list_empty(&dd->fifo_list[READ]); | ||
335 | } | ||
336 | |||
337 | static void deadline_exit_queue(struct elevator_queue *e) | 329 | static void deadline_exit_queue(struct elevator_queue *e) |
338 | { | 330 | { |
339 | struct deadline_data *dd = e->elevator_data; | 331 | struct deadline_data *dd = e->elevator_data; |
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = { | |||
445 | .elevator_merge_req_fn = deadline_merged_requests, | 437 | .elevator_merge_req_fn = deadline_merged_requests, |
446 | .elevator_dispatch_fn = deadline_dispatch_requests, | 438 | .elevator_dispatch_fn = deadline_dispatch_requests, |
447 | .elevator_add_req_fn = deadline_add_request, | 439 | .elevator_add_req_fn = deadline_add_request, |
448 | .elevator_queue_empty_fn = deadline_queue_empty, | ||
449 | .elevator_former_req_fn = elv_rb_former_request, | 440 | .elevator_former_req_fn = elv_rb_former_request, |
450 | .elevator_latter_req_fn = elv_rb_latter_request, | 441 | .elevator_latter_req_fn = elv_rb_latter_request, |
451 | .elevator_init_fn = deadline_init_queue, | 442 | .elevator_init_fn = deadline_init_queue, |
diff --git a/block/elevator.c b/block/elevator.c index 4e11559aa2b0..b0b38ce0dcb6 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) | |||
113 | } | 113 | } |
114 | EXPORT_SYMBOL(elv_rq_merge_ok); | 114 | EXPORT_SYMBOL(elv_rq_merge_ok); |
115 | 115 | ||
116 | static inline int elv_try_merge(struct request *__rq, struct bio *bio) | 116 | int elv_try_merge(struct request *__rq, struct bio *bio) |
117 | { | 117 | { |
118 | int ret = ELEVATOR_NO_MERGE; | 118 | int ret = ELEVATOR_NO_MERGE; |
119 | 119 | ||
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name) | |||
155 | 155 | ||
156 | e = elevator_find(name); | 156 | e = elevator_find(name); |
157 | if (!e) { | 157 | if (!e) { |
158 | char elv[ELV_NAME_MAX + strlen("-iosched")]; | ||
159 | |||
160 | spin_unlock(&elv_list_lock); | 158 | spin_unlock(&elv_list_lock); |
161 | 159 | request_module("%s-iosched", name); | |
162 | snprintf(elv, sizeof(elv), "%s-iosched", name); | ||
163 | |||
164 | request_module("%s", elv); | ||
165 | spin_lock(&elv_list_lock); | 160 | spin_lock(&elv_list_lock); |
166 | e = elevator_find(name); | 161 | e = elevator_find(name); |
167 | } | 162 | } |
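The elevator_get() hunk above drops the on-stack name buffer, whose size depended on strlen(), and simply forwards the printf-style format to request_module(). The sketch below shows the same simplification with an ordinary variadic helper; load_module() and lookup_scheduler() are illustrative stand-ins, not kmod interfaces.

/* Let the callee do the formatting instead of pre-building the name. */
#include <stdarg.h>
#include <stdio.h>

static int load_module(const char *fmt, ...)
{
    char name[64];
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(name, sizeof(name), fmt, ap);
    va_end(ap);

    return printf("would load module '%s'\n", name) < 0 ? -1 : 0;
}

static void lookup_scheduler(const char *name)
{
    /*
     * Old style: build "name-iosched" into a local, possibly
     * variable-length, buffer and pass the result along.
     * New style: forward the format string and argument directly.
     */
    load_module("%s-iosched", name);
}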
@@ -429,7 +424,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) | |||
429 | q->nr_sorted--; | 424 | q->nr_sorted--; |
430 | 425 | ||
431 | boundary = q->end_sector; | 426 | boundary = q->end_sector; |
432 | stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; | 427 | stop_flags = REQ_SOFTBARRIER | REQ_STARTED; |
433 | list_for_each_prev(entry, &q->queue_head) { | 428 | list_for_each_prev(entry, &q->queue_head) { |
434 | struct request *pos = list_entry_rq(entry); | 429 | struct request *pos = list_entry_rq(entry); |
435 | 430 | ||
@@ -519,6 +514,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) | |||
519 | return ELEVATOR_NO_MERGE; | 514 | return ELEVATOR_NO_MERGE; |
520 | } | 515 | } |
521 | 516 | ||
517 | /* | ||
518 | * Attempt to do an insertion back merge. Only check for the case where | ||
519 | * we can append 'rq' to an existing request, so we can throw 'rq' away | ||
520 | * afterwards. | ||
521 | * | ||
522 | * Returns true if we merged, false otherwise | ||
523 | */ | ||
524 | static bool elv_attempt_insert_merge(struct request_queue *q, | ||
525 | struct request *rq) | ||
526 | { | ||
527 | struct request *__rq; | ||
528 | |||
529 | if (blk_queue_nomerges(q)) | ||
530 | return false; | ||
531 | |||
532 | /* | ||
533 | * First try one-hit cache. | ||
534 | */ | ||
535 | if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) | ||
536 | return true; | ||
537 | |||
538 | if (blk_queue_noxmerges(q)) | ||
539 | return false; | ||
540 | |||
541 | /* | ||
542 | * See if our hash lookup can find a potential backmerge. | ||
543 | */ | ||
544 | __rq = elv_rqhash_find(q, blk_rq_pos(rq)); | ||
545 | if (__rq && blk_attempt_req_merge(q, __rq, rq)) | ||
546 | return true; | ||
547 | |||
548 | return false; | ||
549 | } | ||
550 | |||
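elv_attempt_insert_merge(), added above, tries a back merge in two steps: the q->last_merge one-hit cache first, then the request position hash via elv_rqhash_find(), bailing out early when merging is disabled for the queue. A stripped-down sketch of that two-step lookup on toy types follows; everything here is a stand-in, not the block-layer API.

/* Two-step back-merge attempt: one-hit cache, then hash lookup. */
#include <stdbool.h>
#include <stddef.h>

struct toy_req {
    unsigned long pos;  /* start sector */
    unsigned long len;  /* length in sectors */
};

struct toy_queue {
    struct toy_req *last_merge;         /* one-hit cache */
    bool nomerges;
    struct toy_req *(*hash_find)(struct toy_queue *q, unsigned long pos);
};

/* Append 'rq' to '__rq' if they are physically contiguous. */
static bool try_append(struct toy_req *__rq, struct toy_req *rq)
{
    if (__rq->pos + __rq->len != rq->pos)
        return false;
    __rq->len += rq->len;
    return true;
}

static bool attempt_insert_merge(struct toy_queue *q, struct toy_req *rq)
{
    struct toy_req *__rq;

    if (q->nomerges)
        return false;

    /* First try the one-hit cache. */
    if (q->last_merge && try_append(q->last_merge, rq))
        return true;

    /* Then see if the hash can find a potential back merge. */
    __rq = q->hash_find ? q->hash_find(q, rq->pos) : NULL;
    if (__rq && try_append(__rq, rq))
        return true;

    return false;
}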
522 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) | 551 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) |
523 | { | 552 | { |
524 | struct elevator_queue *e = q->elevator; | 553 | struct elevator_queue *e = q->elevator; |
@@ -536,14 +565,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
536 | struct request *next) | 565 | struct request *next) |
537 | { | 566 | { |
538 | struct elevator_queue *e = q->elevator; | 567 | struct elevator_queue *e = q->elevator; |
568 | const int next_sorted = next->cmd_flags & REQ_SORTED; | ||
539 | 569 | ||
540 | if (e->ops->elevator_merge_req_fn) | 570 | if (next_sorted && e->ops->elevator_merge_req_fn) |
541 | e->ops->elevator_merge_req_fn(q, rq, next); | 571 | e->ops->elevator_merge_req_fn(q, rq, next); |
542 | 572 | ||
543 | elv_rqhash_reposition(q, rq); | 573 | elv_rqhash_reposition(q, rq); |
544 | elv_rqhash_del(q, next); | ||
545 | 574 | ||
546 | q->nr_sorted--; | 575 | if (next_sorted) { |
576 | elv_rqhash_del(q, next); | ||
577 | q->nr_sorted--; | ||
578 | } | ||
579 | |||
547 | q->last_merge = rq; | 580 | q->last_merge = rq; |
548 | } | 581 | } |
549 | 582 | ||
@@ -570,7 +603,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) | |||
570 | 603 | ||
571 | rq->cmd_flags &= ~REQ_STARTED; | 604 | rq->cmd_flags &= ~REQ_STARTED; |
572 | 605 | ||
573 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); | 606 | __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); |
574 | } | 607 | } |
575 | 608 | ||
576 | void elv_drain_elevator(struct request_queue *q) | 609 | void elv_drain_elevator(struct request_queue *q) |
@@ -615,20 +648,28 @@ void elv_quiesce_end(struct request_queue *q) | |||
615 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | 648 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); |
616 | } | 649 | } |
617 | 650 | ||
618 | void elv_insert(struct request_queue *q, struct request *rq, int where) | 651 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
619 | { | 652 | { |
620 | struct list_head *pos; | ||
621 | unsigned ordseq; | ||
622 | int unplug_it = 1; | ||
623 | |||
624 | trace_block_rq_insert(q, rq); | 653 | trace_block_rq_insert(q, rq); |
625 | 654 | ||
626 | rq->q = q; | 655 | rq->q = q; |
627 | 656 | ||
657 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | ||
658 | /* barriers are scheduling boundary, update end_sector */ | ||
659 | if (rq->cmd_type == REQ_TYPE_FS || | ||
660 | (rq->cmd_flags & REQ_DISCARD)) { | ||
661 | q->end_sector = rq_end_sector(rq); | ||
662 | q->boundary_rq = rq; | ||
663 | } | ||
664 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
665 | (where == ELEVATOR_INSERT_SORT || | ||
666 | where == ELEVATOR_INSERT_SORT_MERGE)) | ||
667 | where = ELEVATOR_INSERT_BACK; | ||
668 | |||
628 | switch (where) { | 669 | switch (where) { |
670 | case ELEVATOR_INSERT_REQUEUE: | ||
629 | case ELEVATOR_INSERT_FRONT: | 671 | case ELEVATOR_INSERT_FRONT: |
630 | rq->cmd_flags |= REQ_SOFTBARRIER; | 672 | rq->cmd_flags |= REQ_SOFTBARRIER; |
631 | |||
632 | list_add(&rq->queuelist, &q->queue_head); | 673 | list_add(&rq->queuelist, &q->queue_head); |
633 | break; | 674 | break; |
634 | 675 | ||
@@ -649,6 +690,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
649 | __blk_run_queue(q); | 690 | __blk_run_queue(q); |
650 | break; | 691 | break; |
651 | 692 | ||
693 | case ELEVATOR_INSERT_SORT_MERGE: | ||
694 | /* | ||
695 | * If we succeed in merging this request with one in the | ||
696 | * queue already, we are done - rq has now been freed, | ||
697 | * so no need to do anything further. | ||
698 | */ | ||
699 | if (elv_attempt_insert_merge(q, rq)) | ||
700 | break; | ||
652 | case ELEVATOR_INSERT_SORT: | 701 | case ELEVATOR_INSERT_SORT: |
653 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && | 702 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && |
654 | !(rq->cmd_flags & REQ_DISCARD)); | 703 | !(rq->cmd_flags & REQ_DISCARD)); |
@@ -668,115 +717,28 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
668 | q->elevator->ops->elevator_add_req_fn(q, rq); | 717 | q->elevator->ops->elevator_add_req_fn(q, rq); |
669 | break; | 718 | break; |
670 | 719 | ||
671 | case ELEVATOR_INSERT_REQUEUE: | 720 | case ELEVATOR_INSERT_FLUSH: |
672 | /* | ||
673 | * If ordered flush isn't in progress, we do front | ||
674 | * insertion; otherwise, requests should be requeued | ||
675 | * in ordseq order. | ||
676 | */ | ||
677 | rq->cmd_flags |= REQ_SOFTBARRIER; | 721 | rq->cmd_flags |= REQ_SOFTBARRIER; |
678 | 722 | blk_insert_flush(rq); | |
679 | /* | ||
680 | * Most requeues happen because of a busy condition, | ||
681 | * don't force unplug of the queue for that case. | ||
682 | */ | ||
683 | unplug_it = 0; | ||
684 | |||
685 | if (q->ordseq == 0) { | ||
686 | list_add(&rq->queuelist, &q->queue_head); | ||
687 | break; | ||
688 | } | ||
689 | |||
690 | ordseq = blk_ordered_req_seq(rq); | ||
691 | |||
692 | list_for_each(pos, &q->queue_head) { | ||
693 | struct request *pos_rq = list_entry_rq(pos); | ||
694 | if (ordseq <= blk_ordered_req_seq(pos_rq)) | ||
695 | break; | ||
696 | } | ||
697 | |||
698 | list_add_tail(&rq->queuelist, pos); | ||
699 | break; | 723 | break; |
700 | |||
701 | default: | 724 | default: |
702 | printk(KERN_ERR "%s: bad insertion point %d\n", | 725 | printk(KERN_ERR "%s: bad insertion point %d\n", |
703 | __func__, where); | 726 | __func__, where); |
704 | BUG(); | 727 | BUG(); |
705 | } | 728 | } |
706 | |||
707 | if (unplug_it && blk_queue_plugged(q)) { | ||
708 | int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] | ||
709 | - queue_in_flight(q); | ||
710 | |||
711 | if (nrq >= q->unplug_thresh) | ||
712 | __generic_unplug_device(q); | ||
713 | } | ||
714 | } | ||
715 | |||
716 | void __elv_add_request(struct request_queue *q, struct request *rq, int where, | ||
717 | int plug) | ||
718 | { | ||
719 | if (q->ordcolor) | ||
720 | rq->cmd_flags |= REQ_ORDERED_COLOR; | ||
721 | |||
722 | if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { | ||
723 | /* | ||
724 | * toggle ordered color | ||
725 | */ | ||
726 | if (rq->cmd_flags & REQ_HARDBARRIER) | ||
727 | q->ordcolor ^= 1; | ||
728 | |||
729 | /* | ||
730 | * barriers implicitly indicate back insertion | ||
731 | */ | ||
732 | if (where == ELEVATOR_INSERT_SORT) | ||
733 | where = ELEVATOR_INSERT_BACK; | ||
734 | |||
735 | /* | ||
736 | * this request is scheduling boundary, update | ||
737 | * end_sector | ||
738 | */ | ||
739 | if (rq->cmd_type == REQ_TYPE_FS || | ||
740 | (rq->cmd_flags & REQ_DISCARD)) { | ||
741 | q->end_sector = rq_end_sector(rq); | ||
742 | q->boundary_rq = rq; | ||
743 | } | ||
744 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
745 | where == ELEVATOR_INSERT_SORT) | ||
746 | where = ELEVATOR_INSERT_BACK; | ||
747 | |||
748 | if (plug) | ||
749 | blk_plug_device(q); | ||
750 | |||
751 | elv_insert(q, rq, where); | ||
752 | } | 729 | } |
753 | EXPORT_SYMBOL(__elv_add_request); | 730 | EXPORT_SYMBOL(__elv_add_request); |
754 | 731 | ||
755 | void elv_add_request(struct request_queue *q, struct request *rq, int where, | 732 | void elv_add_request(struct request_queue *q, struct request *rq, int where) |
756 | int plug) | ||
757 | { | 733 | { |
758 | unsigned long flags; | 734 | unsigned long flags; |
759 | 735 | ||
760 | spin_lock_irqsave(q->queue_lock, flags); | 736 | spin_lock_irqsave(q->queue_lock, flags); |
761 | __elv_add_request(q, rq, where, plug); | 737 | __elv_add_request(q, rq, where); |
762 | spin_unlock_irqrestore(q->queue_lock, flags); | 738 | spin_unlock_irqrestore(q->queue_lock, flags); |
763 | } | 739 | } |
764 | EXPORT_SYMBOL(elv_add_request); | 740 | EXPORT_SYMBOL(elv_add_request); |
765 | 741 | ||
766 | int elv_queue_empty(struct request_queue *q) | ||
767 | { | ||
768 | struct elevator_queue *e = q->elevator; | ||
769 | |||
770 | if (!list_empty(&q->queue_head)) | ||
771 | return 0; | ||
772 | |||
773 | if (e->ops->elevator_queue_empty_fn) | ||
774 | return e->ops->elevator_queue_empty_fn(q); | ||
775 | |||
776 | return 1; | ||
777 | } | ||
778 | EXPORT_SYMBOL(elv_queue_empty); | ||
779 | |||
780 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) | 742 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) |
781 | { | 743 | { |
782 | struct elevator_queue *e = q->elevator; | 744 | struct elevator_queue *e = q->elevator; |
@@ -802,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) | |||
802 | if (e->ops->elevator_set_req_fn) | 764 | if (e->ops->elevator_set_req_fn) |
803 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); | 765 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); |
804 | 766 | ||
805 | rq->elevator_private = NULL; | 767 | rq->elevator_private[0] = NULL; |
806 | return 0; | 768 | return 0; |
807 | } | 769 | } |
808 | 770 | ||
@@ -828,6 +790,8 @@ void elv_abort_queue(struct request_queue *q) | |||
828 | { | 790 | { |
829 | struct request *rq; | 791 | struct request *rq; |
830 | 792 | ||
793 | blk_abort_flushes(q); | ||
794 | |||
831 | while (!list_empty(&q->queue_head)) { | 795 | while (!list_empty(&q->queue_head)) { |
832 | rq = list_entry_rq(q->queue_head.next); | 796 | rq = list_entry_rq(q->queue_head.next); |
833 | rq->cmd_flags |= REQ_QUIET; | 797 | rq->cmd_flags |= REQ_QUIET; |
@@ -855,24 +819,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq) | |||
855 | e->ops->elevator_completed_req_fn) | 819 | e->ops->elevator_completed_req_fn) |
856 | e->ops->elevator_completed_req_fn(q, rq); | 820 | e->ops->elevator_completed_req_fn(q, rq); |
857 | } | 821 | } |
858 | |||
859 | /* | ||
860 | * Check if the queue is waiting for fs requests to be | ||
861 | * drained for flush sequence. | ||
862 | */ | ||
863 | if (unlikely(q->ordseq)) { | ||
864 | struct request *next = NULL; | ||
865 | |||
866 | if (!list_empty(&q->queue_head)) | ||
867 | next = list_entry_rq(q->queue_head.next); | ||
868 | |||
869 | if (!queue_in_flight(q) && | ||
870 | blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && | ||
871 | (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { | ||
872 | blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); | ||
873 | __blk_run_queue(q); | ||
874 | } | ||
875 | } | ||
876 | } | 822 | } |
877 | 823 | ||
878 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) | 824 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) |
diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..3608289c8ecd 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -18,13 +18,12 @@ | |||
18 | #include <linux/buffer_head.h> | 18 | #include <linux/buffer_head.h> |
19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/log2.h> | ||
21 | 22 | ||
22 | #include "blk.h" | 23 | #include "blk.h" |
23 | 24 | ||
24 | static DEFINE_MUTEX(block_class_lock); | 25 | static DEFINE_MUTEX(block_class_lock); |
25 | #ifndef CONFIG_SYSFS_DEPRECATED | ||
26 | struct kobject *block_depr; | 26 | struct kobject *block_depr; |
27 | #endif | ||
28 | 27 | ||
29 | /* for extended dynamic devt allocation, currently only one major is used */ | 28 | /* for extended dynamic devt allocation, currently only one major is used */ |
30 | #define MAX_EXT_DEVT (1 << MINORBITS) | 29 | #define MAX_EXT_DEVT (1 << MINORBITS) |
@@ -37,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); | |||
37 | 36 | ||
38 | static struct device_type disk_type; | 37 | static struct device_type disk_type; |
39 | 38 | ||
39 | static void disk_add_events(struct gendisk *disk); | ||
40 | static void disk_del_events(struct gendisk *disk); | ||
41 | static void disk_release_events(struct gendisk *disk); | ||
42 | |||
40 | /** | 43 | /** |
41 | * disk_get_part - get partition | 44 | * disk_get_part - get partition |
42 | * @disk: disk to look partition from | 45 | * @disk: disk to look partition from |
@@ -241,7 +244,7 @@ static struct blk_major_name { | |||
241 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; | 244 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; |
242 | 245 | ||
243 | /* index in the above - for now: assume no multimajor ranges */ | 246 | /* index in the above - for now: assume no multimajor ranges */ |
244 | static inline int major_to_index(int major) | 247 | static inline int major_to_index(unsigned major) |
245 | { | 248 | { |
246 | return major % BLKDEV_MAJOR_HASH_SIZE; | 249 | return major % BLKDEV_MAJOR_HASH_SIZE; |
247 | } | 250 | } |
@@ -504,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) | |||
504 | return 0; | 507 | return 0; |
505 | } | 508 | } |
506 | 509 | ||
510 | void register_disk(struct gendisk *disk) | ||
511 | { | ||
512 | struct device *ddev = disk_to_dev(disk); | ||
513 | struct block_device *bdev; | ||
514 | struct disk_part_iter piter; | ||
515 | struct hd_struct *part; | ||
516 | int err; | ||
517 | |||
518 | ddev->parent = disk->driverfs_dev; | ||
519 | |||
520 | dev_set_name(ddev, disk->disk_name); | ||
521 | |||
522 | /* delay uevents until we have scanned the partition table */ | ||
523 | dev_set_uevent_suppress(ddev, 1); | ||
524 | |||
525 | if (device_add(ddev)) | ||
526 | return; | ||
527 | if (!sysfs_deprecated) { | ||
528 | err = sysfs_create_link(block_depr, &ddev->kobj, | ||
529 | kobject_name(&ddev->kobj)); | ||
530 | if (err) { | ||
531 | device_del(ddev); | ||
532 | return; | ||
533 | } | ||
534 | } | ||
535 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | ||
536 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | ||
537 | |||
538 | /* No minors to use for partitions */ | ||
539 | if (!disk_partitionable(disk)) | ||
540 | goto exit; | ||
541 | |||
542 | /* No such device (e.g., media were just removed) */ | ||
543 | if (!get_capacity(disk)) | ||
544 | goto exit; | ||
545 | |||
546 | bdev = bdget_disk(disk, 0); | ||
547 | if (!bdev) | ||
548 | goto exit; | ||
549 | |||
550 | bdev->bd_invalidated = 1; | ||
551 | err = blkdev_get(bdev, FMODE_READ, NULL); | ||
552 | if (err < 0) | ||
553 | goto exit; | ||
554 | blkdev_put(bdev, FMODE_READ); | ||
555 | |||
556 | exit: | ||
557 | /* announce disk after possible partitions are created */ | ||
558 | dev_set_uevent_suppress(ddev, 0); | ||
559 | kobject_uevent(&ddev->kobj, KOBJ_ADD); | ||
560 | |||
561 | /* announce possible partitions */ | ||
562 | disk_part_iter_init(&piter, disk, 0); | ||
563 | while ((part = disk_part_iter_next(&piter))) | ||
564 | kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); | ||
565 | disk_part_iter_exit(&piter); | ||
566 | } | ||
567 | |||
507 | /** | 568 | /** |
508 | * add_disk - add partitioning information to kernel list | 569 | * add_disk - add partitioning information to kernel list |
509 | * @disk: per-device partitioning information | 570 | * @disk: per-device partitioning information |
@@ -541,28 +602,60 @@ void add_disk(struct gendisk *disk) | |||
541 | disk->major = MAJOR(devt); | 602 | disk->major = MAJOR(devt); |
542 | disk->first_minor = MINOR(devt); | 603 | disk->first_minor = MINOR(devt); |
543 | 604 | ||
605 | /* Register BDI before referencing it from bdev */ | ||
606 | bdi = &disk->queue->backing_dev_info; | ||
607 | bdi_register_dev(bdi, disk_devt(disk)); | ||
608 | |||
544 | blk_register_region(disk_devt(disk), disk->minors, NULL, | 609 | blk_register_region(disk_devt(disk), disk->minors, NULL, |
545 | exact_match, exact_lock, disk); | 610 | exact_match, exact_lock, disk); |
546 | register_disk(disk); | 611 | register_disk(disk); |
547 | blk_register_queue(disk); | 612 | blk_register_queue(disk); |
548 | 613 | ||
549 | bdi = &disk->queue->backing_dev_info; | ||
550 | bdi_register_dev(bdi, disk_devt(disk)); | ||
551 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | 614 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
552 | "bdi"); | 615 | "bdi"); |
553 | WARN_ON(retval); | 616 | WARN_ON(retval); |
554 | } | ||
555 | 617 | ||
618 | disk_add_events(disk); | ||
619 | } | ||
556 | EXPORT_SYMBOL(add_disk); | 620 | EXPORT_SYMBOL(add_disk); |
557 | EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ | ||
558 | 621 | ||
559 | void unlink_gendisk(struct gendisk *disk) | 622 | void del_gendisk(struct gendisk *disk) |
560 | { | 623 | { |
624 | struct disk_part_iter piter; | ||
625 | struct hd_struct *part; | ||
626 | |||
627 | disk_del_events(disk); | ||
628 | |||
629 | /* invalidate stuff */ | ||
630 | disk_part_iter_init(&piter, disk, | ||
631 | DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); | ||
632 | while ((part = disk_part_iter_next(&piter))) { | ||
633 | invalidate_partition(disk, part->partno); | ||
634 | delete_partition(disk, part->partno); | ||
635 | } | ||
636 | disk_part_iter_exit(&piter); | ||
637 | |||
638 | invalidate_partition(disk, 0); | ||
639 | blk_free_devt(disk_to_dev(disk)->devt); | ||
640 | set_capacity(disk, 0); | ||
641 | disk->flags &= ~GENHD_FL_UP; | ||
642 | |||
561 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | 643 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); |
562 | bdi_unregister(&disk->queue->backing_dev_info); | 644 | bdi_unregister(&disk->queue->backing_dev_info); |
563 | blk_unregister_queue(disk); | 645 | blk_unregister_queue(disk); |
564 | blk_unregister_region(disk_devt(disk), disk->minors); | 646 | blk_unregister_region(disk_devt(disk), disk->minors); |
647 | |||
648 | part_stat_set_all(&disk->part0, 0); | ||
649 | disk->part0.stamp = 0; | ||
650 | |||
651 | kobject_put(disk->part0.holder_dir); | ||
652 | kobject_put(disk->slave_dir); | ||
653 | disk->driverfs_dev = NULL; | ||
654 | if (!sysfs_deprecated) | ||
655 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | ||
656 | device_del(disk_to_dev(disk)); | ||
565 | } | 657 | } |
658 | EXPORT_SYMBOL(del_gendisk); | ||
566 | 659 | ||
567 | /** | 660 | /** |
568 | * get_gendisk - get partitioning information for a given device | 661 | * get_gendisk - get partitioning information for a given device |
@@ -642,10 +735,11 @@ void __init printk_all_partitions(void) | |||
642 | struct hd_struct *part; | 735 | struct hd_struct *part; |
643 | char name_buf[BDEVNAME_SIZE]; | 736 | char name_buf[BDEVNAME_SIZE]; |
644 | char devt_buf[BDEVT_SIZE]; | 737 | char devt_buf[BDEVT_SIZE]; |
738 | u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; | ||
645 | 739 | ||
646 | /* | 740 | /* |
647 | * Don't show empty devices or things that have been | 741 | * Don't show empty devices or things that have been |
648 | * surpressed | 742 | * suppressed |
649 | */ | 743 | */ |
650 | if (get_capacity(disk) == 0 || | 744 | if (get_capacity(disk) == 0 || |
651 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) | 745 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) |
@@ -660,10 +754,14 @@ void __init printk_all_partitions(void) | |||
660 | while ((part = disk_part_iter_next(&piter))) { | 754 | while ((part = disk_part_iter_next(&piter))) { |
661 | bool is_part0 = part == &disk->part0; | 755 | bool is_part0 = part == &disk->part0; |
662 | 756 | ||
663 | printk("%s%s %10llu %s", is_part0 ? "" : " ", | 757 | uuid[0] = 0; |
758 | if (part->info) | ||
759 | part_unpack_uuid(part->info->uuid, uuid); | ||
760 | |||
761 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | ||
664 | bdevt_str(part_devt(part), devt_buf), | 762 | bdevt_str(part_devt(part), devt_buf), |
665 | (unsigned long long)part->nr_sects >> 1, | 763 | (unsigned long long)part->nr_sects >> 1, |
666 | disk_name(disk, part->partno, name_buf)); | 764 | disk_name(disk, part->partno, name_buf), uuid); |
667 | if (is_part0) { | 765 | if (is_part0) { |
668 | if (disk->driverfs_dev != NULL && | 766 | if (disk->driverfs_dev != NULL && |
669 | disk->driverfs_dev->driver != NULL) | 767 | disk->driverfs_dev->driver != NULL) |
@@ -730,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | |||
730 | static void *p; | 828 | static void *p; |
731 | 829 | ||
732 | p = disk_seqf_start(seqf, pos); | 830 | p = disk_seqf_start(seqf, pos); |
733 | if (!IS_ERR(p) && p && !*pos) | 831 | if (!IS_ERR_OR_NULL(p) && !*pos) |
734 | seq_puts(seqf, "major minor #blocks name\n\n"); | 832 | seq_puts(seqf, "major minor #blocks name\n\n"); |
735 | return p; | 833 | return p; |
736 | } | 834 | } |
@@ -803,10 +901,9 @@ static int __init genhd_device_init(void) | |||
803 | 901 | ||
804 | register_blkdev(BLOCK_EXT_MAJOR, "blkext"); | 902 | register_blkdev(BLOCK_EXT_MAJOR, "blkext"); |
805 | 903 | ||
806 | #ifndef CONFIG_SYSFS_DEPRECATED | ||
807 | /* create top-level block dir */ | 904 | /* create top-level block dir */ |
808 | block_depr = kobject_create_and_add("block", NULL); | 905 | if (!sysfs_deprecated) |
809 | #endif | 906 | block_depr = kobject_create_and_add("block", NULL); |
810 | return 0; | 907 | return 0; |
811 | } | 908 | } |
812 | 909 | ||
@@ -1001,9 +1098,11 @@ static void disk_release(struct device *dev) | |||
1001 | { | 1098 | { |
1002 | struct gendisk *disk = dev_to_disk(dev); | 1099 | struct gendisk *disk = dev_to_disk(dev); |
1003 | 1100 | ||
1101 | disk_release_events(disk); | ||
1004 | kfree(disk->random); | 1102 | kfree(disk->random); |
1005 | disk_replace_part_tbl(disk, NULL); | 1103 | disk_replace_part_tbl(disk, NULL); |
1006 | free_part_stats(&disk->part0); | 1104 | free_part_stats(&disk->part0); |
1105 | free_part_info(&disk->part0); | ||
1007 | kfree(disk); | 1106 | kfree(disk); |
1008 | } | 1107 | } |
1009 | struct class block_class = { | 1108 | struct class block_class = { |
@@ -1059,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1059 | "%u %lu %lu %llu %u %u %u %u\n", | 1158 | "%u %lu %lu %llu %u %u %u %u\n", |
1060 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), | 1159 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), |
1061 | disk_name(gp, hd->partno, buf), | 1160 | disk_name(gp, hd->partno, buf), |
1062 | part_stat_read(hd, ios[0]), | 1161 | part_stat_read(hd, ios[READ]), |
1063 | part_stat_read(hd, merges[0]), | 1162 | part_stat_read(hd, merges[READ]), |
1064 | (unsigned long long)part_stat_read(hd, sectors[0]), | 1163 | (unsigned long long)part_stat_read(hd, sectors[READ]), |
1065 | jiffies_to_msecs(part_stat_read(hd, ticks[0])), | 1164 | jiffies_to_msecs(part_stat_read(hd, ticks[READ])), |
1066 | part_stat_read(hd, ios[1]), | 1165 | part_stat_read(hd, ios[WRITE]), |
1067 | part_stat_read(hd, merges[1]), | 1166 | part_stat_read(hd, merges[WRITE]), |
1068 | (unsigned long long)part_stat_read(hd, sectors[1]), | 1167 | (unsigned long long)part_stat_read(hd, sectors[WRITE]), |
1069 | jiffies_to_msecs(part_stat_read(hd, ticks[1])), | 1168 | jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), |
1070 | part_in_flight(hd), | 1169 | part_in_flight(hd), |
1071 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), | 1170 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), |
1072 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) | 1171 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) |
@@ -1105,29 +1204,6 @@ static int __init proc_genhd_init(void) | |||
1105 | module_init(proc_genhd_init); | 1204 | module_init(proc_genhd_init); |
1106 | #endif /* CONFIG_PROC_FS */ | 1205 | #endif /* CONFIG_PROC_FS */ |
1107 | 1206 | ||
1108 | static void media_change_notify_thread(struct work_struct *work) | ||
1109 | { | ||
1110 | struct gendisk *gd = container_of(work, struct gendisk, async_notify); | ||
1111 | char event[] = "MEDIA_CHANGE=1"; | ||
1112 | char *envp[] = { event, NULL }; | ||
1113 | |||
1114 | /* | ||
1115 | * set enviroment vars to indicate which event this is for | ||
1116 | * so that user space will know to go check the media status. | ||
1117 | */ | ||
1118 | kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); | ||
1119 | put_device(gd->driverfs_dev); | ||
1120 | } | ||
1121 | |||
1122 | #if 0 | ||
1123 | void genhd_media_change_notify(struct gendisk *disk) | ||
1124 | { | ||
1125 | get_device(disk->driverfs_dev); | ||
1126 | schedule_work(&disk->async_notify); | ||
1127 | } | ||
1128 | EXPORT_SYMBOL_GPL(genhd_media_change_notify); | ||
1129 | #endif /* 0 */ | ||
1130 | |||
1131 | dev_t blk_lookup_devt(const char *name, int partno) | 1207 | dev_t blk_lookup_devt(const char *name, int partno) |
1132 | { | 1208 | { |
1133 | dev_t devt = MKDEV(0, 0); | 1209 | dev_t devt = MKDEV(0, 0); |
@@ -1188,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
1188 | } | 1264 | } |
1189 | disk->part_tbl->part[0] = &disk->part0; | 1265 | disk->part_tbl->part[0] = &disk->part0; |
1190 | 1266 | ||
1267 | hd_ref_init(&disk->part0); | ||
1268 | |||
1191 | disk->minors = minors; | 1269 | disk->minors = minors; |
1192 | rand_initialize_disk(disk); | 1270 | rand_initialize_disk(disk); |
1193 | disk_to_dev(disk)->class = &block_class; | 1271 | disk_to_dev(disk)->class = &block_class; |
1194 | disk_to_dev(disk)->type = &disk_type; | 1272 | disk_to_dev(disk)->type = &disk_type; |
1195 | device_initialize(disk_to_dev(disk)); | 1273 | device_initialize(disk_to_dev(disk)); |
1196 | INIT_WORK(&disk->async_notify, | ||
1197 | media_change_notify_thread); | ||
1198 | } | 1274 | } |
1199 | return disk; | 1275 | return disk; |
1200 | } | 1276 | } |
@@ -1279,10 +1355,444 @@ int invalidate_partition(struct gendisk *disk, int partno) | |||
1279 | struct block_device *bdev = bdget_disk(disk, partno); | 1355 | struct block_device *bdev = bdget_disk(disk, partno); |
1280 | if (bdev) { | 1356 | if (bdev) { |
1281 | fsync_bdev(bdev); | 1357 | fsync_bdev(bdev); |
1282 | res = __invalidate_device(bdev); | 1358 | res = __invalidate_device(bdev, true); |
1283 | bdput(bdev); | 1359 | bdput(bdev); |
1284 | } | 1360 | } |
1285 | return res; | 1361 | return res; |
1286 | } | 1362 | } |
1287 | 1363 | ||
1288 | EXPORT_SYMBOL(invalidate_partition); | 1364 | EXPORT_SYMBOL(invalidate_partition); |
1365 | |||
1366 | /* | ||
1367 | * Disk events - monitor disk events like media change and eject request. | ||
1368 | */ | ||
1369 | struct disk_events { | ||
1370 | struct list_head node; /* all disk_event's */ | ||
1371 | struct gendisk *disk; /* the associated disk */ | ||
1372 | spinlock_t lock; | ||
1373 | |||
1374 | struct mutex block_mutex; /* protects blocking */ | ||
1375 | int block; /* event blocking depth */ | ||
1376 | unsigned int pending; /* events already sent out */ | ||
1377 | unsigned int clearing; /* events being cleared */ | ||
1378 | |||
1379 | long poll_msecs; /* interval, -1 for default */ | ||
1380 | struct delayed_work dwork; | ||
1381 | }; | ||
1382 | |||
1383 | static const char *disk_events_strs[] = { | ||
1384 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", | ||
1385 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", | ||
1386 | }; | ||
1387 | |||
1388 | static char *disk_uevents[] = { | ||
1389 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", | ||
1390 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", | ||
1391 | }; | ||
1392 | |||
1393 | /* list of all disk_events */ | ||
1394 | static DEFINE_MUTEX(disk_events_mutex); | ||
1395 | static LIST_HEAD(disk_events); | ||
1396 | |||
1397 | /* disable in-kernel polling by default */ | ||
1398 | static unsigned long disk_events_dfl_poll_msecs = 0; | ||
1399 | |||
1400 | static unsigned long disk_events_poll_jiffies(struct gendisk *disk) | ||
1401 | { | ||
1402 | struct disk_events *ev = disk->ev; | ||
1403 | long intv_msecs = 0; | ||
1404 | |||
1405 | /* | ||
1406 | * If device-specific poll interval is set, always use it. If | ||
1407 | * the default is being used, poll iff there are events which | ||
1408 | * can't be monitored asynchronously. | ||
1409 | */ | ||
1410 | if (ev->poll_msecs >= 0) | ||
1411 | intv_msecs = ev->poll_msecs; | ||
1412 | else if (disk->events & ~disk->async_events) | ||
1413 | intv_msecs = disk_events_dfl_poll_msecs; | ||
1414 | |||
1415 | return msecs_to_jiffies(intv_msecs); | ||
1416 | } | ||
1417 | |||
1418 | /** | ||
1419 | * disk_block_events - block and flush disk event checking | ||
1420 | * @disk: disk to block events for | ||
1421 | * | ||
1422 | * On return from this function, it is guaranteed that event checking | ||
1423 | * isn't in progress and won't happen until unblocked by | ||
1424 | * disk_unblock_events(). Events blocking is counted and the actual | ||
1425 | * unblocking happens after the matching number of unblocks are done. | ||
1426 | * | ||
1427 | * Note that this intentionally does not block event checking from | ||
1428 | * disk_clear_events(). | ||
1429 | * | ||
1430 | * CONTEXT: | ||
1431 | * Might sleep. | ||
1432 | */ | ||
1433 | void disk_block_events(struct gendisk *disk) | ||
1434 | { | ||
1435 | struct disk_events *ev = disk->ev; | ||
1436 | unsigned long flags; | ||
1437 | bool cancel; | ||
1438 | |||
1439 | if (!ev) | ||
1440 | return; | ||
1441 | |||
1442 | /* | ||
1443 | * Outer mutex ensures that the first blocker completes canceling | ||
1444 | * the event work before further blockers are allowed to finish. | ||
1445 | */ | ||
1446 | mutex_lock(&ev->block_mutex); | ||
1447 | |||
1448 | spin_lock_irqsave(&ev->lock, flags); | ||
1449 | cancel = !ev->block++; | ||
1450 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1451 | |||
1452 | if (cancel) | ||
1453 | cancel_delayed_work_sync(&disk->ev->dwork); | ||
1454 | |||
1455 | mutex_unlock(&ev->block_mutex); | ||
1456 | } | ||
1457 | |||
1458 | static void __disk_unblock_events(struct gendisk *disk, bool check_now) | ||
1459 | { | ||
1460 | struct disk_events *ev = disk->ev; | ||
1461 | unsigned long intv; | ||
1462 | unsigned long flags; | ||
1463 | |||
1464 | spin_lock_irqsave(&ev->lock, flags); | ||
1465 | |||
1466 | if (WARN_ON_ONCE(ev->block <= 0)) | ||
1467 | goto out_unlock; | ||
1468 | |||
1469 | if (--ev->block) | ||
1470 | goto out_unlock; | ||
1471 | |||
1472 | /* | ||
1473 | * Not exactly a latency critical operation, set poll timer | ||
1474 | * slack to 25% and kick event check. | ||
1475 | */ | ||
1476 | intv = disk_events_poll_jiffies(disk); | ||
1477 | set_timer_slack(&ev->dwork.timer, intv / 4); | ||
1478 | if (check_now) | ||
1479 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1480 | else if (intv) | ||
1481 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1482 | out_unlock: | ||
1483 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1484 | } | ||
1485 | |||
1486 | /** | ||
1487 | * disk_unblock_events - unblock disk event checking | ||
1488 | * @disk: disk to unblock events for | ||
1489 | * | ||
1490 | * Undo disk_block_events(). When the block count reaches zero, it | ||
1491 | * starts events polling if configured. | ||
1492 | * | ||
1493 | * CONTEXT: | ||
1494 | * Don't care. Safe to call from irq context. | ||
1495 | */ | ||
1496 | void disk_unblock_events(struct gendisk *disk) | ||
1497 | { | ||
1498 | if (disk->ev) | ||
1499 | __disk_unblock_events(disk, false); | ||
1500 | } | ||
1501 | |||
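As the kernel-doc above says, disk_block_events()/disk_unblock_events() count their callers, so blocking nests. A sketch of how a caller might bracket work that must not race with the event worker; the disk pointer and the work in the middle are placeholders:

#include <linux/genhd.h>

static void revalidate_quietly(struct gendisk *disk)
{
        disk_block_events(disk);        /* cancels/holds off the poll work */
        disk_block_events(disk);        /* nesting is fine, it is counted  */

        /* ... touch the device without event checks running ... */

        disk_unblock_events(disk);      /* count 2 -> 1, still blocked     */
        disk_unblock_events(disk);      /* count 1 -> 0, polling resumes   */
}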
1502 | /** | ||
1503 | * disk_check_events - schedule immediate event checking | ||
1504 | * @disk: disk to check events for | ||
1505 | * | ||
1506 | * Schedule immediate event checking on @disk if not blocked. | ||
1507 | * | ||
1508 | * CONTEXT: | ||
1509 | * Don't care. Safe to call from irq context. | ||
1510 | */ | ||
1511 | void disk_check_events(struct gendisk *disk) | ||
1512 | { | ||
1513 | struct disk_events *ev = disk->ev; | ||
1514 | unsigned long flags; | ||
1515 | |||
1516 | if (!ev) | ||
1517 | return; | ||
1518 | |||
1519 | spin_lock_irqsave(&ev->lock, flags); | ||
1520 | if (!ev->block) { | ||
1521 | cancel_delayed_work(&ev->dwork); | ||
1522 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1523 | } | ||
1524 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1525 | } | ||
1526 | EXPORT_SYMBOL_GPL(disk_check_events); | ||
1527 | |||
1528 | /** | ||
1529 | * disk_clear_events - synchronously check, clear and return pending events | ||
1530 | * @disk: disk to fetch and clear events from | ||
1531 | * @mask: mask of events to be fetched and cleared | ||
1532 | * | ||
1533 | * Disk events are synchronously checked and pending events in @mask | ||
1534 | * are cleared and returned. This ignores the block count. | ||
1535 | * | ||
1536 | * CONTEXT: | ||
1537 | * Might sleep. | ||
1538 | */ | ||
1539 | unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) | ||
1540 | { | ||
1541 | const struct block_device_operations *bdops = disk->fops; | ||
1542 | struct disk_events *ev = disk->ev; | ||
1543 | unsigned int pending; | ||
1544 | |||
1545 | if (!ev) { | ||
1546 | /* for drivers still using the old ->media_changed method */ | ||
1547 | if ((mask & DISK_EVENT_MEDIA_CHANGE) && | ||
1548 | bdops->media_changed && bdops->media_changed(disk)) | ||
1549 | return DISK_EVENT_MEDIA_CHANGE; | ||
1550 | return 0; | ||
1551 | } | ||
1552 | |||
1553 | /* tell the workfn about the events being cleared */ | ||
1554 | spin_lock_irq(&ev->lock); | ||
1555 | ev->clearing |= mask; | ||
1556 | spin_unlock_irq(&ev->lock); | ||
1557 | |||
1558 | /* unconditionally schedule event check and wait for it to finish */ | ||
1559 | disk_block_events(disk); | ||
1560 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1561 | flush_delayed_work(&ev->dwork); | ||
1562 | __disk_unblock_events(disk, false); | ||
1563 | |||
1564 | /* then, fetch and clear pending events */ | ||
1565 | spin_lock_irq(&ev->lock); | ||
1566 | WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ | ||
1567 | pending = ev->pending & mask; | ||
1568 | ev->pending &= ~mask; | ||
1569 | spin_unlock_irq(&ev->lock); | ||
1570 | |||
1571 | return pending; | ||
1572 | } | ||
1573 | |||
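disk_clear_events() above is what lets callers emulate the old ->media_changed check on top of the new machinery (it even falls back to ->media_changed itself for unconverted drivers). A hedged sketch of such a caller; the surrounding function is illustrative, not part of this patch:

#include <linux/genhd.h>

/* Returns true if the medium may have changed since the last check. */
static bool media_may_have_changed(struct gendisk *disk)
{
        unsigned int events;

        events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE);
        return events & DISK_EVENT_MEDIA_CHANGE;
}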
1574 | static void disk_events_workfn(struct work_struct *work) | ||
1575 | { | ||
1576 | struct delayed_work *dwork = to_delayed_work(work); | ||
1577 | struct disk_events *ev = container_of(dwork, struct disk_events, dwork); | ||
1578 | struct gendisk *disk = ev->disk; | ||
1579 | char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; | ||
1580 | unsigned int clearing = ev->clearing; | ||
1581 | unsigned int events; | ||
1582 | unsigned long intv; | ||
1583 | int nr_events = 0, i; | ||
1584 | |||
1585 | /* check events */ | ||
1586 | events = disk->fops->check_events(disk, clearing); | ||
1587 | |||
1588 | /* accumulate pending events and schedule next poll if necessary */ | ||
1589 | spin_lock_irq(&ev->lock); | ||
1590 | |||
1591 | events &= ~ev->pending; | ||
1592 | ev->pending |= events; | ||
1593 | ev->clearing &= ~clearing; | ||
1594 | |||
1595 | intv = disk_events_poll_jiffies(disk); | ||
1596 | if (!ev->block && intv) | ||
1597 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1598 | |||
1599 | spin_unlock_irq(&ev->lock); | ||
1600 | |||
1601 | /* | ||
1602 | * Tell userland about new events. Only the events listed in | ||
1603 | * @disk->events are reported. Unlisted events are processed the | ||
1604 | * same internally but never get reported to userland. | ||
1605 | */ | ||
1606 | for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) | ||
1607 | if (events & disk->events & (1 << i)) | ||
1608 | envp[nr_events++] = disk_uevents[i]; | ||
1609 | |||
1610 | if (nr_events) | ||
1611 | kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * A disk events enabled device has the following sysfs nodes under | ||
1616 | * its /sys/block/X/ directory. | ||
1617 | * | ||
1618 | * events : list of all supported events | ||
1619 | * events_async : list of events which can be detected w/o polling | ||
1620 | * events_poll_msecs : polling interval, 0: disable, -1: system default | ||
1621 | */ | ||
1622 | static ssize_t __disk_events_show(unsigned int events, char *buf) | ||
1623 | { | ||
1624 | const char *delim = ""; | ||
1625 | ssize_t pos = 0; | ||
1626 | int i; | ||
1627 | |||
1628 | for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) | ||
1629 | if (events & (1 << i)) { | ||
1630 | pos += sprintf(buf + pos, "%s%s", | ||
1631 | delim, disk_events_strs[i]); | ||
1632 | delim = " "; | ||
1633 | } | ||
1634 | if (pos) | ||
1635 | pos += sprintf(buf + pos, "\n"); | ||
1636 | return pos; | ||
1637 | } | ||
1638 | |||
1639 | static ssize_t disk_events_show(struct device *dev, | ||
1640 | struct device_attribute *attr, char *buf) | ||
1641 | { | ||
1642 | struct gendisk *disk = dev_to_disk(dev); | ||
1643 | |||
1644 | return __disk_events_show(disk->events, buf); | ||
1645 | } | ||
1646 | |||
1647 | static ssize_t disk_events_async_show(struct device *dev, | ||
1648 | struct device_attribute *attr, char *buf) | ||
1649 | { | ||
1650 | struct gendisk *disk = dev_to_disk(dev); | ||
1651 | |||
1652 | return __disk_events_show(disk->async_events, buf); | ||
1653 | } | ||
1654 | |||
1655 | static ssize_t disk_events_poll_msecs_show(struct device *dev, | ||
1656 | struct device_attribute *attr, | ||
1657 | char *buf) | ||
1658 | { | ||
1659 | struct gendisk *disk = dev_to_disk(dev); | ||
1660 | |||
1661 | return sprintf(buf, "%ld\n", disk->ev->poll_msecs); | ||
1662 | } | ||
1663 | |||
1664 | static ssize_t disk_events_poll_msecs_store(struct device *dev, | ||
1665 | struct device_attribute *attr, | ||
1666 | const char *buf, size_t count) | ||
1667 | { | ||
1668 | struct gendisk *disk = dev_to_disk(dev); | ||
1669 | long intv; | ||
1670 | |||
1671 | if (!count || !sscanf(buf, "%ld", &intv)) | ||
1672 | return -EINVAL; | ||
1673 | |||
1674 | if (intv < 0 && intv != -1) | ||
1675 | return -EINVAL; | ||
1676 | |||
1677 | disk_block_events(disk); | ||
1678 | disk->ev->poll_msecs = intv; | ||
1679 | __disk_unblock_events(disk, true); | ||
1680 | |||
1681 | return count; | ||
1682 | } | ||
1683 | |||
1684 | static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); | ||
1685 | static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); | ||
1686 | static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, | ||
1687 | disk_events_poll_msecs_show, | ||
1688 | disk_events_poll_msecs_store); | ||
1689 | |||
1690 | static const struct attribute *disk_events_attrs[] = { | ||
1691 | &dev_attr_events.attr, | ||
1692 | &dev_attr_events_async.attr, | ||
1693 | &dev_attr_events_poll_msecs.attr, | ||
1694 | NULL, | ||
1695 | }; | ||
1696 | |||
1697 | /* | ||
1698 | * The default polling interval can be specified by the kernel | ||
1699 | * parameter block.events_dfl_poll_msecs which defaults to 0 | ||
1700 | * (disable). This can also be modified at runtime by writing to | ||
1701 | * /sys/module/block/events_dfl_poll_msecs. | ||
1702 | */ | ||
1703 | static int disk_events_set_dfl_poll_msecs(const char *val, | ||
1704 | const struct kernel_param *kp) | ||
1705 | { | ||
1706 | struct disk_events *ev; | ||
1707 | int ret; | ||
1708 | |||
1709 | ret = param_set_ulong(val, kp); | ||
1710 | if (ret < 0) | ||
1711 | return ret; | ||
1712 | |||
1713 | mutex_lock(&disk_events_mutex); | ||
1714 | |||
1715 | list_for_each_entry(ev, &disk_events, node) | ||
1716 | disk_check_events(ev->disk); | ||
1717 | |||
1718 | mutex_unlock(&disk_events_mutex); | ||
1719 | |||
1720 | return 0; | ||
1721 | } | ||
1722 | |||
1723 | static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { | ||
1724 | .set = disk_events_set_dfl_poll_msecs, | ||
1725 | .get = param_get_ulong, | ||
1726 | }; | ||
1727 | |||
1728 | #undef MODULE_PARAM_PREFIX | ||
1729 | #define MODULE_PARAM_PREFIX "block." | ||
1730 | |||
1731 | module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, | ||
1732 | &disk_events_dfl_poll_msecs, 0644); | ||
1733 | |||
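Together, the per-disk events_poll_msecs attribute and the block.events_dfl_poll_msecs module parameter above give userspace two knobs: a per-device interval and a global default. A small userspace sketch (not part of the patch) that enables 2-second polling for one device; the device name is an assumption:

#include <stdio.h>

int main(void)
{
        /* 0 disables polling, -1 falls back to the system default */
        FILE *f = fopen("/sys/block/sr0/events_poll_msecs", "w");

        if (!f) {
                perror("events_poll_msecs");
                return 1;
        }
        fprintf(f, "2000\n");
        return fclose(f) ? 1 : 0;
}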
1734 | /* | ||
1735 | * disk_{add|del|release}_events - initialize and destroy disk_events. | ||
1736 | */ | ||
1737 | static void disk_add_events(struct gendisk *disk) | ||
1738 | { | ||
1739 | struct disk_events *ev; | ||
1740 | |||
1741 | if (!disk->fops->check_events) | ||
1742 | return; | ||
1743 | |||
1744 | ev = kzalloc(sizeof(*ev), GFP_KERNEL); | ||
1745 | if (!ev) { | ||
1746 | pr_warn("%s: failed to initialize events\n", disk->disk_name); | ||
1747 | return; | ||
1748 | } | ||
1749 | |||
1750 | if (sysfs_create_files(&disk_to_dev(disk)->kobj, | ||
1751 | disk_events_attrs) < 0) { | ||
1752 | pr_warn("%s: failed to create sysfs files for events\n", | ||
1753 | disk->disk_name); | ||
1754 | kfree(ev); | ||
1755 | return; | ||
1756 | } | ||
1757 | |||
1758 | disk->ev = ev; | ||
1759 | |||
1760 | INIT_LIST_HEAD(&ev->node); | ||
1761 | ev->disk = disk; | ||
1762 | spin_lock_init(&ev->lock); | ||
1763 | mutex_init(&ev->block_mutex); | ||
1764 | ev->block = 1; | ||
1765 | ev->poll_msecs = -1; | ||
1766 | INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); | ||
1767 | |||
1768 | mutex_lock(&disk_events_mutex); | ||
1769 | list_add_tail(&ev->node, &disk_events); | ||
1770 | mutex_unlock(&disk_events_mutex); | ||
1771 | |||
1772 | /* | ||
1773 | * Block count is initialized to 1 and the following initial | ||
1774 | * unblock kicks it into action. | ||
1775 | */ | ||
1776 | __disk_unblock_events(disk, true); | ||
1777 | } | ||
1778 | |||
1779 | static void disk_del_events(struct gendisk *disk) | ||
1780 | { | ||
1781 | if (!disk->ev) | ||
1782 | return; | ||
1783 | |||
1784 | disk_block_events(disk); | ||
1785 | |||
1786 | mutex_lock(&disk_events_mutex); | ||
1787 | list_del_init(&disk->ev->node); | ||
1788 | mutex_unlock(&disk_events_mutex); | ||
1789 | |||
1790 | sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); | ||
1791 | } | ||
1792 | |||
1793 | static void disk_release_events(struct gendisk *disk) | ||
1794 | { | ||
1795 | /* the block count should be 1 from disk_del_events() */ | ||
1796 | WARN_ON_ONCE(disk->ev && disk->ev->block != 1); | ||
1797 | kfree(disk->ev); | ||
1798 | } | ||
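The disk_add_events()/disk_del_events() pair above only kicks in when a driver provides ->check_events and sets disk->events, so converting a driver is mostly a matter of replacing ->media_changed. A sketch of what that might look like; all mydrv_* names are hypothetical:

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

static bool mydrv_media_latch(struct gendisk *disk)
{
        /* placeholder: read the hardware's media-change latch */
        return false;
}

static unsigned int mydrv_check_events(struct gendisk *disk,
                                       unsigned int clearing)
{
        return mydrv_media_latch(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations mydrv_fops = {
        .owner        = THIS_MODULE,
        .check_events = mydrv_check_events,     /* replaces ->media_changed */
};

static void mydrv_register(struct gendisk *disk)
{
        disk->fops   = &mydrv_fops;
        disk->events = DISK_EVENT_MEDIA_CHANGE; /* reported to userland */
        add_disk(disk);                         /* calls disk_add_events() */
}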
diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..1124cd297263 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/hdreg.h> | 5 | #include <linux/hdreg.h> |
6 | #include <linux/backing-dev.h> | 6 | #include <linux/backing-dev.h> |
7 | #include <linux/buffer_head.h> | 7 | #include <linux/buffer_head.h> |
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/blktrace_api.h> | 8 | #include <linux/blktrace_api.h> |
10 | #include <asm/uaccess.h> | 9 | #include <asm/uaccess.h> |
11 | 10 | ||
@@ -62,7 +61,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
62 | 61 | ||
63 | /* all seems OK */ | 62 | /* all seems OK */ |
64 | part = add_partition(disk, partno, start, length, | 63 | part = add_partition(disk, partno, start, length, |
65 | ADDPART_FLAG_NONE); | 64 | ADDPART_FLAG_NONE, NULL); |
66 | mutex_unlock(&bdev->bd_mutex); | 65 | mutex_unlock(&bdev->bd_mutex); |
67 | return IS_ERR(part) ? PTR_ERR(part) : 0; | 66 | return IS_ERR(part) ? PTR_ERR(part) : 0; |
68 | case BLKPG_DEL_PARTITION: | 67 | case BLKPG_DEL_PARTITION: |
@@ -116,7 +115,7 @@ static int blkdev_reread_part(struct block_device *bdev) | |||
116 | static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | 115 | static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, |
117 | uint64_t len, int secure) | 116 | uint64_t len, int secure) |
118 | { | 117 | { |
119 | unsigned long flags = BLKDEV_IFL_WAIT; | 118 | unsigned long flags = 0; |
120 | 119 | ||
121 | if (start & 511) | 120 | if (start & 511) |
122 | return -EINVAL; | 121 | return -EINVAL; |
@@ -125,10 +124,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | |||
125 | start >>= 9; | 124 | start >>= 9; |
126 | len >>= 9; | 125 | len >>= 9; |
127 | 126 | ||
128 | if (start + len > (bdev->bd_inode->i_size >> 9)) | 127 | if (start + len > (i_size_read(bdev->bd_inode) >> 9)) |
129 | return -EINVAL; | 128 | return -EINVAL; |
130 | if (secure) | 129 | if (secure) |
131 | flags |= BLKDEV_IFL_SECURE; | 130 | flags |= BLKDEV_DISCARD_SECURE; |
132 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); | 131 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); |
133 | } | 132 | } |
134 | 133 | ||
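The blk_ioctl_discard() hunk above suggests the flag rework: the old BLKDEV_IFL_WAIT default disappears (the call waits by default) and only BLKDEV_DISCARD_SECURE remains as an opt-in. A sketch of a kernel caller under that reading; bdev and nr_sects are placeholders:

#include <linux/blkdev.h>

static int discard_whole_device(struct block_device *bdev, sector_t nr_sects,
                                bool secure)
{
        unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

        /* waits for completion; no BLKDEV_IFL_WAIT needed any more */
        return blkdev_issue_discard(bdev, 0, nr_sects, GFP_KERNEL, flags);
}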
@@ -242,6 +241,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
242 | * We need to set the startsect first, the driver may | 241 | * We need to set the startsect first, the driver may |
243 | * want to override it. | 242 | * want to override it. |
244 | */ | 243 | */ |
244 | memset(&geo, 0, sizeof(geo)); | ||
245 | geo.start = get_start_sect(bdev); | 245 | geo.start = get_start_sect(bdev); |
246 | ret = disk->fops->getgeo(bdev, &geo); | 246 | ret = disk->fops->getgeo(bdev, &geo); |
247 | if (ret) | 247 | if (ret) |
@@ -294,11 +294,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
294 | return -EINVAL; | 294 | return -EINVAL; |
295 | if (get_user(n, (int __user *) arg)) | 295 | if (get_user(n, (int __user *) arg)) |
296 | return -EFAULT; | 296 | return -EFAULT; |
297 | if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) | 297 | if (!(mode & FMODE_EXCL)) { |
298 | return -EBUSY; | 298 | bdgrab(bdev); |
299 | if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) | ||
300 | return -EBUSY; | ||
301 | } | ||
299 | ret = set_blocksize(bdev, n); | 302 | ret = set_blocksize(bdev, n); |
300 | if (!(mode & FMODE_EXCL)) | 303 | if (!(mode & FMODE_EXCL)) |
301 | bd_release(bdev); | 304 | blkdev_put(bdev, mode | FMODE_EXCL); |
302 | return ret; | 305 | return ret; |
303 | case BLKPG: | 306 | case BLKPG: |
304 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); | 307 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); |
@@ -307,12 +310,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
307 | ret = blkdev_reread_part(bdev); | 310 | ret = blkdev_reread_part(bdev); |
308 | break; | 311 | break; |
309 | case BLKGETSIZE: | 312 | case BLKGETSIZE: |
310 | size = bdev->bd_inode->i_size; | 313 | size = i_size_read(bdev->bd_inode); |
311 | if ((size >> 9) > ~0UL) | 314 | if ((size >> 9) > ~0UL) |
312 | return -EFBIG; | 315 | return -EFBIG; |
313 | return put_ulong(arg, size >> 9); | 316 | return put_ulong(arg, size >> 9); |
314 | case BLKGETSIZE64: | 317 | case BLKGETSIZE64: |
315 | return put_u64(arg, bdev->bd_inode->i_size); | 318 | return put_u64(arg, i_size_read(bdev->bd_inode)); |
316 | case BLKTRACESTART: | 319 | case BLKTRACESTART: |
317 | case BLKTRACESTOP: | 320 | case BLKTRACESTOP: |
318 | case BLKTRACESETUP: | 321 | case BLKTRACESETUP: |
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b38cd37..06389e9ef96d 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) | |||
39 | list_add_tail(&rq->queuelist, &nd->queue); | 39 | list_add_tail(&rq->queuelist, &nd->queue); |
40 | } | 40 | } |
41 | 41 | ||
42 | static int noop_queue_empty(struct request_queue *q) | ||
43 | { | ||
44 | struct noop_data *nd = q->elevator->elevator_data; | ||
45 | |||
46 | return list_empty(&nd->queue); | ||
47 | } | ||
48 | |||
49 | static struct request * | 42 | static struct request * |
50 | noop_former_request(struct request_queue *q, struct request *rq) | 43 | noop_former_request(struct request_queue *q, struct request *rq) |
51 | { | 44 | { |
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = { | |||
90 | .elevator_merge_req_fn = noop_merged_requests, | 83 | .elevator_merge_req_fn = noop_merged_requests, |
91 | .elevator_dispatch_fn = noop_dispatch, | 84 | .elevator_dispatch_fn = noop_dispatch, |
92 | .elevator_add_req_fn = noop_add_request, | 85 | .elevator_add_req_fn = noop_add_request, |
93 | .elevator_queue_empty_fn = noop_queue_empty, | ||
94 | .elevator_former_req_fn = noop_former_request, | 86 | .elevator_former_req_fn = noop_former_request, |
95 | .elevator_latter_req_fn = noop_latter_request, | 87 | .elevator_latter_req_fn = noop_latter_request, |
96 | .elevator_init_fn = noop_init_queue, | 88 | .elevator_init_fn = noop_init_queue, |
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10eb5b0..4f4230b79bb6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, | |||
321 | if (hdr->iovec_count) { | 321 | if (hdr->iovec_count) { |
322 | const int size = sizeof(struct sg_iovec) * hdr->iovec_count; | 322 | const int size = sizeof(struct sg_iovec) * hdr->iovec_count; |
323 | size_t iov_data_len; | 323 | size_t iov_data_len; |
324 | struct sg_iovec *iov; | 324 | struct sg_iovec *sg_iov; |
325 | struct iovec *iov; | ||
326 | int i; | ||
325 | 327 | ||
326 | iov = kmalloc(size, GFP_KERNEL); | 328 | sg_iov = kmalloc(size, GFP_KERNEL); |
327 | if (!iov) { | 329 | if (!sg_iov) { |
328 | ret = -ENOMEM; | 330 | ret = -ENOMEM; |
329 | goto out; | 331 | goto out; |
330 | } | 332 | } |
331 | 333 | ||
332 | if (copy_from_user(iov, hdr->dxferp, size)) { | 334 | if (copy_from_user(sg_iov, hdr->dxferp, size)) { |
333 | kfree(iov); | 335 | kfree(sg_iov); |
334 | ret = -EFAULT; | 336 | ret = -EFAULT; |
335 | goto out; | 337 | goto out; |
336 | } | 338 | } |
337 | 339 | ||
340 | /* | ||
341 | * Sum up the vecs, making sure they don't overflow | ||
342 | */ | ||
343 | iov = (struct iovec *) sg_iov; | ||
344 | iov_data_len = 0; | ||
345 | for (i = 0; i < hdr->iovec_count; i++) { | ||
346 | if (iov_data_len + iov[i].iov_len < iov_data_len) { | ||
347 | kfree(sg_iov); | ||
348 | ret = -EINVAL; | ||
349 | goto out; | ||
350 | } | ||
351 | iov_data_len += iov[i].iov_len; | ||
352 | } | ||
353 | |||
338 | /* SG_IO howto says that the shorter of the two wins */ | 354 | /* SG_IO howto says that the shorter of the two wins */ |
339 | iov_data_len = iov_length((struct iovec *)iov, | ||
340 | hdr->iovec_count); | ||
341 | if (hdr->dxfer_len < iov_data_len) { | 355 | if (hdr->dxfer_len < iov_data_len) { |
342 | hdr->iovec_count = iov_shorten((struct iovec *)iov, | 356 | hdr->iovec_count = iov_shorten(iov, |
343 | hdr->iovec_count, | 357 | hdr->iovec_count, |
344 | hdr->dxfer_len); | 358 | hdr->dxfer_len); |
345 | iov_data_len = hdr->dxfer_len; | 359 | iov_data_len = hdr->dxfer_len; |
346 | } | 360 | } |
347 | 361 | ||
348 | ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, | 362 | ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, |
349 | iov_data_len, GFP_KERNEL); | 363 | iov_data_len, GFP_KERNEL); |
350 | kfree(iov); | 364 | kfree(sg_iov); |
351 | } else if (hdr->dxfer_len) | 365 | } else if (hdr->dxfer_len) |
352 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, | 366 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, |
353 | GFP_KERNEL); | 367 | GFP_KERNEL); |
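The new iovec summing loop in sg_io() relies on a standard unsigned-overflow check: if adding two unsigned values wraps, the result is smaller than either operand. The same idiom in isolation (plain C, not kernel code):

#include <stdbool.h>
#include <stddef.h>

/* True if a + b would wrap around; unsigned overflow is well defined,
 * so the wrapped sum compares smaller than the original operand. */
static bool add_overflows(size_t a, size_t b)
{
        return a + b < a;
}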