Diffstat (limited to 'block')
-rw-r--r--   block/blk-cgroup.c        |  20
-rw-r--r--   block/blk-cgroup.h        |  14
-rw-r--r--   block/blk-core.c          | 737
-rw-r--r--   block/blk-exec.c          |   4
-rw-r--r--   block/blk-flush.c         | 441
-rw-r--r--   block/blk-integrity.c     |  12
-rw-r--r--   block/blk-lib.c           |   2
-rw-r--r--   block/blk-merge.c         |   6
-rw-r--r--   block/blk-settings.c      |  15
-rw-r--r--   block/blk-sysfs.c         |  13
-rw-r--r--   block/blk-throttle.c      | 143
-rw-r--r--   block/blk.h               |  18
-rw-r--r--   block/cfq-iosched.c       | 191
-rw-r--r--   block/cfq.h               |   6
-rw-r--r--   block/deadline-iosched.c  |   9
-rw-r--r--   block/elevator.c          | 142
-rw-r--r--   block/genhd.c             |  28
-rw-r--r--   block/noop-iosched.c      |   8
18 files changed, 1082 insertions, 727 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9e..f0605ab2a761 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | |||
371 | } | 371 | } |
372 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | 372 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); |
373 | 373 | ||
374 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) | 374 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, |
375 | unsigned long unaccounted_time) | ||
375 | { | 376 | { |
376 | unsigned long flags; | 377 | unsigned long flags; |
377 | 378 | ||
378 | spin_lock_irqsave(&blkg->stats_lock, flags); | 379 | spin_lock_irqsave(&blkg->stats_lock, flags); |
379 | blkg->stats.time += time; | 380 | blkg->stats.time += time; |
381 | blkg->stats.unaccounted_time += unaccounted_time; | ||
380 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 382 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
381 | } | 383 | } |
382 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | 384 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); |
@@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, | |||
604 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | 606 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
605 | blkg->stats.sectors, cb, dev); | 607 | blkg->stats.sectors, cb, dev); |
606 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 608 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
609 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) | ||
610 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
611 | blkg->stats.unaccounted_time, cb, dev); | ||
607 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | 612 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { |
608 | uint64_t sum = blkg->stats.avg_queue_size_sum; | 613 | uint64_t sum = blkg->stats.avg_queue_size_sum; |
609 | uint64_t samples = blkg->stats.avg_queue_size_samples; | 614 | uint64_t samples = blkg->stats.avg_queue_size_samples; |
@@ -863,7 +868,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, | |||
863 | } | 868 | } |
864 | 869 | ||
865 | /* | 870 | /* |
866 | * Some rules/values in blkg have changed. Propogate those to respective | 871 | * Some rules/values in blkg have changed. Propagate those to respective |
867 | * policies. | 872 | * policies. |
868 | */ | 873 | */ |
869 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | 874 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, |
@@ -898,7 +903,7 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | |||
898 | } | 903 | } |
899 | 904 | ||
900 | /* | 905 | /* |
901 | * A policy node rule has been updated. Propogate this update to all the | 906 | * A policy node rule has been updated. Propagate this update to all the |
902 | * block groups which might be affected by this update. | 907 | * block groups which might be affected by this update. |
903 | */ | 908 | */ |
904 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, | 909 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, |
@@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | |||
1125 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1130 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1126 | BLKIO_STAT_QUEUED, 1); | 1131 | BLKIO_STAT_QUEUED, 1); |
1127 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1132 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1133 | case BLKIO_PROP_unaccounted_time: | ||
1134 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1135 | BLKIO_STAT_UNACCOUNTED_TIME, 0); | ||
1128 | case BLKIO_PROP_dequeue: | 1136 | case BLKIO_PROP_dequeue: |
1129 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1137 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1130 | BLKIO_STAT_DEQUEUE, 0); | 1138 | BLKIO_STAT_DEQUEUE, 0); |
@@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = { | |||
1382 | BLKIO_PROP_dequeue), | 1390 | BLKIO_PROP_dequeue), |
1383 | .read_map = blkiocg_file_read_map, | 1391 | .read_map = blkiocg_file_read_map, |
1384 | }, | 1392 | }, |
1393 | { | ||
1394 | .name = "unaccounted_time", | ||
1395 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1396 | BLKIO_PROP_unaccounted_time), | ||
1397 | .read_map = blkiocg_file_read_map, | ||
1398 | }, | ||
1385 | #endif | 1399 | #endif |
1386 | }; | 1400 | }; |
1387 | 1401 | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd549..10919fae2d3a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type { | |||
49 | /* All the single valued stats go below this */ | 49 | /* All the single valued stats go below this */ |
50 | BLKIO_STAT_TIME, | 50 | BLKIO_STAT_TIME, |
51 | BLKIO_STAT_SECTORS, | 51 | BLKIO_STAT_SECTORS, |
52 | /* Time not charged to this cgroup */ | ||
53 | BLKIO_STAT_UNACCOUNTED_TIME, | ||
52 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 54 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
53 | BLKIO_STAT_AVG_QUEUE_SIZE, | 55 | BLKIO_STAT_AVG_QUEUE_SIZE, |
54 | BLKIO_STAT_IDLE_TIME, | 56 | BLKIO_STAT_IDLE_TIME, |
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop { | |||
81 | BLKIO_PROP_io_serviced, | 83 | BLKIO_PROP_io_serviced, |
82 | BLKIO_PROP_time, | 84 | BLKIO_PROP_time, |
83 | BLKIO_PROP_sectors, | 85 | BLKIO_PROP_sectors, |
86 | BLKIO_PROP_unaccounted_time, | ||
84 | BLKIO_PROP_io_service_time, | 87 | BLKIO_PROP_io_service_time, |
85 | BLKIO_PROP_io_wait_time, | 88 | BLKIO_PROP_io_wait_time, |
86 | BLKIO_PROP_io_merged, | 89 | BLKIO_PROP_io_merged, |
@@ -114,6 +117,8 @@ struct blkio_group_stats { | |||
114 | /* total disk time and nr sectors dispatched by this group */ | 117 | /* total disk time and nr sectors dispatched by this group */ |
115 | uint64_t time; | 118 | uint64_t time; |
116 | uint64_t sectors; | 119 | uint64_t sectors; |
120 | /* Time not charged to this cgroup */ | ||
121 | uint64_t unaccounted_time; | ||
117 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | 122 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; |
118 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 123 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
119 | /* Sum of number of IOs queued across all samples */ | 124 | /* Sum of number of IOs queued across all samples */ |
@@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | |||
240 | 245 | ||
241 | #endif | 246 | #endif |
242 | 247 | ||
243 | #define BLKIO_WEIGHT_MIN 100 | 248 | #define BLKIO_WEIGHT_MIN 10 |
244 | #define BLKIO_WEIGHT_MAX 1000 | 249 | #define BLKIO_WEIGHT_MAX 1000 |
245 | #define BLKIO_WEIGHT_DEFAULT 500 | 250 | #define BLKIO_WEIGHT_DEFAULT 500 |
246 | 251 | ||
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | |||
293 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 298 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
294 | void *key); | 299 | void *key); |
295 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 300 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
296 | unsigned long time); | 301 | unsigned long time, |
302 | unsigned long unaccounted_time); | ||
297 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | 303 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, |
298 | bool direction, bool sync); | 304 | bool direction, bool sync); |
299 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | 305 | void blkiocg_update_completion_stats(struct blkio_group *blkg, |
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | |||
319 | static inline struct blkio_group * | 325 | static inline struct blkio_group * |
320 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | 326 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } |
321 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 327 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
322 | unsigned long time) {} | 328 | unsigned long time, |
329 | unsigned long unaccounted_time) | ||
330 | {} | ||
323 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 331 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
324 | uint64_t bytes, bool direction, bool sync) {} | 332 | uint64_t bytes, bool direction, bool sync) {} |
325 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | 333 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, |
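[Editor's note: a hedged sketch, not part of the diff.] The widened blkiocg_update_timeslice_used() API above takes the unaccounted portion of a slice alongside the charged time. A minimal caller might look like the following, assuming the scheduler (CFQ later in this series) has already computed used_sl and unaccounted_sl:

	#include "blk-cgroup.h"

	/* Hedged sketch: charging a timeslice with the new third argument.
	 * The unaccounted part is reported through the new
	 * blkio.unaccounted_time file when CONFIG_DEBUG_BLK_CGROUP is set. */
	static void example_charge_slice(struct blkio_group *blkg,
					 unsigned long used_sl,
					 unsigned long unaccounted_sl)
	{
		blkiocg_update_timeslice_used(blkg, used_sl, unaccounted_sl);
	}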
diff --git a/block/blk-core.c b/block/blk-core.c
index a63336d49f30..a2e58eeb3549 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | ||
30 | 31 | ||
31 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/block.h> | 33 | #include <trace/events/block.h> |
@@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init); | |||
149 | static void req_bio_endio(struct request *rq, struct bio *bio, | 150 | static void req_bio_endio(struct request *rq, struct bio *bio, |
150 | unsigned int nbytes, int error) | 151 | unsigned int nbytes, int error) |
151 | { | 152 | { |
152 | struct request_queue *q = rq->q; | 153 | if (error) |
153 | 154 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | |
154 | if (&q->flush_rq != rq) { | 155 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
155 | if (error) | 156 | error = -EIO; |
156 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
157 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
158 | error = -EIO; | ||
159 | 157 | ||
160 | if (unlikely(nbytes > bio->bi_size)) { | 158 | if (unlikely(nbytes > bio->bi_size)) { |
161 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", | 159 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", |
162 | __func__, nbytes, bio->bi_size); | 160 | __func__, nbytes, bio->bi_size); |
163 | nbytes = bio->bi_size; | 161 | nbytes = bio->bi_size; |
164 | } | 162 | } |
165 | 163 | ||
166 | if (unlikely(rq->cmd_flags & REQ_QUIET)) | 164 | if (unlikely(rq->cmd_flags & REQ_QUIET)) |
167 | set_bit(BIO_QUIET, &bio->bi_flags); | 165 | set_bit(BIO_QUIET, &bio->bi_flags); |
168 | 166 | ||
169 | bio->bi_size -= nbytes; | 167 | bio->bi_size -= nbytes; |
170 | bio->bi_sector += (nbytes >> 9); | 168 | bio->bi_sector += (nbytes >> 9); |
171 | 169 | ||
172 | if (bio_integrity(bio)) | 170 | if (bio_integrity(bio)) |
173 | bio_integrity_advance(bio, nbytes); | 171 | bio_integrity_advance(bio, nbytes); |
174 | 172 | ||
175 | if (bio->bi_size == 0) | 173 | /* don't actually finish bio if it's part of flush sequence */ |
176 | bio_endio(bio, error); | 174 | if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) |
177 | } else { | 175 | bio_endio(bio, error); |
178 | /* | ||
179 | * Okay, this is the sequenced flush request in | ||
180 | * progress, just record the error; | ||
181 | */ | ||
182 | if (error && !q->flush_err) | ||
183 | q->flush_err = error; | ||
184 | } | ||
185 | } | 176 | } |
186 | 177 | ||
187 | void blk_dump_rq_flags(struct request *rq, char *msg) | 178 | void blk_dump_rq_flags(struct request *rq, char *msg) |
@@ -207,136 +198,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg) | |||
207 | } | 198 | } |
208 | EXPORT_SYMBOL(blk_dump_rq_flags); | 199 | EXPORT_SYMBOL(blk_dump_rq_flags); |
209 | 200 | ||
210 | /* | 201 | static void blk_delay_work(struct work_struct *work) |
211 | * "plug" the device if there are no outstanding requests: this will | ||
212 | * force the transfer to start only after we have put all the requests | ||
213 | * on the list. | ||
214 | * | ||
215 | * This is called with interrupts off and no requests on the queue and | ||
216 | * with the queue lock held. | ||
217 | */ | ||
218 | void blk_plug_device(struct request_queue *q) | ||
219 | { | 202 | { |
220 | WARN_ON(!irqs_disabled()); | 203 | struct request_queue *q; |
221 | |||
222 | /* | ||
223 | * don't plug a stopped queue, it must be paired with blk_start_queue() | ||
224 | * which will restart the queueing | ||
225 | */ | ||
226 | if (blk_queue_stopped(q)) | ||
227 | return; | ||
228 | 204 | ||
229 | if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { | 205 | q = container_of(work, struct request_queue, delay_work.work); |
230 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | 206 | spin_lock_irq(q->queue_lock); |
231 | trace_block_plug(q); | 207 | __blk_run_queue(q); |
232 | } | 208 | spin_unlock_irq(q->queue_lock); |
233 | } | 209 | } |
234 | EXPORT_SYMBOL(blk_plug_device); | ||
235 | 210 | ||
236 | /** | 211 | /** |
237 | * blk_plug_device_unlocked - plug a device without queue lock held | 212 | * blk_delay_queue - restart queueing after defined interval |
238 | * @q: The &struct request_queue to plug | 213 | * @q: The &struct request_queue in question |
214 | * @msecs: Delay in msecs | ||
239 | * | 215 | * |
240 | * Description: | 216 | * Description: |
241 | * Like @blk_plug_device(), but grabs the queue lock and disables | 217 | * Sometimes queueing needs to be postponed for a little while, to allow |
242 | * interrupts. | 218 | * resources to come back. This function will make sure that queueing is |
243 | **/ | 219 | * restarted around the specified time. |
244 | void blk_plug_device_unlocked(struct request_queue *q) | ||
245 | { | ||
246 | unsigned long flags; | ||
247 | |||
248 | spin_lock_irqsave(q->queue_lock, flags); | ||
249 | blk_plug_device(q); | ||
250 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
251 | } | ||
252 | EXPORT_SYMBOL(blk_plug_device_unlocked); | ||
253 | |||
254 | /* | ||
255 | * remove the queue from the plugged list, if present. called with | ||
256 | * queue lock held and interrupts disabled. | ||
257 | */ | ||
258 | int blk_remove_plug(struct request_queue *q) | ||
259 | { | ||
260 | WARN_ON(!irqs_disabled()); | ||
261 | |||
262 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) | ||
263 | return 0; | ||
264 | |||
265 | del_timer(&q->unplug_timer); | ||
266 | return 1; | ||
267 | } | ||
268 | EXPORT_SYMBOL(blk_remove_plug); | ||
269 | |||
270 | /* | ||
271 | * remove the plug and let it rip.. | ||
272 | */ | 220 | */ |
273 | void __generic_unplug_device(struct request_queue *q) | 221 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) |
274 | { | 222 | { |
275 | if (unlikely(blk_queue_stopped(q))) | 223 | queue_delayed_work(kblockd_workqueue, &q->delay_work, |
276 | return; | 224 | msecs_to_jiffies(msecs)); |
277 | if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) | ||
278 | return; | ||
279 | |||
280 | q->request_fn(q); | ||
281 | } | 225 | } |
282 | 226 | EXPORT_SYMBOL(blk_delay_queue); | |
283 | /** | ||
284 | * generic_unplug_device - fire a request queue | ||
285 | * @q: The &struct request_queue in question | ||
286 | * | ||
287 | * Description: | ||
288 | * Linux uses plugging to build bigger requests queues before letting | ||
289 | * the device have at them. If a queue is plugged, the I/O scheduler | ||
290 | * is still adding and merging requests on the queue. Once the queue | ||
291 | * gets unplugged, the request_fn defined for the queue is invoked and | ||
292 | * transfers started. | ||
293 | **/ | ||
294 | void generic_unplug_device(struct request_queue *q) | ||
295 | { | ||
296 | if (blk_queue_plugged(q)) { | ||
297 | spin_lock_irq(q->queue_lock); | ||
298 | __generic_unplug_device(q); | ||
299 | spin_unlock_irq(q->queue_lock); | ||
300 | } | ||
301 | } | ||
302 | EXPORT_SYMBOL(generic_unplug_device); | ||
303 | |||
304 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, | ||
305 | struct page *page) | ||
306 | { | ||
307 | struct request_queue *q = bdi->unplug_io_data; | ||
308 | |||
309 | blk_unplug(q); | ||
310 | } | ||
311 | |||
312 | void blk_unplug_work(struct work_struct *work) | ||
313 | { | ||
314 | struct request_queue *q = | ||
315 | container_of(work, struct request_queue, unplug_work); | ||
316 | |||
317 | trace_block_unplug_io(q); | ||
318 | q->unplug_fn(q); | ||
319 | } | ||
320 | |||
321 | void blk_unplug_timeout(unsigned long data) | ||
322 | { | ||
323 | struct request_queue *q = (struct request_queue *)data; | ||
324 | |||
325 | trace_block_unplug_timer(q); | ||
326 | kblockd_schedule_work(q, &q->unplug_work); | ||
327 | } | ||
328 | |||
329 | void blk_unplug(struct request_queue *q) | ||
330 | { | ||
331 | /* | ||
332 | * devices don't necessarily have an ->unplug_fn defined | ||
333 | */ | ||
334 | if (q->unplug_fn) { | ||
335 | trace_block_unplug_io(q); | ||
336 | q->unplug_fn(q); | ||
337 | } | ||
338 | } | ||
339 | EXPORT_SYMBOL(blk_unplug); | ||
340 | 227 | ||
341 | /** | 228 | /** |
342 | * blk_start_queue - restart a previously stopped queue | 229 | * blk_start_queue - restart a previously stopped queue |
@@ -352,7 +239,7 @@ void blk_start_queue(struct request_queue *q) | |||
352 | WARN_ON(!irqs_disabled()); | 239 | WARN_ON(!irqs_disabled()); |
353 | 240 | ||
354 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); | 241 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); |
355 | __blk_run_queue(q, false); | 242 | __blk_run_queue(q); |
356 | } | 243 | } |
357 | EXPORT_SYMBOL(blk_start_queue); | 244 | EXPORT_SYMBOL(blk_start_queue); |
358 | 245 | ||
@@ -372,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue); | |||
372 | **/ | 259 | **/ |
373 | void blk_stop_queue(struct request_queue *q) | 260 | void blk_stop_queue(struct request_queue *q) |
374 | { | 261 | { |
375 | blk_remove_plug(q); | 262 | __cancel_delayed_work(&q->delay_work); |
376 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | 263 | queue_flag_set(QUEUE_FLAG_STOPPED, q); |
377 | } | 264 | } |
378 | EXPORT_SYMBOL(blk_stop_queue); | 265 | EXPORT_SYMBOL(blk_stop_queue); |
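[Editor's note: a hedged sketch, not part of the diff.] With the per-queue plug and unplug timer removed above, a driver that runs out of resources backs off with the new blk_delay_queue() instead. example_hw_busy() below is a hypothetical hardware check standing in for driver-specific logic:

	static bool example_hw_busy(void);	/* hypothetical resource check */

	static void example_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = blk_fetch_request(q)) != NULL) {
			if (example_hw_busy()) {
				blk_requeue_request(q, rq);
				blk_delay_queue(q, 3);	/* kblockd re-runs the queue in ~3 ms */
				break;
			}
			/* ... hand rq to the hardware ... */
		}
	}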
@@ -390,51 +277,51 @@ EXPORT_SYMBOL(blk_stop_queue); | |||
390 | * that its ->make_request_fn will not re-add plugging prior to calling | 277 | * that its ->make_request_fn will not re-add plugging prior to calling |
391 | * this function. | 278 | * this function. |
392 | * | 279 | * |
280 | * This function does not cancel any asynchronous activity arising | ||
281 | * out of elevator or throttling code. That would require elevator_exit() | ||
282 | * and blk_throtl_exit() to be called with queue lock initialized. | ||
283 | * | ||
393 | */ | 284 | */ |
394 | void blk_sync_queue(struct request_queue *q) | 285 | void blk_sync_queue(struct request_queue *q) |
395 | { | 286 | { |
396 | del_timer_sync(&q->unplug_timer); | ||
397 | del_timer_sync(&q->timeout); | 287 | del_timer_sync(&q->timeout); |
398 | cancel_work_sync(&q->unplug_work); | 288 | cancel_delayed_work_sync(&q->delay_work); |
399 | throtl_shutdown_timer_wq(q); | ||
400 | } | 289 | } |
401 | EXPORT_SYMBOL(blk_sync_queue); | 290 | EXPORT_SYMBOL(blk_sync_queue); |
402 | 291 | ||
403 | /** | 292 | /** |
404 | * __blk_run_queue - run a single device queue | 293 | * __blk_run_queue - run a single device queue |
405 | * @q: The queue to run | 294 | * @q: The queue to run |
406 | * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. | ||
407 | * | 295 | * |
408 | * Description: | 296 | * Description: |
409 | * See @blk_run_queue. This variant must be called with the queue lock | 297 | * See @blk_run_queue. This variant must be called with the queue lock |
410 | * held and interrupts disabled. | 298 | * held and interrupts disabled. |
411 | * | ||
412 | */ | 299 | */ |
413 | void __blk_run_queue(struct request_queue *q, bool force_kblockd) | 300 | void __blk_run_queue(struct request_queue *q) |
414 | { | 301 | { |
415 | blk_remove_plug(q); | ||
416 | |||
417 | if (unlikely(blk_queue_stopped(q))) | 302 | if (unlikely(blk_queue_stopped(q))) |
418 | return; | 303 | return; |
419 | 304 | ||
420 | if (elv_queue_empty(q)) | 305 | q->request_fn(q); |
421 | return; | ||
422 | |||
423 | /* | ||
424 | * Only recurse once to avoid overrunning the stack, let the unplug | ||
425 | * handling reinvoke the handler shortly if we already got there. | ||
426 | */ | ||
427 | if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { | ||
428 | q->request_fn(q); | ||
429 | queue_flag_clear(QUEUE_FLAG_REENTER, q); | ||
430 | } else { | ||
431 | queue_flag_set(QUEUE_FLAG_PLUGGED, q); | ||
432 | kblockd_schedule_work(q, &q->unplug_work); | ||
433 | } | ||
434 | } | 306 | } |
435 | EXPORT_SYMBOL(__blk_run_queue); | 307 | EXPORT_SYMBOL(__blk_run_queue); |
436 | 308 | ||
437 | /** | 309 | /** |
310 | * blk_run_queue_async - run a single device queue in workqueue context | ||
311 | * @q: The queue to run | ||
312 | * | ||
313 | * Description: | ||
314 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf | ||
315 | * of us. | ||
316 | */ | ||
317 | void blk_run_queue_async(struct request_queue *q) | ||
318 | { | ||
319 | if (likely(!blk_queue_stopped(q))) | ||
320 | queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); | ||
321 | } | ||
322 | EXPORT_SYMBOL(blk_run_queue_async); | ||
323 | |||
324 | /** | ||
438 | * blk_run_queue - run a single device queue | 325 | * blk_run_queue - run a single device queue |
439 | * @q: The queue to run | 326 | * @q: The queue to run |
440 | * | 327 | * |
@@ -447,7 +334,7 @@ void blk_run_queue(struct request_queue *q) | |||
447 | unsigned long flags; | 334 | unsigned long flags; |
448 | 335 | ||
449 | spin_lock_irqsave(q->queue_lock, flags); | 336 | spin_lock_irqsave(q->queue_lock, flags); |
450 | __blk_run_queue(q, false); | 337 | __blk_run_queue(q); |
451 | spin_unlock_irqrestore(q->queue_lock, flags); | 338 | spin_unlock_irqrestore(q->queue_lock, flags); |
452 | } | 339 | } |
453 | EXPORT_SYMBOL(blk_run_queue); | 340 | EXPORT_SYMBOL(blk_run_queue); |
@@ -457,6 +344,11 @@ void blk_put_queue(struct request_queue *q) | |||
457 | kobject_put(&q->kobj); | 344 | kobject_put(&q->kobj); |
458 | } | 345 | } |
459 | 346 | ||
347 | /* | ||
348 | * Note: If a driver supplied the queue lock, it should not zap that lock | ||
349 | * unexpectedly as some queue cleanup components like elevator_exit() and | ||
350 | * blk_throtl_exit() need queue lock. | ||
351 | */ | ||
460 | void blk_cleanup_queue(struct request_queue *q) | 352 | void blk_cleanup_queue(struct request_queue *q) |
461 | { | 353 | { |
462 | /* | 354 | /* |
@@ -475,6 +367,8 @@ void blk_cleanup_queue(struct request_queue *q) | |||
475 | if (q->elevator) | 367 | if (q->elevator) |
476 | elevator_exit(q->elevator); | 368 | elevator_exit(q->elevator); |
477 | 369 | ||
370 | blk_throtl_exit(q); | ||
371 | |||
478 | blk_put_queue(q); | 372 | blk_put_queue(q); |
479 | } | 373 | } |
480 | EXPORT_SYMBOL(blk_cleanup_queue); | 374 | EXPORT_SYMBOL(blk_cleanup_queue); |
@@ -517,8 +411,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
517 | if (!q) | 411 | if (!q) |
518 | return NULL; | 412 | return NULL; |
519 | 413 | ||
520 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; | ||
521 | q->backing_dev_info.unplug_io_data = q; | ||
522 | q->backing_dev_info.ra_pages = | 414 | q->backing_dev_info.ra_pages = |
523 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 415 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
524 | q->backing_dev_info.state = 0; | 416 | q->backing_dev_info.state = 0; |
@@ -538,17 +430,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
538 | 430 | ||
539 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | 431 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
540 | laptop_mode_timer_fn, (unsigned long) q); | 432 | laptop_mode_timer_fn, (unsigned long) q); |
541 | init_timer(&q->unplug_timer); | ||
542 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 433 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
543 | INIT_LIST_HEAD(&q->timeout_list); | 434 | INIT_LIST_HEAD(&q->timeout_list); |
544 | INIT_LIST_HEAD(&q->pending_flushes); | 435 | INIT_LIST_HEAD(&q->flush_queue[0]); |
545 | INIT_WORK(&q->unplug_work, blk_unplug_work); | 436 | INIT_LIST_HEAD(&q->flush_queue[1]); |
437 | INIT_LIST_HEAD(&q->flush_data_in_flight); | ||
438 | INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); | ||
546 | 439 | ||
547 | kobject_init(&q->kobj, &blk_queue_ktype); | 440 | kobject_init(&q->kobj, &blk_queue_ktype); |
548 | 441 | ||
549 | mutex_init(&q->sysfs_lock); | 442 | mutex_init(&q->sysfs_lock); |
550 | spin_lock_init(&q->__queue_lock); | 443 | spin_lock_init(&q->__queue_lock); |
551 | 444 | ||
445 | /* | ||
446 | * By default initialize queue_lock to internal lock and driver can | ||
447 | * override it later if need be. | ||
448 | */ | ||
449 | q->queue_lock = &q->__queue_lock; | ||
450 | |||
552 | return q; | 451 | return q; |
553 | } | 452 | } |
554 | EXPORT_SYMBOL(blk_alloc_queue_node); | 453 | EXPORT_SYMBOL(blk_alloc_queue_node); |
@@ -631,9 +530,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, | |||
631 | q->request_fn = rfn; | 530 | q->request_fn = rfn; |
632 | q->prep_rq_fn = NULL; | 531 | q->prep_rq_fn = NULL; |
633 | q->unprep_rq_fn = NULL; | 532 | q->unprep_rq_fn = NULL; |
634 | q->unplug_fn = generic_unplug_device; | ||
635 | q->queue_flags = QUEUE_FLAG_DEFAULT; | 533 | q->queue_flags = QUEUE_FLAG_DEFAULT; |
636 | q->queue_lock = lock; | 534 | |
535 | /* Override internal queue lock with supplied lock pointer */ | ||
536 | if (lock) | ||
537 | q->queue_lock = lock; | ||
637 | 538 | ||
638 | /* | 539 | /* |
639 | * This also sets hw/phys segments, boundary and size | 540 | * This also sets hw/phys segments, boundary and size |
@@ -666,6 +567,8 @@ int blk_get_queue(struct request_queue *q) | |||
666 | 567 | ||
667 | static inline void blk_free_request(struct request_queue *q, struct request *rq) | 568 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
668 | { | 569 | { |
570 | BUG_ON(rq->cmd_flags & REQ_ON_PLUG); | ||
571 | |||
669 | if (rq->cmd_flags & REQ_ELVPRIV) | 572 | if (rq->cmd_flags & REQ_ELVPRIV) |
670 | elv_put_request(q, rq); | 573 | elv_put_request(q, rq); |
671 | mempool_free(rq, q->rq.rq_pool); | 574 | mempool_free(rq, q->rq.rq_pool); |
@@ -762,6 +665,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) | |||
762 | } | 665 | } |
763 | 666 | ||
764 | /* | 667 | /* |
668 | * Determine if elevator data should be initialized when allocating the | ||
669 | * request associated with @bio. | ||
670 | */ | ||
671 | static bool blk_rq_should_init_elevator(struct bio *bio) | ||
672 | { | ||
673 | if (!bio) | ||
674 | return true; | ||
675 | |||
676 | /* | ||
677 | * Flush requests do not use the elevator so skip initialization. | ||
678 | * This allows a request to share the flush and elevator data. | ||
679 | */ | ||
680 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) | ||
681 | return false; | ||
682 | |||
683 | return true; | ||
684 | } | ||
685 | |||
686 | /* | ||
765 | * Get a free request, queue_lock must be held. | 687 | * Get a free request, queue_lock must be held. |
766 | * Returns NULL on failure, with queue_lock held. | 688 | * Returns NULL on failure, with queue_lock held. |
767 | * Returns !NULL on success, with queue_lock *not held*. | 689 | * Returns !NULL on success, with queue_lock *not held*. |
@@ -773,7 +695,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
773 | struct request_list *rl = &q->rq; | 695 | struct request_list *rl = &q->rq; |
774 | struct io_context *ioc = NULL; | 696 | struct io_context *ioc = NULL; |
775 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 697 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
776 | int may_queue, priv; | 698 | int may_queue, priv = 0; |
777 | 699 | ||
778 | may_queue = elv_may_queue(q, rw_flags); | 700 | may_queue = elv_may_queue(q, rw_flags); |
779 | if (may_queue == ELV_MQUEUE_NO) | 701 | if (may_queue == ELV_MQUEUE_NO) |
@@ -817,9 +739,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
817 | rl->count[is_sync]++; | 739 | rl->count[is_sync]++; |
818 | rl->starved[is_sync] = 0; | 740 | rl->starved[is_sync] = 0; |
819 | 741 | ||
820 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 742 | if (blk_rq_should_init_elevator(bio)) { |
821 | if (priv) | 743 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
822 | rl->elvpriv++; | 744 | if (priv) |
745 | rl->elvpriv++; | ||
746 | } | ||
823 | 747 | ||
824 | if (blk_queue_io_stat(q)) | 748 | if (blk_queue_io_stat(q)) |
825 | rw_flags |= REQ_IO_STAT; | 749 | rw_flags |= REQ_IO_STAT; |
@@ -866,8 +790,8 @@ out: | |||
866 | } | 790 | } |
867 | 791 | ||
868 | /* | 792 | /* |
869 | * No available requests for this queue, unplug the device and wait for some | 793 | * No available requests for this queue, wait for some requests to become |
870 | * requests to become available. | 794 | * available. |
871 | * | 795 | * |
872 | * Called with q->queue_lock held, and returns with it unlocked. | 796 | * Called with q->queue_lock held, and returns with it unlocked. |
873 | */ | 797 | */ |
@@ -888,7 +812,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, | |||
888 | 812 | ||
889 | trace_block_sleeprq(q, bio, rw_flags & 1); | 813 | trace_block_sleeprq(q, bio, rw_flags & 1); |
890 | 814 | ||
891 | __generic_unplug_device(q); | ||
892 | spin_unlock_irq(q->queue_lock); | 815 | spin_unlock_irq(q->queue_lock); |
893 | io_schedule(); | 816 | io_schedule(); |
894 | 817 | ||
@@ -1010,6 +933,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) | |||
1010 | } | 933 | } |
1011 | EXPORT_SYMBOL(blk_requeue_request); | 934 | EXPORT_SYMBOL(blk_requeue_request); |
1012 | 935 | ||
936 | static void add_acct_request(struct request_queue *q, struct request *rq, | ||
937 | int where) | ||
938 | { | ||
939 | drive_stat_acct(rq, 1); | ||
940 | __elv_add_request(q, rq, where); | ||
941 | } | ||
942 | |||
1013 | /** | 943 | /** |
1014 | * blk_insert_request - insert a special request into a request queue | 944 | * blk_insert_request - insert a special request into a request queue |
1015 | * @q: request queue where request should be inserted | 945 | * @q: request queue where request should be inserted |
@@ -1052,9 +982,8 @@ void blk_insert_request(struct request_queue *q, struct request *rq, | |||
1052 | if (blk_rq_tagged(rq)) | 982 | if (blk_rq_tagged(rq)) |
1053 | blk_queue_end_tag(q, rq); | 983 | blk_queue_end_tag(q, rq); |
1054 | 984 | ||
1055 | drive_stat_acct(rq, 1); | 985 | add_acct_request(q, rq, where); |
1056 | __elv_add_request(q, rq, where, 0); | 986 | __blk_run_queue(q); |
1057 | __blk_run_queue(q, false); | ||
1058 | spin_unlock_irqrestore(q->queue_lock, flags); | 987 | spin_unlock_irqrestore(q->queue_lock, flags); |
1059 | } | 988 | } |
1060 | EXPORT_SYMBOL(blk_insert_request); | 989 | EXPORT_SYMBOL(blk_insert_request); |
@@ -1174,6 +1103,113 @@ void blk_add_request_payload(struct request *rq, struct page *page, | |||
1174 | } | 1103 | } |
1175 | EXPORT_SYMBOL_GPL(blk_add_request_payload); | 1104 | EXPORT_SYMBOL_GPL(blk_add_request_payload); |
1176 | 1105 | ||
1106 | static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | ||
1107 | struct bio *bio) | ||
1108 | { | ||
1109 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1110 | |||
1111 | /* | ||
1112 | * Debug stuff, kill later | ||
1113 | */ | ||
1114 | if (!rq_mergeable(req)) { | ||
1115 | blk_dump_rq_flags(req, "back"); | ||
1116 | return false; | ||
1117 | } | ||
1118 | |||
1119 | if (!ll_back_merge_fn(q, req, bio)) | ||
1120 | return false; | ||
1121 | |||
1122 | trace_block_bio_backmerge(q, bio); | ||
1123 | |||
1124 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1125 | blk_rq_set_mixed_merge(req); | ||
1126 | |||
1127 | req->biotail->bi_next = bio; | ||
1128 | req->biotail = bio; | ||
1129 | req->__data_len += bio->bi_size; | ||
1130 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1131 | |||
1132 | drive_stat_acct(req, 0); | ||
1133 | return true; | ||
1134 | } | ||
1135 | |||
1136 | static bool bio_attempt_front_merge(struct request_queue *q, | ||
1137 | struct request *req, struct bio *bio) | ||
1138 | { | ||
1139 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1140 | sector_t sector; | ||
1141 | |||
1142 | /* | ||
1143 | * Debug stuff, kill later | ||
1144 | */ | ||
1145 | if (!rq_mergeable(req)) { | ||
1146 | blk_dump_rq_flags(req, "front"); | ||
1147 | return false; | ||
1148 | } | ||
1149 | |||
1150 | if (!ll_front_merge_fn(q, req, bio)) | ||
1151 | return false; | ||
1152 | |||
1153 | trace_block_bio_frontmerge(q, bio); | ||
1154 | |||
1155 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1156 | blk_rq_set_mixed_merge(req); | ||
1157 | |||
1158 | sector = bio->bi_sector; | ||
1159 | |||
1160 | bio->bi_next = req->bio; | ||
1161 | req->bio = bio; | ||
1162 | |||
1163 | /* | ||
1164 | * may not be valid. if the low level driver said | ||
1165 | * it didn't need a bounce buffer then it better | ||
1166 | * not touch req->buffer either... | ||
1167 | */ | ||
1168 | req->buffer = bio_data(bio); | ||
1169 | req->__sector = bio->bi_sector; | ||
1170 | req->__data_len += bio->bi_size; | ||
1171 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1172 | |||
1173 | drive_stat_acct(req, 0); | ||
1174 | return true; | ||
1175 | } | ||
1176 | |||
1177 | /* | ||
1178 | * Attempts to merge with the plugged list in the current process. Returns | ||
1179 | * true if merge was successful, otherwise false. | ||
1180 | */ | ||
1181 | static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, | ||
1182 | struct bio *bio) | ||
1183 | { | ||
1184 | struct blk_plug *plug; | ||
1185 | struct request *rq; | ||
1186 | bool ret = false; | ||
1187 | |||
1188 | plug = tsk->plug; | ||
1189 | if (!plug) | ||
1190 | goto out; | ||
1191 | |||
1192 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { | ||
1193 | int el_ret; | ||
1194 | |||
1195 | if (rq->q != q) | ||
1196 | continue; | ||
1197 | |||
1198 | el_ret = elv_try_merge(rq, bio); | ||
1199 | if (el_ret == ELEVATOR_BACK_MERGE) { | ||
1200 | ret = bio_attempt_back_merge(q, rq, bio); | ||
1201 | if (ret) | ||
1202 | break; | ||
1203 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1204 | ret = bio_attempt_front_merge(q, rq, bio); | ||
1205 | if (ret) | ||
1206 | break; | ||
1207 | } | ||
1208 | } | ||
1209 | out: | ||
1210 | return ret; | ||
1211 | } | ||
1212 | |||
1177 | void init_request_from_bio(struct request *req, struct bio *bio) | 1213 | void init_request_from_bio(struct request *req, struct bio *bio) |
1178 | { | 1214 | { |
1179 | req->cpu = bio->bi_comp_cpu; | 1215 | req->cpu = bio->bi_comp_cpu; |
@@ -1189,26 +1225,12 @@ void init_request_from_bio(struct request *req, struct bio *bio) | |||
1189 | blk_rq_bio_prep(req->q, req, bio); | 1225 | blk_rq_bio_prep(req->q, req, bio); |
1190 | } | 1226 | } |
1191 | 1227 | ||
1192 | /* | ||
1193 | * Only disabling plugging for non-rotational devices if it does tagging | ||
1194 | * as well, otherwise we do need the proper merging | ||
1195 | */ | ||
1196 | static inline bool queue_should_plug(struct request_queue *q) | ||
1197 | { | ||
1198 | return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); | ||
1199 | } | ||
1200 | |||
1201 | static int __make_request(struct request_queue *q, struct bio *bio) | 1228 | static int __make_request(struct request_queue *q, struct bio *bio) |
1202 | { | 1229 | { |
1203 | struct request *req; | ||
1204 | int el_ret; | ||
1205 | unsigned int bytes = bio->bi_size; | ||
1206 | const unsigned short prio = bio_prio(bio); | ||
1207 | const bool sync = !!(bio->bi_rw & REQ_SYNC); | 1230 | const bool sync = !!(bio->bi_rw & REQ_SYNC); |
1208 | const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); | 1231 | struct blk_plug *plug; |
1209 | const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; | 1232 | int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; |
1210 | int where = ELEVATOR_INSERT_SORT; | 1233 | struct request *req; |
1211 | int rw_flags; | ||
1212 | 1234 | ||
1213 | /* | 1235 | /* |
1214 | * low level driver can indicate that it wants pages above a | 1236 | * low level driver can indicate that it wants pages above a |
@@ -1217,78 +1239,36 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1217 | */ | 1239 | */ |
1218 | blk_queue_bounce(q, &bio); | 1240 | blk_queue_bounce(q, &bio); |
1219 | 1241 | ||
1220 | spin_lock_irq(q->queue_lock); | ||
1221 | |||
1222 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { | 1242 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { |
1223 | where = ELEVATOR_INSERT_FRONT; | 1243 | spin_lock_irq(q->queue_lock); |
1244 | where = ELEVATOR_INSERT_FLUSH; | ||
1224 | goto get_rq; | 1245 | goto get_rq; |
1225 | } | 1246 | } |
1226 | 1247 | ||
1227 | if (elv_queue_empty(q)) | 1248 | /* |
1228 | goto get_rq; | 1249 | * Check if we can merge with the plugged list before grabbing |
1229 | 1250 | * any locks. | |
1230 | el_ret = elv_merge(q, &req, bio); | 1251 | */ |
1231 | switch (el_ret) { | 1252 | if (attempt_plug_merge(current, q, bio)) |
1232 | case ELEVATOR_BACK_MERGE: | ||
1233 | BUG_ON(!rq_mergeable(req)); | ||
1234 | |||
1235 | if (!ll_back_merge_fn(q, req, bio)) | ||
1236 | break; | ||
1237 | |||
1238 | trace_block_bio_backmerge(q, bio); | ||
1239 | |||
1240 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1241 | blk_rq_set_mixed_merge(req); | ||
1242 | |||
1243 | req->biotail->bi_next = bio; | ||
1244 | req->biotail = bio; | ||
1245 | req->__data_len += bytes; | ||
1246 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1247 | if (!blk_rq_cpu_valid(req)) | ||
1248 | req->cpu = bio->bi_comp_cpu; | ||
1249 | drive_stat_acct(req, 0); | ||
1250 | elv_bio_merged(q, req, bio); | ||
1251 | if (!attempt_back_merge(q, req)) | ||
1252 | elv_merged_request(q, req, el_ret); | ||
1253 | goto out; | 1253 | goto out; |
1254 | 1254 | ||
1255 | case ELEVATOR_FRONT_MERGE: | 1255 | spin_lock_irq(q->queue_lock); |
1256 | BUG_ON(!rq_mergeable(req)); | ||
1257 | |||
1258 | if (!ll_front_merge_fn(q, req, bio)) | ||
1259 | break; | ||
1260 | |||
1261 | trace_block_bio_frontmerge(q, bio); | ||
1262 | 1256 | ||
1263 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { | 1257 | el_ret = elv_merge(q, &req, bio); |
1264 | blk_rq_set_mixed_merge(req); | 1258 | if (el_ret == ELEVATOR_BACK_MERGE) { |
1265 | req->cmd_flags &= ~REQ_FAILFAST_MASK; | 1259 | BUG_ON(req->cmd_flags & REQ_ON_PLUG); |
1266 | req->cmd_flags |= ff; | 1260 | if (bio_attempt_back_merge(q, req, bio)) { |
1261 | if (!attempt_back_merge(q, req)) | ||
1262 | elv_merged_request(q, req, el_ret); | ||
1263 | goto out_unlock; | ||
1264 | } | ||
1265 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1266 | BUG_ON(req->cmd_flags & REQ_ON_PLUG); | ||
1267 | if (bio_attempt_front_merge(q, req, bio)) { | ||
1268 | if (!attempt_front_merge(q, req)) | ||
1269 | elv_merged_request(q, req, el_ret); | ||
1270 | goto out_unlock; | ||
1267 | } | 1271 | } |
1268 | |||
1269 | bio->bi_next = req->bio; | ||
1270 | req->bio = bio; | ||
1271 | |||
1272 | /* | ||
1273 | * may not be valid. if the low level driver said | ||
1274 | * it didn't need a bounce buffer then it better | ||
1275 | * not touch req->buffer either... | ||
1276 | */ | ||
1277 | req->buffer = bio_data(bio); | ||
1278 | req->__sector = bio->bi_sector; | ||
1279 | req->__data_len += bytes; | ||
1280 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1281 | if (!blk_rq_cpu_valid(req)) | ||
1282 | req->cpu = bio->bi_comp_cpu; | ||
1283 | drive_stat_acct(req, 0); | ||
1284 | elv_bio_merged(q, req, bio); | ||
1285 | if (!attempt_front_merge(q, req)) | ||
1286 | elv_merged_request(q, req, el_ret); | ||
1287 | goto out; | ||
1288 | |||
1289 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | ||
1290 | default: | ||
1291 | ; | ||
1292 | } | 1272 | } |
1293 | 1273 | ||
1294 | get_rq: | 1274 | get_rq: |
@@ -1315,20 +1295,43 @@ get_rq: | |||
1315 | */ | 1295 | */ |
1316 | init_request_from_bio(req, bio); | 1296 | init_request_from_bio(req, bio); |
1317 | 1297 | ||
1318 | spin_lock_irq(q->queue_lock); | ||
1319 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || | 1298 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || |
1320 | bio_flagged(bio, BIO_CPU_AFFINE)) | 1299 | bio_flagged(bio, BIO_CPU_AFFINE)) { |
1321 | req->cpu = blk_cpu_to_group(smp_processor_id()); | 1300 | req->cpu = blk_cpu_to_group(get_cpu()); |
1322 | if (queue_should_plug(q) && elv_queue_empty(q)) | 1301 | put_cpu(); |
1323 | blk_plug_device(q); | 1302 | } |
1324 | 1303 | ||
1325 | /* insert the request into the elevator */ | 1304 | plug = current->plug; |
1326 | drive_stat_acct(req, 1); | 1305 | if (plug) { |
1327 | __elv_add_request(q, req, where, 0); | 1306 | /* |
1307 | * If this is the first request added after a plug, fire | ||
1308 | * of a plug trace. If others have been added before, check | ||
1309 | * if we have multiple devices in this plug. If so, make a | ||
1310 | * note to sort the list before dispatch. | ||
1311 | */ | ||
1312 | if (list_empty(&plug->list)) | ||
1313 | trace_block_plug(q); | ||
1314 | else if (!plug->should_sort) { | ||
1315 | struct request *__rq; | ||
1316 | |||
1317 | __rq = list_entry_rq(plug->list.prev); | ||
1318 | if (__rq->q != q) | ||
1319 | plug->should_sort = 1; | ||
1320 | } | ||
1321 | /* | ||
1322 | * Debug flag, kill later | ||
1323 | */ | ||
1324 | req->cmd_flags |= REQ_ON_PLUG; | ||
1325 | list_add_tail(&req->queuelist, &plug->list); | ||
1326 | drive_stat_acct(req, 1); | ||
1327 | } else { | ||
1328 | spin_lock_irq(q->queue_lock); | ||
1329 | add_acct_request(q, req, where); | ||
1330 | __blk_run_queue(q); | ||
1331 | out_unlock: | ||
1332 | spin_unlock_irq(q->queue_lock); | ||
1333 | } | ||
1328 | out: | 1334 | out: |
1329 | if (unplug || !queue_should_plug(q)) | ||
1330 | __generic_unplug_device(q); | ||
1331 | spin_unlock_irq(q->queue_lock); | ||
1332 | return 0; | 1335 | return 0; |
1333 | } | 1336 | } |
1334 | 1337 | ||
@@ -1731,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | |||
1731 | */ | 1734 | */ |
1732 | BUG_ON(blk_queued_rq(rq)); | 1735 | BUG_ON(blk_queued_rq(rq)); |
1733 | 1736 | ||
1734 | drive_stat_acct(rq, 1); | 1737 | add_acct_request(q, rq, ELEVATOR_INSERT_BACK); |
1735 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | ||
1736 | |||
1737 | spin_unlock_irqrestore(q->queue_lock, flags); | 1738 | spin_unlock_irqrestore(q->queue_lock, flags); |
1738 | 1739 | ||
1739 | return 0; | 1740 | return 0; |
@@ -1805,7 +1806,7 @@ static void blk_account_io_done(struct request *req) | |||
1805 | * normal IO on queueing nor completion. Accounting the | 1806 | * normal IO on queueing nor completion. Accounting the |
1806 | * containing request is enough. | 1807 | * containing request is enough. |
1807 | */ | 1808 | */ |
1808 | if (blk_do_io_stat(req) && req != &req->q->flush_rq) { | 1809 | if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { |
1809 | unsigned long duration = jiffies - req->start_time; | 1810 | unsigned long duration = jiffies - req->start_time; |
1810 | const int rw = rq_data_dir(req); | 1811 | const int rw = rq_data_dir(req); |
1811 | struct hd_struct *part; | 1812 | struct hd_struct *part; |
@@ -2162,7 +2163,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2162 | * size, something has gone terribly wrong. | 2163 | * size, something has gone terribly wrong. |
2163 | */ | 2164 | */ |
2164 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { | 2165 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { |
2165 | printk(KERN_ERR "blk: request botched\n"); | 2166 | blk_dump_rq_flags(req, "request botched"); |
2166 | req->__data_len = blk_rq_cur_bytes(req); | 2167 | req->__data_len = blk_rq_cur_bytes(req); |
2167 | } | 2168 | } |
2168 | 2169 | ||
@@ -2628,6 +2629,166 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) | |||
2628 | } | 2629 | } |
2629 | EXPORT_SYMBOL(kblockd_schedule_work); | 2630 | EXPORT_SYMBOL(kblockd_schedule_work); |
2630 | 2631 | ||
2632 | int kblockd_schedule_delayed_work(struct request_queue *q, | ||
2633 | struct delayed_work *dwork, unsigned long delay) | ||
2634 | { | ||
2635 | return queue_delayed_work(kblockd_workqueue, dwork, delay); | ||
2636 | } | ||
2637 | EXPORT_SYMBOL(kblockd_schedule_delayed_work); | ||
2638 | |||
2639 | #define PLUG_MAGIC 0x91827364 | ||
2640 | |||
2641 | void blk_start_plug(struct blk_plug *plug) | ||
2642 | { | ||
2643 | struct task_struct *tsk = current; | ||
2644 | |||
2645 | plug->magic = PLUG_MAGIC; | ||
2646 | INIT_LIST_HEAD(&plug->list); | ||
2647 | INIT_LIST_HEAD(&plug->cb_list); | ||
2648 | plug->should_sort = 0; | ||
2649 | |||
2650 | /* | ||
2651 | * If this is a nested plug, don't actually assign it. It will be | ||
2652 | * flushed on its own. | ||
2653 | */ | ||
2654 | if (!tsk->plug) { | ||
2655 | /* | ||
2656 | * Store ordering should not be needed here, since a potential | ||
2657 | * preempt will imply a full memory barrier | ||
2658 | */ | ||
2659 | tsk->plug = plug; | ||
2660 | } | ||
2661 | } | ||
2662 | EXPORT_SYMBOL(blk_start_plug); | ||
2663 | |||
2664 | static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
2665 | { | ||
2666 | struct request *rqa = container_of(a, struct request, queuelist); | ||
2667 | struct request *rqb = container_of(b, struct request, queuelist); | ||
2668 | |||
2669 | return !(rqa->q <= rqb->q); | ||
2670 | } | ||
2671 | |||
2672 | /* | ||
2673 | * If 'from_schedule' is true, then postpone the dispatch of requests | ||
2674 | * until a safe kblockd context. We do this to avoid accidental big | ||
2675 | * additional stack usage in driver dispatch, in places where the original | ||
2676 | * plugger did not intend it. | ||
2677 | */ | ||
2678 | static void queue_unplugged(struct request_queue *q, unsigned int depth, | ||
2679 | bool from_schedule) | ||
2680 | __releases(q->queue_lock) | ||
2681 | { | ||
2682 | trace_block_unplug(q, depth, !from_schedule); | ||
2683 | |||
2684 | /* | ||
2685 | * If we are punting this to kblockd, then we can safely drop | ||
2686 | * the queue_lock before waking kblockd (which needs to take | ||
2687 | * this lock). | ||
2688 | */ | ||
2689 | if (from_schedule) { | ||
2690 | spin_unlock(q->queue_lock); | ||
2691 | blk_run_queue_async(q); | ||
2692 | } else { | ||
2693 | __blk_run_queue(q); | ||
2694 | spin_unlock(q->queue_lock); | ||
2695 | } | ||
2696 | |||
2697 | } | ||
2698 | |||
2699 | static void flush_plug_callbacks(struct blk_plug *plug) | ||
2700 | { | ||
2701 | LIST_HEAD(callbacks); | ||
2702 | |||
2703 | if (list_empty(&plug->cb_list)) | ||
2704 | return; | ||
2705 | |||
2706 | list_splice_init(&plug->cb_list, &callbacks); | ||
2707 | |||
2708 | while (!list_empty(&callbacks)) { | ||
2709 | struct blk_plug_cb *cb = list_first_entry(&callbacks, | ||
2710 | struct blk_plug_cb, | ||
2711 | list); | ||
2712 | list_del(&cb->list); | ||
2713 | cb->callback(cb); | ||
2714 | } | ||
2715 | } | ||
2716 | |||
2717 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | ||
2718 | { | ||
2719 | struct request_queue *q; | ||
2720 | unsigned long flags; | ||
2721 | struct request *rq; | ||
2722 | LIST_HEAD(list); | ||
2723 | unsigned int depth; | ||
2724 | |||
2725 | BUG_ON(plug->magic != PLUG_MAGIC); | ||
2726 | |||
2727 | flush_plug_callbacks(plug); | ||
2728 | if (list_empty(&plug->list)) | ||
2729 | return; | ||
2730 | |||
2731 | list_splice_init(&plug->list, &list); | ||
2732 | |||
2733 | if (plug->should_sort) { | ||
2734 | list_sort(NULL, &list, plug_rq_cmp); | ||
2735 | plug->should_sort = 0; | ||
2736 | } | ||
2737 | |||
2738 | q = NULL; | ||
2739 | depth = 0; | ||
2740 | |||
2741 | /* | ||
2742 | * Save and disable interrupts here, to avoid doing it for every | ||
2743 | * queue lock we have to take. | ||
2744 | */ | ||
2745 | local_irq_save(flags); | ||
2746 | while (!list_empty(&list)) { | ||
2747 | rq = list_entry_rq(list.next); | ||
2748 | list_del_init(&rq->queuelist); | ||
2749 | BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); | ||
2750 | BUG_ON(!rq->q); | ||
2751 | if (rq->q != q) { | ||
2752 | /* | ||
2753 | * This drops the queue lock | ||
2754 | */ | ||
2755 | if (q) | ||
2756 | queue_unplugged(q, depth, from_schedule); | ||
2757 | q = rq->q; | ||
2758 | depth = 0; | ||
2759 | spin_lock(q->queue_lock); | ||
2760 | } | ||
2761 | rq->cmd_flags &= ~REQ_ON_PLUG; | ||
2762 | |||
2763 | /* | ||
2764 | * rq is already accounted, so use raw insert | ||
2765 | */ | ||
2766 | if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
2767 | __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); | ||
2768 | else | ||
2769 | __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); | ||
2770 | |||
2771 | depth++; | ||
2772 | } | ||
2773 | |||
2774 | /* | ||
2775 | * This drops the queue lock | ||
2776 | */ | ||
2777 | if (q) | ||
2778 | queue_unplugged(q, depth, from_schedule); | ||
2779 | |||
2780 | local_irq_restore(flags); | ||
2781 | } | ||
2782 | |||
2783 | void blk_finish_plug(struct blk_plug *plug) | ||
2784 | { | ||
2785 | blk_flush_plug_list(plug, false); | ||
2786 | |||
2787 | if (plug == current->plug) | ||
2788 | current->plug = NULL; | ||
2789 | } | ||
2790 | EXPORT_SYMBOL(blk_finish_plug); | ||
2791 | |||
2631 | int __init blk_dev_init(void) | 2792 | int __init blk_dev_init(void) |
2632 | { | 2793 | { |
2633 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * | 2794 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * |
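[Editor's note: a hedged sketch, not part of the diff.] The explicit device plugging removed above is replaced by an on-stack plug owned by the submitting task, using blk_start_plug()/blk_finish_plug() added in this file. A minimal usage sketch; the function and variable names are illustrative:

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	static void example_submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* bios below queue up on current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(READ, bios[i]);
		blk_finish_plug(&plug);		/* sorts by queue, then dispatches */
	}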
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02acd..81e31819a597 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
54 | rq->end_io = done; | 54 | rq->end_io = done; |
55 | WARN_ON(irqs_disabled()); | 55 | WARN_ON(irqs_disabled()); |
56 | spin_lock_irq(q->queue_lock); | 56 | spin_lock_irq(q->queue_lock); |
57 | __elv_add_request(q, rq, where, 1); | 57 | __elv_add_request(q, rq, where); |
58 | __generic_unplug_device(q); | 58 | __blk_run_queue(q); |
59 | /* the queue is stopped so it won't be plugged+unplugged */ | 59 | /* the queue is stopped so it won't be plugged+unplugged */ |
60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) | 60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) |
61 | q->request_fn(q); | 61 | q->request_fn(q); |
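[Editor's note: a hedged sketch, not part of the diff.] For callers of blk_execute_rq_nowait() nothing changes; the request is now kicked off through __blk_run_queue() rather than an unplug. A sketch of such a caller, where the prepared 'rq' and the 'my_end_io' completion are assumed to be set up elsewhere:

	static void example_issue_nowait(struct request_queue *q, struct request *rq,
					 rq_end_io_fn *my_end_io)
	{
		rq->cmd_type = REQ_TYPE_SPECIAL;	/* driver-private command */
		blk_execute_rq_nowait(q, NULL, rq, 0, my_end_io);
	}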
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b27d0208611b..6c9b5e189e62 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@ | |||
1 | /* | 1 | /* |
2 | * Functions to sequence FLUSH and FUA writes. | 2 | * Functions to sequence FLUSH and FUA writes. |
3 | * | ||
4 | * Copyright (C) 2011 Max Planck Institute for Gravitational Physics | ||
5 | * Copyright (C) 2011 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three | ||
10 | * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request | ||
11 | * properties and hardware capability. | ||
12 | * | ||
13 | * If a request doesn't have data, only REQ_FLUSH makes sense, which | ||
14 | * indicates a simple flush request. If there is data, REQ_FLUSH indicates | ||
15 | * that the device cache should be flushed before the data is executed, and | ||
16 | * REQ_FUA means that the data must be on non-volatile media on request | ||
17 | * completion. | ||
18 | * | ||
19 | * If the device doesn't have writeback cache, FLUSH and FUA don't make any | ||
20 | * difference. The requests are either completed immediately if there's no | ||
21 | * data or executed as normal requests otherwise. | ||
22 | * | ||
23 | * If the device has writeback cache and supports FUA, REQ_FLUSH is | ||
24 | * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. | ||
25 | * | ||
26 | * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is | ||
27 | * translated to PREFLUSH and REQ_FUA to POSTFLUSH. | ||
28 | * | ||
29 | * The actual execution of flush is double buffered. Whenever a request | ||
30 | * needs to execute PRE or POSTFLUSH, it queues at | ||
31 | * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a | ||
32 | * flush is issued and the pending_idx is toggled. When the flush | ||
33 | * completes, all the requests which were pending are proceeded to the next | ||
34 | * step. This allows arbitrary merging of different types of FLUSH/FUA | ||
35 | * requests. | ||
36 | * | ||
37 | * Currently, the following conditions are used to determine when to issue | ||
38 | * flush. | ||
39 | * | ||
40 | * C1. At any given time, only one flush shall be in progress. This makes | ||
41 | * double buffering sufficient. | ||
42 | * | ||
43 | * C2. Flush is deferred if any request is executing DATA of its sequence. | ||
44 | * This avoids issuing separate POSTFLUSHes for requests which shared | ||
45 | * PREFLUSH. | ||
46 | * | ||
47 | * C3. The second condition is ignored if there is a request which has | ||
48 | * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid | ||
49 | * starvation in the unlikely case where there are continuous stream of | ||
50 | * FUA (without FLUSH) requests. | ||
51 | * | ||
52 | * For devices which support FUA, it isn't clear whether C2 (and thus C3) | ||
53 | * is beneficial. | ||
54 | * | ||
55 | * Note that a sequenced FLUSH/FUA request with DATA is completed twice. | ||
56 | * Once while executing DATA and again after the whole sequence is | ||
57 | * complete. The first completion updates the contained bio but doesn't | ||
58 | * finish it so that the bio submitter is notified only after the whole | ||
59 | * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in | ||
60 | * req_bio_endio(). | ||
61 | * | ||
62 | * The above peculiarity requires that each FLUSH/FUA request has only one | ||
63 | * bio attached to it, which is guaranteed as they aren't allowed to be | ||
64 | * merged in the usual way. | ||
3 | */ | 65 | */ |
66 | |||
4 | #include <linux/kernel.h> | 67 | #include <linux/kernel.h> |
5 | #include <linux/module.h> | 68 | #include <linux/module.h> |
6 | #include <linux/bio.h> | 69 | #include <linux/bio.h> |
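[Editor's note: a hedged illustration, not part of the diff.] As a concrete instance of the sequencing described in the new header comment above, consider a journal commit block submitted as a flush+FUA write; 'bio' is assumed to be fully initialized by the caller:

	/* On a queue with a writeback cache and no FUA support, this single bio
	 * is expanded by the code in this file to PREFLUSH -> DATA -> POSTFLUSH;
	 * on a FUA-capable queue the POSTFLUSH step is dropped and REQ_FUA rides
	 * along with the data. */
	static void example_commit_write(struct bio *bio)
	{
		submit_bio(WRITE_FLUSH_FUA, bio);	/* REQ_WRITE|REQ_FLUSH|REQ_FUA (+ sync hints) */
	}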
@@ -11,58 +74,142 @@ | |||
11 | 74 | ||
12 | /* FLUSH/FUA sequences */ | 75 | /* FLUSH/FUA sequences */ |
13 | enum { | 76 | enum { |
14 | QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ | 77 | REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ |
15 | QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ | 78 | REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ |
16 | QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ | 79 | REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ |
17 | QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ | 80 | REQ_FSEQ_DONE = (1 << 3), |
18 | QUEUE_FSEQ_DONE = (1 << 4), | 81 | |
82 | REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | | ||
83 | REQ_FSEQ_POSTFLUSH, | ||
84 | |||
85 | /* | ||
86 | * If flush has been pending longer than the following timeout, | ||
87 | * it's issued even if flush_data requests are still in flight. | ||
88 | */ | ||
89 | FLUSH_PENDING_TIMEOUT = 5 * HZ, | ||
19 | }; | 90 | }; |
20 | 91 | ||
21 | static struct request *queue_next_fseq(struct request_queue *q); | 92 | static bool blk_kick_flush(struct request_queue *q); |
22 | 93 | ||
23 | unsigned blk_flush_cur_seq(struct request_queue *q) | 94 | static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) |
24 | { | 95 | { |
25 | if (!q->flush_seq) | 96 | unsigned int policy = 0; |
26 | return 0; | 97 | |
27 | return 1 << ffz(q->flush_seq); | 98 | if (fflags & REQ_FLUSH) { |
99 | if (rq->cmd_flags & REQ_FLUSH) | ||
100 | policy |= REQ_FSEQ_PREFLUSH; | ||
101 | if (blk_rq_sectors(rq)) | ||
102 | policy |= REQ_FSEQ_DATA; | ||
103 | if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) | ||
104 | policy |= REQ_FSEQ_POSTFLUSH; | ||
105 | } | ||
106 | return policy; | ||
28 | } | 107 | } |
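blk_flush_policy() above boils each request down to the steps it actually needs, given what the queue advertises. The standalone sketch below mirrors that logic with simplified flag names of my own; the printed cases show FUA being emulated with a postflush only when the device lacks native FUA, and everything collapsing to nothing on a write-through queue.

```c
/* Standalone model of the flush-policy decision; flag names are simplified. */
#include <stdio.h>

#define Q_FLUSH   (1 << 0)  /* device needs explicit cache flushes */
#define Q_FUA     (1 << 1)  /* device supports FUA writes natively */
#define RQ_FLUSH  (1 << 0)  /* request asked for a preceding flush */
#define RQ_FUA    (1 << 1)  /* request asked for FUA semantics */

#define P_PREFLUSH   (1 << 0)
#define P_DATA       (1 << 1)
#define P_POSTFLUSH  (1 << 2)

static unsigned int flush_policy(unsigned int qflags, unsigned int rqflags,
                                 unsigned int sectors)
{
    unsigned int policy = 0;

    if (qflags & Q_FLUSH) {
        if (rqflags & RQ_FLUSH)
            policy |= P_PREFLUSH;
        if (sectors)
            policy |= P_DATA;
        /* FUA must be emulated with a postflush if the device lacks it */
        if (!(qflags & Q_FUA) && (rqflags & RQ_FUA))
            policy |= P_POSTFLUSH;
    }
    return policy;
}

int main(void)
{
    /* FUA write, device without FUA: data plus trailing flush */
    printf("%x\n", flush_policy(Q_FLUSH, RQ_FUA, 8));           /* 0x6 */
    /* FUA write, device with FUA: plain data write, no emulation */
    printf("%x\n", flush_policy(Q_FLUSH | Q_FUA, RQ_FUA, 8));    /* 0x2 */
    /* empty flush: preflush only */
    printf("%x\n", flush_policy(Q_FLUSH, RQ_FLUSH, 0));          /* 0x1 */
    /* write-through device: nothing to sequence */
    printf("%x\n", flush_policy(0, RQ_FLUSH | RQ_FUA, 8));       /* 0x0 */
    return 0;
}
```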
29 | 108 | ||
30 | static struct request *blk_flush_complete_seq(struct request_queue *q, | 109 | static unsigned int blk_flush_cur_seq(struct request *rq) |
31 | unsigned seq, int error) | ||
32 | { | 110 | { |
33 | struct request *next_rq = NULL; | 111 | return 1 << ffz(rq->flush.seq); |
34 | 112 | } | |
35 | if (error && !q->flush_err) | 113 | |
36 | q->flush_err = error; | 114 | static void blk_flush_restore_request(struct request *rq) |
37 | 115 | { | |
38 | BUG_ON(q->flush_seq & seq); | 116 | /* |
39 | q->flush_seq |= seq; | 117 | * After flush data completion, @rq->bio is %NULL but we need to |
40 | 118 | * complete the bio again. @rq->biotail is guaranteed to equal the | |
41 | if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { | 119 | * original @rq->bio. Restore it. |
42 | /* not complete yet, queue the next flush sequence */ | 120 | */ |
43 | next_rq = queue_next_fseq(q); | 121 | rq->bio = rq->biotail; |
44 | } else { | 122 | |
45 | /* complete this flush request */ | 123 | /* make @rq a normal request */ |
46 | __blk_end_request_all(q->orig_flush_rq, q->flush_err); | 124 | rq->cmd_flags &= ~REQ_FLUSH_SEQ; |
47 | q->orig_flush_rq = NULL; | 125 | rq->end_io = NULL; |
48 | q->flush_seq = 0; | 126 | } |
49 | 127 | ||
50 | /* dispatch the next flush if there's one */ | 128 | /** |
51 | if (!list_empty(&q->pending_flushes)) { | 129 | * blk_flush_complete_seq - complete flush sequence |
52 | next_rq = list_entry_rq(q->pending_flushes.next); | 130 | * @rq: FLUSH/FUA request being sequenced |
53 | list_move(&next_rq->queuelist, &q->queue_head); | 131 | * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) |
54 | } | 132 | * @error: whether an error occurred |
133 | * | ||
134 | * @rq just completed @seq part of its flush sequence, record the | ||
135 | * completion and trigger the next step. | ||
136 | * | ||
137 | * CONTEXT: | ||
138 | * spin_lock_irq(q->queue_lock) | ||
139 | * | ||
140 | * RETURNS: | ||
141 | * %true if requests were added to the dispatch queue, %false otherwise. | ||
142 | */ | ||
143 | static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | ||
144 | int error) | ||
145 | { | ||
146 | struct request_queue *q = rq->q; | ||
147 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | ||
148 | bool queued = false; | ||
149 | |||
150 | BUG_ON(rq->flush.seq & seq); | ||
151 | rq->flush.seq |= seq; | ||
152 | |||
153 | if (likely(!error)) | ||
154 | seq = blk_flush_cur_seq(rq); | ||
155 | else | ||
156 | seq = REQ_FSEQ_DONE; | ||
157 | |||
158 | switch (seq) { | ||
159 | case REQ_FSEQ_PREFLUSH: | ||
160 | case REQ_FSEQ_POSTFLUSH: | ||
161 | /* queue for flush */ | ||
162 | if (list_empty(pending)) | ||
163 | q->flush_pending_since = jiffies; | ||
164 | list_move_tail(&rq->flush.list, pending); | ||
165 | break; | ||
166 | |||
167 | case REQ_FSEQ_DATA: | ||
168 | list_move_tail(&rq->flush.list, &q->flush_data_in_flight); | ||
169 | list_add(&rq->queuelist, &q->queue_head); | ||
170 | queued = true; | ||
171 | break; | ||
172 | |||
173 | case REQ_FSEQ_DONE: | ||
174 | /* | ||
175 | * @rq was previously adjusted by blk_flush_issue() for | ||
176 | * flush sequencing and may already have gone through the | ||
177 | * flush data request completion path. Restore @rq for | ||
178 | * normal completion and end it. | ||
179 | */ | ||
180 | BUG_ON(!list_empty(&rq->queuelist)); | ||
181 | list_del_init(&rq->flush.list); | ||
182 | blk_flush_restore_request(rq); | ||
183 | __blk_end_request_all(rq, error); | ||
184 | break; | ||
185 | |||
186 | default: | ||
187 | BUG(); | ||
55 | } | 188 | } |
56 | return next_rq; | 189 | |
190 | return blk_kick_flush(q) | queued; | ||
57 | } | 191 | } |
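Seen end to end, a request enters the machinery with the steps it does not need already marked complete, and every completion advances it to the next outstanding step until REQ_FSEQ_DONE ends it. A compressed standalone walk of that sequencing (lists, error handling and the actual dispatching are omitted):

```c
/* Standalone walk of the flush sequence; queues and error paths omitted. */
#include <stdio.h>

enum {
    FSEQ_PREFLUSH  = 1 << 0,
    FSEQ_DATA      = 1 << 1,
    FSEQ_POSTFLUSH = 1 << 2,
    FSEQ_DONE      = 1 << 3,
    FSEQ_ACTIONS   = FSEQ_PREFLUSH | FSEQ_DATA | FSEQ_POSTFLUSH,
};

static const char *step_name(unsigned int s)
{
    switch (s) {
    case FSEQ_PREFLUSH:  return "PREFLUSH";
    case FSEQ_DATA:      return "DATA";
    case FSEQ_POSTFLUSH: return "POSTFLUSH";
    default:             return "DONE";
    }
}

int main(void)
{
    /* e.g. a FUA write on a device without FUA: data + postflush */
    unsigned int policy = FSEQ_DATA | FSEQ_POSTFLUSH;
    /* steps not in the policy are seeded as already complete */
    unsigned int seq = FSEQ_ACTIONS & ~policy;

    while (1) {
        unsigned int next = 1u << __builtin_ctz(~seq);  /* ~ ffz() */

        if (next == FSEQ_DONE) {
            printf("sequence complete, end the original request\n");
            break;
        }
        printf("execute %s\n", step_name(next));
        seq |= next;    /* the step's completion records it and re-walks */
    }
    return 0;
}
```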
58 | 192 | ||
59 | static void blk_flush_complete_seq_end_io(struct request_queue *q, | 193 | static void flush_end_io(struct request *flush_rq, int error) |
60 | unsigned seq, int error) | ||
61 | { | 194 | { |
62 | bool was_empty = elv_queue_empty(q); | 195 | struct request_queue *q = flush_rq->q; |
63 | struct request *next_rq; | 196 | struct list_head *running = &q->flush_queue[q->flush_running_idx]; |
197 | bool queued = false; | ||
198 | struct request *rq, *n; | ||
199 | |||
200 | BUG_ON(q->flush_pending_idx == q->flush_running_idx); | ||
201 | |||
202 | /* account completion of the flush request */ | ||
203 | q->flush_running_idx ^= 1; | ||
204 | elv_completed_request(q, flush_rq); | ||
64 | 205 | ||
65 | next_rq = blk_flush_complete_seq(q, seq, error); | 206 | /* and push the waiting requests to the next stage */ |
207 | list_for_each_entry_safe(rq, n, running, flush.list) { | ||
208 | unsigned int seq = blk_flush_cur_seq(rq); | ||
209 | |||
210 | BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); | ||
211 | queued |= blk_flush_complete_seq(rq, seq, error); | ||
212 | } | ||
66 | 213 | ||
67 | /* | 214 | /* |
68 | * Moving a request silently to empty queue_head may stall the | 215 | * Moving a request silently to empty queue_head may stall the |
@@ -70,127 +217,153 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q, | |||
70 | * from request completion path and calling directly into | 217 | * from request completion path and calling directly into |
71 | * request_fn may confuse the driver. Always use kblockd. | 218 | * request_fn may confuse the driver. Always use kblockd. |
72 | */ | 219 | */ |
73 | if (was_empty && next_rq) | 220 | if (queued) |
74 | __blk_run_queue(q, true); | 221 | blk_run_queue_async(q); |
75 | } | 222 | } |
76 | 223 | ||
77 | static void pre_flush_end_io(struct request *rq, int error) | 224 | /** |
225 | * blk_kick_flush - consider issuing flush request | ||
226 | * @q: request_queue being kicked | ||
227 | * | ||
228 | * Flush related states of @q have changed, consider issuing flush request. | ||
229 | * Please read the comment at the top of this file for more info. | ||
230 | * | ||
231 | * CONTEXT: | ||
232 | * spin_lock_irq(q->queue_lock) | ||
233 | * | ||
234 | * RETURNS: | ||
235 | * %true if flush was issued, %false otherwise. | ||
236 | */ | ||
237 | static bool blk_kick_flush(struct request_queue *q) | ||
78 | { | 238 | { |
79 | elv_completed_request(rq->q, rq); | 239 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; |
80 | blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); | 240 | struct request *first_rq = |
241 | list_first_entry(pending, struct request, flush.list); | ||
242 | |||
243 | /* C1 described at the top of this file */ | ||
244 | if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) | ||
245 | return false; | ||
246 | |||
247 | /* C2 and C3 */ | ||
248 | if (!list_empty(&q->flush_data_in_flight) && | ||
249 | time_before(jiffies, | ||
250 | q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) | ||
251 | return false; | ||
252 | |||
253 | /* | ||
254 | * Issue flush and toggle pending_idx. This makes pending_idx | ||
255 | * different from running_idx, which means flush is in flight. | ||
256 | */ | ||
257 | blk_rq_init(q, &q->flush_rq); | ||
258 | q->flush_rq.cmd_type = REQ_TYPE_FS; | ||
259 | q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; | ||
260 | q->flush_rq.rq_disk = first_rq->rq_disk; | ||
261 | q->flush_rq.end_io = flush_end_io; | ||
262 | |||
263 | q->flush_pending_idx ^= 1; | ||
264 | list_add_tail(&q->flush_rq.queuelist, &q->queue_head); | ||
265 | return true; | ||
81 | } | 266 | } |
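The issue-or-defer decision in blk_kick_flush() is a pure predicate over four pieces of queue state. Isolating it as below (standalone C, names are mine, plain subtraction instead of time_before()) makes the mapping to C1-C3 explicit.

```c
/* The C1-C3 gate from blk_kick_flush(), isolated; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define PENDING_TIMEOUT 5   /* stand-in for FLUSH_PENDING_TIMEOUT (5 * HZ) */

static bool should_issue_flush(int pending_idx, int running_idx,
                               int nr_pending, int nr_data_in_flight,
                               long now, long pending_since)
{
    /* C1: a flush is already in flight, or nothing is waiting */
    if (pending_idx != running_idx || !nr_pending)
        return false;

    /* C2: defer while flush_data writes are in flight ... */
    /* C3: ... unless the oldest waiter has been pending too long */
    if (nr_data_in_flight && now - pending_since < PENDING_TIMEOUT)
        return false;

    return true;
}

int main(void)
{
    printf("%d\n", should_issue_flush(0, 0, 2, 0, 10, 9)); /* 1: idle, waiters */
    printf("%d\n", should_issue_flush(1, 0, 2, 0, 10, 9)); /* 0: C1, in flight */
    printf("%d\n", should_issue_flush(0, 0, 2, 3, 10, 9)); /* 0: C2, data busy */
    printf("%d\n", should_issue_flush(0, 0, 2, 3, 10, 2)); /* 1: C3, waited 8  */
    return 0;
}
```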
82 | 267 | ||
83 | static void flush_data_end_io(struct request *rq, int error) | 268 | static void flush_data_end_io(struct request *rq, int error) |
84 | { | 269 | { |
85 | elv_completed_request(rq->q, rq); | 270 | struct request_queue *q = rq->q; |
86 | blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); | ||
87 | } | ||
88 | 271 | ||
89 | static void post_flush_end_io(struct request *rq, int error) | 272 | /* |
90 | { | 273 | * After populating an empty queue, kick it to avoid stall. Read |
91 | elv_completed_request(rq->q, rq); | 274 | * the comment in flush_end_io(). |
92 | blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); | 275 | */ |
276 | if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) | ||
277 | blk_run_queue_async(q); | ||
93 | } | 278 | } |
94 | 279 | ||
95 | static void init_flush_request(struct request *rq, struct gendisk *disk) | 280 | /** |
281 | * blk_insert_flush - insert a new FLUSH/FUA request | ||
282 | * @rq: request to insert | ||
283 | * | ||
284 | * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. | ||
285 | * @rq is being submitted. Analyze what needs to be done and put it on the | ||
286 | * right queue. | ||
287 | * | ||
288 | * CONTEXT: | ||
289 | * spin_lock_irq(q->queue_lock) | ||
290 | */ | ||
291 | void blk_insert_flush(struct request *rq) | ||
96 | { | 292 | { |
97 | rq->cmd_type = REQ_TYPE_FS; | 293 | struct request_queue *q = rq->q; |
98 | rq->cmd_flags = WRITE_FLUSH; | 294 | unsigned int fflags = q->flush_flags; /* may change, cache */ |
99 | rq->rq_disk = disk; | 295 | unsigned int policy = blk_flush_policy(fflags, rq); |
100 | } | ||
101 | 296 | ||
102 | static struct request *queue_next_fseq(struct request_queue *q) | 297 | BUG_ON(rq->end_io); |
103 | { | 298 | BUG_ON(!rq->bio || rq->bio != rq->biotail); |
104 | struct request *orig_rq = q->orig_flush_rq; | ||
105 | struct request *rq = &q->flush_rq; | ||
106 | 299 | ||
107 | blk_rq_init(q, rq); | 300 | /* |
301 | * @policy now records what operations need to be done. Adjust | ||
302 | * REQ_FLUSH and FUA for the driver. | ||
303 | */ | ||
304 | rq->cmd_flags &= ~REQ_FLUSH; | ||
305 | if (!(fflags & REQ_FUA)) | ||
306 | rq->cmd_flags &= ~REQ_FUA; | ||
108 | 307 | ||
109 | switch (blk_flush_cur_seq(q)) { | 308 | /* |
110 | case QUEUE_FSEQ_PREFLUSH: | 309 | * If there's data but flush is not necessary, the request can be |
111 | init_flush_request(rq, orig_rq->rq_disk); | 310 | * processed directly without going through flush machinery. Queue |
112 | rq->end_io = pre_flush_end_io; | 311 | * for normal execution. |
113 | break; | 312 | */ |
114 | case QUEUE_FSEQ_DATA: | 313 | if ((policy & REQ_FSEQ_DATA) && |
115 | init_request_from_bio(rq, orig_rq->bio); | 314 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { |
116 | /* | 315 | list_add_tail(&rq->queuelist, &q->queue_head); |
117 | * orig_rq->rq_disk may be different from | 316 | return; |
118 | * bio->bi_bdev->bd_disk if orig_rq got here through | ||
119 | * remapping drivers. Make sure rq->rq_disk points | ||
120 | * to the same one as orig_rq. | ||
121 | */ | ||
122 | rq->rq_disk = orig_rq->rq_disk; | ||
123 | rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); | ||
124 | rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); | ||
125 | rq->end_io = flush_data_end_io; | ||
126 | break; | ||
127 | case QUEUE_FSEQ_POSTFLUSH: | ||
128 | init_flush_request(rq, orig_rq->rq_disk); | ||
129 | rq->end_io = post_flush_end_io; | ||
130 | break; | ||
131 | default: | ||
132 | BUG(); | ||
133 | } | 317 | } |
134 | 318 | ||
135 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); | 319 | /* |
136 | return rq; | 320 | * @rq should go through flush machinery. Mark it part of flush |
321 | * sequence and submit for further processing. | ||
322 | */ | ||
323 | memset(&rq->flush, 0, sizeof(rq->flush)); | ||
324 | INIT_LIST_HEAD(&rq->flush.list); | ||
325 | rq->cmd_flags |= REQ_FLUSH_SEQ; | ||
326 | rq->end_io = flush_data_end_io; | ||
327 | |||
328 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); | ||
137 | } | 329 | } |
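So blk_insert_flush() ends up with two destinations: straight onto the dispatch queue when only data is needed, or into the flush machinery with REQ_FLUSH_SEQ set and the skipped steps pre-marked complete. A condensed standalone sketch of that routing decision (simplified names of my own; the stripping of REQ_FLUSH/REQ_FUA for the driver is not modelled):

```c
/* Standalone sketch of the routing in blk_insert_flush(); not kernel code. */
#include <stdio.h>

#define P_PREFLUSH   (1 << 0)
#define P_DATA       (1 << 1)
#define P_POSTFLUSH  (1 << 2)

static const char *route(unsigned int policy)
{
    /* data with no pre/post flush bypasses the machinery entirely */
    if ((policy & P_DATA) && !(policy & (P_PREFLUSH | P_POSTFLUSH)))
        return "dispatch queue directly";
    return "flush machinery: set REQ_FLUSH_SEQ, seed skipped steps as done";
}

int main(void)
{
    printf("data only        -> %s\n", route(P_DATA));
    printf("data + postflush -> %s\n", route(P_DATA | P_POSTFLUSH));
    printf("empty flush      -> %s\n", route(P_PREFLUSH));
    return 0;
}
```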
138 | 330 | ||
139 | struct request *blk_do_flush(struct request_queue *q, struct request *rq) | 331 | /** |
332 | * blk_abort_flushes - @q is being aborted, abort flush requests | ||
333 | * @q: request_queue being aborted | ||
334 | * | ||
335 | * To be called from elv_abort_queue(). @q is being aborted. Prepare all | ||
336 | * FLUSH/FUA requests for abortion. | ||
337 | * | ||
338 | * CONTEXT: | ||
339 | * spin_lock_irq(q->queue_lock) | ||
340 | */ | ||
341 | void blk_abort_flushes(struct request_queue *q) | ||
140 | { | 342 | { |
141 | unsigned int fflags = q->flush_flags; /* may change, cache it */ | 343 | struct request *rq, *n; |
142 | bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; | 344 | int i; |
143 | bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); | ||
144 | bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); | ||
145 | unsigned skip = 0; | ||
146 | 345 | ||
147 | /* | 346 | /* |
148 | * Special case. If there's data but flush is not necessary, | 347 | * Requests in flight for data are already owned by the dispatch |
149 | * the request can be issued directly. | 348 | * queue or the device driver. Just restore for normal completion. |
150 | * | ||
151 | * Flush w/o data should be able to be issued directly too but | ||
152 | * currently some drivers assume that rq->bio contains | ||
153 | * non-zero data if it isn't NULL and empty FLUSH requests | ||
154 | * getting here usually have bio's without data. | ||
155 | */ | 349 | */ |
156 | if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { | 350 | list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { |
157 | rq->cmd_flags &= ~REQ_FLUSH; | 351 | list_del_init(&rq->flush.list); |
158 | if (!has_fua) | 352 | blk_flush_restore_request(rq); |
159 | rq->cmd_flags &= ~REQ_FUA; | ||
160 | return rq; | ||
161 | } | 353 | } |
162 | 354 | ||
163 | /* | 355 | /* |
164 | * Sequenced flushes can't be processed in parallel. If | 356 | * We need to give away requests on flush queues. Restore for |
165 | * another one is already in progress, queue for later | 357 | * normal completion and put them on the dispatch queue. |
166 | * processing. | ||
167 | */ | 358 | */ |
168 | if (q->flush_seq) { | 359 | for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { |
169 | list_move_tail(&rq->queuelist, &q->pending_flushes); | 360 | list_for_each_entry_safe(rq, n, &q->flush_queue[i], |
170 | return NULL; | 361 | flush.list) { |
362 | list_del_init(&rq->flush.list); | ||
363 | blk_flush_restore_request(rq); | ||
364 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
365 | } | ||
171 | } | 366 | } |
172 | |||
173 | /* | ||
174 | * Start a new flush sequence | ||
175 | */ | ||
176 | q->flush_err = 0; | ||
177 | q->flush_seq |= QUEUE_FSEQ_STARTED; | ||
178 | |||
179 | /* adjust FLUSH/FUA of the original request and stash it away */ | ||
180 | rq->cmd_flags &= ~REQ_FLUSH; | ||
181 | if (!has_fua) | ||
182 | rq->cmd_flags &= ~REQ_FUA; | ||
183 | blk_dequeue_request(rq); | ||
184 | q->orig_flush_rq = rq; | ||
185 | |||
186 | /* skip unneded sequences and return the first one */ | ||
187 | if (!do_preflush) | ||
188 | skip |= QUEUE_FSEQ_PREFLUSH; | ||
189 | if (!blk_rq_sectors(rq)) | ||
190 | skip |= QUEUE_FSEQ_DATA; | ||
191 | if (!do_postflush) | ||
192 | skip |= QUEUE_FSEQ_POSTFLUSH; | ||
193 | return blk_flush_complete_seq(q, skip, 0); | ||
194 | } | 367 | } |
195 | 368 | ||
196 | static void bio_end_flush(struct bio *bio, int err) | 369 | static void bio_end_flush(struct bio *bio, int err) |
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 54bcba6c02a7..129b9e209a3b 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -30,6 +30,8 @@ | |||
30 | 30 | ||
31 | static struct kmem_cache *integrity_cachep; | 31 | static struct kmem_cache *integrity_cachep; |
32 | 32 | ||
33 | static const char *bi_unsupported_name = "unsupported"; | ||
34 | |||
33 | /** | 35 | /** |
34 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements | 36 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements |
35 | * @q: request queue | 37 | * @q: request queue |
@@ -358,6 +360,14 @@ static struct kobj_type integrity_ktype = { | |||
358 | .release = blk_integrity_release, | 360 | .release = blk_integrity_release, |
359 | }; | 361 | }; |
360 | 362 | ||
363 | bool blk_integrity_is_initialized(struct gendisk *disk) | ||
364 | { | ||
365 | struct blk_integrity *bi = blk_get_integrity(disk); | ||
366 | |||
367 | return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); | ||
368 | } | ||
369 | EXPORT_SYMBOL(blk_integrity_is_initialized); | ||
370 | |||
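The new helper relies purely on a sentinel: blk_integrity_register() without a template now points bi->name at the shared bi_unsupported_name string, and blk_integrity_is_initialized() reports false exactly in that case. A tiny standalone model of the sentinel check (hypothetical mini structures, not the real blk_integrity; the template name used below is only illustrative):

```c
/* Standalone model of the "unsupported" sentinel check; not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static const char *bi_unsupported_name = "unsupported";

struct mini_integrity {
    const char *name;   /* NULL until registration */
};

/* register with a real template name, or fall back to the sentinel */
static void mini_register(struct mini_integrity *bi, const char *tmpl_name)
{
    bi->name = tmpl_name ? tmpl_name : bi_unsupported_name;
}

static bool mini_is_initialized(const struct mini_integrity *bi)
{
    return bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0;
}

int main(void)
{
    struct mini_integrity a = { 0 }, b = { 0 };

    mini_register(&a, "T10-DIF-TYPE1-CRC");   /* template-backed */
    mini_register(&b, NULL);                  /* placeholder only */

    printf("a initialized: %d\n", mini_is_initialized(&a));   /* 1 */
    printf("b initialized: %d\n", mini_is_initialized(&b));   /* 0 */
    return 0;
}
```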
361 | /** | 371 | /** |
362 | * blk_integrity_register - Register a gendisk as being integrity-capable | 372 | * blk_integrity_register - Register a gendisk as being integrity-capable |
363 | * @disk: struct gendisk pointer to make integrity-aware | 373 | * @disk: struct gendisk pointer to make integrity-aware |
@@ -407,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) | |||
407 | bi->get_tag_fn = template->get_tag_fn; | 417 | bi->get_tag_fn = template->get_tag_fn; |
408 | bi->tag_size = template->tag_size; | 418 | bi->tag_size = template->tag_size; |
409 | } else | 419 | } else |
410 | bi->name = "unsupported"; | 420 | bi->name = bi_unsupported_name; |
411 | 421 | ||
412 | return 0; | 422 | return 0; |
413 | } | 423 | } |
diff --git a/block/blk-lib.c b/block/blk-lib.c index bd3e8df4d5e2..25de73e4759b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -136,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err) | |||
136 | * | 136 | * |
137 | * Description: | 137 | * Description: |
138 | * Generate and issue number of bios with zerofiled pages. | 138 | * Generate and issue number of bios with zerofiled pages. |
139 | * Send barrier at the beginning and at the end if requested. This guarantie | ||
140 | * correct request ordering. Empty barrier allow us to avoid post queue flush. | ||
141 | */ | 139 | */ |
142 | 140 | ||
143 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 141 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
diff --git a/block/blk-merge.c b/block/blk-merge.c index ea85e20d5e94..cfcc37cb222b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) | |||
465 | 465 | ||
466 | return 0; | 466 | return 0; |
467 | } | 467 | } |
468 | |||
469 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
470 | struct request *next) | ||
471 | { | ||
472 | return attempt_merge(q, rq, next); | ||
473 | } | ||
diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f2af18..1fa769293597 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | |||
164 | blk_queue_congestion_threshold(q); | 164 | blk_queue_congestion_threshold(q); |
165 | q->nr_batching = BLK_BATCH_REQ; | 165 | q->nr_batching = BLK_BATCH_REQ; |
166 | 166 | ||
167 | q->unplug_thresh = 4; /* hmm */ | ||
168 | q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ | ||
169 | if (q->unplug_delay == 0) | ||
170 | q->unplug_delay = 1; | ||
171 | |||
172 | q->unplug_timer.function = blk_unplug_timeout; | ||
173 | q->unplug_timer.data = (unsigned long)q; | ||
174 | |||
175 | blk_set_default_limits(&q->limits); | 167 | blk_set_default_limits(&q->limits); |
176 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); | 168 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); |
177 | 169 | ||
178 | /* | 170 | /* |
179 | * If the caller didn't supply a lock, fall back to our embedded | ||
180 | * per-queue locks | ||
181 | */ | ||
182 | if (!q->queue_lock) | ||
183 | q->queue_lock = &q->__queue_lock; | ||
184 | |||
185 | /* | ||
186 | * by default assume old behaviour and bounce for any highmem page | 171 | * by default assume old behaviour and bounce for any highmem page |
187 | */ | 172 | */ |
188 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); | 173 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 41fb69150b4d..bd236313f35d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
66 | 66 | ||
67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | 67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { |
68 | blk_set_queue_full(q, BLK_RW_SYNC); | 68 | blk_set_queue_full(q, BLK_RW_SYNC); |
69 | } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { | 69 | } else { |
70 | blk_clear_queue_full(q, BLK_RW_SYNC); | 70 | blk_clear_queue_full(q, BLK_RW_SYNC); |
71 | wake_up(&rl->wait[BLK_RW_SYNC]); | 71 | wake_up(&rl->wait[BLK_RW_SYNC]); |
72 | } | 72 | } |
73 | 73 | ||
74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | 74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { |
75 | blk_set_queue_full(q, BLK_RW_ASYNC); | 75 | blk_set_queue_full(q, BLK_RW_ASYNC); |
76 | } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { | 76 | } else { |
77 | blk_clear_queue_full(q, BLK_RW_ASYNC); | 77 | blk_clear_queue_full(q, BLK_RW_ASYNC); |
78 | wake_up(&rl->wait[BLK_RW_ASYNC]); | 78 | wake_up(&rl->wait[BLK_RW_ASYNC]); |
79 | } | 79 | } |
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj) | |||
471 | 471 | ||
472 | blk_sync_queue(q); | 472 | blk_sync_queue(q); |
473 | 473 | ||
474 | blk_throtl_exit(q); | ||
475 | |||
476 | if (rl->rq_pool) | 474 | if (rl->rq_pool) |
477 | mempool_destroy(rl->rq_pool); | 475 | mempool_destroy(rl->rq_pool); |
478 | 476 | ||
@@ -500,7 +498,6 @@ int blk_register_queue(struct gendisk *disk) | |||
500 | { | 498 | { |
501 | int ret; | 499 | int ret; |
502 | struct device *dev = disk_to_dev(disk); | 500 | struct device *dev = disk_to_dev(disk); |
503 | |||
504 | struct request_queue *q = disk->queue; | 501 | struct request_queue *q = disk->queue; |
505 | 502 | ||
506 | if (WARN_ON(!q)) | 503 | if (WARN_ON(!q)) |
@@ -511,8 +508,10 @@ int blk_register_queue(struct gendisk *disk) | |||
511 | return ret; | 508 | return ret; |
512 | 509 | ||
513 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); | 510 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); |
514 | if (ret < 0) | 511 | if (ret < 0) { |
512 | blk_trace_remove_sysfs(dev); | ||
515 | return ret; | 513 | return ret; |
514 | } | ||
516 | 515 | ||
517 | kobject_uevent(&q->kobj, KOBJ_ADD); | 516 | kobject_uevent(&q->kobj, KOBJ_ADD); |
518 | 517 | ||
@@ -523,7 +522,7 @@ int blk_register_queue(struct gendisk *disk) | |||
523 | if (ret) { | 522 | if (ret) { |
524 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 523 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
525 | kobject_del(&q->kobj); | 524 | kobject_del(&q->kobj); |
526 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 525 | blk_trace_remove_sysfs(dev); |
527 | kobject_put(&dev->kobj); | 526 | kobject_put(&dev->kobj); |
528 | return ret; | 527 | return ret; |
529 | } | 528 | } |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e36cc10a346c..0475a22a420d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -77,7 +77,7 @@ struct throtl_grp { | |||
77 | unsigned long slice_end[2]; | 77 | unsigned long slice_end[2]; |
78 | 78 | ||
79 | /* Some throttle limits got updated for the group */ | 79 | /* Some throttle limits got updated for the group */ |
80 | bool limits_changed; | 80 | int limits_changed; |
81 | }; | 81 | }; |
82 | 82 | ||
83 | struct throtl_data | 83 | struct throtl_data |
@@ -102,7 +102,7 @@ struct throtl_data | |||
102 | /* Work for dispatching throttled bios */ | 102 | /* Work for dispatching throttled bios */ |
103 | struct delayed_work throtl_work; | 103 | struct delayed_work throtl_work; |
104 | 104 | ||
105 | atomic_t limits_changed; | 105 | int limits_changed; |
106 | }; | 106 | }; |
107 | 107 | ||
108 | enum tg_state_flags { | 108 | enum tg_state_flags { |
@@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | |||
201 | RB_CLEAR_NODE(&tg->rb_node); | 201 | RB_CLEAR_NODE(&tg->rb_node); |
202 | bio_list_init(&tg->bio_lists[0]); | 202 | bio_list_init(&tg->bio_lists[0]); |
203 | bio_list_init(&tg->bio_lists[1]); | 203 | bio_list_init(&tg->bio_lists[1]); |
204 | td->limits_changed = false; | ||
204 | 205 | ||
205 | /* | 206 | /* |
206 | * Take the initial reference that will be released on destroy | 207 | * Take the initial reference that will be released on destroy |
@@ -737,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td) | |||
737 | struct throtl_grp *tg; | 738 | struct throtl_grp *tg; |
738 | struct hlist_node *pos, *n; | 739 | struct hlist_node *pos, *n; |
739 | 740 | ||
740 | if (!atomic_read(&td->limits_changed)) | 741 | if (!td->limits_changed) |
741 | return; | 742 | return; |
742 | 743 | ||
743 | throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); | 744 | xchg(&td->limits_changed, false); |
744 | 745 | ||
745 | /* | 746 | throtl_log(td, "limits changed"); |
746 | * Make sure updates from throtl_update_blkio_group_read_bps() group | ||
747 | * of functions to tg->limits_changed are visible. We do not | ||
748 | * want update td->limits_changed to be visible but update to | ||
749 | * tg->limits_changed not being visible yet on this cpu. Hence | ||
750 | * the read barrier. | ||
751 | */ | ||
752 | smp_rmb(); | ||
753 | 747 | ||
754 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | 748 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { |
755 | if (throtl_tg_on_rr(tg) && tg->limits_changed) { | 749 | if (!tg->limits_changed) |
756 | throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | 750 | continue; |
757 | " riops=%u wiops=%u", tg->bps[READ], | 751 | |
758 | tg->bps[WRITE], tg->iops[READ], | 752 | if (!xchg(&tg->limits_changed, false)) |
759 | tg->iops[WRITE]); | 753 | continue; |
754 | |||
755 | throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | ||
756 | " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], | ||
757 | tg->iops[READ], tg->iops[WRITE]); | ||
758 | |||
759 | /* | ||
760 | * Restart the slices for both READ and WRITES. It | ||
761 | * might happen that a group's limits are dropped | ||
762 | * suddenly and we don't want to account recently | ||
763 | * dispatched IO with new low rate | ||
764 | */ | ||
765 | throtl_start_new_slice(td, tg, 0); | ||
766 | throtl_start_new_slice(td, tg, 1); | ||
767 | |||
768 | if (throtl_tg_on_rr(tg)) | ||
760 | tg_update_disptime(td, tg); | 769 | tg_update_disptime(td, tg); |
761 | tg->limits_changed = false; | ||
762 | } | ||
763 | } | 770 | } |
764 | |||
765 | smp_mb__before_atomic_dec(); | ||
766 | atomic_dec(&td->limits_changed); | ||
767 | smp_mb__after_atomic_dec(); | ||
768 | } | 771 | } |
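Dropping the atomic counter works because the flags are only ever set to true by updaters and consumed with xchg(..., false): an update racing with the consumer at worst leaves a flag set for an extra pass rather than being lost. The same consume-with-exchange pattern in standalone C (the per-td and per-tg flags collapsed into one, GCC/Clang atomic builtins standing in for the kernel's xchg()):

```c
/* Standalone model of the xchg()-cleared "limits_changed" flag; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool limits_changed;     /* set by updaters, cleared by the worker */

/* updater side: publish the new limits first, then raise the flag */
static void update_limits(void)
{
    __atomic_exchange_n(&limits_changed, true, __ATOMIC_SEQ_CST);
    /* ...and schedule the worker, as throtl_update_blkio_group_common() does */
}

/* worker side: consume the flag, so a change is never processed twice */
static void process_limit_change(void)
{
    if (!__atomic_exchange_n(&limits_changed, false, __ATOMIC_SEQ_CST))
        return;                 /* nothing changed since the last pass */
    printf("re-reading per-group limits\n");
}

int main(void)
{
    process_limit_change();     /* no-op */
    update_limits();
    process_limit_change();     /* picks up the change */
    process_limit_change();     /* flag already consumed: no-op */
    return 0;
}
```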
769 | 772 | ||
770 | /* Dispatch throttled bios. Should be called without queue lock held. */ | 773 | /* Dispatch throttled bios. Should be called without queue lock held. */ |
@@ -774,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q) | |||
774 | unsigned int nr_disp = 0; | 777 | unsigned int nr_disp = 0; |
775 | struct bio_list bio_list_on_stack; | 778 | struct bio_list bio_list_on_stack; |
776 | struct bio *bio; | 779 | struct bio *bio; |
780 | struct blk_plug plug; | ||
777 | 781 | ||
778 | spin_lock_irq(q->queue_lock); | 782 | spin_lock_irq(q->queue_lock); |
779 | 783 | ||
@@ -802,9 +806,10 @@ out: | |||
802 | * immediate dispatch | 806 | * immediate dispatch |
803 | */ | 807 | */ |
804 | if (nr_disp) { | 808 | if (nr_disp) { |
809 | blk_start_plug(&plug); | ||
805 | while((bio = bio_list_pop(&bio_list_on_stack))) | 810 | while((bio = bio_list_pop(&bio_list_on_stack))) |
806 | generic_make_request(bio); | 811 | generic_make_request(bio); |
807 | blk_unplug(q); | 812 | blk_finish_plug(&plug); |
808 | } | 813 | } |
809 | return nr_disp; | 814 | return nr_disp; |
810 | } | 815 | } |
@@ -825,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | |||
825 | 830 | ||
826 | struct delayed_work *dwork = &td->throtl_work; | 831 | struct delayed_work *dwork = &td->throtl_work; |
827 | 832 | ||
828 | if (total_nr_queued(td) > 0) { | 833 | /* schedule work if limits changed even if no bio is queued */ |
834 | if (total_nr_queued(td) > 0 || td->limits_changed) { | ||
829 | /* | 835 | /* |
830 | * We might have a work scheduled to be executed in future. | 836 | * We might have a work scheduled to be executed in future. |
831 | * Cancel that and schedule a new one. | 837 | * Cancel that and schedule a new one. |
@@ -898,10 +904,19 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | |||
898 | spin_unlock_irqrestore(td->queue->queue_lock, flags); | 904 | spin_unlock_irqrestore(td->queue->queue_lock, flags); |
899 | } | 905 | } |
900 | 906 | ||
907 | static void throtl_update_blkio_group_common(struct throtl_data *td, | ||
908 | struct throtl_grp *tg) | ||
909 | { | ||
910 | xchg(&tg->limits_changed, true); | ||
911 | xchg(&td->limits_changed, true); | ||
912 | /* Schedule a work now to process the limit change */ | ||
913 | throtl_schedule_delayed_work(td, 0); | ||
914 | } | ||
915 | |||
901 | /* | 916 | /* |
902 | * For all update functions, key should be a valid pointer because these | 917 | * For all update functions, key should be a valid pointer because these |
903 | * update functions are called under blkcg_lock, that means, blkg is | 918 | * update functions are called under blkcg_lock, that means, blkg is |
904 | * valid and in turn key is valid. queue exit path can not race becuase | 919 | * valid and in turn key is valid. queue exit path can not race because |
905 | * of blkcg_lock | 920 | * of blkcg_lock |
906 | * | 921 | * |
907 | * Can not take queue lock in update functions as queue lock under blkcg_lock | 922 | * Can not take queue lock in update functions as queue lock under blkcg_lock |
@@ -911,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key, | |||
911 | struct blkio_group *blkg, u64 read_bps) | 926 | struct blkio_group *blkg, u64 read_bps) |
912 | { | 927 | { |
913 | struct throtl_data *td = key; | 928 | struct throtl_data *td = key; |
929 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
914 | 930 | ||
915 | tg_of_blkg(blkg)->bps[READ] = read_bps; | 931 | tg->bps[READ] = read_bps; |
916 | /* Make sure read_bps is updated before setting limits_changed */ | 932 | throtl_update_blkio_group_common(td, tg); |
917 | smp_wmb(); | ||
918 | tg_of_blkg(blkg)->limits_changed = true; | ||
919 | |||
920 | /* Make sure tg->limits_changed is updated before td->limits_changed */ | ||
921 | smp_mb__before_atomic_inc(); | ||
922 | atomic_inc(&td->limits_changed); | ||
923 | smp_mb__after_atomic_inc(); | ||
924 | |||
925 | /* Schedule a work now to process the limit change */ | ||
926 | throtl_schedule_delayed_work(td, 0); | ||
927 | } | 933 | } |
928 | 934 | ||
929 | static void throtl_update_blkio_group_write_bps(void *key, | 935 | static void throtl_update_blkio_group_write_bps(void *key, |
930 | struct blkio_group *blkg, u64 write_bps) | 936 | struct blkio_group *blkg, u64 write_bps) |
931 | { | 937 | { |
932 | struct throtl_data *td = key; | 938 | struct throtl_data *td = key; |
939 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
933 | 940 | ||
934 | tg_of_blkg(blkg)->bps[WRITE] = write_bps; | 941 | tg->bps[WRITE] = write_bps; |
935 | smp_wmb(); | 942 | throtl_update_blkio_group_common(td, tg); |
936 | tg_of_blkg(blkg)->limits_changed = true; | ||
937 | smp_mb__before_atomic_inc(); | ||
938 | atomic_inc(&td->limits_changed); | ||
939 | smp_mb__after_atomic_inc(); | ||
940 | throtl_schedule_delayed_work(td, 0); | ||
941 | } | 943 | } |
942 | 944 | ||
943 | static void throtl_update_blkio_group_read_iops(void *key, | 945 | static void throtl_update_blkio_group_read_iops(void *key, |
944 | struct blkio_group *blkg, unsigned int read_iops) | 946 | struct blkio_group *blkg, unsigned int read_iops) |
945 | { | 947 | { |
946 | struct throtl_data *td = key; | 948 | struct throtl_data *td = key; |
949 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
947 | 950 | ||
948 | tg_of_blkg(blkg)->iops[READ] = read_iops; | 951 | tg->iops[READ] = read_iops; |
949 | smp_wmb(); | 952 | throtl_update_blkio_group_common(td, tg); |
950 | tg_of_blkg(blkg)->limits_changed = true; | ||
951 | smp_mb__before_atomic_inc(); | ||
952 | atomic_inc(&td->limits_changed); | ||
953 | smp_mb__after_atomic_inc(); | ||
954 | throtl_schedule_delayed_work(td, 0); | ||
955 | } | 953 | } |
956 | 954 | ||
957 | static void throtl_update_blkio_group_write_iops(void *key, | 955 | static void throtl_update_blkio_group_write_iops(void *key, |
958 | struct blkio_group *blkg, unsigned int write_iops) | 956 | struct blkio_group *blkg, unsigned int write_iops) |
959 | { | 957 | { |
960 | struct throtl_data *td = key; | 958 | struct throtl_data *td = key; |
959 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
961 | 960 | ||
962 | tg_of_blkg(blkg)->iops[WRITE] = write_iops; | 961 | tg->iops[WRITE] = write_iops; |
963 | smp_wmb(); | 962 | throtl_update_blkio_group_common(td, tg); |
964 | tg_of_blkg(blkg)->limits_changed = true; | ||
965 | smp_mb__before_atomic_inc(); | ||
966 | atomic_inc(&td->limits_changed); | ||
967 | smp_mb__after_atomic_inc(); | ||
968 | throtl_schedule_delayed_work(td, 0); | ||
969 | } | 963 | } |
970 | 964 | ||
971 | void throtl_shutdown_timer_wq(struct request_queue *q) | 965 | static void throtl_shutdown_wq(struct request_queue *q) |
972 | { | 966 | { |
973 | struct throtl_data *td = q->td; | 967 | struct throtl_data *td = q->td; |
974 | 968 | ||
@@ -1009,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
1009 | /* | 1003 | /* |
1010 | * There is already another bio queued in same dir. No | 1004 | * There is already another bio queued in same dir. No |
1011 | * need to update dispatch time. | 1005 | * need to update dispatch time. |
1012 | * Still update the disptime if rate limits on this group | ||
1013 | * were changed. | ||
1014 | */ | 1006 | */ |
1015 | if (!tg->limits_changed) | 1007 | update_disptime = false; |
1016 | update_disptime = false; | ||
1017 | else | ||
1018 | tg->limits_changed = false; | ||
1019 | |||
1020 | goto queue_bio; | 1008 | goto queue_bio; |
1009 | |||
1021 | } | 1010 | } |
1022 | 1011 | ||
1023 | /* Bio is with-in rate limit of group */ | 1012 | /* Bio is with-in rate limit of group */ |
1024 | if (tg_may_dispatch(td, tg, bio, NULL)) { | 1013 | if (tg_may_dispatch(td, tg, bio, NULL)) { |
1025 | throtl_charge_bio(tg, bio); | 1014 | throtl_charge_bio(tg, bio); |
1015 | |||
1016 | /* | ||
1017 | * We need to trim slice even when bios are not being queued | ||
1018 | * otherwise it might happen that a bio is not queued for | ||
1019 | * a long time and slice keeps on extending and trim is not | ||
1020 | * called for a long time. Now if limits are reduced suddenly | ||
1021 | * we take into account all the IO dispatched so far at new | ||
1022 | * low rate and newly queued IO gets a really long dispatch | ||
1023 | * time. | ||
1024 | * | ||
1025 | * So keep on trimming slice even if bio is not queued. | ||
1026 | */ | ||
1027 | throtl_trim_slice(td, tg, rw); | ||
1026 | goto out; | 1028 | goto out; |
1027 | } | 1029 | } |
1028 | 1030 | ||
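A rough back-of-the-envelope version of the argument in that comment: if the slice is never trimmed, all IO dispatched under the old generous limit stays charged against the slice, so the moment the limit drops the next bio inherits an enormous dispatch delay. Toy numbers only, assuming a simple bytes/rate wait calculation rather than the real tg_may_dispatch() math:

```c
/* Toy arithmetic behind the "keep trimming the slice" comment above. */
#include <stdio.h>

int main(void)
{
    long dispatched = 100L << 20;   /* bytes already sent in this slice */
    long new_bps    = 1L << 20;     /* limit suddenly dropped to 1 MB/s */
    long slice_len  = 10;           /* seconds of slice actually accounted */

    /* untrimmed: all past IO is charged against the new low rate */
    long wait_untrimmed = dispatched / new_bps - slice_len;

    /* trimmed as we went: only the current, short window is charged */
    long recent = 2L << 20, short_slice = 2;
    long wait_trimmed = recent / new_bps - short_slice;

    printf("untrimmed slice: next bio waits ~%lds\n", wait_untrimmed); /* ~90s */
    printf("trimmed slice:   next bio waits ~%lds\n", wait_trimmed);   /* ~0s  */
    return 0;
}
```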
@@ -1058,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q) | |||
1058 | 1060 | ||
1059 | INIT_HLIST_HEAD(&td->tg_list); | 1061 | INIT_HLIST_HEAD(&td->tg_list); |
1060 | td->tg_service_tree = THROTL_RB_ROOT; | 1062 | td->tg_service_tree = THROTL_RB_ROOT; |
1061 | atomic_set(&td->limits_changed, 0); | 1063 | td->limits_changed = false; |
1062 | 1064 | ||
1063 | /* Init root group */ | 1065 | /* Init root group */ |
1064 | tg = &td->root_tg; | 1066 | tg = &td->root_tg; |
@@ -1070,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q) | |||
1070 | /* Practically unlimited BW */ | 1072 | /* Practically unlimited BW */ |
1071 | tg->bps[0] = tg->bps[1] = -1; | 1073 | tg->bps[0] = tg->bps[1] = -1; |
1072 | tg->iops[0] = tg->iops[1] = -1; | 1074 | tg->iops[0] = tg->iops[1] = -1; |
1075 | td->limits_changed = false; | ||
1073 | 1076 | ||
1074 | /* | 1077 | /* |
1075 | * Set root group reference to 2. One reference will be dropped when | 1078 | * Set root group reference to 2. One reference will be dropped when |
@@ -1102,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q) | |||
1102 | 1105 | ||
1103 | BUG_ON(!td); | 1106 | BUG_ON(!td); |
1104 | 1107 | ||
1105 | throtl_shutdown_timer_wq(q); | 1108 | throtl_shutdown_wq(q); |
1106 | 1109 | ||
1107 | spin_lock_irq(q->queue_lock); | 1110 | spin_lock_irq(q->queue_lock); |
1108 | throtl_release_tgs(td); | 1111 | throtl_release_tgs(td); |
@@ -1132,7 +1135,7 @@ void blk_throtl_exit(struct request_queue *q) | |||
1132 | * update limits through cgroup and another work got queued, cancel | 1135 | * update limits through cgroup and another work got queued, cancel |
1133 | * it. | 1136 | * it. |
1134 | */ | 1137 | */ |
1135 | throtl_shutdown_timer_wq(q); | 1138 | throtl_shutdown_wq(q); |
1136 | throtl_td_free(td); | 1139 | throtl_td_free(td); |
1137 | } | 1140 | } |
1138 | 1141 | ||
diff --git a/block/blk.h b/block/blk.h index 2db8f32838e7..61263463e38e 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, | |||
18 | void blk_dequeue_request(struct request *rq); | 18 | void blk_dequeue_request(struct request *rq); |
19 | void __blk_queue_free_tags(struct request_queue *q); | 19 | void __blk_queue_free_tags(struct request_queue *q); |
20 | 20 | ||
21 | void blk_unplug_work(struct work_struct *work); | ||
22 | void blk_unplug_timeout(unsigned long data); | ||
23 | void blk_rq_timed_out_timer(unsigned long data); | 21 | void blk_rq_timed_out_timer(unsigned long data); |
24 | void blk_delete_timer(struct request *); | 22 | void blk_delete_timer(struct request *); |
25 | void blk_add_timer(struct request *); | 23 | void blk_add_timer(struct request *); |
@@ -34,7 +32,7 @@ enum rq_atomic_flags { | |||
34 | 32 | ||
35 | /* | 33 | /* |
36 | * EH timer and IO completion will both attempt to 'grab' the request, make | 34 | * EH timer and IO completion will both attempt to 'grab' the request, make |
37 | * sure that only one of them suceeds | 35 | * sure that only one of them succeeds |
38 | */ | 36 | */ |
39 | static inline int blk_mark_rq_complete(struct request *rq) | 37 | static inline int blk_mark_rq_complete(struct request *rq) |
40 | { | 38 | { |
@@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq) | |||
51 | */ | 49 | */ |
52 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) | 50 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) |
53 | 51 | ||
54 | struct request *blk_do_flush(struct request_queue *q, struct request *rq); | 52 | void blk_insert_flush(struct request *rq); |
53 | void blk_abort_flushes(struct request_queue *q); | ||
55 | 54 | ||
56 | static inline struct request *__elv_next_request(struct request_queue *q) | 55 | static inline struct request *__elv_next_request(struct request_queue *q) |
57 | { | 56 | { |
58 | struct request *rq; | 57 | struct request *rq; |
59 | 58 | ||
60 | while (1) { | 59 | while (1) { |
61 | while (!list_empty(&q->queue_head)) { | 60 | if (!list_empty(&q->queue_head)) { |
62 | rq = list_entry_rq(q->queue_head.next); | 61 | rq = list_entry_rq(q->queue_head.next); |
63 | if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || | 62 | return rq; |
64 | rq == &q->flush_rq) | ||
65 | return rq; | ||
66 | rq = blk_do_flush(q, rq); | ||
67 | if (rq) | ||
68 | return rq; | ||
69 | } | 63 | } |
70 | 64 | ||
71 | if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) | 65 | if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) |
@@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
109 | struct bio *bio); | 103 | struct bio *bio); |
110 | int attempt_back_merge(struct request_queue *q, struct request *rq); | 104 | int attempt_back_merge(struct request_queue *q, struct request *rq); |
111 | int attempt_front_merge(struct request_queue *q, struct request *rq); | 105 | int attempt_front_merge(struct request_queue *q, struct request *rq); |
106 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
107 | struct request *next); | ||
112 | void blk_recalc_rq_segments(struct request *rq); | 108 | void blk_recalc_rq_segments(struct request *rq); |
113 | void blk_rq_set_mixed_merge(struct request *rq); | 109 | void blk_rq_set_mixed_merge(struct request *rq); |
114 | 110 | ||
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea83a4f0c27d..5b52011e3a40 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; | |||
54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | 54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) |
55 | 55 | ||
56 | #define RQ_CIC(rq) \ | 56 | #define RQ_CIC(rq) \ |
57 | ((struct cfq_io_context *) (rq)->elevator_private) | 57 | ((struct cfq_io_context *) (rq)->elevator_private[0]) |
58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) | 58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) |
59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) | 59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) |
60 | 60 | ||
61 | static struct kmem_cache *cfq_pool; | 61 | static struct kmem_cache *cfq_pool; |
62 | static struct kmem_cache *cfq_ioc_pool; | 62 | static struct kmem_cache *cfq_ioc_pool; |
@@ -146,7 +146,6 @@ struct cfq_queue { | |||
146 | struct cfq_rb_root *service_tree; | 146 | struct cfq_rb_root *service_tree; |
147 | struct cfq_queue *new_cfqq; | 147 | struct cfq_queue *new_cfqq; |
148 | struct cfq_group *cfqg; | 148 | struct cfq_group *cfqg; |
149 | struct cfq_group *orig_cfqg; | ||
150 | /* Number of sectors dispatched from queue in single dispatch round */ | 149 | /* Number of sectors dispatched from queue in single dispatch round */ |
151 | unsigned long nr_sectors; | 150 | unsigned long nr_sectors; |
152 | }; | 151 | }; |
@@ -179,6 +178,8 @@ struct cfq_group { | |||
179 | /* group service_tree key */ | 178 | /* group service_tree key */ |
180 | u64 vdisktime; | 179 | u64 vdisktime; |
181 | unsigned int weight; | 180 | unsigned int weight; |
181 | unsigned int new_weight; | ||
182 | bool needs_update; | ||
182 | 183 | ||
183 | /* number of cfqq currently on this group */ | 184 | /* number of cfqq currently on this group */ |
184 | int nr_cfqq; | 185 | int nr_cfqq; |
@@ -238,6 +239,7 @@ struct cfq_data { | |||
238 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; | 239 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; |
239 | 240 | ||
240 | unsigned int busy_queues; | 241 | unsigned int busy_queues; |
242 | unsigned int busy_sync_queues; | ||
241 | 243 | ||
242 | int rq_in_driver; | 244 | int rq_in_driver; |
243 | int rq_in_flight[2]; | 245 | int rq_in_flight[2]; |
@@ -285,7 +287,6 @@ struct cfq_data { | |||
285 | unsigned int cfq_slice_idle; | 287 | unsigned int cfq_slice_idle; |
286 | unsigned int cfq_group_idle; | 288 | unsigned int cfq_group_idle; |
287 | unsigned int cfq_latency; | 289 | unsigned int cfq_latency; |
288 | unsigned int cfq_group_isolation; | ||
289 | 290 | ||
290 | unsigned int cic_index; | 291 | unsigned int cic_index; |
291 | struct list_head cic_list; | 292 | struct list_head cic_list; |
@@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) | |||
501 | } | 502 | } |
502 | } | 503 | } |
503 | 504 | ||
504 | static int cfq_queue_empty(struct request_queue *q) | ||
505 | { | ||
506 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
507 | |||
508 | return !cfqd->rq_queued; | ||
509 | } | ||
510 | |||
511 | /* | 505 | /* |
512 | * Scale schedule slice based on io priority. Use the sync time slice only | 506 | * Scale schedule slice based on io priority. Use the sync time slice only |
513 | * if a queue is marked sync and has sync io queued. A sync queue with async | 507 | * if a queue is marked sync and has sync io queued. A sync queue with async |
@@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) | |||
558 | 552 | ||
559 | static void update_min_vdisktime(struct cfq_rb_root *st) | 553 | static void update_min_vdisktime(struct cfq_rb_root *st) |
560 | { | 554 | { |
561 | u64 vdisktime = st->min_vdisktime; | ||
562 | struct cfq_group *cfqg; | 555 | struct cfq_group *cfqg; |
563 | 556 | ||
564 | if (st->left) { | 557 | if (st->left) { |
565 | cfqg = rb_entry_cfqg(st->left); | 558 | cfqg = rb_entry_cfqg(st->left); |
566 | vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); | 559 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, |
560 | cfqg->vdisktime); | ||
567 | } | 561 | } |
568 | |||
569 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); | ||
570 | } | 562 | } |
571 | 563 | ||
572 | /* | 564 | /* |
@@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
863 | } | 855 | } |
864 | 856 | ||
865 | static void | 857 | static void |
866 | cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | 858 | cfq_update_group_weight(struct cfq_group *cfqg) |
859 | { | ||
860 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
861 | if (cfqg->needs_update) { | ||
862 | cfqg->weight = cfqg->new_weight; | ||
863 | cfqg->needs_update = false; | ||
864 | } | ||
865 | } | ||
866 | |||
867 | static void | ||
868 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
869 | { | ||
870 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
871 | |||
872 | cfq_update_group_weight(cfqg); | ||
873 | __cfq_group_service_tree_add(st, cfqg); | ||
874 | st->total_weight += cfqg->weight; | ||
875 | } | ||
876 | |||
877 | static void | ||
878 | cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
867 | { | 879 | { |
868 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 880 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
869 | struct cfq_group *__cfqg; | 881 | struct cfq_group *__cfqg; |
@@ -876,7 +888,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
876 | /* | 888 | /* |
877 | * Currently put the group at the end. Later implement something | 889 | * Currently put the group at the end. Later implement something |
878 | * so that groups get lesser vtime based on their weights, so that | 890 | * so that groups get lesser vtime based on their weights, so that |
879 | * if group does not loose all if it was not continously backlogged. | 891 | * if group does not loose all if it was not continuously backlogged. |
880 | */ | 892 | */ |
881 | n = rb_last(&st->rb); | 893 | n = rb_last(&st->rb); |
882 | if (n) { | 894 | if (n) { |
@@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
884 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; | 896 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; |
885 | } else | 897 | } else |
886 | cfqg->vdisktime = st->min_vdisktime; | 898 | cfqg->vdisktime = st->min_vdisktime; |
899 | cfq_group_service_tree_add(st, cfqg); | ||
900 | } | ||
887 | 901 | ||
888 | __cfq_group_service_tree_add(st, cfqg); | 902 | static void |
889 | st->total_weight += cfqg->weight; | 903 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) |
904 | { | ||
905 | st->total_weight -= cfqg->weight; | ||
906 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
907 | cfq_rb_erase(&cfqg->rb_node, st); | ||
890 | } | 908 | } |
891 | 909 | ||
892 | static void | 910 | static void |
893 | cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | 911 | cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) |
894 | { | 912 | { |
895 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 913 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
896 | 914 | ||
@@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
902 | return; | 920 | return; |
903 | 921 | ||
904 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 922 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
905 | st->total_weight -= cfqg->weight; | 923 | cfq_group_service_tree_del(st, cfqg); |
906 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
907 | cfq_rb_erase(&cfqg->rb_node, st); | ||
908 | cfqg->saved_workload_slice = 0; | 924 | cfqg->saved_workload_slice = 0; |
909 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); | 925 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
910 | } | 926 | } |
911 | 927 | ||
912 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | 928 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, |
929 | unsigned int *unaccounted_time) | ||
913 | { | 930 | { |
914 | unsigned int slice_used; | 931 | unsigned int slice_used; |
915 | 932 | ||
@@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | |||
928 | 1); | 945 | 1); |
929 | } else { | 946 | } else { |
930 | slice_used = jiffies - cfqq->slice_start; | 947 | slice_used = jiffies - cfqq->slice_start; |
931 | if (slice_used > cfqq->allocated_slice) | 948 | if (slice_used > cfqq->allocated_slice) { |
949 | *unaccounted_time = slice_used - cfqq->allocated_slice; | ||
932 | slice_used = cfqq->allocated_slice; | 950 | slice_used = cfqq->allocated_slice; |
951 | } | ||
952 | if (time_after(cfqq->slice_start, cfqq->dispatch_start)) | ||
953 | *unaccounted_time += cfqq->slice_start - | ||
954 | cfqq->dispatch_start; | ||
933 | } | 955 | } |
934 | 956 | ||
935 | return slice_used; | 957 | return slice_used; |
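The new *unaccounted_time out-parameter collects the two spans the group is deliberately not charged for: usage beyond the allocated slice, and the gap between dispatch_start and the moment the slice actually started. A standalone version of that bookkeeping with plain integers standing in for jiffies (the kernel function's first branch, for queues that never got to use their slice, is omitted):

```c
/* Standalone model of the used/unaccounted slice split; integers as jiffies. */
#include <stdio.h>

static unsigned int slice_usage(unsigned int dispatch_start,
                                unsigned int slice_start,
                                unsigned int allocated_slice,
                                unsigned int now,
                                unsigned int *unaccounted)
{
    unsigned int used = now - slice_start;

    *unaccounted = 0;
    if (used > allocated_slice) {
        *unaccounted = used - allocated_slice;          /* overshoot */
        used = allocated_slice;
    }
    if (slice_start > dispatch_start)
        *unaccounted += slice_start - dispatch_start;   /* setup delay */
    return used;
}

int main(void)
{
    unsigned int unaccounted;
    unsigned int used = slice_usage(100, 104, 20, 130, &unaccounted);

    /* ran 26 ticks on a 20-tick slice, after a 4-tick wait to get started */
    printf("charged %u, unaccounted %u\n", used, unaccounted);  /* 20, 10 */
    return 0;
}
```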
@@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
939 | struct cfq_queue *cfqq) | 961 | struct cfq_queue *cfqq) |
940 | { | 962 | { |
941 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 963 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
942 | unsigned int used_sl, charge; | 964 | unsigned int used_sl, charge, unaccounted_sl = 0; |
943 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | 965 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) |
944 | - cfqg->service_tree_idle.count; | 966 | - cfqg->service_tree_idle.count; |
945 | 967 | ||
946 | BUG_ON(nr_sync < 0); | 968 | BUG_ON(nr_sync < 0); |
947 | used_sl = charge = cfq_cfqq_slice_usage(cfqq); | 969 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); |
948 | 970 | ||
949 | if (iops_mode(cfqd)) | 971 | if (iops_mode(cfqd)) |
950 | charge = cfqq->slice_dispatch; | 972 | charge = cfqq->slice_dispatch; |
@@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
952 | charge = cfqq->allocated_slice; | 974 | charge = cfqq->allocated_slice; |
953 | 975 | ||
954 | /* Can't update vdisktime while group is on service tree */ | 976 | /* Can't update vdisktime while group is on service tree */ |
955 | cfq_rb_erase(&cfqg->rb_node, st); | 977 | cfq_group_service_tree_del(st, cfqg); |
956 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); | 978 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); |
957 | __cfq_group_service_tree_add(st, cfqg); | 979 | /* If a new weight was requested, update now, off tree */ |
980 | cfq_group_service_tree_add(st, cfqg); | ||
958 | 981 | ||
959 | /* This group is being expired. Save the context */ | 982 | /* This group is being expired. Save the context */ |
960 | if (time_after(cfqd->workload_expires, jiffies)) { | 983 | if (time_after(cfqd->workload_expires, jiffies)) { |
@@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
970 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" | 993 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" |
971 | " sect=%u", used_sl, cfqq->slice_dispatch, charge, | 994 | " sect=%u", used_sl, cfqq->slice_dispatch, charge, |
972 | iops_mode(cfqd), cfqq->nr_sectors); | 995 | iops_mode(cfqd), cfqq->nr_sectors); |
973 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); | 996 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, |
997 | unaccounted_sl); | ||
974 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); | 998 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); |
975 | } | 999 | } |
976 | 1000 | ||
@@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) | |||
985 | void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | 1009 | void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, |
986 | unsigned int weight) | 1010 | unsigned int weight) |
987 | { | 1011 | { |
988 | cfqg_of_blkg(blkg)->weight = weight; | 1012 | struct cfq_group *cfqg = cfqg_of_blkg(blkg); |
1013 | cfqg->new_weight = weight; | ||
1014 | cfqg->needs_update = true; | ||
989 | } | 1015 | } |
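The reason the weight is only staged here is that cfqg->weight feeds st->total_weight while the group sits on the service tree; changing it in place would make the later subtraction in cfq_group_service_tree_del() remove a different value than was added. A standalone model of the stage-then-apply handshake (hypothetical names, not the real structures):

```c
/* Standalone model of the deferred group-weight update; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct group {
    unsigned int weight;        /* value the service tree accounts */
    unsigned int new_weight;    /* staged by the cgroup write */
    bool needs_update;
};

static unsigned int total_weight;

/* cgroup side: never touch ->weight directly, only stage it */
static void set_weight(struct group *g, unsigned int w)
{
    g->new_weight = w;
    g->needs_update = true;
}

static void tree_add(struct group *g)
{
    if (g->needs_update) {      /* safe: the group is off-tree here */
        g->weight = g->new_weight;
        g->needs_update = false;
    }
    total_weight += g->weight;
}

static void tree_del(struct group *g)
{
    total_weight -= g->weight;  /* must subtract exactly what was added */
}

int main(void)
{
    struct group g = { .weight = 500 };

    tree_add(&g);
    set_weight(&g, 1000);       /* takes effect only once the group is off-tree */
    printf("on tree, total=%u (still the old weight)\n", total_weight);
    /* had ->weight changed in place, tree_del() would now subtract 1000
     * even though only 500 was ever added to total_weight */
    tree_del(&g);
    tree_add(&g);               /* re-add applies the staged weight */
    printf("re-added, total=%u\n", total_weight);
    return 0;
}
```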
990 | 1016 | ||
991 | static struct cfq_group * | 1017 | static struct cfq_group * |
@@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1187 | int new_cfqq = 1; | 1213 | int new_cfqq = 1; |
1188 | int group_changed = 0; | 1214 | int group_changed = 0; |
1189 | 1215 | ||
1190 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
1191 | if (!cfqd->cfq_group_isolation | ||
1192 | && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD | ||
1193 | && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { | ||
1194 | /* Move this cfq to root group */ | ||
1195 | cfq_log_cfqq(cfqd, cfqq, "moving to root group"); | ||
1196 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1197 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1198 | cfqq->orig_cfqg = cfqq->cfqg; | ||
1199 | cfqq->cfqg = &cfqd->root_group; | ||
1200 | cfqd->root_group.ref++; | ||
1201 | group_changed = 1; | ||
1202 | } else if (!cfqd->cfq_group_isolation | ||
1203 | && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { | ||
1204 | /* cfqq is sequential now needs to go to its original group */ | ||
1205 | BUG_ON(cfqq->cfqg != &cfqd->root_group); | ||
1206 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1207 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1208 | cfq_put_cfqg(cfqq->cfqg); | ||
1209 | cfqq->cfqg = cfqq->orig_cfqg; | ||
1210 | cfqq->orig_cfqg = NULL; | ||
1211 | group_changed = 1; | ||
1212 | cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); | ||
1213 | } | ||
1214 | #endif | ||
1215 | |||
1216 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 1216 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), |
1217 | cfqq_type(cfqq)); | 1217 | cfqq_type(cfqq)); |
1218 | if (cfq_class_idle(cfqq)) { | 1218 | if (cfq_class_idle(cfqq)) { |
@@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1284 | service_tree->count++; | 1284 | service_tree->count++; |
1285 | if ((add_front || !new_cfqq) && !group_changed) | 1285 | if ((add_front || !new_cfqq) && !group_changed) |
1286 | return; | 1286 | return; |
1287 | cfq_group_service_tree_add(cfqd, cfqq->cfqg); | 1287 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); |
1288 | } | 1288 | } |
1289 | 1289 | ||
1290 | static struct cfq_queue * | 1290 | static struct cfq_queue * |
@@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1372 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 1372 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
1373 | cfq_mark_cfqq_on_rr(cfqq); | 1373 | cfq_mark_cfqq_on_rr(cfqq); |
1374 | cfqd->busy_queues++; | 1374 | cfqd->busy_queues++; |
1375 | if (cfq_cfqq_sync(cfqq)) | ||
1376 | cfqd->busy_sync_queues++; | ||
1375 | 1377 | ||
1376 | cfq_resort_rr_list(cfqd, cfqq); | 1378 | cfq_resort_rr_list(cfqd, cfqq); |
1377 | } | 1379 | } |
@@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1395 | cfqq->p_root = NULL; | 1397 | cfqq->p_root = NULL; |
1396 | } | 1398 | } |
1397 | 1399 | ||
1398 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | 1400 | cfq_group_notify_queue_del(cfqd, cfqq->cfqg); |
1399 | BUG_ON(!cfqd->busy_queues); | 1401 | BUG_ON(!cfqd->busy_queues); |
1400 | cfqd->busy_queues--; | 1402 | cfqd->busy_queues--; |
1403 | if (cfq_cfqq_sync(cfqq)) | ||
1404 | cfqd->busy_sync_queues--; | ||
1401 | } | 1405 | } |
1402 | 1406 | ||
1403 | /* | 1407 | /* |
@@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2405 | * Does this cfqq already have too much IO in flight? | 2409 | * Does this cfqq already have too much IO in flight? |
2406 | */ | 2410 | */ |
2407 | if (cfqq->dispatched >= max_dispatch) { | 2411 | if (cfqq->dispatched >= max_dispatch) { |
2412 | bool promote_sync = false; | ||
2408 | /* | 2413 | /* |
2409 | * idle queue must always only have a single IO in flight | 2414 | * idle queue must always only have a single IO in flight |
2410 | */ | 2415 | */ |
@@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2412 | return false; | 2417 | return false; |
2413 | 2418 | ||
2414 | /* | 2419 | /* |
2420 | * If there is only one busy sync queue, we can ignore the | ||
2421 | * async queues and give the sync queue no dispatch limit: | ||
2422 | * a sync queue can preempt an async queue, so limiting the | ||
2423 | * sync queue doesn't make sense. This is useful for the | ||
2424 | * aiostress test. | ||
2425 | */ | ||
2426 | if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) | ||
2427 | promote_sync = true; | ||
2428 | |||
2429 | /* | ||
2415 | * We have other queues, don't allow more IO from this one | 2430 | * We have other queues, don't allow more IO from this one |
2416 | */ | 2431 | */ |
2417 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) | 2432 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && |
2433 | !promote_sync) | ||
2418 | return false; | 2434 | return false; |
2419 | 2435 | ||
2420 | /* | 2436 | /* |
2421 | * Sole queue user, no limit | 2437 | * Sole queue user, no limit |
2422 | */ | 2438 | */ |
2423 | if (cfqd->busy_queues == 1) | 2439 | if (cfqd->busy_queues == 1 || promote_sync) |
2424 | max_dispatch = -1; | 2440 | max_dispatch = -1; |
2425 | else | 2441 | else |
2426 | /* | 2442 | /* |
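
The cfq_may_dispatch() change above lifts the per-queue dispatch cap when exactly one sync queue is busy ("promote_sync"): a sync queue can preempt async queues anyway, so capping it only slows it down. Below is a simplified userspace model of that decision, with the deeper depth-based throttling elided; all names are illustrative.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the dispatch-cap decision once a queue has hit its
 * limit: a lone busy sync queue is still allowed to dispatch, everything
 * else falls through to the (elided) depth-based throttling, modelled
 * here as a plain refusal.
 */
static bool may_dispatch_over_limit(bool queue_is_sync, bool queue_is_idle_class,
                                    int busy_queues, int busy_sync_queues,
                                    bool slice_used_soon)
{
        bool promote_sync = false;

        /* idle-class queues only ever get a single request in flight */
        if (queue_is_idle_class)
                return false;

        /* only one sync queue busy: sync preempts async anyway, skip the cap */
        if (queue_is_sync && busy_sync_queues == 1)
                promote_sync = true;

        /* other queues exist and our slice is nearly gone: back off */
        if (busy_queues > 1 && slice_used_soon && !promote_sync)
                return false;

        /* sole queue user or promoted sync queue: effectively unlimited */
        return busy_queues == 1 || promote_sync;
}

int main(void)
{
        printf("%d\n", may_dispatch_over_limit(true, false, 3, 1, true));   /* 1 */
        printf("%d\n", may_dispatch_over_limit(false, false, 3, 1, true));  /* 0 */
        return 0;
}
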
@@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) | |||
2542 | static void cfq_put_queue(struct cfq_queue *cfqq) | 2558 | static void cfq_put_queue(struct cfq_queue *cfqq) |
2543 | { | 2559 | { |
2544 | struct cfq_data *cfqd = cfqq->cfqd; | 2560 | struct cfq_data *cfqd = cfqq->cfqd; |
2545 | struct cfq_group *cfqg, *orig_cfqg; | 2561 | struct cfq_group *cfqg; |
2546 | 2562 | ||
2547 | BUG_ON(cfqq->ref <= 0); | 2563 | BUG_ON(cfqq->ref <= 0); |
2548 | 2564 | ||
@@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2554 | BUG_ON(rb_first(&cfqq->sort_list)); | 2570 | BUG_ON(rb_first(&cfqq->sort_list)); |
2555 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); | 2571 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); |
2556 | cfqg = cfqq->cfqg; | 2572 | cfqg = cfqq->cfqg; |
2557 | orig_cfqg = cfqq->orig_cfqg; | ||
2558 | 2573 | ||
2559 | if (unlikely(cfqd->active_queue == cfqq)) { | 2574 | if (unlikely(cfqd->active_queue == cfqq)) { |
2560 | __cfq_slice_expired(cfqd, cfqq, 0); | 2575 | __cfq_slice_expired(cfqd, cfqq, 0); |
@@ -2564,33 +2579,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2564 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 2579 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
2565 | kmem_cache_free(cfq_pool, cfqq); | 2580 | kmem_cache_free(cfq_pool, cfqq); |
2566 | cfq_put_cfqg(cfqg); | 2581 | cfq_put_cfqg(cfqg); |
2567 | if (orig_cfqg) | ||
2568 | cfq_put_cfqg(orig_cfqg); | ||
2569 | } | 2582 | } |
2570 | 2583 | ||
2571 | /* | 2584 | /* |
2572 | * Must always be called with the rcu_read_lock() held | 2585 | * Call func for each cic attached to this ioc. |
2573 | */ | 2586 | */ |
2574 | static void | 2587 | static void |
2575 | __call_for_each_cic(struct io_context *ioc, | 2588 | call_for_each_cic(struct io_context *ioc, |
2576 | void (*func)(struct io_context *, struct cfq_io_context *)) | 2589 | void (*func)(struct io_context *, struct cfq_io_context *)) |
2577 | { | 2590 | { |
2578 | struct cfq_io_context *cic; | 2591 | struct cfq_io_context *cic; |
2579 | struct hlist_node *n; | 2592 | struct hlist_node *n; |
2580 | 2593 | ||
2594 | rcu_read_lock(); | ||
2595 | |||
2581 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) | 2596 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) |
2582 | func(ioc, cic); | 2597 | func(ioc, cic); |
2583 | } | ||
2584 | 2598 | ||
2585 | /* | ||
2586 | * Call func for each cic attached to this ioc. | ||
2587 | */ | ||
2588 | static void | ||
2589 | call_for_each_cic(struct io_context *ioc, | ||
2590 | void (*func)(struct io_context *, struct cfq_io_context *)) | ||
2591 | { | ||
2592 | rcu_read_lock(); | ||
2593 | __call_for_each_cic(ioc, func); | ||
2594 | rcu_read_unlock(); | 2599 | rcu_read_unlock(); |
2595 | } | 2600 | } |
2596 | 2601 | ||
@@ -2651,7 +2656,7 @@ static void cfq_free_io_context(struct io_context *ioc) | |||
2651 | * should be ok to iterate over the known list, we will see all cic's | 2656 | * should be ok to iterate over the known list, we will see all cic's |
2652 | * since no new ones are added. | 2657 | * since no new ones are added. |
2653 | */ | 2658 | */ |
2654 | __call_for_each_cic(ioc, cic_free_func); | 2659 | call_for_each_cic(ioc, cic_free_func); |
2655 | } | 2660 | } |
2656 | 2661 | ||
2657 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | 2662 | static void cfq_put_cooperator(struct cfq_queue *cfqq) |
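
The hunk above folds __call_for_each_cic() into call_for_each_cic(), so the read-side lock is always taken around the cic-list walk rather than by an outer caller. The sketch below is a userspace analogue of the pattern, with a pthread rwlock and a plain array standing in for RCU and the hlist (these are not the kernel APIs); build with -pthread.

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-ins: a fixed array plays the role of the RCU hlist. */
struct io_ctx { int id; };
struct cic    { int idx; };

static pthread_rwlock_t cic_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct cic cic_list[3] = { {0}, {1}, {2} };

/*
 * Single entry point: take the read-side lock, walk the list and invoke
 * the callback for each element -- mirroring the consolidated helper.
 */
static void call_for_each_cic(struct io_ctx *ioc,
                              void (*func)(struct io_ctx *, struct cic *))
{
        pthread_rwlock_rdlock(&cic_lock);
        for (size_t i = 0; i < sizeof(cic_list) / sizeof(cic_list[0]); i++)
                func(ioc, &cic_list[i]);
        pthread_rwlock_unlock(&cic_lock);
}

static void print_cic(struct io_ctx *ioc, struct cic *c)
{
        printf("ioc %d -> cic %d\n", ioc->id, c->idx);
}

int main(void)
{
        struct io_ctx ioc = { 7 };

        call_for_each_cic(&ioc, print_cic);
        return 0;
}
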
@@ -3355,7 +3360,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3355 | cfqd->busy_queues > 1) { | 3360 | cfqd->busy_queues > 1) { |
3356 | cfq_del_timer(cfqd, cfqq); | 3361 | cfq_del_timer(cfqd, cfqq); |
3357 | cfq_clear_cfqq_wait_request(cfqq); | 3362 | cfq_clear_cfqq_wait_request(cfqq); |
3358 | __blk_run_queue(cfqd->queue, false); | 3363 | __blk_run_queue(cfqd->queue); |
3359 | } else { | 3364 | } else { |
3360 | cfq_blkiocg_update_idle_time_stats( | 3365 | cfq_blkiocg_update_idle_time_stats( |
3361 | &cfqq->cfqg->blkg); | 3366 | &cfqq->cfqg->blkg); |
@@ -3370,7 +3375,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3370 | * this new queue is RT and the current one is BE | 3375 | * this new queue is RT and the current one is BE |
3371 | */ | 3376 | */ |
3372 | cfq_preempt_queue(cfqd, cfqq); | 3377 | cfq_preempt_queue(cfqd, cfqq); |
3373 | __blk_run_queue(cfqd->queue, false); | 3378 | __blk_run_queue(cfqd->queue); |
3374 | } | 3379 | } |
3375 | } | 3380 | } |
3376 | 3381 | ||
@@ -3613,12 +3618,12 @@ static void cfq_put_request(struct request *rq) | |||
3613 | 3618 | ||
3614 | put_io_context(RQ_CIC(rq)->ioc); | 3619 | put_io_context(RQ_CIC(rq)->ioc); |
3615 | 3620 | ||
3616 | rq->elevator_private = NULL; | 3621 | rq->elevator_private[0] = NULL; |
3617 | rq->elevator_private2 = NULL; | 3622 | rq->elevator_private[1] = NULL; |
3618 | 3623 | ||
3619 | /* Put down rq reference on cfqg */ | 3624 | /* Put down rq reference on cfqg */ |
3620 | cfq_put_cfqg(RQ_CFQG(rq)); | 3625 | cfq_put_cfqg(RQ_CFQG(rq)); |
3621 | rq->elevator_private3 = NULL; | 3626 | rq->elevator_private[2] = NULL; |
3622 | 3627 | ||
3623 | cfq_put_queue(cfqq); | 3628 | cfq_put_queue(cfqq); |
3624 | } | 3629 | } |
@@ -3705,13 +3710,12 @@ new_queue: | |||
3705 | } | 3710 | } |
3706 | 3711 | ||
3707 | cfqq->allocated[rw]++; | 3712 | cfqq->allocated[rw]++; |
3708 | cfqq->ref++; | ||
3709 | rq->elevator_private = cic; | ||
3710 | rq->elevator_private2 = cfqq; | ||
3711 | rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3712 | 3713 | ||
3714 | cfqq->ref++; | ||
3715 | rq->elevator_private[0] = cic; | ||
3716 | rq->elevator_private[1] = cfqq; | ||
3717 | rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3713 | spin_unlock_irqrestore(q->queue_lock, flags); | 3718 | spin_unlock_irqrestore(q->queue_lock, flags); |
3714 | |||
3715 | return 0; | 3719 | return 0; |
3716 | 3720 | ||
3717 | queue_fail: | 3721 | queue_fail: |
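
The two hunks above replace the elevator_private, elevator_private2 and elevator_private3 fields with an elevator_private[3] array, letting cfq store the cic, cfqq and cfqg under fixed indices and clear them uniformly. A sketch of the idea with an illustrative request type, not the real struct request:

#include <stddef.h>
#include <stdio.h>

/* Illustrative request with per-elevator slots instead of three scalar fields. */
struct fake_request {
        void *elevator_private[3];      /* [0]=io context, [1]=queue, [2]=group */
};

static void set_private(struct fake_request *rq, void *cic, void *cfqq, void *cfqg)
{
        rq->elevator_private[0] = cic;
        rq->elevator_private[1] = cfqq;
        rq->elevator_private[2] = cfqg;
}

static void clear_private(struct fake_request *rq)
{
        for (size_t i = 0; i < 3; i++)
                rq->elevator_private[i] = NULL;
}

int main(void)
{
        struct fake_request rq;
        int cic = 1, cfqq = 2, cfqg = 3;

        set_private(&rq, &cic, &cfqq, &cfqg);
        printf("slot 1 holds %d\n", *(int *)rq.elevator_private[1]);    /* 2 */
        clear_private(&rq);
        return 0;
}
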
@@ -3731,7 +3735,7 @@ static void cfq_kick_queue(struct work_struct *work) | |||
3731 | struct request_queue *q = cfqd->queue; | 3735 | struct request_queue *q = cfqd->queue; |
3732 | 3736 | ||
3733 | spin_lock_irq(q->queue_lock); | 3737 | spin_lock_irq(q->queue_lock); |
3734 | __blk_run_queue(cfqd->queue, false); | 3738 | __blk_run_queue(cfqd->queue); |
3735 | spin_unlock_irq(q->queue_lock); | 3739 | spin_unlock_irq(q->queue_lock); |
3736 | } | 3740 | } |
3737 | 3741 | ||
@@ -3953,7 +3957,6 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3953 | cfqd->cfq_slice_idle = cfq_slice_idle; | 3957 | cfqd->cfq_slice_idle = cfq_slice_idle; |
3954 | cfqd->cfq_group_idle = cfq_group_idle; | 3958 | cfqd->cfq_group_idle = cfq_group_idle; |
3955 | cfqd->cfq_latency = 1; | 3959 | cfqd->cfq_latency = 1; |
3956 | cfqd->cfq_group_isolation = 0; | ||
3957 | cfqd->hw_tag = -1; | 3960 | cfqd->hw_tag = -1; |
3958 | /* | 3961 | /* |
3959 | * we optimistically start assuming sync ops weren't delayed in last | 3962 | * we optimistically start assuming sync ops weren't delayed in last |
@@ -4029,7 +4032,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | |||
4029 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | 4032 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); |
4030 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | 4033 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); |
4031 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | 4034 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); |
4032 | SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); | ||
4033 | #undef SHOW_FUNCTION | 4035 | #undef SHOW_FUNCTION |
4034 | 4036 | ||
4035 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | 4037 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
@@ -4063,7 +4065,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | |||
4063 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | 4065 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, |
4064 | UINT_MAX, 0); | 4066 | UINT_MAX, 0); |
4065 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | 4067 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); |
4066 | STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); | ||
4067 | #undef STORE_FUNCTION | 4068 | #undef STORE_FUNCTION |
4068 | 4069 | ||
4069 | #define CFQ_ATTR(name) \ | 4070 | #define CFQ_ATTR(name) \ |
@@ -4081,7 +4082,6 @@ static struct elv_fs_entry cfq_attrs[] = { | |||
4081 | CFQ_ATTR(slice_idle), | 4082 | CFQ_ATTR(slice_idle), |
4082 | CFQ_ATTR(group_idle), | 4083 | CFQ_ATTR(group_idle), |
4083 | CFQ_ATTR(low_latency), | 4084 | CFQ_ATTR(low_latency), |
4084 | CFQ_ATTR(group_isolation), | ||
4085 | __ATTR_NULL | 4085 | __ATTR_NULL |
4086 | }; | 4086 | }; |
4087 | 4087 | ||
@@ -4096,7 +4096,6 @@ static struct elevator_type iosched_cfq = { | |||
4096 | .elevator_add_req_fn = cfq_insert_request, | 4096 | .elevator_add_req_fn = cfq_insert_request, |
4097 | .elevator_activate_req_fn = cfq_activate_request, | 4097 | .elevator_activate_req_fn = cfq_activate_request, |
4098 | .elevator_deactivate_req_fn = cfq_deactivate_request, | 4098 | .elevator_deactivate_req_fn = cfq_deactivate_request, |
4099 | .elevator_queue_empty_fn = cfq_queue_empty, | ||
4100 | .elevator_completed_req_fn = cfq_completed_request, | 4099 | .elevator_completed_req_fn = cfq_completed_request, |
4101 | .elevator_former_req_fn = elv_rb_former_request, | 4100 | .elevator_former_req_fn = elv_rb_former_request, |
4102 | .elevator_latter_req_fn = elv_rb_latter_request, | 4101 | .elevator_latter_req_fn = elv_rb_latter_request, |
diff --git a/block/cfq.h b/block/cfq.h index 54a6d90f8e8c..2a155927e37c 100644 --- a/block/cfq.h +++ b/block/cfq.h | |||
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
16 | } | 16 | } |
17 | 17 | ||
18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
19 | unsigned long time) | 19 | unsigned long time, unsigned long unaccounted_time) |
20 | { | 20 | { |
21 | blkiocg_update_timeslice_used(blkg, time); | 21 | blkiocg_update_timeslice_used(blkg, time, unaccounted_time); |
22 | } | 22 | } |
23 | 23 | ||
24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) | 24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) |
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
85 | unsigned long dequeue) {} | 85 | unsigned long dequeue) {} |
86 | 86 | ||
87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
88 | unsigned long time) {} | 88 | unsigned long time, unsigned long unaccounted_time) {} |
89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | 89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} |
90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, | 90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, |
91 | bool direction, bool sync) {} | 91 | bool direction, bool sync) {} |
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbca7b23..5139c0ea1864 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -326,14 +326,6 @@ dispatch_request: | |||
326 | return 1; | 326 | return 1; |
327 | } | 327 | } |
328 | 328 | ||
329 | static int deadline_queue_empty(struct request_queue *q) | ||
330 | { | ||
331 | struct deadline_data *dd = q->elevator->elevator_data; | ||
332 | |||
333 | return list_empty(&dd->fifo_list[WRITE]) | ||
334 | && list_empty(&dd->fifo_list[READ]); | ||
335 | } | ||
336 | |||
337 | static void deadline_exit_queue(struct elevator_queue *e) | 329 | static void deadline_exit_queue(struct elevator_queue *e) |
338 | { | 330 | { |
339 | struct deadline_data *dd = e->elevator_data; | 331 | struct deadline_data *dd = e->elevator_data; |
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = { | |||
445 | .elevator_merge_req_fn = deadline_merged_requests, | 437 | .elevator_merge_req_fn = deadline_merged_requests, |
446 | .elevator_dispatch_fn = deadline_dispatch_requests, | 438 | .elevator_dispatch_fn = deadline_dispatch_requests, |
447 | .elevator_add_req_fn = deadline_add_request, | 439 | .elevator_add_req_fn = deadline_add_request, |
448 | .elevator_queue_empty_fn = deadline_queue_empty, | ||
449 | .elevator_former_req_fn = elv_rb_former_request, | 440 | .elevator_former_req_fn = elv_rb_former_request, |
450 | .elevator_latter_req_fn = elv_rb_latter_request, | 441 | .elevator_latter_req_fn = elv_rb_latter_request, |
451 | .elevator_init_fn = deadline_init_queue, | 442 | .elevator_init_fn = deadline_init_queue, |
diff --git a/block/elevator.c b/block/elevator.c index 236e93c1f46c..45ca1e34f582 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) | |||
113 | } | 113 | } |
114 | EXPORT_SYMBOL(elv_rq_merge_ok); | 114 | EXPORT_SYMBOL(elv_rq_merge_ok); |
115 | 115 | ||
116 | static inline int elv_try_merge(struct request *__rq, struct bio *bio) | 116 | int elv_try_merge(struct request *__rq, struct bio *bio) |
117 | { | 117 | { |
118 | int ret = ELEVATOR_NO_MERGE; | 118 | int ret = ELEVATOR_NO_MERGE; |
119 | 119 | ||
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) | |||
421 | struct list_head *entry; | 421 | struct list_head *entry; |
422 | int stop_flags; | 422 | int stop_flags; |
423 | 423 | ||
424 | BUG_ON(rq->cmd_flags & REQ_ON_PLUG); | ||
425 | |||
424 | if (q->last_merge == rq) | 426 | if (q->last_merge == rq) |
425 | q->last_merge = NULL; | 427 | q->last_merge = NULL; |
426 | 428 | ||
@@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) | |||
519 | return ELEVATOR_NO_MERGE; | 521 | return ELEVATOR_NO_MERGE; |
520 | } | 522 | } |
521 | 523 | ||
524 | /* | ||
525 | * Attempt to do an insertion back merge. Only check for the case where | ||
526 | * we can append 'rq' to an existing request, so we can throw 'rq' away | ||
527 | * afterwards. | ||
528 | * | ||
529 | * Returns true if we merged, false otherwise | ||
530 | */ | ||
531 | static bool elv_attempt_insert_merge(struct request_queue *q, | ||
532 | struct request *rq) | ||
533 | { | ||
534 | struct request *__rq; | ||
535 | |||
536 | if (blk_queue_nomerges(q)) | ||
537 | return false; | ||
538 | |||
539 | /* | ||
540 | * First try one-hit cache. | ||
541 | */ | ||
542 | if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) | ||
543 | return true; | ||
544 | |||
545 | if (blk_queue_noxmerges(q)) | ||
546 | return false; | ||
547 | |||
548 | /* | ||
549 | * See if our hash lookup can find a potential backmerge. | ||
550 | */ | ||
551 | __rq = elv_rqhash_find(q, blk_rq_pos(rq)); | ||
552 | if (__rq && blk_attempt_req_merge(q, __rq, rq)) | ||
553 | return true; | ||
554 | |||
555 | return false; | ||
556 | } | ||
557 | |||
522 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) | 558 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) |
523 | { | 559 | { |
524 | struct elevator_queue *e = q->elevator; | 560 | struct elevator_queue *e = q->elevator; |
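
elv_attempt_insert_merge(), added above, tries the one-hit last_merge cache first and only then the rqhash back-merge lookup, bailing out early when nomerges or noxmerges is set on the queue. Below is a simplified userspace model of that cascade; the predicates and lookup are stubs standing in for blk_queue_nomerges(), blk_attempt_req_merge() and elv_rqhash_find().

#include <stdbool.h>
#include <stdio.h>

struct req { long pos; };

struct queue_model {
        bool nomerges;          /* all merging disabled */
        bool noxmerges;         /* extended (hash) merging disabled */
        struct req *last_merge; /* one-hit cache */
};

/* Stubs for the real helpers; "merge succeeds" here simply means matching positions. */
static bool try_req_merge(struct req *a, struct req *b)
{
        return a && a->pos == b->pos;
}

static struct req *hash_find(struct queue_model *q, long pos)
{
        (void)q;
        (void)pos;
        return NULL;            /* pretend the hash has no candidate */
}

static bool attempt_insert_merge(struct queue_model *q, struct req *rq)
{
        struct req *cand;

        if (q->nomerges)
                return false;

        /* 1) one-hit cache */
        if (q->last_merge && try_req_merge(q->last_merge, rq))
                return true;

        if (q->noxmerges)
                return false;

        /* 2) hash lookup for a potential back merge */
        cand = hash_find(q, rq->pos);
        return cand && try_req_merge(cand, rq);
}

int main(void)
{
        struct req cached = { 100 }, incoming = { 100 };
        struct queue_model q = { .last_merge = &cached };

        printf("merged: %d\n", attempt_insert_merge(&q, &incoming));    /* 1 */
        return 0;
}
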
@@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
536 | struct request *next) | 572 | struct request *next) |
537 | { | 573 | { |
538 | struct elevator_queue *e = q->elevator; | 574 | struct elevator_queue *e = q->elevator; |
575 | const int next_sorted = next->cmd_flags & REQ_SORTED; | ||
539 | 576 | ||
540 | if (e->ops->elevator_merge_req_fn) | 577 | if (next_sorted && e->ops->elevator_merge_req_fn) |
541 | e->ops->elevator_merge_req_fn(q, rq, next); | 578 | e->ops->elevator_merge_req_fn(q, rq, next); |
542 | 579 | ||
543 | elv_rqhash_reposition(q, rq); | 580 | elv_rqhash_reposition(q, rq); |
544 | elv_rqhash_del(q, next); | ||
545 | 581 | ||
546 | q->nr_sorted--; | 582 | if (next_sorted) { |
583 | elv_rqhash_del(q, next); | ||
584 | q->nr_sorted--; | ||
585 | } | ||
586 | |||
547 | q->last_merge = rq; | 587 | q->last_merge = rq; |
548 | } | 588 | } |
549 | 589 | ||
@@ -570,7 +610,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) | |||
570 | 610 | ||
571 | rq->cmd_flags &= ~REQ_STARTED; | 611 | rq->cmd_flags &= ~REQ_STARTED; |
572 | 612 | ||
573 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); | 613 | __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); |
574 | } | 614 | } |
575 | 615 | ||
576 | void elv_drain_elevator(struct request_queue *q) | 616 | void elv_drain_elevator(struct request_queue *q) |
@@ -602,7 +642,7 @@ void elv_quiesce_start(struct request_queue *q) | |||
602 | */ | 642 | */ |
603 | elv_drain_elevator(q); | 643 | elv_drain_elevator(q); |
604 | while (q->rq.elvpriv) { | 644 | while (q->rq.elvpriv) { |
605 | __blk_run_queue(q, false); | 645 | __blk_run_queue(q); |
606 | spin_unlock_irq(q->queue_lock); | 646 | spin_unlock_irq(q->queue_lock); |
607 | msleep(10); | 647 | msleep(10); |
608 | spin_lock_irq(q->queue_lock); | 648 | spin_lock_irq(q->queue_lock); |
@@ -615,23 +655,28 @@ void elv_quiesce_end(struct request_queue *q) | |||
615 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | 655 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); |
616 | } | 656 | } |
617 | 657 | ||
618 | void elv_insert(struct request_queue *q, struct request *rq, int where) | 658 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
619 | { | 659 | { |
620 | int unplug_it = 1; | ||
621 | |||
622 | trace_block_rq_insert(q, rq); | 660 | trace_block_rq_insert(q, rq); |
623 | 661 | ||
624 | rq->q = q; | 662 | rq->q = q; |
625 | 663 | ||
664 | BUG_ON(rq->cmd_flags & REQ_ON_PLUG); | ||
665 | |||
666 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | ||
667 | /* barriers are scheduling boundary, update end_sector */ | ||
668 | if (rq->cmd_type == REQ_TYPE_FS || | ||
669 | (rq->cmd_flags & REQ_DISCARD)) { | ||
670 | q->end_sector = rq_end_sector(rq); | ||
671 | q->boundary_rq = rq; | ||
672 | } | ||
673 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
674 | (where == ELEVATOR_INSERT_SORT || | ||
675 | where == ELEVATOR_INSERT_SORT_MERGE)) | ||
676 | where = ELEVATOR_INSERT_BACK; | ||
677 | |||
626 | switch (where) { | 678 | switch (where) { |
627 | case ELEVATOR_INSERT_REQUEUE: | 679 | case ELEVATOR_INSERT_REQUEUE: |
628 | /* | ||
629 | * Most requeues happen because of a busy condition, | ||
630 | * don't force unplug of the queue for that case. | ||
631 | * Clear unplug_it and fall through. | ||
632 | */ | ||
633 | unplug_it = 0; | ||
634 | |||
635 | case ELEVATOR_INSERT_FRONT: | 680 | case ELEVATOR_INSERT_FRONT: |
636 | rq->cmd_flags |= REQ_SOFTBARRIER; | 681 | rq->cmd_flags |= REQ_SOFTBARRIER; |
637 | list_add(&rq->queuelist, &q->queue_head); | 682 | list_add(&rq->queuelist, &q->queue_head); |
@@ -651,9 +696,17 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
651 | * with anything. There's no point in delaying queue | 696 | * with anything. There's no point in delaying queue |
652 | * processing. | 697 | * processing. |
653 | */ | 698 | */ |
654 | __blk_run_queue(q, false); | 699 | __blk_run_queue(q); |
655 | break; | 700 | break; |
656 | 701 | ||
702 | case ELEVATOR_INSERT_SORT_MERGE: | ||
703 | /* | ||
704 | * If we succeed in merging this request with one in the | ||
705 | * queue already, we are done - rq has now been freed, | ||
706 | * so no need to do anything further. | ||
707 | */ | ||
708 | if (elv_attempt_insert_merge(q, rq)) | ||
709 | break; | ||
657 | case ELEVATOR_INSERT_SORT: | 710 | case ELEVATOR_INSERT_SORT: |
658 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && | 711 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && |
659 | !(rq->cmd_flags & REQ_DISCARD)); | 712 | !(rq->cmd_flags & REQ_DISCARD)); |
@@ -673,67 +726,28 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
673 | q->elevator->ops->elevator_add_req_fn(q, rq); | 726 | q->elevator->ops->elevator_add_req_fn(q, rq); |
674 | break; | 727 | break; |
675 | 728 | ||
729 | case ELEVATOR_INSERT_FLUSH: | ||
730 | rq->cmd_flags |= REQ_SOFTBARRIER; | ||
731 | blk_insert_flush(rq); | ||
732 | break; | ||
676 | default: | 733 | default: |
677 | printk(KERN_ERR "%s: bad insertion point %d\n", | 734 | printk(KERN_ERR "%s: bad insertion point %d\n", |
678 | __func__, where); | 735 | __func__, where); |
679 | BUG(); | 736 | BUG(); |
680 | } | 737 | } |
681 | |||
682 | if (unplug_it && blk_queue_plugged(q)) { | ||
683 | int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] | ||
684 | - queue_in_flight(q); | ||
685 | |||
686 | if (nrq >= q->unplug_thresh) | ||
687 | __generic_unplug_device(q); | ||
688 | } | ||
689 | } | ||
690 | |||
691 | void __elv_add_request(struct request_queue *q, struct request *rq, int where, | ||
692 | int plug) | ||
693 | { | ||
694 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | ||
695 | /* barriers are scheduling boundary, update end_sector */ | ||
696 | if (rq->cmd_type == REQ_TYPE_FS || | ||
697 | (rq->cmd_flags & REQ_DISCARD)) { | ||
698 | q->end_sector = rq_end_sector(rq); | ||
699 | q->boundary_rq = rq; | ||
700 | } | ||
701 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
702 | where == ELEVATOR_INSERT_SORT) | ||
703 | where = ELEVATOR_INSERT_BACK; | ||
704 | |||
705 | if (plug) | ||
706 | blk_plug_device(q); | ||
707 | |||
708 | elv_insert(q, rq, where); | ||
709 | } | 738 | } |
710 | EXPORT_SYMBOL(__elv_add_request); | 739 | EXPORT_SYMBOL(__elv_add_request); |
711 | 740 | ||
712 | void elv_add_request(struct request_queue *q, struct request *rq, int where, | 741 | void elv_add_request(struct request_queue *q, struct request *rq, int where) |
713 | int plug) | ||
714 | { | 742 | { |
715 | unsigned long flags; | 743 | unsigned long flags; |
716 | 744 | ||
717 | spin_lock_irqsave(q->queue_lock, flags); | 745 | spin_lock_irqsave(q->queue_lock, flags); |
718 | __elv_add_request(q, rq, where, plug); | 746 | __elv_add_request(q, rq, where); |
719 | spin_unlock_irqrestore(q->queue_lock, flags); | 747 | spin_unlock_irqrestore(q->queue_lock, flags); |
720 | } | 748 | } |
721 | EXPORT_SYMBOL(elv_add_request); | 749 | EXPORT_SYMBOL(elv_add_request); |
722 | 750 | ||
723 | int elv_queue_empty(struct request_queue *q) | ||
724 | { | ||
725 | struct elevator_queue *e = q->elevator; | ||
726 | |||
727 | if (!list_empty(&q->queue_head)) | ||
728 | return 0; | ||
729 | |||
730 | if (e->ops->elevator_queue_empty_fn) | ||
731 | return e->ops->elevator_queue_empty_fn(q); | ||
732 | |||
733 | return 1; | ||
734 | } | ||
735 | EXPORT_SYMBOL(elv_queue_empty); | ||
736 | |||
737 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) | 751 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) |
738 | { | 752 | { |
739 | struct elevator_queue *e = q->elevator; | 753 | struct elevator_queue *e = q->elevator; |
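
In the elevator.c hunks above, elv_insert() disappears into __elv_add_request(): the soft-barrier bookkeeping moves to the top, SORT and SORT_MERGE insertions of requests without REQ_ELVPRIV are demoted to BACK, and FLUSH and SORT_MERGE gain cases in the switch. The sketch below is a compact userspace model of just the insertion-point remapping; the constants and flag names are illustrative.

#include <stdbool.h>
#include <stdio.h>

enum where {
        INSERT_REQUEUE, INSERT_FRONT, INSERT_BACK,
        INSERT_SORT, INSERT_SORT_MERGE, INSERT_FLUSH
};

struct rq_model {
        bool softbarrier;       /* REQ_SOFTBARRIER: scheduling boundary */
        bool elvpriv;           /* REQ_ELVPRIV: carries elevator-private data */
};

/*
 * Mirror of the remapping at the top of __elv_add_request(): soft barriers
 * keep their insertion point, while sort-style insertions of requests
 * without elevator-private data are demoted to a plain back insertion.
 */
static enum where remap_where(const struct rq_model *rq, enum where where)
{
        if (rq->softbarrier)
                return where;
        if (!rq->elvpriv &&
            (where == INSERT_SORT || where == INSERT_SORT_MERGE))
                return INSERT_BACK;
        return where;
}

int main(void)
{
        struct rq_model plain = { .softbarrier = false, .elvpriv = false };
        struct rq_model sched = { .softbarrier = false, .elvpriv = true };

        printf("%d\n", remap_where(&plain, INSERT_SORT));       /* 2 = INSERT_BACK */
        printf("%d\n", remap_where(&sched, INSERT_SORT_MERGE)); /* 4 = unchanged */
        return 0;
}
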
@@ -759,7 +773,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) | |||
759 | if (e->ops->elevator_set_req_fn) | 773 | if (e->ops->elevator_set_req_fn) |
760 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); | 774 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); |
761 | 775 | ||
762 | rq->elevator_private = NULL; | 776 | rq->elevator_private[0] = NULL; |
763 | return 0; | 777 | return 0; |
764 | } | 778 | } |
765 | 779 | ||
@@ -785,6 +799,8 @@ void elv_abort_queue(struct request_queue *q) | |||
785 | { | 799 | { |
786 | struct request *rq; | 800 | struct request *rq; |
787 | 801 | ||
802 | blk_abort_flushes(q); | ||
803 | |||
788 | while (!list_empty(&q->queue_head)) { | 804 | while (!list_empty(&q->queue_head)) { |
789 | rq = list_entry_rq(q->queue_head.next); | 805 | rq = list_entry_rq(q->queue_head.next); |
790 | rq->cmd_flags |= REQ_QUIET; | 806 | rq->cmd_flags |= REQ_QUIET; |
diff --git a/block/genhd.c b/block/genhd.c index cbf1112a885c..2dd988723d73 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -739,7 +739,7 @@ void __init printk_all_partitions(void) | |||
739 | 739 | ||
740 | /* | 740 | /* |
741 | * Don't show empty devices or things that have been | 741 | * Don't show empty devices or things that have been |
742 | * surpressed | 742 | * suppressed |
743 | */ | 743 | */ |
744 | if (get_capacity(disk) == 0 || | 744 | if (get_capacity(disk) == 0 || |
745 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) | 745 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) |
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1158 | "%u %lu %lu %llu %u %u %u %u\n", | 1158 | "%u %lu %lu %llu %u %u %u %u\n", |
1159 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), | 1159 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), |
1160 | disk_name(gp, hd->partno, buf), | 1160 | disk_name(gp, hd->partno, buf), |
1161 | part_stat_read(hd, ios[0]), | 1161 | part_stat_read(hd, ios[READ]), |
1162 | part_stat_read(hd, merges[0]), | 1162 | part_stat_read(hd, merges[READ]), |
1163 | (unsigned long long)part_stat_read(hd, sectors[0]), | 1163 | (unsigned long long)part_stat_read(hd, sectors[READ]), |
1164 | jiffies_to_msecs(part_stat_read(hd, ticks[0])), | 1164 | jiffies_to_msecs(part_stat_read(hd, ticks[READ])), |
1165 | part_stat_read(hd, ios[1]), | 1165 | part_stat_read(hd, ios[WRITE]), |
1166 | part_stat_read(hd, merges[1]), | 1166 | part_stat_read(hd, merges[WRITE]), |
1167 | (unsigned long long)part_stat_read(hd, sectors[1]), | 1167 | (unsigned long long)part_stat_read(hd, sectors[WRITE]), |
1168 | jiffies_to_msecs(part_stat_read(hd, ticks[1])), | 1168 | jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), |
1169 | part_in_flight(hd), | 1169 | part_in_flight(hd), |
1170 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), | 1170 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), |
1171 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) | 1171 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) |
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk) | |||
1494 | void disk_unblock_events(struct gendisk *disk) | 1494 | void disk_unblock_events(struct gendisk *disk) |
1495 | { | 1495 | { |
1496 | if (disk->ev) | 1496 | if (disk->ev) |
1497 | __disk_unblock_events(disk, true); | 1497 | __disk_unblock_events(disk, false); |
1498 | } | 1498 | } |
1499 | 1499 | ||
1500 | /** | 1500 | /** |
@@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work) | |||
1588 | 1588 | ||
1589 | spin_unlock_irq(&ev->lock); | 1589 | spin_unlock_irq(&ev->lock); |
1590 | 1590 | ||
1591 | /* tell userland about new events */ | 1591 | /* |
1592 | * Tell userland about new events. Only the events listed in | ||
1593 | * @disk->events are reported. Unlisted events are processed the | ||
1594 | * same internally but never get reported to userland. | ||
1595 | */ | ||
1592 | for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) | 1596 | for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) |
1593 | if (events & (1 << i)) | 1597 | if (events & disk->events & (1 << i)) |
1594 | envp[nr_events++] = disk_uevents[i]; | 1598 | envp[nr_events++] = disk_uevents[i]; |
1595 | 1599 | ||
1596 | if (nr_events) | 1600 | if (nr_events) |
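
The disk_events_workfn() hunk above masks the pending events with disk->events before building the uevent environment, so only event bits the driver actually advertises reach userland. A small userspace sketch of that filtering follows; the event names and mask layout are illustrative.

#include <stdio.h>

#define EV_MEDIA_CHANGE         (1u << 0)
#define EV_EJECT_REQUEST        (1u << 1)

static const char *const uevent_strings[] = {
        "DISK_MEDIA_CHANGE=1",
        "DISK_EJECT_REQUEST=1",
};

/* Build the uevent list from pending events, restricted to the advertised mask. */
static int build_uevents(unsigned int pending, unsigned int listed,
                         const char *envp[], int max)
{
        int nr = 0;

        for (unsigned int i = 0; i < 2 && nr < max; i++)
                if (pending & listed & (1u << i))
                        envp[nr++] = uevent_strings[i];
        return nr;
}

int main(void)
{
        const char *envp[4];
        /* both events pending, but the disk only advertises media change */
        int nr = build_uevents(EV_MEDIA_CHANGE | EV_EJECT_REQUEST,
                               EV_MEDIA_CHANGE, envp, 4);

        for (int i = 0; i < nr; i++)
                printf("%s\n", envp[i]);        /* only DISK_MEDIA_CHANGE=1 */
        return 0;
}
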
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b38cd37..06389e9ef96d 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) | |||
39 | list_add_tail(&rq->queuelist, &nd->queue); | 39 | list_add_tail(&rq->queuelist, &nd->queue); |
40 | } | 40 | } |
41 | 41 | ||
42 | static int noop_queue_empty(struct request_queue *q) | ||
43 | { | ||
44 | struct noop_data *nd = q->elevator->elevator_data; | ||
45 | |||
46 | return list_empty(&nd->queue); | ||
47 | } | ||
48 | |||
49 | static struct request * | 42 | static struct request * |
50 | noop_former_request(struct request_queue *q, struct request *rq) | 43 | noop_former_request(struct request_queue *q, struct request *rq) |
51 | { | 44 | { |
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = { | |||
90 | .elevator_merge_req_fn = noop_merged_requests, | 83 | .elevator_merge_req_fn = noop_merged_requests, |
91 | .elevator_dispatch_fn = noop_dispatch, | 84 | .elevator_dispatch_fn = noop_dispatch, |
92 | .elevator_add_req_fn = noop_add_request, | 85 | .elevator_add_req_fn = noop_add_request, |
93 | .elevator_queue_empty_fn = noop_queue_empty, | ||
94 | .elevator_former_req_fn = noop_former_request, | 86 | .elevator_former_req_fn = noop_former_request, |
95 | .elevator_latter_req_fn = noop_latter_request, | 87 | .elevator_latter_req_fn = noop_latter_request, |
96 | .elevator_init_fn = noop_init_queue, | 88 | .elevator_init_fn = noop_init_queue, |