Diffstat (limited to 'block')
 block/blk-cgroup.c   |  14
 block/blk-cgroup.h   |   3
 block/blk-core.c     |  45
 block/blk-ioc.c      |   2
 block/blk-merge.c    |   8
 block/blk-settings.c | 131
 block/blk-sysfs.c    |  11
 block/cfq-iosched.c  | 192
 block/elevator.c     |  11
 9 files changed, 161 insertions(+), 256 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7dbbaf5fb3e..c85d74cae200 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -23,20 +23,6 @@ static LIST_HEAD(blkio_list);
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
-bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
-{
-	if (!css_tryget(&blkcg->css))
-		return false;
-	return true;
-}
-EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
-
-void blkiocg_css_put(struct blkio_cgroup *blkcg)
-{
-	css_put(&blkcg->css);
-}
-EXPORT_SYMBOL_GPL(blkiocg_css_put);
-
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4d316df863b4..84bf745fa775 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -43,9 +43,6 @@ struct blkio_group {
 	unsigned long sectors;
 };
 
-extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
-extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
-
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
 					unsigned int weight);
diff --git a/block/blk-core.c b/block/blk-core.c
index 718897e6d37f..9fe174dc74d1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1147,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
  */
 static inline bool queue_should_plug(struct request_queue *q)
 {
-	return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
+	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
 }
 
 static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1490,9 +1490,9 @@ end_io:
 /*
  * We only want one ->make_request_fn to be active at a time,
  * else stack usage with stacked devices could be a problem.
- * So use current->bio_{list,tail} to keep a list of requests
+ * So use current->bio_list to keep a list of requests
  * submited by a make_request_fn function.
- * current->bio_tail is also used as a flag to say if
+ * current->bio_list is also used as a flag to say if
  * generic_make_request is currently active in this task or not.
  * If it is NULL, then no make_request is active.  If it is non-NULL,
  * then a make_request is active, and new requests should be added
@@ -1500,11 +1500,11 @@ end_io:
  */
 void generic_make_request(struct bio *bio)
 {
-	if (current->bio_tail) {
+	struct bio_list bio_list_on_stack;
+
+	if (current->bio_list) {
 		/* make_request is active */
-		*(current->bio_tail) = bio;
-		bio->bi_next = NULL;
-		current->bio_tail = &bio->bi_next;
+		bio_list_add(current->bio_list, bio);
 		return;
 	}
 	/* following loop may be a bit non-obvious, and so deserves some
@@ -1512,30 +1512,27 @@ void generic_make_request(struct bio *bio)
 	 * Before entering the loop, bio->bi_next is NULL (as all callers
 	 * ensure that) so we have a list with a single bio.
 	 * We pretend that we have just taken it off a longer list, so
-	 * we assign bio_list to the next (which is NULL) and bio_tail
-	 * to &bio_list, thus initialising the bio_list of new bios to be
+	 * we assign bio_list to a pointer to the bio_list_on_stack,
+	 * thus initialising the bio_list of new bios to be
 	 * added.  __generic_make_request may indeed add some more bios
 	 * through a recursive call to generic_make_request.  If it
 	 * did, we find a non-NULL value in bio_list and re-enter the loop
 	 * from the top.  In this case we really did just take the bio
-	 * of the top of the list (no pretending) and so fixup bio_list and
-	 * bio_tail or bi_next, and call into __generic_make_request again.
+	 * of the top of the list (no pretending) and so remove it from
+	 * bio_list, and call into __generic_make_request again.
 	 *
 	 * The loop was structured like this to make only one call to
 	 * __generic_make_request (which is important as it is large and
 	 * inlined) and to keep the structure simple.
 	 */
 	BUG_ON(bio->bi_next);
+	bio_list_init(&bio_list_on_stack);
+	current->bio_list = &bio_list_on_stack;
 	do {
-		current->bio_list = bio->bi_next;
-		if (bio->bi_next == NULL)
-			current->bio_tail = &current->bio_list;
-		else
-			bio->bi_next = NULL;
 		__generic_make_request(bio);
-		bio = current->bio_list;
+		bio = bio_list_pop(current->bio_list);
 	} while (bio);
-	current->bio_tail = NULL; /* deactivate */
+	current->bio_list = NULL; /* deactivate */
 }
 EXPORT_SYMBOL(generic_make_request);
 
@@ -1617,8 +1614,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 	 * limitation.
 	 */
 	blk_recalc_rq_segments(rq);
-	if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
-	    rq->nr_phys_segments > queue_max_hw_segments(q)) {
+	if (rq->nr_phys_segments > queue_max_segments(q)) {
 		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
 		return -EIO;
 	}
@@ -1859,15 +1855,8 @@ void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq)) {
+	if (blk_account_rq(rq))
 		q->in_flight[rq_is_sync(rq)]++;
-		/*
-		 * Mark this device as supporting hardware queuing, if
-		 * we have more IOs in flight than 4.
-		 */
-		if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
-			set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
-	}
 }
 
 /**
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 98e6bf61b0ac..3f65c8aadb2f 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -91,7 +91,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	spin_lock_init(&ret->lock);
 	ret->ioprio_changed = 0;
 	ret->ioprio = 0;
-	ret->last_waited = jiffies; /* doesn't matter... */
+	ret->last_waited = 0; /* doesn't matter... */
 	ret->nr_batch_requests = 0;	/* because this is 0 */
 	INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ret->cic_list);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99cb5cf1f447..5e7dc9973458 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
-	    req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		total_phys_segments--;
 	}
 
-	if (total_phys_segments > queue_max_phys_segments(q))
-		return 0;
-
-	if (total_phys_segments > queue_max_hw_segments(q))
+	if (total_phys_segments > queue_max_segments(q))
 		return 0;
 
 	/* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5eeb9e0d256e..31e7a9375c13 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,10 +91,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
-	lim->max_phys_segments = MAX_PHYS_SEGMENTS;
-	lim->max_hw_segments = MAX_HW_SEGMENTS;
+	lim->max_segments = BLK_MAX_SEGMENTS;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
-	lim->max_segment_size = MAX_SEGMENT_SIZE;
+	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
 	lim->max_discard_sectors = 0;
@@ -154,7 +153,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->unplug_timer.data = (unsigned long)q;
 
 	blk_set_default_limits(&q->limits);
-	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
+	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 
 	/*
 	 * If the caller didn't supply a lock, fall back to our embedded
@@ -210,37 +209,32 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
 /**
- * blk_queue_max_sectors - set max sectors for a request for this queue
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
- * @max_sectors:  max sectors in the usual 512b unit
+ * @max_hw_sectors:  max hardware sectors in the usual 512b unit
  *
  * Description:
- *    Enables a low level driver to set an upper limit on the size of
- *    received requests.
+ *    Enables a low level driver to set a hard upper limit,
+ *    max_hw_sectors, on the size of requests.  max_hw_sectors is set by
+ *    the device driver based upon the combined capabilities of I/O
+ *    controller and storage device.
+ *
+ *    max_sectors is a soft limit imposed by the block layer for
+ *    filesystem type requests.  This value can be overridden on a
+ *    per-device basis in /sys/block/<device>/queue/max_sectors_kb.
+ *    The soft limit can not exceed max_hw_sectors.
  **/
-void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
 {
-	if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
-		max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+	if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
+		max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
 		printk(KERN_INFO "%s: set to minimum %d\n",
-		       __func__, max_sectors);
+		       __func__, max_hw_sectors);
 	}
 
-	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors;
-	else {
-		q->limits.max_sectors = BLK_DEF_MAX_SECTORS;
-		q->limits.max_hw_sectors = max_sectors;
-	}
-}
-EXPORT_SYMBOL(blk_queue_max_sectors);
-
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
-{
-	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS;
-	else
-		q->limits.max_hw_sectors = max_sectors;
+	q->limits.max_hw_sectors = max_hw_sectors;
+	q->limits.max_sectors = min_t(unsigned int, max_hw_sectors,
+				      BLK_DEF_MAX_SECTORS);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
@@ -257,17 +251,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
 /**
- * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
  * @max_segments:  max number of segments
  *
  * Description:
  *    Enables a low level driver to set an upper limit on the number of
- *    physical data segments in a request.  This would be the largest sized
- *    scatter list the driver could handle.
+ *    hw data segments in a request.
  **/
-void blk_queue_max_phys_segments(struct request_queue *q,
-				 unsigned short max_segments)
+void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
 {
 	if (!max_segments) {
 		max_segments = 1;
@@ -275,33 +267,9 @@ void blk_queue_max_phys_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->limits.max_phys_segments = max_segments;
+	q->limits.max_segments = max_segments;
 }
-EXPORT_SYMBOL(blk_queue_max_phys_segments);
-
-/**
- * blk_queue_max_hw_segments - set max hw segments for a request for this queue
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    hw data segments in a request.  This would be the largest number of
- *    address/length pairs the host adapter can actually give at once
- *    to the device.
- **/
-void blk_queue_max_hw_segments(struct request_queue *q,
-			       unsigned short max_segments)
-{
-	if (!max_segments) {
-		max_segments = 1;
-		printk(KERN_INFO "%s: set to minimum %d\n",
-		       __func__, max_segments);
-	}
-
-	q->limits.max_hw_segments = max_segments;
-}
-EXPORT_SYMBOL(blk_queue_max_hw_segments);
+EXPORT_SYMBOL(blk_queue_max_segments);
 
 /**
  * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
@@ -507,7 +475,7 @@ static unsigned int lcm(unsigned int a, unsigned int b)
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top device)
  * @b:  the underlying queue limits (bottom, component device)
- * @offset:  offset to beginning of data within component device
+ * @start:  first data sector within component device
  *
  * Description:
  *    This function is used by stacking drivers like MD and DM to ensure
@@ -525,10 +493,9 @@ static unsigned int lcm(unsigned int a, unsigned int b)
  *    the alignment_offset is undefined.
  */
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
-		     sector_t offset)
+		     sector_t start)
 {
-	sector_t alignment;
-	unsigned int top, bottom, ret = 0;
+	unsigned int top, bottom, alignment, ret = 0;
 
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
@@ -537,18 +504,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
 
-	t->max_phys_segments = min_not_zero(t->max_phys_segments,
-					    b->max_phys_segments);
-
-	t->max_hw_segments = min_not_zero(t->max_hw_segments,
-					  b->max_hw_segments);
+	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
 
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
 
 	t->misaligned |= b->misaligned;
 
-	alignment = queue_limit_alignment_offset(b, offset);
+	alignment = queue_limit_alignment_offset(b, start);
 
 	/* Bottom device has different alignment.  Check that it is
 	 * compatible with the current top alignment.
@@ -611,11 +574,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	/* Discard alignment and granularity */
 	if (b->discard_granularity) {
-		unsigned int granularity = b->discard_granularity;
-		offset &= granularity - 1;
-
-		alignment = (granularity + b->discard_alignment - offset)
-			& (granularity - 1);
+		alignment = queue_limit_discard_alignment(b, start);
 
 		if (t->discard_granularity != 0 &&
 		    t->discard_alignment != alignment) {
@@ -657,7 +616,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
 
 	start += get_start_sect(bdev);
 
-	return blk_stack_limits(t, &bq->limits, start << 9);
+	return blk_stack_limits(t, &bq->limits, start);
 }
 EXPORT_SYMBOL(bdev_stack_limits);
 
@@ -668,9 +627,8 @@ EXPORT_SYMBOL(bdev_stack_limits);
  * @offset: offset to beginning of data within component device
  *
  * Description:
- *    Merges the limits for two queues.  Returns 0 if alignment
- *    didn't change.  Returns -1 if adding the bottom device caused
- *    misalignment.
+ *    Merges the limits for a top level gendisk and a bottom level
+ *    block_device.
  */
 void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 		       sector_t offset)
@@ -678,9 +636,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 	struct request_queue *t = disk->queue;
 	struct request_queue *b = bdev_get_queue(bdev);
 
-	offset += get_start_sect(bdev) << 9;
-
-	if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+	if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
 		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
 
 		disk_name(disk, 0, top);
@@ -752,22 +708,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad);
  * does is adjust the queue so that the buf is always appended
  * silently to the scatterlist.
  *
- * Note: This routine adjusts max_hw_segments to make room for
- * appending the drain buffer.  If you call
- * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
- * calling this routine, you must set the limit to one fewer than your
- * device can support otherwise there won't be room for the drain
- * buffer.
+ * Note: This routine adjusts max_hw_segments to make room for appending
+ * the drain buffer.  If you call blk_queue_max_segments() after calling
+ * this routine, you must set the limit to one fewer than your device
+ * can support otherwise there won't be room for the drain buffer.
  */
 int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size)
 {
-	if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
+	if (queue_max_segments(q) < 2)
 		return -EINVAL;
 	/* make room for appending the drain */
-	blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
-	blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
+	blk_queue_max_segments(q, queue_max_segments(q) - 1);
 	q->dma_drain_needed = dma_drain_needed;
 	q->dma_drain_buffer = buf;
 	q->dma_drain_size = size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8606c9543fdd..e85442415db3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -189,7 +189,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page,
 
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(blk_queue_nomerges(q), page);
+	return queue_var_show((blk_queue_nomerges(q) << 1) |
+			       blk_queue_noxmerges(q), page);
 }
 
 static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
@@ -199,10 +200,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 	ssize_t ret = queue_var_store(&nm, page, count);
 
 	spin_lock_irq(q->queue_lock);
-	if (nm)
+	queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+	queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
+	if (nm == 2)
 		queue_flag_set(QUEUE_FLAG_NOMERGES, q);
-	else
-		queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+	else if (nm)
+		queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 17b768d0d42f..dee9d9378fee 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -19,7 +19,7 @@
  * tunables
  */
 /* max queue in one round of service */
-static const int cfq_quantum = 4;
+static const int cfq_quantum = 8;
 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
 /* maximum backwards seek, in KiB */
 static const int cfq_back_max = 16 * 1024;
@@ -42,16 +42,14 @@ static const int cfq_hist_divisor = 4;
  */
 #define CFQ_MIN_TT		(2)
 
-/*
- * Allow merged cfqqs to perform this amount of seeky I/O before
- * deciding to break the queues up again.
- */
-#define CFQQ_COOP_TOUT		(HZ)
-
 #define CFQ_SLICE_SCALE		(5)
 #define CFQ_HW_QUEUE_MIN	(5)
 #define CFQ_SERVICE_SHIFT	12
 
+#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
+#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
+#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
+
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private)
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
@@ -80,11 +78,12 @@ struct cfq_rb_root {
 	struct rb_root rb;
 	struct rb_node *left;
 	unsigned count;
+	unsigned total_weight;
 	u64 min_vdisktime;
 	struct rb_node *active;
-	unsigned total_weight;
 };
-#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
+#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
+			.count = 0, .min_vdisktime = 0, }
 
 /*
  * Per process-grouping structure
@@ -118,11 +117,11 @@ struct cfq_queue {
 	/* time when queue got scheduled in to dispatch first request. */
 	unsigned long dispatch_start;
 	unsigned int allocated_slice;
+	unsigned int slice_dispatch;
 	/* time when first request from queue completed and slice started. */
 	unsigned long slice_start;
 	unsigned long slice_end;
 	long slice_resid;
-	unsigned int slice_dispatch;
 
 	/* pending metadata requests */
 	int meta_pending;
@@ -133,14 +132,11 @@ struct cfq_queue {
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class, org_ioprio_class;
 
-	unsigned int seek_samples;
-	u64 seek_total;
-	sector_t seek_mean;
-	sector_t last_request_pos;
-	unsigned long seeky_start;
-
 	pid_t pid;
 
+	u32 seek_history;
+	sector_t last_request_pos;
+
 	struct cfq_rb_root *service_tree;
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
@@ -227,8 +223,8 @@ struct cfq_data {
 
 	unsigned int busy_queues;
 
-	int rq_in_driver[2];
-	int sync_flight;
+	int rq_in_driver;
+	int rq_in_flight[2];
 
 	/*
 	 * queue-depth detection
@@ -314,6 +310,7 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
 	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
 	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
+	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
 	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
 	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
 };
@@ -342,6 +339,7 @@ CFQ_CFQQ_FNS(prio_changed);
 CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 CFQ_CFQQ_FNS(coop);
+CFQ_CFQQ_FNS(split_coop);
 CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
@@ -419,11 +417,6 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 					       struct io_context *);
 
-static inline int rq_in_driver(struct cfq_data *cfqd)
-{
-	return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
-}
-
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
 					    bool is_sync)
 {
@@ -953,10 +946,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
 
-	/* Do we need to take this reference */
-	if (!blkiocg_css_tryget(blkcg))
-		return NULL;;
-
 	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
 	if (cfqg || !create)
 		goto done;
@@ -987,7 +976,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 		hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 
 done:
-	blkiocg_css_put(blkcg);
 	return cfqg;
 }
 
@@ -1422,9 +1410,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
-	cfqd->rq_in_driver[rq_is_sync(rq)]++;
+	cfqd->rq_in_driver++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
-						rq_in_driver(cfqd));
+						cfqd->rq_in_driver);
 
 	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
@@ -1432,12 +1420,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	const int sync = rq_is_sync(rq);
 
-	WARN_ON(!cfqd->rq_in_driver[sync]);
-	cfqd->rq_in_driver[sync]--;
+	WARN_ON(!cfqd->rq_in_driver);
+	cfqd->rq_in_driver--;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
-						rq_in_driver(cfqd));
+						cfqd->rq_in_driver);
 }
 
 static void cfq_remove_request(struct request *rq)
@@ -1566,6 +1553,15 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfq_clear_cfqq_wait_busy(cfqq);
 
 	/*
+	 * If this cfqq is shared between multiple processes, check to
+	 * make sure that those processes are still issuing I/Os within
+	 * the mean seek distance.  If not, it may be time to break the
+	 * queues apart again.
+	 */
+	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
+		cfq_mark_cfqq_split_coop(cfqq);
+
+	/*
 	 * store what was left of this slice, if the queue idled/timed out
 	 */
 	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
@@ -1663,22 +1659,10 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 	return cfqd->last_position - blk_rq_pos(rq);
 }
 
-#define CFQQ_SEEK_THR		8 * 1024
-#define CFQQ_SEEKY(cfqq)	((cfqq)->seek_mean > CFQQ_SEEK_THR)
-
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			       struct request *rq, bool for_preempt)
 {
-	sector_t sdist = cfqq->seek_mean;
-
-	if (!sample_valid(cfqq->seek_samples))
-		sdist = CFQQ_SEEK_THR;
-
-	/* if seek_mean is big, using it as close criteria is meaningless */
-	if (sdist > CFQQ_SEEK_THR && !for_preempt)
-		sdist = CFQQ_SEEK_THR;
-
-	return cfq_dist_from_last(cfqd, rq) <= sdist;
+	return cfq_dist_from_last(cfqd, rq) <= CFQQ_SEEK_THR;
 }
 
 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
@@ -1874,8 +1858,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
 
-	if (cfq_cfqq_sync(cfqq))
-		cfqd->sync_flight++;
+	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
 }
 
@@ -2215,6 +2198,19 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	return dispatched;
 }
 
+static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
+	struct cfq_queue *cfqq)
+{
+	/* the queue hasn't finished any request, can't estimate */
+	if (cfq_cfqq_slice_new(cfqq))
+		return 1;
+	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
+		cfqq->slice_end))
+		return 1;
+
+	return 0;
+}
+
 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned int max_dispatch;
@@ -2222,16 +2218,16 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	/*
 	 * Drain async requests before we start sync IO
 	 */
-	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
 		return false;
 
 	/*
 	 * If this is an async queue and we have sync IO in flight, let it wait
 	 */
-	if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
+	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
 		return false;
 
-	max_dispatch = cfqd->cfq_quantum;
+	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
 	if (cfq_class_idle(cfqq))
 		max_dispatch = 1;
 
@@ -2248,13 +2244,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	/*
 	 * We have other queues, don't allow more IO from this one
 	 */
-	if (cfqd->busy_queues > 1)
+	if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
 		return false;
 
 	/*
 	 * Sole queue user, no limit
 	 */
-	max_dispatch = -1;
+	if (cfqd->busy_queues == 1)
+		max_dispatch = -1;
+	else
+		/*
+		 * Normally we start throttling cfqq when cfq_quantum/2
+		 * requests have been dispatched. But we can drive
+		 * deeper queue depths at the beginning of slice
+		 * subjected to upper limit of cfq_quantum.
+		 * */
+		max_dispatch = cfqd->cfq_quantum;
 	}
 
 	/*
@@ -2976,43 +2981,20 @@ static void
 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct request *rq)
 {
-	sector_t sdist;
-	u64 total;
-
-	if (!cfqq->last_request_pos)
-		sdist = 0;
-	else if (cfqq->last_request_pos < blk_rq_pos(rq))
-		sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
-	else
-		sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+	sector_t sdist = 0;
+	sector_t n_sec = blk_rq_sectors(rq);
+	if (cfqq->last_request_pos) {
+		if (cfqq->last_request_pos < blk_rq_pos(rq))
+			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
+		else
+			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+	}
 
-	/*
-	 * Don't allow the seek distance to get too large from the
-	 * odd fragment, pagein, etc
-	 */
-	if (cfqq->seek_samples <= 60) /* second&third seek */
-		sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
+	cfqq->seek_history <<= 1;
+	if (blk_queue_nonrot(cfqd->queue))
+		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
 	else
-		sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
-
-	cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
-	cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
-	total = cfqq->seek_total + (cfqq->seek_samples/2);
-	do_div(total, cfqq->seek_samples);
-	cfqq->seek_mean = (sector_t)total;
-
-	/*
-	 * If this cfqq is shared between multiple processes, check to
-	 * make sure that those processes are still issuing I/Os within
-	 * the mean seek distance.  If not, it may be time to break the
-	 * queues apart again.
-	 */
-	if (cfq_cfqq_coop(cfqq)) {
-		if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
-			cfqq->seeky_start = jiffies;
-		else if (!CFQQ_SEEKY(cfqq))
-			cfqq->seeky_start = 0;
-	}
+		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
 }
 
 /*
@@ -3037,8 +3019,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfq_mark_cfqq_deep(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
-	    && CFQQ_SEEKY(cfqq)))
+	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -3224,14 +3205,14 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
-	if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
-		cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
+	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
+		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
 
 	if (cfqd->hw_tag == 1)
 		return;
 
 	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
-	    rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
+	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
 		return;
 
 	/*
@@ -3241,7 +3222,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
 	 */
 	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
 	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
-	    CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
+	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
 		return;
 
 	if (cfqd->hw_tag_samples++ < 50)
@@ -3294,13 +3275,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 
 	cfq_update_hw_tag(cfqd);
 
-	WARN_ON(!cfqd->rq_in_driver[sync]);
+	WARN_ON(!cfqd->rq_in_driver);
 	WARN_ON(!cfqq->dispatched);
-	cfqd->rq_in_driver[sync]--;
+	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 
-	if (cfq_cfqq_sync(cfqq))
-		cfqd->sync_flight--;
+	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
 		RQ_CIC(rq)->last_end_request = now;
@@ -3354,7 +3334,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		}
 	}
 
-	if (!rq_in_driver(cfqd))
+	if (!cfqd->rq_in_driver)
 		cfq_schedule_dispatch(cfqd);
 }
 
@@ -3453,14 +3433,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
 	return cic_to_cfqq(cic, 1);
 }
 
-static int should_split_cfqq(struct cfq_queue *cfqq)
-{
-	if (cfqq->seeky_start &&
-	    time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
-		return 1;
-	return 0;
-}
-
 /*
  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
  * was the last process referring to said cfqq.
@@ -3469,9 +3441,9 @@ static struct cfq_queue *
 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
 {
 	if (cfqq_process_refs(cfqq) == 1) {
-		cfqq->seeky_start = 0;
 		cfqq->pid = current->pid;
 		cfq_clear_cfqq_coop(cfqq);
+		cfq_clear_cfqq_split_coop(cfqq);
 		return cfqq;
 	}
 
@@ -3510,7 +3482,7 @@ new_queue:
 	/*
 	 * If the queue was seeky for too long, break it apart.
 	 */
-	if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
+	if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
 		cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
 		cfqq = split_cfqq(cic, cfqq);
 		if (!cfqq)
diff --git a/block/elevator.c b/block/elevator.c
index 9ad5ccc4c5ee..ee3a883840f2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -474,6 +474,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	int ret;
 
 	/*
+	 * Levels of merges:
+	 * 	nomerges:  No merges at all attempted
+	 * 	noxmerges: Only simple one-hit cache try
+	 * 	merges:	   All merge tries attempted
+	 */
+	if (blk_queue_nomerges(q))
+		return ELEVATOR_NO_MERGE;
+
+	/*
 	 * First try one-hit cache.
 	 */
 	if (q->last_merge) {
@@ -484,7 +493,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		}
 	}
 
-	if (blk_queue_nomerges(q))
+	if (blk_queue_noxmerges(q))
 		return ELEVATOR_NO_MERGE;
 
 	/*