author    Ingo Molnar <mingo@elte.hu>  2009-04-24 04:11:18 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-04-24 04:11:23 -0400
commit    416dfdcdb894432547ead4fcb9fa6a36b396059e (patch)
tree      8033fdda07397a59c5fa98c88927040906ce6c1a /block
parent    56449f437add737a1e5e1cb7e00f63ac8ead1938 (diff)
parent    091069740304c979f957ceacec39c461d0192158 (diff)
Merge commit 'v2.6.30-rc3' into tracing/hw-branch-tracing
Conflicts:
	arch/x86/kernel/ptrace.c

Merge reason: fix the conflict above, and also pick up the CONFIG_BROKEN
dependency change from upstream so that we can remove it here.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'block')
-rw-r--r--  block/as-iosched.c    116
-rw-r--r--  block/blk-barrier.c     3
-rw-r--r--  block/blk-core.c       16
-rw-r--r--  block/blk-merge.c      29
-rw-r--r--  block/blk-settings.c    2
-rw-r--r--  block/blk-sysfs.c       4
-rw-r--r--  block/blk.h            14
-rw-r--r--  block/cfq-iosched.c   464
-rw-r--r--  block/elevator.c       42
-rw-r--r--  block/ioctl.c           2
-rw-r--r--  block/scsi_ioctl.c      6
11 files changed, 453 insertions, 245 deletions
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 631f6f44460a..c48fa670d221 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -17,9 +17,6 @@
17#include <linux/rbtree.h> 17#include <linux/rbtree.h>
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19 19
20#define REQ_SYNC 1
21#define REQ_ASYNC 0
22
23/* 20/*
24 * See Documentation/block/as-iosched.txt 21 * See Documentation/block/as-iosched.txt
25 */ 22 */
@@ -93,7 +90,7 @@ struct as_data {
93 struct list_head fifo_list[2]; 90 struct list_head fifo_list[2];
94 91
95 struct request *next_rq[2]; /* next in sort order */ 92 struct request *next_rq[2]; /* next in sort order */
96 sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ 93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
97 94
98 unsigned long exit_prob; /* probability a task will exit while 95 unsigned long exit_prob; /* probability a task will exit while
99 being waited on */ 96 being waited on */
@@ -109,7 +106,7 @@ struct as_data {
109 unsigned long last_check_fifo[2]; 106 unsigned long last_check_fifo[2];
110 int changed_batch; /* 1: waiting for old batch to end */ 107 int changed_batch; /* 1: waiting for old batch to end */
111 int new_batch; /* 1: waiting on first read complete */ 108 int new_batch; /* 1: waiting on first read complete */
112 int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */ 109 int batch_data_dir; /* current batch SYNC / ASYNC */
113 int write_batch_count; /* max # of reqs in a write batch */ 110 int write_batch_count; /* max # of reqs in a write batch */
114 int current_write_count; /* how many requests left this batch */ 111 int current_write_count; /* how many requests left this batch */
115 int write_batch_idled; /* has the write batch gone idle? */ 112 int write_batch_idled; /* has the write batch gone idle? */
@@ -554,7 +551,7 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
554 if (aic == NULL) 551 if (aic == NULL)
555 return; 552 return;
556 553
557 if (data_dir == REQ_SYNC) { 554 if (data_dir == BLK_RW_SYNC) {
558 unsigned long in_flight = atomic_read(&aic->nr_queued) 555 unsigned long in_flight = atomic_read(&aic->nr_queued)
559 + atomic_read(&aic->nr_dispatched); 556 + atomic_read(&aic->nr_dispatched);
560 spin_lock(&aic->lock); 557 spin_lock(&aic->lock);
@@ -811,7 +808,7 @@ static void as_update_rq(struct as_data *ad, struct request *rq)
811 */ 808 */
812static void update_write_batch(struct as_data *ad) 809static void update_write_batch(struct as_data *ad)
813{ 810{
814 unsigned long batch = ad->batch_expire[REQ_ASYNC]; 811 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
815 long write_time; 812 long write_time;
816 813
817 write_time = (jiffies - ad->current_batch_expires) + batch; 814 write_time = (jiffies - ad->current_batch_expires) + batch;
@@ -855,7 +852,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
855 kblockd_schedule_work(q, &ad->antic_work); 852 kblockd_schedule_work(q, &ad->antic_work);
856 ad->changed_batch = 0; 853 ad->changed_batch = 0;
857 854
858 if (ad->batch_data_dir == REQ_SYNC) 855 if (ad->batch_data_dir == BLK_RW_SYNC)
859 ad->new_batch = 1; 856 ad->new_batch = 1;
860 } 857 }
861 WARN_ON(ad->nr_dispatched == 0); 858 WARN_ON(ad->nr_dispatched == 0);
@@ -869,7 +866,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
869 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 866 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
870 update_write_batch(ad); 867 update_write_batch(ad);
871 ad->current_batch_expires = jiffies + 868 ad->current_batch_expires = jiffies +
872 ad->batch_expire[REQ_SYNC]; 869 ad->batch_expire[BLK_RW_SYNC];
873 ad->new_batch = 0; 870 ad->new_batch = 0;
874 } 871 }
875 872
@@ -960,7 +957,7 @@ static inline int as_batch_expired(struct as_data *ad)
960 if (ad->changed_batch || ad->new_batch) 957 if (ad->changed_batch || ad->new_batch)
961 return 0; 958 return 0;
962 959
963 if (ad->batch_data_dir == REQ_SYNC) 960 if (ad->batch_data_dir == BLK_RW_SYNC)
964 /* TODO! add a check so a complete fifo gets written? */ 961 /* TODO! add a check so a complete fifo gets written? */
965 return time_after(jiffies, ad->current_batch_expires); 962 return time_after(jiffies, ad->current_batch_expires);
966 963
@@ -986,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
986 */ 983 */
987 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; 984 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
988 985
989 if (data_dir == REQ_SYNC) { 986 if (data_dir == BLK_RW_SYNC) {
990 struct io_context *ioc = RQ_IOC(rq); 987 struct io_context *ioc = RQ_IOC(rq);
991 /* In case we have to anticipate after this */ 988 /* In case we have to anticipate after this */
992 copy_io_context(&ad->io_context, &ioc); 989 copy_io_context(&ad->io_context, &ioc);
@@ -1025,41 +1022,41 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
1025static int as_dispatch_request(struct request_queue *q, int force) 1022static int as_dispatch_request(struct request_queue *q, int force)
1026{ 1023{
1027 struct as_data *ad = q->elevator->elevator_data; 1024 struct as_data *ad = q->elevator->elevator_data;
1028 const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); 1025 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1029 const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); 1026 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1030 struct request *rq; 1027 struct request *rq;
1031 1028
1032 if (unlikely(force)) { 1029 if (unlikely(force)) {
1033 /* 1030 /*
1034 * Forced dispatch, accounting is useless. Reset 1031 * Forced dispatch, accounting is useless. Reset
1035 * accounting states and dump fifo_lists. Note that 1032 * accounting states and dump fifo_lists. Note that
1036 * batch_data_dir is reset to REQ_SYNC to avoid 1033 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1037 * screwing write batch accounting as write batch 1034 * screwing write batch accounting as write batch
1038 * accounting occurs on W->R transition. 1035 * accounting occurs on W->R transition.
1039 */ 1036 */
1040 int dispatched = 0; 1037 int dispatched = 0;
1041 1038
1042 ad->batch_data_dir = REQ_SYNC; 1039 ad->batch_data_dir = BLK_RW_SYNC;
1043 ad->changed_batch = 0; 1040 ad->changed_batch = 0;
1044 ad->new_batch = 0; 1041 ad->new_batch = 0;
1045 1042
1046 while (ad->next_rq[REQ_SYNC]) { 1043 while (ad->next_rq[BLK_RW_SYNC]) {
1047 as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); 1044 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1048 dispatched++; 1045 dispatched++;
1049 } 1046 }
1050 ad->last_check_fifo[REQ_SYNC] = jiffies; 1047 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1051 1048
1052 while (ad->next_rq[REQ_ASYNC]) { 1049 while (ad->next_rq[BLK_RW_ASYNC]) {
1053 as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); 1050 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1054 dispatched++; 1051 dispatched++;
1055 } 1052 }
1056 ad->last_check_fifo[REQ_ASYNC] = jiffies; 1053 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1057 1054
1058 return dispatched; 1055 return dispatched;
1059 } 1056 }
1060 1057
1061 /* Signal that the write batch was uncontended, so we can't time it */ 1058 /* Signal that the write batch was uncontended, so we can't time it */
1062 if (ad->batch_data_dir == REQ_ASYNC && !reads) { 1059 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1063 if (ad->current_write_count == 0 || !writes) 1060 if (ad->current_write_count == 0 || !writes)
1064 ad->write_batch_idled = 1; 1061 ad->write_batch_idled = 1;
1065 } 1062 }
@@ -1076,8 +1073,8 @@ static int as_dispatch_request(struct request_queue *q, int force)
1076 */ 1073 */
1077 rq = ad->next_rq[ad->batch_data_dir]; 1074 rq = ad->next_rq[ad->batch_data_dir];
1078 1075
1079 if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { 1076 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1080 if (as_fifo_expired(ad, REQ_SYNC)) 1077 if (as_fifo_expired(ad, BLK_RW_SYNC))
1081 goto fifo_expired; 1078 goto fifo_expired;
1082 1079
1083 if (as_can_anticipate(ad, rq)) { 1080 if (as_can_anticipate(ad, rq)) {
@@ -1090,7 +1087,7 @@ static int as_dispatch_request(struct request_queue *q, int force)
1090 /* we have a "next request" */ 1087 /* we have a "next request" */
1091 if (reads && !writes) 1088 if (reads && !writes)
1092 ad->current_batch_expires = 1089 ad->current_batch_expires =
1093 jiffies + ad->batch_expire[REQ_SYNC]; 1090 jiffies + ad->batch_expire[BLK_RW_SYNC];
1094 goto dispatch_request; 1091 goto dispatch_request;
1095 } 1092 }
1096 } 1093 }
@@ -1101,20 +1098,20 @@ static int as_dispatch_request(struct request_queue *q, int force)
1101 */ 1098 */
1102 1099
1103 if (reads) { 1100 if (reads) {
1104 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC])); 1101 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1105 1102
1106 if (writes && ad->batch_data_dir == REQ_SYNC) 1103 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1107 /* 1104 /*
1108 * Last batch was a read, switch to writes 1105 * Last batch was a read, switch to writes
1109 */ 1106 */
1110 goto dispatch_writes; 1107 goto dispatch_writes;
1111 1108
1112 if (ad->batch_data_dir == REQ_ASYNC) { 1109 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1113 WARN_ON(ad->new_batch); 1110 WARN_ON(ad->new_batch);
1114 ad->changed_batch = 1; 1111 ad->changed_batch = 1;
1115 } 1112 }
1116 ad->batch_data_dir = REQ_SYNC; 1113 ad->batch_data_dir = BLK_RW_SYNC;
1117 rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); 1114 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1118 ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1115 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1119 goto dispatch_request; 1116 goto dispatch_request;
1120 } 1117 }
@@ -1125,9 +1122,9 @@ static int as_dispatch_request(struct request_queue *q, int force)
1125 1122
1126 if (writes) { 1123 if (writes) {
1127dispatch_writes: 1124dispatch_writes:
1128 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC])); 1125 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1129 1126
1130 if (ad->batch_data_dir == REQ_SYNC) { 1127 if (ad->batch_data_dir == BLK_RW_SYNC) {
1131 ad->changed_batch = 1; 1128 ad->changed_batch = 1;
1132 1129
1133 /* 1130 /*
@@ -1137,11 +1134,11 @@ dispatch_writes:
1137 */ 1134 */
1138 ad->new_batch = 0; 1135 ad->new_batch = 0;
1139 } 1136 }
1140 ad->batch_data_dir = REQ_ASYNC; 1137 ad->batch_data_dir = BLK_RW_ASYNC;
1141 ad->current_write_count = ad->write_batch_count; 1138 ad->current_write_count = ad->write_batch_count;
1142 ad->write_batch_idled = 0; 1139 ad->write_batch_idled = 0;
1143 rq = rq_entry_fifo(ad->fifo_list[REQ_ASYNC].next); 1140 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1144 ad->last_check_fifo[REQ_ASYNC] = jiffies; 1141 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1145 goto dispatch_request; 1142 goto dispatch_request;
1146 } 1143 }
1147 1144
@@ -1164,9 +1161,9 @@ fifo_expired:
1164 if (ad->nr_dispatched) 1161 if (ad->nr_dispatched)
1165 return 0; 1162 return 0;
1166 1163
1167 if (ad->batch_data_dir == REQ_ASYNC) 1164 if (ad->batch_data_dir == BLK_RW_ASYNC)
1168 ad->current_batch_expires = jiffies + 1165 ad->current_batch_expires = jiffies +
1169 ad->batch_expire[REQ_ASYNC]; 1166 ad->batch_expire[BLK_RW_ASYNC];
1170 else 1167 else
1171 ad->new_batch = 1; 1168 ad->new_batch = 1;
1172 1169
@@ -1238,8 +1235,8 @@ static int as_queue_empty(struct request_queue *q)
1238{ 1235{
1239 struct as_data *ad = q->elevator->elevator_data; 1236 struct as_data *ad = q->elevator->elevator_data;
1240 1237
1241 return list_empty(&ad->fifo_list[REQ_ASYNC]) 1238 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1242 && list_empty(&ad->fifo_list[REQ_SYNC]); 1239 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1243} 1240}
1244 1241
1245static int 1242static int
@@ -1346,8 +1343,8 @@ static void as_exit_queue(struct elevator_queue *e)
1346 del_timer_sync(&ad->antic_timer); 1343 del_timer_sync(&ad->antic_timer);
1347 cancel_work_sync(&ad->antic_work); 1344 cancel_work_sync(&ad->antic_work);
1348 1345
1349 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); 1346 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1350 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); 1347 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1351 1348
1352 put_io_context(ad->io_context); 1349 put_io_context(ad->io_context);
1353 kfree(ad); 1350 kfree(ad);
@@ -1372,18 +1369,18 @@ static void *as_init_queue(struct request_queue *q)
1372 init_timer(&ad->antic_timer); 1369 init_timer(&ad->antic_timer);
1373 INIT_WORK(&ad->antic_work, as_work_handler); 1370 INIT_WORK(&ad->antic_work, as_work_handler);
1374 1371
1375 INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); 1372 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1376 INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); 1373 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1377 ad->sort_list[REQ_SYNC] = RB_ROOT; 1374 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1378 ad->sort_list[REQ_ASYNC] = RB_ROOT; 1375 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1379 ad->fifo_expire[REQ_SYNC] = default_read_expire; 1376 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1380 ad->fifo_expire[REQ_ASYNC] = default_write_expire; 1377 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1381 ad->antic_expire = default_antic_expire; 1378 ad->antic_expire = default_antic_expire;
1382 ad->batch_expire[REQ_SYNC] = default_read_batch_expire; 1379 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1383 ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; 1380 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1384 1381
1385 ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; 1382 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1386 ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10; 1383 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1387 if (ad->write_batch_count < 2) 1384 if (ad->write_batch_count < 2)
1388 ad->write_batch_count = 2; 1385 ad->write_batch_count = 2;
1389 1386
@@ -1432,11 +1429,11 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1432 struct as_data *ad = e->elevator_data; \ 1429 struct as_data *ad = e->elevator_data; \
1433 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1430 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1434} 1431}
1435SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]); 1432SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1436SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]); 1433SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1437SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1434SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1438SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]); 1435SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1439SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); 1436SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1440#undef SHOW_FUNCTION 1437#undef SHOW_FUNCTION
1441 1438
1442#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ 1439#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
@@ -1451,13 +1448,14 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
1451 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1448 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1452 return ret; \ 1449 return ret; \
1453} 1450}
1454STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); 1451STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); 1452STORE_FUNCTION(as_write_expire_store,
1453 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1456STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1454STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1457STORE_FUNCTION(as_read_batch_expire_store, 1455STORE_FUNCTION(as_read_batch_expire_store,
1458 &ad->batch_expire[REQ_SYNC], 0, INT_MAX); 1456 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1459STORE_FUNCTION(as_write_batch_expire_store, 1457STORE_FUNCTION(as_write_batch_expire_store,
1460 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); 1458 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1461#undef STORE_FUNCTION 1459#undef STORE_FUNCTION
1462 1460
1463#define AS_ATTR(name) \ 1461#define AS_ATTR(name) \
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f7dae57e6cab..20b4111fa050 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -319,9 +319,6 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
319 return -ENXIO; 319 return -ENXIO;
320 320
321 bio = bio_alloc(GFP_KERNEL, 0); 321 bio = bio_alloc(GFP_KERNEL, 0);
322 if (!bio)
323 return -ENOMEM;
324
325 bio->bi_end_io = bio_end_empty_barrier; 322 bio->bi_end_io = bio_end_empty_barrier;
326 bio->bi_private = &wait; 323 bio->bi_private = &wait;
327 bio->bi_bdev = bdev; 324 bio->bi_bdev = bdev;
diff --git a/block/blk-core.c b/block/blk-core.c
index 25572802dac2..07ab75403e1a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,12 +64,11 @@ static struct workqueue_struct *kblockd_workqueue;
64 64
65static void drive_stat_acct(struct request *rq, int new_io) 65static void drive_stat_acct(struct request *rq, int new_io)
66{ 66{
67 struct gendisk *disk = rq->rq_disk;
68 struct hd_struct *part; 67 struct hd_struct *part;
69 int rw = rq_data_dir(rq); 68 int rw = rq_data_dir(rq);
70 int cpu; 69 int cpu;
71 70
72 if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue)) 71 if (!blk_fs_request(rq) || !blk_do_io_stat(rq))
73 return; 72 return;
74 73
75 cpu = part_stat_lock(); 74 cpu = part_stat_lock();
@@ -132,6 +131,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
132 INIT_HLIST_NODE(&rq->hash); 131 INIT_HLIST_NODE(&rq->hash);
133 RB_CLEAR_NODE(&rq->rb_node); 132 RB_CLEAR_NODE(&rq->rb_node);
134 rq->cmd = rq->__cmd; 133 rq->cmd = rq->__cmd;
134 rq->cmd_len = BLK_MAX_CDB;
135 rq->tag = -1; 135 rq->tag = -1;
136 rq->ref_count = 1; 136 rq->ref_count = 1;
137} 137}
@@ -1124,8 +1124,6 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1124 1124
1125 if (bio_sync(bio)) 1125 if (bio_sync(bio))
1126 req->cmd_flags |= REQ_RW_SYNC; 1126 req->cmd_flags |= REQ_RW_SYNC;
1127 if (bio_unplug(bio))
1128 req->cmd_flags |= REQ_UNPLUG;
1129 if (bio_rw_meta(bio)) 1127 if (bio_rw_meta(bio))
1130 req->cmd_flags |= REQ_RW_META; 1128 req->cmd_flags |= REQ_RW_META;
1131 if (bio_noidle(bio)) 1129 if (bio_noidle(bio))
@@ -1675,9 +1673,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request);
1675 1673
1676static void blk_account_io_completion(struct request *req, unsigned int bytes) 1674static void blk_account_io_completion(struct request *req, unsigned int bytes)
1677{ 1675{
1678 struct gendisk *disk = req->rq_disk; 1676 if (!blk_do_io_stat(req))
1679
1680 if (!disk || !blk_do_io_stat(disk->queue))
1681 return; 1677 return;
1682 1678
1683 if (blk_fs_request(req)) { 1679 if (blk_fs_request(req)) {
@@ -1694,9 +1690,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1694 1690
1695static void blk_account_io_done(struct request *req) 1691static void blk_account_io_done(struct request *req)
1696{ 1692{
1697 struct gendisk *disk = req->rq_disk; 1693 if (!blk_do_io_stat(req))
1698
1699 if (!disk || !blk_do_io_stat(disk->queue))
1700 return; 1694 return;
1701 1695
1702 /* 1696 /*
@@ -1711,7 +1705,7 @@ static void blk_account_io_done(struct request *req)
1711 int cpu; 1705 int cpu;
1712 1706
1713 cpu = part_stat_lock(); 1707 cpu = part_stat_lock();
1714 part = disk_map_sector_rcu(disk, req->sector); 1708 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1715 1709
1716 part_stat_inc(cpu, part, ios[rw]); 1710 part_stat_inc(cpu, part, ios[rw]);
1717 part_stat_add(cpu, part, ticks[rw], duration); 1711 part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e39cb24b7679..63760ca3da0f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -338,6 +338,22 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
338 return 1; 338 return 1;
339} 339}
340 340
341static void blk_account_io_merge(struct request *req)
342{
343 if (blk_do_io_stat(req)) {
344 struct hd_struct *part;
345 int cpu;
346
347 cpu = part_stat_lock();
348 part = disk_map_sector_rcu(req->rq_disk, req->sector);
349
350 part_round_stats(cpu, part);
351 part_dec_in_flight(part);
352
353 part_stat_unlock();
354 }
355}
356
341/* 357/*
342 * Has to be called with the request spinlock acquired 358 * Has to be called with the request spinlock acquired
343 */ 359 */
@@ -386,18 +402,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
386 402
387 elv_merge_requests(q, req, next); 403 elv_merge_requests(q, req, next);
388 404
389 if (req->rq_disk) { 405 blk_account_io_merge(req);
390 struct hd_struct *part;
391 int cpu;
392
393 cpu = part_stat_lock();
394 part = disk_map_sector_rcu(req->rq_disk, req->sector);
395
396 part_round_stats(cpu, part);
397 part_dec_in_flight(part);
398
399 part_stat_unlock();
400 }
401 406
402 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 407 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
403 if (blk_rq_cpu_valid(next)) 408 if (blk_rq_cpu_valid(next))
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 59fd05d9f1d5..69c42adde52b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -431,7 +431,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
431 * 431 *
432 * description: 432 * description:
433 * set required memory and length alignment for direct dma transactions. 433 * set required memory and length alignment for direct dma transactions.
434 * this is used when buiding direct io requests for the queue. 434 * this is used when building direct io requests for the queue.
435 * 435 *
436 **/ 436 **/
437void blk_queue_dma_alignment(struct request_queue *q, int mask) 437void blk_queue_dma_alignment(struct request_queue *q, int mask)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ff9bba3379a..cac4e9febe6a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -209,10 +209,14 @@ static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
209 ssize_t ret = queue_var_store(&stats, page, count); 209 ssize_t ret = queue_var_store(&stats, page, count);
210 210
211 spin_lock_irq(q->queue_lock); 211 spin_lock_irq(q->queue_lock);
212 elv_quiesce_start(q);
213
212 if (stats) 214 if (stats)
213 queue_flag_set(QUEUE_FLAG_IO_STAT, q); 215 queue_flag_set(QUEUE_FLAG_IO_STAT, q);
214 else 216 else
215 queue_flag_clear(QUEUE_FLAG_IO_STAT, q); 217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
218
219 elv_quiesce_end(q);
216 spin_unlock_irq(q->queue_lock); 220 spin_unlock_irq(q->queue_lock);
217 221
218 return ret; 222 return ret;
diff --git a/block/blk.h b/block/blk.h
index 3ee94358b43d..5dfc41267a08 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -70,6 +70,10 @@ void blk_queue_congestion_threshold(struct request_queue *q);
70 70
71int blk_dev_init(void); 71int blk_dev_init(void);
72 72
73void elv_quiesce_start(struct request_queue *q);
74void elv_quiesce_end(struct request_queue *q);
75
76
73/* 77/*
74 * Return the threshold (number of used requests) at which the queue is 78 * Return the threshold (number of used requests) at which the queue is
75 * considered to be congested. It include a little hysteresis to keep the 79 * considered to be congested. It include a little hysteresis to keep the
@@ -108,12 +112,14 @@ static inline int blk_cpu_to_group(int cpu)
108#endif 112#endif
109} 113}
110 114
111static inline int blk_do_io_stat(struct request_queue *q) 115static inline int blk_do_io_stat(struct request *rq)
112{ 116{
113 if (q) 117 struct gendisk *disk = rq->rq_disk;
114 return blk_queue_io_stat(q);
115 118
116 return 0; 119 if (!disk || !disk->queue)
120 return 0;
121
122 return blk_queue_io_stat(disk->queue) && (rq->cmd_flags & REQ_ELVPRIV);
117} 123}
118 124
119#endif 125#endif
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9e809345f71a..0d3b70de3d80 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -56,9 +56,6 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
56#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 56#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
58 58
59#define ASYNC (0)
60#define SYNC (1)
61
62#define sample_valid(samples) ((samples) > 80) 59#define sample_valid(samples) ((samples) > 80)
63 60
64/* 61/*
@@ -83,6 +80,14 @@ struct cfq_data {
83 * rr list of queues with requests and the count of them 80 * rr list of queues with requests and the count of them
84 */ 81 */
85 struct cfq_rb_root service_tree; 82 struct cfq_rb_root service_tree;
83
84 /*
85 * Each priority tree is sorted by next_request position. These
86 * trees are used when determining if two or more queues are
87 * interleaving requests (see cfq_close_cooperator).
88 */
89 struct rb_root prio_trees[CFQ_PRIO_LISTS];
90
86 unsigned int busy_queues; 91 unsigned int busy_queues;
87 /* 92 /*
88 * Used to track any pending rt requests so we can pre-empt current 93 * Used to track any pending rt requests so we can pre-empt current
@@ -147,6 +152,8 @@ struct cfq_queue {
147 struct rb_node rb_node; 152 struct rb_node rb_node;
148 /* service_tree key */ 153 /* service_tree key */
149 unsigned long rb_key; 154 unsigned long rb_key;
155 /* prio tree member */
156 struct rb_node p_node;
150 /* sorted list of pending requests */ 157 /* sorted list of pending requests */
151 struct rb_root sort_list; 158 struct rb_root sort_list;
152 /* if fifo isn't expired, next request to serve */ 159 /* if fifo isn't expired, next request to serve */
@@ -160,6 +167,7 @@ struct cfq_queue {
160 167
161 unsigned long slice_end; 168 unsigned long slice_end;
162 long slice_resid; 169 long slice_resid;
170 unsigned int slice_dispatch;
163 171
164 /* pending metadata requests */ 172 /* pending metadata requests */
165 int meta_pending; 173 int meta_pending;
@@ -176,15 +184,15 @@ struct cfq_queue {
176enum cfqq_state_flags { 184enum cfqq_state_flags {
177 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 185 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
178 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 186 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
187 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
179 CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ 188 CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
180 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 189 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
181 CFQ_CFQQ_FLAG_must_dispatch, /* must dispatch, even if expired */
182 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 190 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
183 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 191 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
184 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 192 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
185 CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */
186 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 193 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
187 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 194 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
195 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
188}; 196};
189 197
190#define CFQ_CFQQ_FNS(name) \ 198#define CFQ_CFQQ_FNS(name) \
@@ -203,15 +211,15 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
203 211
204CFQ_CFQQ_FNS(on_rr); 212CFQ_CFQQ_FNS(on_rr);
205CFQ_CFQQ_FNS(wait_request); 213CFQ_CFQQ_FNS(wait_request);
214CFQ_CFQQ_FNS(must_dispatch);
206CFQ_CFQQ_FNS(must_alloc); 215CFQ_CFQQ_FNS(must_alloc);
207CFQ_CFQQ_FNS(must_alloc_slice); 216CFQ_CFQQ_FNS(must_alloc_slice);
208CFQ_CFQQ_FNS(must_dispatch);
209CFQ_CFQQ_FNS(fifo_expire); 217CFQ_CFQQ_FNS(fifo_expire);
210CFQ_CFQQ_FNS(idle_window); 218CFQ_CFQQ_FNS(idle_window);
211CFQ_CFQQ_FNS(prio_changed); 219CFQ_CFQQ_FNS(prio_changed);
212CFQ_CFQQ_FNS(queue_new);
213CFQ_CFQQ_FNS(slice_new); 220CFQ_CFQQ_FNS(slice_new);
214CFQ_CFQQ_FNS(sync); 221CFQ_CFQQ_FNS(sync);
222CFQ_CFQQ_FNS(coop);
215#undef CFQ_CFQQ_FNS 223#undef CFQ_CFQQ_FNS
216 224
217#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 225#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
@@ -420,13 +428,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
420 return NULL; 428 return NULL;
421} 429}
422 430
431static void rb_erase_init(struct rb_node *n, struct rb_root *root)
432{
433 rb_erase(n, root);
434 RB_CLEAR_NODE(n);
435}
436
423static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 437static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
424{ 438{
425 if (root->left == n) 439 if (root->left == n)
426 root->left = NULL; 440 root->left = NULL;
427 441 rb_erase_init(n, &root->rb);
428 rb_erase(n, &root->rb);
429 RB_CLEAR_NODE(n);
430} 442}
431 443
432/* 444/*
@@ -471,8 +483,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
471 * requests waiting to be processed. It is sorted in the order that 483 * requests waiting to be processed. It is sorted in the order that
472 * we will service the queues. 484 * we will service the queues.
473 */ 485 */
474static void cfq_service_tree_add(struct cfq_data *cfqd, 486static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
475 struct cfq_queue *cfqq, int add_front) 487 int add_front)
476{ 488{
477 struct rb_node **p, *parent; 489 struct rb_node **p, *parent;
478 struct cfq_queue *__cfqq; 490 struct cfq_queue *__cfqq;
@@ -545,6 +557,63 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
545 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 557 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
546} 558}
547 559
560static struct cfq_queue *
561cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector,
562 struct rb_node **ret_parent, struct rb_node ***rb_link)
563{
564 struct rb_root *root = &cfqd->prio_trees[ioprio];
565 struct rb_node **p, *parent;
566 struct cfq_queue *cfqq = NULL;
567
568 parent = NULL;
569 p = &root->rb_node;
570 while (*p) {
571 struct rb_node **n;
572
573 parent = *p;
574 cfqq = rb_entry(parent, struct cfq_queue, p_node);
575
576 /*
577 * Sort strictly based on sector. Smallest to the left,
578 * largest to the right.
579 */
580 if (sector > cfqq->next_rq->sector)
581 n = &(*p)->rb_right;
582 else if (sector < cfqq->next_rq->sector)
583 n = &(*p)->rb_left;
584 else
585 break;
586 p = n;
587 }
588
589 *ret_parent = parent;
590 if (rb_link)
591 *rb_link = p;
592 return NULL;
593}
594
595static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
596{
597 struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio];
598 struct rb_node **p, *parent;
599 struct cfq_queue *__cfqq;
600
601 if (!RB_EMPTY_NODE(&cfqq->p_node))
602 rb_erase_init(&cfqq->p_node, root);
603
604 if (cfq_class_idle(cfqq))
605 return;
606 if (!cfqq->next_rq)
607 return;
608
609 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector,
610 &parent, &p);
611 BUG_ON(__cfqq);
612
613 rb_link_node(&cfqq->p_node, parent, p);
614 rb_insert_color(&cfqq->p_node, root);
615}
616
548/* 617/*
549 * Update cfqq's position in the service tree. 618 * Update cfqq's position in the service tree.
550 */ 619 */
@@ -553,8 +622,10 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
553 /* 622 /*
554 * Resorting requires the cfqq to be on the RR list already. 623 * Resorting requires the cfqq to be on the RR list already.
555 */ 624 */
556 if (cfq_cfqq_on_rr(cfqq)) 625 if (cfq_cfqq_on_rr(cfqq)) {
557 cfq_service_tree_add(cfqd, cfqq, 0); 626 cfq_service_tree_add(cfqd, cfqq, 0);
627 cfq_prio_tree_add(cfqd, cfqq);
628 }
558} 629}
559 630
560/* 631/*
@@ -585,6 +656,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
585 656
586 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 657 if (!RB_EMPTY_NODE(&cfqq->rb_node))
587 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 658 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
659 if (!RB_EMPTY_NODE(&cfqq->p_node))
660 rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]);
588 661
589 BUG_ON(!cfqd->busy_queues); 662 BUG_ON(!cfqd->busy_queues);
590 cfqd->busy_queues--; 663 cfqd->busy_queues--;
@@ -614,7 +687,7 @@ static void cfq_add_rq_rb(struct request *rq)
614{ 687{
615 struct cfq_queue *cfqq = RQ_CFQQ(rq); 688 struct cfq_queue *cfqq = RQ_CFQQ(rq);
616 struct cfq_data *cfqd = cfqq->cfqd; 689 struct cfq_data *cfqd = cfqq->cfqd;
617 struct request *__alias; 690 struct request *__alias, *prev;
618 691
619 cfqq->queued[rq_is_sync(rq)]++; 692 cfqq->queued[rq_is_sync(rq)]++;
620 693
@@ -631,7 +704,15 @@ static void cfq_add_rq_rb(struct request *rq)
631 /* 704 /*
632 * check if this request is a better next-serve candidate 705 * check if this request is a better next-serve candidate
633 */ 706 */
707 prev = cfqq->next_rq;
634 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 708 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
709
710 /*
711 * adjust priority tree position, if ->next_rq changes
712 */
713 if (prev != cfqq->next_rq)
714 cfq_prio_tree_add(cfqd, cfqq);
715
635 BUG_ON(!cfqq->next_rq); 716 BUG_ON(!cfqq->next_rq);
636} 717}
637 718
@@ -774,10 +855,15 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
774 if (cfqq) { 855 if (cfqq) {
775 cfq_log_cfqq(cfqd, cfqq, "set_active"); 856 cfq_log_cfqq(cfqd, cfqq, "set_active");
776 cfqq->slice_end = 0; 857 cfqq->slice_end = 0;
858 cfqq->slice_dispatch = 0;
859
860 cfq_clear_cfqq_wait_request(cfqq);
861 cfq_clear_cfqq_must_dispatch(cfqq);
777 cfq_clear_cfqq_must_alloc_slice(cfqq); 862 cfq_clear_cfqq_must_alloc_slice(cfqq);
778 cfq_clear_cfqq_fifo_expire(cfqq); 863 cfq_clear_cfqq_fifo_expire(cfqq);
779 cfq_mark_cfqq_slice_new(cfqq); 864 cfq_mark_cfqq_slice_new(cfqq);
780 cfq_clear_cfqq_queue_new(cfqq); 865
866 del_timer(&cfqd->idle_slice_timer);
781 } 867 }
782 868
783 cfqd->active_queue = cfqq; 869 cfqd->active_queue = cfqq;
@@ -795,7 +881,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
795 if (cfq_cfqq_wait_request(cfqq)) 881 if (cfq_cfqq_wait_request(cfqq))
796 del_timer(&cfqd->idle_slice_timer); 882 del_timer(&cfqd->idle_slice_timer);
797 883
798 cfq_clear_cfqq_must_dispatch(cfqq);
799 cfq_clear_cfqq_wait_request(cfqq); 884 cfq_clear_cfqq_wait_request(cfqq);
800 885
801 /* 886 /*
@@ -840,11 +925,15 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
840/* 925/*
841 * Get and set a new active queue for service. 926 * Get and set a new active queue for service.
842 */ 927 */
843static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 928static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
929 struct cfq_queue *cfqq)
844{ 930{
845 struct cfq_queue *cfqq; 931 if (!cfqq) {
932 cfqq = cfq_get_next_queue(cfqd);
933 if (cfqq)
934 cfq_clear_cfqq_coop(cfqq);
935 }
846 936
847 cfqq = cfq_get_next_queue(cfqd);
848 __cfq_set_active_queue(cfqd, cfqq); 937 __cfq_set_active_queue(cfqd, cfqq);
849 return cfqq; 938 return cfqq;
850} 939}
@@ -868,17 +957,89 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
868 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 957 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
869} 958}
870 959
871static int cfq_close_cooperator(struct cfq_data *cfq_data, 960static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
872 struct cfq_queue *cfqq) 961 struct cfq_queue *cur_cfqq)
873{ 962{
963 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio];
964 struct rb_node *parent, *node;
965 struct cfq_queue *__cfqq;
966 sector_t sector = cfqd->last_position;
967
968 if (RB_EMPTY_ROOT(root))
969 return NULL;
970
971 /*
972 * First, if we find a request starting at the end of the last
973 * request, choose it.
974 */
975 __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio,
976 sector, &parent, NULL);
977 if (__cfqq)
978 return __cfqq;
979
980 /*
981 * If the exact sector wasn't found, the parent of the NULL leaf
982 * will contain the closest sector.
983 */
984 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
985 if (cfq_rq_close(cfqd, __cfqq->next_rq))
986 return __cfqq;
987
988 if (__cfqq->next_rq->sector < sector)
989 node = rb_next(&__cfqq->p_node);
990 else
991 node = rb_prev(&__cfqq->p_node);
992 if (!node)
993 return NULL;
994
995 __cfqq = rb_entry(node, struct cfq_queue, p_node);
996 if (cfq_rq_close(cfqd, __cfqq->next_rq))
997 return __cfqq;
998
999 return NULL;
1000}
1001
1002/*
1003 * cfqd - obvious
1004 * cur_cfqq - passed in so that we don't decide that the current queue is
1005 * closely cooperating with itself.
1006 *
1007 * So, basically we're assuming that that cur_cfqq has dispatched at least
1008 * one request, and that cfqd->last_position reflects a position on the disk
1009 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
1010 * assumption.
1011 */
1012static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1013 struct cfq_queue *cur_cfqq,
1014 int probe)
1015{
1016 struct cfq_queue *cfqq;
1017
1018 /*
1019 * A valid cfq_io_context is necessary to compare requests against
1020 * the seek_mean of the current cfqq.
1021 */
1022 if (!cfqd->active_cic)
1023 return NULL;
1024
874 /* 1025 /*
875 * We should notice if some of the queues are cooperating, eg 1026 * We should notice if some of the queues are cooperating, eg
876 * working closely on the same area of the disk. In that case, 1027 * working closely on the same area of the disk. In that case,
877 * we can group them together and don't waste time idling. 1028 * we can group them together and don't waste time idling.
878 */ 1029 */
879 return 0; 1030 cfqq = cfqq_close(cfqd, cur_cfqq);
1031 if (!cfqq)
1032 return NULL;
1033
1034 if (cfq_cfqq_coop(cfqq))
1035 return NULL;
1036
1037 if (!probe)
1038 cfq_mark_cfqq_coop(cfqq);
1039 return cfqq;
880} 1040}
881 1041
1042
882#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 1043#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
883 1044
884static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1045static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -917,14 +1078,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
917 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1078 if (!cic || !atomic_read(&cic->ioc->nr_tasks))
918 return; 1079 return;
919 1080
920 /*
921 * See if this prio level has a good candidate
922 */
923 if (cfq_close_cooperator(cfqd, cfqq) &&
924 (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
925 return;
926
927 cfq_mark_cfqq_must_dispatch(cfqq);
928 cfq_mark_cfqq_wait_request(cfqq); 1081 cfq_mark_cfqq_wait_request(cfqq);
929 1082
930 /* 1083 /*
@@ -937,7 +1090,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
937 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1090 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
938 1091
939 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1092 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
940 cfq_log(cfqd, "arm_idle: %lu", sl); 1093 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
941} 1094}
942 1095
943/* 1096/*
@@ -1001,7 +1154,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1001 */ 1154 */
1002static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1155static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1003{ 1156{
1004 struct cfq_queue *cfqq; 1157 struct cfq_queue *cfqq, *new_cfqq = NULL;
1005 1158
1006 cfqq = cfqd->active_queue; 1159 cfqq = cfqd->active_queue;
1007 if (!cfqq) 1160 if (!cfqq)
@@ -1010,7 +1163,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1010 /* 1163 /*
1011 * The active queue has run out of time, expire it and select new. 1164 * The active queue has run out of time, expire it and select new.
1012 */ 1165 */
1013 if (cfq_slice_used(cfqq)) 1166 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
1014 goto expire; 1167 goto expire;
1015 1168
1016 /* 1169 /*
@@ -1035,6 +1188,16 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1035 goto keep_queue; 1188 goto keep_queue;
1036 1189
1037 /* 1190 /*
1191 * If another queue has a request waiting within our mean seek
1192 * distance, let it run. The expire code will check for close
1193 * cooperators and put the close queue at the front of the service
1194 * tree.
1195 */
1196 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
1197 if (new_cfqq)
1198 goto expire;
1199
1200 /*
1038 * No requests pending. If the active queue still has requests in 1201 * No requests pending. If the active queue still has requests in
1039 * flight or is idling for a new request, allow either of these 1202 * flight or is idling for a new request, allow either of these
1040 * conditions to happen (or time out) before selecting a new queue. 1203 * conditions to happen (or time out) before selecting a new queue.
@@ -1048,71 +1211,11 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1048expire: 1211expire:
1049 cfq_slice_expired(cfqd, 0); 1212 cfq_slice_expired(cfqd, 0);
1050new_queue: 1213new_queue:
1051 cfqq = cfq_set_active_queue(cfqd); 1214 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1052keep_queue: 1215keep_queue:
1053 return cfqq; 1216 return cfqq;
1054} 1217}
1055 1218
1056/*
1057 * Dispatch some requests from cfqq, moving them to the request queue
1058 * dispatch list.
1059 */
1060static int
1061__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1062 int max_dispatch)
1063{
1064 int dispatched = 0;
1065
1066 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1067
1068 do {
1069 struct request *rq;
1070
1071 /*
1072 * follow expired path, else get first next available
1073 */
1074 rq = cfq_check_fifo(cfqq);
1075 if (rq == NULL)
1076 rq = cfqq->next_rq;
1077
1078 /*
1079 * finally, insert request into driver dispatch list
1080 */
1081 cfq_dispatch_insert(cfqd->queue, rq);
1082
1083 dispatched++;
1084
1085 if (!cfqd->active_cic) {
1086 atomic_inc(&RQ_CIC(rq)->ioc->refcount);
1087 cfqd->active_cic = RQ_CIC(rq);
1088 }
1089
1090 if (RB_EMPTY_ROOT(&cfqq->sort_list))
1091 break;
1092
1093 /*
1094 * If there is a non-empty RT cfqq waiting for current
1095 * cfqq's timeslice to complete, pre-empt this cfqq
1096 */
1097 if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues)
1098 break;
1099
1100 } while (dispatched < max_dispatch);
1101
1102 /*
1103 * expire an async queue immediately if it has used up its slice. idle
1104 * queue always expire after 1 dispatch round.
1105 */
1106 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1107 dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1108 cfq_class_idle(cfqq))) {
1109 cfqq->slice_end = jiffies + 1;
1110 cfq_slice_expired(cfqd, 0);
1111 }
1112
1113 return dispatched;
1114}
1115
1116static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 1219static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1117{ 1220{
1118 int dispatched = 0; 1221 int dispatched = 0;
@@ -1146,11 +1249,45 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
1146 return dispatched; 1249 return dispatched;
1147} 1250}
1148 1251
1252/*
1253 * Dispatch a request from cfqq, moving them to the request queue
1254 * dispatch list.
1255 */
1256static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1257{
1258 struct request *rq;
1259
1260 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1261
1262 /*
1263 * follow expired path, else get first next available
1264 */
1265 rq = cfq_check_fifo(cfqq);
1266 if (!rq)
1267 rq = cfqq->next_rq;
1268
1269 /*
1270 * insert request into driver dispatch list
1271 */
1272 cfq_dispatch_insert(cfqd->queue, rq);
1273
1274 if (!cfqd->active_cic) {
1275 struct cfq_io_context *cic = RQ_CIC(rq);
1276
1277 atomic_inc(&cic->ioc->refcount);
1278 cfqd->active_cic = cic;
1279 }
1280}
1281
1282/*
1283 * Find the cfqq that we need to service and move a request from that to the
1284 * dispatch list
1285 */
1149static int cfq_dispatch_requests(struct request_queue *q, int force) 1286static int cfq_dispatch_requests(struct request_queue *q, int force)
1150{ 1287{
1151 struct cfq_data *cfqd = q->elevator->elevator_data; 1288 struct cfq_data *cfqd = q->elevator->elevator_data;
1152 struct cfq_queue *cfqq; 1289 struct cfq_queue *cfqq;
1153 int dispatched; 1290 unsigned int max_dispatch;
1154 1291
1155 if (!cfqd->busy_queues) 1292 if (!cfqd->busy_queues)
1156 return 0; 1293 return 0;
@@ -1158,29 +1295,63 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1158 if (unlikely(force)) 1295 if (unlikely(force))
1159 return cfq_forced_dispatch(cfqd); 1296 return cfq_forced_dispatch(cfqd);
1160 1297
1161 dispatched = 0; 1298 cfqq = cfq_select_queue(cfqd);
1162 while ((cfqq = cfq_select_queue(cfqd)) != NULL) { 1299 if (!cfqq)
1163 int max_dispatch; 1300 return 0;
1301
1302 /*
1303 * If this is an async queue and we have sync IO in flight, let it wait
1304 */
1305 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
1306 return 0;
1164 1307
1165 max_dispatch = cfqd->cfq_quantum; 1308 max_dispatch = cfqd->cfq_quantum;
1309 if (cfq_class_idle(cfqq))
1310 max_dispatch = 1;
1311
1312 /*
1313 * Does this cfqq already have too much IO in flight?
1314 */
1315 if (cfqq->dispatched >= max_dispatch) {
1316 /*
1317 * idle queue must always only have a single IO in flight
1318 */
1166 if (cfq_class_idle(cfqq)) 1319 if (cfq_class_idle(cfqq))
1167 max_dispatch = 1; 1320 return 0;
1168 1321
1169 if (cfqq->dispatched >= max_dispatch && cfqd->busy_queues > 1) 1322 /*
1170 break; 1323 * We have other queues, don't allow more IO from this one
1324 */
1325 if (cfqd->busy_queues > 1)
1326 return 0;
1171 1327
1172 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 1328 /*
1173 break; 1329 * we are the only queue, allow up to 4 times of 'quantum'
1330 */
1331 if (cfqq->dispatched >= 4 * max_dispatch)
1332 return 0;
1333 }
1174 1334
1175 cfq_clear_cfqq_must_dispatch(cfqq); 1335 /*
1176 cfq_clear_cfqq_wait_request(cfqq); 1336 * Dispatch a request from this cfqq
1177 del_timer(&cfqd->idle_slice_timer); 1337 */
1338 cfq_dispatch_request(cfqd, cfqq);
1339 cfqq->slice_dispatch++;
1340 cfq_clear_cfqq_must_dispatch(cfqq);
1178 1341
1179 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); 1342 /*
1343 * expire an async queue immediately if it has used up its slice. idle
1344 * queue always expire after 1 dispatch round.
1345 */
1346 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1347 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1348 cfq_class_idle(cfqq))) {
1349 cfqq->slice_end = jiffies + 1;
1350 cfq_slice_expired(cfqd, 0);
1180 } 1351 }
1181 1352
1182 cfq_log(cfqd, "dispatched=%d", dispatched); 1353 cfq_log(cfqd, "dispatched a request");
1183 return dispatched; 1354 return 1;
1184} 1355}
1185 1356
1186/* 1357/*
@@ -1323,14 +1494,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1323 if (ioc->ioc_data == cic) 1494 if (ioc->ioc_data == cic)
1324 rcu_assign_pointer(ioc->ioc_data, NULL); 1495 rcu_assign_pointer(ioc->ioc_data, NULL);
1325 1496
1326 if (cic->cfqq[ASYNC]) { 1497 if (cic->cfqq[BLK_RW_ASYNC]) {
1327 cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); 1498 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
1328 cic->cfqq[ASYNC] = NULL; 1499 cic->cfqq[BLK_RW_ASYNC] = NULL;
1329 } 1500 }
1330 1501
1331 if (cic->cfqq[SYNC]) { 1502 if (cic->cfqq[BLK_RW_SYNC]) {
1332 cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); 1503 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
1333 cic->cfqq[SYNC] = NULL; 1504 cic->cfqq[BLK_RW_SYNC] = NULL;
1334 } 1505 }
1335} 1506}
1336 1507
@@ -1439,17 +1610,18 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
1439 1610
1440 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1611 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1441 1612
1442 cfqq = cic->cfqq[ASYNC]; 1613 cfqq = cic->cfqq[BLK_RW_ASYNC];
1443 if (cfqq) { 1614 if (cfqq) {
1444 struct cfq_queue *new_cfqq; 1615 struct cfq_queue *new_cfqq;
1445 new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); 1616 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
1617 GFP_ATOMIC);
1446 if (new_cfqq) { 1618 if (new_cfqq) {
1447 cic->cfqq[ASYNC] = new_cfqq; 1619 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
1448 cfq_put_queue(cfqq); 1620 cfq_put_queue(cfqq);
1449 } 1621 }
1450 } 1622 }
1451 1623
1452 cfqq = cic->cfqq[SYNC]; 1624 cfqq = cic->cfqq[BLK_RW_SYNC];
1453 if (cfqq) 1625 if (cfqq)
1454 cfq_mark_cfqq_prio_changed(cfqq); 1626 cfq_mark_cfqq_prio_changed(cfqq);
1455 1627
@@ -1500,13 +1672,13 @@ retry:
1500 } 1672 }
1501 1673
1502 RB_CLEAR_NODE(&cfqq->rb_node); 1674 RB_CLEAR_NODE(&cfqq->rb_node);
1675 RB_CLEAR_NODE(&cfqq->p_node);
1503 INIT_LIST_HEAD(&cfqq->fifo); 1676 INIT_LIST_HEAD(&cfqq->fifo);
1504 1677
1505 atomic_set(&cfqq->ref, 0); 1678 atomic_set(&cfqq->ref, 0);
1506 cfqq->cfqd = cfqd; 1679 cfqq->cfqd = cfqd;
1507 1680
1508 cfq_mark_cfqq_prio_changed(cfqq); 1681 cfq_mark_cfqq_prio_changed(cfqq);
1509 cfq_mark_cfqq_queue_new(cfqq);
1510 1682
1511 cfq_init_prio_data(cfqq, ioc); 1683 cfq_init_prio_data(cfqq, ioc);
1512 1684
@@ -1893,14 +2065,22 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1893 2065
1894 if (cfqq == cfqd->active_queue) { 2066 if (cfqq == cfqd->active_queue) {
1895 /* 2067 /*
1896 * if we are waiting for a request for this queue, let it rip 2068 * Remember that we saw a request from this process, but
1897 * immediately and flag that we must not expire this queue 2069 * don't start queuing just yet. Otherwise we risk seeing lots
1898 * just now 2070 * of tiny requests, because we disrupt the normal plugging
2071 * and merging. If the request is already larger than a single
2072 * page, let it rip immediately. For that case we assume that
2073 * merging is already done. Ditto for a busy system that
2074 * has other work pending, don't risk delaying until the
2075 * idle timer unplug to continue working.
1899 */ 2076 */
1900 if (cfq_cfqq_wait_request(cfqq)) { 2077 if (cfq_cfqq_wait_request(cfqq)) {
2078 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2079 cfqd->busy_queues > 1) {
2080 del_timer(&cfqd->idle_slice_timer);
2081 blk_start_queueing(cfqd->queue);
2082 }
1901 cfq_mark_cfqq_must_dispatch(cfqq); 2083 cfq_mark_cfqq_must_dispatch(cfqq);
1902 del_timer(&cfqd->idle_slice_timer);
1903 blk_start_queueing(cfqd->queue);
1904 } 2084 }
1905 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 2085 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1906 /* 2086 /*
@@ -1910,7 +2090,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1910 * this new queue is RT and the current one is BE 2090 * this new queue is RT and the current one is BE
1911 */ 2091 */
1912 cfq_preempt_queue(cfqd, cfqq); 2092 cfq_preempt_queue(cfqd, cfqq);
1913 cfq_mark_cfqq_must_dispatch(cfqq);
1914 blk_start_queueing(cfqd->queue); 2093 blk_start_queueing(cfqd->queue);
1915 } 2094 }
1916} 2095}
@@ -1986,16 +2165,24 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
1986 * or if we want to idle in case it has no pending requests. 2165 * or if we want to idle in case it has no pending requests.
1987 */ 2166 */
1988 if (cfqd->active_queue == cfqq) { 2167 if (cfqd->active_queue == cfqq) {
2168 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
2169
1989 if (cfq_cfqq_slice_new(cfqq)) { 2170 if (cfq_cfqq_slice_new(cfqq)) {
1990 cfq_set_prio_slice(cfqd, cfqq); 2171 cfq_set_prio_slice(cfqd, cfqq);
1991 cfq_clear_cfqq_slice_new(cfqq); 2172 cfq_clear_cfqq_slice_new(cfqq);
1992 } 2173 }
2174 /*
2175 * If there are no requests waiting in this queue, and
2176 * there are other queues ready to issue requests, AND
2177 * those other queues are issuing requests within our
2178 * mean seek distance, give them a chance to run instead
2179 * of idling.
2180 */
1993 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2181 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
1994 cfq_slice_expired(cfqd, 1); 2182 cfq_slice_expired(cfqd, 1);
1995 else if (sync && !rq_noidle(rq) && 2183 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
1996 RB_EMPTY_ROOT(&cfqq->sort_list)) { 2184 sync && !rq_noidle(rq))
1997 cfq_arm_slice_timer(cfqd); 2185 cfq_arm_slice_timer(cfqd);
1998 }
1999 } 2186 }
2000 2187
2001 if (!cfqd->rq_in_driver) 2188 if (!cfqd->rq_in_driver)
@@ -2056,7 +2243,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
2056 if (!cic) 2243 if (!cic)
2057 return ELV_MQUEUE_MAY; 2244 return ELV_MQUEUE_MAY;
2058 2245
2059 cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); 2246 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
2060 if (cfqq) { 2247 if (cfqq) {
2061 cfq_init_prio_data(cfqq, cic->ioc); 2248 cfq_init_prio_data(cfqq, cic->ioc);
2062 cfq_prio_boost(cfqq); 2249 cfq_prio_boost(cfqq);
@@ -2146,11 +2333,10 @@ static void cfq_kick_queue(struct work_struct *work)
2146 struct cfq_data *cfqd = 2333 struct cfq_data *cfqd =
2147 container_of(work, struct cfq_data, unplug_work); 2334 container_of(work, struct cfq_data, unplug_work);
2148 struct request_queue *q = cfqd->queue; 2335 struct request_queue *q = cfqd->queue;
2149 unsigned long flags;
2150 2336
2151 spin_lock_irqsave(q->queue_lock, flags); 2337 spin_lock_irq(q->queue_lock);
2152 blk_start_queueing(q); 2338 blk_start_queueing(q);
2153 spin_unlock_irqrestore(q->queue_lock, flags); 2339 spin_unlock_irq(q->queue_lock);
2154} 2340}
2155 2341
2156/* 2342/*
@@ -2172,6 +2358,12 @@ static void cfq_idle_slice_timer(unsigned long data)
2172 timed_out = 0; 2358 timed_out = 0;
2173 2359
2174 /* 2360 /*
2361 * We saw a request before the queue expired, let it through
2362 */
2363 if (cfq_cfqq_must_dispatch(cfqq))
2364 goto out_kick;
2365
2366 /*
2175 * expired 2367 * expired
2176 */ 2368 */
2177 if (cfq_slice_used(cfqq)) 2369 if (cfq_slice_used(cfqq))
@@ -2187,10 +2379,8 @@ static void cfq_idle_slice_timer(unsigned long data)
2187 /* 2379 /*
2188 * not expired and it has a request pending, let it dispatch 2380 * not expired and it has a request pending, let it dispatch
2189 */ 2381 */
2190 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) { 2382 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2191 cfq_mark_cfqq_must_dispatch(cfqq);
2192 goto out_kick; 2383 goto out_kick;
2193 }
2194 } 2384 }
2195expire: 2385expire:
2196 cfq_slice_expired(cfqd, timed_out); 2386 cfq_slice_expired(cfqd, timed_out);
diff --git a/block/elevator.c b/block/elevator.c
index ca6788a0195a..7073a9072577 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -573,7 +573,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
573 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 573 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
574} 574}
575 575
576static void elv_drain_elevator(struct request_queue *q) 576void elv_drain_elevator(struct request_queue *q)
577{ 577{
578 static int printed; 578 static int printed;
579 while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 579 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
@@ -587,6 +587,31 @@ static void elv_drain_elevator(struct request_queue *q)
587 } 587 }
588} 588}
589 589
590/*
591 * Call with queue lock held, interrupts disabled
592 */
593void elv_quiesce_start(struct request_queue *q)
594{
595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
596
597 /*
598 * make sure we don't have any requests in flight
599 */
600 elv_drain_elevator(q);
601 while (q->rq.elvpriv) {
602 blk_start_queueing(q);
603 spin_unlock_irq(q->queue_lock);
604 msleep(10);
605 spin_lock_irq(q->queue_lock);
606 elv_drain_elevator(q);
607 }
608}
609
610void elv_quiesce_end(struct request_queue *q)
611{
612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
613}
614
590void elv_insert(struct request_queue *q, struct request *rq, int where) 615void elv_insert(struct request_queue *q, struct request *rq, int where)
591{ 616{
592 struct list_head *pos; 617 struct list_head *pos;
@@ -1101,18 +1126,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1101 * Turn on BYPASS and drain all requests w/ elevator private data 1126 * Turn on BYPASS and drain all requests w/ elevator private data
1102 */ 1127 */
1103 spin_lock_irq(q->queue_lock); 1128 spin_lock_irq(q->queue_lock);
1104 1129 elv_quiesce_start(q);
1105 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
1106
1107 elv_drain_elevator(q);
1108
1109 while (q->rq.elvpriv) {
1110 blk_start_queueing(q);
1111 spin_unlock_irq(q->queue_lock);
1112 msleep(10);
1113 spin_lock_irq(q->queue_lock);
1114 elv_drain_elevator(q);
1115 }
1116 1130
1117 /* 1131 /*
1118 * Remember old elevator. 1132 * Remember old elevator.
@@ -1136,7 +1150,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1136 */ 1150 */
1137 elevator_exit(old_elevator); 1151 elevator_exit(old_elevator);
1138 spin_lock_irq(q->queue_lock); 1152 spin_lock_irq(q->queue_lock);
1139 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 1153 elv_quiesce_end(q);
1140 spin_unlock_irq(q->queue_lock); 1154 spin_unlock_irq(q->queue_lock);
1141 1155
1142 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
diff --git a/block/ioctl.c b/block/ioctl.c
index 0f22e629b13c..ad474d4bbcce 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -146,8 +146,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
146 struct bio *bio; 146 struct bio *bio;
147 147
148 bio = bio_alloc(GFP_KERNEL, 0); 148 bio = bio_alloc(GFP_KERNEL, 0);
149 if (!bio)
150 return -ENOMEM;
151 149
152 bio->bi_end_io = blk_ioc_discard_endio; 150 bio->bi_end_io = blk_ioc_discard_endio;
153 bio->bi_bdev = bdev; 151 bio->bi_bdev = bdev;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 626ee274c5c4..84b7f8709f41 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -217,7 +217,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
217static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 217static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
218 struct bio *bio) 218 struct bio *bio)
219{ 219{
220 int ret = 0; 220 int r, ret = 0;
221 221
222 /* 222 /*
223 * fill in all the output members 223 * fill in all the output members
@@ -242,7 +242,9 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
242 ret = -EFAULT; 242 ret = -EFAULT;
243 } 243 }
244 244
245 blk_rq_unmap_user(bio); 245 r = blk_rq_unmap_user(bio);
246 if (!ret)
247 ret = r;
246 blk_put_request(rq); 248 blk_put_request(rq);
247 249
248 return ret; 250 return ret;