author     Steven Whitehouse <swhiteho@redhat.com>    2006-10-02 08:45:08 -0400
committer  Steven Whitehouse <swhiteho@redhat.com>    2006-10-02 08:45:08 -0400
commit     59458f40e25915a355d8b1d701425fe9f4f9ea23 (patch)
tree       f1c9a2934df686e36d75f759ab7313b6f0e0e5f9 /block
parent     825f9075d74028d11d7f5932f04e1b5db3022b51 (diff)
parent     d834c16516d1ebec4766fc58c059bf01311e6045 (diff)
Merge branch 'master' into gfs2
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |  20
-rw-r--r--  block/Kconfig.iosched    |   3
-rw-r--r--  block/Makefile           |   2
-rw-r--r--  block/as-iosched.c       | 674
-rw-r--r--  block/blktrace.c         |  32
-rw-r--r--  block/cfq-iosched.c      | 867
-rw-r--r--  block/deadline-iosched.c | 464
-rw-r--r--  block/elevator.c         | 315
-rw-r--r--  block/genhd.c            |   9
-rw-r--r--  block/ll_rw_blk.c        | 239
-rw-r--r--  block/noop-iosched.c     |   2
-rw-r--r--  block/scsi_ioctl.c       |   6
12 files changed, 933 insertions(+), 1700 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index b6f5f0a79655..83766a6bdee2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -1,6 +1,24 @@ | |||
1 | # | 1 | # |
2 | # Block layer core configuration | 2 | # Block layer core configuration |
3 | # | 3 | # |
4 | config BLOCK | ||
5 | bool "Enable the block layer" if EMBEDDED | ||
6 | default y | ||
7 | help | ||
8 | This permits the block layer to be removed from the kernel if it's not | ||
9 | needed (on some embedded devices for example). If this option is | ||
10 | disabled, then blockdev files will become unusable and some | ||
11 | filesystems (such as ext3) will become unavailable. | ||
12 | |||
13 | This option will also disable SCSI character devices and USB storage | ||
14 | since they make use of various block layer definitions and | ||
15 | facilities. | ||
16 | |||
17 | Say Y here unless you know you really don't want to mount disks and | ||
18 | suchlike. | ||
19 | |||
20 | if BLOCK | ||
21 | |||
4 | #XXX - it makes sense to enable this only for 32-bit subarch's, not for x86_64 | 22 | #XXX - it makes sense to enable this only for 32-bit subarch's, not for x86_64 |
5 | #for instance. | 23 | #for instance. |
6 | config LBD | 24 | config LBD |
@@ -33,4 +51,6 @@ config LSF | |||
33 | 51 | ||
34 | If unsure, say Y. | 52 | If unsure, say Y. |
35 | 53 | ||
54 | endif | ||
55 | |||
36 | source block/Kconfig.iosched | 56 | source block/Kconfig.iosched |
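The new CONFIG_BLOCK option makes the whole block layer optional, and the if BLOCK ... endif bracket added above gives every option in between an implicit dependency on it. As an illustrative aside, here is a minimal userspace sketch (hypothetical function name, not from this commit) of the usual C-side pattern for such a switch: a real implementation under #ifdef CONFIG_BLOCK, and an inline stub otherwise, so callers need no conditionals of their own.

    #include <stdio.h>

    /* CONFIG_BLOCK would normally come from the kernel build (default y). */
    #define CONFIG_BLOCK 1

    #ifdef CONFIG_BLOCK
    static int blockdev_supported(void)          /* real code compiled in */
    {
            return 1;
    }
    #else
    static inline int blockdev_supported(void)   /* stub when BLOCK is off */
    {
            return 0;
    }
    #endif

    int main(void)
    {
            printf("block layer available: %s\n",
                   blockdev_supported() ? "yes" : "no");
            return 0;
    }

The Makefile hunk below applies the same switch at the object level: the core block objects are put on obj-y only when CONFIG_BLOCK is set, and drop out of the build entirely otherwise.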
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 48d090e266fc..903f0d3b6852 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -1,3 +1,4 @@ | |||
1 | if BLOCK | ||
1 | 2 | ||
2 | menu "IO Schedulers" | 3 | menu "IO Schedulers" |
3 | 4 | ||
@@ -67,3 +68,5 @@ config DEFAULT_IOSCHED | |||
67 | default "noop" if DEFAULT_NOOP | 68 | default "noop" if DEFAULT_NOOP |
68 | 69 | ||
69 | endmenu | 70 | endmenu |
71 | |||
72 | endif | ||
diff --git a/block/Makefile b/block/Makefile
index c05de0e0037f..4b84d0d5947b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,7 +2,7 @@ | |||
2 | # Makefile for the kernel block layer | 2 | # Makefile for the kernel block layer |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o | 5 | obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o |
6 | 6 | ||
7 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | 7 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
8 | obj-$(CONFIG_IOSCHED_AS) += as-iosched.o | 8 | obj-$(CONFIG_IOSCHED_AS) += as-iosched.o |
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 5da56d48fbd3..50b95e4c1425 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Anticipatory & deadline i/o scheduler. | 2 | * Anticipatory & deadline i/o scheduler. |
3 | * | 3 | * |
4 | * Copyright (C) 2002 Jens Axboe <axboe@suse.de> | 4 | * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> |
5 | * Nick Piggin <nickpiggin@yahoo.com.au> | 5 | * Nick Piggin <nickpiggin@yahoo.com.au> |
6 | * | 6 | * |
7 | */ | 7 | */ |
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/compiler.h> | 16 | #include <linux/compiler.h> |
17 | #include <linux/hash.h> | ||
18 | #include <linux/rbtree.h> | 17 | #include <linux/rbtree.h> |
19 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
20 | 19 | ||
@@ -93,9 +92,8 @@ struct as_data { | |||
93 | struct rb_root sort_list[2]; | 92 | struct rb_root sort_list[2]; |
94 | struct list_head fifo_list[2]; | 93 | struct list_head fifo_list[2]; |
95 | 94 | ||
96 | struct as_rq *next_arq[2]; /* next in sort order */ | 95 | struct request *next_rq[2]; /* next in sort order */ |
97 | sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ | 96 | sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ |
98 | struct hlist_head *hash; /* request hash */ | ||
99 | 97 | ||
100 | unsigned long exit_prob; /* probability a task will exit while | 98 | unsigned long exit_prob; /* probability a task will exit while |
101 | being waited on */ | 99 | being waited on */ |
@@ -115,7 +113,6 @@ struct as_data { | |||
115 | int write_batch_count; /* max # of reqs in a write batch */ | 113 | int write_batch_count; /* max # of reqs in a write batch */ |
116 | int current_write_count; /* how many requests left this batch */ | 114 | int current_write_count; /* how many requests left this batch */ |
117 | int write_batch_idled; /* has the write batch gone idle? */ | 115 | int write_batch_idled; /* has the write batch gone idle? */ |
118 | mempool_t *arq_pool; | ||
119 | 116 | ||
120 | enum anticipation_status antic_status; | 117 | enum anticipation_status antic_status; |
121 | unsigned long antic_start; /* jiffies: when it started */ | 118 | unsigned long antic_start; /* jiffies: when it started */ |
@@ -133,8 +130,6 @@ struct as_data { | |||
133 | unsigned long antic_expire; | 130 | unsigned long antic_expire; |
134 | }; | 131 | }; |
135 | 132 | ||
136 | #define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) | ||
137 | |||
138 | /* | 133 | /* |
139 | * per-request data. | 134 | * per-request data. |
140 | */ | 135 | */ |
@@ -150,40 +145,14 @@ enum arq_state { | |||
150 | AS_RQ_POSTSCHED, /* when they shouldn't be */ | 145 | AS_RQ_POSTSCHED, /* when they shouldn't be */ |
151 | }; | 146 | }; |
152 | 147 | ||
153 | struct as_rq { | 148 | #define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) |
154 | /* | 149 | #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) |
155 | * rbtree index, key is the starting offset | 150 | #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) |
156 | */ | ||
157 | struct rb_node rb_node; | ||
158 | sector_t rb_key; | ||
159 | |||
160 | struct request *request; | ||
161 | |||
162 | struct io_context *io_context; /* The submitting task */ | ||
163 | |||
164 | /* | ||
165 | * request hash, key is the ending offset (for back merge lookup) | ||
166 | */ | ||
167 | struct hlist_node hash; | ||
168 | |||
169 | /* | ||
170 | * expire fifo | ||
171 | */ | ||
172 | struct list_head fifo; | ||
173 | unsigned long expires; | ||
174 | 151 | ||
175 | unsigned int is_sync; | 152 | static DEFINE_PER_CPU(unsigned long, ioc_count); |
176 | enum arq_state state; | ||
177 | }; | ||
178 | |||
179 | #define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private) | ||
180 | |||
181 | static kmem_cache_t *arq_pool; | ||
182 | |||
183 | static atomic_t ioc_count = ATOMIC_INIT(0); | ||
184 | static struct completion *ioc_gone; | 153 | static struct completion *ioc_gone; |
185 | 154 | ||
186 | static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); | 155 | static void as_move_to_dispatch(struct as_data *ad, struct request *rq); |
187 | static void as_antic_stop(struct as_data *ad); | 156 | static void as_antic_stop(struct as_data *ad); |
188 | 157 | ||
189 | /* | 158 | /* |
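With struct as_rq gone, the scheduler keeps its two remaining per-request values, the submitting io_context and the arq_state, directly in the request's opaque elevator_private pointers, as the new RQ_IOC/RQ_STATE/RQ_SET_STATE macros above show. A standalone sketch of that pointer-packing trick follows; the struct is a stand-in, not the kernel's struct request, and the macros here route the cast through unsigned long, which plain C requires (the kernel's versions cast directly).

    #include <assert.h>
    #include <stdio.h>

    enum arq_state { AS_RQ_NEW, AS_RQ_QUEUED, AS_RQ_DISPATCHED };

    struct request {                    /* stand-in with two opaque slots */
            void *elevator_private;
            void *elevator_private2;
    };

    #define RQ_STATE(rq) \
            ((enum arq_state)(unsigned long)(rq)->elevator_private2)
    #define RQ_SET_STATE(rq, state) \
            ((rq)->elevator_private2 = (void *)(unsigned long)(state))

    int main(void)
    {
            struct request rq = { NULL, NULL };

            RQ_SET_STATE(&rq, AS_RQ_QUEUED);   /* no side allocation needed */
            assert(RQ_STATE(&rq) == AS_RQ_QUEUED);
            printf("state = %d\n", (int)RQ_STATE(&rq));
            return 0;
    }

Stashing the state in the request itself is what lets the patch delete the as_rq mempool and the set_req/put_req callbacks further down.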
@@ -194,7 +163,8 @@ static void as_antic_stop(struct as_data *ad); | |||
194 | static void free_as_io_context(struct as_io_context *aic) | 163 | static void free_as_io_context(struct as_io_context *aic) |
195 | { | 164 | { |
196 | kfree(aic); | 165 | kfree(aic); |
197 | if (atomic_dec_and_test(&ioc_count) && ioc_gone) | 166 | elv_ioc_count_dec(ioc_count); |
167 | if (ioc_gone && !elv_ioc_count_read(ioc_count)) | ||
198 | complete(ioc_gone); | 168 | complete(ioc_gone); |
199 | } | 169 | } |
200 | 170 | ||
@@ -230,7 +200,7 @@ static struct as_io_context *alloc_as_io_context(void) | |||
230 | ret->seek_total = 0; | 200 | ret->seek_total = 0; |
231 | ret->seek_samples = 0; | 201 | ret->seek_samples = 0; |
232 | ret->seek_mean = 0; | 202 | ret->seek_mean = 0; |
233 | atomic_inc(&ioc_count); | 203 | elv_ioc_count_inc(ioc_count); |
234 | } | 204 | } |
235 | 205 | ||
236 | return ret; | 206 | return ret; |
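The atomic ioc_count becomes a DEFINE_PER_CPU counter manipulated through elv_ioc_count_inc/dec/read. The trade-off, approximated below in plain C (an illustration of the idea, not the kernel's per-CPU machinery): writers touch only their own CPU's slot, while the rare reader pays for a sum over all slots. That is also why the free path above tests ioc_gone && !elv_ioc_count_read(ioc_count) instead of atomic_dec_and_test: decrement and read are no longer one atomic event.

    #include <stdio.h>

    #define NR_CPUS 4
    static unsigned long ioc_count[NR_CPUS];      /* one slot per CPU */

    static void ioc_count_inc(int cpu) { ioc_count[cpu]++; }  /* fast path */
    static void ioc_count_dec(int cpu) { ioc_count[cpu]--; }

    static unsigned long ioc_count_read(void)     /* slow path: sum slots */
    {
            unsigned long sum = 0;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    sum += ioc_count[cpu];
            return sum;
    }

    int main(void)
    {
            ioc_count_inc(0);
            ioc_count_inc(2);
            ioc_count_dec(0);
            printf("%lu\n", ioc_count_read());    /* prints 1 */
            return 0;
    }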
@@ -240,9 +210,9 @@ static struct as_io_context *alloc_as_io_context(void) | |||
240 | * If the current task has no AS IO context then create one and initialise it. | 210 | * If the current task has no AS IO context then create one and initialise it. |
241 | * Then take a ref on the task's io context and return it. | 211 | * Then take a ref on the task's io context and return it. |
242 | */ | 212 | */ |
243 | static struct io_context *as_get_io_context(void) | 213 | static struct io_context *as_get_io_context(int node) |
244 | { | 214 | { |
245 | struct io_context *ioc = get_io_context(GFP_ATOMIC); | 215 | struct io_context *ioc = get_io_context(GFP_ATOMIC, node); |
246 | if (ioc && !ioc->aic) { | 216 | if (ioc && !ioc->aic) { |
247 | ioc->aic = alloc_as_io_context(); | 217 | ioc->aic = alloc_as_io_context(); |
248 | if (!ioc->aic) { | 218 | if (!ioc->aic) { |
@@ -253,194 +223,43 @@ static struct io_context *as_get_io_context(void) | |||
253 | return ioc; | 223 | return ioc; |
254 | } | 224 | } |
255 | 225 | ||
256 | static void as_put_io_context(struct as_rq *arq) | 226 | static void as_put_io_context(struct request *rq) |
257 | { | 227 | { |
258 | struct as_io_context *aic; | 228 | struct as_io_context *aic; |
259 | 229 | ||
260 | if (unlikely(!arq->io_context)) | 230 | if (unlikely(!RQ_IOC(rq))) |
261 | return; | 231 | return; |
262 | 232 | ||
263 | aic = arq->io_context->aic; | 233 | aic = RQ_IOC(rq)->aic; |
264 | 234 | ||
265 | if (arq->is_sync == REQ_SYNC && aic) { | 235 | if (rq_is_sync(rq) && aic) { |
266 | spin_lock(&aic->lock); | 236 | spin_lock(&aic->lock); |
267 | set_bit(AS_TASK_IORUNNING, &aic->state); | 237 | set_bit(AS_TASK_IORUNNING, &aic->state); |
268 | aic->last_end_request = jiffies; | 238 | aic->last_end_request = jiffies; |
269 | spin_unlock(&aic->lock); | 239 | spin_unlock(&aic->lock); |
270 | } | 240 | } |
271 | 241 | ||
272 | put_io_context(arq->io_context); | 242 | put_io_context(RQ_IOC(rq)); |
273 | } | ||
274 | |||
275 | /* | ||
276 | * the back merge hash support functions | ||
277 | */ | ||
278 | static const int as_hash_shift = 6; | ||
279 | #define AS_HASH_BLOCK(sec) ((sec) >> 3) | ||
280 | #define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift)) | ||
281 | #define AS_HASH_ENTRIES (1 << as_hash_shift) | ||
282 | #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) | ||
283 | |||
284 | static inline void __as_del_arq_hash(struct as_rq *arq) | ||
285 | { | ||
286 | hlist_del_init(&arq->hash); | ||
287 | } | ||
288 | |||
289 | static inline void as_del_arq_hash(struct as_rq *arq) | ||
290 | { | ||
291 | if (!hlist_unhashed(&arq->hash)) | ||
292 | __as_del_arq_hash(arq); | ||
293 | } | ||
294 | |||
295 | static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq) | ||
296 | { | ||
297 | struct request *rq = arq->request; | ||
298 | |||
299 | BUG_ON(!hlist_unhashed(&arq->hash)); | ||
300 | |||
301 | hlist_add_head(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]); | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * move hot entry to front of chain | ||
306 | */ | ||
307 | static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq) | ||
308 | { | ||
309 | struct request *rq = arq->request; | ||
310 | struct hlist_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))]; | ||
311 | |||
312 | if (hlist_unhashed(&arq->hash)) { | ||
313 | WARN_ON(1); | ||
314 | return; | ||
315 | } | ||
316 | |||
317 | if (&arq->hash != head->first) { | ||
318 | hlist_del(&arq->hash); | ||
319 | hlist_add_head(&arq->hash, head); | ||
320 | } | ||
321 | } | ||
322 | |||
323 | static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset) | ||
324 | { | ||
325 | struct hlist_head *hash_list = &ad->hash[AS_HASH_FN(offset)]; | ||
326 | struct hlist_node *entry, *next; | ||
327 | struct as_rq *arq; | ||
328 | |||
329 | hlist_for_each_entry_safe(arq, entry, next, hash_list, hash) { | ||
330 | struct request *__rq = arq->request; | ||
331 | |||
332 | BUG_ON(hlist_unhashed(&arq->hash)); | ||
333 | |||
334 | if (!rq_mergeable(__rq)) { | ||
335 | as_del_arq_hash(arq); | ||
336 | continue; | ||
337 | } | ||
338 | |||
339 | if (rq_hash_key(__rq) == offset) | ||
340 | return __rq; | ||
341 | } | ||
342 | |||
343 | return NULL; | ||
344 | } | 243 | } |
345 | 244 | ||
346 | /* | 245 | /* |
347 | * rb tree support functions | 246 | * rb tree support functions |
348 | */ | 247 | */ |
349 | #define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node) | 248 | #define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) |
350 | #define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync]) | ||
351 | #define rq_rb_key(rq) (rq)->sector | ||
352 | |||
353 | /* | ||
354 | * as_find_first_arq finds the first (lowest sector numbered) request | ||
355 | * for the specified data_dir. Used to sweep back to the start of the disk | ||
356 | * (1-way elevator) after we process the last (highest sector) request. | ||
357 | */ | ||
358 | static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir) | ||
359 | { | ||
360 | struct rb_node *n = ad->sort_list[data_dir].rb_node; | ||
361 | |||
362 | if (n == NULL) | ||
363 | return NULL; | ||
364 | |||
365 | for (;;) { | ||
366 | if (n->rb_left == NULL) | ||
367 | return rb_entry_arq(n); | ||
368 | |||
369 | n = n->rb_left; | ||
370 | } | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Add the request to the rb tree if it is unique. If there is an alias (an | ||
375 | * existing request against the same sector), which can happen when using | ||
376 | * direct IO, then return the alias. | ||
377 | */ | ||
378 | static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq) | ||
379 | { | ||
380 | struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node; | ||
381 | struct rb_node *parent = NULL; | ||
382 | struct as_rq *__arq; | ||
383 | struct request *rq = arq->request; | ||
384 | |||
385 | arq->rb_key = rq_rb_key(rq); | ||
386 | |||
387 | while (*p) { | ||
388 | parent = *p; | ||
389 | __arq = rb_entry_arq(parent); | ||
390 | |||
391 | if (arq->rb_key < __arq->rb_key) | ||
392 | p = &(*p)->rb_left; | ||
393 | else if (arq->rb_key > __arq->rb_key) | ||
394 | p = &(*p)->rb_right; | ||
395 | else | ||
396 | return __arq; | ||
397 | } | ||
398 | |||
399 | rb_link_node(&arq->rb_node, parent, p); | ||
400 | rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); | ||
401 | |||
402 | return NULL; | ||
403 | } | ||
404 | 249 | ||
405 | static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq) | 250 | static void as_add_rq_rb(struct as_data *ad, struct request *rq) |
406 | { | 251 | { |
407 | struct as_rq *alias; | 252 | struct request *alias; |
408 | 253 | ||
409 | while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) { | 254 | while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { |
410 | as_move_to_dispatch(ad, alias); | 255 | as_move_to_dispatch(ad, alias); |
411 | as_antic_stop(ad); | 256 | as_antic_stop(ad); |
412 | } | 257 | } |
413 | } | 258 | } |
414 | 259 | ||
415 | static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) | 260 | static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) |
416 | { | ||
417 | if (!RB_EMPTY_NODE(&arq->rb_node)) { | ||
418 | WARN_ON(1); | ||
419 | return; | ||
420 | } | ||
421 | |||
422 | rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); | ||
423 | RB_CLEAR_NODE(&arq->rb_node); | ||
424 | } | ||
425 | |||
426 | static struct request * | ||
427 | as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir) | ||
428 | { | 261 | { |
429 | struct rb_node *n = ad->sort_list[data_dir].rb_node; | 262 | elv_rb_del(RQ_RB_ROOT(ad, rq), rq); |
430 | struct as_rq *arq; | ||
431 | |||
432 | while (n) { | ||
433 | arq = rb_entry_arq(n); | ||
434 | |||
435 | if (sector < arq->rb_key) | ||
436 | n = n->rb_left; | ||
437 | else if (sector > arq->rb_key) | ||
438 | n = n->rb_right; | ||
439 | else | ||
440 | return arq->request; | ||
441 | } | ||
442 | |||
443 | return NULL; | ||
444 | } | 263 | } |
445 | 264 | ||
446 | /* | 265 | /* |
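The large hunk above deletes both the scheduler's private back-merge hash (back-merge lookup now lives in the generic elevator code touched elsewhere in this merge) and the open-coded rbtree. One quirk of the old tree survives in the new as_add_rq_rb(): an insert that collides with an existing key returns the "alias" so the caller can dispatch it immediately. A compact sketch of that insert-or-return-alias contract, using a plain binary search tree instead of an rbtree for brevity:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    struct node {
            sector_t key;                  /* starting sector of the request */
            struct node *left, *right;
    };

    /* Insert n; return NULL on success, or the existing node ("alias")
     * when another entry already carries the same starting sector. */
    static struct node *tree_add(struct node **root, struct node *n)
    {
            struct node **p = root;

            while (*p) {
                    if (n->key < (*p)->key)
                            p = &(*p)->left;
                    else if (n->key > (*p)->key)
                            p = &(*p)->right;
                    else
                            return *p;     /* alias: caller dispatches it */
            }
            *p = n;
            return NULL;
    }

    int main(void)
    {
            struct node *root = NULL;
            struct node a = { 100, NULL, NULL }, b = { 100, NULL, NULL };

            tree_add(&root, &a);
            printf("alias found: %s\n",
                   tree_add(&root, &b) == &a ? "yes" : "no");
            return 0;
    }

As the deleted comment noted, such aliases can legitimately arise with direct I/O, where two requests may target the same sector.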
@@ -458,26 +277,26 @@ as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir) | |||
458 | * as_choose_req selects the preferred one of two requests of the same data_dir | 277 | * as_choose_req selects the preferred one of two requests of the same data_dir |
459 | * ignoring time - eg. timeouts, which is the job of as_dispatch_request | 278 | * ignoring time - eg. timeouts, which is the job of as_dispatch_request |
460 | */ | 279 | */ |
461 | static struct as_rq * | 280 | static struct request * |
462 | as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) | 281 | as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) |
463 | { | 282 | { |
464 | int data_dir; | 283 | int data_dir; |
465 | sector_t last, s1, s2, d1, d2; | 284 | sector_t last, s1, s2, d1, d2; |
466 | int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ | 285 | int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ |
467 | const sector_t maxback = MAXBACK; | 286 | const sector_t maxback = MAXBACK; |
468 | 287 | ||
469 | if (arq1 == NULL || arq1 == arq2) | 288 | if (rq1 == NULL || rq1 == rq2) |
470 | return arq2; | 289 | return rq2; |
471 | if (arq2 == NULL) | 290 | if (rq2 == NULL) |
472 | return arq1; | 291 | return rq1; |
473 | 292 | ||
474 | data_dir = arq1->is_sync; | 293 | data_dir = rq_is_sync(rq1); |
475 | 294 | ||
476 | last = ad->last_sector[data_dir]; | 295 | last = ad->last_sector[data_dir]; |
477 | s1 = arq1->request->sector; | 296 | s1 = rq1->sector; |
478 | s2 = arq2->request->sector; | 297 | s2 = rq2->sector; |
479 | 298 | ||
480 | BUG_ON(data_dir != arq2->is_sync); | 299 | BUG_ON(data_dir != rq_is_sync(rq2)); |
481 | 300 | ||
482 | /* | 301 | /* |
483 | * Strict one way elevator _except_ in the case where we allow | 302 | * Strict one way elevator _except_ in the case where we allow |
@@ -504,61 +323,58 @@ as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) | |||
504 | 323 | ||
505 | /* Found required data */ | 324 | /* Found required data */ |
506 | if (!r1_wrap && r2_wrap) | 325 | if (!r1_wrap && r2_wrap) |
507 | return arq1; | 326 | return rq1; |
508 | else if (!r2_wrap && r1_wrap) | 327 | else if (!r2_wrap && r1_wrap) |
509 | return arq2; | 328 | return rq2; |
510 | else if (r1_wrap && r2_wrap) { | 329 | else if (r1_wrap && r2_wrap) { |
511 | /* both behind the head */ | 330 | /* both behind the head */ |
512 | if (s1 <= s2) | 331 | if (s1 <= s2) |
513 | return arq1; | 332 | return rq1; |
514 | else | 333 | else |
515 | return arq2; | 334 | return rq2; |
516 | } | 335 | } |
517 | 336 | ||
518 | /* Both requests in front of the head */ | 337 | /* Both requests in front of the head */ |
519 | if (d1 < d2) | 338 | if (d1 < d2) |
520 | return arq1; | 339 | return rq1; |
521 | else if (d2 < d1) | 340 | else if (d2 < d1) |
522 | return arq2; | 341 | return rq2; |
523 | else { | 342 | else { |
524 | if (s1 >= s2) | 343 | if (s1 >= s2) |
525 | return arq1; | 344 | return rq1; |
526 | else | 345 | else |
527 | return arq2; | 346 | return rq2; |
528 | } | 347 | } |
529 | } | 348 | } |
530 | 349 | ||
531 | /* | 350 | /* |
532 | * as_find_next_arq finds the next request after @prev in elevator order. | 351 | * as_find_next_rq finds the next request after @prev in elevator order. |
533 | * this with as_choose_req form the basis for how the scheduler chooses | 352 | * this with as_choose_req form the basis for how the scheduler chooses |
534 | * what request to process next. Anticipation works on top of this. | 353 | * what request to process next. Anticipation works on top of this. |
535 | */ | 354 | */ |
536 | static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) | 355 | static struct request * |
356 | as_find_next_rq(struct as_data *ad, struct request *last) | ||
537 | { | 357 | { |
538 | const int data_dir = last->is_sync; | ||
539 | struct as_rq *ret; | ||
540 | struct rb_node *rbnext = rb_next(&last->rb_node); | 358 | struct rb_node *rbnext = rb_next(&last->rb_node); |
541 | struct rb_node *rbprev = rb_prev(&last->rb_node); | 359 | struct rb_node *rbprev = rb_prev(&last->rb_node); |
542 | struct as_rq *arq_next, *arq_prev; | 360 | struct request *next = NULL, *prev = NULL; |
543 | 361 | ||
544 | BUG_ON(!RB_EMPTY_NODE(&last->rb_node)); | 362 | BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
545 | 363 | ||
546 | if (rbprev) | 364 | if (rbprev) |
547 | arq_prev = rb_entry_arq(rbprev); | 365 | prev = rb_entry_rq(rbprev); |
548 | else | ||
549 | arq_prev = NULL; | ||
550 | 366 | ||
551 | if (rbnext) | 367 | if (rbnext) |
552 | arq_next = rb_entry_arq(rbnext); | 368 | next = rb_entry_rq(rbnext); |
553 | else { | 369 | else { |
554 | arq_next = as_find_first_arq(ad, data_dir); | 370 | const int data_dir = rq_is_sync(last); |
555 | if (arq_next == last) | ||
556 | arq_next = NULL; | ||
557 | } | ||
558 | 371 | ||
559 | ret = as_choose_req(ad, arq_next, arq_prev); | 372 | rbnext = rb_first(&ad->sort_list[data_dir]); |
373 | if (rbnext && rbnext != &last->rb_node) | ||
374 | next = rb_entry_rq(rbnext); | ||
375 | } | ||
560 | 376 | ||
561 | return ret; | 377 | return as_choose_req(ad, next, prev); |
562 | } | 378 | } |
563 | 379 | ||
564 | /* | 380 | /* |
@@ -712,8 +528,7 @@ static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, | |||
712 | static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, | 528 | static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, |
713 | struct request *rq) | 529 | struct request *rq) |
714 | { | 530 | { |
715 | struct as_rq *arq = RQ_DATA(rq); | 531 | int data_dir = rq_is_sync(rq); |
716 | int data_dir = arq->is_sync; | ||
717 | unsigned long thinktime = 0; | 532 | unsigned long thinktime = 0; |
718 | sector_t seek_dist; | 533 | sector_t seek_dist; |
719 | 534 | ||
@@ -752,11 +567,11 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, | |||
752 | * previous one issued. | 567 | * previous one issued. |
753 | */ | 568 | */ |
754 | static int as_close_req(struct as_data *ad, struct as_io_context *aic, | 569 | static int as_close_req(struct as_data *ad, struct as_io_context *aic, |
755 | struct as_rq *arq) | 570 | struct request *rq) |
756 | { | 571 | { |
757 | unsigned long delay; /* milliseconds */ | 572 | unsigned long delay; /* milliseconds */ |
758 | sector_t last = ad->last_sector[ad->batch_data_dir]; | 573 | sector_t last = ad->last_sector[ad->batch_data_dir]; |
759 | sector_t next = arq->request->sector; | 574 | sector_t next = rq->sector; |
760 | sector_t delta; /* acceptable close offset (in sectors) */ | 575 | sector_t delta; /* acceptable close offset (in sectors) */ |
761 | sector_t s; | 576 | sector_t s; |
762 | 577 | ||
@@ -813,7 +628,7 @@ static int as_close_req(struct as_data *ad, struct as_io_context *aic, | |||
813 | * | 628 | * |
814 | * If this task has queued some other IO, do not enter enticipation. | 629 | * If this task has queued some other IO, do not enter enticipation. |
815 | */ | 630 | */ |
816 | static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) | 631 | static int as_can_break_anticipation(struct as_data *ad, struct request *rq) |
817 | { | 632 | { |
818 | struct io_context *ioc; | 633 | struct io_context *ioc; |
819 | struct as_io_context *aic; | 634 | struct as_io_context *aic; |
@@ -821,7 +636,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) | |||
821 | ioc = ad->io_context; | 636 | ioc = ad->io_context; |
822 | BUG_ON(!ioc); | 637 | BUG_ON(!ioc); |
823 | 638 | ||
824 | if (arq && ioc == arq->io_context) { | 639 | if (rq && ioc == RQ_IOC(rq)) { |
825 | /* request from same process */ | 640 | /* request from same process */ |
826 | return 1; | 641 | return 1; |
827 | } | 642 | } |
@@ -848,7 +663,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) | |||
848 | return 1; | 663 | return 1; |
849 | } | 664 | } |
850 | 665 | ||
851 | if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) { | 666 | if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) { |
852 | /* | 667 | /* |
853 | * Found a close request that is not one of ours. | 668 | * Found a close request that is not one of ours. |
854 | * | 669 | * |
@@ -864,7 +679,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) | |||
864 | ad->exit_no_coop = (7*ad->exit_no_coop)/8; | 679 | ad->exit_no_coop = (7*ad->exit_no_coop)/8; |
865 | } | 680 | } |
866 | 681 | ||
867 | as_update_iohist(ad, aic, arq->request); | 682 | as_update_iohist(ad, aic, rq); |
868 | return 1; | 683 | return 1; |
869 | } | 684 | } |
870 | 685 | ||
@@ -891,10 +706,10 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) | |||
891 | } | 706 | } |
892 | 707 | ||
893 | /* | 708 | /* |
894 | * as_can_anticipate indicates whether we should either run arq | 709 | * as_can_anticipate indicates whether we should either run rq |
895 | * or keep anticipating a better request. | 710 | * or keep anticipating a better request. |
896 | */ | 711 | */ |
897 | static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) | 712 | static int as_can_anticipate(struct as_data *ad, struct request *rq) |
898 | { | 713 | { |
899 | if (!ad->io_context) | 714 | if (!ad->io_context) |
900 | /* | 715 | /* |
@@ -908,7 +723,7 @@ static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) | |||
908 | */ | 723 | */ |
909 | return 0; | 724 | return 0; |
910 | 725 | ||
911 | if (as_can_break_anticipation(ad, arq)) | 726 | if (as_can_break_anticipation(ad, rq)) |
912 | /* | 727 | /* |
913 | * This request is a good candidate. Don't keep anticipating, | 728 | * This request is a good candidate. Don't keep anticipating, |
914 | * run it. | 729 | * run it. |
@@ -926,16 +741,16 @@ static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) | |||
926 | } | 741 | } |
927 | 742 | ||
928 | /* | 743 | /* |
929 | * as_update_arq must be called whenever a request (arq) is added to | 744 | * as_update_rq must be called whenever a request (rq) is added to |
930 | * the sort_list. This function keeps caches up to date, and checks if the | 745 | * the sort_list. This function keeps caches up to date, and checks if the |
931 | * request might be one we are "anticipating" | 746 | * request might be one we are "anticipating" |
932 | */ | 747 | */ |
933 | static void as_update_arq(struct as_data *ad, struct as_rq *arq) | 748 | static void as_update_rq(struct as_data *ad, struct request *rq) |
934 | { | 749 | { |
935 | const int data_dir = arq->is_sync; | 750 | const int data_dir = rq_is_sync(rq); |
936 | 751 | ||
937 | /* keep the next_arq cache up to date */ | 752 | /* keep the next_rq cache up to date */ |
938 | ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]); | 753 | ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); |
939 | 754 | ||
940 | /* | 755 | /* |
941 | * have we been anticipating this request? | 756 | * have we been anticipating this request? |
@@ -944,7 +759,7 @@ static void as_update_arq(struct as_data *ad, struct as_rq *arq) | |||
944 | */ | 759 | */ |
945 | if (ad->antic_status == ANTIC_WAIT_REQ | 760 | if (ad->antic_status == ANTIC_WAIT_REQ |
946 | || ad->antic_status == ANTIC_WAIT_NEXT) { | 761 | || ad->antic_status == ANTIC_WAIT_NEXT) { |
947 | if (as_can_break_anticipation(ad, arq)) | 762 | if (as_can_break_anticipation(ad, rq)) |
948 | as_antic_stop(ad); | 763 | as_antic_stop(ad); |
949 | } | 764 | } |
950 | } | 765 | } |
@@ -984,12 +799,11 @@ static void update_write_batch(struct as_data *ad) | |||
984 | static void as_completed_request(request_queue_t *q, struct request *rq) | 799 | static void as_completed_request(request_queue_t *q, struct request *rq) |
985 | { | 800 | { |
986 | struct as_data *ad = q->elevator->elevator_data; | 801 | struct as_data *ad = q->elevator->elevator_data; |
987 | struct as_rq *arq = RQ_DATA(rq); | ||
988 | 802 | ||
989 | WARN_ON(!list_empty(&rq->queuelist)); | 803 | WARN_ON(!list_empty(&rq->queuelist)); |
990 | 804 | ||
991 | if (arq->state != AS_RQ_REMOVED) { | 805 | if (RQ_STATE(rq) != AS_RQ_REMOVED) { |
992 | printk("arq->state %d\n", arq->state); | 806 | printk("rq->state %d\n", RQ_STATE(rq)); |
993 | WARN_ON(1); | 807 | WARN_ON(1); |
994 | goto out; | 808 | goto out; |
995 | } | 809 | } |
@@ -1009,14 +823,14 @@ static void as_completed_request(request_queue_t *q, struct request *rq) | |||
1009 | * actually serviced. This should help devices with big TCQ windows | 823 | * actually serviced. This should help devices with big TCQ windows |
1010 | * and writeback caches | 824 | * and writeback caches |
1011 | */ | 825 | */ |
1012 | if (ad->new_batch && ad->batch_data_dir == arq->is_sync) { | 826 | if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { |
1013 | update_write_batch(ad); | 827 | update_write_batch(ad); |
1014 | ad->current_batch_expires = jiffies + | 828 | ad->current_batch_expires = jiffies + |
1015 | ad->batch_expire[REQ_SYNC]; | 829 | ad->batch_expire[REQ_SYNC]; |
1016 | ad->new_batch = 0; | 830 | ad->new_batch = 0; |
1017 | } | 831 | } |
1018 | 832 | ||
1019 | if (ad->io_context == arq->io_context && ad->io_context) { | 833 | if (ad->io_context == RQ_IOC(rq) && ad->io_context) { |
1020 | ad->antic_start = jiffies; | 834 | ad->antic_start = jiffies; |
1021 | ad->ioc_finished = 1; | 835 | ad->ioc_finished = 1; |
1022 | if (ad->antic_status == ANTIC_WAIT_REQ) { | 836 | if (ad->antic_status == ANTIC_WAIT_REQ) { |
@@ -1028,9 +842,9 @@ static void as_completed_request(request_queue_t *q, struct request *rq) | |||
1028 | } | 842 | } |
1029 | } | 843 | } |
1030 | 844 | ||
1031 | as_put_io_context(arq); | 845 | as_put_io_context(rq); |
1032 | out: | 846 | out: |
1033 | arq->state = AS_RQ_POSTSCHED; | 847 | RQ_SET_STATE(rq, AS_RQ_POSTSCHED); |
1034 | } | 848 | } |
1035 | 849 | ||
1036 | /* | 850 | /* |
@@ -1041,27 +855,27 @@ out: | |||
1041 | */ | 855 | */ |
1042 | static void as_remove_queued_request(request_queue_t *q, struct request *rq) | 856 | static void as_remove_queued_request(request_queue_t *q, struct request *rq) |
1043 | { | 857 | { |
1044 | struct as_rq *arq = RQ_DATA(rq); | 858 | const int data_dir = rq_is_sync(rq); |
1045 | const int data_dir = arq->is_sync; | ||
1046 | struct as_data *ad = q->elevator->elevator_data; | 859 | struct as_data *ad = q->elevator->elevator_data; |
860 | struct io_context *ioc; | ||
1047 | 861 | ||
1048 | WARN_ON(arq->state != AS_RQ_QUEUED); | 862 | WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); |
1049 | 863 | ||
1050 | if (arq->io_context && arq->io_context->aic) { | 864 | ioc = RQ_IOC(rq); |
1051 | BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); | 865 | if (ioc && ioc->aic) { |
1052 | atomic_dec(&arq->io_context->aic->nr_queued); | 866 | BUG_ON(!atomic_read(&ioc->aic->nr_queued)); |
867 | atomic_dec(&ioc->aic->nr_queued); | ||
1053 | } | 868 | } |
1054 | 869 | ||
1055 | /* | 870 | /* |
1056 | * Update the "next_arq" cache if we are about to remove its | 871 | * Update the "next_rq" cache if we are about to remove its |
1057 | * entry | 872 | * entry |
1058 | */ | 873 | */ |
1059 | if (ad->next_arq[data_dir] == arq) | 874 | if (ad->next_rq[data_dir] == rq) |
1060 | ad->next_arq[data_dir] = as_find_next_arq(ad, arq); | 875 | ad->next_rq[data_dir] = as_find_next_rq(ad, rq); |
1061 | 876 | ||
1062 | list_del_init(&arq->fifo); | 877 | rq_fifo_clear(rq); |
1063 | as_del_arq_hash(arq); | 878 | as_del_rq_rb(ad, rq); |
1064 | as_del_arq_rb(ad, arq); | ||
1065 | } | 879 | } |
1066 | 880 | ||
1067 | /* | 881 | /* |
@@ -1074,7 +888,7 @@ static void as_remove_queued_request(request_queue_t *q, struct request *rq) | |||
1074 | */ | 888 | */ |
1075 | static int as_fifo_expired(struct as_data *ad, int adir) | 889 | static int as_fifo_expired(struct as_data *ad, int adir) |
1076 | { | 890 | { |
1077 | struct as_rq *arq; | 891 | struct request *rq; |
1078 | long delta_jif; | 892 | long delta_jif; |
1079 | 893 | ||
1080 | delta_jif = jiffies - ad->last_check_fifo[adir]; | 894 | delta_jif = jiffies - ad->last_check_fifo[adir]; |
@@ -1088,9 +902,9 @@ static int as_fifo_expired(struct as_data *ad, int adir) | |||
1088 | if (list_empty(&ad->fifo_list[adir])) | 902 | if (list_empty(&ad->fifo_list[adir])) |
1089 | return 0; | 903 | return 0; |
1090 | 904 | ||
1091 | arq = list_entry_fifo(ad->fifo_list[adir].next); | 905 | rq = rq_entry_fifo(ad->fifo_list[adir].next); |
1092 | 906 | ||
1093 | return time_after(jiffies, arq->expires); | 907 | return time_after(jiffies, rq_fifo_time(rq)); |
1094 | } | 908 | } |
1095 | 909 | ||
1096 | /* | 910 | /* |
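The FIFO check above now reads the deadline with rq_fifo_time() instead of a field in struct as_rq, but the expiry test itself is unchanged: time_after(jiffies, deadline). A self-contained sketch of why that comparison is written with signed subtraction, so it stays correct when the jiffies counter wraps:

    #include <stdio.h>

    /* Wrap-safe "has a passed b?", modelled on the kernel's time_after(). */
    static int time_after(unsigned long a, unsigned long b)
    {
            return (long)(b - a) < 0;
    }

    int main(void)
    {
            unsigned long jiffies = 5UL;                /* wrapped past 0 */
            unsigned long expires = (unsigned long)-10; /* set before wrap */

            printf("expired: %d\n", time_after(jiffies, expires));       /* 1 */
            printf("not yet: %d\n", time_after(jiffies, jiffies + 100)); /* 0 */
            return 0;
    }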
@@ -1113,25 +927,25 @@ static inline int as_batch_expired(struct as_data *ad) | |||
1113 | /* | 927 | /* |
1114 | * move an entry to dispatch queue | 928 | * move an entry to dispatch queue |
1115 | */ | 929 | */ |
1116 | static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) | 930 | static void as_move_to_dispatch(struct as_data *ad, struct request *rq) |
1117 | { | 931 | { |
1118 | struct request *rq = arq->request; | 932 | const int data_dir = rq_is_sync(rq); |
1119 | const int data_dir = arq->is_sync; | ||
1120 | 933 | ||
1121 | BUG_ON(!RB_EMPTY_NODE(&arq->rb_node)); | 934 | BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); |
1122 | 935 | ||
1123 | as_antic_stop(ad); | 936 | as_antic_stop(ad); |
1124 | ad->antic_status = ANTIC_OFF; | 937 | ad->antic_status = ANTIC_OFF; |
1125 | 938 | ||
1126 | /* | 939 | /* |
1127 | * This has to be set in order to be correctly updated by | 940 | * This has to be set in order to be correctly updated by |
1128 | * as_find_next_arq | 941 | * as_find_next_rq |
1129 | */ | 942 | */ |
1130 | ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; | 943 | ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; |
1131 | 944 | ||
1132 | if (data_dir == REQ_SYNC) { | 945 | if (data_dir == REQ_SYNC) { |
946 | struct io_context *ioc = RQ_IOC(rq); | ||
1133 | /* In case we have to anticipate after this */ | 947 | /* In case we have to anticipate after this */ |
1134 | copy_io_context(&ad->io_context, &arq->io_context); | 948 | copy_io_context(&ad->io_context, &ioc); |
1135 | } else { | 949 | } else { |
1136 | if (ad->io_context) { | 950 | if (ad->io_context) { |
1137 | put_io_context(ad->io_context); | 951 | put_io_context(ad->io_context); |
@@ -1143,19 +957,19 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) | |||
1143 | } | 957 | } |
1144 | ad->ioc_finished = 0; | 958 | ad->ioc_finished = 0; |
1145 | 959 | ||
1146 | ad->next_arq[data_dir] = as_find_next_arq(ad, arq); | 960 | ad->next_rq[data_dir] = as_find_next_rq(ad, rq); |
1147 | 961 | ||
1148 | /* | 962 | /* |
1149 | * take it off the sort and fifo list, add to dispatch queue | 963 | * take it off the sort and fifo list, add to dispatch queue |
1150 | */ | 964 | */ |
1151 | as_remove_queued_request(ad->q, rq); | 965 | as_remove_queued_request(ad->q, rq); |
1152 | WARN_ON(arq->state != AS_RQ_QUEUED); | 966 | WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); |
1153 | 967 | ||
1154 | elv_dispatch_sort(ad->q, rq); | 968 | elv_dispatch_sort(ad->q, rq); |
1155 | 969 | ||
1156 | arq->state = AS_RQ_DISPATCHED; | 970 | RQ_SET_STATE(rq, AS_RQ_DISPATCHED); |
1157 | if (arq->io_context && arq->io_context->aic) | 971 | if (RQ_IOC(rq) && RQ_IOC(rq)->aic) |
1158 | atomic_inc(&arq->io_context->aic->nr_dispatched); | 972 | atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); |
1159 | ad->nr_dispatched++; | 973 | ad->nr_dispatched++; |
1160 | } | 974 | } |
1161 | 975 | ||
@@ -1167,9 +981,9 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) | |||
1167 | static int as_dispatch_request(request_queue_t *q, int force) | 981 | static int as_dispatch_request(request_queue_t *q, int force) |
1168 | { | 982 | { |
1169 | struct as_data *ad = q->elevator->elevator_data; | 983 | struct as_data *ad = q->elevator->elevator_data; |
1170 | struct as_rq *arq; | ||
1171 | const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); | 984 | const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); |
1172 | const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); | 985 | const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); |
986 | struct request *rq; | ||
1173 | 987 | ||
1174 | if (unlikely(force)) { | 988 | if (unlikely(force)) { |
1175 | /* | 989 | /* |
@@ -1185,14 +999,14 @@ static int as_dispatch_request(request_queue_t *q, int force) | |||
1185 | ad->changed_batch = 0; | 999 | ad->changed_batch = 0; |
1186 | ad->new_batch = 0; | 1000 | ad->new_batch = 0; |
1187 | 1001 | ||
1188 | while (ad->next_arq[REQ_SYNC]) { | 1002 | while (ad->next_rq[REQ_SYNC]) { |
1189 | as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]); | 1003 | as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); |
1190 | dispatched++; | 1004 | dispatched++; |
1191 | } | 1005 | } |
1192 | ad->last_check_fifo[REQ_SYNC] = jiffies; | 1006 | ad->last_check_fifo[REQ_SYNC] = jiffies; |
1193 | 1007 | ||
1194 | while (ad->next_arq[REQ_ASYNC]) { | 1008 | while (ad->next_rq[REQ_ASYNC]) { |
1195 | as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]); | 1009 | as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); |
1196 | dispatched++; | 1010 | dispatched++; |
1197 | } | 1011 | } |
1198 | ad->last_check_fifo[REQ_ASYNC] = jiffies; | 1012 | ad->last_check_fifo[REQ_ASYNC] = jiffies; |
@@ -1216,19 +1030,19 @@ static int as_dispatch_request(request_queue_t *q, int force) | |||
1216 | /* | 1030 | /* |
1217 | * batch is still running or no reads or no writes | 1031 | * batch is still running or no reads or no writes |
1218 | */ | 1032 | */ |
1219 | arq = ad->next_arq[ad->batch_data_dir]; | 1033 | rq = ad->next_rq[ad->batch_data_dir]; |
1220 | 1034 | ||
1221 | if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { | 1035 | if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { |
1222 | if (as_fifo_expired(ad, REQ_SYNC)) | 1036 | if (as_fifo_expired(ad, REQ_SYNC)) |
1223 | goto fifo_expired; | 1037 | goto fifo_expired; |
1224 | 1038 | ||
1225 | if (as_can_anticipate(ad, arq)) { | 1039 | if (as_can_anticipate(ad, rq)) { |
1226 | as_antic_waitreq(ad); | 1040 | as_antic_waitreq(ad); |
1227 | return 0; | 1041 | return 0; |
1228 | } | 1042 | } |
1229 | } | 1043 | } |
1230 | 1044 | ||
1231 | if (arq) { | 1045 | if (rq) { |
1232 | /* we have a "next request" */ | 1046 | /* we have a "next request" */ |
1233 | if (reads && !writes) | 1047 | if (reads && !writes) |
1234 | ad->current_batch_expires = | 1048 | ad->current_batch_expires = |
@@ -1256,7 +1070,7 @@ static int as_dispatch_request(request_queue_t *q, int force) | |||
1256 | ad->changed_batch = 1; | 1070 | ad->changed_batch = 1; |
1257 | } | 1071 | } |
1258 | ad->batch_data_dir = REQ_SYNC; | 1072 | ad->batch_data_dir = REQ_SYNC; |
1259 | arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); | 1073 | rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); |
1260 | ad->last_check_fifo[ad->batch_data_dir] = jiffies; | 1074 | ad->last_check_fifo[ad->batch_data_dir] = jiffies; |
1261 | goto dispatch_request; | 1075 | goto dispatch_request; |
1262 | } | 1076 | } |
@@ -1282,7 +1096,7 @@ dispatch_writes: | |||
1282 | ad->batch_data_dir = REQ_ASYNC; | 1096 | ad->batch_data_dir = REQ_ASYNC; |
1283 | ad->current_write_count = ad->write_batch_count; | 1097 | ad->current_write_count = ad->write_batch_count; |
1284 | ad->write_batch_idled = 0; | 1098 | ad->write_batch_idled = 0; |
1285 | arq = ad->next_arq[ad->batch_data_dir]; | 1099 | rq = ad->next_rq[ad->batch_data_dir]; |
1286 | goto dispatch_request; | 1100 | goto dispatch_request; |
1287 | } | 1101 | } |
1288 | 1102 | ||
@@ -1296,8 +1110,7 @@ dispatch_request: | |||
1296 | 1110 | ||
1297 | if (as_fifo_expired(ad, ad->batch_data_dir)) { | 1111 | if (as_fifo_expired(ad, ad->batch_data_dir)) { |
1298 | fifo_expired: | 1112 | fifo_expired: |
1299 | arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); | 1113 | rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); |
1300 | BUG_ON(arq == NULL); | ||
1301 | } | 1114 | } |
1302 | 1115 | ||
1303 | if (ad->changed_batch) { | 1116 | if (ad->changed_batch) { |
@@ -1316,70 +1129,58 @@ fifo_expired: | |||
1316 | } | 1129 | } |
1317 | 1130 | ||
1318 | /* | 1131 | /* |
1319 | * arq is the selected appropriate request. | 1132 | * rq is the selected appropriate request. |
1320 | */ | 1133 | */ |
1321 | as_move_to_dispatch(ad, arq); | 1134 | as_move_to_dispatch(ad, rq); |
1322 | 1135 | ||
1323 | return 1; | 1136 | return 1; |
1324 | } | 1137 | } |
1325 | 1138 | ||
1326 | /* | 1139 | /* |
1327 | * add arq to rbtree and fifo | 1140 | * add rq to rbtree and fifo |
1328 | */ | 1141 | */ |
1329 | static void as_add_request(request_queue_t *q, struct request *rq) | 1142 | static void as_add_request(request_queue_t *q, struct request *rq) |
1330 | { | 1143 | { |
1331 | struct as_data *ad = q->elevator->elevator_data; | 1144 | struct as_data *ad = q->elevator->elevator_data; |
1332 | struct as_rq *arq = RQ_DATA(rq); | ||
1333 | int data_dir; | 1145 | int data_dir; |
1334 | 1146 | ||
1335 | arq->state = AS_RQ_NEW; | 1147 | RQ_SET_STATE(rq, AS_RQ_NEW); |
1336 | 1148 | ||
1337 | if (rq_data_dir(arq->request) == READ | 1149 | data_dir = rq_is_sync(rq); |
1338 | || (arq->request->flags & REQ_RW_SYNC)) | ||
1339 | arq->is_sync = 1; | ||
1340 | else | ||
1341 | arq->is_sync = 0; | ||
1342 | data_dir = arq->is_sync; | ||
1343 | 1150 | ||
1344 | arq->io_context = as_get_io_context(); | 1151 | rq->elevator_private = as_get_io_context(q->node); |
1345 | 1152 | ||
1346 | if (arq->io_context) { | 1153 | if (RQ_IOC(rq)) { |
1347 | as_update_iohist(ad, arq->io_context->aic, arq->request); | 1154 | as_update_iohist(ad, RQ_IOC(rq)->aic, rq); |
1348 | atomic_inc(&arq->io_context->aic->nr_queued); | 1155 | atomic_inc(&RQ_IOC(rq)->aic->nr_queued); |
1349 | } | 1156 | } |
1350 | 1157 | ||
1351 | as_add_arq_rb(ad, arq); | 1158 | as_add_rq_rb(ad, rq); |
1352 | if (rq_mergeable(arq->request)) | ||
1353 | as_add_arq_hash(ad, arq); | ||
1354 | 1159 | ||
1355 | /* | 1160 | /* |
1356 | * set expire time (only used for reads) and add to fifo list | 1161 | * set expire time (only used for reads) and add to fifo list |
1357 | */ | 1162 | */ |
1358 | arq->expires = jiffies + ad->fifo_expire[data_dir]; | 1163 | rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); |
1359 | list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); | 1164 | list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); |
1360 | 1165 | ||
1361 | as_update_arq(ad, arq); /* keep state machine up to date */ | 1166 | as_update_rq(ad, rq); /* keep state machine up to date */ |
1362 | arq->state = AS_RQ_QUEUED; | 1167 | RQ_SET_STATE(rq, AS_RQ_QUEUED); |
1363 | } | 1168 | } |
1364 | 1169 | ||
1365 | static void as_activate_request(request_queue_t *q, struct request *rq) | 1170 | static void as_activate_request(request_queue_t *q, struct request *rq) |
1366 | { | 1171 | { |
1367 | struct as_rq *arq = RQ_DATA(rq); | 1172 | WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED); |
1368 | 1173 | RQ_SET_STATE(rq, AS_RQ_REMOVED); | |
1369 | WARN_ON(arq->state != AS_RQ_DISPATCHED); | 1174 | if (RQ_IOC(rq) && RQ_IOC(rq)->aic) |
1370 | arq->state = AS_RQ_REMOVED; | 1175 | atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched); |
1371 | if (arq->io_context && arq->io_context->aic) | ||
1372 | atomic_dec(&arq->io_context->aic->nr_dispatched); | ||
1373 | } | 1176 | } |
1374 | 1177 | ||
1375 | static void as_deactivate_request(request_queue_t *q, struct request *rq) | 1178 | static void as_deactivate_request(request_queue_t *q, struct request *rq) |
1376 | { | 1179 | { |
1377 | struct as_rq *arq = RQ_DATA(rq); | 1180 | WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED); |
1378 | 1181 | RQ_SET_STATE(rq, AS_RQ_DISPATCHED); | |
1379 | WARN_ON(arq->state != AS_RQ_REMOVED); | 1182 | if (RQ_IOC(rq) && RQ_IOC(rq)->aic) |
1380 | arq->state = AS_RQ_DISPATCHED; | 1183 | atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); |
1381 | if (arq->io_context && arq->io_context->aic) | ||
1382 | atomic_inc(&arq->io_context->aic->nr_dispatched); | ||
1383 | } | 1184 | } |
1384 | 1185 | ||
1385 | /* | 1186 | /* |
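In as_add_request() above, the FIFO no longer needs a side structure: the request's own queuelist node is the FIFO linkage, and rq_set_fifo_time() stashes the deadline in the request itself. The underlying pattern is the intrusive list plus container_of arithmetic, sketched here in userspace with hand-rolled list helpers for self-containment.

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *prev, *next; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
            n->prev = h->prev;
            n->next = h;
            h->prev->next = n;
            h->prev = n;
    }

    #define list_entry(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct request {
            struct list_head queuelist;   /* doubles as the FIFO linkage */
            unsigned long fifo_time;      /* deadline, replacing expires */
    };

    int main(void)
    {
            struct list_head fifo;
            struct request rq = { { NULL, NULL }, 42 };
            struct request *first;

            INIT_LIST_HEAD(&fifo);
            list_add_tail(&rq.queuelist, &fifo);      /* queue at FIFO tail */
            first = list_entry(fifo.next, struct request, queuelist);
            printf("first deadline: %lu\n", first->fifo_time);   /* 42 */
            return 0;
    }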
@@ -1396,93 +1197,35 @@ static int as_queue_empty(request_queue_t *q) | |||
1396 | && list_empty(&ad->fifo_list[REQ_SYNC]); | 1197 | && list_empty(&ad->fifo_list[REQ_SYNC]); |
1397 | } | 1198 | } |
1398 | 1199 | ||
1399 | static struct request *as_former_request(request_queue_t *q, | ||
1400 | struct request *rq) | ||
1401 | { | ||
1402 | struct as_rq *arq = RQ_DATA(rq); | ||
1403 | struct rb_node *rbprev = rb_prev(&arq->rb_node); | ||
1404 | struct request *ret = NULL; | ||
1405 | |||
1406 | if (rbprev) | ||
1407 | ret = rb_entry_arq(rbprev)->request; | ||
1408 | |||
1409 | return ret; | ||
1410 | } | ||
1411 | |||
1412 | static struct request *as_latter_request(request_queue_t *q, | ||
1413 | struct request *rq) | ||
1414 | { | ||
1415 | struct as_rq *arq = RQ_DATA(rq); | ||
1416 | struct rb_node *rbnext = rb_next(&arq->rb_node); | ||
1417 | struct request *ret = NULL; | ||
1418 | |||
1419 | if (rbnext) | ||
1420 | ret = rb_entry_arq(rbnext)->request; | ||
1421 | |||
1422 | return ret; | ||
1423 | } | ||
1424 | |||
1425 | static int | 1200 | static int |
1426 | as_merge(request_queue_t *q, struct request **req, struct bio *bio) | 1201 | as_merge(request_queue_t *q, struct request **req, struct bio *bio) |
1427 | { | 1202 | { |
1428 | struct as_data *ad = q->elevator->elevator_data; | 1203 | struct as_data *ad = q->elevator->elevator_data; |
1429 | sector_t rb_key = bio->bi_sector + bio_sectors(bio); | 1204 | sector_t rb_key = bio->bi_sector + bio_sectors(bio); |
1430 | struct request *__rq; | 1205 | struct request *__rq; |
1431 | int ret; | ||
1432 | |||
1433 | /* | ||
1434 | * see if the merge hash can satisfy a back merge | ||
1435 | */ | ||
1436 | __rq = as_find_arq_hash(ad, bio->bi_sector); | ||
1437 | if (__rq) { | ||
1438 | BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); | ||
1439 | |||
1440 | if (elv_rq_merge_ok(__rq, bio)) { | ||
1441 | ret = ELEVATOR_BACK_MERGE; | ||
1442 | goto out; | ||
1443 | } | ||
1444 | } | ||
1445 | 1206 | ||
1446 | /* | 1207 | /* |
1447 | * check for front merge | 1208 | * check for front merge |
1448 | */ | 1209 | */ |
1449 | __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); | 1210 | __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); |
1450 | if (__rq) { | 1211 | if (__rq && elv_rq_merge_ok(__rq, bio)) { |
1451 | BUG_ON(rb_key != rq_rb_key(__rq)); | 1212 | *req = __rq; |
1452 | 1213 | return ELEVATOR_FRONT_MERGE; | |
1453 | if (elv_rq_merge_ok(__rq, bio)) { | ||
1454 | ret = ELEVATOR_FRONT_MERGE; | ||
1455 | goto out; | ||
1456 | } | ||
1457 | } | 1214 | } |
1458 | 1215 | ||
1459 | return ELEVATOR_NO_MERGE; | 1216 | return ELEVATOR_NO_MERGE; |
1460 | out: | ||
1461 | if (ret) { | ||
1462 | if (rq_mergeable(__rq)) | ||
1463 | as_hot_arq_hash(ad, RQ_DATA(__rq)); | ||
1464 | } | ||
1465 | *req = __rq; | ||
1466 | return ret; | ||
1467 | } | 1217 | } |
1468 | 1218 | ||
1469 | static void as_merged_request(request_queue_t *q, struct request *req) | 1219 | static void as_merged_request(request_queue_t *q, struct request *req, int type) |
1470 | { | 1220 | { |
1471 | struct as_data *ad = q->elevator->elevator_data; | 1221 | struct as_data *ad = q->elevator->elevator_data; |
1472 | struct as_rq *arq = RQ_DATA(req); | ||
1473 | |||
1474 | /* | ||
1475 | * hash always needs to be repositioned, key is end sector | ||
1476 | */ | ||
1477 | as_del_arq_hash(arq); | ||
1478 | as_add_arq_hash(ad, arq); | ||
1479 | 1222 | ||
1480 | /* | 1223 | /* |
1481 | * if the merge was a front merge, we need to reposition request | 1224 | * if the merge was a front merge, we need to reposition request |
1482 | */ | 1225 | */ |
1483 | if (rq_rb_key(req) != arq->rb_key) { | 1226 | if (type == ELEVATOR_FRONT_MERGE) { |
1484 | as_del_arq_rb(ad, arq); | 1227 | as_del_rq_rb(ad, req); |
1485 | as_add_arq_rb(ad, arq); | 1228 | as_add_rq_rb(ad, req); |
1486 | /* | 1229 | /* |
1487 | * Note! At this stage of this and the next function, our next | 1230 | * Note! At this stage of this and the next function, our next |
1488 | * request may not be optimal - eg the request may have "grown" | 1231 | * request may not be optimal - eg the request may have "grown" |
@@ -1494,38 +1237,22 @@ static void as_merged_request(request_queue_t *q, struct request *req) | |||
1494 | static void as_merged_requests(request_queue_t *q, struct request *req, | 1237 | static void as_merged_requests(request_queue_t *q, struct request *req, |
1495 | struct request *next) | 1238 | struct request *next) |
1496 | { | 1239 | { |
1497 | struct as_data *ad = q->elevator->elevator_data; | ||
1498 | struct as_rq *arq = RQ_DATA(req); | ||
1499 | struct as_rq *anext = RQ_DATA(next); | ||
1500 | |||
1501 | BUG_ON(!arq); | ||
1502 | BUG_ON(!anext); | ||
1503 | |||
1504 | /* | 1240 | /* |
1505 | * reposition arq (this is the merged request) in hash, and in rbtree | 1241 | * if next expires before rq, assign its expire time to arq |
1506 | * in case of a front merge | 1242 | * and move into next position (next will be deleted) in fifo |
1507 | */ | 1243 | */ |
1508 | as_del_arq_hash(arq); | 1244 | if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { |
1509 | as_add_arq_hash(ad, arq); | 1245 | if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { |
1510 | 1246 | struct io_context *rioc = RQ_IOC(req); | |
1511 | if (rq_rb_key(req) != arq->rb_key) { | 1247 | struct io_context *nioc = RQ_IOC(next); |
1512 | as_del_arq_rb(ad, arq); | ||
1513 | as_add_arq_rb(ad, arq); | ||
1514 | } | ||
1515 | 1248 | ||
1516 | /* | 1249 | list_move(&req->queuelist, &next->queuelist); |
1517 | * if anext expires before arq, assign its expire time to arq | 1250 | rq_set_fifo_time(req, rq_fifo_time(next)); |
1518 | * and move into anext position (anext will be deleted) in fifo | ||
1519 | */ | ||
1520 | if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { | ||
1521 | if (time_before(anext->expires, arq->expires)) { | ||
1522 | list_move(&arq->fifo, &anext->fifo); | ||
1523 | arq->expires = anext->expires; | ||
1524 | /* | 1251 | /* |
1525 | * Don't copy here but swap, because when anext is | 1252 | * Don't copy here but swap, because when anext is |
1526 | * removed below, it must contain the unused context | 1253 | * removed below, it must contain the unused context |
1527 | */ | 1254 | */ |
1528 | swap_io_context(&arq->io_context, &anext->io_context); | 1255 | swap_io_context(&rioc, &nioc); |
1529 | } | 1256 | } |
1530 | } | 1257 | } |
1531 | 1258 | ||
@@ -1533,9 +1260,9 @@ static void as_merged_requests(request_queue_t *q, struct request *req, | |||
1533 | * kill knowledge of next, this one is a goner | 1260 | * kill knowledge of next, this one is a goner |
1534 | */ | 1261 | */ |
1535 | as_remove_queued_request(q, next); | 1262 | as_remove_queued_request(q, next); |
1536 | as_put_io_context(anext); | 1263 | as_put_io_context(next); |
1537 | 1264 | ||
1538 | anext->state = AS_RQ_MERGED; | 1265 | RQ_SET_STATE(next, AS_RQ_MERGED); |
1539 | } | 1266 | } |
1540 | 1267 | ||
1541 | /* | 1268 | /* |
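In as_merged_requests() above, when the merge victim next carries the earlier FIFO deadline, the surviving request moves into next's FIFO slot and adopts that time, so a merge can only tighten a deadline, never extend one. The rule, distilled into a sketch (wrap-safe comparison as in the kernel's time_before()):

    #include <stdio.h>

    static int time_before(unsigned long a, unsigned long b)
    {
            return (long)(a - b) < 0;
    }

    /* Deadline the surviving request ends up with after a merge. */
    static unsigned long merged_deadline(unsigned long req, unsigned long next)
    {
            return time_before(next, req) ? next : req;
    }

    int main(void)
    {
            printf("%lu\n", merged_deadline(200, 150));   /* 150: inherited */
            printf("%lu\n", merged_deadline(200, 300));   /* 200: kept */
            return 0;
    }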
@@ -1553,61 +1280,18 @@ static void as_work_handler(void *data) | |||
1553 | unsigned long flags; | 1280 | unsigned long flags; |
1554 | 1281 | ||
1555 | spin_lock_irqsave(q->queue_lock, flags); | 1282 | spin_lock_irqsave(q->queue_lock, flags); |
1556 | if (!as_queue_empty(q)) | 1283 | blk_start_queueing(q); |
1557 | q->request_fn(q); | ||
1558 | spin_unlock_irqrestore(q->queue_lock, flags); | 1284 | spin_unlock_irqrestore(q->queue_lock, flags); |
1559 | } | 1285 | } |
1560 | 1286 | ||
1561 | static void as_put_request(request_queue_t *q, struct request *rq) | 1287 | static int as_may_queue(request_queue_t *q, int rw) |
1562 | { | ||
1563 | struct as_data *ad = q->elevator->elevator_data; | ||
1564 | struct as_rq *arq = RQ_DATA(rq); | ||
1565 | |||
1566 | if (!arq) { | ||
1567 | WARN_ON(1); | ||
1568 | return; | ||
1569 | } | ||
1570 | |||
1571 | if (unlikely(arq->state != AS_RQ_POSTSCHED && | ||
1572 | arq->state != AS_RQ_PRESCHED && | ||
1573 | arq->state != AS_RQ_MERGED)) { | ||
1574 | printk("arq->state %d\n", arq->state); | ||
1575 | WARN_ON(1); | ||
1576 | } | ||
1577 | |||
1578 | mempool_free(arq, ad->arq_pool); | ||
1579 | rq->elevator_private = NULL; | ||
1580 | } | ||
1581 | |||
1582 | static int as_set_request(request_queue_t *q, struct request *rq, | ||
1583 | struct bio *bio, gfp_t gfp_mask) | ||
1584 | { | ||
1585 | struct as_data *ad = q->elevator->elevator_data; | ||
1586 | struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); | ||
1587 | |||
1588 | if (arq) { | ||
1589 | memset(arq, 0, sizeof(*arq)); | ||
1590 | RB_CLEAR_NODE(&arq->rb_node); | ||
1591 | arq->request = rq; | ||
1592 | arq->state = AS_RQ_PRESCHED; | ||
1593 | arq->io_context = NULL; | ||
1594 | INIT_HLIST_NODE(&arq->hash); | ||
1595 | INIT_LIST_HEAD(&arq->fifo); | ||
1596 | rq->elevator_private = arq; | ||
1597 | return 0; | ||
1598 | } | ||
1599 | |||
1600 | return 1; | ||
1601 | } | ||
1602 | |||
1603 | static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) | ||
1604 | { | 1288 | { |
1605 | int ret = ELV_MQUEUE_MAY; | 1289 | int ret = ELV_MQUEUE_MAY; |
1606 | struct as_data *ad = q->elevator->elevator_data; | 1290 | struct as_data *ad = q->elevator->elevator_data; |
1607 | struct io_context *ioc; | 1291 | struct io_context *ioc; |
1608 | if (ad->antic_status == ANTIC_WAIT_REQ || | 1292 | if (ad->antic_status == ANTIC_WAIT_REQ || |
1609 | ad->antic_status == ANTIC_WAIT_NEXT) { | 1293 | ad->antic_status == ANTIC_WAIT_NEXT) { |
1610 | ioc = as_get_io_context(); | 1294 | ioc = as_get_io_context(q->node); |
1611 | if (ad->io_context == ioc) | 1295 | if (ad->io_context == ioc) |
1612 | ret = ELV_MQUEUE_MUST; | 1296 | ret = ELV_MQUEUE_MUST; |
1613 | put_io_context(ioc); | 1297 | put_io_context(ioc); |
@@ -1626,23 +1310,16 @@ static void as_exit_queue(elevator_t *e) | |||
1626 | BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); | 1310 | BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); |
1627 | BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); | 1311 | BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); |
1628 | 1312 | ||
1629 | mempool_destroy(ad->arq_pool); | ||
1630 | put_io_context(ad->io_context); | 1313 | put_io_context(ad->io_context); |
1631 | kfree(ad->hash); | ||
1632 | kfree(ad); | 1314 | kfree(ad); |
1633 | } | 1315 | } |
1634 | 1316 | ||
1635 | /* | 1317 | /* |
1636 | * initialize elevator private data (as_data), and alloc a arq for | 1318 | * initialize elevator private data (as_data). |
1637 | * each request on the free lists | ||
1638 | */ | 1319 | */ |
1639 | static void *as_init_queue(request_queue_t *q, elevator_t *e) | 1320 | static void *as_init_queue(request_queue_t *q, elevator_t *e) |
1640 | { | 1321 | { |
1641 | struct as_data *ad; | 1322 | struct as_data *ad; |
1642 | int i; | ||
1643 | |||
1644 | if (!arq_pool) | ||
1645 | return NULL; | ||
1646 | 1323 | ||
1647 | ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); | 1324 | ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); |
1648 | if (!ad) | 1325 | if (!ad) |
@@ -1651,30 +1328,12 @@ static void *as_init_queue(request_queue_t *q, elevator_t *e) | |||
1651 | 1328 | ||
1652 | ad->q = q; /* Identify what queue the data belongs to */ | 1329 | ad->q = q; /* Identify what queue the data belongs to */ |
1653 | 1330 | ||
1654 | ad->hash = kmalloc_node(sizeof(struct hlist_head)*AS_HASH_ENTRIES, | ||
1655 | GFP_KERNEL, q->node); | ||
1656 | if (!ad->hash) { | ||
1657 | kfree(ad); | ||
1658 | return NULL; | ||
1659 | } | ||
1660 | |||
1661 | ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, | ||
1662 | mempool_free_slab, arq_pool, q->node); | ||
1663 | if (!ad->arq_pool) { | ||
1664 | kfree(ad->hash); | ||
1665 | kfree(ad); | ||
1666 | return NULL; | ||
1667 | } | ||
1668 | |||
1669 | /* anticipatory scheduling helpers */ | 1331 | /* anticipatory scheduling helpers */ |
1670 | ad->antic_timer.function = as_antic_timeout; | 1332 | ad->antic_timer.function = as_antic_timeout; |
1671 | ad->antic_timer.data = (unsigned long)q; | 1333 | ad->antic_timer.data = (unsigned long)q; |
1672 | init_timer(&ad->antic_timer); | 1334 | init_timer(&ad->antic_timer); |
1673 | INIT_WORK(&ad->antic_work, as_work_handler, q); | 1335 | INIT_WORK(&ad->antic_work, as_work_handler, q); |
1674 | 1336 | ||
1675 | for (i = 0; i < AS_HASH_ENTRIES; i++) | ||
1676 | INIT_HLIST_HEAD(&ad->hash[i]); | ||
1677 | |||
1678 | INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); | 1337 | INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); |
1679 | INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); | 1338 | INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); |
1680 | ad->sort_list[REQ_SYNC] = RB_ROOT; | 1339 | ad->sort_list[REQ_SYNC] = RB_ROOT; |
@@ -1787,10 +1446,8 @@ static struct elevator_type iosched_as = { | |||
1787 | .elevator_deactivate_req_fn = as_deactivate_request, | 1446 | .elevator_deactivate_req_fn = as_deactivate_request, |
1788 | .elevator_queue_empty_fn = as_queue_empty, | 1447 | .elevator_queue_empty_fn = as_queue_empty, |
1789 | .elevator_completed_req_fn = as_completed_request, | 1448 | .elevator_completed_req_fn = as_completed_request, |
1790 | .elevator_former_req_fn = as_former_request, | 1449 | .elevator_former_req_fn = elv_rb_former_request, |
1791 | .elevator_latter_req_fn = as_latter_request, | 1450 | .elevator_latter_req_fn = elv_rb_latter_request, |
1792 | .elevator_set_req_fn = as_set_request, | ||
1793 | .elevator_put_req_fn = as_put_request, | ||
1794 | .elevator_may_queue_fn = as_may_queue, | 1451 | .elevator_may_queue_fn = as_may_queue, |
1795 | .elevator_init_fn = as_init_queue, | 1452 | .elevator_init_fn = as_init_queue, |
1796 | .elevator_exit_fn = as_exit_queue, | 1453 | .elevator_exit_fn = as_exit_queue, |
@@ -1806,11 +1463,6 @@ static int __init as_init(void) | |||
1806 | { | 1463 | { |
1807 | int ret; | 1464 | int ret; |
1808 | 1465 | ||
1809 | arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), | ||
1810 | 0, 0, NULL, NULL); | ||
1811 | if (!arq_pool) | ||
1812 | return -ENOMEM; | ||
1813 | |||
1814 | ret = elv_register(&iosched_as); | 1466 | ret = elv_register(&iosched_as); |
1815 | if (!ret) { | 1467 | if (!ret) { |
1816 | /* | 1468 | /* |
@@ -1822,21 +1474,19 @@ static int __init as_init(void) | |||
1822 | return 0; | 1474 | return 0; |
1823 | } | 1475 | } |
1824 | 1476 | ||
1825 | kmem_cache_destroy(arq_pool); | ||
1826 | return ret; | 1477 | return ret; |
1827 | } | 1478 | } |
1828 | 1479 | ||
1829 | static void __exit as_exit(void) | 1480 | static void __exit as_exit(void) |
1830 | { | 1481 | { |
1831 | DECLARE_COMPLETION(all_gone); | 1482 | DECLARE_COMPLETION_ONSTACK(all_gone); |
1832 | elv_unregister(&iosched_as); | 1483 | elv_unregister(&iosched_as); |
1833 | ioc_gone = &all_gone; | 1484 | ioc_gone = &all_gone; |
1834 | /* ioc_gone's update must be visible before reading ioc_count */ | 1485 | /* ioc_gone's update must be visible before reading ioc_count */ |
1835 | smp_wmb(); | 1486 | smp_wmb(); |
1836 | if (atomic_read(&ioc_count)) | 1487 | if (elv_ioc_count_read(ioc_count)) |
1837 | wait_for_completion(ioc_gone); | 1488 | wait_for_completion(ioc_gone); |
1838 | synchronize_rcu(); | 1489 | synchronize_rcu(); |
1839 | kmem_cache_destroy(arq_pool); | ||
1840 | } | 1490 | } |
1841 | 1491 | ||
1842 | module_init(as_init); | 1492 | module_init(as_init); |
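The exit path now pairs DECLARE_COMPLETION_ONSTACK() with the per-CPU elv_ioc_count_read(): ioc_gone is published, smp_wmb() orders that store against the counter read, and the last io-context put fires the completion. A rough userspace analogue of the handshake using C11 atomics; this is an illustration of the ordering requirement, not equivalent code:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_long ioc_count_sk;
    static atomic_bool exiting_sk;

    static void module_exit_side(void)
    {
            atomic_store(&exiting_sk, true);        /* publish "ioc_gone" */
            if (atomic_load(&ioc_count_sk) != 0) {
                    /* kernel: wait_for_completion(ioc_gone) */
            }
    }

    static void ioc_put_side(void)
    {
            if (atomic_fetch_sub(&ioc_count_sk, 1) == 1 &&
                atomic_load(&exiting_sk)) {
                    /* kernel: complete(ioc_gone) */
            }
    }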
diff --git a/block/blktrace.c b/block/blktrace.c index 2b4ef2b89b8d..135593c8e45b 100644 --- a/block/blktrace.c +++ b/block/blktrace.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2006 Jens Axboe <axboe@suse.de> | 2 | * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 5 | * it under the terms of the GNU General Public License version 2 as |
@@ -69,7 +69,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK | |||
69 | /* | 69 | /* |
70 | * Bio action bits of interest | 70 | * Bio action bits of interest |
71 | */ | 71 | */ |
72 | static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD) }; | 72 | static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) }; |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * More could be added as needed, taking care to increment the decrementer | 75 | * More could be added as needed, taking care to increment the decrementer |
@@ -81,6 +81,8 @@ static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_AC | |||
81 | (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) | 81 | (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) |
82 | #define trace_ahead_bit(rw) \ | 82 | #define trace_ahead_bit(rw) \ |
83 | (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) | 83 | (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) |
84 | #define trace_meta_bit(rw) \ | ||
85 | (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3)) | ||
84 | 86 | ||
85 | /* | 87 | /* |
86 | * The worker for the various blk_add_trace*() types. Fills out a | 88 | * The worker for the various blk_add_trace*() types. Fills out a |
@@ -103,6 +105,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
103 | what |= bio_act[trace_barrier_bit(rw)]; | 105 | what |= bio_act[trace_barrier_bit(rw)]; |
104 | what |= bio_act[trace_sync_bit(rw)]; | 106 | what |= bio_act[trace_sync_bit(rw)]; |
105 | what |= bio_act[trace_ahead_bit(rw)]; | 107 | what |= bio_act[trace_ahead_bit(rw)]; |
108 | what |= bio_act[trace_meta_bit(rw)]; | ||
106 | 109 | ||
107 | pid = tsk->pid; | 110 | pid = tsk->pid; |
108 | if (unlikely(act_log_check(bt, what, sector, pid))) | 111 | if (unlikely(act_log_check(bt, what, sector, pid))) |
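The bio_act table works by shifting each rw flag bit to a distinct power-of-two index (1, 2, 4, 8), so ORing in the trace bits is four branch-free array lookups; adding BLK_TC_META means index 8 and hence a table of nine entries. A runnable sketch of the trick, with the bit positions taken from the macros above (AHEAD=1, BARRIER=2, SYNC=4, META=5) but best treated as illustrative:

    #include <stdio.h>

    enum { BIO_RW_AHEAD = 1, BIO_RW_BARRIER = 2, BIO_RW_SYNC = 4, BIO_RW_META = 5 };
    enum { TC_BARRIER = 0x1, TC_SYNC = 0x2, TC_AHEAD = 0x4, TC_META = 0x8 };

    static const unsigned bio_act_sk[9] = {
            [1] = TC_BARRIER, [2] = TC_SYNC, [4] = TC_AHEAD, [8] = TC_META,
    };

    #define barrier_bit(rw) (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
    #define sync_bit(rw)    (((rw) & (1 << BIO_RW_SYNC))    >> (BIO_RW_SYNC - 1))
    #define ahead_bit(rw)   (((rw) & (1 << BIO_RW_AHEAD))   << (2 - BIO_RW_AHEAD))
    #define meta_bit(rw)    (((rw) & (1 << BIO_RW_META))    >> (BIO_RW_META - 3))

    int main(void)
    {
            unsigned rw = (1 << BIO_RW_SYNC) | (1 << BIO_RW_META);
            unsigned what = 0;

            what |= bio_act_sk[barrier_bit(rw)];    /* index 0: no-op */
            what |= bio_act_sk[sync_bit(rw)];       /* index 2: TC_SYNC */
            what |= bio_act_sk[ahead_bit(rw)];      /* index 0: no-op */
            what |= bio_act_sk[meta_bit(rw)];       /* index 8: TC_META */
            printf("what = 0x%x\n", what);          /* 0xa */
            return 0;
    }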
@@ -450,8 +453,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
450 | **/ | 453 | **/ |
451 | void blk_trace_shutdown(request_queue_t *q) | 454 | void blk_trace_shutdown(request_queue_t *q) |
452 | { | 455 | { |
453 | blk_trace_startstop(q, 0); | 456 | if (q->blk_trace) { |
454 | blk_trace_remove(q); | 457 | blk_trace_startstop(q, 0); |
458 | blk_trace_remove(q); | ||
459 | } | ||
455 | } | 460 | } |
456 | 461 | ||
457 | /* | 462 | /* |
@@ -471,6 +476,9 @@ static void blk_check_time(unsigned long long *t) | |||
471 | *t -= (a + b) / 2; | 476 | *t -= (a + b) / 2; |
472 | } | 477 | } |
473 | 478 | ||
479 | /* | ||
480 | * calibrate our inter-CPU timings | ||
481 | */ | ||
474 | static void blk_trace_check_cpu_time(void *data) | 482 | static void blk_trace_check_cpu_time(void *data) |
475 | { | 483 | { |
476 | unsigned long long *t; | 484 | unsigned long long *t; |
@@ -488,20 +496,6 @@ static void blk_trace_check_cpu_time(void *data) | |||
488 | put_cpu(); | 496 | put_cpu(); |
489 | } | 497 | } |
490 | 498 | ||
491 | /* | ||
492 | * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU | ||
493 | * timings | ||
494 | */ | ||
495 | static void blk_trace_calibrate_offsets(void) | ||
496 | { | ||
497 | unsigned long flags; | ||
498 | |||
499 | smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1); | ||
500 | local_irq_save(flags); | ||
501 | blk_trace_check_cpu_time(NULL); | ||
502 | local_irq_restore(flags); | ||
503 | } | ||
504 | |||
505 | static void blk_trace_set_ht_offsets(void) | 499 | static void blk_trace_set_ht_offsets(void) |
506 | { | 500 | { |
507 | #if defined(CONFIG_SCHED_SMT) | 501 | #if defined(CONFIG_SCHED_SMT) |
@@ -530,7 +524,7 @@ static void blk_trace_set_ht_offsets(void) | |||
530 | static __init int blk_trace_init(void) | 524 | static __init int blk_trace_init(void) |
531 | { | 525 | { |
532 | mutex_init(&blk_tree_mutex); | 526 | mutex_init(&blk_tree_mutex); |
533 | blk_trace_calibrate_offsets(); | 527 | on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1); |
534 | blk_trace_set_ht_offsets(); | 528 | blk_trace_set_ht_offsets(); |
535 | 529 | ||
536 | return 0; | 530 | return 0; |
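blk_trace_calibrate_offsets() disappears because its body (a cross-call to every other CPU followed by a local invocation with interrupts off) is exactly the contract of on_each_cpu(). Roughly what on_each_cpu() expanded to in kernels of this vintage, reconstructed from memory and therefore approximate:

    int on_each_cpu(void (*func)(void *), void *info, int retry, int wait)
    {
            int ret;

            preempt_disable();
            ret = smp_call_function(func, info, retry, wait); /* other CPUs */
            local_irq_disable();
            func(info);                                       /* this CPU */
            local_irq_enable();
            preempt_enable();
            return ret;
    }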
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3a3aee08ec5f..d3d76136f53a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Based on ideas from a previously unfinished io | 4 | * Based on ideas from a previously unfinished io |
5 | * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. | 5 | * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. |
6 | * | 6 | * |
7 | * Copyright (C) 2003 Jens Axboe <axboe@suse.de> | 7 | * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> |
8 | */ | 8 | */ |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/blkdev.h> | 10 | #include <linux/blkdev.h> |
@@ -17,7 +17,6 @@ | |||
17 | * tunables | 17 | * tunables |
18 | */ | 18 | */ |
19 | static const int cfq_quantum = 4; /* max queue in one round of service */ | 19 | static const int cfq_quantum = 4; /* max queue in one round of service */ |
20 | static const int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ | ||
21 | static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; | 20 | static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; |
22 | static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ | 21 | static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ |
23 | static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ | 22 | static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ |
@@ -32,8 +31,6 @@ static int cfq_slice_idle = HZ / 125; | |||
32 | 31 | ||
33 | #define CFQ_KEY_ASYNC (0) | 32 | #define CFQ_KEY_ASYNC (0) |
34 | 33 | ||
35 | static DEFINE_SPINLOCK(cfq_exit_lock); | ||
36 | |||
37 | /* | 34 | /* |
38 | * for the hash of cfqq inside the cfqd | 35 | * for the hash of cfqq inside the cfqd |
39 | */ | 36 | */ |
@@ -41,37 +38,19 @@ static DEFINE_SPINLOCK(cfq_exit_lock); | |||
41 | #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) | 38 | #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) |
42 | #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) | 39 | #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) |
43 | 40 | ||
44 | /* | ||
45 | * for the hash of crq inside the cfqq | ||
46 | */ | ||
47 | #define CFQ_MHASH_SHIFT 6 | ||
48 | #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) | ||
49 | #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) | ||
50 | #define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT) | ||
51 | #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) | ||
52 | #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) | ||
53 | |||
54 | #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) | 41 | #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) |
55 | #define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) | ||
56 | 42 | ||
57 | #define RQ_DATA(rq) (rq)->elevator_private | 43 | #define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) |
44 | #define RQ_CFQQ(rq) ((rq)->elevator_private2) | ||
58 | 45 | ||
59 | /* | ||
60 | * rb-tree defines | ||
61 | */ | ||
62 | #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) | ||
63 | #define rq_rb_key(rq) (rq)->sector | ||
64 | |||
65 | static kmem_cache_t *crq_pool; | ||
66 | static kmem_cache_t *cfq_pool; | 46 | static kmem_cache_t *cfq_pool; |
67 | static kmem_cache_t *cfq_ioc_pool; | 47 | static kmem_cache_t *cfq_ioc_pool; |
68 | 48 | ||
69 | static atomic_t ioc_count = ATOMIC_INIT(0); | 49 | static DEFINE_PER_CPU(unsigned long, ioc_count); |
70 | static struct completion *ioc_gone; | 50 | static struct completion *ioc_gone; |
71 | 51 | ||
72 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR | 52 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR |
73 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) | 53 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) |
74 | #define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) | ||
75 | #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) | 54 | #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) |
76 | 55 | ||
77 | #define ASYNC (0) | 56 | #define ASYNC (0) |
@@ -103,29 +82,14 @@ struct cfq_data { | |||
103 | unsigned int busy_queues; | 82 | unsigned int busy_queues; |
104 | 83 | ||
105 | /* | 84 | /* |
106 | * non-ordered list of empty cfqq's | ||
107 | */ | ||
108 | struct list_head empty_list; | ||
109 | |||
110 | /* | ||
111 | * cfqq lookup hash | 85 | * cfqq lookup hash |
112 | */ | 86 | */ |
113 | struct hlist_head *cfq_hash; | 87 | struct hlist_head *cfq_hash; |
114 | 88 | ||
115 | /* | ||
116 | * global crq hash for all queues | ||
117 | */ | ||
118 | struct hlist_head *crq_hash; | ||
119 | |||
120 | mempool_t *crq_pool; | ||
121 | |||
122 | int rq_in_driver; | 89 | int rq_in_driver; |
123 | int hw_tag; | 90 | int hw_tag; |
124 | 91 | ||
125 | /* | 92 | /* |
126 | * schedule slice state info | ||
127 | */ | ||
128 | /* | ||
129 | * idle window management | 93 | * idle window management |
130 | */ | 94 | */ |
131 | struct timer_list idle_slice_timer; | 95 | struct timer_list idle_slice_timer; |
@@ -141,13 +105,10 @@ struct cfq_data { | |||
141 | sector_t last_sector; | 105 | sector_t last_sector; |
142 | unsigned long last_end_request; | 106 | unsigned long last_end_request; |
143 | 107 | ||
144 | unsigned int rq_starved; | ||
145 | |||
146 | /* | 108 | /* |
147 | * tunables, see top of file | 109 | * tunables, see top of file |
148 | */ | 110 | */ |
149 | unsigned int cfq_quantum; | 111 | unsigned int cfq_quantum; |
150 | unsigned int cfq_queued; | ||
151 | unsigned int cfq_fifo_expire[2]; | 112 | unsigned int cfq_fifo_expire[2]; |
152 | unsigned int cfq_back_penalty; | 113 | unsigned int cfq_back_penalty; |
153 | unsigned int cfq_back_max; | 114 | unsigned int cfq_back_max; |
@@ -170,23 +131,24 @@ struct cfq_queue { | |||
170 | struct hlist_node cfq_hash; | 131 | struct hlist_node cfq_hash; |
171 | /* hash key */ | 132 | /* hash key */ |
172 | unsigned int key; | 133 | unsigned int key; |
173 | /* on either rr or empty list of cfqd */ | 134 | /* member of the rr/busy/cur/idle cfqd list */ |
174 | struct list_head cfq_list; | 135 | struct list_head cfq_list; |
175 | /* sorted list of pending requests */ | 136 | /* sorted list of pending requests */ |
176 | struct rb_root sort_list; | 137 | struct rb_root sort_list; |
177 | /* if fifo isn't expired, next request to serve */ | 138 | /* if fifo isn't expired, next request to serve */ |
178 | struct cfq_rq *next_crq; | 139 | struct request *next_rq; |
179 | /* requests queued in sort_list */ | 140 | /* requests queued in sort_list */ |
180 | int queued[2]; | 141 | int queued[2]; |
181 | /* currently allocated requests */ | 142 | /* currently allocated requests */ |
182 | int allocated[2]; | 143 | int allocated[2]; |
144 | /* pending metadata requests */ | ||
145 | int meta_pending; | ||
183 | /* fifo list of requests in sort_list */ | 146 | /* fifo list of requests in sort_list */ |
184 | struct list_head fifo; | 147 | struct list_head fifo; |
185 | 148 | ||
186 | unsigned long slice_start; | 149 | unsigned long slice_start; |
187 | unsigned long slice_end; | 150 | unsigned long slice_end; |
188 | unsigned long slice_left; | 151 | unsigned long slice_left; |
189 | unsigned long service_last; | ||
190 | 152 | ||
191 | /* number of requests that are on the dispatch list */ | 153 | /* number of requests that are on the dispatch list */ |
192 | int on_dispatch[2]; | 154 | int on_dispatch[2]; |
@@ -199,18 +161,6 @@ struct cfq_queue { | |||
199 | unsigned int flags; | 161 | unsigned int flags; |
200 | }; | 162 | }; |
201 | 163 | ||
202 | struct cfq_rq { | ||
203 | struct rb_node rb_node; | ||
204 | sector_t rb_key; | ||
205 | struct request *request; | ||
206 | struct hlist_node hash; | ||
207 | |||
208 | struct cfq_queue *cfq_queue; | ||
209 | struct cfq_io_context *io_context; | ||
210 | |||
211 | unsigned int crq_flags; | ||
212 | }; | ||
213 | |||
214 | enum cfqq_state_flags { | 164 | enum cfqq_state_flags { |
215 | CFQ_CFQQ_FLAG_on_rr = 0, | 165 | CFQ_CFQQ_FLAG_on_rr = 0, |
216 | CFQ_CFQQ_FLAG_wait_request, | 166 | CFQ_CFQQ_FLAG_wait_request, |
@@ -220,6 +170,7 @@ enum cfqq_state_flags { | |||
220 | CFQ_CFQQ_FLAG_fifo_expire, | 170 | CFQ_CFQQ_FLAG_fifo_expire, |
221 | CFQ_CFQQ_FLAG_idle_window, | 171 | CFQ_CFQQ_FLAG_idle_window, |
222 | CFQ_CFQQ_FLAG_prio_changed, | 172 | CFQ_CFQQ_FLAG_prio_changed, |
173 | CFQ_CFQQ_FLAG_queue_new, | ||
223 | }; | 174 | }; |
224 | 175 | ||
225 | #define CFQ_CFQQ_FNS(name) \ | 176 | #define CFQ_CFQQ_FNS(name) \ |
@@ -244,70 +195,14 @@ CFQ_CFQQ_FNS(must_dispatch); | |||
244 | CFQ_CFQQ_FNS(fifo_expire); | 195 | CFQ_CFQQ_FNS(fifo_expire); |
245 | CFQ_CFQQ_FNS(idle_window); | 196 | CFQ_CFQQ_FNS(idle_window); |
246 | CFQ_CFQQ_FNS(prio_changed); | 197 | CFQ_CFQQ_FNS(prio_changed); |
198 | CFQ_CFQQ_FNS(queue_new); | ||
247 | #undef CFQ_CFQQ_FNS | 199 | #undef CFQ_CFQQ_FNS |
248 | 200 | ||
249 | enum cfq_rq_state_flags { | ||
250 | CFQ_CRQ_FLAG_is_sync = 0, | ||
251 | }; | ||
252 | |||
253 | #define CFQ_CRQ_FNS(name) \ | ||
254 | static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ | ||
255 | { \ | ||
256 | crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ | ||
257 | } \ | ||
258 | static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ | ||
259 | { \ | ||
260 | crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ | ||
261 | } \ | ||
262 | static inline int cfq_crq_##name(const struct cfq_rq *crq) \ | ||
263 | { \ | ||
264 | return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ | ||
265 | } | ||
266 | |||
267 | CFQ_CRQ_FNS(is_sync); | ||
268 | #undef CFQ_CRQ_FNS | ||
269 | |||
270 | static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); | 201 | static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); |
271 | static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); | 202 | static void cfq_dispatch_insert(request_queue_t *, struct request *); |
272 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); | 203 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); |
273 | 204 | ||
274 | /* | 205 | /* |
275 | * lots of deadline iosched dupes, can be abstracted later... | ||
276 | */ | ||
277 | static inline void cfq_del_crq_hash(struct cfq_rq *crq) | ||
278 | { | ||
279 | hlist_del_init(&crq->hash); | ||
280 | } | ||
281 | |||
282 | static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) | ||
283 | { | ||
284 | const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); | ||
285 | |||
286 | hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); | ||
287 | } | ||
288 | |||
289 | static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) | ||
290 | { | ||
291 | struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; | ||
292 | struct hlist_node *entry, *next; | ||
293 | |||
294 | hlist_for_each_safe(entry, next, hash_list) { | ||
295 | struct cfq_rq *crq = list_entry_hash(entry); | ||
296 | struct request *__rq = crq->request; | ||
297 | |||
298 | if (!rq_mergeable(__rq)) { | ||
299 | cfq_del_crq_hash(crq); | ||
300 | continue; | ||
301 | } | ||
302 | |||
303 | if (rq_hash_key(__rq) == offset) | ||
304 | return __rq; | ||
305 | } | ||
306 | |||
307 | return NULL; | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * scheduler run of queue, if there are requests pending and no one in the | 206 | * scheduler run of queue, if there are requests pending and no one in the |
312 | * driver that will restart queueing | 207 | * driver that will restart queueing |
313 | */ | 208 | */ |
@@ -333,12 +228,12 @@ static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) | |||
333 | } | 228 | } |
334 | 229 | ||
335 | /* | 230 | /* |
336 | * Lifted from AS - choose which of crq1 and crq2 that is best served now. | 231 | * Lifted from AS - choose which of rq1 and rq2 that is best served now. |
337 | * We choose the request that is closest to the head right now. Distance | 232 | * We choose the request that is closest to the head right now. Distance |
338 | * behind the head is penalized and only allowed to a certain extent. | 233 | * behind the head is penalized and only allowed to a certain extent. |
339 | */ | 234 | */ |
340 | static struct cfq_rq * | 235 | static struct request * |
341 | cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) | 236 | cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) |
342 | { | 237 | { |
343 | sector_t last, s1, s2, d1 = 0, d2 = 0; | 238 | sector_t last, s1, s2, d1 = 0, d2 = 0; |
344 | unsigned long back_max; | 239 | unsigned long back_max; |
@@ -346,18 +241,22 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) | |||
346 | #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ | 241 | #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
347 | unsigned wrap = 0; /* bit mask: requests behind the disk head? */ | 242 | unsigned wrap = 0; /* bit mask: requests behind the disk head? */ |
348 | 243 | ||
349 | if (crq1 == NULL || crq1 == crq2) | 244 | if (rq1 == NULL || rq1 == rq2) |
350 | return crq2; | 245 | return rq2; |
351 | if (crq2 == NULL) | 246 | if (rq2 == NULL) |
352 | return crq1; | 247 | return rq1; |
353 | 248 | ||
354 | if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2)) | 249 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
355 | return crq1; | 250 | return rq1; |
356 | else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1)) | 251 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
357 | return crq2; | 252 | return rq2; |
253 | if (rq_is_meta(rq1) && !rq_is_meta(rq2)) | ||
254 | return rq1; | ||
255 | else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) | ||
256 | return rq2; | ||
358 | 257 | ||
359 | s1 = crq1->request->sector; | 258 | s1 = rq1->sector; |
360 | s2 = crq2->request->sector; | 259 | s2 = rq2->sector; |
361 | 260 | ||
362 | last = cfqd->last_sector; | 261 | last = cfqd->last_sector; |
363 | 262 | ||
@@ -392,23 +291,23 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) | |||
392 | * check two variables for all permutations: --> faster! | 291 | * check two variables for all permutations: --> faster! |
393 | */ | 292 | */ |
394 | switch (wrap) { | 293 | switch (wrap) { |
395 | case 0: /* common case for CFQ: crq1 and crq2 not wrapped */ | 294 | case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
396 | if (d1 < d2) | 295 | if (d1 < d2) |
397 | return crq1; | 296 | return rq1; |
398 | else if (d2 < d1) | 297 | else if (d2 < d1) |
399 | return crq2; | 298 | return rq2; |
400 | else { | 299 | else { |
401 | if (s1 >= s2) | 300 | if (s1 >= s2) |
402 | return crq1; | 301 | return rq1; |
403 | else | 302 | else |
404 | return crq2; | 303 | return rq2; |
405 | } | 304 | } |
406 | 305 | ||
407 | case CFQ_RQ2_WRAP: | 306 | case CFQ_RQ2_WRAP: |
408 | return crq1; | 307 | return rq1; |
409 | case CFQ_RQ1_WRAP: | 308 | case CFQ_RQ1_WRAP: |
410 | return crq2; | 309 | return rq2; |
411 | case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */ | 310 | case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ |
412 | default: | 311 | default: |
413 | /* | 312 | /* |
414 | * Since both rqs are wrapped, | 313 | * Since both rqs are wrapped, |
@@ -417,50 +316,43 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) | |||
417 | * since back seek takes more time than forward. | 316 | * since back seek takes more time than forward. |
418 | */ | 317 | */ |
419 | if (s1 <= s2) | 318 | if (s1 <= s2) |
420 | return crq1; | 319 | return rq1; |
421 | else | 320 | else |
422 | return crq2; | 321 | return rq2; |
423 | } | 322 | } |
424 | } | 323 | } |
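The renamed cfq_choose_req() keeps the distance heuristic intact; only the sync test moved to rq_is_sync() and a metadata preference was added ahead of it. The core rule: distance in front of the head counts as-is, a short seek behind the head is allowed but multiplied by back_penalty, and anything further behind is treated as wrapped. A self-contained sketch of just the distance part, tie-breaks included:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    #define RQ1_WRAP 0x01
    #define RQ2_WRAP 0x02

    /* returns 1 if the request at s1 should be served next, else 2 */
    static int choose_sk(sector_t last, sector_t s1, sector_t s2,
                         sector_t back_max, unsigned penalty)
    {
            sector_t d1 = 0, d2 = 0;
            unsigned wrap = 0;

            if (s1 >= last)
                    d1 = s1 - last;                 /* in front of the head */
            else if (s1 + back_max >= last)
                    d1 = (last - s1) * penalty;     /* short backward seek */
            else
                    wrap |= RQ1_WRAP;               /* too far behind */

            if (s2 >= last)
                    d2 = s2 - last;
            else if (s2 + back_max >= last)
                    d2 = (last - s2) * penalty;
            else
                    wrap |= RQ2_WRAP;

            switch (wrap) {
            case 0:                          /* both usable: closest wins */
                    if (d1 < d2)
                            return 1;
                    if (d2 < d1)
                            return 2;
                    return s1 >= s2 ? 1 : 2;  /* tie: higher sector */
            case RQ2_WRAP:
                    return 1;
            case RQ1_WRAP:
                    return 2;
            default:                          /* both wrapped: lowest sector */
                    return s1 <= s2 ? 1 : 2;
            }
    }

    int main(void)
    {
            /* head at 1000: 990 is only 10 back (penalized to 20),
             * which still beats 1100 forward */
            printf("%d\n", choose_sk(1000, 1100, 990, 32, 2)); /* 2 */
            return 0;
    }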
425 | 324 | ||
426 | /* | 325 | /* |
427 | * would be nice to take fifo expire time into account as well | 326 | * would be nice to take fifo expire time into account as well |
428 | */ | 327 | */ |
429 | static struct cfq_rq * | 328 | static struct request * |
430 | cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 329 | cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
431 | struct cfq_rq *last) | 330 | struct request *last) |
432 | { | 331 | { |
433 | struct cfq_rq *crq_next = NULL, *crq_prev = NULL; | 332 | struct rb_node *rbnext = rb_next(&last->rb_node); |
434 | struct rb_node *rbnext, *rbprev; | 333 | struct rb_node *rbprev = rb_prev(&last->rb_node); |
435 | 334 | struct request *next = NULL, *prev = NULL; | |
436 | if (!(rbnext = rb_next(&last->rb_node))) { | ||
437 | rbnext = rb_first(&cfqq->sort_list); | ||
438 | if (rbnext == &last->rb_node) | ||
439 | rbnext = NULL; | ||
440 | } | ||
441 | 335 | ||
442 | rbprev = rb_prev(&last->rb_node); | 336 | BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
443 | 337 | ||
444 | if (rbprev) | 338 | if (rbprev) |
445 | crq_prev = rb_entry_crq(rbprev); | 339 | prev = rb_entry_rq(rbprev); |
446 | if (rbnext) | ||
447 | crq_next = rb_entry_crq(rbnext); | ||
448 | |||
449 | return cfq_choose_req(cfqd, crq_next, crq_prev); | ||
450 | } | ||
451 | 340 | ||
452 | static void cfq_update_next_crq(struct cfq_rq *crq) | 341 | if (rbnext) |
453 | { | 342 | next = rb_entry_rq(rbnext); |
454 | struct cfq_queue *cfqq = crq->cfq_queue; | 343 | else { |
344 | rbnext = rb_first(&cfqq->sort_list); | ||
345 | if (rbnext && rbnext != &last->rb_node) | ||
346 | next = rb_entry_rq(rbnext); | ||
347 | } | ||
455 | 348 | ||
456 | if (cfqq->next_crq == crq) | 349 | return cfq_choose_req(cfqd, next, prev); |
457 | cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); | ||
458 | } | 350 | } |
459 | 351 | ||
460 | static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) | 352 | static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) |
461 | { | 353 | { |
462 | struct cfq_data *cfqd = cfqq->cfqd; | 354 | struct cfq_data *cfqd = cfqq->cfqd; |
463 | struct list_head *list, *entry; | 355 | struct list_head *list; |
464 | 356 | ||
465 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | 357 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); |
466 | 358 | ||
@@ -485,31 +377,26 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) | |||
485 | } | 377 | } |
486 | 378 | ||
487 | /* | 379 | /* |
488 | * if queue was preempted, just add to front to be fair. busy_rr | 380 | * If this queue was preempted or is new (never been serviced), let |
489 | * isn't sorted, but insert at the back for fairness. | 381 | * it be added first for fairness but behind other new queues. |
382 | * Otherwise, just add to the back of the list. | ||
490 | */ | 383 | */ |
491 | if (preempted || list == &cfqd->busy_rr) { | 384 | if (preempted || cfq_cfqq_queue_new(cfqq)) { |
492 | if (preempted) | 385 | struct list_head *n = list; |
493 | list = list->prev; | 386 | struct cfq_queue *__cfqq; |
494 | 387 | ||
495 | list_add_tail(&cfqq->cfq_list, list); | 388 | while (n->next != list) { |
496 | return; | 389 | __cfqq = list_entry_cfqq(n->next); |
497 | } | 390 | if (!cfq_cfqq_queue_new(__cfqq)) |
391 | break; | ||
498 | 392 | ||
499 | /* | 393 | n = n->next; |
500 | * sort by when queue was last serviced | 394 | } |
501 | */ | ||
502 | entry = list; | ||
503 | while ((entry = entry->prev) != list) { | ||
504 | struct cfq_queue *__cfqq = list_entry_cfqq(entry); | ||
505 | 395 | ||
506 | if (!__cfqq->service_last) | 396 | list = n; |
507 | break; | ||
508 | if (time_before(__cfqq->service_last, cfqq->service_last)) | ||
509 | break; | ||
510 | } | 397 | } |
511 | 398 | ||
512 | list_add(&cfqq->cfq_list, entry); | 399 | list_add_tail(&cfqq->cfq_list, list); |
513 | } | 400 | } |
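The resort logic is simplified: instead of sorting by the dropped service_last stamp, a preempted or brand-new (queue_new) cfqq is inserted at the front of its list, but behind any earlier queues that are themselves still new, so newcomers are served in arrival order. An array-based sketch of the insertion-point rule:

    #include <stdio.h>

    struct q_sk { const char *name; int is_new; };

    /* returns the index at which a queue should be inserted */
    static int insert_pos_sk(const struct q_sk *list, int n,
                             int preempted, int is_new)
    {
            int i = 0;

            if (!preempted && !is_new)
                    return n;               /* old queue: plain tail add */
            while (i < n && list[i].is_new)
                    i++;                    /* stay behind the other new queues */
            return i;
    }

    int main(void)
    {
            struct q_sk rr[] = { { "new-a", 1 }, { "old-b", 0 } };

            printf("%d\n", insert_pos_sk(rr, 2, 0, 1)); /* 1: after new-a */
            return 0;
    }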
514 | 401 | ||
515 | /* | 402 | /* |
@@ -531,7 +418,7 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
531 | { | 418 | { |
532 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | 419 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); |
533 | cfq_clear_cfqq_on_rr(cfqq); | 420 | cfq_clear_cfqq_on_rr(cfqq); |
534 | list_move(&cfqq->cfq_list, &cfqd->empty_list); | 421 | list_del_init(&cfqq->cfq_list); |
535 | 422 | ||
536 | BUG_ON(!cfqd->busy_queues); | 423 | BUG_ON(!cfqd->busy_queues); |
537 | cfqd->busy_queues--; | 424 | cfqd->busy_queues--; |
@@ -540,81 +427,43 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
540 | /* | 427 | /* |
541 | * rb tree support functions | 428 | * rb tree support functions |
542 | */ | 429 | */ |
543 | static inline void cfq_del_crq_rb(struct cfq_rq *crq) | 430 | static inline void cfq_del_rq_rb(struct request *rq) |
544 | { | 431 | { |
545 | struct cfq_queue *cfqq = crq->cfq_queue; | 432 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
546 | struct cfq_data *cfqd = cfqq->cfqd; | 433 | struct cfq_data *cfqd = cfqq->cfqd; |
547 | const int sync = cfq_crq_is_sync(crq); | 434 | const int sync = rq_is_sync(rq); |
548 | 435 | ||
549 | BUG_ON(!cfqq->queued[sync]); | 436 | BUG_ON(!cfqq->queued[sync]); |
550 | cfqq->queued[sync]--; | 437 | cfqq->queued[sync]--; |
551 | 438 | ||
552 | cfq_update_next_crq(crq); | 439 | elv_rb_del(&cfqq->sort_list, rq); |
553 | |||
554 | rb_erase(&crq->rb_node, &cfqq->sort_list); | ||
555 | 440 | ||
556 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) | 441 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) |
557 | cfq_del_cfqq_rr(cfqd, cfqq); | 442 | cfq_del_cfqq_rr(cfqd, cfqq); |
558 | } | 443 | } |
559 | 444 | ||
560 | static struct cfq_rq * | 445 | static void cfq_add_rq_rb(struct request *rq) |
561 | __cfq_add_crq_rb(struct cfq_rq *crq) | ||
562 | { | 446 | { |
563 | struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; | 447 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
564 | struct rb_node *parent = NULL; | ||
565 | struct cfq_rq *__crq; | ||
566 | |||
567 | while (*p) { | ||
568 | parent = *p; | ||
569 | __crq = rb_entry_crq(parent); | ||
570 | |||
571 | if (crq->rb_key < __crq->rb_key) | ||
572 | p = &(*p)->rb_left; | ||
573 | else if (crq->rb_key > __crq->rb_key) | ||
574 | p = &(*p)->rb_right; | ||
575 | else | ||
576 | return __crq; | ||
577 | } | ||
578 | |||
579 | rb_link_node(&crq->rb_node, parent, p); | ||
580 | return NULL; | ||
581 | } | ||
582 | |||
583 | static void cfq_add_crq_rb(struct cfq_rq *crq) | ||
584 | { | ||
585 | struct cfq_queue *cfqq = crq->cfq_queue; | ||
586 | struct cfq_data *cfqd = cfqq->cfqd; | 448 | struct cfq_data *cfqd = cfqq->cfqd; |
587 | struct request *rq = crq->request; | 449 | struct request *__alias; |
588 | struct cfq_rq *__alias; | ||
589 | 450 | ||
590 | crq->rb_key = rq_rb_key(rq); | 451 | cfqq->queued[rq_is_sync(rq)]++; |
591 | cfqq->queued[cfq_crq_is_sync(crq)]++; | ||
592 | 452 | ||
593 | /* | 453 | /* |
594 | * looks a little odd, but the first insert might return an alias. | 454 | * looks a little odd, but the first insert might return an alias. |
595 | * if that happens, put the alias on the dispatch list | 455 | * if that happens, put the alias on the dispatch list |
596 | */ | 456 | */ |
597 | while ((__alias = __cfq_add_crq_rb(crq)) != NULL) | 457 | while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) |
598 | cfq_dispatch_insert(cfqd->queue, __alias); | 458 | cfq_dispatch_insert(cfqd->queue, __alias); |
599 | |||
600 | rb_insert_color(&crq->rb_node, &cfqq->sort_list); | ||
601 | |||
602 | if (!cfq_cfqq_on_rr(cfqq)) | ||
603 | cfq_add_cfqq_rr(cfqd, cfqq); | ||
604 | |||
605 | /* | ||
606 | * check if this request is a better next-serve candidate | ||
607 | */ | ||
608 | cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); | ||
609 | } | 459 | } |
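cfq_add_rq_rb() now leans on the generic elv_rb_add(), which returns the already-queued request when two requests share a sector key; the alias is pushed straight to dispatch (removing it from the tree) and the insert is retried. A plain-BST sketch of that insert-or-return-alias contract, with hypothetical names (the real structure is an rbtree):

    #include <stddef.h>

    struct node_sk {
            unsigned long long key;         /* request start sector */
            struct node_sk *left, *right;
    };

    /* insert n; on a duplicate key, return the existing node instead */
    static struct node_sk *tree_add_sk(struct node_sk **root, struct node_sk *n)
    {
            struct node_sk **p = root;

            while (*p) {
                    if (n->key < (*p)->key)
                            p = &(*p)->left;
                    else if (n->key > (*p)->key)
                            p = &(*p)->right;
                    else
                            return *p;      /* alias: same sector queued */
            }
            n->left = n->right = NULL;
            *p = n;
            return NULL;
    }

    /*
     * Caller, mirroring cfq_add_rq_rb():
     *
     *     while ((alias = tree_add_sk(&sort_list, rq)) != NULL)
     *             dispatch(alias);   // removal makes the retry succeed
     */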
610 | 460 | ||
611 | static inline void | 461 | static inline void |
612 | cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) | 462 | cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) |
613 | { | 463 | { |
614 | rb_erase(&crq->rb_node, &cfqq->sort_list); | 464 | elv_rb_del(&cfqq->sort_list, rq); |
615 | cfqq->queued[cfq_crq_is_sync(crq)]--; | 465 | cfqq->queued[rq_is_sync(rq)]--; |
616 | 466 | cfq_add_rq_rb(rq); | |
617 | cfq_add_crq_rb(crq); | ||
618 | } | 467 | } |
619 | 468 | ||
620 | static struct request * | 469 | static struct request * |
@@ -623,27 +472,14 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) | |||
623 | struct task_struct *tsk = current; | 472 | struct task_struct *tsk = current; |
624 | pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); | 473 | pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); |
625 | struct cfq_queue *cfqq; | 474 | struct cfq_queue *cfqq; |
626 | struct rb_node *n; | ||
627 | sector_t sector; | ||
628 | 475 | ||
629 | cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); | 476 | cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); |
630 | if (!cfqq) | 477 | if (cfqq) { |
631 | goto out; | 478 | sector_t sector = bio->bi_sector + bio_sectors(bio); |
632 | |||
633 | sector = bio->bi_sector + bio_sectors(bio); | ||
634 | n = cfqq->sort_list.rb_node; | ||
635 | while (n) { | ||
636 | struct cfq_rq *crq = rb_entry_crq(n); | ||
637 | 479 | ||
638 | if (sector < crq->rb_key) | 480 | return elv_rb_find(&cfqq->sort_list, sector); |
639 | n = n->rb_left; | ||
640 | else if (sector > crq->rb_key) | ||
641 | n = n->rb_right; | ||
642 | else | ||
643 | return crq->request; | ||
644 | } | 481 | } |
645 | 482 | ||
646 | out: | ||
647 | return NULL; | 483 | return NULL; |
648 | } | 484 | } |
649 | 485 | ||
@@ -673,11 +509,18 @@ static void cfq_deactivate_request(request_queue_t *q, struct request *rq) | |||
673 | 509 | ||
674 | static void cfq_remove_request(struct request *rq) | 510 | static void cfq_remove_request(struct request *rq) |
675 | { | 511 | { |
676 | struct cfq_rq *crq = RQ_DATA(rq); | 512 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
513 | |||
514 | if (cfqq->next_rq == rq) | ||
515 | cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); | ||
677 | 516 | ||
678 | list_del_init(&rq->queuelist); | 517 | list_del_init(&rq->queuelist); |
679 | cfq_del_crq_rb(crq); | 518 | cfq_del_rq_rb(rq); |
680 | cfq_del_crq_hash(crq); | 519 | |
520 | if (rq_is_meta(rq)) { | ||
521 | WARN_ON(!cfqq->meta_pending); | ||
522 | cfqq->meta_pending--; | ||
523 | } | ||
681 | } | 524 | } |
682 | 525 | ||
683 | static int | 526 | static int |
@@ -685,39 +528,23 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) | |||
685 | { | 528 | { |
686 | struct cfq_data *cfqd = q->elevator->elevator_data; | 529 | struct cfq_data *cfqd = q->elevator->elevator_data; |
687 | struct request *__rq; | 530 | struct request *__rq; |
688 | int ret; | ||
689 | |||
690 | __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); | ||
691 | if (__rq && elv_rq_merge_ok(__rq, bio)) { | ||
692 | ret = ELEVATOR_BACK_MERGE; | ||
693 | goto out; | ||
694 | } | ||
695 | 531 | ||
696 | __rq = cfq_find_rq_fmerge(cfqd, bio); | 532 | __rq = cfq_find_rq_fmerge(cfqd, bio); |
697 | if (__rq && elv_rq_merge_ok(__rq, bio)) { | 533 | if (__rq && elv_rq_merge_ok(__rq, bio)) { |
698 | ret = ELEVATOR_FRONT_MERGE; | 534 | *req = __rq; |
699 | goto out; | 535 | return ELEVATOR_FRONT_MERGE; |
700 | } | 536 | } |
701 | 537 | ||
702 | return ELEVATOR_NO_MERGE; | 538 | return ELEVATOR_NO_MERGE; |
703 | out: | ||
704 | *req = __rq; | ||
705 | return ret; | ||
706 | } | 539 | } |
707 | 540 | ||
708 | static void cfq_merged_request(request_queue_t *q, struct request *req) | 541 | static void cfq_merged_request(request_queue_t *q, struct request *req, |
542 | int type) | ||
709 | { | 543 | { |
710 | struct cfq_data *cfqd = q->elevator->elevator_data; | 544 | if (type == ELEVATOR_FRONT_MERGE) { |
711 | struct cfq_rq *crq = RQ_DATA(req); | 545 | struct cfq_queue *cfqq = RQ_CFQQ(req); |
712 | |||
713 | cfq_del_crq_hash(crq); | ||
714 | cfq_add_crq_hash(cfqd, crq); | ||
715 | |||
716 | if (rq_rb_key(req) != crq->rb_key) { | ||
717 | struct cfq_queue *cfqq = crq->cfq_queue; | ||
718 | 546 | ||
719 | cfq_update_next_crq(crq); | 547 | cfq_reposition_rq_rb(cfqq, req); |
720 | cfq_reposition_crq_rb(cfqq, crq); | ||
721 | } | 548 | } |
722 | } | 549 | } |
723 | 550 | ||
@@ -725,8 +552,6 @@ static void | |||
725 | cfq_merged_requests(request_queue_t *q, struct request *rq, | 552 | cfq_merged_requests(request_queue_t *q, struct request *rq, |
726 | struct request *next) | 553 | struct request *next) |
727 | { | 554 | { |
728 | cfq_merged_request(q, rq); | ||
729 | |||
730 | /* | 555 | /* |
731 | * reposition in fifo if next is older than rq | 556 | * reposition in fifo if next is older than rq |
732 | */ | 557 | */ |
@@ -768,13 +593,12 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
768 | if (cfq_cfqq_wait_request(cfqq)) | 593 | if (cfq_cfqq_wait_request(cfqq)) |
769 | del_timer(&cfqd->idle_slice_timer); | 594 | del_timer(&cfqd->idle_slice_timer); |
770 | 595 | ||
771 | if (!preempted && !cfq_cfqq_dispatched(cfqq)) { | 596 | if (!preempted && !cfq_cfqq_dispatched(cfqq)) |
772 | cfqq->service_last = now; | ||
773 | cfq_schedule_dispatch(cfqd); | 597 | cfq_schedule_dispatch(cfqd); |
774 | } | ||
775 | 598 | ||
776 | cfq_clear_cfqq_must_dispatch(cfqq); | 599 | cfq_clear_cfqq_must_dispatch(cfqq); |
777 | cfq_clear_cfqq_wait_request(cfqq); | 600 | cfq_clear_cfqq_wait_request(cfqq); |
601 | cfq_clear_cfqq_queue_new(cfqq); | ||
778 | 602 | ||
779 | /* | 603 | /* |
780 | * store what was left of this slice, if the queue idled out | 604 | * store what was left of this slice, if the queue idled out |
@@ -868,26 +692,25 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) | |||
868 | { | 692 | { |
869 | struct cfq_queue *cfqq = NULL; | 693 | struct cfq_queue *cfqq = NULL; |
870 | 694 | ||
871 | /* | 695 | if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) { |
872 | * if current list is non-empty, grab first entry. if it is empty, | 696 | /* |
873 | * get next prio level and grab first entry then if any are spliced | 697 | * if current list is non-empty, grab first entry. if it is |
874 | */ | 698 | * empty, get next prio level and grab first entry then if any |
875 | if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) | 699 | * are spliced |
700 | */ | ||
876 | cfqq = list_entry_cfqq(cfqd->cur_rr.next); | 701 | cfqq = list_entry_cfqq(cfqd->cur_rr.next); |
877 | 702 | } else if (!list_empty(&cfqd->busy_rr)) { | |
878 | /* | 703 | /* |
879 | * If no new queues are available, check if the busy list has some | 704 | * If no new queues are available, check if the busy list has |
880 | * before falling back to idle io. | 705 | * some before falling back to idle io. |
881 | */ | 706 | */ |
882 | if (!cfqq && !list_empty(&cfqd->busy_rr)) | ||
883 | cfqq = list_entry_cfqq(cfqd->busy_rr.next); | 707 | cfqq = list_entry_cfqq(cfqd->busy_rr.next); |
884 | 708 | } else if (!list_empty(&cfqd->idle_rr)) { | |
885 | /* | 709 | /* |
886 | * if we have idle queues and no rt or be queues had pending | 710 | * if we have idle queues and no rt or be queues had pending |
887 | * requests, either allow immediate service if the grace period | 711 | * requests, either allow immediate service if the grace period |
888 | * has passed or arm the idle grace timer | 712 | * has passed or arm the idle grace timer |
889 | */ | 713 | */ |
890 | if (!cfqq && !list_empty(&cfqd->idle_rr)) { | ||
891 | unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; | 714 | unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; |
892 | 715 | ||
893 | if (time_after_eq(jiffies, end)) | 716 | if (time_after_eq(jiffies, end)) |
@@ -942,16 +765,14 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
942 | return 1; | 765 | return 1; |
943 | } | 766 | } |
944 | 767 | ||
945 | static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq) | 768 | static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) |
946 | { | 769 | { |
947 | struct cfq_data *cfqd = q->elevator->elevator_data; | 770 | struct cfq_data *cfqd = q->elevator->elevator_data; |
948 | struct cfq_queue *cfqq = crq->cfq_queue; | 771 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
949 | struct request *rq; | ||
950 | 772 | ||
951 | cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); | 773 | cfq_remove_request(rq); |
952 | cfq_remove_request(crq->request); | 774 | cfqq->on_dispatch[rq_is_sync(rq)]++; |
953 | cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; | 775 | elv_dispatch_sort(q, rq); |
954 | elv_dispatch_sort(q, crq->request); | ||
955 | 776 | ||
956 | rq = list_entry(q->queue_head.prev, struct request, queuelist); | 777 | rq = list_entry(q->queue_head.prev, struct request, queuelist); |
957 | cfqd->last_sector = rq->sector + rq->nr_sectors; | 778 | cfqd->last_sector = rq->sector + rq->nr_sectors; |
@@ -960,24 +781,23 @@ static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq) | |||
960 | /* | 781 | /* |
961 | * return expired entry, or NULL to just start from scratch in rbtree | 782 | * return expired entry, or NULL to just start from scratch in rbtree |
962 | */ | 783 | */ |
963 | static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) | 784 | static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq) |
964 | { | 785 | { |
965 | struct cfq_data *cfqd = cfqq->cfqd; | 786 | struct cfq_data *cfqd = cfqq->cfqd; |
966 | struct request *rq; | 787 | struct request *rq; |
967 | struct cfq_rq *crq; | 788 | int fifo; |
968 | 789 | ||
969 | if (cfq_cfqq_fifo_expire(cfqq)) | 790 | if (cfq_cfqq_fifo_expire(cfqq)) |
970 | return NULL; | 791 | return NULL; |
792 | if (list_empty(&cfqq->fifo)) | ||
793 | return NULL; | ||
971 | 794 | ||
972 | if (!list_empty(&cfqq->fifo)) { | 795 | fifo = cfq_cfqq_class_sync(cfqq); |
973 | int fifo = cfq_cfqq_class_sync(cfqq); | 796 | rq = rq_entry_fifo(cfqq->fifo.next); |
974 | 797 | ||
975 | crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); | 798 | if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { |
976 | rq = crq->request; | 799 | cfq_mark_cfqq_fifo_expire(cfqq); |
977 | if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { | 800 | return rq; |
978 | cfq_mark_cfqq_fifo_expire(cfqq); | ||
979 | return crq; | ||
980 | } | ||
981 | } | 801 | } |
982 | 802 | ||
983 | return NULL; | 803 | return NULL; |
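cfq_check_fifo() reads more directly now: bail out early if this slice already took its one FIFO grab or the FIFO is empty, then expire the head request once its start_time plus the class-specific cfq_fifo_expire lies in the past. A runnable sketch of the expiry test, with jiffies modeled as an unsigned long:

    #include <stdio.h>

    /* signed-difference comparison so counter wraparound is handled */
    #define time_after_sk(a, b) ((long)(b) - (long)(a) < 0)

    static int fifo_expired_sk(unsigned long jiffies, unsigned long start_time,
                               unsigned long expire)
    {
            return time_after_sk(jiffies, start_time + expire);
    }

    int main(void)
    {
            /* queued at tick 1000, sync expiry HZ/4 with HZ=1000 */
            printf("%d\n", fifo_expired_sk(1300, 1000, 250)); /* 1: expired */
            return 0;
    }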
@@ -1063,25 +883,25 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1063 | BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); | 883 | BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); |
1064 | 884 | ||
1065 | do { | 885 | do { |
1066 | struct cfq_rq *crq; | 886 | struct request *rq; |
1067 | 887 | ||
1068 | /* | 888 | /* |
1069 | * follow expired path, else get first next available | 889 | * follow expired path, else get first next available |
1070 | */ | 890 | */ |
1071 | if ((crq = cfq_check_fifo(cfqq)) == NULL) | 891 | if ((rq = cfq_check_fifo(cfqq)) == NULL) |
1072 | crq = cfqq->next_crq; | 892 | rq = cfqq->next_rq; |
1073 | 893 | ||
1074 | /* | 894 | /* |
1075 | * finally, insert request into driver dispatch list | 895 | * finally, insert request into driver dispatch list |
1076 | */ | 896 | */ |
1077 | cfq_dispatch_insert(cfqd->queue, crq); | 897 | cfq_dispatch_insert(cfqd->queue, rq); |
1078 | 898 | ||
1079 | cfqd->dispatch_slice++; | 899 | cfqd->dispatch_slice++; |
1080 | dispatched++; | 900 | dispatched++; |
1081 | 901 | ||
1082 | if (!cfqd->active_cic) { | 902 | if (!cfqd->active_cic) { |
1083 | atomic_inc(&crq->io_context->ioc->refcount); | 903 | atomic_inc(&RQ_CIC(rq)->ioc->refcount); |
1084 | cfqd->active_cic = crq->io_context; | 904 | cfqd->active_cic = RQ_CIC(rq); |
1085 | } | 905 | } |
1086 | 906 | ||
1087 | if (RB_EMPTY_ROOT(&cfqq->sort_list)) | 907 | if (RB_EMPTY_ROOT(&cfqq->sort_list)) |
@@ -1112,13 +932,12 @@ static int | |||
1112 | cfq_forced_dispatch_cfqqs(struct list_head *list) | 932 | cfq_forced_dispatch_cfqqs(struct list_head *list) |
1113 | { | 933 | { |
1114 | struct cfq_queue *cfqq, *next; | 934 | struct cfq_queue *cfqq, *next; |
1115 | struct cfq_rq *crq; | ||
1116 | int dispatched; | 935 | int dispatched; |
1117 | 936 | ||
1118 | dispatched = 0; | 937 | dispatched = 0; |
1119 | list_for_each_entry_safe(cfqq, next, list, cfq_list) { | 938 | list_for_each_entry_safe(cfqq, next, list, cfq_list) { |
1120 | while ((crq = cfqq->next_crq)) { | 939 | while (cfqq->next_rq) { |
1121 | cfq_dispatch_insert(cfqq->cfqd->queue, crq); | 940 | cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); |
1122 | dispatched++; | 941 | dispatched++; |
1123 | } | 942 | } |
1124 | BUG_ON(!list_empty(&cfqq->fifo)); | 943 | BUG_ON(!list_empty(&cfqq->fifo)); |
@@ -1194,8 +1013,8 @@ cfq_dispatch_requests(request_queue_t *q, int force) | |||
1194 | } | 1013 | } |
1195 | 1014 | ||
1196 | /* | 1015 | /* |
1197 | * task holds one reference to the queue, dropped when task exits. each crq | 1016 | * task holds one reference to the queue, dropped when task exits. each rq |
1198 | * in-flight on this queue also holds a reference, dropped when crq is freed. | 1017 | * in-flight on this queue also holds a reference, dropped when rq is freed. |
1199 | * | 1018 | * |
1200 | * queue lock must be held here. | 1019 | * queue lock must be held here. |
1201 | */ | 1020 | */ |
@@ -1223,7 +1042,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
1223 | kmem_cache_free(cfq_pool, cfqq); | 1042 | kmem_cache_free(cfq_pool, cfqq); |
1224 | } | 1043 | } |
1225 | 1044 | ||
1226 | static inline struct cfq_queue * | 1045 | static struct cfq_queue * |
1227 | __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, | 1046 | __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, |
1228 | const int hashval) | 1047 | const int hashval) |
1229 | { | 1048 | { |
@@ -1260,62 +1079,63 @@ static void cfq_free_io_context(struct io_context *ioc) | |||
1260 | freed++; | 1079 | freed++; |
1261 | } | 1080 | } |
1262 | 1081 | ||
1263 | if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone) | 1082 | elv_ioc_count_mod(ioc_count, -freed); |
1083 | |||
1084 | if (ioc_gone && !elv_ioc_count_read(ioc_count)) | ||
1264 | complete(ioc_gone); | 1085 | complete(ioc_gone); |
1265 | } | 1086 | } |
1266 | 1087 | ||
1267 | static void cfq_trim(struct io_context *ioc) | 1088 | static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
1268 | { | 1089 | { |
1269 | ioc->set_ioprio = NULL; | 1090 | if (unlikely(cfqq == cfqd->active_queue)) |
1270 | cfq_free_io_context(ioc); | 1091 | __cfq_slice_expired(cfqd, cfqq, 0); |
1092 | |||
1093 | cfq_put_queue(cfqq); | ||
1271 | } | 1094 | } |
1272 | 1095 | ||
1273 | /* | 1096 | static void __cfq_exit_single_io_context(struct cfq_data *cfqd, |
1274 | * Called with interrupts disabled | 1097 | struct cfq_io_context *cic) |
1275 | */ | ||
1276 | static void cfq_exit_single_io_context(struct cfq_io_context *cic) | ||
1277 | { | 1098 | { |
1278 | struct cfq_data *cfqd = cic->key; | 1099 | list_del_init(&cic->queue_list); |
1279 | request_queue_t *q; | 1100 | smp_wmb(); |
1280 | 1101 | cic->key = NULL; | |
1281 | if (!cfqd) | ||
1282 | return; | ||
1283 | |||
1284 | q = cfqd->queue; | ||
1285 | |||
1286 | WARN_ON(!irqs_disabled()); | ||
1287 | |||
1288 | spin_lock(q->queue_lock); | ||
1289 | 1102 | ||
1290 | if (cic->cfqq[ASYNC]) { | 1103 | if (cic->cfqq[ASYNC]) { |
1291 | if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue)) | 1104 | cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); |
1292 | __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0); | ||
1293 | cfq_put_queue(cic->cfqq[ASYNC]); | ||
1294 | cic->cfqq[ASYNC] = NULL; | 1105 | cic->cfqq[ASYNC] = NULL; |
1295 | } | 1106 | } |
1296 | 1107 | ||
1297 | if (cic->cfqq[SYNC]) { | 1108 | if (cic->cfqq[SYNC]) { |
1298 | if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue)) | 1109 | cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); |
1299 | __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0); | ||
1300 | cfq_put_queue(cic->cfqq[SYNC]); | ||
1301 | cic->cfqq[SYNC] = NULL; | 1110 | cic->cfqq[SYNC] = NULL; |
1302 | } | 1111 | } |
1112 | } | ||
1303 | 1113 | ||
1304 | cic->key = NULL; | 1114 | |
1305 | list_del_init(&cic->queue_list); | 1115 | /* |
1306 | spin_unlock(q->queue_lock); | 1116 | * Called with interrupts disabled |
1117 | */ | ||
1118 | static void cfq_exit_single_io_context(struct cfq_io_context *cic) | ||
1119 | { | ||
1120 | struct cfq_data *cfqd = cic->key; | ||
1121 | |||
1122 | if (cfqd) { | ||
1123 | request_queue_t *q = cfqd->queue; | ||
1124 | |||
1125 | spin_lock_irq(q->queue_lock); | ||
1126 | __cfq_exit_single_io_context(cfqd, cic); | ||
1127 | spin_unlock_irq(q->queue_lock); | ||
1128 | } | ||
1307 | } | 1129 | } |
1308 | 1130 | ||
1309 | static void cfq_exit_io_context(struct io_context *ioc) | 1131 | static void cfq_exit_io_context(struct io_context *ioc) |
1310 | { | 1132 | { |
1311 | struct cfq_io_context *__cic; | 1133 | struct cfq_io_context *__cic; |
1312 | unsigned long flags; | ||
1313 | struct rb_node *n; | 1134 | struct rb_node *n; |
1314 | 1135 | ||
1315 | /* | 1136 | /* |
1316 | * put the reference this task is holding to the various queues | 1137 | * put the reference this task is holding to the various queues |
1317 | */ | 1138 | */ |
1318 | spin_lock_irqsave(&cfq_exit_lock, flags); | ||
1319 | 1139 | ||
1320 | n = rb_first(&ioc->cic_root); | 1140 | n = rb_first(&ioc->cic_root); |
1321 | while (n != NULL) { | 1141 | while (n != NULL) { |
@@ -1324,22 +1144,21 @@ static void cfq_exit_io_context(struct io_context *ioc) | |||
1324 | cfq_exit_single_io_context(__cic); | 1144 | cfq_exit_single_io_context(__cic); |
1325 | n = rb_next(n); | 1145 | n = rb_next(n); |
1326 | } | 1146 | } |
1327 | |||
1328 | spin_unlock_irqrestore(&cfq_exit_lock, flags); | ||
1329 | } | 1147 | } |
1330 | 1148 | ||
1331 | static struct cfq_io_context * | 1149 | static struct cfq_io_context * |
1332 | cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) | 1150 | cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) |
1333 | { | 1151 | { |
1334 | struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); | 1152 | struct cfq_io_context *cic; |
1335 | 1153 | ||
1154 | cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node); | ||
1336 | if (cic) { | 1155 | if (cic) { |
1337 | memset(cic, 0, sizeof(*cic)); | 1156 | memset(cic, 0, sizeof(*cic)); |
1338 | cic->last_end_request = jiffies; | 1157 | cic->last_end_request = jiffies; |
1339 | INIT_LIST_HEAD(&cic->queue_list); | 1158 | INIT_LIST_HEAD(&cic->queue_list); |
1340 | cic->dtor = cfq_free_io_context; | 1159 | cic->dtor = cfq_free_io_context; |
1341 | cic->exit = cfq_exit_io_context; | 1160 | cic->exit = cfq_exit_io_context; |
1342 | atomic_inc(&ioc_count); | 1161 | elv_ioc_count_inc(ioc_count); |
1343 | } | 1162 | } |
1344 | 1163 | ||
1345 | return cic; | 1164 | return cic; |
@@ -1420,15 +1239,12 @@ static inline void changed_ioprio(struct cfq_io_context *cic) | |||
1420 | spin_unlock(cfqd->queue->queue_lock); | 1239 | spin_unlock(cfqd->queue->queue_lock); |
1421 | } | 1240 | } |
1422 | 1241 | ||
1423 | /* | 1242 | static void cfq_ioc_set_ioprio(struct io_context *ioc) |
1424 | * callback from sys_ioprio_set, irqs are disabled | ||
1425 | */ | ||
1426 | static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) | ||
1427 | { | 1243 | { |
1428 | struct cfq_io_context *cic; | 1244 | struct cfq_io_context *cic; |
1429 | struct rb_node *n; | 1245 | struct rb_node *n; |
1430 | 1246 | ||
1431 | spin_lock(&cfq_exit_lock); | 1247 | ioc->ioprio_changed = 0; |
1432 | 1248 | ||
1433 | n = rb_first(&ioc->cic_root); | 1249 | n = rb_first(&ioc->cic_root); |
1434 | while (n != NULL) { | 1250 | while (n != NULL) { |
@@ -1437,10 +1253,6 @@ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) | |||
1437 | changed_ioprio(cic); | 1253 | changed_ioprio(cic); |
1438 | n = rb_next(n); | 1254 | n = rb_next(n); |
1439 | } | 1255 | } |
1440 | |||
1441 | spin_unlock(&cfq_exit_lock); | ||
1442 | |||
1443 | return 0; | ||
1444 | } | 1256 | } |
1445 | 1257 | ||
1446 | static struct cfq_queue * | 1258 | static struct cfq_queue * |
@@ -1460,12 +1272,18 @@ retry: | |||
1460 | cfqq = new_cfqq; | 1272 | cfqq = new_cfqq; |
1461 | new_cfqq = NULL; | 1273 | new_cfqq = NULL; |
1462 | } else if (gfp_mask & __GFP_WAIT) { | 1274 | } else if (gfp_mask & __GFP_WAIT) { |
1275 | /* | ||
1276 | * Inform the allocator of the fact that we will | ||
1277 | * just repeat this allocation if it fails, to allow | ||
1278 | * the allocator to do whatever it needs to attempt to | ||
1279 | * free memory. | ||
1280 | */ | ||
1463 | spin_unlock_irq(cfqd->queue->queue_lock); | 1281 | spin_unlock_irq(cfqd->queue->queue_lock); |
1464 | new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); | 1282 | new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node); |
1465 | spin_lock_irq(cfqd->queue->queue_lock); | 1283 | spin_lock_irq(cfqd->queue->queue_lock); |
1466 | goto retry; | 1284 | goto retry; |
1467 | } else { | 1285 | } else { |
1468 | cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); | 1286 | cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node); |
1469 | if (!cfqq) | 1287 | if (!cfqq) |
1470 | goto out; | 1288 | goto out; |
1471 | } | 1289 | } |
@@ -1480,13 +1298,13 @@ retry: | |||
1480 | hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); | 1298 | hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); |
1481 | atomic_set(&cfqq->ref, 0); | 1299 | atomic_set(&cfqq->ref, 0); |
1482 | cfqq->cfqd = cfqd; | 1300 | cfqq->cfqd = cfqd; |
1483 | cfqq->service_last = 0; | ||
1484 | /* | 1301 | /* |
1485 | * set ->slice_left to allow preemption for a new process | 1302 | * set ->slice_left to allow preemption for a new process |
1486 | */ | 1303 | */ |
1487 | cfqq->slice_left = 2 * cfqd->cfq_slice_idle; | 1304 | cfqq->slice_left = 2 * cfqd->cfq_slice_idle; |
1488 | cfq_mark_cfqq_idle_window(cfqq); | 1305 | cfq_mark_cfqq_idle_window(cfqq); |
1489 | cfq_mark_cfqq_prio_changed(cfqq); | 1306 | cfq_mark_cfqq_prio_changed(cfqq); |
1307 | cfq_mark_cfqq_queue_new(cfqq); | ||
1490 | cfq_init_prio_data(cfqq); | 1308 | cfq_init_prio_data(cfqq); |
1491 | } | 1309 | } |
1492 | 1310 | ||
@@ -1502,12 +1320,10 @@ out: | |||
1502 | static void | 1320 | static void |
1503 | cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) | 1321 | cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) |
1504 | { | 1322 | { |
1505 | spin_lock(&cfq_exit_lock); | 1323 | WARN_ON(!list_empty(&cic->queue_list)); |
1506 | rb_erase(&cic->rb_node, &ioc->cic_root); | 1324 | rb_erase(&cic->rb_node, &ioc->cic_root); |
1507 | list_del_init(&cic->queue_list); | ||
1508 | spin_unlock(&cfq_exit_lock); | ||
1509 | kmem_cache_free(cfq_ioc_pool, cic); | 1325 | kmem_cache_free(cfq_ioc_pool, cic); |
1510 | atomic_dec(&ioc_count); | 1326 | elv_ioc_count_dec(ioc_count); |
1511 | } | 1327 | } |
1512 | 1328 | ||
1513 | static struct cfq_io_context * | 1329 | static struct cfq_io_context * |
@@ -1551,7 +1367,6 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, | |||
1551 | cic->ioc = ioc; | 1367 | cic->ioc = ioc; |
1552 | cic->key = cfqd; | 1368 | cic->key = cfqd; |
1553 | 1369 | ||
1554 | ioc->set_ioprio = cfq_ioc_set_ioprio; | ||
1555 | restart: | 1370 | restart: |
1556 | parent = NULL; | 1371 | parent = NULL; |
1557 | p = &ioc->cic_root.rb_node; | 1372 | p = &ioc->cic_root.rb_node; |
@@ -1573,11 +1388,12 @@ restart: | |||
1573 | BUG(); | 1388 | BUG(); |
1574 | } | 1389 | } |
1575 | 1390 | ||
1576 | spin_lock(&cfq_exit_lock); | ||
1577 | rb_link_node(&cic->rb_node, parent, p); | 1391 | rb_link_node(&cic->rb_node, parent, p); |
1578 | rb_insert_color(&cic->rb_node, &ioc->cic_root); | 1392 | rb_insert_color(&cic->rb_node, &ioc->cic_root); |
1393 | |||
1394 | spin_lock_irq(cfqd->queue->queue_lock); | ||
1579 | list_add(&cic->queue_list, &cfqd->cic_list); | 1395 | list_add(&cic->queue_list, &cfqd->cic_list); |
1580 | spin_unlock(&cfq_exit_lock); | 1396 | spin_unlock_irq(cfqd->queue->queue_lock); |
1581 | } | 1397 | } |
1582 | 1398 | ||
1583 | /* | 1399 | /* |
@@ -1593,7 +1409,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) | |||
1593 | 1409 | ||
1594 | might_sleep_if(gfp_mask & __GFP_WAIT); | 1410 | might_sleep_if(gfp_mask & __GFP_WAIT); |
1595 | 1411 | ||
1596 | ioc = get_io_context(gfp_mask); | 1412 | ioc = get_io_context(gfp_mask, cfqd->queue->node); |
1597 | if (!ioc) | 1413 | if (!ioc) |
1598 | return NULL; | 1414 | return NULL; |
1599 | 1415 | ||
@@ -1607,6 +1423,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) | |||
1607 | 1423 | ||
1608 | cfq_cic_link(cfqd, ioc, cic); | 1424 | cfq_cic_link(cfqd, ioc, cic); |
1609 | out: | 1425 | out: |
1426 | smp_read_barrier_depends(); | ||
1427 | if (unlikely(ioc->ioprio_changed)) | ||
1428 | cfq_ioc_set_ioprio(ioc); | ||
1429 | |||
1610 | return cic; | 1430 | return cic; |
1611 | err: | 1431 | err: |
1612 | put_io_context(ioc); | 1432 | put_io_context(ioc); |
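With ioc->set_ioprio and the global cfq_exit_lock gone, ioprio updates are propagated lazily: the ionice path only marks the io_context, and the check added at out: above applies the new priority on the next context lookup. The setter side, per the companion fs/ioprio.c change, looks roughly like this (permission checks and task locking elided; a sketch, not verbatim source):

    static int set_task_ioprio(struct task_struct *task, int ioprio)
    {
            struct io_context *ioc;

            task->ioprio = ioprio;

            ioc = task->io_context;
            /* pairs with the barrier read in cfq_get_io_context() */
            smp_read_barrier_depends();
            if (ioc)
                    ioc->ioprio_changed = 1;    /* consumed at the out: label above */

            return 0;
    }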
@@ -1640,15 +1460,15 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) | |||
1640 | 1460 | ||
1641 | static void | 1461 | static void |
1642 | cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, | 1462 | cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, |
1643 | struct cfq_rq *crq) | 1463 | struct request *rq) |
1644 | { | 1464 | { |
1645 | sector_t sdist; | 1465 | sector_t sdist; |
1646 | u64 total; | 1466 | u64 total; |
1647 | 1467 | ||
1648 | if (cic->last_request_pos < crq->request->sector) | 1468 | if (cic->last_request_pos < rq->sector) |
1649 | sdist = crq->request->sector - cic->last_request_pos; | 1469 | sdist = rq->sector - cic->last_request_pos; |
1650 | else | 1470 | else |
1651 | sdist = cic->last_request_pos - crq->request->sector; | 1471 | sdist = cic->last_request_pos - rq->sector; |
1652 | 1472 | ||
1653 | /* | 1473 | /* |
1654 | * Don't allow the seek distance to get too large from the | 1474 | * Don't allow the seek distance to get too large from the |
@@ -1699,7 +1519,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1699 | */ | 1519 | */ |
1700 | static int | 1520 | static int |
1701 | cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | 1521 | cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, |
1702 | struct cfq_rq *crq) | 1522 | struct request *rq) |
1703 | { | 1523 | { |
1704 | struct cfq_queue *cfqq = cfqd->active_queue; | 1524 | struct cfq_queue *cfqq = cfqd->active_queue; |
1705 | 1525 | ||
@@ -1718,7 +1538,17 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
1718 | */ | 1538 | */ |
1719 | if (new_cfqq->slice_left < cfqd->cfq_slice_idle) | 1539 | if (new_cfqq->slice_left < cfqd->cfq_slice_idle) |
1720 | return 0; | 1540 | return 0; |
1721 | if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) | 1541 | /* |
1542 | * if the new request is sync, but the currently running queue is | ||
1543 | * not, let the sync request have priority. | ||
1544 | */ | ||
1545 | if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) | ||
1546 | return 1; | ||
1547 | /* | ||
1548 | * So both queues are sync. Let the new request get disk time if | ||
1549 | * it's a metadata request and the current queue is doing regular IO. | ||
1550 | */ | ||
1551 | if (rq_is_meta(rq) && !cfqq->meta_pending) | ||
1722 | return 1; | 1552 | return 1; |
1723 | 1553 | ||
1724 | return 0; | 1554 | return 0; |
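The rewritten preemption test keys off generic request flags instead of per-crq state: reads count as synchronous, writes only when explicitly flagged, and REQ_RW_META marks filesystem metadata. In the blkdev.h of this series the two predicates amount to simple cmd_flags checks, approximately:

    #define rq_is_sync(rq)      (rq_data_dir(rq) == READ || ((rq)->cmd_flags & REQ_RW_SYNC))
    #define rq_is_meta(rq)      ((rq)->cmd_flags & REQ_RW_META)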
@@ -1730,47 +1560,45 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
1730 | */ | 1560 | */ |
1731 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 1561 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
1732 | { | 1562 | { |
1733 | struct cfq_queue *__cfqq, *next; | 1563 | cfq_slice_expired(cfqd, 1); |
1734 | |||
1735 | list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) | ||
1736 | cfq_resort_rr_list(__cfqq, 1); | ||
1737 | 1564 | ||
1738 | if (!cfqq->slice_left) | 1565 | if (!cfqq->slice_left) |
1739 | cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; | 1566 | cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; |
1740 | 1567 | ||
1741 | cfqq->slice_end = cfqq->slice_left + jiffies; | 1568 | /* |
1742 | cfq_slice_expired(cfqd, 1); | 1569 | * Put the new queue at the front of the current list, |
1743 | __cfq_set_active_queue(cfqd, cfqq); | 1570 | * so we know that it will be selected next. |
1744 | } | 1571 | */ |
1745 | 1572 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | |
1746 | /* | 1573 | list_move(&cfqq->cfq_list, &cfqd->cur_rr); |
1747 | * should really be a ll_rw_blk.c helper | ||
1748 | */ | ||
1749 | static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1750 | { | ||
1751 | request_queue_t *q = cfqd->queue; | ||
1752 | 1574 | ||
1753 | if (!blk_queue_plugged(q)) | 1575 | cfqq->slice_end = cfqq->slice_left + jiffies; |
1754 | q->request_fn(q); | ||
1755 | else | ||
1756 | __generic_unplug_device(q); | ||
1757 | } | 1576 | } |
1758 | 1577 | ||
1759 | /* | 1578 | /* |
1760 | * Called when a new fs request (crq) is added (to cfqq). Check if there's | 1579 | * Called when a new fs request (rq) is added (to cfqq). Check if there's |
1761 | * something we should do about it | 1580 | * something we should do about it |
1762 | */ | 1581 | */ |
1763 | static void | 1582 | static void |
1764 | cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 1583 | cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
1765 | struct cfq_rq *crq) | 1584 | struct request *rq) |
1766 | { | 1585 | { |
1767 | struct cfq_io_context *cic = crq->io_context; | 1586 | struct cfq_io_context *cic = RQ_CIC(rq); |
1587 | |||
1588 | if (rq_is_meta(rq)) | ||
1589 | cfqq->meta_pending++; | ||
1590 | |||
1591 | /* | ||
1592 | * check if this request is a better next-serve candidate | ||
1593 | */ | ||
1594 | cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); | ||
1595 | BUG_ON(!cfqq->next_rq); | ||
1768 | 1596 | ||
1769 | /* | 1597 | /* |
1770 | * we never wait for an async request and we don't allow preemption | 1598 | * we never wait for an async request and we don't allow preemption |
1771 | * of an async request. so just return early | 1599 | * of an async request. so just return early |
1772 | */ | 1600 | */ |
1773 | if (!cfq_crq_is_sync(crq)) { | 1601 | if (!rq_is_sync(rq)) { |
1774 | /* | 1602 | /* |
1775 | * sync process issued an async request, if it's waiting | 1603 | * sync process issued an async request, if it's waiting |
1776 | * then expire it and kick rq handling. | 1604 | * then expire it and kick rq handling. |
@@ -1778,17 +1606,17 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1778 | if (cic == cfqd->active_cic && | 1606 | if (cic == cfqd->active_cic && |
1779 | del_timer(&cfqd->idle_slice_timer)) { | 1607 | del_timer(&cfqd->idle_slice_timer)) { |
1780 | cfq_slice_expired(cfqd, 0); | 1608 | cfq_slice_expired(cfqd, 0); |
1781 | cfq_start_queueing(cfqd, cfqq); | 1609 | blk_start_queueing(cfqd->queue); |
1782 | } | 1610 | } |
1783 | return; | 1611 | return; |
1784 | } | 1612 | } |
1785 | 1613 | ||
1786 | cfq_update_io_thinktime(cfqd, cic); | 1614 | cfq_update_io_thinktime(cfqd, cic); |
1787 | cfq_update_io_seektime(cfqd, cic, crq); | 1615 | cfq_update_io_seektime(cfqd, cic, rq); |
1788 | cfq_update_idle_window(cfqd, cfqq, cic); | 1616 | cfq_update_idle_window(cfqd, cfqq, cic); |
1789 | 1617 | ||
1790 | cic->last_queue = jiffies; | 1618 | cic->last_queue = jiffies; |
1791 | cic->last_request_pos = crq->request->sector + crq->request->nr_sectors; | 1619 | cic->last_request_pos = rq->sector + rq->nr_sectors; |
1792 | 1620 | ||
1793 | if (cfqq == cfqd->active_queue) { | 1621 | if (cfqq == cfqd->active_queue) { |
1794 | /* | 1622 | /* |
@@ -1799,9 +1627,9 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1799 | if (cfq_cfqq_wait_request(cfqq)) { | 1627 | if (cfq_cfqq_wait_request(cfqq)) { |
1800 | cfq_mark_cfqq_must_dispatch(cfqq); | 1628 | cfq_mark_cfqq_must_dispatch(cfqq); |
1801 | del_timer(&cfqd->idle_slice_timer); | 1629 | del_timer(&cfqd->idle_slice_timer); |
1802 | cfq_start_queueing(cfqd, cfqq); | 1630 | blk_start_queueing(cfqd->queue); |
1803 | } | 1631 | } |
1804 | } else if (cfq_should_preempt(cfqd, cfqq, crq)) { | 1632 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { |
1805 | /* | 1633 | /* |
1806 | * not the active queue - expire current slice if it is | 1634 | * not the active queue - expire current slice if it is |
1807 | * idle and has expired its mean thinktime or this new queue | 1635 | * idle and has expired its mean thinktime or this new queue |
@@ -1809,34 +1637,32 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1809 | */ | 1637 | */ |
1810 | cfq_preempt_queue(cfqd, cfqq); | 1638 | cfq_preempt_queue(cfqd, cfqq); |
1811 | cfq_mark_cfqq_must_dispatch(cfqq); | 1639 | cfq_mark_cfqq_must_dispatch(cfqq); |
1812 | cfq_start_queueing(cfqd, cfqq); | 1640 | blk_start_queueing(cfqd->queue); |
1813 | } | 1641 | } |
1814 | } | 1642 | } |
1815 | 1643 | ||
1816 | static void cfq_insert_request(request_queue_t *q, struct request *rq) | 1644 | static void cfq_insert_request(request_queue_t *q, struct request *rq) |
1817 | { | 1645 | { |
1818 | struct cfq_data *cfqd = q->elevator->elevator_data; | 1646 | struct cfq_data *cfqd = q->elevator->elevator_data; |
1819 | struct cfq_rq *crq = RQ_DATA(rq); | 1647 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
1820 | struct cfq_queue *cfqq = crq->cfq_queue; | ||
1821 | 1648 | ||
1822 | cfq_init_prio_data(cfqq); | 1649 | cfq_init_prio_data(cfqq); |
1823 | 1650 | ||
1824 | cfq_add_crq_rb(crq); | 1651 | cfq_add_rq_rb(rq); |
1825 | 1652 | ||
1826 | list_add_tail(&rq->queuelist, &cfqq->fifo); | 1653 | if (!cfq_cfqq_on_rr(cfqq)) |
1654 | cfq_add_cfqq_rr(cfqd, cfqq); | ||
1827 | 1655 | ||
1828 | if (rq_mergeable(rq)) | 1656 | list_add_tail(&rq->queuelist, &cfqq->fifo); |
1829 | cfq_add_crq_hash(cfqd, crq); | ||
1830 | 1657 | ||
1831 | cfq_crq_enqueued(cfqd, cfqq, crq); | 1658 | cfq_rq_enqueued(cfqd, cfqq, rq); |
1832 | } | 1659 | } |
1833 | 1660 | ||
1834 | static void cfq_completed_request(request_queue_t *q, struct request *rq) | 1661 | static void cfq_completed_request(request_queue_t *q, struct request *rq) |
1835 | { | 1662 | { |
1836 | struct cfq_rq *crq = RQ_DATA(rq); | 1663 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
1837 | struct cfq_queue *cfqq = crq->cfq_queue; | ||
1838 | struct cfq_data *cfqd = cfqq->cfqd; | 1664 | struct cfq_data *cfqd = cfqq->cfqd; |
1839 | const int sync = cfq_crq_is_sync(crq); | 1665 | const int sync = rq_is_sync(rq); |
1840 | unsigned long now; | 1666 | unsigned long now; |
1841 | 1667 | ||
1842 | now = jiffies; | 1668 | now = jiffies; |
@@ -1849,15 +1675,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) | |||
1849 | if (!cfq_class_idle(cfqq)) | 1675 | if (!cfq_class_idle(cfqq)) |
1850 | cfqd->last_end_request = now; | 1676 | cfqd->last_end_request = now; |
1851 | 1677 | ||
1852 | if (!cfq_cfqq_dispatched(cfqq)) { | 1678 | if (!cfq_cfqq_dispatched(cfqq) && cfq_cfqq_on_rr(cfqq)) |
1853 | if (cfq_cfqq_on_rr(cfqq)) { | 1679 | cfq_resort_rr_list(cfqq, 0); |
1854 | cfqq->service_last = now; | ||
1855 | cfq_resort_rr_list(cfqq, 0); | ||
1856 | } | ||
1857 | } | ||
1858 | 1680 | ||
1859 | if (sync) | 1681 | if (sync) |
1860 | crq->io_context->last_end_request = now; | 1682 | RQ_CIC(rq)->last_end_request = now; |
1861 | 1683 | ||
1862 | /* | 1684 | /* |
1863 | * If this is the active queue, check if it needs to be expired, | 1685 | * If this is the active queue, check if it needs to be expired, |
@@ -1873,30 +1695,6 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) | |||
1873 | } | 1695 | } |
1874 | } | 1696 | } |
1875 | 1697 | ||
1876 | static struct request * | ||
1877 | cfq_former_request(request_queue_t *q, struct request *rq) | ||
1878 | { | ||
1879 | struct cfq_rq *crq = RQ_DATA(rq); | ||
1880 | struct rb_node *rbprev = rb_prev(&crq->rb_node); | ||
1881 | |||
1882 | if (rbprev) | ||
1883 | return rb_entry_crq(rbprev)->request; | ||
1884 | |||
1885 | return NULL; | ||
1886 | } | ||
1887 | |||
1888 | static struct request * | ||
1889 | cfq_latter_request(request_queue_t *q, struct request *rq) | ||
1890 | { | ||
1891 | struct cfq_rq *crq = RQ_DATA(rq); | ||
1892 | struct rb_node *rbnext = rb_next(&crq->rb_node); | ||
1893 | |||
1894 | if (rbnext) | ||
1895 | return rb_entry_crq(rbnext)->request; | ||
1896 | |||
1897 | return NULL; | ||
1898 | } | ||
1899 | |||
1900 | /* | 1698 | /* |
1901 | * we temporarily boost lower priority queues if they are holding fs exclusive | 1699 | * we temporarily boost lower priority queues if they are holding fs exclusive |
1902 | * resources. they are boosted to normal prio (CLASS_BE/4) | 1700 | * resources. they are boosted to normal prio (CLASS_BE/4) |
@@ -1933,9 +1731,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) | |||
1933 | cfq_resort_rr_list(cfqq, 0); | 1731 | cfq_resort_rr_list(cfqq, 0); |
1934 | } | 1732 | } |
1935 | 1733 | ||
1936 | static inline int | 1734 | static inline int __cfq_may_queue(struct cfq_queue *cfqq) |
1937 | __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
1938 | struct task_struct *task, int rw) | ||
1939 | { | 1735 | { |
1940 | if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && | 1736 | if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && |
1941 | !cfq_cfqq_must_alloc_slice(cfqq)) { | 1737 | !cfq_cfqq_must_alloc_slice(cfqq)) { |
@@ -1946,7 +1742,7 @@ __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1946 | return ELV_MQUEUE_MAY; | 1742 | return ELV_MQUEUE_MAY; |
1947 | } | 1743 | } |
1948 | 1744 | ||
1949 | static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) | 1745 | static int cfq_may_queue(request_queue_t *q, int rw) |
1950 | { | 1746 | { |
1951 | struct cfq_data *cfqd = q->elevator->elevator_data; | 1747 | struct cfq_data *cfqd = q->elevator->elevator_data; |
1952 | struct task_struct *tsk = current; | 1748 | struct task_struct *tsk = current; |
@@ -1963,48 +1759,30 @@ static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) | |||
1963 | cfq_init_prio_data(cfqq); | 1759 | cfq_init_prio_data(cfqq); |
1964 | cfq_prio_boost(cfqq); | 1760 | cfq_prio_boost(cfqq); |
1965 | 1761 | ||
1966 | return __cfq_may_queue(cfqd, cfqq, tsk, rw); | 1762 | return __cfq_may_queue(cfqq); |
1967 | } | 1763 | } |
1968 | 1764 | ||
1969 | return ELV_MQUEUE_MAY; | 1765 | return ELV_MQUEUE_MAY; |
1970 | } | 1766 | } |
1971 | 1767 | ||
1972 | static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) | ||
1973 | { | ||
1974 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
1975 | |||
1976 | if (unlikely(cfqd->rq_starved)) { | ||
1977 | struct request_list *rl = &q->rq; | ||
1978 | |||
1979 | smp_mb(); | ||
1980 | if (waitqueue_active(&rl->wait[READ])) | ||
1981 | wake_up(&rl->wait[READ]); | ||
1982 | if (waitqueue_active(&rl->wait[WRITE])) | ||
1983 | wake_up(&rl->wait[WRITE]); | ||
1984 | } | ||
1985 | } | ||
1986 | |||
1987 | /* | 1768 | /* |
1988 | * queue lock held here | 1769 | * queue lock held here |
1989 | */ | 1770 | */ |
1990 | static void cfq_put_request(request_queue_t *q, struct request *rq) | 1771 | static void cfq_put_request(request_queue_t *q, struct request *rq) |
1991 | { | 1772 | { |
1992 | struct cfq_data *cfqd = q->elevator->elevator_data; | 1773 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
1993 | struct cfq_rq *crq = RQ_DATA(rq); | ||
1994 | 1774 | ||
1995 | if (crq) { | 1775 | if (cfqq) { |
1996 | struct cfq_queue *cfqq = crq->cfq_queue; | ||
1997 | const int rw = rq_data_dir(rq); | 1776 | const int rw = rq_data_dir(rq); |
1998 | 1777 | ||
1999 | BUG_ON(!cfqq->allocated[rw]); | 1778 | BUG_ON(!cfqq->allocated[rw]); |
2000 | cfqq->allocated[rw]--; | 1779 | cfqq->allocated[rw]--; |
2001 | 1780 | ||
2002 | put_io_context(crq->io_context->ioc); | 1781 | put_io_context(RQ_CIC(rq)->ioc); |
2003 | 1782 | ||
2004 | mempool_free(crq, cfqd->crq_pool); | ||
2005 | rq->elevator_private = NULL; | 1783 | rq->elevator_private = NULL; |
1784 | rq->elevator_private2 = NULL; | ||
2006 | 1785 | ||
2007 | cfq_check_waiters(q, cfqq); | ||
2008 | cfq_put_queue(cfqq); | 1786 | cfq_put_queue(cfqq); |
2009 | } | 1787 | } |
2010 | } | 1788 | } |
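With struct cfq_rq and its mempool removed, the scheduler's per-request state is down to the two pointers stored in cfq_set_request() below; the RQ_CIC()/RQ_CFQQ() accessors used here are presumably just casts of those fields, along the lines of:

    #define RQ_CIC(rq)          ((struct cfq_io_context *) (rq)->elevator_private)
    #define RQ_CFQQ(rq)         ((struct cfq_queue *) (rq)->elevator_private2)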
@@ -2013,8 +1791,7 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) | |||
2013 | * Allocate cfq data structures associated with this request. | 1791 | * Allocate cfq data structures associated with this request. |
2014 | */ | 1792 | */ |
2015 | static int | 1793 | static int |
2016 | cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, | 1794 | cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) |
2017 | gfp_t gfp_mask) | ||
2018 | { | 1795 | { |
2019 | struct cfq_data *cfqd = q->elevator->elevator_data; | 1796 | struct cfq_data *cfqd = q->elevator->elevator_data; |
2020 | struct task_struct *tsk = current; | 1797 | struct task_struct *tsk = current; |
@@ -2022,7 +1799,6 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, | |||
2022 | const int rw = rq_data_dir(rq); | 1799 | const int rw = rq_data_dir(rq); |
2023 | pid_t key = cfq_queue_pid(tsk, rw); | 1800 | pid_t key = cfq_queue_pid(tsk, rw); |
2024 | struct cfq_queue *cfqq; | 1801 | struct cfq_queue *cfqq; |
2025 | struct cfq_rq *crq; | ||
2026 | unsigned long flags; | 1802 | unsigned long flags; |
2027 | int is_sync = key != CFQ_KEY_ASYNC; | 1803 | int is_sync = key != CFQ_KEY_ASYNC; |
2028 | 1804 | ||
@@ -2046,42 +1822,18 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, | |||
2046 | 1822 | ||
2047 | cfqq->allocated[rw]++; | 1823 | cfqq->allocated[rw]++; |
2048 | cfq_clear_cfqq_must_alloc(cfqq); | 1824 | cfq_clear_cfqq_must_alloc(cfqq); |
2049 | cfqd->rq_starved = 0; | ||
2050 | atomic_inc(&cfqq->ref); | 1825 | atomic_inc(&cfqq->ref); |
2051 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2052 | 1826 | ||
2053 | crq = mempool_alloc(cfqd->crq_pool, gfp_mask); | 1827 | spin_unlock_irqrestore(q->queue_lock, flags); |
2054 | if (crq) { | ||
2055 | RB_CLEAR_NODE(&crq->rb_node); | ||
2056 | crq->rb_key = 0; | ||
2057 | crq->request = rq; | ||
2058 | INIT_HLIST_NODE(&crq->hash); | ||
2059 | crq->cfq_queue = cfqq; | ||
2060 | crq->io_context = cic; | ||
2061 | |||
2062 | if (is_sync) | ||
2063 | cfq_mark_crq_is_sync(crq); | ||
2064 | else | ||
2065 | cfq_clear_crq_is_sync(crq); | ||
2066 | 1828 | ||
2067 | rq->elevator_private = crq; | 1829 | rq->elevator_private = cic; |
2068 | return 0; | 1830 | rq->elevator_private2 = cfqq; |
2069 | } | 1831 | return 0; |
2070 | 1832 | ||
2071 | spin_lock_irqsave(q->queue_lock, flags); | ||
2072 | cfqq->allocated[rw]--; | ||
2073 | if (!(cfqq->allocated[0] + cfqq->allocated[1])) | ||
2074 | cfq_mark_cfqq_must_alloc(cfqq); | ||
2075 | cfq_put_queue(cfqq); | ||
2076 | queue_fail: | 1833 | queue_fail: |
2077 | if (cic) | 1834 | if (cic) |
2078 | put_io_context(cic->ioc); | 1835 | put_io_context(cic->ioc); |
2079 | /* | 1836 | |
2080 | * mark us rq allocation starved. we need to kickstart the process | ||
2081 | * ourselves if there are no pending requests that can do it for us. | ||
2082 | * that would be an extremely rare OOM situation | ||
2083 | */ | ||
2084 | cfqd->rq_starved = 1; | ||
2085 | cfq_schedule_dispatch(cfqd); | 1837 | cfq_schedule_dispatch(cfqd); |
2086 | spin_unlock_irqrestore(q->queue_lock, flags); | 1838 | spin_unlock_irqrestore(q->queue_lock, flags); |
2087 | return 1; | 1839 | return 1; |
@@ -2090,27 +1842,10 @@ queue_fail: | |||
2090 | static void cfq_kick_queue(void *data) | 1842 | static void cfq_kick_queue(void *data) |
2091 | { | 1843 | { |
2092 | request_queue_t *q = data; | 1844 | request_queue_t *q = data; |
2093 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2094 | unsigned long flags; | 1845 | unsigned long flags; |
2095 | 1846 | ||
2096 | spin_lock_irqsave(q->queue_lock, flags); | 1847 | spin_lock_irqsave(q->queue_lock, flags); |
2097 | 1848 | blk_start_queueing(q); | |
2098 | if (cfqd->rq_starved) { | ||
2099 | struct request_list *rl = &q->rq; | ||
2100 | |||
2101 | /* | ||
2102 | * we aren't guaranteed to get a request after this, but we | ||
2103 | * have to be opportunistic | ||
2104 | */ | ||
2105 | smp_mb(); | ||
2106 | if (waitqueue_active(&rl->wait[READ])) | ||
2107 | wake_up(&rl->wait[READ]); | ||
2108 | if (waitqueue_active(&rl->wait[WRITE])) | ||
2109 | wake_up(&rl->wait[WRITE]); | ||
2110 | } | ||
2111 | |||
2112 | blk_remove_plug(q); | ||
2113 | q->request_fn(q); | ||
2114 | spin_unlock_irqrestore(q->queue_lock, flags); | 1849 | spin_unlock_irqrestore(q->queue_lock, flags); |
2115 | } | 1850 | } |
2116 | 1851 | ||
@@ -2193,7 +1928,6 @@ static void cfq_exit_queue(elevator_t *e) | |||
2193 | 1928 | ||
2194 | cfq_shutdown_timer_wq(cfqd); | 1929 | cfq_shutdown_timer_wq(cfqd); |
2195 | 1930 | ||
2196 | spin_lock(&cfq_exit_lock); | ||
2197 | spin_lock_irq(q->queue_lock); | 1931 | spin_lock_irq(q->queue_lock); |
2198 | 1932 | ||
2199 | if (cfqd->active_queue) | 1933 | if (cfqd->active_queue) |
@@ -2203,25 +1937,14 @@ static void cfq_exit_queue(elevator_t *e) | |||
2203 | struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, | 1937 | struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, |
2204 | struct cfq_io_context, | 1938 | struct cfq_io_context, |
2205 | queue_list); | 1939 | queue_list); |
2206 | if (cic->cfqq[ASYNC]) { | 1940 | |
2207 | cfq_put_queue(cic->cfqq[ASYNC]); | 1941 | __cfq_exit_single_io_context(cfqd, cic); |
2208 | cic->cfqq[ASYNC] = NULL; | ||
2209 | } | ||
2210 | if (cic->cfqq[SYNC]) { | ||
2211 | cfq_put_queue(cic->cfqq[SYNC]); | ||
2212 | cic->cfqq[SYNC] = NULL; | ||
2213 | } | ||
2214 | cic->key = NULL; | ||
2215 | list_del_init(&cic->queue_list); | ||
2216 | } | 1942 | } |
2217 | 1943 | ||
2218 | spin_unlock_irq(q->queue_lock); | 1944 | spin_unlock_irq(q->queue_lock); |
2219 | spin_unlock(&cfq_exit_lock); | ||
2220 | 1945 | ||
2221 | cfq_shutdown_timer_wq(cfqd); | 1946 | cfq_shutdown_timer_wq(cfqd); |
2222 | 1947 | ||
2223 | mempool_destroy(cfqd->crq_pool); | ||
2224 | kfree(cfqd->crq_hash); | ||
2225 | kfree(cfqd->cfq_hash); | 1948 | kfree(cfqd->cfq_hash); |
2226 | kfree(cfqd); | 1949 | kfree(cfqd); |
2227 | } | 1950 | } |
@@ -2231,7 +1954,7 @@ static void *cfq_init_queue(request_queue_t *q, elevator_t *e) | |||
2231 | struct cfq_data *cfqd; | 1954 | struct cfq_data *cfqd; |
2232 | int i; | 1955 | int i; |
2233 | 1956 | ||
2234 | cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); | 1957 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); |
2235 | if (!cfqd) | 1958 | if (!cfqd) |
2236 | return NULL; | 1959 | return NULL; |
2237 | 1960 | ||
@@ -2243,23 +1966,12 @@ static void *cfq_init_queue(request_queue_t *q, elevator_t *e) | |||
2243 | INIT_LIST_HEAD(&cfqd->busy_rr); | 1966 | INIT_LIST_HEAD(&cfqd->busy_rr); |
2244 | INIT_LIST_HEAD(&cfqd->cur_rr); | 1967 | INIT_LIST_HEAD(&cfqd->cur_rr); |
2245 | INIT_LIST_HEAD(&cfqd->idle_rr); | 1968 | INIT_LIST_HEAD(&cfqd->idle_rr); |
2246 | INIT_LIST_HEAD(&cfqd->empty_list); | ||
2247 | INIT_LIST_HEAD(&cfqd->cic_list); | 1969 | INIT_LIST_HEAD(&cfqd->cic_list); |
2248 | 1970 | ||
2249 | cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); | 1971 | cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); |
2250 | if (!cfqd->crq_hash) | ||
2251 | goto out_crqhash; | ||
2252 | |||
2253 | cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); | ||
2254 | if (!cfqd->cfq_hash) | 1972 | if (!cfqd->cfq_hash) |
2255 | goto out_cfqhash; | 1973 | goto out_free; |
2256 | |||
2257 | cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool); | ||
2258 | if (!cfqd->crq_pool) | ||
2259 | goto out_crqpool; | ||
2260 | 1974 | ||
2261 | for (i = 0; i < CFQ_MHASH_ENTRIES; i++) | ||
2262 | INIT_HLIST_HEAD(&cfqd->crq_hash[i]); | ||
2263 | for (i = 0; i < CFQ_QHASH_ENTRIES; i++) | 1975 | for (i = 0; i < CFQ_QHASH_ENTRIES; i++) |
2264 | INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); | 1976 | INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); |
2265 | 1977 | ||
@@ -2275,7 +1987,6 @@ static void *cfq_init_queue(request_queue_t *q, elevator_t *e) | |||
2275 | 1987 | ||
2276 | INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); | 1988 | INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); |
2277 | 1989 | ||
2278 | cfqd->cfq_queued = cfq_queued; | ||
2279 | cfqd->cfq_quantum = cfq_quantum; | 1990 | cfqd->cfq_quantum = cfq_quantum; |
2280 | cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; | 1991 | cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; |
2281 | cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; | 1992 | cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; |
@@ -2287,19 +1998,13 @@ static void *cfq_init_queue(request_queue_t *q, elevator_t *e) | |||
2287 | cfqd->cfq_slice_idle = cfq_slice_idle; | 1998 | cfqd->cfq_slice_idle = cfq_slice_idle; |
2288 | 1999 | ||
2289 | return cfqd; | 2000 | return cfqd; |
2290 | out_crqpool: | 2001 | out_free: |
2291 | kfree(cfqd->cfq_hash); | ||
2292 | out_cfqhash: | ||
2293 | kfree(cfqd->crq_hash); | ||
2294 | out_crqhash: | ||
2295 | kfree(cfqd); | 2002 | kfree(cfqd); |
2296 | return NULL; | 2003 | return NULL; |
2297 | } | 2004 | } |
2298 | 2005 | ||
2299 | static void cfq_slab_kill(void) | 2006 | static void cfq_slab_kill(void) |
2300 | { | 2007 | { |
2301 | if (crq_pool) | ||
2302 | kmem_cache_destroy(crq_pool); | ||
2303 | if (cfq_pool) | 2008 | if (cfq_pool) |
2304 | kmem_cache_destroy(cfq_pool); | 2009 | kmem_cache_destroy(cfq_pool); |
2305 | if (cfq_ioc_pool) | 2010 | if (cfq_ioc_pool) |
@@ -2308,11 +2013,6 @@ static void cfq_slab_kill(void) | |||
2308 | 2013 | ||
2309 | static int __init cfq_slab_setup(void) | 2014 | static int __init cfq_slab_setup(void) |
2310 | { | 2015 | { |
2311 | crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, | ||
2312 | NULL, NULL); | ||
2313 | if (!crq_pool) | ||
2314 | goto fail; | ||
2315 | |||
2316 | cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, | 2016 | cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, |
2317 | NULL, NULL); | 2017 | NULL, NULL); |
2318 | if (!cfq_pool) | 2018 | if (!cfq_pool) |
@@ -2358,7 +2058,6 @@ static ssize_t __FUNC(elevator_t *e, char *page) \ | |||
2358 | return cfq_var_show(__data, (page)); \ | 2058 | return cfq_var_show(__data, (page)); \ |
2359 | } | 2059 | } |
2360 | SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); | 2060 | SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); |
2361 | SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); | ||
2362 | SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); | 2061 | SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); |
2363 | SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); | 2062 | SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); |
2364 | SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); | 2063 | SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); |
@@ -2386,7 +2085,6 @@ static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ | |||
2386 | return ret; \ | 2085 | return ret; \ |
2387 | } | 2086 | } |
2388 | STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); | 2087 | STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); |
2389 | STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); | ||
2390 | STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); | 2088 | STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); |
2391 | STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); | 2089 | STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); |
2392 | STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); | 2090 | STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); |
@@ -2402,7 +2100,6 @@ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, | |||
2402 | 2100 | ||
2403 | static struct elv_fs_entry cfq_attrs[] = { | 2101 | static struct elv_fs_entry cfq_attrs[] = { |
2404 | CFQ_ATTR(quantum), | 2102 | CFQ_ATTR(quantum), |
2405 | CFQ_ATTR(queued), | ||
2406 | CFQ_ATTR(fifo_expire_sync), | 2103 | CFQ_ATTR(fifo_expire_sync), |
2407 | CFQ_ATTR(fifo_expire_async), | 2104 | CFQ_ATTR(fifo_expire_async), |
2408 | CFQ_ATTR(back_seek_max), | 2105 | CFQ_ATTR(back_seek_max), |
@@ -2425,14 +2122,14 @@ static struct elevator_type iosched_cfq = { | |||
2425 | .elevator_deactivate_req_fn = cfq_deactivate_request, | 2122 | .elevator_deactivate_req_fn = cfq_deactivate_request, |
2426 | .elevator_queue_empty_fn = cfq_queue_empty, | 2123 | .elevator_queue_empty_fn = cfq_queue_empty, |
2427 | .elevator_completed_req_fn = cfq_completed_request, | 2124 | .elevator_completed_req_fn = cfq_completed_request, |
2428 | .elevator_former_req_fn = cfq_former_request, | 2125 | .elevator_former_req_fn = elv_rb_former_request, |
2429 | .elevator_latter_req_fn = cfq_latter_request, | 2126 | .elevator_latter_req_fn = elv_rb_latter_request, |
2430 | .elevator_set_req_fn = cfq_set_request, | 2127 | .elevator_set_req_fn = cfq_set_request, |
2431 | .elevator_put_req_fn = cfq_put_request, | 2128 | .elevator_put_req_fn = cfq_put_request, |
2432 | .elevator_may_queue_fn = cfq_may_queue, | 2129 | .elevator_may_queue_fn = cfq_may_queue, |
2433 | .elevator_init_fn = cfq_init_queue, | 2130 | .elevator_init_fn = cfq_init_queue, |
2434 | .elevator_exit_fn = cfq_exit_queue, | 2131 | .elevator_exit_fn = cfq_exit_queue, |
2435 | .trim = cfq_trim, | 2132 | .trim = cfq_free_io_context, |
2436 | }, | 2133 | }, |
2437 | .elevator_attrs = cfq_attrs, | 2134 | .elevator_attrs = cfq_attrs, |
2438 | .elevator_name = "cfq", | 2135 | .elevator_name = "cfq", |
@@ -2463,12 +2160,12 @@ static int __init cfq_init(void) | |||
2463 | 2160 | ||
2464 | static void __exit cfq_exit(void) | 2161 | static void __exit cfq_exit(void) |
2465 | { | 2162 | { |
2466 | DECLARE_COMPLETION(all_gone); | 2163 | DECLARE_COMPLETION_ONSTACK(all_gone); |
2467 | elv_unregister(&iosched_cfq); | 2164 | elv_unregister(&iosched_cfq); |
2468 | ioc_gone = &all_gone; | 2165 | ioc_gone = &all_gone; |
2469 | /* ioc_gone's update must be visible before reading ioc_count */ | 2166 | /* ioc_gone's update must be visible before reading ioc_count */ |
2470 | smp_wmb(); | 2167 | smp_wmb(); |
2471 | if (atomic_read(&ioc_count)) | 2168 | if (elv_ioc_count_read(ioc_count)) |
2472 | wait_for_completion(ioc_gone); | 2169 | wait_for_completion(ioc_gone); |
2473 | synchronize_rcu(); | 2170 | synchronize_rcu(); |
2474 | cfq_slab_kill(); | 2171 | cfq_slab_kill(); |
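elv_ioc_count_read() here pairs with elv_ioc_count_dec() in cfq_drop_dead_cic() above: ioc_count becomes a per-cpu counter, trading the cache-bouncing atomic for cheap local updates plus a summing read on this cold exit path. A plausible shape for the wrappers, per the companion elevator.h change (a sketch, not verbatim):

    #define elv_ioc_count_mod(name, __val)              \
            do {                                        \
                    preempt_disable();                  \
                    __get_cpu_var(name) += (__val);     \
                    preempt_enable();                   \
            } while (0)

    #define elv_ioc_count_inc(name) elv_ioc_count_mod(name, 1)
    #define elv_ioc_count_dec(name) elv_ioc_count_mod(name, -1)

    #define elv_ioc_count_read(name)                    \
    ({                                                  \
            unsigned long __val = 0;                    \
            int __cpu;                                  \
            smp_wmb();                                  \
            for_each_possible_cpu(__cpu)                \
                    __val += per_cpu(name, __cpu);      \
            __val;                                      \
    })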
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index c7ca9f0b6498..b7c5b34cb7b4 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Deadline i/o scheduler. | 2 | * Deadline i/o scheduler. |
3 | * | 3 | * |
4 | * Copyright (C) 2002 Jens Axboe <axboe@suse.de> | 4 | * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> |
5 | */ | 5 | */ |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/fs.h> | 7 | #include <linux/fs.h> |
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> |
15 | #include <linux/hash.h> | ||
16 | #include <linux/rbtree.h> | 15 | #include <linux/rbtree.h> |
17 | 16 | ||
18 | /* | 17 | /* |
@@ -24,13 +23,6 @@ static const int writes_starved = 2; /* max times reads can starve a write */ | |||
24 | static const int fifo_batch = 16; /* # of sequential requests treated as one | 23 | static const int fifo_batch = 16; /* # of sequential requests treated as one |
25 | by the above parameters. For throughput. */ | 24 | by the above parameters. For throughput. */ |
26 | 25 | ||
27 | static const int deadline_hash_shift = 5; | ||
28 | #define DL_HASH_BLOCK(sec) ((sec) >> 3) | ||
29 | #define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift)) | ||
30 | #define DL_HASH_ENTRIES (1 << deadline_hash_shift) | ||
31 | #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) | ||
32 | #define ON_HASH(drq) (!hlist_unhashed(&(drq)->hash)) | ||
33 | |||
34 | struct deadline_data { | 26 | struct deadline_data { |
35 | /* | 27 | /* |
36 | * run time data | 28 | * run time data |
@@ -45,8 +37,7 @@ struct deadline_data { | |||
45 | /* | 37 | /* |
46 | * next in sort order. read, write or both are NULL | 38 | * next in sort order. read, write or both are NULL |
47 | */ | 39 | */ |
48 | struct deadline_rq *next_drq[2]; | 40 | struct request *next_rq[2]; |
49 | struct hlist_head *hash; /* request hash */ | ||
50 | unsigned int batching; /* number of sequential requests made */ | 41 | unsigned int batching; /* number of sequential requests made */ |
51 | sector_t last_sector; /* head position */ | 42 | sector_t last_sector; /* head position */ |
52 | unsigned int starved; /* times reads have starved writes */ | 43 | unsigned int starved; /* times reads have starved writes */ |
@@ -58,240 +49,69 @@ struct deadline_data { | |||
58 | int fifo_batch; | 49 | int fifo_batch; |
59 | int writes_starved; | 50 | int writes_starved; |
60 | int front_merges; | 51 | int front_merges; |
61 | |||
62 | mempool_t *drq_pool; | ||
63 | }; | 52 | }; |
64 | 53 | ||
65 | /* | 54 | static void deadline_move_request(struct deadline_data *, struct request *); |
66 | * pre-request data. | ||
67 | */ | ||
68 | struct deadline_rq { | ||
69 | /* | ||
70 | * rbtree index, key is the starting offset | ||
71 | */ | ||
72 | struct rb_node rb_node; | ||
73 | sector_t rb_key; | ||
74 | |||
75 | struct request *request; | ||
76 | |||
77 | /* | ||
78 | * request hash, key is the ending offset (for back merge lookup) | ||
79 | */ | ||
80 | struct hlist_node hash; | ||
81 | |||
82 | /* | ||
83 | * expire fifo | ||
84 | */ | ||
85 | struct list_head fifo; | ||
86 | unsigned long expires; | ||
87 | }; | ||
88 | |||
89 | static void deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq); | ||
90 | |||
91 | static kmem_cache_t *drq_pool; | ||
92 | |||
93 | #define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private) | ||
94 | 55 | ||
95 | /* | 56 | #define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))]) |
96 | * the back merge hash support functions | ||
97 | */ | ||
98 | static inline void __deadline_del_drq_hash(struct deadline_rq *drq) | ||
99 | { | ||
100 | hlist_del_init(&drq->hash); | ||
101 | } | ||
102 | |||
103 | static inline void deadline_del_drq_hash(struct deadline_rq *drq) | ||
104 | { | ||
105 | if (ON_HASH(drq)) | ||
106 | __deadline_del_drq_hash(drq); | ||
107 | } | ||
108 | |||
109 | static inline void | ||
110 | deadline_add_drq_hash(struct deadline_data *dd, struct deadline_rq *drq) | ||
111 | { | ||
112 | struct request *rq = drq->request; | ||
113 | |||
114 | BUG_ON(ON_HASH(drq)); | ||
115 | |||
116 | hlist_add_head(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * move hot entry to front of chain | ||
121 | */ | ||
122 | static inline void | ||
123 | deadline_hot_drq_hash(struct deadline_data *dd, struct deadline_rq *drq) | ||
124 | { | ||
125 | struct request *rq = drq->request; | ||
126 | struct hlist_head *head = &dd->hash[DL_HASH_FN(rq_hash_key(rq))]; | ||
127 | |||
128 | if (ON_HASH(drq) && &drq->hash != head->first) { | ||
129 | hlist_del(&drq->hash); | ||
130 | hlist_add_head(&drq->hash, head); | ||
131 | } | ||
132 | } | ||
133 | |||
134 | static struct request * | ||
135 | deadline_find_drq_hash(struct deadline_data *dd, sector_t offset) | ||
136 | { | ||
137 | struct hlist_head *hash_list = &dd->hash[DL_HASH_FN(offset)]; | ||
138 | struct hlist_node *entry, *next; | ||
139 | struct deadline_rq *drq; | ||
140 | |||
141 | hlist_for_each_entry_safe(drq, entry, next, hash_list, hash) { | ||
142 | struct request *__rq = drq->request; | ||
143 | |||
144 | BUG_ON(!ON_HASH(drq)); | ||
145 | |||
146 | if (!rq_mergeable(__rq)) { | ||
147 | __deadline_del_drq_hash(drq); | ||
148 | continue; | ||
149 | } | ||
150 | |||
151 | if (rq_hash_key(__rq) == offset) | ||
152 | return __rq; | ||
153 | } | ||
154 | |||
155 | return NULL; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * rb tree support functions | ||
160 | */ | ||
161 | #define rb_entry_drq(node) rb_entry((node), struct deadline_rq, rb_node) | ||
162 | #define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)]) | ||
163 | #define rq_rb_key(rq) (rq)->sector | ||
164 | |||
165 | static struct deadline_rq * | ||
166 | __deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) | ||
167 | { | ||
168 | struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node; | ||
169 | struct rb_node *parent = NULL; | ||
170 | struct deadline_rq *__drq; | ||
171 | |||
172 | while (*p) { | ||
173 | parent = *p; | ||
174 | __drq = rb_entry_drq(parent); | ||
175 | |||
176 | if (drq->rb_key < __drq->rb_key) | ||
177 | p = &(*p)->rb_left; | ||
178 | else if (drq->rb_key > __drq->rb_key) | ||
179 | p = &(*p)->rb_right; | ||
180 | else | ||
181 | return __drq; | ||
182 | } | ||
183 | |||
184 | rb_link_node(&drq->rb_node, parent, p); | ||
185 | return NULL; | ||
186 | } | ||
187 | 57 | ||
188 | static void | 58 | static void |
189 | deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) | 59 | deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) |
190 | { | 60 | { |
191 | struct deadline_rq *__alias; | 61 | struct rb_root *root = RQ_RB_ROOT(dd, rq); |
192 | 62 | struct request *__alias; | |
193 | drq->rb_key = rq_rb_key(drq->request); | ||
194 | 63 | ||
195 | retry: | 64 | retry: |
196 | __alias = __deadline_add_drq_rb(dd, drq); | 65 | __alias = elv_rb_add(root, rq); |
197 | if (!__alias) { | 66 | if (unlikely(__alias)) { |
198 | rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); | 67 | deadline_move_request(dd, __alias); |
199 | return; | 68 | goto retry; |
200 | } | 69 | } |
201 | |||
202 | deadline_move_request(dd, __alias); | ||
203 | goto retry; | ||
204 | } | 70 | } |
205 | 71 | ||
206 | static inline void | 72 | static inline void |
207 | deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) | 73 | deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) |
208 | { | 74 | { |
209 | const int data_dir = rq_data_dir(drq->request); | 75 | const int data_dir = rq_data_dir(rq); |
210 | 76 | ||
211 | if (dd->next_drq[data_dir] == drq) { | 77 | if (dd->next_rq[data_dir] == rq) { |
212 | struct rb_node *rbnext = rb_next(&drq->rb_node); | 78 | struct rb_node *rbnext = rb_next(&rq->rb_node); |
213 | 79 | ||
214 | dd->next_drq[data_dir] = NULL; | 80 | dd->next_rq[data_dir] = NULL; |
215 | if (rbnext) | 81 | if (rbnext) |
216 | dd->next_drq[data_dir] = rb_entry_drq(rbnext); | 82 | dd->next_rq[data_dir] = rb_entry_rq(rbnext); |
217 | } | ||
218 | |||
219 | BUG_ON(!RB_EMPTY_NODE(&drq->rb_node)); | ||
220 | rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); | ||
221 | RB_CLEAR_NODE(&drq->rb_node); | ||
222 | } | ||
223 | |||
224 | static struct request * | ||
225 | deadline_find_drq_rb(struct deadline_data *dd, sector_t sector, int data_dir) | ||
226 | { | ||
227 | struct rb_node *n = dd->sort_list[data_dir].rb_node; | ||
228 | struct deadline_rq *drq; | ||
229 | |||
230 | while (n) { | ||
231 | drq = rb_entry_drq(n); | ||
232 | |||
233 | if (sector < drq->rb_key) | ||
234 | n = n->rb_left; | ||
235 | else if (sector > drq->rb_key) | ||
236 | n = n->rb_right; | ||
237 | else | ||
238 | return drq->request; | ||
239 | } | 83 | } |
240 | 84 | ||
241 | return NULL; | 85 | elv_rb_del(RQ_RB_ROOT(dd, rq), rq); |
242 | } | 86 | } |
243 | 87 | ||
244 | /* | 88 | /* |
245 | * deadline_find_first_drq finds the first (lowest sector numbered) request | 89 | * add rq to rbtree and fifo |
246 | * for the specified data_dir. Used to sweep back to the start of the disk | ||
247 | * (1-way elevator) after we process the last (highest sector) request. | ||
248 | */ | ||
249 | static struct deadline_rq * | ||
250 | deadline_find_first_drq(struct deadline_data *dd, int data_dir) | ||
251 | { | ||
252 | struct rb_node *n = dd->sort_list[data_dir].rb_node; | ||
253 | |||
254 | for (;;) { | ||
255 | if (n->rb_left == NULL) | ||
256 | return rb_entry_drq(n); | ||
257 | |||
258 | n = n->rb_left; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | /* | ||
263 | * add drq to rbtree and fifo | ||
264 | */ | 90 | */ |
265 | static void | 91 | static void |
266 | deadline_add_request(struct request_queue *q, struct request *rq) | 92 | deadline_add_request(struct request_queue *q, struct request *rq) |
267 | { | 93 | { |
268 | struct deadline_data *dd = q->elevator->elevator_data; | 94 | struct deadline_data *dd = q->elevator->elevator_data; |
269 | struct deadline_rq *drq = RQ_DATA(rq); | 95 | const int data_dir = rq_data_dir(rq); |
270 | 96 | ||
271 | const int data_dir = rq_data_dir(drq->request); | 97 | deadline_add_rq_rb(dd, rq); |
272 | 98 | ||
273 | deadline_add_drq_rb(dd, drq); | ||
274 | /* | 99 | /* |
275 | * set expire time (only used for reads) and add to fifo list | 100 | * set expire time (only used for reads) and add to fifo list |
276 | */ | 101 | */ |
277 | drq->expires = jiffies + dd->fifo_expire[data_dir]; | 102 | rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); |
278 | list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]); | 103 | list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); |
279 | |||
280 | if (rq_mergeable(rq)) | ||
281 | deadline_add_drq_hash(dd, drq); | ||
282 | } | 104 | } |
283 | 105 | ||
284 | /* | 106 | /* |
285 | * remove rq from rbtree, fifo, and hash | 107 | * remove rq from rbtree and fifo. |
286 | */ | 108 | */ |
287 | static void deadline_remove_request(request_queue_t *q, struct request *rq) | 109 | static void deadline_remove_request(request_queue_t *q, struct request *rq) |
288 | { | 110 | { |
289 | struct deadline_rq *drq = RQ_DATA(rq); | ||
290 | struct deadline_data *dd = q->elevator->elevator_data; | 111 | struct deadline_data *dd = q->elevator->elevator_data; |
291 | 112 | ||
292 | list_del_init(&drq->fifo); | 113 | rq_fifo_clear(rq); |
293 | deadline_del_drq_rb(dd, drq); | 114 | deadline_del_rq_rb(dd, rq); |
294 | deadline_del_drq_hash(drq); | ||
295 | } | 115 | } |
296 | 116 | ||
297 | static int | 117 | static int |
@@ -302,27 +122,14 @@ deadline_merge(request_queue_t *q, struct request **req, struct bio *bio) | |||
302 | int ret; | 122 | int ret; |
303 | 123 | ||
304 | /* | 124 | /* |
305 | * see if the merge hash can satisfy a back merge | ||
306 | */ | ||
307 | __rq = deadline_find_drq_hash(dd, bio->bi_sector); | ||
308 | if (__rq) { | ||
309 | BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); | ||
310 | |||
311 | if (elv_rq_merge_ok(__rq, bio)) { | ||
312 | ret = ELEVATOR_BACK_MERGE; | ||
313 | goto out; | ||
314 | } | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * check for front merge | 125 | * check for front merge |
319 | */ | 126 | */ |
320 | if (dd->front_merges) { | 127 | if (dd->front_merges) { |
321 | sector_t rb_key = bio->bi_sector + bio_sectors(bio); | 128 | sector_t sector = bio->bi_sector + bio_sectors(bio); |
322 | 129 | ||
323 | __rq = deadline_find_drq_rb(dd, rb_key, bio_data_dir(bio)); | 130 | __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); |
324 | if (__rq) { | 131 | if (__rq) { |
325 | BUG_ON(rb_key != rq_rb_key(__rq)); | 132 | BUG_ON(sector != __rq->sector); |
326 | 133 | ||
327 | if (elv_rq_merge_ok(__rq, bio)) { | 134 | if (elv_rq_merge_ok(__rq, bio)) { |
328 | ret = ELEVATOR_FRONT_MERGE; | 135 | ret = ELEVATOR_FRONT_MERGE; |
@@ -333,29 +140,21 @@ deadline_merge(request_queue_t *q, struct request **req, struct bio *bio) | |||
333 | 140 | ||
334 | return ELEVATOR_NO_MERGE; | 141 | return ELEVATOR_NO_MERGE; |
335 | out: | 142 | out: |
336 | if (ret) | ||
337 | deadline_hot_drq_hash(dd, RQ_DATA(__rq)); | ||
338 | *req = __rq; | 143 | *req = __rq; |
339 | return ret; | 144 | return ret; |
340 | } | 145 | } |
341 | 146 | ||
342 | static void deadline_merged_request(request_queue_t *q, struct request *req) | 147 | static void deadline_merged_request(request_queue_t *q, struct request *req, |
148 | int type) | ||
343 | { | 149 | { |
344 | struct deadline_data *dd = q->elevator->elevator_data; | 150 | struct deadline_data *dd = q->elevator->elevator_data; |
345 | struct deadline_rq *drq = RQ_DATA(req); | ||
346 | |||
347 | /* | ||
348 | * hash always needs to be repositioned, key is end sector | ||
349 | */ | ||
350 | deadline_del_drq_hash(drq); | ||
351 | deadline_add_drq_hash(dd, drq); | ||
352 | 151 | ||
353 | /* | 152 | /* |
354 | * if the merge was a front merge, we need to reposition request | 153 | * if the merge was a front merge, we need to reposition request |
355 | */ | 154 | */ |
356 | if (rq_rb_key(req) != drq->rb_key) { | 155 | if (type == ELEVATOR_FRONT_MERGE) { |
357 | deadline_del_drq_rb(dd, drq); | 156 | elv_rb_del(RQ_RB_ROOT(dd, req), req); |
358 | deadline_add_drq_rb(dd, drq); | 157 | deadline_add_rq_rb(dd, req); |
359 | } | 158 | } |
360 | } | 159 | } |
361 | 160 | ||
@@ -363,33 +162,14 @@ static void | |||
363 | deadline_merged_requests(request_queue_t *q, struct request *req, | 162 | deadline_merged_requests(request_queue_t *q, struct request *req, |
364 | struct request *next) | 163 | struct request *next) |
365 | { | 164 | { |
366 | struct deadline_data *dd = q->elevator->elevator_data; | ||
367 | struct deadline_rq *drq = RQ_DATA(req); | ||
368 | struct deadline_rq *dnext = RQ_DATA(next); | ||
369 | |||
370 | BUG_ON(!drq); | ||
371 | BUG_ON(!dnext); | ||
372 | |||
373 | /* | 165 | /* |
374 | * reposition drq (this is the merged request) in hash, and in rbtree | 166 | * if next expires before rq, assign its expire time to rq |
375 | * in case of a front merge | 167 | * and move into next position (next will be deleted) in fifo |
376 | */ | 168 | */ |
377 | deadline_del_drq_hash(drq); | 169 | if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { |
378 | deadline_add_drq_hash(dd, drq); | 170 | if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { |
379 | 171 | list_move(&req->queuelist, &next->queuelist); | |
380 | if (rq_rb_key(req) != drq->rb_key) { | 172 | rq_set_fifo_time(req, rq_fifo_time(next)); |
381 | deadline_del_drq_rb(dd, drq); | ||
382 | deadline_add_drq_rb(dd, drq); | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * if dnext expires before drq, assign its expire time to drq | ||
387 | * and move into dnext position (dnext will be deleted) in fifo | ||
388 | */ | ||
389 | if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) { | ||
390 | if (time_before(dnext->expires, drq->expires)) { | ||
391 | list_move(&drq->fifo, &dnext->fifo); | ||
392 | drq->expires = dnext->expires; | ||
393 | } | 173 | } |
394 | } | 174 | } |
395 | 175 | ||
@@ -403,52 +183,50 @@ deadline_merged_requests(request_queue_t *q, struct request *req, | |||
403 | * move request from sort list to dispatch queue. | 183 | * move request from sort list to dispatch queue. |
404 | */ | 184 | */ |
405 | static inline void | 185 | static inline void |
406 | deadline_move_to_dispatch(struct deadline_data *dd, struct deadline_rq *drq) | 186 | deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) |
407 | { | 187 | { |
408 | request_queue_t *q = drq->request->q; | 188 | request_queue_t *q = rq->q; |
409 | 189 | ||
410 | deadline_remove_request(q, drq->request); | 190 | deadline_remove_request(q, rq); |
411 | elv_dispatch_add_tail(q, drq->request); | 191 | elv_dispatch_add_tail(q, rq); |
412 | } | 192 | } |
413 | 193 | ||
414 | /* | 194 | /* |
415 | * move an entry to dispatch queue | 195 | * move an entry to dispatch queue |
416 | */ | 196 | */ |
417 | static void | 197 | static void |
418 | deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq) | 198 | deadline_move_request(struct deadline_data *dd, struct request *rq) |
419 | { | 199 | { |
420 | const int data_dir = rq_data_dir(drq->request); | 200 | const int data_dir = rq_data_dir(rq); |
421 | struct rb_node *rbnext = rb_next(&drq->rb_node); | 201 | struct rb_node *rbnext = rb_next(&rq->rb_node); |
422 | 202 | ||
423 | dd->next_drq[READ] = NULL; | 203 | dd->next_rq[READ] = NULL; |
424 | dd->next_drq[WRITE] = NULL; | 204 | dd->next_rq[WRITE] = NULL; |
425 | 205 | ||
426 | if (rbnext) | 206 | if (rbnext) |
427 | dd->next_drq[data_dir] = rb_entry_drq(rbnext); | 207 | dd->next_rq[data_dir] = rb_entry_rq(rbnext); |
428 | 208 | ||
429 | dd->last_sector = drq->request->sector + drq->request->nr_sectors; | 209 | dd->last_sector = rq->sector + rq->nr_sectors; |
430 | 210 | ||
431 | /* | 211 | /* |
432 | * take it off the sort and fifo list, move | 212 | * take it off the sort and fifo list, move |
433 | * to dispatch queue | 213 | * to dispatch queue |
434 | */ | 214 | */ |
435 | deadline_move_to_dispatch(dd, drq); | 215 | deadline_move_to_dispatch(dd, rq); |
436 | } | 216 | } |
437 | 217 | ||
438 | #define list_entry_fifo(ptr) list_entry((ptr), struct deadline_rq, fifo) | ||
439 | |||
440 | /* | 218 | /* |
441 | * deadline_check_fifo returns 0 if there are no expired requests on the fifo, | 219 | * deadline_check_fifo returns 0 if there are no expired requests on the fifo, |
442 | * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) | 220 | * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) |
443 | */ | 221 | */ |
444 | static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | 222 | static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) |
445 | { | 223 | { |
446 | struct deadline_rq *drq = list_entry_fifo(dd->fifo_list[ddir].next); | 224 | struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); |
447 | 225 | ||
448 | /* | 226 | /* |
449 | * drq is expired! | 227 | * rq is expired! |
450 | */ | 228 | */ |
451 | if (time_after(jiffies, drq->expires)) | 229 | if (time_after(jiffies, rq_fifo_time(rq))) |
452 | return 1; | 230 | return 1; |
453 | 231 | ||
454 | return 0; | 232 | return 0; |
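struct deadline_rq's private fifo list and expires field are likewise folded into struct request: the request is chained through its own queuelist, and the expiry is stashed in a field that is idle while a request sits in the scheduler. The rq_fifo_* helpers used here are expected to be thin macros, on the order of:

    #define rq_entry_fifo(ptr)      list_entry((ptr), struct request, queuelist)
    #define rq_fifo_time(rq)        ((unsigned long) (rq)->donelist.next)
    #define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp))
    #define rq_fifo_clear(rq)       do {                    \
            list_del_init(&(rq)->queuelist);                \
            INIT_LIST_HEAD(&(rq)->donelist);                \
            } while (0)

(donelist is only needed at completion time, so it is safe to overload while the request is queued in the elevator.)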
@@ -463,21 +241,21 @@ static int deadline_dispatch_requests(request_queue_t *q, int force) | |||
463 | struct deadline_data *dd = q->elevator->elevator_data; | 241 | struct deadline_data *dd = q->elevator->elevator_data; |
464 | const int reads = !list_empty(&dd->fifo_list[READ]); | 242 | const int reads = !list_empty(&dd->fifo_list[READ]); |
465 | const int writes = !list_empty(&dd->fifo_list[WRITE]); | 243 | const int writes = !list_empty(&dd->fifo_list[WRITE]); |
466 | struct deadline_rq *drq; | 244 | struct request *rq; |
467 | int data_dir; | 245 | int data_dir; |
468 | 246 | ||
469 | /* | 247 | /* |
470 | * batches are currently reads XOR writes | 248 | * batches are currently reads XOR writes |
471 | */ | 249 | */ |
472 | if (dd->next_drq[WRITE]) | 250 | if (dd->next_rq[WRITE]) |
473 | drq = dd->next_drq[WRITE]; | 251 | rq = dd->next_rq[WRITE]; |
474 | else | 252 | else |
475 | drq = dd->next_drq[READ]; | 253 | rq = dd->next_rq[READ]; |
476 | 254 | ||
477 | if (drq) { | 255 | if (rq) { |
478 | /* we have a "next request" */ | 256 | /* we have a "next request" */ |
479 | 257 | ||
480 | if (dd->last_sector != drq->request->sector) | 258 | if (dd->last_sector != rq->sector) |
481 | /* end the batch on a non sequential request */ | 259 | /* end the batch on a non sequential request */ |
482 | dd->batching += dd->fifo_batch; | 260 | dd->batching += dd->fifo_batch; |
483 | 261 | ||
@@ -526,30 +304,33 @@ dispatch_find_request: | |||
526 | if (deadline_check_fifo(dd, data_dir)) { | 304 | if (deadline_check_fifo(dd, data_dir)) { |
527 | /* An expired request exists - satisfy it */ | 305 | /* An expired request exists - satisfy it */ |
528 | dd->batching = 0; | 306 | dd->batching = 0; |
529 | drq = list_entry_fifo(dd->fifo_list[data_dir].next); | 307 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); |
530 | 308 | ||
531 | } else if (dd->next_drq[data_dir]) { | 309 | } else if (dd->next_rq[data_dir]) { |
532 | /* | 310 | /* |
533 | * The last req was the same dir and we have a next request in | 311 | * The last req was the same dir and we have a next request in |
534 | * sort order. No expired requests so continue on from here. | 312 | * sort order. No expired requests so continue on from here. |
535 | */ | 313 | */ |
536 | drq = dd->next_drq[data_dir]; | 314 | rq = dd->next_rq[data_dir]; |
537 | } else { | 315 | } else { |
316 | struct rb_node *node; | ||
538 | /* | 317 | /* |
539 | * The last req was the other direction or we have run out of | 318 | * The last req was the other direction or we have run out of |
540 | * higher-sectored requests. Go back to the lowest sectored | 319 | * higher-sectored requests. Go back to the lowest sectored |
541 | * request (1 way elevator) and start a new batch. | 320 | * request (1 way elevator) and start a new batch. |
542 | */ | 321 | */ |
543 | dd->batching = 0; | 322 | dd->batching = 0; |
544 | drq = deadline_find_first_drq(dd, data_dir); | 323 | node = rb_first(&dd->sort_list[data_dir]); |
324 | if (node) | ||
325 | rq = rb_entry_rq(node); | ||
545 | } | 326 | } |
546 | 327 | ||
547 | dispatch_request: | 328 | dispatch_request: |
548 | /* | 329 | /* |
549 | * drq is the selected appropriate request. | 330 | * rq is the selected appropriate request. |
550 | */ | 331 | */ |
551 | dd->batching++; | 332 | dd->batching++; |
552 | deadline_move_request(dd, drq); | 333 | deadline_move_request(dd, rq); |
553 | 334 | ||
554 | return 1; | 335 | return 1; |
555 | } | 336 | } |
@@ -562,30 +343,6 @@ static int deadline_queue_empty(request_queue_t *q) | |||
562 | && list_empty(&dd->fifo_list[READ]); | 343 | && list_empty(&dd->fifo_list[READ]); |
563 | } | 344 | } |
564 | 345 | ||
565 | static struct request * | ||
566 | deadline_former_request(request_queue_t *q, struct request *rq) | ||
567 | { | ||
568 | struct deadline_rq *drq = RQ_DATA(rq); | ||
569 | struct rb_node *rbprev = rb_prev(&drq->rb_node); | ||
570 | |||
571 | if (rbprev) | ||
572 | return rb_entry_drq(rbprev)->request; | ||
573 | |||
574 | return NULL; | ||
575 | } | ||
576 | |||
577 | static struct request * | ||
578 | deadline_latter_request(request_queue_t *q, struct request *rq) | ||
579 | { | ||
580 | struct deadline_rq *drq = RQ_DATA(rq); | ||
581 | struct rb_node *rbnext = rb_next(&drq->rb_node); | ||
582 | |||
583 | if (rbnext) | ||
584 | return rb_entry_drq(rbnext)->request; | ||
585 | |||
586 | return NULL; | ||
587 | } | ||
588 | |||
589 | static void deadline_exit_queue(elevator_t *e) | 346 | static void deadline_exit_queue(elevator_t *e) |
590 | { | 347 | { |
591 | struct deadline_data *dd = e->elevator_data; | 348 | struct deadline_data *dd = e->elevator_data; |
@@ -593,46 +350,21 @@ static void deadline_exit_queue(elevator_t *e) | |||
593 | BUG_ON(!list_empty(&dd->fifo_list[READ])); | 350 | BUG_ON(!list_empty(&dd->fifo_list[READ])); |
594 | BUG_ON(!list_empty(&dd->fifo_list[WRITE])); | 351 | BUG_ON(!list_empty(&dd->fifo_list[WRITE])); |
595 | 352 | ||
596 | mempool_destroy(dd->drq_pool); | ||
597 | kfree(dd->hash); | ||
598 | kfree(dd); | 353 | kfree(dd); |
599 | } | 354 | } |
600 | 355 | ||
601 | /* | 356 | /* |
602 | * initialize elevator private data (deadline_data), and alloc a drq for | 357 | * initialize elevator private data (deadline_data). |
603 | * each request on the free lists | ||
604 | */ | 358 | */ |
605 | static void *deadline_init_queue(request_queue_t *q, elevator_t *e) | 359 | static void *deadline_init_queue(request_queue_t *q, elevator_t *e) |
606 | { | 360 | { |
607 | struct deadline_data *dd; | 361 | struct deadline_data *dd; |
608 | int i; | ||
609 | |||
610 | if (!drq_pool) | ||
611 | return NULL; | ||
612 | 362 | ||
613 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); | 363 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); |
614 | if (!dd) | 364 | if (!dd) |
615 | return NULL; | 365 | return NULL; |
616 | memset(dd, 0, sizeof(*dd)); | 366 | memset(dd, 0, sizeof(*dd)); |
617 | 367 | ||
618 | dd->hash = kmalloc_node(sizeof(struct hlist_head)*DL_HASH_ENTRIES, | ||
619 | GFP_KERNEL, q->node); | ||
620 | if (!dd->hash) { | ||
621 | kfree(dd); | ||
622 | return NULL; | ||
623 | } | ||
624 | |||
625 | dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, | ||
626 | mempool_free_slab, drq_pool, q->node); | ||
627 | if (!dd->drq_pool) { | ||
628 | kfree(dd->hash); | ||
629 | kfree(dd); | ||
630 | return NULL; | ||
631 | } | ||
632 | |||
633 | for (i = 0; i < DL_HASH_ENTRIES; i++) | ||
634 | INIT_HLIST_HEAD(&dd->hash[i]); | ||
635 | |||
636 | INIT_LIST_HEAD(&dd->fifo_list[READ]); | 368 | INIT_LIST_HEAD(&dd->fifo_list[READ]); |
637 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); | 369 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); |
638 | dd->sort_list[READ] = RB_ROOT; | 370 | dd->sort_list[READ] = RB_ROOT; |
@@ -645,39 +377,6 @@ static void *deadline_init_queue(request_queue_t *q, elevator_t *e) | |||
645 | return dd; | 377 | return dd; |
646 | } | 378 | } |
647 | 379 | ||
648 | static void deadline_put_request(request_queue_t *q, struct request *rq) | ||
649 | { | ||
650 | struct deadline_data *dd = q->elevator->elevator_data; | ||
651 | struct deadline_rq *drq = RQ_DATA(rq); | ||
652 | |||
653 | mempool_free(drq, dd->drq_pool); | ||
654 | rq->elevator_private = NULL; | ||
655 | } | ||
656 | |||
657 | static int | ||
658 | deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, | ||
659 | gfp_t gfp_mask) | ||
660 | { | ||
661 | struct deadline_data *dd = q->elevator->elevator_data; | ||
662 | struct deadline_rq *drq; | ||
663 | |||
664 | drq = mempool_alloc(dd->drq_pool, gfp_mask); | ||
665 | if (drq) { | ||
666 | memset(drq, 0, sizeof(*drq)); | ||
667 | RB_CLEAR_NODE(&drq->rb_node); | ||
668 | drq->request = rq; | ||
669 | |||
670 | INIT_HLIST_NODE(&drq->hash); | ||
671 | |||
672 | INIT_LIST_HEAD(&drq->fifo); | ||
673 | |||
674 | rq->elevator_private = drq; | ||
675 | return 0; | ||
676 | } | ||
677 | |||
678 | return 1; | ||
679 | } | ||
680 | |||
681 | /* | 380 | /* |
682 | * sysfs parts below | 381 | * sysfs parts below |
683 | */ | 382 | */ |
@@ -757,10 +456,8 @@ static struct elevator_type iosched_deadline = { | |||
757 | .elevator_dispatch_fn = deadline_dispatch_requests, | 456 | .elevator_dispatch_fn = deadline_dispatch_requests, |
758 | .elevator_add_req_fn = deadline_add_request, | 457 | .elevator_add_req_fn = deadline_add_request, |
759 | .elevator_queue_empty_fn = deadline_queue_empty, | 458 | .elevator_queue_empty_fn = deadline_queue_empty, |
760 | .elevator_former_req_fn = deadline_former_request, | 459 | .elevator_former_req_fn = elv_rb_former_request, |
761 | .elevator_latter_req_fn = deadline_latter_request, | 460 | .elevator_latter_req_fn = elv_rb_latter_request, |
762 | .elevator_set_req_fn = deadline_set_request, | ||
763 | .elevator_put_req_fn = deadline_put_request, | ||
764 | .elevator_init_fn = deadline_init_queue, | 461 | .elevator_init_fn = deadline_init_queue, |
765 | .elevator_exit_fn = deadline_exit_queue, | 462 | .elevator_exit_fn = deadline_exit_queue, |
766 | }, | 463 | }, |
@@ -772,24 +469,11 @@ static struct elevator_type iosched_deadline = { | |||
772 | 469 | ||
773 | static int __init deadline_init(void) | 470 | static int __init deadline_init(void) |
774 | { | 471 | { |
775 | int ret; | 472 | return elv_register(&iosched_deadline); |
776 | |||
777 | drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq), | ||
778 | 0, 0, NULL, NULL); | ||
779 | |||
780 | if (!drq_pool) | ||
781 | return -ENOMEM; | ||
782 | |||
783 | ret = elv_register(&iosched_deadline); | ||
784 | if (ret) | ||
785 | kmem_cache_destroy(drq_pool); | ||
786 | |||
787 | return ret; | ||
788 | } | 473 | } |
789 | 474 | ||
790 | static void __exit deadline_exit(void) | 475 | static void __exit deadline_exit(void) |
791 | { | 476 | { |
792 | kmem_cache_destroy(drq_pool); | ||
793 | elv_unregister(&iosched_deadline); | 477 | elv_unregister(&iosched_deadline); |
794 | } | 478 | } |
795 | 479 | ||
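With its private drq bookkeeping gone, deadline no longer carries former/latter lookups of its own; the ops table above points straight at the shared elv_rb_former_request/elv_rb_latter_request helpers. A minimal userspace sketch of that delegation pattern follows; the struct and function names here are hypothetical, not the kernel API:

    #include <stdio.h>

    struct request { long sector; };

    /* Library-provided callbacks, shared by every scheduler. */
    static struct request *lib_former(struct request *rq) { (void)rq; return NULL; }
    static struct request *lib_latter(struct request *rq) { (void)rq; return NULL; }

    /* Scheduler-specific hook. */
    static int sched_dispatch(void)
    {
        printf("dispatching\n");
        return 1;
    }

    /* Ops table mixing per-scheduler and generic hooks, mirroring how
     * iosched_deadline now wires elv_rb_former_request and
     * elv_rb_latter_request instead of private duplicates. */
    struct elevator_ops {
        int (*dispatch_fn)(void);
        struct request *(*former_fn)(struct request *);
        struct request *(*latter_fn)(struct request *);
    };

    static const struct elevator_ops deadline_ops = {
        .dispatch_fn = sched_dispatch,
        .former_fn   = lib_former,    /* was deadline_former_request */
        .latter_fn   = lib_latter,    /* was deadline_latter_request */
    };

    int main(void)
    {
        return deadline_ops.dispatch_fn() ? 0 : 1;
    }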
diff --git a/block/elevator.c b/block/elevator.c index 9b72dc7c8a5c..487dd3da8853 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * | 5 | * |
6 | * 30042000 Jens Axboe <axboe@suse.de> : | 6 | * 30042000 Jens Axboe <axboe@kernel.dk> : |
7 | * | 7 | * |
8 | * Split the elevator a bit so that it is possible to choose a different | 8 | * Split the elevator a bit so that it is possible to choose a different |
9 | * one or even write a new "plug in". There are three pieces: | 9 | * one or even write a new "plug in". There are three pieces: |
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/compiler.h> | 33 | #include <linux/compiler.h> |
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/blktrace_api.h> | 35 | #include <linux/blktrace_api.h> |
36 | #include <linux/hash.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | 39 | ||
@@ -40,6 +41,16 @@ static DEFINE_SPINLOCK(elv_list_lock); | |||
40 | static LIST_HEAD(elv_list); | 41 | static LIST_HEAD(elv_list); |
41 | 42 | ||
42 | /* | 43 | /* |
44 | * Merge hash stuff. | ||
45 | */ | ||
46 | static const int elv_hash_shift = 6; | ||
47 | #define ELV_HASH_BLOCK(sec) ((sec) >> 3) | ||
48 | #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) | ||
49 | #define ELV_HASH_ENTRIES (1 << elv_hash_shift) | ||
50 | #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) | ||
51 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) | ||
52 | |||
53 | /* | ||
43 | * can we safely merge with this request? | 54 | * can we safely merge with this request? |
44 | */ | 55 | */ |
45 | inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) | 56 | inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) |
@@ -56,8 +67,7 @@ inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) | |||
56 | /* | 67 | /* |
57 | * same device and no special stuff set, merge is ok | 68 | * same device and no special stuff set, merge is ok |
58 | */ | 69 | */ |
59 | if (rq->rq_disk == bio->bi_bdev->bd_disk && | 70 | if (rq->rq_disk == bio->bi_bdev->bd_disk && !rq->special) |
60 | !rq->waiting && !rq->special) | ||
61 | return 1; | 71 | return 1; |
62 | 72 | ||
63 | return 0; | 73 | return 0; |
@@ -151,27 +161,44 @@ __setup("elevator=", elevator_setup); | |||
151 | 161 | ||
152 | static struct kobj_type elv_ktype; | 162 | static struct kobj_type elv_ktype; |
153 | 163 | ||
154 | static elevator_t *elevator_alloc(struct elevator_type *e) | 164 | static elevator_t *elevator_alloc(request_queue_t *q, struct elevator_type *e) |
155 | { | 165 | { |
156 | elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL); | 166 | elevator_t *eq; |
157 | if (eq) { | 167 | int i; |
158 | memset(eq, 0, sizeof(*eq)); | 168 | |
159 | eq->ops = &e->ops; | 169 | eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL, q->node); |
160 | eq->elevator_type = e; | 170 | if (unlikely(!eq)) |
161 | kobject_init(&eq->kobj); | 171 | goto err; |
162 | snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); | 172 | |
163 | eq->kobj.ktype = &elv_ktype; | 173 | memset(eq, 0, sizeof(*eq)); |
164 | mutex_init(&eq->sysfs_lock); | 174 | eq->ops = &e->ops; |
165 | } else { | 175 | eq->elevator_type = e; |
166 | elevator_put(e); | 176 | kobject_init(&eq->kobj); |
167 | } | 177 | snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); |
178 | eq->kobj.ktype = &elv_ktype; | ||
179 | mutex_init(&eq->sysfs_lock); | ||
180 | |||
181 | eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, | ||
182 | GFP_KERNEL, q->node); | ||
183 | if (!eq->hash) | ||
184 | goto err; | ||
185 | |||
186 | for (i = 0; i < ELV_HASH_ENTRIES; i++) | ||
187 | INIT_HLIST_HEAD(&eq->hash[i]); | ||
188 | |||
168 | return eq; | 189 | return eq; |
190 | err: | ||
191 | kfree(eq); | ||
192 | elevator_put(e); | ||
193 | return NULL; | ||
169 | } | 194 | } |
170 | 195 | ||
171 | static void elevator_release(struct kobject *kobj) | 196 | static void elevator_release(struct kobject *kobj) |
172 | { | 197 | { |
173 | elevator_t *e = container_of(kobj, elevator_t, kobj); | 198 | elevator_t *e = container_of(kobj, elevator_t, kobj); |
199 | |||
174 | elevator_put(e->elevator_type); | 200 | elevator_put(e->elevator_type); |
201 | kfree(e->hash); | ||
175 | kfree(e); | 202 | kfree(e); |
176 | } | 203 | } |
177 | 204 | ||
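elevator_alloc() above funnels both failure points through a single err label, leaning on kfree(NULL) being a defined no-op so the cleanup path needs no flags or second label. A hedged userspace model of the same idiom, using free(), which gives the same NULL guarantee:

    #include <stdlib.h>

    struct widget { int *table; };

    /* Two-stage allocation with one cleanup label: free(NULL) does
     * nothing, so the error path can free unconditionally, the same
     * property elevator_alloc() relies on with kfree(). */
    static struct widget *widget_alloc(size_t entries)
    {
        struct widget *w = calloc(1, sizeof(*w));
        if (!w)
            goto err;

        w->table = calloc(entries, sizeof(*w->table));
        if (!w->table)
            goto err;

        return w;
    err:
        if (w)
            free(w->table);   /* free(NULL) is a no-op */
        free(w);              /* likewise safe when w is NULL */
        return NULL;
    }

    int main(void)
    {
        struct widget *w = widget_alloc(64);
        if (w) {
            free(w->table);
            free(w);
        }
        return 0;
    }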
@@ -198,7 +225,7 @@ int elevator_init(request_queue_t *q, char *name) | |||
198 | e = elevator_get("noop"); | 225 | e = elevator_get("noop"); |
199 | } | 226 | } |
200 | 227 | ||
201 | eq = elevator_alloc(e); | 228 | eq = elevator_alloc(q, e); |
202 | if (!eq) | 229 | if (!eq) |
203 | return -ENOMEM; | 230 | return -ENOMEM; |
204 | 231 | ||
@@ -212,6 +239,8 @@ int elevator_init(request_queue_t *q, char *name) | |||
212 | return ret; | 239 | return ret; |
213 | } | 240 | } |
214 | 241 | ||
242 | EXPORT_SYMBOL(elevator_init); | ||
243 | |||
215 | void elevator_exit(elevator_t *e) | 244 | void elevator_exit(elevator_t *e) |
216 | { | 245 | { |
217 | mutex_lock(&e->sysfs_lock); | 246 | mutex_lock(&e->sysfs_lock); |
@@ -223,10 +252,118 @@ void elevator_exit(elevator_t *e) | |||
223 | kobject_put(&e->kobj); | 252 | kobject_put(&e->kobj); |
224 | } | 253 | } |
225 | 254 | ||
255 | EXPORT_SYMBOL(elevator_exit); | ||
256 | |||
257 | static inline void __elv_rqhash_del(struct request *rq) | ||
258 | { | ||
259 | hlist_del_init(&rq->hash); | ||
260 | } | ||
261 | |||
262 | static void elv_rqhash_del(request_queue_t *q, struct request *rq) | ||
263 | { | ||
264 | if (ELV_ON_HASH(rq)) | ||
265 | __elv_rqhash_del(rq); | ||
266 | } | ||
267 | |||
268 | static void elv_rqhash_add(request_queue_t *q, struct request *rq) | ||
269 | { | ||
270 | elevator_t *e = q->elevator; | ||
271 | |||
272 | BUG_ON(ELV_ON_HASH(rq)); | ||
273 | hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); | ||
274 | } | ||
275 | |||
276 | static void elv_rqhash_reposition(request_queue_t *q, struct request *rq) | ||
277 | { | ||
278 | __elv_rqhash_del(rq); | ||
279 | elv_rqhash_add(q, rq); | ||
280 | } | ||
281 | |||
282 | static struct request *elv_rqhash_find(request_queue_t *q, sector_t offset) | ||
283 | { | ||
284 | elevator_t *e = q->elevator; | ||
285 | struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; | ||
286 | struct hlist_node *entry, *next; | ||
287 | struct request *rq; | ||
288 | |||
289 | hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { | ||
290 | BUG_ON(!ELV_ON_HASH(rq)); | ||
291 | |||
292 | if (unlikely(!rq_mergeable(rq))) { | ||
293 | __elv_rqhash_del(rq); | ||
294 | continue; | ||
295 | } | ||
296 | |||
297 | if (rq_hash_key(rq) == offset) | ||
298 | return rq; | ||
299 | } | ||
300 | |||
301 | return NULL; | ||
302 | } | ||
303 | |||
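The merge hash above is keyed on where a request ends (sector + nr_sectors), so a bio beginning at sector S probes a single bucket for an O(1) back-merge candidate, and elv_rqhash_find() lazily drops entries that have become unmergeable. A small userspace model of that scheme; the bucket count, the chaining, and the golden-ratio multiplier (in the spirit of hash_long()) are illustrative assumptions, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_SHIFT   6
    #define HASH_ENTRIES (1u << HASH_SHIFT)

    struct request {
        unsigned long   sector, nr_sectors;
        int             mergeable;
        struct request *next;                 /* bucket chain */
    };

    static struct request *buckets[HASH_ENTRIES];

    /* Coalesce 8-sector blocks as ELV_HASH_BLOCK does, then mix with
     * a golden-ratio multiplier modeled on hash_long(). */
    static unsigned hash_key(unsigned long end_sector)
    {
        uint32_t v = (uint32_t)(end_sector >> 3) * 0x9e370001u;
        return v >> (32 - HASH_SHIFT);
    }

    /* Requests are hashed on where they END: sector + nr_sectors. */
    static void rqhash_add(struct request *rq)
    {
        unsigned h = hash_key(rq->sector + rq->nr_sectors);
        rq->next = buckets[h];
        buckets[h] = rq;
    }

    /* A bio starting at bio_start probes one bucket for a request
     * ending exactly there; stale unmergeable entries are pruned on
     * the way, as elv_rqhash_find() does. */
    static struct request *rqhash_find(unsigned long bio_start)
    {
        struct request **pp = &buckets[hash_key(bio_start)];

        while (*pp) {
            struct request *rq = *pp;

            if (!rq->mergeable) {
                *pp = rq->next;               /* lazy removal */
                continue;
            }
            if (rq->sector + rq->nr_sectors == bio_start)
                return rq;                    /* back-merge candidate */
            pp = &rq->next;
        }
        return NULL;
    }

    int main(void)
    {
        struct request rq = { .sector = 100, .nr_sectors = 8, .mergeable = 1 };

        rqhash_add(&rq);
        printf("backmerge candidate at 108: %s\n",
               rqhash_find(108) ? "found" : "none");
        return 0;
    }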
304 | /* | ||
305 | * RB-tree support functions for inserting/lookup/removal of requests | ||
306 | * in a sorted RB tree. | ||
307 | */ | ||
308 | struct request *elv_rb_add(struct rb_root *root, struct request *rq) | ||
309 | { | ||
310 | struct rb_node **p = &root->rb_node; | ||
311 | struct rb_node *parent = NULL; | ||
312 | struct request *__rq; | ||
313 | |||
314 | while (*p) { | ||
315 | parent = *p; | ||
316 | __rq = rb_entry(parent, struct request, rb_node); | ||
317 | |||
318 | if (rq->sector < __rq->sector) | ||
319 | p = &(*p)->rb_left; | ||
320 | else if (rq->sector > __rq->sector) | ||
321 | p = &(*p)->rb_right; | ||
322 | else | ||
323 | return __rq; | ||
324 | } | ||
325 | |||
326 | rb_link_node(&rq->rb_node, parent, p); | ||
327 | rb_insert_color(&rq->rb_node, root); | ||
328 | return NULL; | ||
329 | } | ||
330 | |||
331 | EXPORT_SYMBOL(elv_rb_add); | ||
332 | |||
333 | void elv_rb_del(struct rb_root *root, struct request *rq) | ||
334 | { | ||
335 | BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); | ||
336 | rb_erase(&rq->rb_node, root); | ||
337 | RB_CLEAR_NODE(&rq->rb_node); | ||
338 | } | ||
339 | |||
340 | EXPORT_SYMBOL(elv_rb_del); | ||
341 | |||
342 | struct request *elv_rb_find(struct rb_root *root, sector_t sector) | ||
343 | { | ||
344 | struct rb_node *n = root->rb_node; | ||
345 | struct request *rq; | ||
346 | |||
347 | while (n) { | ||
348 | rq = rb_entry(n, struct request, rb_node); | ||
349 | |||
350 | if (sector < rq->sector) | ||
351 | n = n->rb_left; | ||
352 | else if (sector > rq->sector) | ||
353 | n = n->rb_right; | ||
354 | else | ||
355 | return rq; | ||
356 | } | ||
357 | |||
358 | return NULL; | ||
359 | } | ||
360 | |||
361 | EXPORT_SYMBOL(elv_rb_find); | ||
362 | |||
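elv_rb_add() keeps each tree sorted by start sector and hands back the already-queued request on a duplicate key, elv_rb_find() is a plain binary search, and rb_prev()/rb_next() supply the former/latter neighbours. The sketch below models the same three operations on an unbalanced binary search tree; the kernel uses a self-balancing red-black tree, and the rebalancing is deliberately omitted here:

    #include <stdio.h>

    struct node {
        unsigned long sector;
        struct node *left, *right;
    };

    /* Insert keyed by start sector; on a duplicate key return the
     * existing node, as elv_rb_add() returns the queued alias. */
    static struct node *tree_add(struct node **root, struct node *n)
    {
        struct node **p = root;

        while (*p) {
            if (n->sector < (*p)->sector)
                p = &(*p)->left;
            else if (n->sector > (*p)->sector)
                p = &(*p)->right;
            else
                return *p;                /* already present */
        }
        n->left = n->right = NULL;
        *p = n;
        return NULL;
    }

    static struct node *tree_find(struct node *root, unsigned long sector)
    {
        while (root) {
            if (sector < root->sector)
                root = root->left;
            else if (sector > root->sector)
                root = root->right;
            else
                return root;
        }
        return NULL;
    }

    /* In-order successor by key: the "latter" request, what
     * rb_next() provides in the kernel version. */
    static struct node *tree_latter(struct node *root, unsigned long sector)
    {
        struct node *succ = NULL;

        while (root) {
            if (sector < root->sector) {
                succ = root;
                root = root->left;
            } else
                root = root->right;
        }
        return succ;
    }

    int main(void)
    {
        struct node a = { .sector = 100 }, b = { .sector = 200 };
        struct node *root = NULL, *n;

        tree_add(&root, &a);
        tree_add(&root, &b);
        n = tree_find(root, 100);
        printf("found %lu, latter is %lu\n",
               n ? n->sector : 0, tree_latter(root, 100)->sector);
        return 0;
    }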
226 | /* | 363 | /* |
227 | * Insert rq into dispatch queue of q. Queue lock must be held on | 364 | * Insert rq into dispatch queue of q. Queue lock must be held on |
228 | * entry. If sort != 0, rq is sort-inserted; otherwise, rq will be | 365 | * entry. rq is sort-inserted into the dispatch queue. To be used by |
229 | * appended to the dispatch queue. To be used by specific elevators. | 366 | * specific elevators. |
230 | */ | 367 | */ |
231 | void elv_dispatch_sort(request_queue_t *q, struct request *rq) | 368 | void elv_dispatch_sort(request_queue_t *q, struct request *rq) |
232 | { | 369 | { |
@@ -235,6 +372,9 @@ void elv_dispatch_sort(request_queue_t *q, struct request *rq) | |||
235 | 372 | ||
236 | if (q->last_merge == rq) | 373 | if (q->last_merge == rq) |
237 | q->last_merge = NULL; | 374 | q->last_merge = NULL; |
375 | |||
376 | elv_rqhash_del(q, rq); | ||
377 | |||
238 | q->nr_sorted--; | 378 | q->nr_sorted--; |
239 | 379 | ||
240 | boundary = q->end_sector; | 380 | boundary = q->end_sector; |
@@ -242,7 +382,7 @@ void elv_dispatch_sort(request_queue_t *q, struct request *rq) | |||
242 | list_for_each_prev(entry, &q->queue_head) { | 382 | list_for_each_prev(entry, &q->queue_head) { |
243 | struct request *pos = list_entry_rq(entry); | 383 | struct request *pos = list_entry_rq(entry); |
244 | 384 | ||
245 | if (pos->flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED)) | 385 | if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED)) |
246 | break; | 386 | break; |
247 | if (rq->sector >= boundary) { | 387 | if (rq->sector >= boundary) { |
248 | if (pos->sector < boundary) | 388 | if (pos->sector < boundary) |
@@ -258,11 +398,38 @@ void elv_dispatch_sort(request_queue_t *q, struct request *rq) | |||
258 | list_add(&rq->queuelist, entry); | 398 | list_add(&rq->queuelist, entry); |
259 | } | 399 | } |
260 | 400 | ||
401 | EXPORT_SYMBOL(elv_dispatch_sort); | ||
402 | |||
403 | /* | ||
404 | * Insert rq into dispatch queue of q. Queue lock must be held on | ||
405 | * entry. rq is added to the back of the dispatch queue. To be used by | ||
406 | * specific elevators. | ||
407 | */ | ||
408 | void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) | ||
409 | { | ||
410 | if (q->last_merge == rq) | ||
411 | q->last_merge = NULL; | ||
412 | |||
413 | elv_rqhash_del(q, rq); | ||
414 | |||
415 | q->nr_sorted--; | ||
416 | |||
417 | q->end_sector = rq_end_sector(rq); | ||
418 | q->boundary_rq = rq; | ||
419 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
420 | } | ||
421 | |||
422 | EXPORT_SYMBOL(elv_dispatch_add_tail); | ||
423 | |||
261 | int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) | 424 | int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) |
262 | { | 425 | { |
263 | elevator_t *e = q->elevator; | 426 | elevator_t *e = q->elevator; |
427 | struct request *__rq; | ||
264 | int ret; | 428 | int ret; |
265 | 429 | ||
430 | /* | ||
431 | * First try one-hit cache. | ||
432 | */ | ||
266 | if (q->last_merge) { | 433 | if (q->last_merge) { |
267 | ret = elv_try_merge(q->last_merge, bio); | 434 | ret = elv_try_merge(q->last_merge, bio); |
268 | if (ret != ELEVATOR_NO_MERGE) { | 435 | if (ret != ELEVATOR_NO_MERGE) { |
@@ -271,18 +438,30 @@ int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) | |||
271 | } | 438 | } |
272 | } | 439 | } |
273 | 440 | ||
441 | /* | ||
442 | * See if our hash lookup can find a potential backmerge. | ||
443 | */ | ||
444 | __rq = elv_rqhash_find(q, bio->bi_sector); | ||
445 | if (__rq && elv_rq_merge_ok(__rq, bio)) { | ||
446 | *req = __rq; | ||
447 | return ELEVATOR_BACK_MERGE; | ||
448 | } | ||
449 | |||
274 | if (e->ops->elevator_merge_fn) | 450 | if (e->ops->elevator_merge_fn) |
275 | return e->ops->elevator_merge_fn(q, req, bio); | 451 | return e->ops->elevator_merge_fn(q, req, bio); |
276 | 452 | ||
277 | return ELEVATOR_NO_MERGE; | 453 | return ELEVATOR_NO_MERGE; |
278 | } | 454 | } |
279 | 455 | ||
280 | void elv_merged_request(request_queue_t *q, struct request *rq) | 456 | void elv_merged_request(request_queue_t *q, struct request *rq, int type) |
281 | { | 457 | { |
282 | elevator_t *e = q->elevator; | 458 | elevator_t *e = q->elevator; |
283 | 459 | ||
284 | if (e->ops->elevator_merged_fn) | 460 | if (e->ops->elevator_merged_fn) |
285 | e->ops->elevator_merged_fn(q, rq); | 461 | e->ops->elevator_merged_fn(q, rq, type); |
462 | |||
463 | if (type == ELEVATOR_BACK_MERGE) | ||
464 | elv_rqhash_reposition(q, rq); | ||
286 | 465 | ||
287 | q->last_merge = rq; | 466 | q->last_merge = rq; |
288 | } | 467 | } |
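Taken together, elv_merge() above now cascades through three attempts: the one-hit last_merge cache, a generic hash probe for a back merge, and only then the scheduler's own merge_fn, which is mostly left with front merges to find. A stubbed-out sketch of that cascade, with all names hypothetical:

    #include <stdio.h>

    enum { NO_MERGE, FRONT_MERGE, BACK_MERGE };

    struct request { unsigned long sector, nr_sectors; };
    struct bio     { unsigned long sector, nr_sectors; };

    static struct request *last_merge;    /* one-hit cache */

    static int try_merge(struct request *rq, struct bio *bio)
    {
        if (!rq)
            return NO_MERGE;
        if (rq->sector + rq->nr_sectors == bio->sector)
            return BACK_MERGE;            /* rq ends where bio starts */
        if (bio->sector + bio->nr_sectors == rq->sector)
            return FRONT_MERGE;           /* bio ends where rq starts */
        return NO_MERGE;
    }

    /* Stand-ins for elv_rqhash_find() and the scheduler merge hook. */
    static struct request *hash_find_ending_at(unsigned long s)
    {
        (void)s;
        return NULL;
    }
    static int sched_merge(struct request **rq, struct bio *bio)
    {
        (void)rq; (void)bio;
        return NO_MERGE;
    }

    static int elv_merge_model(struct request **rq, struct bio *bio)
    {
        int ret = try_merge(last_merge, bio);      /* 1: one-hit cache */
        if (ret != NO_MERGE) {
            *rq = last_merge;
            return ret;
        }
        *rq = hash_find_ending_at(bio->sector);    /* 2: hash back merge */
        if (*rq)
            return BACK_MERGE;
        return sched_merge(rq, bio);               /* 3: scheduler-specific */
    }

    int main(void)
    {
        static struct request r = { .sector = 0, .nr_sectors = 8 };
        struct bio b = { .sector = 8, .nr_sectors = 8 };
        struct request *hit;

        last_merge = &r;
        printf("merge type: %d\n", elv_merge_model(&hit, &b)); /* BACK_MERGE */
        return 0;
    }

Note how a successful back merge changes where the request ends, which is why elv_merged_request() and elv_merge_requests() above reposition the request in the hash afterwards.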
@@ -294,8 +473,11 @@ void elv_merge_requests(request_queue_t *q, struct request *rq, | |||
294 | 473 | ||
295 | if (e->ops->elevator_merge_req_fn) | 474 | if (e->ops->elevator_merge_req_fn) |
296 | e->ops->elevator_merge_req_fn(q, rq, next); | 475 | e->ops->elevator_merge_req_fn(q, rq, next); |
297 | q->nr_sorted--; | ||
298 | 476 | ||
477 | elv_rqhash_reposition(q, rq); | ||
478 | elv_rqhash_del(q, next); | ||
479 | |||
480 | q->nr_sorted--; | ||
299 | q->last_merge = rq; | 481 | q->last_merge = rq; |
300 | } | 482 | } |
301 | 483 | ||
@@ -313,7 +495,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq) | |||
313 | e->ops->elevator_deactivate_req_fn(q, rq); | 495 | e->ops->elevator_deactivate_req_fn(q, rq); |
314 | } | 496 | } |
315 | 497 | ||
316 | rq->flags &= ~REQ_STARTED; | 498 | rq->cmd_flags &= ~REQ_STARTED; |
317 | 499 | ||
318 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); | 500 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); |
319 | } | 501 | } |
@@ -344,13 +526,13 @@ void elv_insert(request_queue_t *q, struct request *rq, int where) | |||
344 | 526 | ||
345 | switch (where) { | 527 | switch (where) { |
346 | case ELEVATOR_INSERT_FRONT: | 528 | case ELEVATOR_INSERT_FRONT: |
347 | rq->flags |= REQ_SOFTBARRIER; | 529 | rq->cmd_flags |= REQ_SOFTBARRIER; |
348 | 530 | ||
349 | list_add(&rq->queuelist, &q->queue_head); | 531 | list_add(&rq->queuelist, &q->queue_head); |
350 | break; | 532 | break; |
351 | 533 | ||
352 | case ELEVATOR_INSERT_BACK: | 534 | case ELEVATOR_INSERT_BACK: |
353 | rq->flags |= REQ_SOFTBARRIER; | 535 | rq->cmd_flags |= REQ_SOFTBARRIER; |
354 | elv_drain_elevator(q); | 536 | elv_drain_elevator(q); |
355 | list_add_tail(&rq->queuelist, &q->queue_head); | 537 | list_add_tail(&rq->queuelist, &q->queue_head); |
356 | /* | 538 | /* |
@@ -369,10 +551,14 @@ void elv_insert(request_queue_t *q, struct request *rq, int where) | |||
369 | 551 | ||
370 | case ELEVATOR_INSERT_SORT: | 552 | case ELEVATOR_INSERT_SORT: |
371 | BUG_ON(!blk_fs_request(rq)); | 553 | BUG_ON(!blk_fs_request(rq)); |
372 | rq->flags |= REQ_SORTED; | 554 | rq->cmd_flags |= REQ_SORTED; |
373 | q->nr_sorted++; | 555 | q->nr_sorted++; |
374 | if (q->last_merge == NULL && rq_mergeable(rq)) | 556 | if (rq_mergeable(rq)) { |
375 | q->last_merge = rq; | 557 | elv_rqhash_add(q, rq); |
558 | if (!q->last_merge) | ||
559 | q->last_merge = rq; | ||
560 | } | ||
561 | |||
376 | /* | 562 | /* |
377 | * Some ioscheds (cfq) run q->request_fn directly, so | 563 | * Some ioscheds (cfq) run q->request_fn directly, so |
378 | * rq cannot be accessed after calling | 564 | * rq cannot be accessed after calling |
@@ -387,7 +573,7 @@ void elv_insert(request_queue_t *q, struct request *rq, int where) | |||
387 | * insertion; otherwise, requests should be requeued | 573 | * insertion; otherwise, requests should be requeued |
388 | * in ordseq order. | 574 | * in ordseq order. |
389 | */ | 575 | */ |
390 | rq->flags |= REQ_SOFTBARRIER; | 576 | rq->cmd_flags |= REQ_SOFTBARRIER; |
391 | 577 | ||
392 | if (q->ordseq == 0) { | 578 | if (q->ordseq == 0) { |
393 | list_add(&rq->queuelist, &q->queue_head); | 579 | list_add(&rq->queuelist, &q->queue_head); |
@@ -429,9 +615,9 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, | |||
429 | int plug) | 615 | int plug) |
430 | { | 616 | { |
431 | if (q->ordcolor) | 617 | if (q->ordcolor) |
432 | rq->flags |= REQ_ORDERED_COLOR; | 618 | rq->cmd_flags |= REQ_ORDERED_COLOR; |
433 | 619 | ||
434 | if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { | 620 | if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { |
435 | /* | 621 | /* |
436 | * toggle ordered color | 622 | * toggle ordered color |
437 | */ | 623 | */ |
@@ -452,7 +638,7 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, | |||
452 | q->end_sector = rq_end_sector(rq); | 638 | q->end_sector = rq_end_sector(rq); |
453 | q->boundary_rq = rq; | 639 | q->boundary_rq = rq; |
454 | } | 640 | } |
455 | } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) | 641 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) |
456 | where = ELEVATOR_INSERT_BACK; | 642 | where = ELEVATOR_INSERT_BACK; |
457 | 643 | ||
458 | if (plug) | 644 | if (plug) |
@@ -461,6 +647,8 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, | |||
461 | elv_insert(q, rq, where); | 647 | elv_insert(q, rq, where); |
462 | } | 648 | } |
463 | 649 | ||
650 | EXPORT_SYMBOL(__elv_add_request); | ||
651 | |||
464 | void elv_add_request(request_queue_t *q, struct request *rq, int where, | 652 | void elv_add_request(request_queue_t *q, struct request *rq, int where, |
465 | int plug) | 653 | int plug) |
466 | { | 654 | { |
@@ -471,6 +659,8 @@ void elv_add_request(request_queue_t *q, struct request *rq, int where, | |||
471 | spin_unlock_irqrestore(q->queue_lock, flags); | 659 | spin_unlock_irqrestore(q->queue_lock, flags); |
472 | } | 660 | } |
473 | 661 | ||
662 | EXPORT_SYMBOL(elv_add_request); | ||
663 | |||
474 | static inline struct request *__elv_next_request(request_queue_t *q) | 664 | static inline struct request *__elv_next_request(request_queue_t *q) |
475 | { | 665 | { |
476 | struct request *rq; | 666 | struct request *rq; |
@@ -493,7 +683,7 @@ struct request *elv_next_request(request_queue_t *q) | |||
493 | int ret; | 683 | int ret; |
494 | 684 | ||
495 | while ((rq = __elv_next_request(q)) != NULL) { | 685 | while ((rq = __elv_next_request(q)) != NULL) { |
496 | if (!(rq->flags & REQ_STARTED)) { | 686 | if (!(rq->cmd_flags & REQ_STARTED)) { |
497 | elevator_t *e = q->elevator; | 687 | elevator_t *e = q->elevator; |
498 | 688 | ||
499 | /* | 689 | /* |
@@ -510,7 +700,7 @@ struct request *elv_next_request(request_queue_t *q) | |||
510 | * it, a request that has been delayed should | 700 | * it, a request that has been delayed should |
511 | * not be passed by new incoming requests | 701 | * not be passed by new incoming requests |
512 | */ | 702 | */ |
513 | rq->flags |= REQ_STARTED; | 703 | rq->cmd_flags |= REQ_STARTED; |
514 | blk_add_trace_rq(q, rq, BLK_TA_ISSUE); | 704 | blk_add_trace_rq(q, rq, BLK_TA_ISSUE); |
515 | } | 705 | } |
516 | 706 | ||
@@ -519,7 +709,7 @@ struct request *elv_next_request(request_queue_t *q) | |||
519 | q->boundary_rq = NULL; | 709 | q->boundary_rq = NULL; |
520 | } | 710 | } |
521 | 711 | ||
522 | if ((rq->flags & REQ_DONTPREP) || !q->prep_rq_fn) | 712 | if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn) |
523 | break; | 713 | break; |
524 | 714 | ||
525 | ret = q->prep_rq_fn(q, rq); | 715 | ret = q->prep_rq_fn(q, rq); |
@@ -541,7 +731,7 @@ struct request *elv_next_request(request_queue_t *q) | |||
541 | nr_bytes = rq->data_len; | 731 | nr_bytes = rq->data_len; |
542 | 732 | ||
543 | blkdev_dequeue_request(rq); | 733 | blkdev_dequeue_request(rq); |
544 | rq->flags |= REQ_QUIET; | 734 | rq->cmd_flags |= REQ_QUIET; |
545 | end_that_request_chunk(rq, 0, nr_bytes); | 735 | end_that_request_chunk(rq, 0, nr_bytes); |
546 | end_that_request_last(rq, 0); | 736 | end_that_request_last(rq, 0); |
547 | } else { | 737 | } else { |
@@ -554,9 +744,12 @@ struct request *elv_next_request(request_queue_t *q) | |||
554 | return rq; | 744 | return rq; |
555 | } | 745 | } |
556 | 746 | ||
747 | EXPORT_SYMBOL(elv_next_request); | ||
748 | |||
557 | void elv_dequeue_request(request_queue_t *q, struct request *rq) | 749 | void elv_dequeue_request(request_queue_t *q, struct request *rq) |
558 | { | 750 | { |
559 | BUG_ON(list_empty(&rq->queuelist)); | 751 | BUG_ON(list_empty(&rq->queuelist)); |
752 | BUG_ON(ELV_ON_HASH(rq)); | ||
560 | 753 | ||
561 | list_del_init(&rq->queuelist); | 754 | list_del_init(&rq->queuelist); |
562 | 755 | ||
@@ -569,6 +762,8 @@ void elv_dequeue_request(request_queue_t *q, struct request *rq) | |||
569 | q->in_flight++; | 762 | q->in_flight++; |
570 | } | 763 | } |
571 | 764 | ||
765 | EXPORT_SYMBOL(elv_dequeue_request); | ||
766 | |||
572 | int elv_queue_empty(request_queue_t *q) | 767 | int elv_queue_empty(request_queue_t *q) |
573 | { | 768 | { |
574 | elevator_t *e = q->elevator; | 769 | elevator_t *e = q->elevator; |
@@ -582,6 +777,8 @@ int elv_queue_empty(request_queue_t *q) | |||
582 | return 1; | 777 | return 1; |
583 | } | 778 | } |
584 | 779 | ||
780 | EXPORT_SYMBOL(elv_queue_empty); | ||
781 | |||
585 | struct request *elv_latter_request(request_queue_t *q, struct request *rq) | 782 | struct request *elv_latter_request(request_queue_t *q, struct request *rq) |
586 | { | 783 | { |
587 | elevator_t *e = q->elevator; | 784 | elevator_t *e = q->elevator; |
@@ -600,13 +797,12 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) | |||
600 | return NULL; | 797 | return NULL; |
601 | } | 798 | } |
602 | 799 | ||
603 | int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, | 800 | int elv_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) |
604 | gfp_t gfp_mask) | ||
605 | { | 801 | { |
606 | elevator_t *e = q->elevator; | 802 | elevator_t *e = q->elevator; |
607 | 803 | ||
608 | if (e->ops->elevator_set_req_fn) | 804 | if (e->ops->elevator_set_req_fn) |
609 | return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); | 805 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); |
610 | 806 | ||
611 | rq->elevator_private = NULL; | 807 | rq->elevator_private = NULL; |
612 | return 0; | 808 | return 0; |
@@ -620,12 +816,12 @@ void elv_put_request(request_queue_t *q, struct request *rq) | |||
620 | e->ops->elevator_put_req_fn(q, rq); | 816 | e->ops->elevator_put_req_fn(q, rq); |
621 | } | 817 | } |
622 | 818 | ||
623 | int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) | 819 | int elv_may_queue(request_queue_t *q, int rw) |
624 | { | 820 | { |
625 | elevator_t *e = q->elevator; | 821 | elevator_t *e = q->elevator; |
626 | 822 | ||
627 | if (e->ops->elevator_may_queue_fn) | 823 | if (e->ops->elevator_may_queue_fn) |
628 | return e->ops->elevator_may_queue_fn(q, rw, bio); | 824 | return e->ops->elevator_may_queue_fn(q, rw); |
629 | 825 | ||
630 | return ELV_MQUEUE_MAY; | 826 | return ELV_MQUEUE_MAY; |
631 | } | 827 | } |
@@ -792,7 +988,7 @@ static int elevator_switch(request_queue_t *q, struct elevator_type *new_e) | |||
792 | /* | 988 | /* |
793 | * Allocate new elevator | 989 | * Allocate new elevator |
794 | */ | 990 | */ |
795 | e = elevator_alloc(new_e); | 991 | e = elevator_alloc(q, new_e); |
796 | if (!e) | 992 | if (!e) |
797 | return 0; | 993 | return 0; |
798 | 994 | ||
@@ -908,11 +1104,26 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name) | |||
908 | return len; | 1104 | return len; |
909 | } | 1105 | } |
910 | 1106 | ||
911 | EXPORT_SYMBOL(elv_dispatch_sort); | 1107 | struct request *elv_rb_former_request(request_queue_t *q, struct request *rq) |
912 | EXPORT_SYMBOL(elv_add_request); | 1108 | { |
913 | EXPORT_SYMBOL(__elv_add_request); | 1109 | struct rb_node *rbprev = rb_prev(&rq->rb_node); |
914 | EXPORT_SYMBOL(elv_next_request); | 1110 | |
915 | EXPORT_SYMBOL(elv_dequeue_request); | 1111 | if (rbprev) |
916 | EXPORT_SYMBOL(elv_queue_empty); | 1112 | return rb_entry_rq(rbprev); |
917 | EXPORT_SYMBOL(elevator_exit); | 1113 | |
918 | EXPORT_SYMBOL(elevator_init); | 1114 | return NULL; |
1115 | } | ||
1116 | |||
1117 | EXPORT_SYMBOL(elv_rb_former_request); | ||
1118 | |||
1119 | struct request *elv_rb_latter_request(request_queue_t *q, struct request *rq) | ||
1120 | { | ||
1121 | struct rb_node *rbnext = rb_next(&rq->rb_node); | ||
1122 | |||
1123 | if (rbnext) | ||
1124 | return rb_entry_rq(rbnext); | ||
1125 | |||
1126 | return NULL; | ||
1127 | } | ||
1128 | |||
1129 | EXPORT_SYMBOL(elv_rb_latter_request); | ||
diff --git a/block/genhd.c b/block/genhd.c index 25d1f42568cc..653919d50cd4 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -295,10 +295,15 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) | |||
295 | 295 | ||
296 | static int __init genhd_device_init(void) | 296 | static int __init genhd_device_init(void) |
297 | { | 297 | { |
298 | int err; | ||
299 | |||
298 | bdev_map = kobj_map_init(base_probe, &block_subsys_lock); | 300 | bdev_map = kobj_map_init(base_probe, &block_subsys_lock); |
299 | blk_dev_init(); | 301 | blk_dev_init(); |
300 | subsystem_register(&block_subsys); | 302 | err = subsystem_register(&block_subsys); |
301 | return 0; | 303 | if (err < 0) |
304 | printk(KERN_WARNING "%s: subsystem_register error: %d\n", | ||
305 | __FUNCTION__, err); | ||
306 | return err; | ||
302 | } | 307 | } |
303 | 308 | ||
304 | subsys_initcall(genhd_device_init); | 309 | subsys_initcall(genhd_device_init); |
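genhd_device_init() used to discard subsystem_register()'s return value; it now warns and propagates the error to the initcall machinery. A minimal model of the pattern, where the stub registration function is an assumption for illustration:

    #include <stdio.h>

    /* Stand-in for subsystem_register(); assumed to return a
     * negative errno-style value on failure. */
    static int subsystem_register_stub(void) { return 0; }

    static int device_init(void)
    {
        int err = subsystem_register_stub();

        if (err < 0)
            fprintf(stderr, "%s: subsystem_register error: %d\n",
                    __func__, err);
        return err;   /* propagate instead of returning 0 unconditionally */
    }

    int main(void)
    {
        return device_init() ? 1 : 0;
    }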
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 9c3a06bcb7ba..83425fb3c8db 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c | |||
@@ -39,6 +39,7 @@ static void blk_unplug_timeout(unsigned long data); | |||
39 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); | 39 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); |
40 | static void init_request_from_bio(struct request *req, struct bio *bio); | 40 | static void init_request_from_bio(struct request *req, struct bio *bio); |
41 | static int __make_request(request_queue_t *q, struct bio *bio); | 41 | static int __make_request(request_queue_t *q, struct bio *bio); |
42 | static struct io_context *current_io_context(gfp_t gfp_flags, int node); | ||
42 | 43 | ||
43 | /* | 44 | /* |
44 | * For the allocated request tables | 45 | * For the allocated request tables |
@@ -277,19 +278,19 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) | |||
277 | 278 | ||
278 | EXPORT_SYMBOL(blk_queue_make_request); | 279 | EXPORT_SYMBOL(blk_queue_make_request); |
279 | 280 | ||
280 | static inline void rq_init(request_queue_t *q, struct request *rq) | 281 | static void rq_init(request_queue_t *q, struct request *rq) |
281 | { | 282 | { |
282 | INIT_LIST_HEAD(&rq->queuelist); | 283 | INIT_LIST_HEAD(&rq->queuelist); |
283 | INIT_LIST_HEAD(&rq->donelist); | 284 | INIT_LIST_HEAD(&rq->donelist); |
284 | 285 | ||
285 | rq->errors = 0; | 286 | rq->errors = 0; |
286 | rq->rq_status = RQ_ACTIVE; | ||
287 | rq->bio = rq->biotail = NULL; | 287 | rq->bio = rq->biotail = NULL; |
288 | INIT_HLIST_NODE(&rq->hash); | ||
289 | RB_CLEAR_NODE(&rq->rb_node); | ||
288 | rq->ioprio = 0; | 290 | rq->ioprio = 0; |
289 | rq->buffer = NULL; | 291 | rq->buffer = NULL; |
290 | rq->ref_count = 1; | 292 | rq->ref_count = 1; |
291 | rq->q = q; | 293 | rq->q = q; |
292 | rq->waiting = NULL; | ||
293 | rq->special = NULL; | 294 | rq->special = NULL; |
294 | rq->data_len = 0; | 295 | rq->data_len = 0; |
295 | rq->data = NULL; | 296 | rq->data = NULL; |
@@ -382,8 +383,8 @@ unsigned blk_ordered_req_seq(struct request *rq) | |||
382 | if (rq == &q->post_flush_rq) | 383 | if (rq == &q->post_flush_rq) |
383 | return QUEUE_ORDSEQ_POSTFLUSH; | 384 | return QUEUE_ORDSEQ_POSTFLUSH; |
384 | 385 | ||
385 | if ((rq->flags & REQ_ORDERED_COLOR) == | 386 | if ((rq->cmd_flags & REQ_ORDERED_COLOR) == |
386 | (q->orig_bar_rq->flags & REQ_ORDERED_COLOR)) | 387 | (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) |
387 | return QUEUE_ORDSEQ_DRAIN; | 388 | return QUEUE_ORDSEQ_DRAIN; |
388 | else | 389 | else |
389 | return QUEUE_ORDSEQ_DONE; | 390 | return QUEUE_ORDSEQ_DONE; |
@@ -446,11 +447,11 @@ static void queue_flush(request_queue_t *q, unsigned which) | |||
446 | end_io = post_flush_end_io; | 447 | end_io = post_flush_end_io; |
447 | } | 448 | } |
448 | 449 | ||
450 | rq->cmd_flags = REQ_HARDBARRIER; | ||
449 | rq_init(q, rq); | 451 | rq_init(q, rq); |
450 | rq->flags = REQ_HARDBARRIER; | ||
451 | rq->elevator_private = NULL; | 452 | rq->elevator_private = NULL; |
453 | rq->elevator_private2 = NULL; | ||
452 | rq->rq_disk = q->bar_rq.rq_disk; | 454 | rq->rq_disk = q->bar_rq.rq_disk; |
453 | rq->rl = NULL; | ||
454 | rq->end_io = end_io; | 455 | rq->end_io = end_io; |
455 | q->prepare_flush_fn(q, rq); | 456 | q->prepare_flush_fn(q, rq); |
456 | 457 | ||
@@ -471,11 +472,13 @@ static inline struct request *start_ordered(request_queue_t *q, | |||
471 | blkdev_dequeue_request(rq); | 472 | blkdev_dequeue_request(rq); |
472 | q->orig_bar_rq = rq; | 473 | q->orig_bar_rq = rq; |
473 | rq = &q->bar_rq; | 474 | rq = &q->bar_rq; |
475 | rq->cmd_flags = 0; | ||
474 | rq_init(q, rq); | 476 | rq_init(q, rq); |
475 | rq->flags = bio_data_dir(q->orig_bar_rq->bio); | 477 | if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) |
476 | rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; | 478 | rq->cmd_flags |= REQ_RW; |
479 | rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; | ||
477 | rq->elevator_private = NULL; | 480 | rq->elevator_private = NULL; |
478 | rq->rl = NULL; | 481 | rq->elevator_private2 = NULL; |
479 | init_request_from_bio(rq, q->orig_bar_rq->bio); | 482 | init_request_from_bio(rq, q->orig_bar_rq->bio); |
480 | rq->end_io = bar_end_io; | 483 | rq->end_io = bar_end_io; |
481 | 484 | ||
@@ -587,8 +590,8 @@ static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error) | |||
587 | return 0; | 590 | return 0; |
588 | } | 591 | } |
589 | 592 | ||
590 | static inline int ordered_bio_endio(struct request *rq, struct bio *bio, | 593 | static int ordered_bio_endio(struct request *rq, struct bio *bio, |
591 | unsigned int nbytes, int error) | 594 | unsigned int nbytes, int error) |
592 | { | 595 | { |
593 | request_queue_t *q = rq->q; | 596 | request_queue_t *q = rq->q; |
594 | bio_end_io_t *endio; | 597 | bio_end_io_t *endio; |
@@ -1124,7 +1127,7 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) | |||
1124 | } | 1127 | } |
1125 | 1128 | ||
1126 | list_del_init(&rq->queuelist); | 1129 | list_del_init(&rq->queuelist); |
1127 | rq->flags &= ~REQ_QUEUED; | 1130 | rq->cmd_flags &= ~REQ_QUEUED; |
1128 | rq->tag = -1; | 1131 | rq->tag = -1; |
1129 | 1132 | ||
1130 | if (unlikely(bqt->tag_index[tag] == NULL)) | 1133 | if (unlikely(bqt->tag_index[tag] == NULL)) |
@@ -1160,7 +1163,7 @@ int blk_queue_start_tag(request_queue_t *q, struct request *rq) | |||
1160 | struct blk_queue_tag *bqt = q->queue_tags; | 1163 | struct blk_queue_tag *bqt = q->queue_tags; |
1161 | int tag; | 1164 | int tag; |
1162 | 1165 | ||
1163 | if (unlikely((rq->flags & REQ_QUEUED))) { | 1166 | if (unlikely((rq->cmd_flags & REQ_QUEUED))) { |
1164 | printk(KERN_ERR | 1167 | printk(KERN_ERR |
1165 | "%s: request %p for device [%s] already tagged %d", | 1168 | "%s: request %p for device [%s] already tagged %d", |
1166 | __FUNCTION__, rq, | 1169 | __FUNCTION__, rq, |
@@ -1168,13 +1171,18 @@ int blk_queue_start_tag(request_queue_t *q, struct request *rq) | |||
1168 | BUG(); | 1171 | BUG(); |
1169 | } | 1172 | } |
1170 | 1173 | ||
1171 | tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); | 1174 | /* |
1172 | if (tag >= bqt->max_depth) | 1175 | * Protect against shared tag maps, as we may not have exclusive |
1173 | return 1; | 1176 | * access to the tag map. |
1177 | */ | ||
1178 | do { | ||
1179 | tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); | ||
1180 | if (tag >= bqt->max_depth) | ||
1181 | return 1; | ||
1174 | 1182 | ||
1175 | __set_bit(tag, bqt->tag_map); | 1183 | } while (test_and_set_bit(tag, bqt->tag_map)); |
1176 | 1184 | ||
1177 | rq->flags |= REQ_QUEUED; | 1185 | rq->cmd_flags |= REQ_QUEUED; |
1178 | rq->tag = tag; | 1186 | rq->tag = tag; |
1179 | bqt->tag_index[tag] = rq; | 1187 | bqt->tag_index[tag] = rq; |
1180 | blkdev_dequeue_request(rq); | 1188 | blkdev_dequeue_request(rq); |
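With shared tag maps, find_first_zero_bit() alone is racy: two queues can observe the same free bit. The loop above closes the race by claiming the bit with an atomic test_and_set_bit() and rescanning when it loses. A userspace model using C11 atomics; the kernel's bitops differ in detail:

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_DEPTH     64
    #define BITS_PER_WORD 32

    static _Atomic unsigned tag_map[(MAX_DEPTH + BITS_PER_WORD - 1) / BITS_PER_WORD];

    static int find_first_zero(unsigned max)
    {
        for (unsigned i = 0; i < max; i++)
            if (!(atomic_load(&tag_map[i / BITS_PER_WORD]) &
                  (1u << (i % BITS_PER_WORD))))
                return (int)i;
        return (int)max;
    }

    /* Atomically set the bit; a nonzero return means it was already
     * set (someone else won the race), as with test_and_set_bit(). */
    static int test_and_set(unsigned nr)
    {
        unsigned mask = 1u << (nr % BITS_PER_WORD);

        return atomic_fetch_or(&tag_map[nr / BITS_PER_WORD], mask) & mask;
    }

    static int start_tag(void)
    {
        int tag;

        do {
            tag = find_first_zero(MAX_DEPTH);
            if (tag >= MAX_DEPTH)
                return -1;                    /* map exhausted */
        } while (test_and_set((unsigned)tag)); /* lost the race: rescan */

        return tag;
    }

    int main(void)
    {
        printf("got tag %d\n", start_tag());
        printf("got tag %d\n", start_tag());
        return 0;
    }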
@@ -1210,65 +1218,31 @@ void blk_queue_invalidate_tags(request_queue_t *q) | |||
1210 | printk(KERN_ERR | 1218 | printk(KERN_ERR |
1211 | "%s: bad tag found on list\n", __FUNCTION__); | 1219 | "%s: bad tag found on list\n", __FUNCTION__); |
1212 | list_del_init(&rq->queuelist); | 1220 | list_del_init(&rq->queuelist); |
1213 | rq->flags &= ~REQ_QUEUED; | 1221 | rq->cmd_flags &= ~REQ_QUEUED; |
1214 | } else | 1222 | } else |
1215 | blk_queue_end_tag(q, rq); | 1223 | blk_queue_end_tag(q, rq); |
1216 | 1224 | ||
1217 | rq->flags &= ~REQ_STARTED; | 1225 | rq->cmd_flags &= ~REQ_STARTED; |
1218 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | 1226 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); |
1219 | } | 1227 | } |
1220 | } | 1228 | } |
1221 | 1229 | ||
1222 | EXPORT_SYMBOL(blk_queue_invalidate_tags); | 1230 | EXPORT_SYMBOL(blk_queue_invalidate_tags); |
1223 | 1231 | ||
1224 | static const char * const rq_flags[] = { | ||
1225 | "REQ_RW", | ||
1226 | "REQ_FAILFAST", | ||
1227 | "REQ_SORTED", | ||
1228 | "REQ_SOFTBARRIER", | ||
1229 | "REQ_HARDBARRIER", | ||
1230 | "REQ_FUA", | ||
1231 | "REQ_CMD", | ||
1232 | "REQ_NOMERGE", | ||
1233 | "REQ_STARTED", | ||
1234 | "REQ_DONTPREP", | ||
1235 | "REQ_QUEUED", | ||
1236 | "REQ_ELVPRIV", | ||
1237 | "REQ_PC", | ||
1238 | "REQ_BLOCK_PC", | ||
1239 | "REQ_SENSE", | ||
1240 | "REQ_FAILED", | ||
1241 | "REQ_QUIET", | ||
1242 | "REQ_SPECIAL", | ||
1243 | "REQ_DRIVE_CMD", | ||
1244 | "REQ_DRIVE_TASK", | ||
1245 | "REQ_DRIVE_TASKFILE", | ||
1246 | "REQ_PREEMPT", | ||
1247 | "REQ_PM_SUSPEND", | ||
1248 | "REQ_PM_RESUME", | ||
1249 | "REQ_PM_SHUTDOWN", | ||
1250 | "REQ_ORDERED_COLOR", | ||
1251 | }; | ||
1252 | |||
1253 | void blk_dump_rq_flags(struct request *rq, char *msg) | 1232 | void blk_dump_rq_flags(struct request *rq, char *msg) |
1254 | { | 1233 | { |
1255 | int bit; | 1234 | int bit; |
1256 | 1235 | ||
1257 | printk("%s: dev %s: flags = ", msg, | 1236 | printk("%s: dev %s: type=%x, flags=%x\n", msg, |
1258 | rq->rq_disk ? rq->rq_disk->disk_name : "?"); | 1237 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, |
1259 | bit = 0; | 1238 | rq->cmd_flags); |
1260 | do { | ||
1261 | if (rq->flags & (1 << bit)) | ||
1262 | printk("%s ", rq_flags[bit]); | ||
1263 | bit++; | ||
1264 | } while (bit < __REQ_NR_BITS); | ||
1265 | 1239 | ||
1266 | printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, | 1240 | printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, |
1267 | rq->nr_sectors, | 1241 | rq->nr_sectors, |
1268 | rq->current_nr_sectors); | 1242 | rq->current_nr_sectors); |
1269 | printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); | 1243 | printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); |
1270 | 1244 | ||
1271 | if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) { | 1245 | if (blk_pc_request(rq)) { |
1272 | printk("cdb: "); | 1246 | printk("cdb: "); |
1273 | for (bit = 0; bit < sizeof(rq->cmd); bit++) | 1247 | for (bit = 0; bit < sizeof(rq->cmd); bit++) |
1274 | printk("%02x ", rq->cmd[bit]); | 1248 | printk("%02x ", rq->cmd[bit]); |
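The old rq->flags word mixed mutually exclusive request kinds (REQ_CMD, REQ_PC, ...) with independent modifier bits; the replacement splits them into an enum cmd_type plus a cmd_flags bitmask, so a predicate such as blk_pc_request() becomes a single type comparison and blk_dump_rq_flags() can simply print both raw fields. A sketch of the shape, with names modeled on (not copied from) blkdev.h:

    #include <stdio.h>

    /* Mutually exclusive request kinds: exactly one applies. */
    enum cmd_type { TYPE_FS = 1, TYPE_BLOCK_PC, TYPE_SPECIAL };

    /* Independent modifiers: any combination may apply. */
    enum {
        FLAG_RW          = 1 << 0,
        FLAG_SOFTBARRIER = 1 << 1,
        FLAG_NOMERGE     = 1 << 2,
    };

    struct request {
        enum cmd_type cmd_type;
        unsigned int  cmd_flags;
    };

    /* Type predicates collapse to one comparison each. */
    static int is_fs_request(const struct request *rq)
    {
        return rq->cmd_type == TYPE_FS;
    }
    static int is_pc_request(const struct request *rq)
    {
        return rq->cmd_type == TYPE_BLOCK_PC;
    }

    static void dump_rq(const struct request *rq, const char *msg)
    {
        printf("%s: type=%x, flags=%x\n", msg,
               (unsigned)rq->cmd_type, rq->cmd_flags);
    }

    int main(void)
    {
        struct request rq = { .cmd_type  = TYPE_FS,
                              .cmd_flags = FLAG_RW | FLAG_NOMERGE };

        dump_rq(&rq, "demo");
        printf("fs=%d pc=%d\n", is_fs_request(&rq), is_pc_request(&rq));
        return 0;
    }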
@@ -1441,7 +1415,7 @@ static inline int ll_new_mergeable(request_queue_t *q, | |||
1441 | int nr_phys_segs = bio_phys_segments(q, bio); | 1415 | int nr_phys_segs = bio_phys_segments(q, bio); |
1442 | 1416 | ||
1443 | if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { | 1417 | if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { |
1444 | req->flags |= REQ_NOMERGE; | 1418 | req->cmd_flags |= REQ_NOMERGE; |
1445 | if (req == q->last_merge) | 1419 | if (req == q->last_merge) |
1446 | q->last_merge = NULL; | 1420 | q->last_merge = NULL; |
1447 | return 0; | 1421 | return 0; |
@@ -1464,7 +1438,7 @@ static inline int ll_new_hw_segment(request_queue_t *q, | |||
1464 | 1438 | ||
1465 | if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments | 1439 | if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments |
1466 | || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { | 1440 | || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { |
1467 | req->flags |= REQ_NOMERGE; | 1441 | req->cmd_flags |= REQ_NOMERGE; |
1468 | if (req == q->last_merge) | 1442 | if (req == q->last_merge) |
1469 | q->last_merge = NULL; | 1443 | q->last_merge = NULL; |
1470 | return 0; | 1444 | return 0; |
@@ -1491,7 +1465,7 @@ static int ll_back_merge_fn(request_queue_t *q, struct request *req, | |||
1491 | max_sectors = q->max_sectors; | 1465 | max_sectors = q->max_sectors; |
1492 | 1466 | ||
1493 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { | 1467 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { |
1494 | req->flags |= REQ_NOMERGE; | 1468 | req->cmd_flags |= REQ_NOMERGE; |
1495 | if (req == q->last_merge) | 1469 | if (req == q->last_merge) |
1496 | q->last_merge = NULL; | 1470 | q->last_merge = NULL; |
1497 | return 0; | 1471 | return 0; |
@@ -1530,7 +1504,7 @@ static int ll_front_merge_fn(request_queue_t *q, struct request *req, | |||
1530 | 1504 | ||
1531 | 1505 | ||
1532 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { | 1506 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { |
1533 | req->flags |= REQ_NOMERGE; | 1507 | req->cmd_flags |= REQ_NOMERGE; |
1534 | if (req == q->last_merge) | 1508 | if (req == q->last_merge) |
1535 | q->last_merge = NULL; | 1509 | q->last_merge = NULL; |
1536 | return 0; | 1510 | return 0; |
@@ -1847,8 +1821,7 @@ static void blk_release_queue(struct kobject *kobj) | |||
1847 | if (q->queue_tags) | 1821 | if (q->queue_tags) |
1848 | __blk_queue_free_tags(q); | 1822 | __blk_queue_free_tags(q); |
1849 | 1823 | ||
1850 | if (q->blk_trace) | 1824 | blk_trace_shutdown(q); |
1851 | blk_trace_shutdown(q); | ||
1852 | 1825 | ||
1853 | kmem_cache_free(requestq_cachep, q); | 1826 | kmem_cache_free(requestq_cachep, q); |
1854 | } | 1827 | } |
@@ -2030,14 +2003,13 @@ EXPORT_SYMBOL(blk_get_queue); | |||
2030 | 2003 | ||
2031 | static inline void blk_free_request(request_queue_t *q, struct request *rq) | 2004 | static inline void blk_free_request(request_queue_t *q, struct request *rq) |
2032 | { | 2005 | { |
2033 | if (rq->flags & REQ_ELVPRIV) | 2006 | if (rq->cmd_flags & REQ_ELVPRIV) |
2034 | elv_put_request(q, rq); | 2007 | elv_put_request(q, rq); |
2035 | mempool_free(rq, q->rq.rq_pool); | 2008 | mempool_free(rq, q->rq.rq_pool); |
2036 | } | 2009 | } |
2037 | 2010 | ||
2038 | static inline struct request * | 2011 | static struct request * |
2039 | blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, | 2012 | blk_alloc_request(request_queue_t *q, int rw, int priv, gfp_t gfp_mask) |
2040 | int priv, gfp_t gfp_mask) | ||
2041 | { | 2013 | { |
2042 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | 2014 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); |
2043 | 2015 | ||
@@ -2045,17 +2017,17 @@ blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, | |||
2045 | return NULL; | 2017 | return NULL; |
2046 | 2018 | ||
2047 | /* | 2019 | /* |
2048 | * first three bits are identical in rq->flags and bio->bi_rw, | 2020 | * first three bits are identical in rq->cmd_flags and bio->bi_rw, |
2049 | * see bio.h and blkdev.h | 2021 | * see bio.h and blkdev.h |
2050 | */ | 2022 | */ |
2051 | rq->flags = rw; | 2023 | rq->cmd_flags = rw | REQ_ALLOCED; |
2052 | 2024 | ||
2053 | if (priv) { | 2025 | if (priv) { |
2054 | if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { | 2026 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { |
2055 | mempool_free(rq, q->rq.rq_pool); | 2027 | mempool_free(rq, q->rq.rq_pool); |
2056 | return NULL; | 2028 | return NULL; |
2057 | } | 2029 | } |
2058 | rq->flags |= REQ_ELVPRIV; | 2030 | rq->cmd_flags |= REQ_ELVPRIV; |
2059 | } | 2031 | } |
2060 | 2032 | ||
2061 | return rq; | 2033 | return rq; |
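Requests used to carry a back-pointer, rq->rl, to the request_list they were drawn from, purely so the free path could tell pool-allocated requests from ones embedded in driver structures. The REQ_ALLOCED bit set here replaces that pointer; __blk_put_request() later tests the flag instead. A sketch of flag-based ownership, names hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    #define F_ALLOCED (1u << 0)   /* came from our pool, goes back there */

    struct request { unsigned int flags; };

    static struct request *pool_alloc(void)
    {
        struct request *rq = calloc(1, sizeof(*rq));

        if (rq)
            rq->flags |= F_ALLOCED;   /* mark origin at allocation time */
        return rq;
    }

    static void put_request(struct request *rq)
    {
        /* A flag test replaces chasing a back-pointer: requests
         * embedded in driver structures never have F_ALLOCED set. */
        if (rq->flags & F_ALLOCED)
            free(rq);
    }

    int main(void)
    {
        struct request stack_rq = { 0 };     /* embedded: not pool-owned */
        struct request *rq = pool_alloc();

        put_request(&stack_rq);              /* correctly a no-op */
        if (rq)
            put_request(rq);                 /* returned to the pool */
        puts("done");
        return 0;
    }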
@@ -2142,13 +2114,13 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, | |||
2142 | struct io_context *ioc = NULL; | 2114 | struct io_context *ioc = NULL; |
2143 | int may_queue, priv; | 2115 | int may_queue, priv; |
2144 | 2116 | ||
2145 | may_queue = elv_may_queue(q, rw, bio); | 2117 | may_queue = elv_may_queue(q, rw); |
2146 | if (may_queue == ELV_MQUEUE_NO) | 2118 | if (may_queue == ELV_MQUEUE_NO) |
2147 | goto rq_starved; | 2119 | goto rq_starved; |
2148 | 2120 | ||
2149 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { | 2121 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { |
2150 | if (rl->count[rw]+1 >= q->nr_requests) { | 2122 | if (rl->count[rw]+1 >= q->nr_requests) { |
2151 | ioc = current_io_context(GFP_ATOMIC); | 2123 | ioc = current_io_context(GFP_ATOMIC, q->node); |
2152 | /* | 2124 | /* |
2153 | * The queue will fill after this allocation, so set | 2125 | * The queue will fill after this allocation, so set |
2154 | * it as full, and mark this process as "batching". | 2126 | * it as full, and mark this process as "batching". |
@@ -2190,7 +2162,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, | |||
2190 | 2162 | ||
2191 | spin_unlock_irq(q->queue_lock); | 2163 | spin_unlock_irq(q->queue_lock); |
2192 | 2164 | ||
2193 | rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); | 2165 | rq = blk_alloc_request(q, rw, priv, gfp_mask); |
2194 | if (unlikely(!rq)) { | 2166 | if (unlikely(!rq)) { |
2195 | /* | 2167 | /* |
2196 | * Allocation failed presumably due to memory. Undo anything | 2168 | * Allocation failed presumably due to memory. Undo anything |
@@ -2226,7 +2198,6 @@ rq_starved: | |||
2226 | ioc->nr_batch_requests--; | 2198 | ioc->nr_batch_requests--; |
2227 | 2199 | ||
2228 | rq_init(q, rq); | 2200 | rq_init(q, rq); |
2229 | rq->rl = rl; | ||
2230 | 2201 | ||
2231 | blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); | 2202 | blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); |
2232 | out: | 2203 | out: |
@@ -2269,7 +2240,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw, | |||
2269 | * up to a big batch of them for a small period time. | 2240 | * up to a big batch of them for a small period time. |
2270 | * See ioc_batching, ioc_set_batching | 2241 | * See ioc_batching, ioc_set_batching |
2271 | */ | 2242 | */ |
2272 | ioc = current_io_context(GFP_NOIO); | 2243 | ioc = current_io_context(GFP_NOIO, q->node); |
2273 | ioc_set_batching(q, ioc); | 2244 | ioc_set_batching(q, ioc); |
2274 | 2245 | ||
2275 | spin_lock_irq(q->queue_lock); | 2246 | spin_lock_irq(q->queue_lock); |
@@ -2301,6 +2272,25 @@ struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask) | |||
2301 | EXPORT_SYMBOL(blk_get_request); | 2272 | EXPORT_SYMBOL(blk_get_request); |
2302 | 2273 | ||
2303 | /** | 2274 | /** |
2275 | * blk_start_queueing - initiate dispatch of requests to device | ||
2276 | * @q: request queue to kick into gear | ||
2277 | * | ||
2278 | * This is basically a helper to remove the need to know whether a queue | ||
2279 | * is plugged or not when all the caller wants is to initiate dispatch of | ||
2280 | * requests for this queue. | ||
2281 | * | ||
2282 | * The queue lock must be held with interrupts disabled. | ||
2283 | */ | ||
2284 | void blk_start_queueing(request_queue_t *q) | ||
2285 | { | ||
2286 | if (!blk_queue_plugged(q)) | ||
2287 | q->request_fn(q); | ||
2288 | else | ||
2289 | __generic_unplug_device(q); | ||
2290 | } | ||
2291 | EXPORT_SYMBOL(blk_start_queueing); | ||
2292 | |||
2293 | /** | ||
2304 | * blk_requeue_request - put a request back on queue | 2294 | * blk_requeue_request - put a request back on queue |
2305 | * @q: request queue where request should be inserted | 2295 | * @q: request queue where request should be inserted |
2306 | * @rq: request to be inserted | 2296 | * @rq: request to be inserted |
@@ -2352,7 +2342,8 @@ void blk_insert_request(request_queue_t *q, struct request *rq, | |||
2352 | * must not attempt merges on this) and that it acts as a soft | 2342 | * must not attempt merges on this) and that it acts as a soft |
2353 | * barrier | 2343 | * barrier |
2354 | */ | 2344 | */ |
2355 | rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER; | 2345 | rq->cmd_type = REQ_TYPE_SPECIAL; |
2346 | rq->cmd_flags |= REQ_SOFTBARRIER; | ||
2356 | 2347 | ||
2357 | rq->special = data; | 2348 | rq->special = data; |
2358 | 2349 | ||
@@ -2366,11 +2357,7 @@ void blk_insert_request(request_queue_t *q, struct request *rq, | |||
2366 | 2357 | ||
2367 | drive_stat_acct(rq, rq->nr_sectors, 1); | 2358 | drive_stat_acct(rq, rq->nr_sectors, 1); |
2368 | __elv_add_request(q, rq, where, 0); | 2359 | __elv_add_request(q, rq, where, 0); |
2369 | 2360 | blk_start_queueing(q); | |
2370 | if (blk_queue_plugged(q)) | ||
2371 | __generic_unplug_device(q); | ||
2372 | else | ||
2373 | q->request_fn(q); | ||
2374 | spin_unlock_irqrestore(q->queue_lock, flags); | 2361 | spin_unlock_irqrestore(q->queue_lock, flags); |
2375 | } | 2362 | } |
2376 | 2363 | ||
@@ -2559,7 +2546,7 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, | |||
2559 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | 2546 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; |
2560 | 2547 | ||
2561 | rq->rq_disk = bd_disk; | 2548 | rq->rq_disk = bd_disk; |
2562 | rq->flags |= REQ_NOMERGE; | 2549 | rq->cmd_flags |= REQ_NOMERGE; |
2563 | rq->end_io = done; | 2550 | rq->end_io = done; |
2564 | WARN_ON(irqs_disabled()); | 2551 | WARN_ON(irqs_disabled()); |
2565 | spin_lock_irq(q->queue_lock); | 2552 | spin_lock_irq(q->queue_lock); |
@@ -2599,10 +2586,9 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, | |||
2599 | rq->sense_len = 0; | 2586 | rq->sense_len = 0; |
2600 | } | 2587 | } |
2601 | 2588 | ||
2602 | rq->waiting = &wait; | 2589 | rq->end_io_data = &wait; |
2603 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); | 2590 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); |
2604 | wait_for_completion(&wait); | 2591 | wait_for_completion(&wait); |
2605 | rq->waiting = NULL; | ||
2606 | 2592 | ||
2607 | if (rq->errors) | 2593 | if (rq->errors) |
2608 | err = -EIO; | 2594 | err = -EIO; |
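blk_execute_rq() no longer needs the dedicated rq->waiting pointer: it parks the completion in the general-purpose end_io_data field and blk_end_sync_rq() fishes it back out. A userspace model of that sync-over-async pattern, with a pthread-based stand-in for the kernel's completion API:

    #include <pthread.h>
    #include <stdio.h>

    struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             done;
    };

    struct request {
        void (*end_io)(struct request *);
        void *end_io_data;            /* opaque: here, the completion */
    };

    static void complete(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
    }

    static void wait_for_completion(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        while (!c->done)
            pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
    }

    /* Counterpart of blk_end_sync_rq(): recover the completion. */
    static void end_sync_rq(struct request *rq)
    {
        struct completion *waiting = rq->end_io_data;

        rq->end_io_data = NULL;
        complete(waiting);
    }

    /* Stands in for the driver finishing the I/O asynchronously. */
    static void *io_thread(void *arg)
    {
        struct request *rq = arg;

        rq->end_io(rq);
        return NULL;
    }

    int main(void)
    {
        struct completion wait = { PTHREAD_MUTEX_INITIALIZER,
                                   PTHREAD_COND_INITIALIZER, 0 };
        struct request rq = { .end_io = end_sync_rq, .end_io_data = &wait };
        pthread_t t;

        pthread_create(&t, NULL, io_thread, &rq);  /* "submit" async */
        wait_for_completion(&wait);                /* block until end_io */
        pthread_join(t, NULL);
        puts("request completed");
        return 0;
    }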
@@ -2711,8 +2697,6 @@ EXPORT_SYMBOL_GPL(disk_round_stats); | |||
2711 | */ | 2697 | */ |
2712 | void __blk_put_request(request_queue_t *q, struct request *req) | 2698 | void __blk_put_request(request_queue_t *q, struct request *req) |
2713 | { | 2699 | { |
2714 | struct request_list *rl = req->rl; | ||
2715 | |||
2716 | if (unlikely(!q)) | 2700 | if (unlikely(!q)) |
2717 | return; | 2701 | return; |
2718 | if (unlikely(--req->ref_count)) | 2702 | if (unlikely(--req->ref_count)) |
@@ -2720,18 +2704,16 @@ void __blk_put_request(request_queue_t *q, struct request *req) | |||
2720 | 2704 | ||
2721 | elv_completed_request(q, req); | 2705 | elv_completed_request(q, req); |
2722 | 2706 | ||
2723 | req->rq_status = RQ_INACTIVE; | ||
2724 | req->rl = NULL; | ||
2725 | |||
2726 | /* | 2707 | /* |
2727 | * Request may not have originated from ll_rw_blk. If not, | 2708 | * Request may not have originated from ll_rw_blk. If not, |
2728 | * it didn't come out of our reserved rq pools | 2709 | * it didn't come out of our reserved rq pools |
2729 | */ | 2710 | */ |
2730 | if (rl) { | 2711 | if (req->cmd_flags & REQ_ALLOCED) { |
2731 | int rw = rq_data_dir(req); | 2712 | int rw = rq_data_dir(req); |
2732 | int priv = req->flags & REQ_ELVPRIV; | 2713 | int priv = req->cmd_flags & REQ_ELVPRIV; |
2733 | 2714 | ||
2734 | BUG_ON(!list_empty(&req->queuelist)); | 2715 | BUG_ON(!list_empty(&req->queuelist)); |
2716 | BUG_ON(!hlist_unhashed(&req->hash)); | ||
2735 | 2717 | ||
2736 | blk_free_request(q, req); | 2718 | blk_free_request(q, req); |
2737 | freed_request(q, rw, priv); | 2719 | freed_request(q, rw, priv); |
@@ -2765,9 +2747,9 @@ EXPORT_SYMBOL(blk_put_request); | |||
2765 | */ | 2747 | */ |
2766 | void blk_end_sync_rq(struct request *rq, int error) | 2748 | void blk_end_sync_rq(struct request *rq, int error) |
2767 | { | 2749 | { |
2768 | struct completion *waiting = rq->waiting; | 2750 | struct completion *waiting = rq->end_io_data; |
2769 | 2751 | ||
2770 | rq->waiting = NULL; | 2752 | rq->end_io_data = NULL; |
2771 | __blk_put_request(rq->q, rq); | 2753 | __blk_put_request(rq->q, rq); |
2772 | 2754 | ||
2773 | /* | 2755 | /* |
@@ -2830,7 +2812,7 @@ static int attempt_merge(request_queue_t *q, struct request *req, | |||
2830 | 2812 | ||
2831 | if (rq_data_dir(req) != rq_data_dir(next) | 2813 | if (rq_data_dir(req) != rq_data_dir(next) |
2832 | || req->rq_disk != next->rq_disk | 2814 | || req->rq_disk != next->rq_disk |
2833 | || next->waiting || next->special) | 2815 | || next->special) |
2834 | return 0; | 2816 | return 0; |
2835 | 2817 | ||
2836 | /* | 2818 | /* |
@@ -2891,22 +2873,24 @@ static inline int attempt_front_merge(request_queue_t *q, struct request *rq) | |||
2891 | 2873 | ||
2892 | static void init_request_from_bio(struct request *req, struct bio *bio) | 2874 | static void init_request_from_bio(struct request *req, struct bio *bio) |
2893 | { | 2875 | { |
2894 | req->flags |= REQ_CMD; | 2876 | req->cmd_type = REQ_TYPE_FS; |
2895 | 2877 | ||
2896 | /* | 2878 | /* |
2897 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) | 2879 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) |
2898 | */ | 2880 | */ |
2899 | if (bio_rw_ahead(bio) || bio_failfast(bio)) | 2881 | if (bio_rw_ahead(bio) || bio_failfast(bio)) |
2900 | req->flags |= REQ_FAILFAST; | 2882 | req->cmd_flags |= REQ_FAILFAST; |
2901 | 2883 | ||
2902 | /* | 2884 | /* |
2903 | * REQ_BARRIER implies no merging, but let's make it explicit | 2885 | * REQ_BARRIER implies no merging, but let's make it explicit |
2904 | */ | 2886 | */ |
2905 | if (unlikely(bio_barrier(bio))) | 2887 | if (unlikely(bio_barrier(bio))) |
2906 | req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); | 2888 | req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); |
2907 | 2889 | ||
2908 | if (bio_sync(bio)) | 2890 | if (bio_sync(bio)) |
2909 | req->flags |= REQ_RW_SYNC; | 2891 | req->cmd_flags |= REQ_RW_SYNC; |
2892 | if (bio_rw_meta(bio)) | ||
2893 | req->cmd_flags |= REQ_RW_META; | ||
2910 | 2894 | ||
2911 | req->errors = 0; | 2895 | req->errors = 0; |
2912 | req->hard_sector = req->sector = bio->bi_sector; | 2896 | req->hard_sector = req->sector = bio->bi_sector; |
@@ -2915,7 +2899,6 @@ static void init_request_from_bio(struct request *req, struct bio *bio) | |||
2915 | req->nr_phys_segments = bio_phys_segments(req->q, bio); | 2899 | req->nr_phys_segments = bio_phys_segments(req->q, bio); |
2916 | req->nr_hw_segments = bio_hw_segments(req->q, bio); | 2900 | req->nr_hw_segments = bio_hw_segments(req->q, bio); |
2917 | req->buffer = bio_data(bio); /* see ->buffer comment above */ | 2901 | req->buffer = bio_data(bio); /* see ->buffer comment above */ |
2918 | req->waiting = NULL; | ||
2919 | req->bio = req->biotail = bio; | 2902 | req->bio = req->biotail = bio; |
2920 | req->ioprio = bio_prio(bio); | 2903 | req->ioprio = bio_prio(bio); |
2921 | req->rq_disk = bio->bi_bdev->bd_disk; | 2904 | req->rq_disk = bio->bi_bdev->bd_disk; |
@@ -2925,17 +2908,11 @@ static void init_request_from_bio(struct request *req, struct bio *bio) | |||
2925 | static int __make_request(request_queue_t *q, struct bio *bio) | 2908 | static int __make_request(request_queue_t *q, struct bio *bio) |
2926 | { | 2909 | { |
2927 | struct request *req; | 2910 | struct request *req; |
2928 | int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; | 2911 | int el_ret, nr_sectors, barrier, err; |
2929 | unsigned short prio; | 2912 | const unsigned short prio = bio_prio(bio); |
2930 | sector_t sector; | 2913 | const int sync = bio_sync(bio); |
2931 | 2914 | ||
2932 | sector = bio->bi_sector; | ||
2933 | nr_sectors = bio_sectors(bio); | 2915 | nr_sectors = bio_sectors(bio); |
2934 | cur_nr_sectors = bio_cur_sectors(bio); | ||
2935 | prio = bio_prio(bio); | ||
2936 | |||
2937 | rw = bio_data_dir(bio); | ||
2938 | sync = bio_sync(bio); | ||
2939 | 2916 | ||
2940 | /* | 2917 | /* |
2941 | * low level driver can indicate that it wants pages above a | 2918 | * low level driver can indicate that it wants pages above a |
@@ -2944,8 +2921,6 @@ static int __make_request(request_queue_t *q, struct bio *bio) | |||
2944 | */ | 2921 | */ |
2945 | blk_queue_bounce(q, &bio); | 2922 | blk_queue_bounce(q, &bio); |
2946 | 2923 | ||
2947 | spin_lock_prefetch(q->queue_lock); | ||
2948 | |||
2949 | barrier = bio_barrier(bio); | 2924 | barrier = bio_barrier(bio); |
2950 | if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { | 2925 | if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { |
2951 | err = -EOPNOTSUPP; | 2926 | err = -EOPNOTSUPP; |
@@ -2973,7 +2948,7 @@ static int __make_request(request_queue_t *q, struct bio *bio) | |||
2973 | req->ioprio = ioprio_best(req->ioprio, prio); | 2948 | req->ioprio = ioprio_best(req->ioprio, prio); |
2974 | drive_stat_acct(req, nr_sectors, 0); | 2949 | drive_stat_acct(req, nr_sectors, 0); |
2975 | if (!attempt_back_merge(q, req)) | 2950 | if (!attempt_back_merge(q, req)) |
2976 | elv_merged_request(q, req); | 2951 | elv_merged_request(q, req, el_ret); |
2977 | goto out; | 2952 | goto out; |
2978 | 2953 | ||
2979 | case ELEVATOR_FRONT_MERGE: | 2954 | case ELEVATOR_FRONT_MERGE: |
@@ -2993,14 +2968,14 @@ static int __make_request(request_queue_t *q, struct bio *bio) | |||
2993 | * not touch req->buffer either... | 2968 | * not touch req->buffer either... |
2994 | */ | 2969 | */ |
2995 | req->buffer = bio_data(bio); | 2970 | req->buffer = bio_data(bio); |
2996 | req->current_nr_sectors = cur_nr_sectors; | 2971 | req->current_nr_sectors = bio_cur_sectors(bio); |
2997 | req->hard_cur_sectors = cur_nr_sectors; | 2972 | req->hard_cur_sectors = req->current_nr_sectors; |
2998 | req->sector = req->hard_sector = sector; | 2973 | req->sector = req->hard_sector = bio->bi_sector; |
2999 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | 2974 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; |
3000 | req->ioprio = ioprio_best(req->ioprio, prio); | 2975 | req->ioprio = ioprio_best(req->ioprio, prio); |
3001 | drive_stat_acct(req, nr_sectors, 0); | 2976 | drive_stat_acct(req, nr_sectors, 0); |
3002 | if (!attempt_front_merge(q, req)) | 2977 | if (!attempt_front_merge(q, req)) |
3003 | elv_merged_request(q, req); | 2978 | elv_merged_request(q, req, el_ret); |
3004 | goto out; | 2979 | goto out; |
3005 | 2980 | ||
3006 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | 2981 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ |
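Both merge cases above now pass el_ret into elv_merged_request(q, req, el_ret). The plausible reason, modeled below: only a front merge moves a request's start sector, so only then does a sector-sorted elevator need to reposition the request in its ordering. This is a userspace sketch of that distinction, with invented names and bookkeeping, not the kernel's elevator code:

/*
 * Model of why the merge type matters to the elevator: a back merge
 * leaves the start sector alone, a front merge moves it down.
 */
#include <stdio.h>

enum merge_type { ELEVATOR_NO_MERGE, ELEVATOR_FRONT_MERGE, ELEVATOR_BACK_MERGE };

struct req_model { unsigned long long sector; unsigned long nr_sectors; };

static void elv_merged_request_model(struct req_model *rq, enum merge_type type)
{
    if (type == ELEVATOR_FRONT_MERGE)
        printf("front merge: start moved to %llu, re-sort needed\n", rq->sector);
    else
        printf("back merge: start unchanged at %llu, no re-sort\n", rq->sector);
}

int main(void)
{
    struct req_model rq = { .sector = 1024, .nr_sectors = 8 };

    /* Back merge: new bio appended after the request's current end. */
    rq.nr_sectors += 8;
    elv_merged_request_model(&rq, ELEVATOR_BACK_MERGE);

    /* Front merge: new bio prepended, start sector moves down. */
    rq.sector -= 8;
    rq.nr_sectors += 8;
    elv_merged_request_model(&rq, ELEVATOR_FRONT_MERGE);
    return 0;
}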
@@ -3013,7 +2988,7 @@ get_rq: | |||
3013 | * Grab a free request. This might sleep but cannot fail. | 2988 | * Grab a free request. This might sleep but cannot fail. |
3014 | * Returns with the queue unlocked. | 2989 | * Returns with the queue unlocked. |
3015 | */ | 2990 | */ |
3016 | req = get_request_wait(q, rw, bio); | 2991 | req = get_request_wait(q, bio_data_dir(bio), bio); |
3017 | 2992 | ||
3018 | /* | 2993 | /* |
3019 | * After dropping the lock and possibly sleeping here, our request | 2994 | * After dropping the lock and possibly sleeping here, our request |
@@ -3307,7 +3282,7 @@ static int __end_that_request_first(struct request *req, int uptodate, | |||
3307 | req->errors = 0; | 3282 | req->errors = 0; |
3308 | 3283 | ||
3309 | if (!uptodate) { | 3284 | if (!uptodate) { |
3310 | if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) | 3285 | if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) |
3311 | printk("end_request: I/O error, dev %s, sector %llu\n", | 3286 | printk("end_request: I/O error, dev %s, sector %llu\n", |
3312 | req->rq_disk ? req->rq_disk->disk_name : "?", | 3287 | req->rq_disk ? req->rq_disk->disk_name : "?", |
3313 | (unsigned long long)req->sector); | 3288 | (unsigned long long)req->sector); |
@@ -3570,8 +3545,8 @@ EXPORT_SYMBOL(end_request); | |||
3570 | 3545 | ||
3571 | void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) | 3546 | void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) |
3572 | { | 3547 | { |
3573 | /* first two bits are identical in rq->flags and bio->bi_rw */ | 3548 | /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ |
3574 | rq->flags |= (bio->bi_rw & 3); | 3549 | rq->cmd_flags |= (bio->bi_rw & 3); |
3575 | 3550 | ||
3576 | rq->nr_phys_segments = bio_phys_segments(q, bio); | 3551 | rq->nr_phys_segments = bio_phys_segments(q, bio); |
3577 | rq->nr_hw_segments = bio_hw_segments(q, bio); | 3552 | rq->nr_hw_segments = bio_hw_segments(q, bio); |
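blk_rq_bio_prep() relies on the low two bits of bio->bi_rw and rq->cmd_flags coinciding, so `& 3` copies exactly the shared bits and nothing else. Bit 0 is the data direction; the meaning of the second shared bit is left abstract in this sketch, since only the masking idiom is being illustrated and the bit assignments below are assumptions:

/*
 * The "& 3" idiom: copy only the low bits two flag spaces agree on,
 * so private bits of one space never leak into the other.
 */
#include <assert.h>

#define SHARED_BIT_WRITE (1u << 0)  /* bit 0: write vs read (shared) */
#define SHARED_BIT_1     (1u << 1)  /* bit 1: second shared bit, meaning assumed */

int main(void)
{
    unsigned int bi_rw = SHARED_BIT_WRITE | (1u << 5); /* bit 5: bio-private */
    unsigned int cmd_flags = 0;

    /* Copy only the shared low bits; the bio-private bit must not leak. */
    cmd_flags |= (bi_rw & 3);

    assert(cmd_flags == SHARED_BIT_WRITE);
    return 0;
}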
@@ -3659,25 +3634,22 @@ EXPORT_SYMBOL(put_io_context); | |||
3659 | /* Called by the exiting task */ | 3634 | /* Called by the exiting task */ |
3660 | void exit_io_context(void) | 3635 | void exit_io_context(void) |
3661 | { | 3636 | { |
3662 | unsigned long flags; | ||
3663 | struct io_context *ioc; | 3637 | struct io_context *ioc; |
3664 | struct cfq_io_context *cic; | 3638 | struct cfq_io_context *cic; |
3665 | 3639 | ||
3666 | local_irq_save(flags); | ||
3667 | task_lock(current); | 3640 | task_lock(current); |
3668 | ioc = current->io_context; | 3641 | ioc = current->io_context; |
3669 | current->io_context = NULL; | 3642 | current->io_context = NULL; |
3670 | ioc->task = NULL; | ||
3671 | task_unlock(current); | 3643 | task_unlock(current); |
3672 | local_irq_restore(flags); | ||
3673 | 3644 | ||
3645 | ioc->task = NULL; | ||
3674 | if (ioc->aic && ioc->aic->exit) | 3646 | if (ioc->aic && ioc->aic->exit) |
3675 | ioc->aic->exit(ioc->aic); | 3647 | ioc->aic->exit(ioc->aic); |
3676 | if (ioc->cic_root.rb_node != NULL) { | 3648 | if (ioc->cic_root.rb_node != NULL) { |
3677 | cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); | 3649 | cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); |
3678 | cic->exit(ioc); | 3650 | cic->exit(ioc); |
3679 | } | 3651 | } |
3680 | 3652 | ||
3681 | put_io_context(ioc); | 3653 | put_io_context(ioc); |
3682 | } | 3654 | } |
3683 | 3655 | ||
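The exit_io_context() hunk shrinks the locked region: only detaching the context from the task needs the task lock (and the irq save/restore is dropped entirely), while clearing ioc->task, running the scheduler exit hooks, and the final put happen outside it. A userspace sketch of that shape, with the task lock modeled as a plain mutex; every name here is illustrative, not kernel API:

/*
 * Model of "detach under the lock, tear down outside it" with a
 * reference-counted context. Single-threaded demo, so the bare
 * refcount decrement is safe here.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_context_model {
    int refcount;
    void (*exit_hook)(struct io_context_model *);
};

static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_context_model *current_ioc;

static void put_io_context_model(struct io_context_model *ioc)
{
    if (--ioc->refcount == 0) {
        printf("freeing io_context\n");
        free(ioc);
    }
}

static void exit_io_context_model(void)
{
    struct io_context_model *ioc;

    /* Only the detach needs the task lock... */
    pthread_mutex_lock(&task_lock);
    ioc = current_ioc;
    current_ioc = NULL;
    pthread_mutex_unlock(&task_lock);

    /* ...the exit hooks and the final put run outside it. */
    if (ioc->exit_hook)
        ioc->exit_hook(ioc);
    put_io_context_model(ioc);
}

static void demo_exit(struct io_context_model *ioc)
{
    (void)ioc;
    printf("scheduler exit hook ran\n");
}

int main(void)
{
    current_ioc = calloc(1, sizeof(*current_ioc));
    current_ioc->refcount = 1;
    current_ioc->exit_hook = demo_exit;
    exit_io_context_model();
    return 0;
}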
@@ -3689,7 +3661,7 @@ void exit_io_context(void) | |||
3689 | * but since the current task itself holds a reference, the context can be | 3661 | * but since the current task itself holds a reference, the context can be |
3690 | * used in general code, so long as it stays within `current` context. | 3662 | * used in general code, so long as it stays within `current` context. |
3691 | */ | 3663 | */ |
3692 | struct io_context *current_io_context(gfp_t gfp_flags) | 3664 | static struct io_context *current_io_context(gfp_t gfp_flags, int node) |
3693 | { | 3665 | { |
3694 | struct task_struct *tsk = current; | 3666 | struct task_struct *tsk = current; |
3695 | struct io_context *ret; | 3667 | struct io_context *ret; |
@@ -3698,11 +3670,11 @@ struct io_context *current_io_context(gfp_t gfp_flags) | |||
3698 | if (likely(ret)) | 3670 | if (likely(ret)) |
3699 | return ret; | 3671 | return ret; |
3700 | 3672 | ||
3701 | ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); | 3673 | ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); |
3702 | if (ret) { | 3674 | if (ret) { |
3703 | atomic_set(&ret->refcount, 1); | 3675 | atomic_set(&ret->refcount, 1); |
3704 | ret->task = current; | 3676 | ret->task = current; |
3705 | ret->set_ioprio = NULL; | 3677 | ret->ioprio_changed = 0; |
3706 | ret->last_waited = jiffies; /* doesn't matter... */ | 3678 | ret->last_waited = jiffies; /* doesn't matter... */ |
3707 | ret->nr_batch_requests = 0; /* because this is 0 */ | 3679 | ret->nr_batch_requests = 0; /* because this is 0 */ |
3708 | ret->aic = NULL; | 3680 | ret->aic = NULL; |
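current_io_context() grows a node argument and switches to kmem_cache_alloc_node(), so the io_context lands on the caller's preferred NUMA node; the same pattern appears in noop_init_queue() below with kmalloc_node(). The sketch models the idea of threading a node parameter through allocation helpers; the stand-in allocator and the -1 "any node" convention are assumptions for illustration:

/*
 * Node-aware allocation pattern: every helper takes a node hint
 * and passes it down to the allocator.
 */
#include <stdio.h>
#include <stdlib.h>

#define ANY_NODE (-1)

static void *alloc_on_node(size_t size, int node)
{
    /* A real kernel would use kmem_cache_alloc_node()/kmalloc_node();
     * this stand-in just records the request. */
    printf("allocating %zu bytes on node %d\n", size, node);
    return malloc(size);
}

struct ioc_model { int node; };

static struct ioc_model *current_io_context_model(int node)
{
    struct ioc_model *ioc = alloc_on_node(sizeof(*ioc), node);
    if (ioc)
        ioc->node = node;
    return ioc;
}

int main(void)
{
    struct ioc_model *a = current_io_context_model(0);        /* queue lives on node 0 */
    struct ioc_model *b = current_io_context_model(ANY_NODE); /* no preference */
    free(a);
    free(b);
    return 0;
}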
@@ -3722,10 +3694,10 @@ EXPORT_SYMBOL(current_io_context); | |||
3722 | * | 3694 | * |
3723 | * This is always called in the context of the task which submitted the I/O. | 3695 | * This is always called in the context of the task which submitted the I/O. |
3724 | */ | 3696 | */ |
3725 | struct io_context *get_io_context(gfp_t gfp_flags) | 3697 | struct io_context *get_io_context(gfp_t gfp_flags, int node) |
3726 | { | 3698 | { |
3727 | struct io_context *ret; | 3699 | struct io_context *ret; |
3728 | ret = current_io_context(gfp_flags); | 3700 | ret = current_io_context(gfp_flags, node); |
3729 | if (likely(ret)) | 3701 | if (likely(ret)) |
3730 | atomic_inc(&ret->refcount); | 3702 | atomic_inc(&ret->refcount); |
3731 | return ret; | 3703 | return ret; |
@@ -3838,9 +3810,6 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) | |||
3838 | ssize_t ret = queue_var_store(&ra_kb, page, count); | 3810 | ssize_t ret = queue_var_store(&ra_kb, page, count); |
3839 | 3811 | ||
3840 | spin_lock_irq(q->queue_lock); | 3812 | spin_lock_irq(q->queue_lock); |
3841 | if (ra_kb > (q->max_sectors >> 1)) | ||
3842 | ra_kb = (q->max_sectors >> 1); | ||
3843 | |||
3844 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); | 3813 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); |
3845 | spin_unlock_irq(q->queue_lock); | 3814 | spin_unlock_irq(q->queue_lock); |
3846 | 3815 | ||
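queue_ra_store() above drops the clamp against max_sectors and stores the user's kilobyte value directly, converted to pages with ra_kb >> (PAGE_CACHE_SHIFT - 10): shifting by (page shift - 10) divides by (page size / 1024). A quick check of that arithmetic, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12 is an assumption here, not taken from the diff):

/* KB -> pages conversion used by the read-ahead sysfs store. */
#include <assert.h>

#define PAGE_CACHE_SHIFT 12   /* assumed: 4 KiB pages */

int main(void)
{
    unsigned long ra_kb = 128;                                 /* 128 KiB */
    unsigned long ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); /* = ra_kb / 4 */

    assert(ra_pages == 32);   /* 128 KiB / 4 KiB per page */
    return 0;
}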
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 56a7c620574f..79af43179421 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -69,7 +69,7 @@ static void *noop_init_queue(request_queue_t *q, elevator_t *e) | |||
69 | { | 69 | { |
70 | struct noop_data *nd; | 70 | struct noop_data *nd; |
71 | 71 | ||
72 | nd = kmalloc(sizeof(*nd), GFP_KERNEL); | 72 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); |
73 | if (!nd) | 73 | if (!nd) |
74 | return NULL; | 74 | return NULL; |
75 | INIT_LIST_HEAD(&nd->queue); | 75 | INIT_LIST_HEAD(&nd->queue); |
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index b33eda26e205..2dc326421a24 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -294,7 +294,7 @@ static int sg_io(struct file *file, request_queue_t *q, | |||
294 | rq->sense = sense; | 294 | rq->sense = sense; |
295 | rq->sense_len = 0; | 295 | rq->sense_len = 0; |
296 | 296 | ||
297 | rq->flags |= REQ_BLOCK_PC; | 297 | rq->cmd_type = REQ_TYPE_BLOCK_PC; |
298 | bio = rq->bio; | 298 | bio = rq->bio; |
299 | 299 | ||
300 | /* | 300 | /* |
@@ -470,7 +470,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q, | |||
470 | memset(sense, 0, sizeof(sense)); | 470 | memset(sense, 0, sizeof(sense)); |
471 | rq->sense = sense; | 471 | rq->sense = sense; |
472 | rq->sense_len = 0; | 472 | rq->sense_len = 0; |
473 | rq->flags |= REQ_BLOCK_PC; | 473 | rq->cmd_type = REQ_TYPE_BLOCK_PC; |
474 | 474 | ||
475 | blk_execute_rq(q, disk, rq, 0); | 475 | blk_execute_rq(q, disk, rq, 0); |
476 | 476 | ||
@@ -502,7 +502,7 @@ static int __blk_send_generic(request_queue_t *q, struct gendisk *bd_disk, int c | |||
502 | int err; | 502 | int err; |
503 | 503 | ||
504 | rq = blk_get_request(q, WRITE, __GFP_WAIT); | 504 | rq = blk_get_request(q, WRITE, __GFP_WAIT); |
505 | rq->flags |= REQ_BLOCK_PC; | 505 | rq->cmd_type = REQ_TYPE_BLOCK_PC; |
506 | rq->data = NULL; | 506 | rq->data = NULL; |
507 | rq->data_len = 0; | 507 | rq->data_len = 0; |
508 | rq->timeout = BLK_DEFAULT_TIMEOUT; | 508 | rq->timeout = BLK_DEFAULT_TIMEOUT; |
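The scsi_ioctl.c changes replace the REQ_BLOCK_PC flag with a REQ_TYPE_BLOCK_PC type: a request is exactly one kind (fs, block-pc, ...), so the kind moves out of the flag word into its own cmd_type field, leaving cmd_flags for orthogonal modifiers. A sketch of that split; the names mirror the kernel's but the values and struct layout are invented:

/*
 * Type/flags split: cmd_type holds exactly one request kind,
 * cmd_flags holds zero or more independent modifier bits.
 */
#include <stdio.h>

enum rq_cmd_type { REQ_TYPE_FS, REQ_TYPE_BLOCK_PC, REQ_TYPE_SPECIAL };

#define REQ_QUIET (1u << 0)   /* modifier flags stay as bits */

struct request_model {
    enum rq_cmd_type cmd_type;   /* exactly one */
    unsigned int     cmd_flags;  /* zero or more */
};

int main(void)
{
    struct request_model rq = { 0 };

    rq.cmd_type = REQ_TYPE_BLOCK_PC;   /* was: rq->flags |= REQ_BLOCK_PC */
    rq.cmd_flags |= REQ_QUIET;

    printf("type=%d flags=%#x\n", rq.cmd_type, rq.cmd_flags);
    return 0;
}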