[BLOCK] Reimplement elevator switch

This patch reimplements the elevator switch. It assumes the generic dispatch queue patchset is applied.

* Each request is tagged with the REQ_ELVPRIV flag if it has its elevator private data set.
* Requests which don't have the REQ_ELVPRIV flag set never enter the iosched. They are always directly back-inserted into the dispatch queue. Of course, elevator_put_req_fn is called only for requests which have REQ_ELVPRIV set.
* The request queue maintains the current number of requests which have their elevator data set (elevator_set_req_fn called) in q->rq.elvpriv.
* If a request queue has QUEUE_FLAG_BYPASS set, elevator private data is not allocated for new requests.

To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until elvpriv goes to zero; then we attach the new iosched and clear QUEUE_FLAG_BYPASS. The new implementation is much simpler and the main code paths are less cluttered, IMHO.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
author: Tejun Heo <htejun@gmail.com> 2005-10-28 02:29:39 -0400
committer: Jens Axboe <axboe@nelson.home.kernel.dk> 2005-10-28 02:48:12 -0400
commit: cb98fc8bb9c141009e2bda99c0db39d387e142cf (patch)
tree: 8957f8a79f39c3e6633a0dbb165ced8b530aca0c
parent: cb19833dccb32f97cacbfff834b53523915f13f6 (diff)
3 files changed, 66 insertions, 164 deletions
diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index af2388e73f61..272d93946621 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
+#include <linux/delay.h>
 #include <asm/uaccess.h>
@@ -131,11 +132,7 @@ static int elevator_attach(request_queue_t *q, struct elevator_type *e,
        eq->ops = &e->ops;
        eq->elevator_type = e;
-        INIT_LIST_HEAD(&q->queue_head);
-        q->last_merge = NULL;
        q->elevator = eq;
-        q->end_sector = 0;
-        q->boundary_rq = NULL;
        if (eq->ops->elevator_init_fn)
                ret = eq->ops->elevator_init_fn(q, eq);
@@ -184,6 +181,12 @@ int elevator_init(request_queue_t *q, char *name)
        struct elevator_queue *eq;
        int ret = 0;
+        INIT_LIST_HEAD(&q->queue_head);
+        q->last_merge = NULL;
+        q->end_sector = 0;
+        q->boundary_rq = NULL;
+        q->max_back_kb = 0;
        elevator_setup_default();
        if (!name)
@@ -336,23 +339,14 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
                        q->end_sector = rq_end_sector(rq);
                        q->boundary_rq = rq;
                }
-        }
+        } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
+                where = ELEVATOR_INSERT_BACK;
        if (plug)
                blk_plug_device(q);
        rq->q = q;
-        if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-                /*
-                 * if drain is set, store the request "locally". when the drain
-                 * is finished, the requests will be handed ordered to the io
-                 * scheduler
-                 */
-                list_add_tail(&rq->queuelist, &q->drain_list);
-                return;
-        }
        switch (where) {
        case ELEVATOR_INSERT_FRONT:
                rq->flags |= REQ_SOFTBARRIER;
@@ -659,25 +653,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason. we also do an intermediate
+ * one, if the new one fails init for some reason.
- * switch to noop to ensure safety with stack-allocated requests, since they
- * don't originate from the block layer allocator. noop is safe here, because
- * it never needs to touch the elevator itself for completion events. DRAIN
- * flags will make sure we don't touch it for additions either.
 */
 static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
-        elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+        elevator_t *old_elevator, *e;
-        struct elevator_type *noop_elevator = NULL;
-        elevator_t *old_elevator;
+        /*
+         * Allocate new elevator
+         */
+        e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
        if (!e)
                goto error;
        /*
-         * first step, drain requests from the block freelist
+         * Turn on BYPASS and drain all requests w/ elevator private data
         */
-        blk_wait_queue_drained(q, 0);
+        spin_lock_irq(q->queue_lock);
+        set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+        while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+                ;
+        while (q->rq.elvpriv) {
+                spin_unlock_irq(q->queue_lock);
+                msleep(100);
+                spin_lock_irq(q->queue_lock);
+        }
+        spin_unlock_irq(q->queue_lock);
        /*
         * unregister old elevator data
@@ -686,18 +691,6 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
        old_elevator = q->elevator;
        /*
-         * next step, switch to noop since it uses no private rq structures
-         * and doesn't allocate any memory for anything. then wait for any
-         * non-fs requests in-flight
-         */
-        noop_elevator = elevator_get("noop");
-        spin_lock_irq(q->queue_lock);
-        elevator_attach(q, noop_elevator, e);
-        spin_unlock_irq(q->queue_lock);
-        blk_wait_queue_drained(q, 1);
-        /*
         * attach and start new elevator
         */
        if (elevator_attach(q, new_e, e))
@@ -707,11 +700,10 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
                goto fail_register;
        /*
-         * finally exit old elevator and start queue again
+         * finally exit old elevator and turn off BYPASS.
         */
        elevator_exit(old_elevator);
-        blk_finish_queue_drain(q);
+        clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
-        elevator_put(noop_elevator);
        return;
 fail_register:
@@ -720,13 +712,13 @@ fail_register:
         * one again (along with re-adding the sysfs dir)
         */
        elevator_exit(e);
+        e = NULL;
 fail:
        q->elevator = old_elevator;
        elv_register_queue(q);
-        blk_finish_queue_drain(q);
+        clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+        kfree(e);
 error:
-        if (noop_elevator)
-                elevator_put(noop_elevator);
        elevator_put(new_e);
        printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
 }
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index d2a66fd309c3..f7c9931cb380 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -263,8 +263,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
        blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
        blk_queue_activity_fn(q, NULL, NULL);
-        INIT_LIST_HEAD(&q->drain_list);
 }
 EXPORT_SYMBOL(blk_queue_make_request);
@@ -1050,6 +1048,7 @@ static char *rq_flags[] = {
        "REQ_STARTED",
        "REQ_DONTPREP",
        "REQ_QUEUED",
+        "REQ_ELVPRIV",
        "REQ_PC",
        "REQ_BLOCK_PC",
        "REQ_SENSE",
@@ -1640,9 +1639,9 @@ static int blk_init_free_list(request_queue_t *q)
        rl->count[READ] = rl->count[WRITE] = 0;
        rl->starved[READ] = rl->starved[WRITE] = 0;
+        rl->elvpriv = 0;
        init_waitqueue_head(&rl->wait[READ]);
        init_waitqueue_head(&rl->wait[WRITE]);
-        init_waitqueue_head(&rl->drain);
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
                                mempool_free_slab, request_cachep, q->node);
@@ -1785,12 +1784,14 @@ EXPORT_SYMBOL(blk_get_queue);
 static inline void blk_free_request(request_queue_t *q, struct request *rq)
 {
-        elv_put_request(q, rq);
+        if (rq->flags & REQ_ELVPRIV)
+                elv_put_request(q, rq);
        mempool_free(rq, q->rq.rq_pool);
 }
 static inline struct request *
-blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
+                  int priv, int gfp_mask)
 {
        struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
@@ -1803,11 +1804,15 @@ blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
         */
        rq->flags = rw;
-        if (!elv_set_request(q, rq, bio, gfp_mask))
+        if (priv) {
-                return rq;
+                if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+                        mempool_free(rq, q->rq.rq_pool);
+                        return NULL;
+                }
+                rq->flags |= REQ_ELVPRIV;
+        }
-        mempool_free(rq, q->rq.rq_pool);
+        return rq;
-        return NULL;
 }
 /*
@@ -1863,22 +1868,18 @@ static void __freed_request(request_queue_t *q, int rw)
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
-static void freed_request(request_queue_t *q, int rw)
+static void freed_request(request_queue_t *q, int rw, int priv)
 {
        struct request_list *rl = &q->rq;
        rl->count[rw]--;
+        if (priv)
+                rl->elvpriv--;
        __freed_request(q, rw);
        if (unlikely(rl->starved[rw ^ 1]))
                __freed_request(q, rw ^ 1);
-        if (!rl->count[READ] && !rl->count[WRITE]) {
-                smp_mb();
-                if (unlikely(waitqueue_active(&rl->drain)))
-                        wake_up(&rl->drain);
-        }
 }
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1893,9 +1894,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
        struct request *rq = NULL;
        struct request_list *rl = &q->rq;
        struct io_context *ioc = current_io_context(GFP_ATOMIC);
+        int priv;
-        if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
-                goto out;
        if (rl->count[rw]+1 >= q->nr_requests) {
                /*
@@ -1940,9 +1939,14 @@ get_rq:
        rl->starved[rw] = 0;
        if (rl->count[rw] >= queue_congestion_on_threshold(q))
                set_queue_congested(q, rw);
+        priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+        if (priv)
+                rl->elvpriv++;
        spin_unlock_irq(q->queue_lock);
-        rq = blk_alloc_request(q, rw, bio, gfp_mask);
+        rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
        if (!rq) {
                /*
                 * Allocation failed presumably due to memory. Undo anything
@@ -1952,7 +1956,7 @@ get_rq:
                 * wait queue, but this is pretty rare.
                 */
                spin_lock_irq(q->queue_lock);
-                freed_request(q, rw);
+                freed_request(q, rw, priv);
                /*
                 * in the very unlikely event that allocation failed and no
@@ -2470,11 +2474,12 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
         */
        if (rl) {
                int rw = rq_data_dir(req);
+                int priv = req->flags & REQ_ELVPRIV;
                BUG_ON(!list_empty(&req->queuelist));
                blk_free_request(q, req);
-                freed_request(q, rw);
+                freed_request(q, rw, priv);
        }
 }
@@ -2802,97 +2807,6 @@ static inline void blk_partition_remap(struct bio *bio)
        }
 }
-void blk_finish_queue_drain(request_queue_t *q)
-{
-        struct request_list *rl = &q->rq;
-        struct request *rq;
-        int requeued = 0;
-        spin_lock_irq(q->queue_lock);
-        clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-        while (!list_empty(&q->drain_list)) {
-                rq = list_entry_rq(q->drain_list.next);
-                list_del_init(&rq->queuelist);
-                elv_requeue_request(q, rq);
-                requeued++;
-        }
-        if (requeued)
-                q->request_fn(q);
-        spin_unlock_irq(q->queue_lock);
-        wake_up(&rl->wait[0]);
-        wake_up(&rl->wait[1]);
-        wake_up(&rl->drain);
-}
-static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
-{
-        int wait = rl->count[READ] + rl->count[WRITE];
-        if (dispatch)
-                wait += !list_empty(&q->queue_head);
-        return wait;
-}
-/*
- * We rely on the fact that only requests allocated through blk_alloc_request()
- * have io scheduler private data structures associated with them. Any other
- * type of request (allocated on stack or through kmalloc()) should not go
- * to the io scheduler core, but be attached to the queue head instead.
- */
-void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
-{
-        struct request_list *rl = &q->rq;
-        DEFINE_WAIT(wait);
-        spin_lock_irq(q->queue_lock);
-        set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-        while (wait_drain(q, rl, wait_dispatch)) {
-                prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
-                if (wait_drain(q, rl, wait_dispatch)) {
-                        __generic_unplug_device(q);
-                        spin_unlock_irq(q->queue_lock);
-                        io_schedule();
-                        spin_lock_irq(q->queue_lock);
-                }
-                finish_wait(&rl->drain, &wait);
-        }
-        spin_unlock_irq(q->queue_lock);
-}
-/*
- * block waiting for the io scheduler being started again.
- */
-static inline void block_wait_queue_running(request_queue_t *q)
-{
-        DEFINE_WAIT(wait);
-        while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-                struct request_list *rl = &q->rq;
-                prepare_to_wait_exclusive(&rl->drain, &wait,
-                                TASK_UNINTERRUPTIBLE);
-                /*
-                 * re-check the condition. avoids using prepare_to_wait()
-                 * in the fast path (queue is running)
-                 */
-                if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))
-                        io_schedule();
-                finish_wait(&rl->drain, &wait);
-        }
-}
 static void handle_bad_sector(struct bio *bio)
 {
        char b[BDEVNAME_SIZE];
@@ -2988,8 +2902,6 @@ end_io:
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;
-                block_wait_queue_running(q);
                /*
                 * If this device has partitions, remap block n
                 * of partition p to block n+start(p) of the disk.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 159dbcd2eb59..6186d5e2110f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -107,9 +107,9 @@ typedef void (rq_end_io_fn)(struct request *);
 struct request_list {
        int count[2];
        int starved[2];
+        int elvpriv;
        mempool_t *rq_pool;
        wait_queue_head_t wait[2];
-        wait_queue_head_t drain;
 };
 #define BLK_MAX_CDB     16
@@ -211,6 +211,7 @@ enum rq_flag_bits {
        __REQ_STARTED,          /* drive already may have started this one */
        __REQ_DONTPREP,         /* don't call prep for this one */
        __REQ_QUEUED,           /* uses queueing */
+        __REQ_ELVPRIV,          /* elevator private data attached */
        /*
         * for ATA/ATAPI devices
         */
@@ -244,6 +245,7 @@ enum rq_flag_bits {
 #define REQ_STARTED     (1 << __REQ_STARTED)
 #define REQ_DONTPREP    (1 << __REQ_DONTPREP)
 #define REQ_QUEUED      (1 << __REQ_QUEUED)
+#define REQ_ELVPRIV     (1 << __REQ_ELVPRIV)
 #define REQ_PC          (1 << __REQ_PC)
 #define REQ_BLOCK_PC    (1 << __REQ_BLOCK_PC)
 #define REQ_SENSE       (1 << __REQ_SENSE)
@@ -413,8 +415,6 @@ struct request_queue
        unsigned int            sg_reserved_size;
        int                     node;
-        struct list_head        drain_list;
        /*
         * reserved for flush operations
         */
@@ -442,7 +442,7 @@ enum {
 #define QUEUE_FLAG_DEAD         5       /* queue being torn down */
 #define QUEUE_FLAG_REENTER      6       /* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED      7       /* queue is plugged */
-#define QUEUE_FLAG_DRAIN        8       /* draining queue for sched switch */
+#define QUEUE_FLAG_BYPASS       8       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_FLUSH        9       /* doing barrier flush sequence */
 #define blk_queue_plugged(q)    test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -668,8 +668,6 @@ extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(request_queue_t *);
 extern void __generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
-extern void blk_wait_queue_drained(request_queue_t *, int);
-extern void blk_finish_queue_drain(request_queue_t *);
 int blk_get_queue(request_queue_t *);
 request_queue_t *blk_alloc_queue(int gfp_mask);
author	Tejun Heo <htejun@gmail.com>	2005-10-28 02:29:39 -0400
committer	Jens Axboe <axboe@nelson.home.kernel.dk>	2005-10-28 02:48:12 -0400
commit	cb98fc8bb9c141009e2bda99c0db39d387e142cf (patch)
tree	8957f8a79f39c3e6633a0dbb165ced8b530aca0c
parent	cb19833dccb32f97cacbfff834b53523915f13f6 (diff)

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index af2388e73f61..272d93946621 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -34,6 +34,7 @@
34	#include <linux/slab.h>	34	#include <linux/slab.h>
35	#include <linux/init.h>	35	#include <linux/init.h>
36	#include <linux/compiler.h>	36	#include <linux/compiler.h>
		37	#include <linux/delay.h>
37		38
38	#include <asm/uaccess.h>	39	#include <asm/uaccess.h>
39		40
@@ -131,11 +132,7 @@ static int elevator_attach(request_queue_t *q, struct elevator_type *e,
131	eq->ops = &e->ops;	132	eq->ops = &e->ops;
132	eq->elevator_type = e;	133	eq->elevator_type = e;
133		134
134	INIT_LIST_HEAD(&q->queue_head);
135	q->last_merge = NULL;
136	q->elevator = eq;	135	q->elevator = eq;
137	q->end_sector = 0;
138	q->boundary_rq = NULL;
139		136
140	if (eq->ops->elevator_init_fn)	137	if (eq->ops->elevator_init_fn)
141	ret = eq->ops->elevator_init_fn(q, eq);	138	ret = eq->ops->elevator_init_fn(q, eq);
@@ -184,6 +181,12 @@ int elevator_init(request_queue_t *q, char *name)
184	struct elevator_queue *eq;	181	struct elevator_queue *eq;
185	int ret = 0;	182	int ret = 0;
186		183
		184	INIT_LIST_HEAD(&q->queue_head);
		185	q->last_merge = NULL;
		186	q->end_sector = 0;
		187	q->boundary_rq = NULL;
		188	q->max_back_kb = 0;
		189
187	elevator_setup_default();	190	elevator_setup_default();
188		191
189	if (!name)	192	if (!name)
@@ -336,23 +339,14 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
336	q->end_sector = rq_end_sector(rq);	339	q->end_sector = rq_end_sector(rq);
337	q->boundary_rq = rq;	340	q->boundary_rq = rq;
338	}	341	}
339	}	342	} else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
		343	where = ELEVATOR_INSERT_BACK;
340		344
341	if (plug)	345	if (plug)
342	blk_plug_device(q);	346	blk_plug_device(q);
343		347
344	rq->q = q;	348	rq->q = q;
345		349
346	if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
347	/*
348	* if drain is set, store the request "locally". when the drain
349	* is finished, the requests will be handed ordered to the io
350	* scheduler
351	*/
352	list_add_tail(&rq->queuelist, &q->drain_list);
353	return;
354	}
355
356	switch (where) {	350	switch (where) {
357	case ELEVATOR_INSERT_FRONT:	351	case ELEVATOR_INSERT_FRONT:
358	rq->flags |= REQ_SOFTBARRIER;	352	rq->flags |= REQ_SOFTBARRIER;
@@ -659,25 +653,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
659	* switch to new_e io scheduler. be careful not to introduce deadlocks -	653	* switch to new_e io scheduler. be careful not to introduce deadlocks -
660	* we don't free the old io scheduler, before we have allocated what we	654	* we don't free the old io scheduler, before we have allocated what we
661	* need for the new one. this way we have a chance of going back to the old	655	* need for the new one. this way we have a chance of going back to the old
662	* one, if the new one fails init for some reason. we also do an intermediate	656	* one, if the new one fails init for some reason.
663	* switch to noop to ensure safety with stack-allocated requests, since they
664	* don't originate from the block layer allocator. noop is safe here, because
665	* it never needs to touch the elevator itself for completion events. DRAIN
666	* flags will make sure we don't touch it for additions either.
667	*/	657	*/
668	static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)	658	static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
669	{	659	{
670	elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);	660	elevator_t *old_elevator, *e;
671	struct elevator_type *noop_elevator = NULL;
672	elevator_t *old_elevator;
673		661
		662	/*
		663	* Allocate new elevator
		664	*/
		665	e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
674	if (!e)	666	if (!e)
675	goto error;	667	goto error;
676		668
677	/*	669	/*
678	* first step, drain requests from the block freelist	670	* Turn on BYPASS and drain all requests w/ elevator private data
679	*/	671	*/
680	blk_wait_queue_drained(q, 0);	672	spin_lock_irq(q->queue_lock);
		673
		674	set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
		675
		676	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
		677	;
		678
		679	while (q->rq.elvpriv) {
		680	spin_unlock_irq(q->queue_lock);
		681	msleep(100);
		682	spin_lock_irq(q->queue_lock);
		683	}
		684
		685	spin_unlock_irq(q->queue_lock);
681		686
682	/*	687	/*
683	* unregister old elevator data	688	* unregister old elevator data
@@ -686,18 +691,6 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
686	old_elevator = q->elevator;	691	old_elevator = q->elevator;
687		692
688	/*	693	/*
689	* next step, switch to noop since it uses no private rq structures
690	* and doesn't allocate any memory for anything. then wait for any
691	* non-fs requests in-flight
692	*/
693	noop_elevator = elevator_get("noop");
694	spin_lock_irq(q->queue_lock);
695	elevator_attach(q, noop_elevator, e);
696	spin_unlock_irq(q->queue_lock);
697
698	blk_wait_queue_drained(q, 1);
699
700	/*
701	* attach and start new elevator	694	* attach and start new elevator
702	*/	695	*/
703	if (elevator_attach(q, new_e, e))	696	if (elevator_attach(q, new_e, e))
@@ -707,11 +700,10 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
707	goto fail_register;	700	goto fail_register;
708		701
709	/*	702	/*
710	* finally exit old elevator and start queue again	703	* finally exit old elevator and turn off BYPASS.
711	*/	704	*/
712	elevator_exit(old_elevator);	705	elevator_exit(old_elevator);
713	blk_finish_queue_drain(q);	706	clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
714	elevator_put(noop_elevator);
715	return;	707	return;
716		708
717	fail_register:	709	fail_register:
@@ -720,13 +712,13 @@ fail_register:
720	* one again (along with re-adding the sysfs dir)	712	* one again (along with re-adding the sysfs dir)
721	*/	713	*/
722	elevator_exit(e);	714	elevator_exit(e);
		715	e = NULL;
723	fail:	716	fail:
724	q->elevator = old_elevator;	717	q->elevator = old_elevator;
725	elv_register_queue(q);	718	elv_register_queue(q);
726	blk_finish_queue_drain(q);	719	clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
		720	kfree(e);
727	error:	721	error:
728	if (noop_elevator)
729	elevator_put(noop_elevator);
730	elevator_put(new_e);	722	elevator_put(new_e);
731	printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);	723	printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
732	}	724	}


diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index d2a66fd309c3..f7c9931cb380 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -263,8 +263,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
263	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);	263	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
264		264
265	blk_queue_activity_fn(q, NULL, NULL);	265	blk_queue_activity_fn(q, NULL, NULL);
266
267	INIT_LIST_HEAD(&q->drain_list);
268	}	266	}
269		267
270	EXPORT_SYMBOL(blk_queue_make_request);	268	EXPORT_SYMBOL(blk_queue_make_request);
@@ -1050,6 +1048,7 @@ static char *rq_flags[] = {
1050	"REQ_STARTED",	1048	"REQ_STARTED",
1051	"REQ_DONTPREP",	1049	"REQ_DONTPREP",
1052	"REQ_QUEUED",	1050	"REQ_QUEUED",
		1051	"REQ_ELVPRIV",
1053	"REQ_PC",	1052	"REQ_PC",
1054	"REQ_BLOCK_PC",	1053	"REQ_BLOCK_PC",
1055	"REQ_SENSE",	1054	"REQ_SENSE",
@@ -1640,9 +1639,9 @@ static int blk_init_free_list(request_queue_t *q)
1640		1639
1641	rl->count[READ] = rl->count[WRITE] = 0;	1640	rl->count[READ] = rl->count[WRITE] = 0;
1642	rl->starved[READ] = rl->starved[WRITE] = 0;	1641	rl->starved[READ] = rl->starved[WRITE] = 0;
		1642	rl->elvpriv = 0;
1643	init_waitqueue_head(&rl->wait[READ]);	1643	init_waitqueue_head(&rl->wait[READ]);
1644	init_waitqueue_head(&rl->wait[WRITE]);	1644	init_waitqueue_head(&rl->wait[WRITE]);
1645	init_waitqueue_head(&rl->drain);
1646		1645
1647	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,	1646	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1648	mempool_free_slab, request_cachep, q->node);	1647	mempool_free_slab, request_cachep, q->node);
@@ -1785,12 +1784,14 @@ EXPORT_SYMBOL(blk_get_queue);
1785		1784
1786	static inline void blk_free_request(request_queue_t *q, struct request *rq)	1785	static inline void blk_free_request(request_queue_t *q, struct request *rq)
1787	{	1786	{
1788	elv_put_request(q, rq);	1787	if (rq->flags & REQ_ELVPRIV)
		1788	elv_put_request(q, rq);
1789	mempool_free(rq, q->rq.rq_pool);	1789	mempool_free(rq, q->rq.rq_pool);
1790	}	1790	}
1791		1791
1792	static inline struct request *	1792	static inline struct request *
1793	blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)	1793	blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
		1794	int priv, int gfp_mask)
1794	{	1795	{
1795	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);	1796	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1796		1797
@@ -1803,11 +1804,15 @@ blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
1803	*/	1804	*/
1804	rq->flags = rw;	1805	rq->flags = rw;
1805		1806
1806	if (!elv_set_request(q, rq, bio, gfp_mask))	1807	if (priv) {
1807	return rq;	1808	if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
		1809	mempool_free(rq, q->rq.rq_pool);
		1810	return NULL;
		1811	}
		1812	rq->flags |= REQ_ELVPRIV;
		1813	}
1808		1814
1809	mempool_free(rq, q->rq.rq_pool);	1815	return rq;
1810	return NULL;
1811	}	1816	}
1812		1817
1813	/*	1818	/*
@@ -1863,22 +1868,18 @@ static void __freed_request(request_queue_t *q, int rw)
1863	* A request has just been released. Account for it, update the full and	1868	* A request has just been released. Account for it, update the full and
1864	* congestion status, wake up any waiters. Called under q->queue_lock.	1869	* congestion status, wake up any waiters. Called under q->queue_lock.
1865	*/	1870	*/
1866	static void freed_request(request_queue_t *q, int rw)	1871	static void freed_request(request_queue_t *q, int rw, int priv)
1867	{	1872	{
1868	struct request_list *rl = &q->rq;	1873	struct request_list *rl = &q->rq;
1869		1874
1870	rl->count[rw]--;	1875	rl->count[rw]--;
		1876	if (priv)
		1877	rl->elvpriv--;
1871		1878
1872	__freed_request(q, rw);	1879	__freed_request(q, rw);
1873		1880
1874	if (unlikely(rl->starved[rw ^ 1]))	1881	if (unlikely(rl->starved[rw ^ 1]))
1875	__freed_request(q, rw ^ 1);	1882	__freed_request(q, rw ^ 1);
1876
1877	if (!rl->count[READ] && !rl->count[WRITE]) {
1878	smp_mb();
1879	if (unlikely(waitqueue_active(&rl->drain)))
1880	wake_up(&rl->drain);
1881	}
1882	}	1883	}
1883		1884
1884	#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)	1885	#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1893,9 +1894,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1893	struct request *rq = NULL;	1894	struct request *rq = NULL;
1894	struct request_list *rl = &q->rq;	1895	struct request_list *rl = &q->rq;
1895	struct io_context *ioc = current_io_context(GFP_ATOMIC);	1896	struct io_context *ioc = current_io_context(GFP_ATOMIC);
1896		1897	int priv;
1897	if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
1898	goto out;
1899		1898
1900	if (rl->count[rw]+1 >= q->nr_requests) {	1899	if (rl->count[rw]+1 >= q->nr_requests) {
1901	/*	1900	/*
@@ -1940,9 +1939,14 @@ get_rq:
1940	rl->starved[rw] = 0;	1939	rl->starved[rw] = 0;
1941	if (rl->count[rw] >= queue_congestion_on_threshold(q))	1940	if (rl->count[rw] >= queue_congestion_on_threshold(q))
1942	set_queue_congested(q, rw);	1941	set_queue_congested(q, rw);
		1942
		1943	priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
		1944	if (priv)
		1945	rl->elvpriv++;
		1946
1943	spin_unlock_irq(q->queue_lock);	1947	spin_unlock_irq(q->queue_lock);
1944		1948
1945	rq = blk_alloc_request(q, rw, bio, gfp_mask);	1949	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
1946	if (!rq) {	1950	if (!rq) {
1947	/*	1951	/*
1948	* Allocation failed presumably due to memory. Undo anything	1952	* Allocation failed presumably due to memory. Undo anything
@@ -1952,7 +1956,7 @@ get_rq:
1952	* wait queue, but this is pretty rare.	1956	* wait queue, but this is pretty rare.
1953	*/	1957	*/
1954	spin_lock_irq(q->queue_lock);	1958	spin_lock_irq(q->queue_lock);
1955	freed_request(q, rw);	1959	freed_request(q, rw, priv);
1956		1960
1957	/*	1961	/*
1958	* in the very unlikely event that allocation failed and no	1962	* in the very unlikely event that allocation failed and no
@@ -2470,11 +2474,12 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
2470	*/	2474	*/
2471	if (rl) {	2475	if (rl) {
2472	int rw = rq_data_dir(req);	2476	int rw = rq_data_dir(req);
		2477	int priv = req->flags & REQ_ELVPRIV;
2473		2478
2474	BUG_ON(!list_empty(&req->queuelist));	2479	BUG_ON(!list_empty(&req->queuelist));
2475		2480
2476	blk_free_request(q, req);	2481	blk_free_request(q, req);
2477	freed_request(q, rw);	2482	freed_request(q, rw, priv);
2478	}	2483	}
2479	}	2484	}
2480		2485
@@ -2802,97 +2807,6 @@ static inline void blk_partition_remap(struct bio *bio)
2802	}	2807	}
2803	}	2808	}
2804		2809
2805	void blk_finish_queue_drain(request_queue_t *q)
2806	{
2807	struct request_list *rl = &q->rq;
2808	struct request *rq;
2809	int requeued = 0;
2810
2811	spin_lock_irq(q->queue_lock);
2812	clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
2813
2814	while (!list_empty(&q->drain_list)) {
2815	rq = list_entry_rq(q->drain_list.next);
2816
2817	list_del_init(&rq->queuelist);
2818	elv_requeue_request(q, rq);
2819	requeued++;
2820	}
2821
2822	if (requeued)
2823	q->request_fn(q);
2824
2825	spin_unlock_irq(q->queue_lock);
2826
2827	wake_up(&rl->wait[0]);
2828	wake_up(&rl->wait[1]);
2829	wake_up(&rl->drain);
2830	}
2831
2832	static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
2833	{
2834	int wait = rl->count[READ] + rl->count[WRITE];
2835
2836	if (dispatch)
2837	wait += !list_empty(&q->queue_head);
2838
2839	return wait;
2840	}
2841
2842	/*
2843	* We rely on the fact that only requests allocated through blk_alloc_request()
2844	* have io scheduler private data structures associated with them. Any other
2845	* type of request (allocated on stack or through kmalloc()) should not go
2846	* to the io scheduler core, but be attached to the queue head instead.
2847	*/
2848	void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
2849	{
2850	struct request_list *rl = &q->rq;
2851	DEFINE_WAIT(wait);
2852
2853	spin_lock_irq(q->queue_lock);
2854	set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
2855
2856	while (wait_drain(q, rl, wait_dispatch)) {
2857	prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
2858
2859	if (wait_drain(q, rl, wait_dispatch)) {
2860	__generic_unplug_device(q);
2861	spin_unlock_irq(q->queue_lock);
2862	io_schedule();
2863	spin_lock_irq(q->queue_lock);
2864	}
2865
2866	finish_wait(&rl->drain, &wait);
2867	}
2868
2869	spin_unlock_irq(q->queue_lock);
2870	}
2871
2872	/*
2873	* block waiting for the io scheduler being started again.
2874	*/
2875	static inline void block_wait_queue_running(request_queue_t *q)
2876	{
2877	DEFINE_WAIT(wait);
2878
2879	while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
2880	struct request_list *rl = &q->rq;
2881
2882	prepare_to_wait_exclusive(&rl->drain, &wait,
2883	TASK_UNINTERRUPTIBLE);
2884
2885	/*
2886	* re-check the condition. avoids using prepare_to_wait()
2887	* in the fast path (queue is running)
2888	*/
2889	if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))
2890	io_schedule();
2891
2892	finish_wait(&rl->drain, &wait);
2893	}
2894	}
2895
2896	static void handle_bad_sector(struct bio *bio)	2810	static void handle_bad_sector(struct bio *bio)
2897	{	2811	{
2898	char b[BDEVNAME_SIZE];	2812	char b[BDEVNAME_SIZE];
@@ -2988,8 +2902,6 @@ end_io:
2988	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))	2902	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
2989	goto end_io;	2903	goto end_io;
2990		2904
2991	block_wait_queue_running(q);
2992
2993	/*	2905	/*
2994	* If this device has partitions, remap block n	2906	* If this device has partitions, remap block n
2995	* of partition p to block n+start(p) of the disk.	2907	* of partition p to block n+start(p) of the disk.


diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 159dbcd2eb59..6186d5e2110f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -107,9 +107,9 @@ typedef void (rq_end_io_fn)(struct request *);
107	struct request_list {	107	struct request_list {
108	int count[2];	108	int count[2];
109	int starved[2];	109	int starved[2];
		110	int elvpriv;
110	mempool_t *rq_pool;	111	mempool_t *rq_pool;
111	wait_queue_head_t wait[2];	112	wait_queue_head_t wait[2];
112	wait_queue_head_t drain;
113	};	113	};
114		114
115	#define BLK_MAX_CDB 16	115	#define BLK_MAX_CDB 16
@@ -211,6 +211,7 @@ enum rq_flag_bits {
211	__REQ_STARTED, /* drive already may have started this one */	211	__REQ_STARTED, /* drive already may have started this one */
212	__REQ_DONTPREP, /* don't call prep for this one */	212	__REQ_DONTPREP, /* don't call prep for this one */
213	__REQ_QUEUED, /* uses queueing */	213	__REQ_QUEUED, /* uses queueing */
		214	__REQ_ELVPRIV, /* elevator private data attached */
214	/*	215	/*
215	* for ATA/ATAPI devices	216	* for ATA/ATAPI devices
216	*/	217	*/
@@ -244,6 +245,7 @@ enum rq_flag_bits {
244	#define REQ_STARTED (1 << __REQ_STARTED)	245	#define REQ_STARTED (1 << __REQ_STARTED)
245	#define REQ_DONTPREP (1 << __REQ_DONTPREP)	246	#define REQ_DONTPREP (1 << __REQ_DONTPREP)
246	#define REQ_QUEUED (1 << __REQ_QUEUED)	247	#define REQ_QUEUED (1 << __REQ_QUEUED)
		248	#define REQ_ELVPRIV (1 << __REQ_ELVPRIV)
247	#define REQ_PC (1 << __REQ_PC)	249	#define REQ_PC (1 << __REQ_PC)
248	#define REQ_BLOCK_PC (1 << __REQ_BLOCK_PC)	250	#define REQ_BLOCK_PC (1 << __REQ_BLOCK_PC)
249	#define REQ_SENSE (1 << __REQ_SENSE)	251	#define REQ_SENSE (1 << __REQ_SENSE)
@@ -413,8 +415,6 @@ struct request_queue
413	unsigned int sg_reserved_size;	415	unsigned int sg_reserved_size;
414	int node;	416	int node;
415		417
416	struct list_head drain_list;
417
418	/*	418	/*
419	* reserved for flush operations	419	* reserved for flush operations
420	*/	420	*/
@@ -442,7 +442,7 @@ enum {
442	#define QUEUE_FLAG_DEAD 5 /* queue being torn down */	442	#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
443	#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */	443	#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
444	#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */	444	#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
445	#define QUEUE_FLAG_DRAIN 8 /* draining queue for sched switch */	445	#define QUEUE_FLAG_BYPASS 8 /* don't use elevator, just do FIFO */
446	#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */	446	#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */
447		447
448	#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)	448	#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -668,8 +668,6 @@ extern void blk_dump_rq_flags(struct request *, char *);
668	extern void generic_unplug_device(request_queue_t *);	668	extern void generic_unplug_device(request_queue_t *);
669	extern void __generic_unplug_device(request_queue_t *);	669	extern void __generic_unplug_device(request_queue_t *);
670	extern long nr_blockdev_pages(void);	670	extern long nr_blockdev_pages(void);
671	extern void blk_wait_queue_drained(request_queue_t *, int);
672	extern void blk_finish_queue_drain(request_queue_t *);
673		671
674	int blk_get_queue(request_queue_t *);	672	int blk_get_queue(request_queue_t *);
675	request_queue_t *blk_alloc_queue(int gfp_mask);	673	request_queue_t *blk_alloc_queue(int gfp_mask);