path: root/block
author     Linus Torvalds <torvalds@linux-foundation.org>    2012-01-15 15:24:45 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-01-15 15:24:45 -0500
commit     b3c9dd182ed3bdcdaf0e42625a35924b0497afdc (patch)
tree       ad48ad4d923fee147c736318d0fad35b3755f4f5 /block
parent     83c2f912b43c3a7babbb6cb7ae2a5276c1ed2a3e (diff)
parent     5d381efb3d1f1ef10535a31ca0dd9b22fe1e1922 (diff)
Merge branch 'for-3.3/core' of git://git.kernel.dk/linux-block
* 'for-3.3/core' of git://git.kernel.dk/linux-block: (37 commits)
  Revert "block: recursive merge requests"
  block: Stop using macro stubs for the bio data integrity calls
  blockdev: convert some macros to static inlines
  fs: remove unneeded plug in mpage_readpages()
  block: Add BLKROTATIONAL ioctl
  block: Introduce blk_set_stacking_limits function
  block: remove WARN_ON_ONCE() in exit_io_context()
  block: an exiting task should be allowed to create io_context
  block: ioc_cgroup_changed() needs to be exported
  block: recursive merge requests
  block, cfq: fix empty queue crash caused by request merge
  block, cfq: move icq creation and rq->elv.icq association to block core
  block, cfq: restructure io_cq creation path for io_context interface cleanup
  block, cfq: move io_cq exit/release to blk-ioc.c
  block, cfq: move icq cache management to block core
  block, cfq: move io_cq lookup to blk-ioc.c
  block, cfq: move cfqd->icq_list to request_queue and add request->elv.icq
  block, cfq: reorganize cfq_io_context into generic and cfq specific parts
  block: remove elevator_queue->ops
  block: reorder elevator switch sequence
  ...

Fix up conflicts in:
 - block/blk-cgroup.c
   Switch from can_attach_task to can_attach
 - block/cfq-iosched.c
   conflict with now removed cic index changes (we now use q->id instead)
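
The series above adds a BLKROTATIONAL ioctl (handled in block/ioctl.c and block/compat_ioctl.c, which the diffstat below touches but this page does not expand). A minimal user-space sketch of querying it follows; the unsigned short result and its "0 means non-rotational" meaning are assumptions modeled on the other BLK* query ioctls, not something shown in this diff.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>			/* BLKROTATIONAL */

int main(int argc, char **argv)
{
	unsigned short rotational;	/* assumed result type, see note above */
	int fd;

	if (argc < 2)
		return 1;

	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKROTATIONAL, &rotational) < 0) {
		perror(argv[1]);
		return 1;
	}

	printf("%s: %s\n", argv[1],
	       rotational ? "rotational" : "non-rotational");
	close(fd);
	return 0;
}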
Diffstat (limited to 'block')
-rw-r--r--  block/blk-cgroup.c          11
-rw-r--r--  block/blk-core.c           203
-rw-r--r--  block/blk-exec.c             8
-rw-r--r--  block/blk-ioc.c            485
-rw-r--r--  block/blk-settings.c        32
-rw-r--r--  block/blk-sysfs.c           12
-rw-r--r--  block/blk-throttle.c         4
-rw-r--r--  block/blk.h                 58
-rw-r--r--  block/bsg.c                  4
-rw-r--r--  block/cfq-iosched.c        619
-rw-r--r--  block/compat_ioctl.c         3
-rw-r--r--  block/deadline-iosched.c     4
-rw-r--r--  block/elevator.c           217
-rw-r--r--  block/genhd.c                2
-rw-r--r--  block/ioctl.c                2
-rw-r--r--  block/noop-iosched.c         4
16 files changed, 847 insertions, 821 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8c143d68ee..fa8f2630944 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1655,11 +1655,12 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1655 struct io_context *ioc; 1655 struct io_context *ioc;
1656 1656
1657 cgroup_taskset_for_each(task, cgrp, tset) { 1657 cgroup_taskset_for_each(task, cgrp, tset) {
1658 task_lock(task); 1658 /* we don't lose anything even if ioc allocation fails */
1659 ioc = task->io_context; 1659 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1660 if (ioc) 1660 if (ioc) {
1661 ioc->cgroup_changed = 1; 1661 ioc_cgroup_changed(ioc);
1662 task_unlock(task); 1662 put_io_context(ioc, NULL);
1663 }
1663 } 1664 }
1664} 1665}
1665 1666
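
The hunk above switches blkiocg_attach() to the new counted-reference interface. As a caller sees it, get_task_io_context() returns the task's io_context with its reference count elevated, and the second argument of put_io_context() is only a locking hint (the queue_lock already held, or NULL), per the kerneldoc added in block/blk-ioc.c further down. A hypothetical caller, shown only to spell out that pairing (example_notify_cgroup_change is not a real kernel function):

static void example_notify_cgroup_change(struct task_struct *task)
{
	struct io_context *ioc;

	/* GFP_ATOMIC allocation may fail; losing the notification is tolerated */
	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
	if (!ioc)
		return;

	ioc_cgroup_changed(ioc);	/* sets ICQ_CGROUP_CHANGED on each icq */
	put_io_context(ioc, NULL);	/* NULL: caller holds no queue_lock */
}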
diff --git a/block/blk-core.c b/block/blk-core.c
index 15de223c7f9..e6c05a97ee2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
39EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 39EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
41 41
42DEFINE_IDA(blk_queue_ida);
43
42/* 44/*
43 * For the allocated request tables 45 * For the allocated request tables
44 */ 46 */
@@ -358,7 +360,8 @@ EXPORT_SYMBOL(blk_put_queue);
358void blk_drain_queue(struct request_queue *q, bool drain_all) 360void blk_drain_queue(struct request_queue *q, bool drain_all)
359{ 361{
360 while (true) { 362 while (true) {
361 int nr_rqs; 363 bool drain = false;
364 int i;
362 365
363 spin_lock_irq(q->queue_lock); 366 spin_lock_irq(q->queue_lock);
364 367
@@ -375,14 +378,25 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
375 if (!list_empty(&q->queue_head)) 378 if (!list_empty(&q->queue_head))
376 __blk_run_queue(q); 379 __blk_run_queue(q);
377 380
378 if (drain_all) 381 drain |= q->rq.elvpriv;
379 nr_rqs = q->rq.count[0] + q->rq.count[1]; 382
380 else 383 /*
381 nr_rqs = q->rq.elvpriv; 384 * Unfortunately, requests are queued at and tracked from
385 * multiple places and there's no single counter which can
386 * be drained. Check all the queues and counters.
387 */
388 if (drain_all) {
389 drain |= !list_empty(&q->queue_head);
390 for (i = 0; i < 2; i++) {
391 drain |= q->rq.count[i];
392 drain |= q->in_flight[i];
393 drain |= !list_empty(&q->flush_queue[i]);
394 }
395 }
382 396
383 spin_unlock_irq(q->queue_lock); 397 spin_unlock_irq(q->queue_lock);
384 398
385 if (!nr_rqs) 399 if (!drain)
386 break; 400 break;
387 msleep(10); 401 msleep(10);
388 } 402 }
@@ -469,6 +483,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
469 if (!q) 483 if (!q)
470 return NULL; 484 return NULL;
471 485
486 q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
487 if (q->id < 0)
488 goto fail_q;
489
472 q->backing_dev_info.ra_pages = 490 q->backing_dev_info.ra_pages =
473 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 491 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
474 q->backing_dev_info.state = 0; 492 q->backing_dev_info.state = 0;
@@ -477,20 +495,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
477 q->node = node_id; 495 q->node = node_id;
478 496
479 err = bdi_init(&q->backing_dev_info); 497 err = bdi_init(&q->backing_dev_info);
480 if (err) { 498 if (err)
481 kmem_cache_free(blk_requestq_cachep, q); 499 goto fail_id;
482 return NULL;
483 }
484 500
485 if (blk_throtl_init(q)) { 501 if (blk_throtl_init(q))
486 kmem_cache_free(blk_requestq_cachep, q); 502 goto fail_id;
487 return NULL;
488 }
489 503
490 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
491 laptop_mode_timer_fn, (unsigned long) q); 505 laptop_mode_timer_fn, (unsigned long) q);
492 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
493 INIT_LIST_HEAD(&q->timeout_list); 507 INIT_LIST_HEAD(&q->timeout_list);
508 INIT_LIST_HEAD(&q->icq_list);
494 INIT_LIST_HEAD(&q->flush_queue[0]); 509 INIT_LIST_HEAD(&q->flush_queue[0]);
495 INIT_LIST_HEAD(&q->flush_queue[1]); 510 INIT_LIST_HEAD(&q->flush_queue[1]);
496 INIT_LIST_HEAD(&q->flush_data_in_flight); 511 INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -508,6 +523,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
508 q->queue_lock = &q->__queue_lock; 523 q->queue_lock = &q->__queue_lock;
509 524
510 return q; 525 return q;
526
527fail_id:
528 ida_simple_remove(&blk_queue_ida, q->id);
529fail_q:
530 kmem_cache_free(blk_requestq_cachep, q);
531 return NULL;
511} 532}
512EXPORT_SYMBOL(blk_alloc_queue_node); 533EXPORT_SYMBOL(blk_alloc_queue_node);
513 534
@@ -605,26 +626,31 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
605} 626}
606EXPORT_SYMBOL(blk_init_allocated_queue); 627EXPORT_SYMBOL(blk_init_allocated_queue);
607 628
608int blk_get_queue(struct request_queue *q) 629bool blk_get_queue(struct request_queue *q)
609{ 630{
610 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 631 if (likely(!blk_queue_dead(q))) {
611 kobject_get(&q->kobj); 632 __blk_get_queue(q);
612 return 0; 633 return true;
613 } 634 }
614 635
615 return 1; 636 return false;
616} 637}
617EXPORT_SYMBOL(blk_get_queue); 638EXPORT_SYMBOL(blk_get_queue);
618 639
619static inline void blk_free_request(struct request_queue *q, struct request *rq) 640static inline void blk_free_request(struct request_queue *q, struct request *rq)
620{ 641{
621 if (rq->cmd_flags & REQ_ELVPRIV) 642 if (rq->cmd_flags & REQ_ELVPRIV) {
622 elv_put_request(q, rq); 643 elv_put_request(q, rq);
644 if (rq->elv.icq)
645 put_io_context(rq->elv.icq->ioc, q);
646 }
647
623 mempool_free(rq, q->rq.rq_pool); 648 mempool_free(rq, q->rq.rq_pool);
624} 649}
625 650
626static struct request * 651static struct request *
627blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) 652blk_alloc_request(struct request_queue *q, struct io_cq *icq,
653 unsigned int flags, gfp_t gfp_mask)
628{ 654{
629 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
630 656
@@ -635,10 +661,15 @@ blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
635 661
636 rq->cmd_flags = flags | REQ_ALLOCED; 662 rq->cmd_flags = flags | REQ_ALLOCED;
637 663
638 if ((flags & REQ_ELVPRIV) && 664 if (flags & REQ_ELVPRIV) {
639 unlikely(elv_set_request(q, rq, gfp_mask))) { 665 rq->elv.icq = icq;
640 mempool_free(rq, q->rq.rq_pool); 666 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
641 return NULL; 667 mempool_free(rq, q->rq.rq_pool);
668 return NULL;
669 }
670 /* @rq->elv.icq holds on to io_context until @rq is freed */
671 if (icq)
672 get_io_context(icq->ioc);
642 } 673 }
643 674
644 return rq; 675 return rq;
@@ -750,11 +781,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
750{ 781{
751 struct request *rq = NULL; 782 struct request *rq = NULL;
752 struct request_list *rl = &q->rq; 783 struct request_list *rl = &q->rq;
753 struct io_context *ioc = NULL; 784 struct elevator_type *et;
785 struct io_context *ioc;
786 struct io_cq *icq = NULL;
754 const bool is_sync = rw_is_sync(rw_flags) != 0; 787 const bool is_sync = rw_is_sync(rw_flags) != 0;
788 bool retried = false;
755 int may_queue; 789 int may_queue;
790retry:
791 et = q->elevator->type;
792 ioc = current->io_context;
756 793
757 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 794 if (unlikely(blk_queue_dead(q)))
758 return NULL; 795 return NULL;
759 796
760 may_queue = elv_may_queue(q, rw_flags); 797 may_queue = elv_may_queue(q, rw_flags);
@@ -763,7 +800,20 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
763 800
764 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 801 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
765 if (rl->count[is_sync]+1 >= q->nr_requests) { 802 if (rl->count[is_sync]+1 >= q->nr_requests) {
766 ioc = current_io_context(GFP_ATOMIC, q->node); 803 /*
804 * We want ioc to record batching state. If it's
805 * not already there, creating a new one requires
806 * dropping queue_lock, which in turn requires
807 * retesting conditions to avoid queue hang.
808 */
809 if (!ioc && !retried) {
810 spin_unlock_irq(q->queue_lock);
811 create_io_context(current, gfp_mask, q->node);
812 spin_lock_irq(q->queue_lock);
813 retried = true;
814 goto retry;
815 }
816
767 /* 817 /*
768 * The queue will fill after this allocation, so set 818 * The queue will fill after this allocation, so set
769 * it as full, and mark this process as "batching". 819 * it as full, and mark this process as "batching".
@@ -799,17 +849,36 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
799 rl->count[is_sync]++; 849 rl->count[is_sync]++;
800 rl->starved[is_sync] = 0; 850 rl->starved[is_sync] = 0;
801 851
852 /*
853 * Decide whether the new request will be managed by elevator. If
854 * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
855 * prevent the current elevator from being destroyed until the new
856 * request is freed. This guarantees icq's won't be destroyed and
857 * makes creating new ones safe.
858 *
859 * Also, lookup icq while holding queue_lock. If it doesn't exist,
860 * it will be created after releasing queue_lock.
861 */
802 if (blk_rq_should_init_elevator(bio) && 862 if (blk_rq_should_init_elevator(bio) &&
803 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { 863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
804 rw_flags |= REQ_ELVPRIV; 864 rw_flags |= REQ_ELVPRIV;
805 rl->elvpriv++; 865 rl->elvpriv++;
866 if (et->icq_cache && ioc)
867 icq = ioc_lookup_icq(ioc, q);
806 } 868 }
807 869
808 if (blk_queue_io_stat(q)) 870 if (blk_queue_io_stat(q))
809 rw_flags |= REQ_IO_STAT; 871 rw_flags |= REQ_IO_STAT;
810 spin_unlock_irq(q->queue_lock); 872 spin_unlock_irq(q->queue_lock);
811 873
812 rq = blk_alloc_request(q, rw_flags, gfp_mask); 874 /* create icq if missing */
875 if (unlikely(et->icq_cache && !icq))
876 icq = ioc_create_icq(q, gfp_mask);
877
878 /* rqs are guaranteed to have icq on elv_set_request() if requested */
879 if (likely(!et->icq_cache || icq))
880 rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
881
813 if (unlikely(!rq)) { 882 if (unlikely(!rq)) {
814 /* 883 /*
815 * Allocation failed presumably due to memory. Undo anything 884 * Allocation failed presumably due to memory. Undo anything
@@ -871,10 +940,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
871 rq = get_request(q, rw_flags, bio, GFP_NOIO); 940 rq = get_request(q, rw_flags, bio, GFP_NOIO);
872 while (!rq) { 941 while (!rq) {
873 DEFINE_WAIT(wait); 942 DEFINE_WAIT(wait);
874 struct io_context *ioc;
875 struct request_list *rl = &q->rq; 943 struct request_list *rl = &q->rq;
876 944
877 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 945 if (unlikely(blk_queue_dead(q)))
878 return NULL; 946 return NULL;
879 947
880 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 948 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -891,8 +959,8 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
891 * up to a big batch of them for a small period time. 959 * up to a big batch of them for a small period time.
892 * See ioc_batching, ioc_set_batching 960 * See ioc_batching, ioc_set_batching
893 */ 961 */
894 ioc = current_io_context(GFP_NOIO, q->node); 962 create_io_context(current, GFP_NOIO, q->node);
895 ioc_set_batching(q, ioc); 963 ioc_set_batching(q, current->io_context);
896 964
897 spin_lock_irq(q->queue_lock); 965 spin_lock_irq(q->queue_lock);
898 finish_wait(&rl->wait[is_sync], &wait); 966 finish_wait(&rl->wait[is_sync], &wait);
@@ -1009,54 +1077,6 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1009 __elv_add_request(q, rq, where); 1077 __elv_add_request(q, rq, where);
1010} 1078}
1011 1079
1012/**
1013 * blk_insert_request - insert a special request into a request queue
1014 * @q: request queue where request should be inserted
1015 * @rq: request to be inserted
1016 * @at_head: insert request at head or tail of queue
1017 * @data: private data
1018 *
1019 * Description:
1020 * Many block devices need to execute commands asynchronously, so they don't
1021 * block the whole kernel from preemption during request execution. This is
1022 * accomplished normally by inserting aritficial requests tagged as
1023 * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
1024 * be scheduled for actual execution by the request queue.
1025 *
1026 * We have the option of inserting the head or the tail of the queue.
1027 * Typically we use the tail for new ioctls and so forth. We use the head
1028 * of the queue for things like a QUEUE_FULL message from a device, or a
1029 * host that is unable to accept a particular command.
1030 */
1031void blk_insert_request(struct request_queue *q, struct request *rq,
1032 int at_head, void *data)
1033{
1034 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
1035 unsigned long flags;
1036
1037 /*
1038 * tell I/O scheduler that this isn't a regular read/write (ie it
1039 * must not attempt merges on this) and that it acts as a soft
1040 * barrier
1041 */
1042 rq->cmd_type = REQ_TYPE_SPECIAL;
1043
1044 rq->special = data;
1045
1046 spin_lock_irqsave(q->queue_lock, flags);
1047
1048 /*
1049 * If command is tagged, release the tag
1050 */
1051 if (blk_rq_tagged(rq))
1052 blk_queue_end_tag(q, rq);
1053
1054 add_acct_request(q, rq, where);
1055 __blk_run_queue(q);
1056 spin_unlock_irqrestore(q->queue_lock, flags);
1057}
1058EXPORT_SYMBOL(blk_insert_request);
1059
1060static void part_round_stats_single(int cpu, struct hd_struct *part, 1080static void part_round_stats_single(int cpu, struct hd_struct *part,
1061 unsigned long now) 1081 unsigned long now)
1062{ 1082{
@@ -1766,6 +1786,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1766 return -EIO; 1786 return -EIO;
1767 1787
1768 spin_lock_irqsave(q->queue_lock, flags); 1788 spin_lock_irqsave(q->queue_lock, flags);
1789 if (unlikely(blk_queue_dead(q))) {
1790 spin_unlock_irqrestore(q->queue_lock, flags);
1791 return -ENODEV;
1792 }
1769 1793
1770 /* 1794 /*
1771 * Submitting request must be dequeued before calling this function 1795 * Submitting request must be dequeued before calling this function
@@ -2740,6 +2764,14 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2740 trace_block_unplug(q, depth, !from_schedule); 2764 trace_block_unplug(q, depth, !from_schedule);
2741 2765
2742 /* 2766 /*
2767 * Don't mess with dead queue.
2768 */
2769 if (unlikely(blk_queue_dead(q))) {
2770 spin_unlock(q->queue_lock);
2771 return;
2772 }
2773
2774 /*
2743 * If we are punting this to kblockd, then we can safely drop 2775 * If we are punting this to kblockd, then we can safely drop
2744 * the queue_lock before waking kblockd (which needs to take 2776 * the queue_lock before waking kblockd (which needs to take
2745 * this lock). 2777 * this lock).
@@ -2815,6 +2847,15 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2815 depth = 0; 2847 depth = 0;
2816 spin_lock(q->queue_lock); 2848 spin_lock(q->queue_lock);
2817 } 2849 }
2850
2851 /*
2852 * Short-circuit if @q is dead
2853 */
2854 if (unlikely(blk_queue_dead(q))) {
2855 __blk_end_request_all(rq, -ENODEV);
2856 continue;
2857 }
2858
2818 /* 2859 /*
2819 * rq is already accounted, so use raw insert 2860 * rq is already accounted, so use raw insert
2820 */ 2861 */
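
The repeated QUEUE_FLAG_DEAD conversions above (and in blk-exec.c, blk-sysfs.c, blk-throttle.c and blk.h below) switch to a blk_queue_dead() helper whose definition lives in include/linux/blkdev.h and therefore falls outside this 'block' diffstat. Presumably it simply wraps the open-coded test it replaces, along the lines of:

/* assumed definition, mirroring the test_bit() calls removed above */
#define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)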
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a1ebceb332f..fb2cbd55162 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,7 +50,11 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
50{ 50{
51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52 52
53 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 53 WARN_ON(irqs_disabled());
54 spin_lock_irq(q->queue_lock);
55
56 if (unlikely(blk_queue_dead(q))) {
57 spin_unlock_irq(q->queue_lock);
54 rq->errors = -ENXIO; 58 rq->errors = -ENXIO;
55 if (rq->end_io) 59 if (rq->end_io)
56 rq->end_io(rq, rq->errors); 60 rq->end_io(rq, rq->errors);
@@ -59,8 +63,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
59 63
60 rq->rq_disk = bd_disk; 64 rq->rq_disk = bd_disk;
61 rq->end_io = done; 65 rq->end_io = done;
62 WARN_ON(irqs_disabled());
63 spin_lock_irq(q->queue_lock);
64 __elv_add_request(q, rq, where); 66 __elv_add_request(q, rq, where);
65 __blk_run_queue(q); 67 __blk_run_queue(q);
66 /* the queue is stopped so it won't be run */ 68 /* the queue is stopped so it won't be run */
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 6f9bbd97865..27a06e00eae 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,53 +16,214 @@
16 */ 16 */
17static struct kmem_cache *iocontext_cachep; 17static struct kmem_cache *iocontext_cachep;
18 18
19static void cfq_dtor(struct io_context *ioc) 19/**
20 * get_io_context - increment reference count to io_context
21 * @ioc: io_context to get
22 *
23 * Increment reference count to @ioc.
24 */
25void get_io_context(struct io_context *ioc)
26{
27 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
28 atomic_long_inc(&ioc->refcount);
29}
30EXPORT_SYMBOL(get_io_context);
31
32/*
33 * Releasing ioc may nest into another put_io_context() leading to nested
34 * fast path release. As the ioc's can't be the same, this is okay but
35 * makes lockdep whine. Keep track of nesting and use it as subclass.
36 */
37#ifdef CONFIG_LOCKDEP
38#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0)
39#define ioc_release_depth_inc(q) (q)->ioc_release_depth++
40#define ioc_release_depth_dec(q) (q)->ioc_release_depth--
41#else
42#define ioc_release_depth(q) 0
43#define ioc_release_depth_inc(q) do { } while (0)
44#define ioc_release_depth_dec(q) do { } while (0)
45#endif
46
47static void icq_free_icq_rcu(struct rcu_head *head)
48{
49 struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
50
51 kmem_cache_free(icq->__rcu_icq_cache, icq);
52}
53
54/*
55 * Exit and free an icq. Called with both ioc and q locked.
56 */
57static void ioc_exit_icq(struct io_cq *icq)
20{ 58{
21 if (!hlist_empty(&ioc->cic_list)) { 59 struct io_context *ioc = icq->ioc;
22 struct cfq_io_context *cic; 60 struct request_queue *q = icq->q;
61 struct elevator_type *et = q->elevator->type;
62
63 lockdep_assert_held(&ioc->lock);
64 lockdep_assert_held(q->queue_lock);
65
66 radix_tree_delete(&ioc->icq_tree, icq->q->id);
67 hlist_del_init(&icq->ioc_node);
68 list_del_init(&icq->q_node);
69
70 /*
71 * Both setting lookup hint to and clearing it from @icq are done
72 * under queue_lock. If it's not pointing to @icq now, it never
73 * will. Hint assignment itself can race safely.
74 */
75 if (rcu_dereference_raw(ioc->icq_hint) == icq)
76 rcu_assign_pointer(ioc->icq_hint, NULL);
23 77
24 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, 78 if (et->ops.elevator_exit_icq_fn) {
25 cic_list); 79 ioc_release_depth_inc(q);
26 cic->dtor(ioc); 80 et->ops.elevator_exit_icq_fn(icq);
81 ioc_release_depth_dec(q);
27 } 82 }
83
84 /*
85 * @icq->q might have gone away by the time RCU callback runs
86 * making it impossible to determine icq_cache. Record it in @icq.
87 */
88 icq->__rcu_icq_cache = et->icq_cache;
89 call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
28} 90}
29 91
30/* 92/*
31 * IO Context helper functions. put_io_context() returns 1 if there are no 93 * Slow path for ioc release in put_io_context(). Performs double-lock
32 * more users of this io context, 0 otherwise. 94 * dancing to unlink all icq's and then frees ioc.
33 */ 95 */
34int put_io_context(struct io_context *ioc) 96static void ioc_release_fn(struct work_struct *work)
35{ 97{
36 if (ioc == NULL) 98 struct io_context *ioc = container_of(work, struct io_context,
37 return 1; 99 release_work);
100 struct request_queue *last_q = NULL;
38 101
39 BUG_ON(atomic_long_read(&ioc->refcount) == 0); 102 spin_lock_irq(&ioc->lock);
40 103
41 if (atomic_long_dec_and_test(&ioc->refcount)) { 104 while (!hlist_empty(&ioc->icq_list)) {
42 rcu_read_lock(); 105 struct io_cq *icq = hlist_entry(ioc->icq_list.first,
43 cfq_dtor(ioc); 106 struct io_cq, ioc_node);
44 rcu_read_unlock(); 107 struct request_queue *this_q = icq->q;
45 108
46 kmem_cache_free(iocontext_cachep, ioc); 109 if (this_q != last_q) {
47 return 1; 110 /*
111 * Need to switch to @this_q. Once we release
112 * @ioc->lock, it can go away along with @cic.
113 * Hold on to it.
114 */
115 __blk_get_queue(this_q);
116
117 /*
118 * blk_put_queue() might sleep thanks to kobject
119 * idiocy. Always release both locks, put and
120 * restart.
121 */
122 if (last_q) {
123 spin_unlock(last_q->queue_lock);
124 spin_unlock_irq(&ioc->lock);
125 blk_put_queue(last_q);
126 } else {
127 spin_unlock_irq(&ioc->lock);
128 }
129
130 last_q = this_q;
131 spin_lock_irq(this_q->queue_lock);
132 spin_lock(&ioc->lock);
133 continue;
134 }
135 ioc_exit_icq(icq);
48 } 136 }
49 return 0; 137
138 if (last_q) {
139 spin_unlock(last_q->queue_lock);
140 spin_unlock_irq(&ioc->lock);
141 blk_put_queue(last_q);
142 } else {
143 spin_unlock_irq(&ioc->lock);
144 }
145
146 kmem_cache_free(iocontext_cachep, ioc);
50} 147}
51EXPORT_SYMBOL(put_io_context);
52 148
53static void cfq_exit(struct io_context *ioc) 149/**
150 * put_io_context - put a reference of io_context
151 * @ioc: io_context to put
152 * @locked_q: request_queue the caller is holding queue_lock of (hint)
153 *
154 * Decrement reference count of @ioc and release it if the count reaches
155 * zero. If the caller is holding queue_lock of a queue, it can indicate
156 * that with @locked_q. This is an optimization hint and the caller is
157 * allowed to pass in %NULL even when it's holding a queue_lock.
158 */
159void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
54{ 160{
55 rcu_read_lock(); 161 struct request_queue *last_q = locked_q;
162 unsigned long flags;
163
164 if (ioc == NULL)
165 return;
166
167 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
168 if (locked_q)
169 lockdep_assert_held(locked_q->queue_lock);
56 170
57 if (!hlist_empty(&ioc->cic_list)) { 171 if (!atomic_long_dec_and_test(&ioc->refcount))
58 struct cfq_io_context *cic; 172 return;
173
174 /*
175 * Destroy @ioc. This is a bit messy because icq's are chained
176 * from both ioc and queue, and ioc->lock nests inside queue_lock.
177 * The inner ioc->lock should be held to walk our icq_list and then
178 * for each icq the outer matching queue_lock should be grabbed.
179 * ie. We need to do reverse-order double lock dancing.
180 *
181 * Another twist is that we are often called with one of the
182 * matching queue_locks held as indicated by @locked_q, which
183 * prevents performing double-lock dance for other queues.
184 *
185 * So, we do it in two stages. The fast path uses the queue_lock
186 * the caller is holding and, if other queues need to be accessed,
187 * uses trylock to avoid introducing locking dependency. This can
188 * handle most cases, especially if @ioc was performing IO on only
189 * single device.
190 *
191 * If trylock doesn't cut it, we defer to @ioc->release_work which
192 * can do all the double-locking dancing.
193 */
194 spin_lock_irqsave_nested(&ioc->lock, flags,
195 ioc_release_depth(locked_q));
59 196
60 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, 197 while (!hlist_empty(&ioc->icq_list)) {
61 cic_list); 198 struct io_cq *icq = hlist_entry(ioc->icq_list.first,
62 cic->exit(ioc); 199 struct io_cq, ioc_node);
200 struct request_queue *this_q = icq->q;
201
202 if (this_q != last_q) {
203 if (last_q && last_q != locked_q)
204 spin_unlock(last_q->queue_lock);
205 last_q = NULL;
206
207 if (!spin_trylock(this_q->queue_lock))
208 break;
209 last_q = this_q;
210 continue;
211 }
212 ioc_exit_icq(icq);
63 } 213 }
64 rcu_read_unlock(); 214
215 if (last_q && last_q != locked_q)
216 spin_unlock(last_q->queue_lock);
217
218 spin_unlock_irqrestore(&ioc->lock, flags);
219
220 /* if no icq is left, we're done; otherwise, kick release_work */
221 if (hlist_empty(&ioc->icq_list))
222 kmem_cache_free(iocontext_cachep, ioc);
223 else
224 schedule_work(&ioc->release_work);
65} 225}
226EXPORT_SYMBOL(put_io_context);
66 227
67/* Called by the exiting task */ 228/* Called by the exiting task */
68void exit_io_context(struct task_struct *task) 229void exit_io_context(struct task_struct *task)
@@ -74,86 +235,240 @@ void exit_io_context(struct task_struct *task)
74 task->io_context = NULL; 235 task->io_context = NULL;
75 task_unlock(task); 236 task_unlock(task);
76 237
77 if (atomic_dec_and_test(&ioc->nr_tasks)) 238 atomic_dec(&ioc->nr_tasks);
78 cfq_exit(ioc); 239 put_io_context(ioc, NULL);
240}
241
242/**
243 * ioc_clear_queue - break any ioc association with the specified queue
244 * @q: request_queue being cleared
245 *
246 * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
247 */
248void ioc_clear_queue(struct request_queue *q)
249{
250 lockdep_assert_held(q->queue_lock);
251
252 while (!list_empty(&q->icq_list)) {
253 struct io_cq *icq = list_entry(q->icq_list.next,
254 struct io_cq, q_node);
255 struct io_context *ioc = icq->ioc;
79 256
80 put_io_context(ioc); 257 spin_lock(&ioc->lock);
258 ioc_exit_icq(icq);
259 spin_unlock(&ioc->lock);
260 }
81} 261}
82 262
83struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 263void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
264 int node)
84{ 265{
85 struct io_context *ioc; 266 struct io_context *ioc;
86 267
87 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 268 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
88 if (ioc) { 269 node);
89 atomic_long_set(&ioc->refcount, 1); 270 if (unlikely(!ioc))
90 atomic_set(&ioc->nr_tasks, 1); 271 return;
91 spin_lock_init(&ioc->lock);
92 ioc->ioprio_changed = 0;
93 ioc->ioprio = 0;
94 ioc->last_waited = 0; /* doesn't matter... */
95 ioc->nr_batch_requests = 0; /* because this is 0 */
96 INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
97 INIT_HLIST_HEAD(&ioc->cic_list);
98 ioc->ioc_data = NULL;
99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100 ioc->cgroup_changed = 0;
101#endif
102 }
103 272
104 return ioc; 273 /* initialize */
274 atomic_long_set(&ioc->refcount, 1);
275 atomic_set(&ioc->nr_tasks, 1);
276 spin_lock_init(&ioc->lock);
277 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
278 INIT_HLIST_HEAD(&ioc->icq_list);
279 INIT_WORK(&ioc->release_work, ioc_release_fn);
280
281 /*
282 * Try to install. ioc shouldn't be installed if someone else
283 * already did or @task, which isn't %current, is exiting. Note
284 * that we need to allow ioc creation on exiting %current as exit
285 * path may issue IOs from e.g. exit_files(). The exit path is
286 * responsible for not issuing IO after exit_io_context().
287 */
288 task_lock(task);
289 if (!task->io_context &&
290 (task == current || !(task->flags & PF_EXITING)))
291 task->io_context = ioc;
292 else
293 kmem_cache_free(iocontext_cachep, ioc);
294 task_unlock(task);
105} 295}
106 296
107/* 297/**
108 * If the current task has no IO context then create one and initialise it. 298 * get_task_io_context - get io_context of a task
109 * Otherwise, return its existing IO context. 299 * @task: task of interest
300 * @gfp_flags: allocation flags, used if allocation is necessary
301 * @node: allocation node, used if allocation is necessary
110 * 302 *
111 * This returned IO context doesn't have a specifically elevated refcount, 303 * Return io_context of @task. If it doesn't exist, it is created with
112 * but since the current task itself holds a reference, the context can be 304 * @gfp_flags and @node. The returned io_context has its reference count
113 * used in general code, so long as it stays within `current` context. 305 * incremented.
306 *
307 * This function always goes through task_lock() and it's better to use
308 * %current->io_context + get_io_context() for %current.
114 */ 309 */
115struct io_context *current_io_context(gfp_t gfp_flags, int node) 310struct io_context *get_task_io_context(struct task_struct *task,
311 gfp_t gfp_flags, int node)
116{ 312{
117 struct task_struct *tsk = current; 313 struct io_context *ioc;
118 struct io_context *ret;
119
120 ret = tsk->io_context;
121 if (likely(ret))
122 return ret;
123
124 ret = alloc_io_context(gfp_flags, node);
125 if (ret) {
126 /* make sure set_task_ioprio() sees the settings above */
127 smp_wmb();
128 tsk->io_context = ret;
129 }
130 314
131 return ret; 315 might_sleep_if(gfp_flags & __GFP_WAIT);
316
317 do {
318 task_lock(task);
319 ioc = task->io_context;
320 if (likely(ioc)) {
321 get_io_context(ioc);
322 task_unlock(task);
323 return ioc;
324 }
325 task_unlock(task);
326 } while (create_io_context(task, gfp_flags, node));
327
328 return NULL;
132} 329}
330EXPORT_SYMBOL(get_task_io_context);
133 331
134/* 332/**
135 * If the current task has no IO context then create one and initialise it. 333 * ioc_lookup_icq - lookup io_cq from ioc
136 * If it does have a context, take a ref on it. 334 * @ioc: the associated io_context
335 * @q: the associated request_queue
137 * 336 *
138 * This is always called in the context of the task which submitted the I/O. 337 * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
338 * with @q->queue_lock held.
139 */ 339 */
140struct io_context *get_io_context(gfp_t gfp_flags, int node) 340struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
141{ 341{
142 struct io_context *ioc = NULL; 342 struct io_cq *icq;
343
344 lockdep_assert_held(q->queue_lock);
143 345
144 /* 346 /*
145 * Check for unlikely race with exiting task. ioc ref count is 347 * icq's are indexed from @ioc using radix tree and hint pointer,
146 * zero when ioc is being detached. 348 * both of which are protected with RCU. All removals are done
349 * holding both q and ioc locks, and we're holding q lock - if we
350 * find a icq which points to us, it's guaranteed to be valid.
147 */ 351 */
148 do { 352 rcu_read_lock();
149 ioc = current_io_context(gfp_flags, node); 353 icq = rcu_dereference(ioc->icq_hint);
150 if (unlikely(!ioc)) 354 if (icq && icq->q == q)
151 break; 355 goto out;
152 } while (!atomic_long_inc_not_zero(&ioc->refcount));
153 356
154 return ioc; 357 icq = radix_tree_lookup(&ioc->icq_tree, q->id);
358 if (icq && icq->q == q)
359 rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
360 else
361 icq = NULL;
362out:
363 rcu_read_unlock();
364 return icq;
155} 365}
156EXPORT_SYMBOL(get_io_context); 366EXPORT_SYMBOL(ioc_lookup_icq);
367
368/**
369 * ioc_create_icq - create and link io_cq
370 * @q: request_queue of interest
371 * @gfp_mask: allocation mask
372 *
373 * Make sure io_cq linking %current->io_context and @q exists. If either
374 * io_context and/or icq don't exist, they will be created using @gfp_mask.
375 *
376 * The caller is responsible for ensuring @ioc won't go away and @q is
377 * alive and will stay alive until this function returns.
378 */
379struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
380{
381 struct elevator_type *et = q->elevator->type;
382 struct io_context *ioc;
383 struct io_cq *icq;
384
385 /* allocate stuff */
386 ioc = create_io_context(current, gfp_mask, q->node);
387 if (!ioc)
388 return NULL;
389
390 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
391 q->node);
392 if (!icq)
393 return NULL;
394
395 if (radix_tree_preload(gfp_mask) < 0) {
396 kmem_cache_free(et->icq_cache, icq);
397 return NULL;
398 }
399
400 icq->ioc = ioc;
401 icq->q = q;
402 INIT_LIST_HEAD(&icq->q_node);
403 INIT_HLIST_NODE(&icq->ioc_node);
404
405 /* lock both q and ioc and try to link @icq */
406 spin_lock_irq(q->queue_lock);
407 spin_lock(&ioc->lock);
408
409 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
410 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
411 list_add(&icq->q_node, &q->icq_list);
412 if (et->ops.elevator_init_icq_fn)
413 et->ops.elevator_init_icq_fn(icq);
414 } else {
415 kmem_cache_free(et->icq_cache, icq);
416 icq = ioc_lookup_icq(ioc, q);
417 if (!icq)
418 printk(KERN_ERR "cfq: icq link failed!\n");
419 }
420
421 spin_unlock(&ioc->lock);
422 spin_unlock_irq(q->queue_lock);
423 radix_tree_preload_end();
424 return icq;
425}
426
427void ioc_set_changed(struct io_context *ioc, int which)
428{
429 struct io_cq *icq;
430 struct hlist_node *n;
431
432 hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
433 set_bit(which, &icq->changed);
434}
435
436/**
437 * ioc_ioprio_changed - notify ioprio change
438 * @ioc: io_context of interest
439 * @ioprio: new ioprio
440 *
441 * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
442 * icq's. iosched is responsible for checking the bit and applying it on
443 * request issue path.
444 */
445void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
446{
447 unsigned long flags;
448
449 spin_lock_irqsave(&ioc->lock, flags);
450 ioc->ioprio = ioprio;
451 ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
452 spin_unlock_irqrestore(&ioc->lock, flags);
453}
454
455/**
456 * ioc_cgroup_changed - notify cgroup change
457 * @ioc: io_context of interest
458 *
459 * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
460 * iosched is responsible for checking the bit and applying it on request
461 * issue path.
462 */
463void ioc_cgroup_changed(struct io_context *ioc)
464{
465 unsigned long flags;
466
467 spin_lock_irqsave(&ioc->lock, flags);
468 ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
469 spin_unlock_irqrestore(&ioc->lock, flags);
470}
471EXPORT_SYMBOL(ioc_cgroup_changed);
157 472
158static int __init blk_ioc_init(void) 473static int __init blk_ioc_init(void)
159{ 474{
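
The ioc_ioprio_changed()/ioc_cgroup_changed() kerneldoc above states that the iosched is responsible for checking the changed bits on the request issue path. A rough sketch of that consumer side in kernel context; example_iosched_set_request() is illustrative only, and cfq's actual handling, which appears further down in this merge, may differ in detail:

static void example_iosched_set_request(struct io_cq *icq)
{
	if (likely(!icq->changed))
		return;

	if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &icq->changed)) {
		/* re-read icq->ioc->ioprio and reclassify the queues */
	}
	if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &icq->changed)) {
		/* drop any cached group association so it is looked up again */
	}
}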
diff --git a/block/blk-settings.c b/block/blk-settings.c
index fa1eb0449a0..d3234fc494a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,9 +104,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
104 * @lim: the queue_limits structure to reset 104 * @lim: the queue_limits structure to reset
105 * 105 *
106 * Description: 106 * Description:
107 * Returns a queue_limit struct to its default state. Can be used by 107 * Returns a queue_limit struct to its default state.
108 * stacking drivers like DM that stage table swaps and reuse an
109 * existing device queue.
110 */ 108 */
111void blk_set_default_limits(struct queue_limits *lim) 109void blk_set_default_limits(struct queue_limits *lim)
112{ 110{
@@ -114,13 +112,12 @@ void blk_set_default_limits(struct queue_limits *lim)
114 lim->max_integrity_segments = 0; 112 lim->max_integrity_segments = 0;
115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
117 lim->max_sectors = BLK_DEF_MAX_SECTORS; 115 lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
118 lim->max_hw_sectors = INT_MAX;
119 lim->max_discard_sectors = 0; 116 lim->max_discard_sectors = 0;
120 lim->discard_granularity = 0; 117 lim->discard_granularity = 0;
121 lim->discard_alignment = 0; 118 lim->discard_alignment = 0;
122 lim->discard_misaligned = 0; 119 lim->discard_misaligned = 0;
123 lim->discard_zeroes_data = 1; 120 lim->discard_zeroes_data = 0;
124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 121 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 122 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
126 lim->alignment_offset = 0; 123 lim->alignment_offset = 0;
@@ -131,6 +128,27 @@ void blk_set_default_limits(struct queue_limits *lim)
131EXPORT_SYMBOL(blk_set_default_limits); 128EXPORT_SYMBOL(blk_set_default_limits);
132 129
133/** 130/**
131 * blk_set_stacking_limits - set default limits for stacking devices
132 * @lim: the queue_limits structure to reset
133 *
134 * Description:
135 * Returns a queue_limit struct to its default state. Should be used
136 * by stacking drivers like DM that have no internal limits.
137 */
138void blk_set_stacking_limits(struct queue_limits *lim)
139{
140 blk_set_default_limits(lim);
141
142 /* Inherit limits from component devices */
143 lim->discard_zeroes_data = 1;
144 lim->max_segments = USHRT_MAX;
145 lim->max_hw_sectors = UINT_MAX;
146
147 lim->max_sectors = BLK_DEF_MAX_SECTORS;
148}
149EXPORT_SYMBOL(blk_set_stacking_limits);
150
151/**
134 * blk_queue_make_request - define an alternate make_request function for a device 152 * blk_queue_make_request - define an alternate make_request function for a device
135 * @q: the request queue for the device to be affected 153 * @q: the request queue for the device to be affected
136 * @mfn: the alternate make_request function 154 * @mfn: the alternate make_request function
@@ -165,8 +183,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
165 q->nr_batching = BLK_BATCH_REQ; 183 q->nr_batching = BLK_BATCH_REQ;
166 184
167 blk_set_default_limits(&q->limits); 185 blk_set_default_limits(&q->limits);
168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169 q->limits.discard_zeroes_data = 0;
170 186
171 /* 187 /*
172 * by default assume old behaviour and bounce for any highmem page 188 * by default assume old behaviour and bounce for any highmem page
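
blk_set_stacking_limits() above starts a stacking driver out with permissive limits so that the component devices' limits dominate. A sketch of the intended usage in kernel context; the dm/md call sites are outside this 'block' diffstat, and example_build_stacked_limits() is illustrative only, assuming the long-standing blk_stack_limits() and get_start_sect() helpers:

static void example_build_stacked_limits(struct queue_limits *lim,
					 struct block_device **parts, int n)
{
	int i;

	blk_set_stacking_limits(lim);	/* permissive defaults, no limits of our own */

	for (i = 0; i < n; i++) {
		/* fold in each component; its start sector is used for alignment checks */
		if (blk_stack_limits(lim, &bdev_get_queue(parts[i])->limits,
				     get_start_sect(parts[i])))
			pr_warn("component %d is misaligned\n", i);
	}
}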
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e7f9f657f10..cf150011d80 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
425 if (!entry->show) 425 if (!entry->show)
426 return -EIO; 426 return -EIO;
427 mutex_lock(&q->sysfs_lock); 427 mutex_lock(&q->sysfs_lock);
428 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 428 if (blk_queue_dead(q)) {
429 mutex_unlock(&q->sysfs_lock); 429 mutex_unlock(&q->sysfs_lock);
430 return -ENOENT; 430 return -ENOENT;
431 } 431 }
@@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
447 447
448 q = container_of(kobj, struct request_queue, kobj); 448 q = container_of(kobj, struct request_queue, kobj);
449 mutex_lock(&q->sysfs_lock); 449 mutex_lock(&q->sysfs_lock);
450 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 450 if (blk_queue_dead(q)) {
451 mutex_unlock(&q->sysfs_lock); 451 mutex_unlock(&q->sysfs_lock);
452 return -ENOENT; 452 return -ENOENT;
453 } 453 }
@@ -479,8 +479,12 @@ static void blk_release_queue(struct kobject *kobj)
479 479
480 blk_sync_queue(q); 480 blk_sync_queue(q);
481 481
482 if (q->elevator) 482 if (q->elevator) {
483 spin_lock_irq(q->queue_lock);
484 ioc_clear_queue(q);
485 spin_unlock_irq(q->queue_lock);
483 elevator_exit(q->elevator); 486 elevator_exit(q->elevator);
487 }
484 488
485 blk_throtl_exit(q); 489 blk_throtl_exit(q);
486 490
@@ -494,6 +498,8 @@ static void blk_release_queue(struct kobject *kobj)
494 blk_trace_shutdown(q); 498 blk_trace_shutdown(q);
495 499
496 bdi_destroy(&q->backing_dev_info); 500 bdi_destroy(&q->backing_dev_info);
501
502 ida_simple_remove(&blk_queue_ida, q->id);
497 kmem_cache_free(blk_requestq_cachep, q); 503 kmem_cache_free(blk_requestq_cachep, q);
498} 504}
499 505
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4553245d931..5eed6a76721 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
310 struct request_queue *q = td->queue; 310 struct request_queue *q = td->queue;
311 311
312 /* no throttling for dead queue */ 312 /* no throttling for dead queue */
313 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 313 if (unlikely(blk_queue_dead(q)))
314 return NULL; 314 return NULL;
315 315
316 rcu_read_lock(); 316 rcu_read_lock();
@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
335 spin_lock_irq(q->queue_lock); 335 spin_lock_irq(q->queue_lock);
336 336
337 /* Make sure @q is still alive */ 337 /* Make sure @q is still alive */
338 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 338 if (unlikely(blk_queue_dead(q))) {
339 kfree(tg); 339 kfree(tg);
340 return NULL; 340 return NULL;
341 } 341 }
diff --git a/block/blk.h b/block/blk.h
index 3f6551b3c92..7efd772336d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -1,6 +1,8 @@
1#ifndef BLK_INTERNAL_H 1#ifndef BLK_INTERNAL_H
2#define BLK_INTERNAL_H 2#define BLK_INTERNAL_H
3 3
4#include <linux/idr.h>
5
4/* Amount of time in which a process may batch requests */ 6/* Amount of time in which a process may batch requests */
5#define BLK_BATCH_TIME (HZ/50UL) 7#define BLK_BATCH_TIME (HZ/50UL)
6 8
@@ -9,6 +11,12 @@
9 11
10extern struct kmem_cache *blk_requestq_cachep; 12extern struct kmem_cache *blk_requestq_cachep;
11extern struct kobj_type blk_queue_ktype; 13extern struct kobj_type blk_queue_ktype;
14extern struct ida blk_queue_ida;
15
16static inline void __blk_get_queue(struct request_queue *q)
17{
18 kobject_get(&q->kobj);
19}
12 20
13void init_request_from_bio(struct request *req, struct bio *bio); 21void init_request_from_bio(struct request *req, struct bio *bio);
14void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 22void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
@@ -85,8 +93,8 @@ static inline struct request *__elv_next_request(struct request_queue *q)
85 q->flush_queue_delayed = 1; 93 q->flush_queue_delayed = 1;
86 return NULL; 94 return NULL;
87 } 95 }
88 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || 96 if (unlikely(blk_queue_dead(q)) ||
89 !q->elevator->ops->elevator_dispatch_fn(q, 0)) 97 !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
90 return NULL; 98 return NULL;
91 } 99 }
92} 100}
@@ -95,16 +103,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
95{ 103{
96 struct elevator_queue *e = q->elevator; 104 struct elevator_queue *e = q->elevator;
97 105
98 if (e->ops->elevator_activate_req_fn) 106 if (e->type->ops.elevator_activate_req_fn)
99 e->ops->elevator_activate_req_fn(q, rq); 107 e->type->ops.elevator_activate_req_fn(q, rq);
100} 108}
101 109
102static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) 110static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
103{ 111{
104 struct elevator_queue *e = q->elevator; 112 struct elevator_queue *e = q->elevator;
105 113
106 if (e->ops->elevator_deactivate_req_fn) 114 if (e->type->ops.elevator_deactivate_req_fn)
107 e->ops->elevator_deactivate_req_fn(q, rq); 115 e->type->ops.elevator_deactivate_req_fn(q, rq);
108} 116}
109 117
110#ifdef CONFIG_FAIL_IO_TIMEOUT 118#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -119,8 +127,6 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
119} 127}
120#endif 128#endif
121 129
122struct io_context *current_io_context(gfp_t gfp_flags, int node);
123
124int ll_back_merge_fn(struct request_queue *q, struct request *req, 130int ll_back_merge_fn(struct request_queue *q, struct request *req,
125 struct bio *bio); 131 struct bio *bio);
126int ll_front_merge_fn(struct request_queue *q, struct request *req, 132int ll_front_merge_fn(struct request_queue *q, struct request *req,
@@ -189,6 +195,42 @@ static inline int blk_do_io_stat(struct request *rq)
189 (rq->cmd_flags & REQ_DISCARD)); 195 (rq->cmd_flags & REQ_DISCARD));
190} 196}
191 197
198/*
199 * Internal io_context interface
200 */
201void get_io_context(struct io_context *ioc);
202struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
203struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
204void ioc_clear_queue(struct request_queue *q);
205
206void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
207 int node);
208
209/**
210 * create_io_context - try to create task->io_context
211 * @task: target task
212 * @gfp_mask: allocation mask
213 * @node: allocation node
214 *
215 * If @task->io_context is %NULL, allocate a new io_context and install it.
216 * Returns the current @task->io_context which may be %NULL if allocation
217 * failed.
218 *
219 * Note that this function can't be called with IRQ disabled because
220 * task_lock which protects @task->io_context is IRQ-unsafe.
221 */
222static inline struct io_context *create_io_context(struct task_struct *task,
223 gfp_t gfp_mask, int node)
224{
225 WARN_ON_ONCE(irqs_disabled());
226 if (unlikely(!task->io_context))
227 create_io_context_slowpath(task, gfp_mask, node);
228 return task->io_context;
229}
230
231/*
232 * Internal throttling interface
233 */
192#ifdef CONFIG_BLK_DEV_THROTTLING 234#ifdef CONFIG_BLK_DEV_THROTTLING
193extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); 235extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
194extern void blk_throtl_drain(struct request_queue *q); 236extern void blk_throtl_drain(struct request_queue *q);
diff --git a/block/bsg.c b/block/bsg.c
index 9651ec7b87c..4cf703fd98b 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -769,12 +769,10 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
769 struct file *file) 769 struct file *file)
770{ 770{
771 struct bsg_device *bd; 771 struct bsg_device *bd;
772 int ret;
773#ifdef BSG_DEBUG 772#ifdef BSG_DEBUG
774 unsigned char buf[32]; 773 unsigned char buf[32];
775#endif 774#endif
776 ret = blk_get_queue(rq); 775 if (!blk_get_queue(rq))
777 if (ret)
778 return ERR_PTR(-ENXIO); 776 return ERR_PTR(-ENXIO);
779 777
780 bd = bsg_alloc_device(); 778 bd = bsg_alloc_device();
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3548705b04e..163263ddd38 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,7 @@
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/ioprio.h> 15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk.h"
17#include "cfq.h" 18#include "cfq.h"
18 19
19/* 20/*
@@ -53,20 +54,11 @@ static const int cfq_hist_divisor = 4;
53#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 54#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 55#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 56
56#define RQ_CIC(rq) \ 57#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
57 ((struct cfq_io_context *) (rq)->elevator_private[0]) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool;
63
64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65static struct completion *ioc_gone;
66static DEFINE_SPINLOCK(ioc_gone_lock);
67
68static DEFINE_SPINLOCK(cic_index_lock);
69static DEFINE_IDA(cic_index_ida);
70 62
71#define CFQ_PRIO_LISTS IOPRIO_BE_NR 63#define CFQ_PRIO_LISTS IOPRIO_BE_NR
72#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 64#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -75,6 +67,14 @@ static DEFINE_IDA(cic_index_ida);
75#define sample_valid(samples) ((samples) > 80) 67#define sample_valid(samples) ((samples) > 80)
76#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
77 69
70struct cfq_ttime {
71 unsigned long last_end_request;
72
73 unsigned long ttime_total;
74 unsigned long ttime_samples;
75 unsigned long ttime_mean;
76};
77
78/* 78/*
79 * Most of our rbtree usage is for sorting with min extraction, so 79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree 80 * if we cache the leftmost node we don't have to walk down the tree
@@ -216,6 +216,12 @@ struct cfq_group {
216 struct cfq_ttime ttime; 216 struct cfq_ttime ttime;
217}; 217};
218 218
219struct cfq_io_cq {
220 struct io_cq icq; /* must be the first member */
221 struct cfq_queue *cfqq[2];
222 struct cfq_ttime ttime;
223};
224
219/* 225/*
220 * Per block device queue structure 226 * Per block device queue structure
221 */ 227 */
@@ -267,7 +273,7 @@ struct cfq_data {
267 struct work_struct unplug_work; 273 struct work_struct unplug_work;
268 274
269 struct cfq_queue *active_queue; 275 struct cfq_queue *active_queue;
270 struct cfq_io_context *active_cic; 276 struct cfq_io_cq *active_cic;
271 277
272 /* 278 /*
273 * async queue for each priority case 279 * async queue for each priority case
@@ -290,9 +296,6 @@ struct cfq_data {
290 unsigned int cfq_group_idle; 296 unsigned int cfq_group_idle;
291 unsigned int cfq_latency; 297 unsigned int cfq_latency;
292 298
293 unsigned int cic_index;
294 struct list_head cic_list;
295
296 /* 299 /*
297 * Fallback dummy cfqq for extreme OOM conditions 300 * Fallback dummy cfqq for extreme OOM conditions
298 */ 301 */
@@ -464,37 +467,35 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
464static void cfq_dispatch_insert(struct request_queue *, struct request *); 467static void cfq_dispatch_insert(struct request_queue *, struct request *);
465static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 468static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
466 struct io_context *, gfp_t); 469 struct io_context *, gfp_t);
467static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
468 struct io_context *);
469 470
470static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 471static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
471 bool is_sync)
472{ 472{
473 return cic->cfqq[is_sync]; 473 /* cic->icq is the first member, %NULL will convert to %NULL */
474 return container_of(icq, struct cfq_io_cq, icq);
474} 475}
475 476
476static inline void cic_set_cfqq(struct cfq_io_context *cic, 477static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
477 struct cfq_queue *cfqq, bool is_sync) 478 struct io_context *ioc)
478{ 479{
479 cic->cfqq[is_sync] = cfqq; 480 if (ioc)
481 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
482 return NULL;
480} 483}
481 484
482#define CIC_DEAD_KEY 1ul 485static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
483#define CIC_DEAD_INDEX_SHIFT 1
484
485static inline void *cfqd_dead_key(struct cfq_data *cfqd)
486{ 486{
487 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 487 return cic->cfqq[is_sync];
488} 488}
489 489
490static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 490static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
491 bool is_sync)
491{ 492{
492 struct cfq_data *cfqd = cic->key; 493 cic->cfqq[is_sync] = cfqq;
493 494}
494 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
495 return NULL;
496 495
497 return cfqd; 496static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
497{
498 return cic->icq.q->elevator->elevator_data;
498} 499}
499 500
500/* 501/*
@@ -1561,7 +1562,7 @@ static struct request *
1561cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1562cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1562{ 1563{
1563 struct task_struct *tsk = current; 1564 struct task_struct *tsk = current;
1564 struct cfq_io_context *cic; 1565 struct cfq_io_cq *cic;
1565 struct cfq_queue *cfqq; 1566 struct cfq_queue *cfqq;
1566 1567
1567 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1568 cic = cfq_cic_lookup(cfqd, tsk->io_context);
@@ -1687,7 +1688,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1687 struct bio *bio) 1688 struct bio *bio)
1688{ 1689{
1689 struct cfq_data *cfqd = q->elevator->elevator_data; 1690 struct cfq_data *cfqd = q->elevator->elevator_data;
1690 struct cfq_io_context *cic; 1691 struct cfq_io_cq *cic;
1691 struct cfq_queue *cfqq; 1692 struct cfq_queue *cfqq;
1692 1693
1693 /* 1694 /*
@@ -1697,12 +1698,19 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1697 return false; 1698 return false;
1698 1699
1699 /* 1700 /*
1700 * Lookup the cfqq that this bio will be queued with. Allow 1701 * Lookup the cfqq that this bio will be queued with and allow
1701 * merge only if rq is queued there. 1702 * merge only if rq is queued there. This function can be called
1703 * from plug merge without queue_lock. In such cases, ioc of @rq
1704 * and %current are guaranteed to be equal. Avoid lookup which
1705 * requires queue_lock by using @rq's cic.
1702 */ 1706 */
1703 cic = cfq_cic_lookup(cfqd, current->io_context); 1707 if (current->io_context == RQ_CIC(rq)->icq.ioc) {
1704 if (!cic) 1708 cic = RQ_CIC(rq);
1705 return false; 1709 } else {
1710 cic = cfq_cic_lookup(cfqd, current->io_context);
1711 if (!cic)
1712 return false;
1713 }
1706 1714
1707 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1715 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1708 return cfqq == RQ_CFQQ(rq); 1716 return cfqq == RQ_CFQQ(rq);
@@ -1786,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1786 cfqd->active_queue = NULL; 1794 cfqd->active_queue = NULL;
1787 1795
1788 if (cfqd->active_cic) { 1796 if (cfqd->active_cic) {
1789 put_io_context(cfqd->active_cic->ioc); 1797 put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue);
1790 cfqd->active_cic = NULL; 1798 cfqd->active_cic = NULL;
1791 } 1799 }
1792} 1800}
@@ -2006,7 +2014,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2006static void cfq_arm_slice_timer(struct cfq_data *cfqd) 2014static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2007{ 2015{
2008 struct cfq_queue *cfqq = cfqd->active_queue; 2016 struct cfq_queue *cfqq = cfqd->active_queue;
2009 struct cfq_io_context *cic; 2017 struct cfq_io_cq *cic;
2010 unsigned long sl, group_idle = 0; 2018 unsigned long sl, group_idle = 0;
2011 2019
2012 /* 2020 /*
@@ -2041,7 +2049,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2041 * task has exited, don't wait 2049 * task has exited, don't wait
2042 */ 2050 */
2043 cic = cfqd->active_cic; 2051 cic = cfqd->active_cic;
2044 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 2052 if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
2045 return; 2053 return;
2046 2054
2047 /* 2055 /*
@@ -2592,9 +2600,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2592 cfq_dispatch_insert(cfqd->queue, rq); 2600 cfq_dispatch_insert(cfqd->queue, rq);
2593 2601
2594 if (!cfqd->active_cic) { 2602 if (!cfqd->active_cic) {
2595 struct cfq_io_context *cic = RQ_CIC(rq); 2603 struct cfq_io_cq *cic = RQ_CIC(rq);
2596 2604
2597 atomic_long_inc(&cic->ioc->refcount); 2605 atomic_long_inc(&cic->icq.ioc->refcount);
2598 cfqd->active_cic = cic; 2606 cfqd->active_cic = cic;
2599 } 2607 }
2600 2608
@@ -2677,84 +2685,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2677 cfq_put_cfqg(cfqg); 2685 cfq_put_cfqg(cfqg);
2678} 2686}
2679 2687
2680/*
2681 * Call func for each cic attached to this ioc.
2682 */
2683static void
2684call_for_each_cic(struct io_context *ioc,
2685 void (*func)(struct io_context *, struct cfq_io_context *))
2686{
2687 struct cfq_io_context *cic;
2688 struct hlist_node *n;
2689
2690 rcu_read_lock();
2691
2692 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2693 func(ioc, cic);
2694
2695 rcu_read_unlock();
2696}
2697
2698static void cfq_cic_free_rcu(struct rcu_head *head)
2699{
2700 struct cfq_io_context *cic;
2701
2702 cic = container_of(head, struct cfq_io_context, rcu_head);
2703
2704 kmem_cache_free(cfq_ioc_pool, cic);
2705 elv_ioc_count_dec(cfq_ioc_count);
2706
2707 if (ioc_gone) {
2708 /*
2709 * CFQ scheduler is exiting, grab exit lock and check
2710 * the pending io context count. If it hits zero,
2711 * complete ioc_gone and set it back to NULL
2712 */
2713 spin_lock(&ioc_gone_lock);
2714 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2715 complete(ioc_gone);
2716 ioc_gone = NULL;
2717 }
2718 spin_unlock(&ioc_gone_lock);
2719 }
2720}
2721
2722static void cfq_cic_free(struct cfq_io_context *cic)
2723{
2724 call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2725}
2726
2727static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2728{
2729 unsigned long flags;
2730 unsigned long dead_key = (unsigned long) cic->key;
2731
2732 BUG_ON(!(dead_key & CIC_DEAD_KEY));
2733
2734 spin_lock_irqsave(&ioc->lock, flags);
2735 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2736 hlist_del_rcu(&cic->cic_list);
2737 spin_unlock_irqrestore(&ioc->lock, flags);
2738
2739 cfq_cic_free(cic);
2740}
2741
2742/*
2743 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2744 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2745 * and ->trim() which is called with the task lock held
2746 */
2747static void cfq_free_io_context(struct io_context *ioc)
2748{
2749 /*
2750 * ioc->refcount is zero here, or we are called from elv_unregister(),
2751 * so no more cic's are allowed to be linked into this ioc. So it
2752 * should be ok to iterate over the known list, we will see all cic's
2753 * since no new ones are added.
2754 */
2755 call_for_each_cic(ioc, cic_free_func);
2756}
2757
2758static void cfq_put_cooperator(struct cfq_queue *cfqq) 2688static void cfq_put_cooperator(struct cfq_queue *cfqq)
2759{ 2689{
2760 struct cfq_queue *__cfqq, *next; 2690 struct cfq_queue *__cfqq, *next;
@@ -2788,27 +2718,17 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2788 cfq_put_queue(cfqq); 2718 cfq_put_queue(cfqq);
2789} 2719}
2790 2720
2791static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2721static void cfq_init_icq(struct io_cq *icq)
2792 struct cfq_io_context *cic)
2793{ 2722{
2794 struct io_context *ioc = cic->ioc; 2723 struct cfq_io_cq *cic = icq_to_cic(icq);
2795
2796 list_del_init(&cic->queue_list);
2797 2724
2798 /* 2725 cic->ttime.last_end_request = jiffies;
2799 * Make sure dead mark is seen for dead queues 2726}
2800 */
2801 smp_wmb();
2802 cic->key = cfqd_dead_key(cfqd);
2803 2727
2804 rcu_read_lock(); 2728static void cfq_exit_icq(struct io_cq *icq)
2805 if (rcu_dereference(ioc->ioc_data) == cic) { 2729{
2806 rcu_read_unlock(); 2730 struct cfq_io_cq *cic = icq_to_cic(icq);
2807 spin_lock(&ioc->lock); 2731 struct cfq_data *cfqd = cic_to_cfqd(cic);
2808 rcu_assign_pointer(ioc->ioc_data, NULL);
2809 spin_unlock(&ioc->lock);
2810 } else
2811 rcu_read_unlock();
2812 2732
2813 if (cic->cfqq[BLK_RW_ASYNC]) { 2733 if (cic->cfqq[BLK_RW_ASYNC]) {
2814 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2734 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2821,57 +2741,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2821 } 2741 }
2822} 2742}
2823 2743
2824static void cfq_exit_single_io_context(struct io_context *ioc,
2825 struct cfq_io_context *cic)
2826{
2827 struct cfq_data *cfqd = cic_to_cfqd(cic);
2828
2829 if (cfqd) {
2830 struct request_queue *q = cfqd->queue;
2831 unsigned long flags;
2832
2833 spin_lock_irqsave(q->queue_lock, flags);
2834
2835 /*
2836 * Ensure we get a fresh copy of the ->key to prevent
2837 * race between exiting task and queue
2838 */
2839 smp_read_barrier_depends();
2840 if (cic->key == cfqd)
2841 __cfq_exit_single_io_context(cfqd, cic);
2842
2843 spin_unlock_irqrestore(q->queue_lock, flags);
2844 }
2845}
2846
2847/*
2848 * The process that ioc belongs to has exited, we need to clean up
2849 * and put the internal structures we have that belongs to that process.
2850 */
2851static void cfq_exit_io_context(struct io_context *ioc)
2852{
2853 call_for_each_cic(ioc, cfq_exit_single_io_context);
2854}
2855
2856static struct cfq_io_context *
2857cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2858{
2859 struct cfq_io_context *cic;
2860
2861 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2862 cfqd->queue->node);
2863 if (cic) {
2864 cic->ttime.last_end_request = jiffies;
2865 INIT_LIST_HEAD(&cic->queue_list);
2866 INIT_HLIST_NODE(&cic->cic_list);
2867 cic->dtor = cfq_free_io_context;
2868 cic->exit = cfq_exit_io_context;
2869 elv_ioc_count_inc(cfq_ioc_count);
2870 }
2871
2872 return cic;
2873}
2874
2875static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2744static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2876{ 2745{
2877 struct task_struct *tsk = current; 2746 struct task_struct *tsk = current;
@@ -2914,21 +2783,18 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2914 cfq_clear_cfqq_prio_changed(cfqq); 2783 cfq_clear_cfqq_prio_changed(cfqq);
2915} 2784}
2916 2785
2917static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2786static void changed_ioprio(struct cfq_io_cq *cic)
2918{ 2787{
2919 struct cfq_data *cfqd = cic_to_cfqd(cic); 2788 struct cfq_data *cfqd = cic_to_cfqd(cic);
2920 struct cfq_queue *cfqq; 2789 struct cfq_queue *cfqq;
2921 unsigned long flags;
2922 2790
2923 if (unlikely(!cfqd)) 2791 if (unlikely(!cfqd))
2924 return; 2792 return;
2925 2793
2926 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2927
2928 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2794 cfqq = cic->cfqq[BLK_RW_ASYNC];
2929 if (cfqq) { 2795 if (cfqq) {
2930 struct cfq_queue *new_cfqq; 2796 struct cfq_queue *new_cfqq;
2931 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2797 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
2932 GFP_ATOMIC); 2798 GFP_ATOMIC);
2933 if (new_cfqq) { 2799 if (new_cfqq) {
2934 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2800 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
@@ -2939,14 +2805,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2939 cfqq = cic->cfqq[BLK_RW_SYNC]; 2805 cfqq = cic->cfqq[BLK_RW_SYNC];
2940 if (cfqq) 2806 if (cfqq)
2941 cfq_mark_cfqq_prio_changed(cfqq); 2807 cfq_mark_cfqq_prio_changed(cfqq);
2942
2943 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2944}
2945
2946static void cfq_ioc_set_ioprio(struct io_context *ioc)
2947{
2948 call_for_each_cic(ioc, changed_ioprio);
2949 ioc->ioprio_changed = 0;
2950} 2808}
2951 2809
2952static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2810static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2970,11 +2828,10 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2970} 2828}
2971 2829
2972#ifdef CONFIG_CFQ_GROUP_IOSCHED 2830#ifdef CONFIG_CFQ_GROUP_IOSCHED
2973static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2831static void changed_cgroup(struct cfq_io_cq *cic)
2974{ 2832{
2975 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2833 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2976 struct cfq_data *cfqd = cic_to_cfqd(cic); 2834 struct cfq_data *cfqd = cic_to_cfqd(cic);
2977 unsigned long flags;
2978 struct request_queue *q; 2835 struct request_queue *q;
2979 2836
2980 if (unlikely(!cfqd)) 2837 if (unlikely(!cfqd))
@@ -2982,8 +2839,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2982 2839
2983 q = cfqd->queue; 2840 q = cfqd->queue;
2984 2841
2985 spin_lock_irqsave(q->queue_lock, flags);
2986
2987 if (sync_cfqq) { 2842 if (sync_cfqq) {
2988 /* 2843 /*
2989 * Drop reference to sync queue. A new sync queue will be 2844 * Drop reference to sync queue. A new sync queue will be
@@ -2993,14 +2848,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2993 cic_set_cfqq(cic, NULL, 1); 2848 cic_set_cfqq(cic, NULL, 1);
2994 cfq_put_queue(sync_cfqq); 2849 cfq_put_queue(sync_cfqq);
2995 } 2850 }
2996
2997 spin_unlock_irqrestore(q->queue_lock, flags);
2998}
2999
3000static void cfq_ioc_set_cgroup(struct io_context *ioc)
3001{
3002 call_for_each_cic(ioc, changed_cgroup);
3003 ioc->cgroup_changed = 0;
3004} 2851}
3005#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2852#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3006 2853
@@ -3009,7 +2856,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
3009 struct io_context *ioc, gfp_t gfp_mask) 2856 struct io_context *ioc, gfp_t gfp_mask)
3010{ 2857{
3011 struct cfq_queue *cfqq, *new_cfqq = NULL; 2858 struct cfq_queue *cfqq, *new_cfqq = NULL;
3012 struct cfq_io_context *cic; 2859 struct cfq_io_cq *cic;
3013 struct cfq_group *cfqg; 2860 struct cfq_group *cfqg;
3014 2861
3015retry: 2862retry:
@@ -3100,160 +2947,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
3100 return cfqq; 2947 return cfqq;
3101} 2948}
3102 2949
3103/*
3104 * We drop cfq io contexts lazily, so we may find a dead one.
3105 */
3106static void
3107cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
3108 struct cfq_io_context *cic)
3109{
3110 unsigned long flags;
3111
3112 WARN_ON(!list_empty(&cic->queue_list));
3113 BUG_ON(cic->key != cfqd_dead_key(cfqd));
3114
3115 spin_lock_irqsave(&ioc->lock, flags);
3116
3117 BUG_ON(rcu_dereference_check(ioc->ioc_data,
3118 lockdep_is_held(&ioc->lock)) == cic);
3119
3120 radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
3121 hlist_del_rcu(&cic->cic_list);
3122 spin_unlock_irqrestore(&ioc->lock, flags);
3123
3124 cfq_cic_free(cic);
3125}
3126
3127static struct cfq_io_context *
3128cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
3129{
3130 struct cfq_io_context *cic;
3131 unsigned long flags;
3132
3133 if (unlikely(!ioc))
3134 return NULL;
3135
3136 rcu_read_lock();
3137
3138 /*
3139 * we maintain a last-hit cache, to avoid browsing over the tree
3140 */
3141 cic = rcu_dereference(ioc->ioc_data);
3142 if (cic && cic->key == cfqd) {
3143 rcu_read_unlock();
3144 return cic;
3145 }
3146
3147 do {
3148 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3149 rcu_read_unlock();
3150 if (!cic)
3151 break;
3152 if (unlikely(cic->key != cfqd)) {
3153 cfq_drop_dead_cic(cfqd, ioc, cic);
3154 rcu_read_lock();
3155 continue;
3156 }
3157
3158 spin_lock_irqsave(&ioc->lock, flags);
3159 rcu_assign_pointer(ioc->ioc_data, cic);
3160 spin_unlock_irqrestore(&ioc->lock, flags);
3161 break;
3162 } while (1);
3163
3164 return cic;
3165}
3166
3167/*
3168 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3169 * the process specific cfq io context when entered from the block layer.
3170 * Also adds the cic to a per-cfqd list, used when this queue is removed.
3171 */
3172static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3173 struct cfq_io_context *cic, gfp_t gfp_mask)
3174{
3175 unsigned long flags;
3176 int ret;
3177
3178 ret = radix_tree_preload(gfp_mask);
3179 if (!ret) {
3180 cic->ioc = ioc;
3181 cic->key = cfqd;
3182
3183 spin_lock_irqsave(&ioc->lock, flags);
3184 ret = radix_tree_insert(&ioc->radix_root,
3185 cfqd->cic_index, cic);
3186 if (!ret)
3187 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3188 spin_unlock_irqrestore(&ioc->lock, flags);
3189
3190 radix_tree_preload_end();
3191
3192 if (!ret) {
3193 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3194 list_add(&cic->queue_list, &cfqd->cic_list);
3195 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3196 }
3197 }
3198
3199 if (ret && ret != -EEXIST)
3200 printk(KERN_ERR "cfq: cic link failed!\n");
3201
3202 return ret;
3203}
3204
3205/*
3206 * Setup general io context and cfq io context. There can be several cfq
3207 * io contexts per general io context, if this process is doing io to more
3208 * than one device managed by cfq.
3209 */
3210static struct cfq_io_context *
3211cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3212{
3213 struct io_context *ioc = NULL;
3214 struct cfq_io_context *cic;
3215 int ret;
3216
3217 might_sleep_if(gfp_mask & __GFP_WAIT);
3218
3219 ioc = get_io_context(gfp_mask, cfqd->queue->node);
3220 if (!ioc)
3221 return NULL;
3222
3223retry:
3224 cic = cfq_cic_lookup(cfqd, ioc);
3225 if (cic)
3226 goto out;
3227
3228 cic = cfq_alloc_io_context(cfqd, gfp_mask);
3229 if (cic == NULL)
3230 goto err;
3231
3232 ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
3233 if (ret == -EEXIST) {
3234 /* someone has linked cic to ioc already */
3235 cfq_cic_free(cic);
3236 goto retry;
3237 } else if (ret)
3238 goto err_free;
3239
3240out:
3241 smp_read_barrier_depends();
3242 if (unlikely(ioc->ioprio_changed))
3243 cfq_ioc_set_ioprio(ioc);
3244
3245#ifdef CONFIG_CFQ_GROUP_IOSCHED
3246 if (unlikely(ioc->cgroup_changed))
3247 cfq_ioc_set_cgroup(ioc);
3248#endif
3249 return cic;
3250err_free:
3251 cfq_cic_free(cic);
3252err:
3253 put_io_context(ioc);
3254 return NULL;
3255}
3256
3257static void 2950static void
3258__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 2951__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3259{ 2952{
@@ -3267,7 +2960,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3267 2960
3268static void 2961static void
3269cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2962cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3270 struct cfq_io_context *cic) 2963 struct cfq_io_cq *cic)
3271{ 2964{
3272 if (cfq_cfqq_sync(cfqq)) { 2965 if (cfq_cfqq_sync(cfqq)) {
3273 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 2966 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
@@ -3305,7 +2998,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3305 */ 2998 */
3306static void 2999static void
3307cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3000cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3308 struct cfq_io_context *cic) 3001 struct cfq_io_cq *cic)
3309{ 3002{
3310 int old_idle, enable_idle; 3003 int old_idle, enable_idle;
3311 3004
@@ -3322,8 +3015,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3322 3015
3323 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3016 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3324 enable_idle = 0; 3017 enable_idle = 0;
3325 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3018 else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
3326 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3019 !cfqd->cfq_slice_idle ||
3020 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3327 enable_idle = 0; 3021 enable_idle = 0;
3328 else if (sample_valid(cic->ttime.ttime_samples)) { 3022 else if (sample_valid(cic->ttime.ttime_samples)) {
3329 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3023 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
@@ -3455,7 +3149,7 @@ static void
3455cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3149cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3456 struct request *rq) 3150 struct request *rq)
3457{ 3151{
3458 struct cfq_io_context *cic = RQ_CIC(rq); 3152 struct cfq_io_cq *cic = RQ_CIC(rq);
3459 3153
3460 cfqd->rq_queued++; 3154 cfqd->rq_queued++;
3461 if (rq->cmd_flags & REQ_PRIO) 3155 if (rq->cmd_flags & REQ_PRIO)
@@ -3508,7 +3202,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3508 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3202 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3509 3203
3510 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3204 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3511 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3205 cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
3512 3206
3513 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3207 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3514 list_add_tail(&rq->queuelist, &cfqq->fifo); 3208 list_add_tail(&rq->queuelist, &cfqq->fifo);
@@ -3558,7 +3252,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
3558 3252
3559static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3253static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3560{ 3254{
3561 struct cfq_io_context *cic = cfqd->active_cic; 3255 struct cfq_io_cq *cic = cfqd->active_cic;
3562 3256
3563 /* If the queue already has requests, don't wait */ 3257 /* If the queue already has requests, don't wait */
3564 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3258 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3695,7 +3389,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3695{ 3389{
3696 struct cfq_data *cfqd = q->elevator->elevator_data; 3390 struct cfq_data *cfqd = q->elevator->elevator_data;
3697 struct task_struct *tsk = current; 3391 struct task_struct *tsk = current;
3698 struct cfq_io_context *cic; 3392 struct cfq_io_cq *cic;
3699 struct cfq_queue *cfqq; 3393 struct cfq_queue *cfqq;
3700 3394
3701 /* 3395 /*
@@ -3710,7 +3404,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3710 3404
3711 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3405 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3712 if (cfqq) { 3406 if (cfqq) {
3713 cfq_init_prio_data(cfqq, cic->ioc); 3407 cfq_init_prio_data(cfqq, cic->icq.ioc);
3714 3408
3715 return __cfq_may_queue(cfqq); 3409 return __cfq_may_queue(cfqq);
3716 } 3410 }
@@ -3731,21 +3425,17 @@ static void cfq_put_request(struct request *rq)
3731 BUG_ON(!cfqq->allocated[rw]); 3425 BUG_ON(!cfqq->allocated[rw]);
3732 cfqq->allocated[rw]--; 3426 cfqq->allocated[rw]--;
3733 3427
3734 put_io_context(RQ_CIC(rq)->ioc);
3735
3736 rq->elevator_private[0] = NULL;
3737 rq->elevator_private[1] = NULL;
3738
3739 /* Put down rq reference on cfqg */ 3428 /* Put down rq reference on cfqg */
3740 cfq_put_cfqg(RQ_CFQG(rq)); 3429 cfq_put_cfqg(RQ_CFQG(rq));
3741 rq->elevator_private[2] = NULL; 3430 rq->elv.priv[0] = NULL;
3431 rq->elv.priv[1] = NULL;
3742 3432
3743 cfq_put_queue(cfqq); 3433 cfq_put_queue(cfqq);
3744 } 3434 }
3745} 3435}
3746 3436
3747static struct cfq_queue * 3437static struct cfq_queue *
3748cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3438cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3749 struct cfq_queue *cfqq) 3439 struct cfq_queue *cfqq)
3750{ 3440{
3751 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3441 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
@@ -3760,7 +3450,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3760 * was the last process referring to said cfqq. 3450 * was the last process referring to said cfqq.
3761 */ 3451 */
3762static struct cfq_queue * 3452static struct cfq_queue *
3763split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3453split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3764{ 3454{
3765 if (cfqq_process_refs(cfqq) == 1) { 3455 if (cfqq_process_refs(cfqq) == 1) {
3766 cfqq->pid = current->pid; 3456 cfqq->pid = current->pid;
@@ -3783,25 +3473,29 @@ static int
3783cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3473cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3784{ 3474{
3785 struct cfq_data *cfqd = q->elevator->elevator_data; 3475 struct cfq_data *cfqd = q->elevator->elevator_data;
3786 struct cfq_io_context *cic; 3476 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3787 const int rw = rq_data_dir(rq); 3477 const int rw = rq_data_dir(rq);
3788 const bool is_sync = rq_is_sync(rq); 3478 const bool is_sync = rq_is_sync(rq);
3789 struct cfq_queue *cfqq; 3479 struct cfq_queue *cfqq;
3790 unsigned long flags;
3791 3480
3792 might_sleep_if(gfp_mask & __GFP_WAIT); 3481 might_sleep_if(gfp_mask & __GFP_WAIT);
3793 3482
3794 cic = cfq_get_io_context(cfqd, gfp_mask); 3483 spin_lock_irq(q->queue_lock);
3795
3796 spin_lock_irqsave(q->queue_lock, flags);
3797 3484
3798 if (!cic) 3485 /* handle changed notifications */
3799 goto queue_fail; 3486 if (unlikely(cic->icq.changed)) {
3487 if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
3488 changed_ioprio(cic);
3489#ifdef CONFIG_CFQ_GROUP_IOSCHED
3490 if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
3491 changed_cgroup(cic);
3492#endif
3493 }
3800 3494
3801new_queue: 3495new_queue:
3802 cfqq = cic_to_cfqq(cic, is_sync); 3496 cfqq = cic_to_cfqq(cic, is_sync);
3803 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3497 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3804 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3498 cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
3805 cic_set_cfqq(cic, cfqq, is_sync); 3499 cic_set_cfqq(cic, cfqq, is_sync);
3806 } else { 3500 } else {
3807 /* 3501 /*
@@ -3827,17 +3521,10 @@ new_queue:
3827 cfqq->allocated[rw]++; 3521 cfqq->allocated[rw]++;
3828 3522
3829 cfqq->ref++; 3523 cfqq->ref++;
3830 rq->elevator_private[0] = cic; 3524 rq->elv.priv[0] = cfqq;
3831 rq->elevator_private[1] = cfqq; 3525 rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
3832 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); 3526 spin_unlock_irq(q->queue_lock);
3833 spin_unlock_irqrestore(q->queue_lock, flags);
3834 return 0; 3527 return 0;
3835
3836queue_fail:
3837 cfq_schedule_dispatch(cfqd);
3838 spin_unlock_irqrestore(q->queue_lock, flags);
3839 cfq_log(cfqd, "set_request fail");
3840 return 1;
3841} 3528}
3842 3529
3843static void cfq_kick_queue(struct work_struct *work) 3530static void cfq_kick_queue(struct work_struct *work)
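
Worth noting in the cfq_set_request() conversion above: the scheduler no longer allocates or looks up its io context here; the block core has already attached it to the request as rq->elv.icq before the set_request hook runs. The RQ_CIC() uses elsewhere in this diff presumably reduce to the container_of chain sketched below (the helper name is invented for illustration):

	static inline struct cfq_io_cq *rq_to_cic(struct request *rq)
	{
		/* rq->elv.icq is populated by the block core before set_request runs */
		return icq_to_cic(rq->elv.icq);
	}
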
@@ -3941,14 +3628,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
3941 if (cfqd->active_queue) 3628 if (cfqd->active_queue)
3942 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3629 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3943 3630
3944 while (!list_empty(&cfqd->cic_list)) {
3945 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3946 struct cfq_io_context,
3947 queue_list);
3948
3949 __cfq_exit_single_io_context(cfqd, cic);
3950 }
3951
3952 cfq_put_async_queues(cfqd); 3631 cfq_put_async_queues(cfqd);
3953 cfq_release_cfq_groups(cfqd); 3632 cfq_release_cfq_groups(cfqd);
3954 3633
@@ -3963,10 +3642,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
3963 3642
3964 cfq_shutdown_timer_wq(cfqd); 3643 cfq_shutdown_timer_wq(cfqd);
3965 3644
3966 spin_lock(&cic_index_lock);
3967 ida_remove(&cic_index_ida, cfqd->cic_index);
3968 spin_unlock(&cic_index_lock);
3969
3970 /* 3645 /*
3971 * Wait for cfqg->blkg->key accessors to exit their grace periods. 3646 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3972 * Do this wait only if there are other unlinked groups out 3647 * Do this wait only if there are other unlinked groups out
@@ -3988,24 +3663,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
3988 kfree(cfqd); 3663 kfree(cfqd);
3989} 3664}
3990 3665
3991static int cfq_alloc_cic_index(void)
3992{
3993 int index, error;
3994
3995 do {
3996 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3997 return -ENOMEM;
3998
3999 spin_lock(&cic_index_lock);
4000 error = ida_get_new(&cic_index_ida, &index);
4001 spin_unlock(&cic_index_lock);
4002 if (error && error != -EAGAIN)
4003 return error;
4004 } while (error);
4005
4006 return index;
4007}
4008
4009static void *cfq_init_queue(struct request_queue *q) 3666static void *cfq_init_queue(struct request_queue *q)
4010{ 3667{
4011 struct cfq_data *cfqd; 3668 struct cfq_data *cfqd;
@@ -4013,23 +3670,9 @@ static void *cfq_init_queue(struct request_queue *q)
4013 struct cfq_group *cfqg; 3670 struct cfq_group *cfqg;
4014 struct cfq_rb_root *st; 3671 struct cfq_rb_root *st;
4015 3672
4016 i = cfq_alloc_cic_index();
4017 if (i < 0)
4018 return NULL;
4019
4020 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3673 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4021 if (!cfqd) { 3674 if (!cfqd)
4022 spin_lock(&cic_index_lock);
4023 ida_remove(&cic_index_ida, i);
4024 spin_unlock(&cic_index_lock);
4025 return NULL; 3675 return NULL;
4026 }
4027
4028 /*
4029 * Don't need take queue_lock in the routine, since we are
4030 * initializing the ioscheduler, and nobody is using cfqd
4031 */
4032 cfqd->cic_index = i;
4033 3676
4034 /* Init root service tree */ 3677 /* Init root service tree */
4035 cfqd->grp_service_tree = CFQ_RB_ROOT; 3678 cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4055,11 +3698,6 @@ static void *cfq_init_queue(struct request_queue *q)
4055 3698
4056 if (blkio_alloc_blkg_stats(&cfqg->blkg)) { 3699 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4057 kfree(cfqg); 3700 kfree(cfqg);
4058
4059 spin_lock(&cic_index_lock);
4060 ida_remove(&cic_index_ida, cfqd->cic_index);
4061 spin_unlock(&cic_index_lock);
4062
4063 kfree(cfqd); 3701 kfree(cfqd);
4064 return NULL; 3702 return NULL;
4065 } 3703 }
@@ -4091,8 +3729,6 @@ static void *cfq_init_queue(struct request_queue *q)
4091 cfqd->oom_cfqq.ref++; 3729 cfqd->oom_cfqq.ref++;
4092 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3730 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
4093 3731
4094 INIT_LIST_HEAD(&cfqd->cic_list);
4095
4096 cfqd->queue = q; 3732 cfqd->queue = q;
4097 3733
4098 init_timer(&cfqd->idle_slice_timer); 3734 init_timer(&cfqd->idle_slice_timer);
@@ -4121,34 +3757,6 @@ static void *cfq_init_queue(struct request_queue *q)
4121 return cfqd; 3757 return cfqd;
4122} 3758}
4123 3759
4124static void cfq_slab_kill(void)
4125{
4126 /*
4127 * Caller already ensured that pending RCU callbacks are completed,
4128 * so we should have no busy allocations at this point.
4129 */
4130 if (cfq_pool)
4131 kmem_cache_destroy(cfq_pool);
4132 if (cfq_ioc_pool)
4133 kmem_cache_destroy(cfq_ioc_pool);
4134}
4135
4136static int __init cfq_slab_setup(void)
4137{
4138 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4139 if (!cfq_pool)
4140 goto fail;
4141
4142 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
4143 if (!cfq_ioc_pool)
4144 goto fail;
4145
4146 return 0;
4147fail:
4148 cfq_slab_kill();
4149 return -ENOMEM;
4150}
4151
4152/* 3760/*
4153 * sysfs parts below --> 3761 * sysfs parts below -->
4154 */ 3762 */
@@ -4254,15 +3862,18 @@ static struct elevator_type iosched_cfq = {
4254 .elevator_completed_req_fn = cfq_completed_request, 3862 .elevator_completed_req_fn = cfq_completed_request,
4255 .elevator_former_req_fn = elv_rb_former_request, 3863 .elevator_former_req_fn = elv_rb_former_request,
4256 .elevator_latter_req_fn = elv_rb_latter_request, 3864 .elevator_latter_req_fn = elv_rb_latter_request,
3865 .elevator_init_icq_fn = cfq_init_icq,
3866 .elevator_exit_icq_fn = cfq_exit_icq,
4257 .elevator_set_req_fn = cfq_set_request, 3867 .elevator_set_req_fn = cfq_set_request,
4258 .elevator_put_req_fn = cfq_put_request, 3868 .elevator_put_req_fn = cfq_put_request,
4259 .elevator_may_queue_fn = cfq_may_queue, 3869 .elevator_may_queue_fn = cfq_may_queue,
4260 .elevator_init_fn = cfq_init_queue, 3870 .elevator_init_fn = cfq_init_queue,
4261 .elevator_exit_fn = cfq_exit_queue, 3871 .elevator_exit_fn = cfq_exit_queue,
4262 .trim = cfq_free_io_context,
4263 }, 3872 },
3873 .icq_size = sizeof(struct cfq_io_cq),
3874 .icq_align = __alignof__(struct cfq_io_cq),
4264 .elevator_attrs = cfq_attrs, 3875 .elevator_attrs = cfq_attrs,
4265 .elevator_name = "cfq", 3876 .elevator_name = "cfq",
4266 .elevator_owner = THIS_MODULE, 3877 .elevator_owner = THIS_MODULE,
4267}; 3878};
4268 3879
@@ -4280,6 +3891,8 @@ static struct blkio_policy_type blkio_policy_cfq;
4280 3891
4281static int __init cfq_init(void) 3892static int __init cfq_init(void)
4282{ 3893{
3894 int ret;
3895
4283 /* 3896 /*
4284 * could be 0 on HZ < 1000 setups 3897 * could be 0 on HZ < 1000 setups
4285 */ 3898 */
@@ -4294,10 +3907,16 @@ static int __init cfq_init(void)
4294#else 3907#else
4295 cfq_group_idle = 0; 3908 cfq_group_idle = 0;
4296#endif 3909#endif
4297 if (cfq_slab_setup()) 3910 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3911 if (!cfq_pool)
4298 return -ENOMEM; 3912 return -ENOMEM;
4299 3913
4300 elv_register(&iosched_cfq); 3914 ret = elv_register(&iosched_cfq);
3915 if (ret) {
3916 kmem_cache_destroy(cfq_pool);
3917 return ret;
3918 }
3919
4301 blkio_policy_register(&blkio_policy_cfq); 3920 blkio_policy_register(&blkio_policy_cfq);
4302 3921
4303 return 0; 3922 return 0;
@@ -4305,21 +3924,9 @@ static int __init cfq_init(void)
4305 3924
4306static void __exit cfq_exit(void) 3925static void __exit cfq_exit(void)
4307{ 3926{
4308 DECLARE_COMPLETION_ONSTACK(all_gone);
4309 blkio_policy_unregister(&blkio_policy_cfq); 3927 blkio_policy_unregister(&blkio_policy_cfq);
4310 elv_unregister(&iosched_cfq); 3928 elv_unregister(&iosched_cfq);
4311 ioc_gone = &all_gone; 3929 kmem_cache_destroy(cfq_pool);
4312 /* ioc_gone's update must be visible before reading ioc_count */
4313 smp_wmb();
4314
4315 /*
4316 * this also protects us from entering cfq_slab_kill() with
4317 * pending RCU callbacks
4318 */
4319 if (elv_ioc_count_read(cfq_ioc_count))
4320 wait_for_completion(&all_gone);
4321 ida_destroy(&cic_index_ida);
4322 cfq_slab_kill();
4323} 3930}
4324 3931
4325module_init(cfq_init); 3932module_init(cfq_init);
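
The cfq-iosched.c changes above show the whole io_cq contract in one place: the scheduler embeds struct io_cq in its own per-(io_context, request_queue) structure, advertises that structure's size and alignment through its elevator_type, and supplies init/exit callbacks, while the block core owns allocation, lookup and RCU freeing. A minimal sketch of that shape for a hypothetical scheduler named foo (every foo_* identifier is invented here; the embedded io_cq is kept as the first member, as cfq_io_cq appears to do):

	struct foo_io_cq {
		struct io_cq	icq;		/* assumed first member, mirroring cfq_io_cq */
		unsigned long	last_end;	/* scheduler-private per-context state */
	};

	static inline struct foo_io_cq *icq_to_fic(struct io_cq *icq)
	{
		return container_of(icq, struct foo_io_cq, icq);
	}

	static void foo_init_icq(struct io_cq *icq)
	{
		icq_to_fic(icq)->last_end = jiffies;	/* mirrors cfq_init_icq() above */
	}

	static void foo_exit_icq(struct io_cq *icq)
	{
		/* drop any queue-side objects the container points at, as cfq_exit_icq() does */
	}

	static struct elevator_type iosched_foo = {
		.ops = {
			.elevator_init_icq_fn	= foo_init_icq,
			.elevator_exit_icq_fn	= foo_exit_icq,
			/* remaining elevator_*_fn hooks omitted */
		},
		.icq_size	= sizeof(struct foo_io_cq),
		.icq_align	= __alignof__(struct foo_io_cq),
		.elevator_name	= "foo",
		.elevator_owner	= THIS_MODULE,
	};
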
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7b725020823..7c668c8a6f9 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -719,6 +719,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
719 case BLKSECTGET: 719 case BLKSECTGET:
720 return compat_put_ushort(arg, 720 return compat_put_ushort(arg,
721 queue_max_sectors(bdev_get_queue(bdev))); 721 queue_max_sectors(bdev_get_queue(bdev)));
722 case BLKROTATIONAL:
723 return compat_put_ushort(arg,
724 !blk_queue_nonrot(bdev_get_queue(bdev)));
722 case BLKRASET: /* compatible, but no compat_ptr (!) */ 725 case BLKRASET: /* compatible, but no compat_ptr (!) */
723 case BLKFRASET: 726 case BLKFRASET:
724 if (!capable(CAP_SYS_ADMIN)) 727 if (!capable(CAP_SYS_ADMIN))
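
The compat handler above, like the native blkdev_ioctl() case added later in this diff, reports rotational state as an unsigned short: 1 for spinning media, 0 for non-rotational devices. A minimal userspace sketch, assuming a kernel and uapi headers that carry the new BLKROTATIONAL definition:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* BLKROTATIONAL */

	int main(int argc, char **argv)
	{
		unsigned short rotational;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, BLKROTATIONAL, &rotational) < 0) {
			perror("BLKROTATIONAL");
			return 1;
		}
		printf("%s: %srotational\n", argv[1], rotational ? "" : "non-");
		close(fd);
		return 0;
	}
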
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index c644137d9cd..7bf12d793fc 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -448,9 +448,7 @@ static struct elevator_type iosched_deadline = {
448 448
449static int __init deadline_init(void) 449static int __init deadline_init(void)
450{ 450{
451 elv_register(&iosched_deadline); 451 return elv_register(&iosched_deadline);
452
453 return 0;
454} 452}
455 453
456static void __exit deadline_exit(void) 454static void __exit deadline_exit(void)
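
Since elv_register() now returns an error code, the trivial schedulers simply propagate it, as deadline does above and noop does at the end of this diff. A scheduler whose init path also sets up private resources is presumably expected to undo them when registration fails, along the lines of the cfq_init() change earlier in this diff (the foo_* names are invented; iosched_foo is the elevator_type from the sketch after the cfq-iosched.c section):

	struct foo_queue {
		struct list_head	fifo;	/* placeholder per-queue state */
	};

	static struct kmem_cache *foo_pool;

	static int __init foo_init(void)
	{
		int ret;

		foo_pool = KMEM_CACHE(foo_queue, 0);
		if (!foo_pool)
			return -ENOMEM;

		ret = elv_register(&iosched_foo);
		if (ret) {
			/* registration can now fail, e.g. duplicate name or icq cache OOM */
			kmem_cache_destroy(foo_pool);
			return ret;
		}
		return 0;
	}

	static void __exit foo_exit(void)
	{
		elv_unregister(&iosched_foo);
		kmem_cache_destroy(foo_pool);
	}

	module_init(foo_init);
	module_exit(foo_exit);
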
diff --git a/block/elevator.c b/block/elevator.c
index 66343d6917d..91e18f8af9b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,8 +61,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
61 struct request_queue *q = rq->q; 61 struct request_queue *q = rq->q;
62 struct elevator_queue *e = q->elevator; 62 struct elevator_queue *e = q->elevator;
63 63
64 if (e->ops->elevator_allow_merge_fn) 64 if (e->type->ops.elevator_allow_merge_fn)
65 return e->ops->elevator_allow_merge_fn(q, rq, bio); 65 return e->type->ops.elevator_allow_merge_fn(q, rq, bio);
66 66
67 return 1; 67 return 1;
68} 68}
@@ -168,17 +168,13 @@ static struct elevator_type *elevator_get(const char *name)
168 return e; 168 return e;
169} 169}
170 170
171static void *elevator_init_queue(struct request_queue *q, 171static int elevator_init_queue(struct request_queue *q,
172 struct elevator_queue *eq) 172 struct elevator_queue *eq)
173{ 173{
174 return eq->ops->elevator_init_fn(q); 174 eq->elevator_data = eq->type->ops.elevator_init_fn(q);
175} 175 if (eq->elevator_data)
176 176 return 0;
177static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, 177 return -ENOMEM;
178 void *data)
179{
180 q->elevator = eq;
181 eq->elevator_data = data;
182} 178}
183 179
184static char chosen_elevator[ELV_NAME_MAX]; 180static char chosen_elevator[ELV_NAME_MAX];
@@ -207,8 +203,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
207 if (unlikely(!eq)) 203 if (unlikely(!eq))
208 goto err; 204 goto err;
209 205
210 eq->ops = &e->ops; 206 eq->type = e;
211 eq->elevator_type = e;
212 kobject_init(&eq->kobj, &elv_ktype); 207 kobject_init(&eq->kobj, &elv_ktype);
213 mutex_init(&eq->sysfs_lock); 208 mutex_init(&eq->sysfs_lock);
214 209
@@ -232,7 +227,7 @@ static void elevator_release(struct kobject *kobj)
232 struct elevator_queue *e; 227 struct elevator_queue *e;
233 228
234 e = container_of(kobj, struct elevator_queue, kobj); 229 e = container_of(kobj, struct elevator_queue, kobj);
235 elevator_put(e->elevator_type); 230 elevator_put(e->type);
236 kfree(e->hash); 231 kfree(e->hash);
237 kfree(e); 232 kfree(e);
238} 233}
@@ -241,7 +236,7 @@ int elevator_init(struct request_queue *q, char *name)
241{ 236{
242 struct elevator_type *e = NULL; 237 struct elevator_type *e = NULL;
243 struct elevator_queue *eq; 238 struct elevator_queue *eq;
244 void *data; 239 int err;
245 240
246 if (unlikely(q->elevator)) 241 if (unlikely(q->elevator))
247 return 0; 242 return 0;
@@ -278,13 +273,13 @@ int elevator_init(struct request_queue *q, char *name)
278 if (!eq) 273 if (!eq)
279 return -ENOMEM; 274 return -ENOMEM;
280 275
281 data = elevator_init_queue(q, eq); 276 err = elevator_init_queue(q, eq);
282 if (!data) { 277 if (err) {
283 kobject_put(&eq->kobj); 278 kobject_put(&eq->kobj);
284 return -ENOMEM; 279 return err;
285 } 280 }
286 281
287 elevator_attach(q, eq, data); 282 q->elevator = eq;
288 return 0; 283 return 0;
289} 284}
290EXPORT_SYMBOL(elevator_init); 285EXPORT_SYMBOL(elevator_init);
@@ -292,9 +287,8 @@ EXPORT_SYMBOL(elevator_init);
292void elevator_exit(struct elevator_queue *e) 287void elevator_exit(struct elevator_queue *e)
293{ 288{
294 mutex_lock(&e->sysfs_lock); 289 mutex_lock(&e->sysfs_lock);
295 if (e->ops->elevator_exit_fn) 290 if (e->type->ops.elevator_exit_fn)
296 e->ops->elevator_exit_fn(e); 291 e->type->ops.elevator_exit_fn(e);
297 e->ops = NULL;
298 mutex_unlock(&e->sysfs_lock); 292 mutex_unlock(&e->sysfs_lock);
299 293
300 kobject_put(&e->kobj); 294 kobject_put(&e->kobj);
@@ -504,8 +498,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
504 return ELEVATOR_BACK_MERGE; 498 return ELEVATOR_BACK_MERGE;
505 } 499 }
506 500
507 if (e->ops->elevator_merge_fn) 501 if (e->type->ops.elevator_merge_fn)
508 return e->ops->elevator_merge_fn(q, req, bio); 502 return e->type->ops.elevator_merge_fn(q, req, bio);
509 503
510 return ELEVATOR_NO_MERGE; 504 return ELEVATOR_NO_MERGE;
511} 505}
@@ -548,8 +542,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
548{ 542{
549 struct elevator_queue *e = q->elevator; 543 struct elevator_queue *e = q->elevator;
550 544
551 if (e->ops->elevator_merged_fn) 545 if (e->type->ops.elevator_merged_fn)
552 e->ops->elevator_merged_fn(q, rq, type); 546 e->type->ops.elevator_merged_fn(q, rq, type);
553 547
554 if (type == ELEVATOR_BACK_MERGE) 548 if (type == ELEVATOR_BACK_MERGE)
555 elv_rqhash_reposition(q, rq); 549 elv_rqhash_reposition(q, rq);
@@ -563,8 +557,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
563 struct elevator_queue *e = q->elevator; 557 struct elevator_queue *e = q->elevator;
564 const int next_sorted = next->cmd_flags & REQ_SORTED; 558 const int next_sorted = next->cmd_flags & REQ_SORTED;
565 559
566 if (next_sorted && e->ops->elevator_merge_req_fn) 560 if (next_sorted && e->type->ops.elevator_merge_req_fn)
567 e->ops->elevator_merge_req_fn(q, rq, next); 561 e->type->ops.elevator_merge_req_fn(q, rq, next);
568 562
569 elv_rqhash_reposition(q, rq); 563 elv_rqhash_reposition(q, rq);
570 564
@@ -581,8 +575,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
581{ 575{
582 struct elevator_queue *e = q->elevator; 576 struct elevator_queue *e = q->elevator;
583 577
584 if (e->ops->elevator_bio_merged_fn) 578 if (e->type->ops.elevator_bio_merged_fn)
585 e->ops->elevator_bio_merged_fn(q, rq, bio); 579 e->type->ops.elevator_bio_merged_fn(q, rq, bio);
586} 580}
587 581
588void elv_requeue_request(struct request_queue *q, struct request *rq) 582void elv_requeue_request(struct request_queue *q, struct request *rq)
@@ -608,12 +602,12 @@ void elv_drain_elevator(struct request_queue *q)
608 602
609 lockdep_assert_held(q->queue_lock); 603 lockdep_assert_held(q->queue_lock);
610 604
611 while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 605 while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
612 ; 606 ;
613 if (q->nr_sorted && printed++ < 10) { 607 if (q->nr_sorted && printed++ < 10) {
614 printk(KERN_ERR "%s: forced dispatching is broken " 608 printk(KERN_ERR "%s: forced dispatching is broken "
615 "(nr_sorted=%u), please report this\n", 609 "(nr_sorted=%u), please report this\n",
616 q->elevator->elevator_type->elevator_name, q->nr_sorted); 610 q->elevator->type->elevator_name, q->nr_sorted);
617 } 611 }
618} 612}
619 613
@@ -702,7 +696,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
702 * rq cannot be accessed after calling 696 * rq cannot be accessed after calling
703 * elevator_add_req_fn. 697 * elevator_add_req_fn.
704 */ 698 */
705 q->elevator->ops->elevator_add_req_fn(q, rq); 699 q->elevator->type->ops.elevator_add_req_fn(q, rq);
706 break; 700 break;
707 701
708 case ELEVATOR_INSERT_FLUSH: 702 case ELEVATOR_INSERT_FLUSH:
@@ -731,8 +725,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
731{ 725{
732 struct elevator_queue *e = q->elevator; 726 struct elevator_queue *e = q->elevator;
733 727
734 if (e->ops->elevator_latter_req_fn) 728 if (e->type->ops.elevator_latter_req_fn)
735 return e->ops->elevator_latter_req_fn(q, rq); 729 return e->type->ops.elevator_latter_req_fn(q, rq);
736 return NULL; 730 return NULL;
737} 731}
738 732
@@ -740,8 +734,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
740{ 734{
741 struct elevator_queue *e = q->elevator; 735 struct elevator_queue *e = q->elevator;
742 736
743 if (e->ops->elevator_former_req_fn) 737 if (e->type->ops.elevator_former_req_fn)
744 return e->ops->elevator_former_req_fn(q, rq); 738 return e->type->ops.elevator_former_req_fn(q, rq);
745 return NULL; 739 return NULL;
746} 740}
747 741
@@ -749,10 +743,8 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
749{ 743{
750 struct elevator_queue *e = q->elevator; 744 struct elevator_queue *e = q->elevator;
751 745
752 if (e->ops->elevator_set_req_fn) 746 if (e->type->ops.elevator_set_req_fn)
753 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 747 return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
754
755 rq->elevator_private[0] = NULL;
756 return 0; 748 return 0;
757} 749}
758 750
@@ -760,16 +752,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
760{ 752{
761 struct elevator_queue *e = q->elevator; 753 struct elevator_queue *e = q->elevator;
762 754
763 if (e->ops->elevator_put_req_fn) 755 if (e->type->ops.elevator_put_req_fn)
764 e->ops->elevator_put_req_fn(rq); 756 e->type->ops.elevator_put_req_fn(rq);
765} 757}
766 758
767int elv_may_queue(struct request_queue *q, int rw) 759int elv_may_queue(struct request_queue *q, int rw)
768{ 760{
769 struct elevator_queue *e = q->elevator; 761 struct elevator_queue *e = q->elevator;
770 762
771 if (e->ops->elevator_may_queue_fn) 763 if (e->type->ops.elevator_may_queue_fn)
772 return e->ops->elevator_may_queue_fn(q, rw); 764 return e->type->ops.elevator_may_queue_fn(q, rw);
773 765
774 return ELV_MQUEUE_MAY; 766 return ELV_MQUEUE_MAY;
775} 767}
@@ -804,8 +796,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
804 if (blk_account_rq(rq)) { 796 if (blk_account_rq(rq)) {
805 q->in_flight[rq_is_sync(rq)]--; 797 q->in_flight[rq_is_sync(rq)]--;
806 if ((rq->cmd_flags & REQ_SORTED) && 798 if ((rq->cmd_flags & REQ_SORTED) &&
807 e->ops->elevator_completed_req_fn) 799 e->type->ops.elevator_completed_req_fn)
808 e->ops->elevator_completed_req_fn(q, rq); 800 e->type->ops.elevator_completed_req_fn(q, rq);
809 } 801 }
810} 802}
811 803
@@ -823,7 +815,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
823 815
824 e = container_of(kobj, struct elevator_queue, kobj); 816 e = container_of(kobj, struct elevator_queue, kobj);
825 mutex_lock(&e->sysfs_lock); 817 mutex_lock(&e->sysfs_lock);
826 error = e->ops ? entry->show(e, page) : -ENOENT; 818 error = e->type ? entry->show(e, page) : -ENOENT;
827 mutex_unlock(&e->sysfs_lock); 819 mutex_unlock(&e->sysfs_lock);
828 return error; 820 return error;
829} 821}
@@ -841,7 +833,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
841 833
842 e = container_of(kobj, struct elevator_queue, kobj); 834 e = container_of(kobj, struct elevator_queue, kobj);
843 mutex_lock(&e->sysfs_lock); 835 mutex_lock(&e->sysfs_lock);
844 error = e->ops ? entry->store(e, page, length) : -ENOENT; 836 error = e->type ? entry->store(e, page, length) : -ENOENT;
845 mutex_unlock(&e->sysfs_lock); 837 mutex_unlock(&e->sysfs_lock);
846 return error; 838 return error;
847} 839}
@@ -856,14 +848,13 @@ static struct kobj_type elv_ktype = {
856 .release = elevator_release, 848 .release = elevator_release,
857}; 849};
858 850
859int elv_register_queue(struct request_queue *q) 851int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
860{ 852{
861 struct elevator_queue *e = q->elevator;
862 int error; 853 int error;
863 854
864 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 855 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
865 if (!error) { 856 if (!error) {
866 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; 857 struct elv_fs_entry *attr = e->type->elevator_attrs;
867 if (attr) { 858 if (attr) {
868 while (attr->attr.name) { 859 while (attr->attr.name) {
869 if (sysfs_create_file(&e->kobj, &attr->attr)) 860 if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -876,31 +867,55 @@ int elv_register_queue(struct request_queue *q)
876 } 867 }
877 return error; 868 return error;
878} 869}
879EXPORT_SYMBOL(elv_register_queue);
880 870
881static void __elv_unregister_queue(struct elevator_queue *e) 871int elv_register_queue(struct request_queue *q)
882{ 872{
883 kobject_uevent(&e->kobj, KOBJ_REMOVE); 873 return __elv_register_queue(q, q->elevator);
884 kobject_del(&e->kobj);
885 e->registered = 0;
886} 874}
875EXPORT_SYMBOL(elv_register_queue);
887 876
888void elv_unregister_queue(struct request_queue *q) 877void elv_unregister_queue(struct request_queue *q)
889{ 878{
890 if (q) 879 if (q) {
891 __elv_unregister_queue(q->elevator); 880 struct elevator_queue *e = q->elevator;
881
882 kobject_uevent(&e->kobj, KOBJ_REMOVE);
883 kobject_del(&e->kobj);
884 e->registered = 0;
885 }
892} 886}
893EXPORT_SYMBOL(elv_unregister_queue); 887EXPORT_SYMBOL(elv_unregister_queue);
894 888
895void elv_register(struct elevator_type *e) 889int elv_register(struct elevator_type *e)
896{ 890{
897 char *def = ""; 891 char *def = "";
898 892
893 /* create icq_cache if requested */
894 if (e->icq_size) {
895 if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
896 WARN_ON(e->icq_align < __alignof__(struct io_cq)))
897 return -EINVAL;
898
899 snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
900 "%s_io_cq", e->elevator_name);
901 e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
902 e->icq_align, 0, NULL);
903 if (!e->icq_cache)
904 return -ENOMEM;
905 }
906
907 /* register, don't allow duplicate names */
899 spin_lock(&elv_list_lock); 908 spin_lock(&elv_list_lock);
900 BUG_ON(elevator_find(e->elevator_name)); 909 if (elevator_find(e->elevator_name)) {
910 spin_unlock(&elv_list_lock);
911 if (e->icq_cache)
912 kmem_cache_destroy(e->icq_cache);
913 return -EBUSY;
914 }
901 list_add_tail(&e->list, &elv_list); 915 list_add_tail(&e->list, &elv_list);
902 spin_unlock(&elv_list_lock); 916 spin_unlock(&elv_list_lock);
903 917
918 /* print pretty message */
904 if (!strcmp(e->elevator_name, chosen_elevator) || 919 if (!strcmp(e->elevator_name, chosen_elevator) ||
905 (!*chosen_elevator && 920 (!*chosen_elevator &&
906 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 921 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
@@ -908,30 +923,26 @@ void elv_register(struct elevator_type *e)
908 923
909 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 924 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
910 def); 925 def);
926 return 0;
911} 927}
912EXPORT_SYMBOL_GPL(elv_register); 928EXPORT_SYMBOL_GPL(elv_register);
913 929
914void elv_unregister(struct elevator_type *e) 930void elv_unregister(struct elevator_type *e)
915{ 931{
916 struct task_struct *g, *p; 932 /* unregister */
933 spin_lock(&elv_list_lock);
934 list_del_init(&e->list);
935 spin_unlock(&elv_list_lock);
917 936
918 /* 937 /*
919 * Iterate every thread in the process to remove the io contexts. 938 * Destroy icq_cache if it exists. icq's are RCU managed. Make
939 * sure all RCU operations are complete before proceeding.
920 */ 940 */
921 if (e->ops.trim) { 941 if (e->icq_cache) {
922 read_lock(&tasklist_lock); 942 rcu_barrier();
923 do_each_thread(g, p) { 943 kmem_cache_destroy(e->icq_cache);
924 task_lock(p); 944 e->icq_cache = NULL;
925 if (p->io_context)
926 e->ops.trim(p->io_context);
927 task_unlock(p);
928 } while_each_thread(g, p);
929 read_unlock(&tasklist_lock);
930 } 945 }
931
932 spin_lock(&elv_list_lock);
933 list_del_init(&e->list);
934 spin_unlock(&elv_list_lock);
935} 946}
936EXPORT_SYMBOL_GPL(elv_unregister); 947EXPORT_SYMBOL_GPL(elv_unregister);
937 948
@@ -944,54 +955,41 @@ EXPORT_SYMBOL_GPL(elv_unregister);
944static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 955static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
945{ 956{
946 struct elevator_queue *old_elevator, *e; 957 struct elevator_queue *old_elevator, *e;
947 void *data;
948 int err; 958 int err;
949 959
950 /* 960 /* allocate new elevator */
951 * Allocate new elevator
952 */
953 e = elevator_alloc(q, new_e); 961 e = elevator_alloc(q, new_e);
954 if (!e) 962 if (!e)
955 return -ENOMEM; 963 return -ENOMEM;
956 964
957 data = elevator_init_queue(q, e); 965 err = elevator_init_queue(q, e);
958 if (!data) { 966 if (err) {
959 kobject_put(&e->kobj); 967 kobject_put(&e->kobj);
960 return -ENOMEM; 968 return err;
961 } 969 }
962 970
963 /* 971 /* turn on BYPASS and drain all requests w/ elevator private data */
964 * Turn on BYPASS and drain all requests w/ elevator private data
965 */
966 elv_quiesce_start(q); 972 elv_quiesce_start(q);
967 973
968 /* 974 /* unregister old queue, register new one and kill old elevator */
969 * Remember old elevator. 975 if (q->elevator->registered) {
970 */ 976 elv_unregister_queue(q);
971 old_elevator = q->elevator; 977 err = __elv_register_queue(q, e);
972
973 /*
974 * attach and start new elevator
975 */
976 spin_lock_irq(q->queue_lock);
977 elevator_attach(q, e, data);
978 spin_unlock_irq(q->queue_lock);
979
980 if (old_elevator->registered) {
981 __elv_unregister_queue(old_elevator);
982
983 err = elv_register_queue(q);
984 if (err) 978 if (err)
985 goto fail_register; 979 goto fail_register;
986 } 980 }
987 981
988 /* 982 /* done, clear io_cq's, switch elevators and turn off BYPASS */
989 * finally exit old elevator and turn off BYPASS. 983 spin_lock_irq(q->queue_lock);
990 */ 984 ioc_clear_queue(q);
985 old_elevator = q->elevator;
986 q->elevator = e;
987 spin_unlock_irq(q->queue_lock);
988
991 elevator_exit(old_elevator); 989 elevator_exit(old_elevator);
992 elv_quiesce_end(q); 990 elv_quiesce_end(q);
993 991
994 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 992 blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
995 993
996 return 0; 994 return 0;
997 995
@@ -1001,7 +999,6 @@ fail_register:
1001 * one again (along with re-adding the sysfs dir) 999 * one again (along with re-adding the sysfs dir)
1002 */ 1000 */
1003 elevator_exit(e); 1001 elevator_exit(e);
1004 q->elevator = old_elevator;
1005 elv_register_queue(q); 1002 elv_register_queue(q);
1006 elv_quiesce_end(q); 1003 elv_quiesce_end(q);
1007 1004
@@ -1026,7 +1023,7 @@ int elevator_change(struct request_queue *q, const char *name)
1026 return -EINVAL; 1023 return -EINVAL;
1027 } 1024 }
1028 1025
1029 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1026 if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
1030 elevator_put(e); 1027 elevator_put(e);
1031 return 0; 1028 return 0;
1032 } 1029 }
@@ -1061,7 +1058,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1061 if (!q->elevator || !blk_queue_stackable(q)) 1058 if (!q->elevator || !blk_queue_stackable(q))
1062 return sprintf(name, "none\n"); 1059 return sprintf(name, "none\n");
1063 1060
1064 elv = e->elevator_type; 1061 elv = e->type;
1065 1062
1066 spin_lock(&elv_list_lock); 1063 spin_lock(&elv_list_lock);
1067 list_for_each_entry(__e, &elv_list, list) { 1064 list_for_each_entry(__e, &elv_list, list) {
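
Read together, the accessors in the elevator.c hunks above imply roughly the following elevator_queue layout after the change, with the ops reached through e->type instead of a per-queue copy. This is a reconstruction from the fields actually touched in this diff, so ordering and any untouched members are guesses:

	struct elevator_queue {
		struct elevator_type	*type;		/* replaces elevator_type plus the cached ops */
		void			*elevator_data;	/* scheduler-private data, set at init */
		struct kobject		kobj;
		struct mutex		sysfs_lock;
		struct hlist_head	*hash;		/* request hash, freed in elevator_release() */
		unsigned int		registered:1;
	};
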
diff --git a/block/genhd.c b/block/genhd.c
index 83e7c04015e..23b4f706332 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -614,7 +614,7 @@ void add_disk(struct gendisk *disk)
614 * Take an extra ref on queue which will be put on disk_release() 614 * Take an extra ref on queue which will be put on disk_release()
615 * so that it sticks around as long as @disk is there. 615 * so that it sticks around as long as @disk is there.
616 */ 616 */
617 WARN_ON_ONCE(blk_get_queue(disk->queue)); 617 WARN_ON_ONCE(!blk_get_queue(disk->queue));
618 618
619 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 619 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
620 "bdi"); 620 "bdi");
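
The one-character genhd.c fix above depends on blk_get_queue() reporting success as a true return, so the warning must fire on the negated result. A sketch of the calling convention as this hunk assumes it (the helper is illustrative, not from the patch):

	static void hypothetical_use_queue(struct gendisk *disk)
	{
		/* blk_get_queue() is assumed to return true once it holds a reference,
		 * false if the queue is already being torn down */
		if (WARN_ON_ONCE(!blk_get_queue(disk->queue)))
			return;
		/* ... safe to dereference disk->queue here ... */
		blk_put_queue(disk->queue);
	}
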
diff --git a/block/ioctl.c b/block/ioctl.c
index 4828fa34981..ba15b2dbfb9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -296,6 +296,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
296 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 296 return put_uint(arg, bdev_discard_zeroes_data(bdev));
297 case BLKSECTGET: 297 case BLKSECTGET:
298 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 298 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
299 case BLKROTATIONAL:
300 return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
299 case BLKRASET: 301 case BLKRASET:
300 case BLKFRASET: 302 case BLKFRASET:
301 if(!capable(CAP_SYS_ADMIN)) 303 if(!capable(CAP_SYS_ADMIN))
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 06389e9ef96..413a0b1d788 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -94,9 +94,7 @@ static struct elevator_type elevator_noop = {
94 94
95static int __init noop_init(void) 95static int __init noop_init(void)
96{ 96{
97 elv_register(&elevator_noop); 97 return elv_register(&elevator_noop);
98
99 return 0;
100} 98}
101 99
102static void __exit noop_exit(void) 100static void __exit noop_exit(void)