author     Linus Torvalds <torvalds@linux-foundation.org>   2014-12-13 17:14:23 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-12-13 17:14:23 -0500
commit     caf292ae5bb9d57198ce001d8b762f7abae3a94d
tree       5fd5d6d971503818ab2824407134cf36a80c53d0 /block/blk-mq.c
parent     8f4385d590d4296ec38e228d17b1d002f6031dd2
parent     fcbf6a087a7e4d3f03d28333678a1010810a53c3
Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block
Pull block driver core update from Jens Axboe:
"This is the pull request for the core block IO changes for 3.19. Not
a huge round this time, mostly lots of little good fixes:
- Fix a bug in sysfs blktrace interface causing a NULL pointer
dereference, when enabled/disabled through that API. From Arianna
Avanzini.
- Various updates/fixes/improvements for blk-mq:
     - A set of updates from Bart, mostly fixing bugs in the tag
       handling.
     - Cleanup/code consolidation from Christoph.
     - Extend queue_rq API to be able to handle batching issues of IO
       requests. NVMe will utilize this shortly. From me. (A sketch of
       the new queue_rq payload follows the commit list below.)
     - A few tag and request handling updates from me.
     - Cleanup of the preempt handling for running queues from Paolo.
     - Prevent running of unmapped hardware queues from Ming Lei.
     - Move the kdump memory limiting check to be in the correct
       location, from Shaohua.
     - Initialize all software queues at init time from Takashi. This
       prevents a kobject warning when CPUs are brought online that
       weren't online when a queue was registered.
- Single writeback fix for I_DIRTY clearing from Tejun. Queued with
the core IO changes, since it's just a single fix.
- Version X of the __bio_add_page() segment addition retry from
Maurizio. Hope the Xth time is the charm.
- Documentation fixup for IO scheduler merging from Jan.
- Introduce (and use) generic IO stat accounting helpers for non-rq
drivers, from Gu Zheng.
- Kill off artificial limiting of max sectors in a request from
Christoph"
* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
bio: modify __bio_add_page() to accept pages that don't start a new segment
blk-mq: Fix uninitialized kobject at CPU hotplugging
blktrace: don't let the sysfs interface remove trace from running list
blk-mq: Use all available hardware queues
blk-mq: Micro-optimize bt_get()
blk-mq: Fix a race between bt_clear_tag() and bt_get()
blk-mq: Avoid that __bt_get_word() wraps multiple times
blk-mq: Fix a use-after-free
blk-mq: prevent unmapped hw queue from being scheduled
blk-mq: re-check for available tags after running the hardware queue
blk-mq: fix hang in bt_get()
blk-mq: move the kdump check to blk_mq_alloc_tag_set
blk-mq: cleanup tag free handling
blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
blk: introduce generic io stat accounting help function
blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
genhd: check for int overflow in disk_expand_part_tbl()
blk-mq: add blk_mq_free_hctx_request()
blk-mq: export blk_mq_free_request()
blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
...
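For orientation before the diff: the queue_rq extension mentioned in the pull message replaces the old per-request arguments with a small per-dispatch descriptor, struct blk_mq_queue_data. The structure itself lives in the blk-mq header rather than in block/blk-mq.c, so its definition is not part of this diff; the sketch below only reflects how the hunks that follow fill it in (bd.rq, bd.list, bd.last) and should be read as an approximation, not the authoritative definition.

    /*
     * Approximate shape of the per-dispatch descriptor handed to the new
     * ->queue_rq() interface, inferred from the bd.rq/bd.list/bd.last
     * usage in the hunks below; see include/linux/blk-mq.h for the
     * authoritative definition.
     */
    struct blk_mq_queue_data {
            struct request *rq;     /* request being issued right now */
            struct list_head *list; /* rest of the batch, for deferred issue */
            bool last;              /* is this the last request in the batch? */
    };

The 'last' hint is what lets a driver postpone expensive per-submission work (a doorbell write, for example) until the end of a batch, which is the CPU saving the pull message attributes to the upcoming NVMe use.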
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  126
1 file changed, 80 insertions, 46 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 92ceef0d2ab9..da1ab5641227 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
         blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx;
-        struct request_queue *q = rq->q;
 
         ctx->rq_completed[rq_is_sync(rq)]++;
-
-        hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+        struct blk_mq_hw_ctx *hctx;
+        struct request_queue *q = rq->q;
+
+        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+        blk_mq_free_hctx_request(hctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                  * If not software queues are currently mapped to this
                  * hardware queue, there's nothing to check
                  */
-                if (!hctx->nr_ctx || !hctx->tags)
+                if (!blk_mq_hw_queue_mapped(hctx))
                         continue;
 
                 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         struct request_queue *q = hctx->queue;
         struct request *rq;
         LIST_HEAD(rq_list);
+        LIST_HEAD(driver_list);
+        struct list_head *dptr;
         int queued;
 
         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -716,16 +726,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         }
 
         /*
+         * Start off with dptr being NULL, so we start the first request
+         * immediately, even if we have more pending.
+         */
+        dptr = NULL;
+
+        /*
          * Now process all the entries, sending them to the driver.
          */
         queued = 0;
         while (!list_empty(&rq_list)) {
+                struct blk_mq_queue_data bd;
                 int ret;
 
                 rq = list_first_entry(&rq_list, struct request, queuelist);
                 list_del_init(&rq->queuelist);
 
-                ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+                bd.rq = rq;
+                bd.list = dptr;
+                bd.last = list_empty(&rq_list);
+
+                ret = q->mq_ops->queue_rq(hctx, &bd);
                 switch (ret) {
                 case BLK_MQ_RQ_QUEUE_OK:
                         queued++;
@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                         break;
+
+                /*
+                 * We've done the first request. If we have more than 1
+                 * left in the list, set dptr to defer issue.
+                 */
+                if (!dptr && rq_list.next != rq_list.prev)
+                        dptr = &driver_list;
         }
 
         if (!queued)
@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-        int cpu = hctx->next_cpu;
+        if (hctx->queue->nr_hw_queues == 1)
+                return WORK_CPU_UNBOUND;
 
         if (--hctx->next_cpu_batch <= 0) {
-                int next_cpu;
+                int cpu = hctx->next_cpu, next_cpu;
 
                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                 if (next_cpu >= nr_cpu_ids)
@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                 hctx->next_cpu = next_cpu;
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+                return cpu;
         }
 
-        return cpu;
+        return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+            !blk_mq_hw_queue_mapped(hctx)))
                 return;
 
-        if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-                __blk_mq_run_hw_queue(hctx);
-        else if (hctx->queue->nr_hw_queues == 1)
-                kblockd_schedule_delayed_work(&hctx->run_work, 0);
-        else {
-                unsigned int cpu;
+        if (!async) {
+                int cpu = get_cpu();
+                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+                        __blk_mq_run_hw_queue(hctx);
+                        put_cpu();
+                        return;
+                }
 
-                cpu = blk_mq_hctx_next_cpu(hctx);
-                kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+                put_cpu();
         }
+
+        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                        &hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                         continue;
 
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-        preempt_disable();
         blk_mq_run_hw_queue(hctx, false);
-        preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                         continue;
 
                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-        unsigned long tmo = msecs_to_jiffies(msecs);
-
-        if (hctx->queue->nr_hw_queues == 1)
-                kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-        else {
-                unsigned int cpu;
+        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+                return;
 
-                cpu = blk_mq_hctx_next_cpu(hctx);
-                kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-        }
+        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                        &hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 goto run_queue;
         }
 
-        if (is_sync) {
+        /*
+         * If the driver supports defer issued based on 'last', then
+         * queue it up like normal since we can potentially save some
+         * CPU this way.
+         */
+        if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+                struct blk_mq_queue_data bd = {
+                        .rq = rq,
+                        .list = NULL,
+                        .last = 1
+                };
                 int ret;
 
                 blk_mq_bio_to_request(rq, bio);
@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                  * error (busy), just add it to our list as we previously
                  * would have done
                  */
-                ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+                ret = q->mq_ops->queue_rq(data.hctx, &bd);
                 if (ret == BLK_MQ_RQ_QUEUE_OK)
                         goto done;
                 else {
@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
         if (!ctx)
                 return ERR_PTR(-ENOMEM);
 
-        /*
-         * If a crashdump is active, then we are potentially in a very
-         * memory constrained environment. Limit us to 1 queue and
-         * 64 tags to prevent using too much memory.
-         */
-        if (is_kdump_kernel()) {
-                set->nr_hw_queues = 1;
-                set->queue_depth = min(64U, set->queue_depth);
-        }
-
         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                         set->numa_node);
 
@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                 set->queue_depth = BLK_MQ_MAX_DEPTH;
         }
 
+        /*
+         * If a crashdump is active, then we are potentially in a very
+         * memory constrained environment. Limit us to 1 queue and
+         * 64 tags to prevent using too much memory.
+         */
+        if (is_kdump_kernel()) {
+                set->nr_hw_queues = 1;
+                set->queue_depth = min(64U, set->queue_depth);
+        }
+
         set->tags = kmalloc_node(set->nr_hw_queues *
                                  sizeof(struct blk_mq_tags *),
                                  GFP_KERNEL, set->numa_node);
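The driver-facing half of the queue_rq change is not visible in this file. As an illustration only: a ->queue_rq() written against the extended interface receives the descriptor instead of the old (hctx, rq, last) arguments, roughly as sketched below. The mydrv_* names and the use of hctx->driver_data are hypothetical; only blk_mq_queue_data, bd->rq, bd->last and the BLK_MQ_RQ_QUEUE_* return codes are taken from the hunks above, and the exact prototype should be checked against the blk-mq header.

    #include <linux/blk-mq.h>

    /* Hypothetical driver bits, declared here only to keep the sketch
     * self-contained; a real driver would define these elsewhere. */
    struct mydrv_queue;
    static bool mydrv_post_command(struct mydrv_queue *mq, struct request *rq);
    static void mydrv_ring_doorbell(struct mydrv_queue *mq);

    static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
                              const struct blk_mq_queue_data *bd)
    {
            struct mydrv_queue *mq = hctx->driver_data;  /* assumed setup */
            struct request *rq = bd->rq;

            /* Post the command to hardware; back off if the ring is full. */
            if (!mydrv_post_command(mq, rq))
                    return BLK_MQ_RQ_QUEUE_BUSY;

            /*
             * bd->last marks the final request of the current dispatch
             * batch, so the doorbell write can be held back until then
             * instead of being issued once per request.
             */
            if (bd->last)
                    mydrv_ring_doorbell(mq);

            return BLK_MQ_RQ_QUEUE_OK;
    }

As the pull message notes, NVMe was expected to adopt this pattern shortly after this merge to cut down on per-request doorbell writes.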