path: root/block
author    Linus Torvalds <torvalds@linux-foundation.org>    2014-12-13 17:14:23 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-12-13 17:14:23 -0500
commit    caf292ae5bb9d57198ce001d8b762f7abae3a94d (patch)
tree      5fd5d6d971503818ab2824407134cf36a80c53d0 /block
parent    8f4385d590d4296ec38e228d17b1d002f6031dd2 (diff)
parent    fcbf6a087a7e4d3f03d28333678a1010810a53c3 (diff)
Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block
Pull block driver core update from Jens Axboe:
 "This is the pull request for the core block IO changes for 3.19.  Not
  a huge round this time, mostly lots of little good fixes:

   - Fix a bug in sysfs blktrace interface causing a NULL pointer
     dereference, when enabled/disabled through that API.  From Arianna
     Avanzini.

   - Various updates/fixes/improvements for blk-mq:

        - A set of updates from Bart, mostly fixing bugs in the tag
          handling.

        - Cleanup/code consolidation from Christoph.

        - Extend queue_rq API to be able to handle batching issues of
          IO requests.  NVMe will utilize this shortly.  From me.

        - A few tag and request handling updates from me.

        - Cleanup of the preempt handling for running queues from
          Paolo.

        - Prevent running of unmapped hardware queues from Ming Lei.

        - Move the kdump memory limiting check to be in the correct
          location, from Shaohua.

        - Initialize all software queues at init time from Takashi.
          This prevents a kobject warning when CPUs are brought online
          that weren't online when a queue was registered.

   - Single writeback fix for I_DIRTY clearing from Tejun.  Queued with
     the core IO changes, since it's just a single fix.

   - Version X of the __bio_add_page() segment addition retry from
     Maurizio.  Hope the Xth time is the charm.

   - Documentation fixup for IO scheduler merging from Jan.

   - Introduce (and use) generic IO stat accounting helpers for non-rq
     drivers, from Gu Zheng.

   - Kill off artificial limiting of max sectors in a request from
     Christoph"

* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
  bio: modify __bio_add_page() to accept pages that don't start a new segment
  blk-mq: Fix uninitialized kobject at CPU hotplugging
  blktrace: don't let the sysfs interface remove trace from running list
  blk-mq: Use all available hardware queues
  blk-mq: Micro-optimize bt_get()
  blk-mq: Fix a race between bt_clear_tag() and bt_get()
  blk-mq: Avoid that __bt_get_word() wraps multiple times
  blk-mq: Fix a use-after-free
  blk-mq: prevent unmapped hw queue from being scheduled
  blk-mq: re-check for available tags after running the hardware queue
  blk-mq: fix hang in bt_get()
  blk-mq: move the kdump check to blk_mq_alloc_tag_set
  blk-mq: cleanup tag free handling
  blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
  blk: introduce generic io stat accounting help function
  blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
  genhd: check for int overflow in disk_expand_part_tbl()
  blk-mq: add blk_mq_free_hctx_request()
  blk-mq: export blk_mq_free_request()
  blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
  ...
Diffstat (limited to 'block')
-rw-r--r--    block/bio.c              82
-rw-r--r--    block/blk-core.c          3
-rw-r--r--    block/blk-mq-cpumap.c     4
-rw-r--r--    block/blk-mq-sysfs.c      9
-rw-r--r--    block/blk-mq-tag.c       60
-rw-r--r--    block/blk-mq.c          126
-rw-r--r--    block/blk-mq.h            5
-rw-r--r--    block/blk-settings.c      4
-rw-r--r--    block/blk-sysfs.c        12
-rw-r--r--    block/genhd.c            11
10 files changed, 197 insertions, 119 deletions
diff --git a/block/bio.c b/block/bio.c
index 3e6e1986a5b2..471d7382c7d1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 				}
 			}
 
+			bio->bi_iter.bi_size += len;
 			goto done;
 		}
 
@@ -764,29 +765,32 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	return 0;
 
 	/*
-	 * we might lose a segment or two here, but rather that than
-	 * make this too complex.
+	 * setup the new entry, we might clear it again later if we
+	 * cannot add the page
+	 */
+	bvec = &bio->bi_io_vec[bio->bi_vcnt];
+	bvec->bv_page = page;
+	bvec->bv_len = len;
+	bvec->bv_offset = offset;
+	bio->bi_vcnt++;
+	bio->bi_phys_segments++;
+	bio->bi_iter.bi_size += len;
+
+	/*
+	 * Perform a recount if the number of segments is greater
+	 * than queue_max_segments(q).
 	 */
 
-	while (bio->bi_phys_segments >= queue_max_segments(q)) {
+	while (bio->bi_phys_segments > queue_max_segments(q)) {
 
 		if (retried_segments)
-			return 0;
+			goto failed;
 
 		retried_segments = 1;
 		blk_recount_segments(q, bio);
 	}
 
 	/*
-	 * setup the new entry, we might clear it again later if we
-	 * cannot add the page
-	 */
-	bvec = &bio->bi_io_vec[bio->bi_vcnt];
-	bvec->bv_page = page;
-	bvec->bv_len = len;
-	bvec->bv_offset = offset;
-
-	/*
 	 * if queue has other restrictions (eg varying max sector size
 	 * depending on offset), it can specify a merge_bvec_fn in the
 	 * queue to get further control
@@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		struct bvec_merge_data bvm = {
 			.bi_bdev = bio->bi_bdev,
 			.bi_sector = bio->bi_iter.bi_sector,
-			.bi_size = bio->bi_iter.bi_size,
+			.bi_size = bio->bi_iter.bi_size - len,
 			.bi_rw = bio->bi_rw,
 		};
 
@@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
-			bvec->bv_page = NULL;
-			bvec->bv_len = 0;
-			bvec->bv_offset = 0;
-			return 0;
-		}
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
+			goto failed;
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
-	bio->bi_vcnt++;
-	bio->bi_phys_segments++;
  done:
-	bio->bi_iter.bi_size += len;
 	return len;
+
+ failed:
+	bvec->bv_page = NULL;
+	bvec->bv_len = 0;
+	bvec->bv_offset = 0;
+	bio->bi_vcnt--;
+	bio->bi_iter.bi_size -= len;
+	blk_recount_segments(q, bio);
+	return 0;
 }
 
 /**
@@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio)
 	}
 }
 
+void generic_start_io_acct(int rw, unsigned long sectors,
+			   struct hd_struct *part)
+{
+	int cpu = part_stat_lock();
+
+	part_round_stats(cpu, part);
+	part_stat_inc(cpu, part, ios[rw]);
+	part_stat_add(cpu, part, sectors[rw], sectors);
+	part_inc_in_flight(part, rw);
+
+	part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_start_io_acct);
+
+void generic_end_io_acct(int rw, struct hd_struct *part,
+			 unsigned long start_time)
+{
+	unsigned long duration = jiffies - start_time;
+	int cpu = part_stat_lock();
+
+	part_stat_add(cpu, part, ticks[rw], duration);
+	part_round_stats(cpu, part);
+	part_dec_in_flight(part, rw);
+
+	part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_end_io_acct);
+
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 void bio_flush_dcache_pages(struct bio *bi)
 {
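
For orientation, here is a minimal usage sketch of the two helpers exported above, as a bio-based (non-request) driver might call them around its transfer path. This is illustrative only and not part of the series; the "mydrv" structure, q->queuedata layout and mydrv_do_transfer() are hypothetical.

/* Illustrative sketch only -- not from this commit. */
static void mydrv_make_request(struct request_queue *q, struct bio *bio)
{
	struct mydrv *dev = q->queuedata;	/* hypothetical driver data */
	int rw = bio_data_dir(bio);
	unsigned long start = jiffies;

	/* account the I/O before starting the transfer */
	generic_start_io_acct(rw, bio_sectors(bio), &dev->disk->part0);

	mydrv_do_transfer(dev, bio);		/* hypothetical data transfer */

	/* account completion time and in-flight count */
	generic_end_io_acct(rw, &dev->disk->part0, start);
	bio_endio(bio, 0);
}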
diff --git a/block/blk-core.c b/block/blk-core.c
index ea1c4d0d7a44..30f6153a40c2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q)
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
 	blk_sync_queue(q);
 
+	if (q->mq_ops)
+		blk_mq_free_queue(q);
+
 	spin_lock_irq(lock);
 	if (q->queue_lock != &q->__queue_lock)
 		q->queue_lock = &q->__queue_lock;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1065d7c65fa1..5f13f4d0bcce 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -17,7 +17,7 @@
 static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
 			      const int cpu)
 {
-	return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
+	return cpu * nr_queues / nr_cpus;
 }
 
 static int get_first_sibling(unsigned int cpu)
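
As an illustration of the mapping change above (example numbers, not from the patch): with 4 CPUs and 3 hardware queues the old rounding never schedules anything on the last queue, while the new expression spreads CPUs across all of them.

/*
 * Example only: nr_cpus = 4, nr_queues = 3
 *
 *   old: cpu / ((4 + 3 - 1) / 3) = cpu / 2  ->  queues 0, 0, 1, 1  (queue 2 never used)
 *   new: cpu * 3 / 4                        ->  queues 0, 0, 1, 2  (all queues used)
 */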
@@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 	unsigned int *map;
 
 	/* If cpus are offline, map them to first hctx */
-	map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
+	map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
 			   set->numa_node);
 	if (!map)
 		return NULL;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 371d8800b48a..1630a20d5dcf 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
-	int i, j;
+	int i;
 
 	kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-	queue_for_each_hw_ctx(q, hctx, i) {
+	queue_for_each_hw_ctx(q, hctx, i)
 		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
 
-		hctx_for_each_ctx(hctx, ctx, j)
+	queue_for_each_ctx(q, ctx, i)
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-	}
 }
 
 /* see blk_register_queue() */
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 728b9a4d5f56..e3d4e4043b49 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 {
 	int tag, org_last_tag, end;
+	bool wrap = last_tag != 0;
 
 	org_last_tag = last_tag;
 	end = bm->depth;
@@ -148,15 +149,16 @@ restart:
 			 * We started with an offset, start from 0 to
 			 * exhaust the map.
 			 */
-			if (org_last_tag && last_tag) {
-				end = last_tag;
+			if (wrap) {
+				wrap = false;
+				end = org_last_tag;
 				last_tag = 0;
 				goto restart;
 			}
 			return -1;
 		}
 		last_tag = tag + 1;
-	} while (test_and_set_bit_lock(tag, &bm->word));
+	} while (test_and_set_bit(tag, &bm->word));
 
 	return tag;
 }
@@ -246,14 +248,29 @@ static int bt_get(struct blk_mq_alloc_data *data,
 	if (!(data->gfp & __GFP_WAIT))
 		return -1;
 
-	bs = bt_wait_ptr(bt, hctx);
 	do {
+		bs = bt_wait_ptr(bt, hctx);
 		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __bt_get(hctx, bt, last_tag);
 		if (tag != -1)
 			break;
 
+		/*
+		 * We're out of tags on this hardware queue, kick any
+		 * pending IO submits before going to sleep waiting for
+		 * some to complete.
+		 */
+		blk_mq_run_hw_queue(hctx, false);
+
+		/*
+		 * Retry tag allocation after running the hardware queue,
+		 * as running the queue may also have found completions.
+		 */
+		tag = __bt_get(hctx, bt, last_tag);
+		if (tag != -1)
+			break;
+
 		blk_mq_put_ctx(data->ctx);
 
 		io_schedule();
@@ -268,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data,
 			hctx = data->hctx;
 			bt = &hctx->tags->bitmap_tags;
 		}
-		finish_wait(&bs->wait, &wait);
-		bs = bt_wait_ptr(bt, hctx);
 	} while (1);
 
 	finish_wait(&bs->wait, &wait);
@@ -340,11 +355,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 	struct bt_wait_state *bs;
 	int wait_cnt;
 
-	/*
-	 * The unlock memory barrier need to order access to req in free
-	 * path and clearing tag bit
-	 */
-	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+
+	/* Ensure that the wait list checks occur after clear_bit(). */
+	smp_mb();
 
 	bs = bt_wake_ptr(bt);
 	if (!bs)
@@ -360,21 +374,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 	}
 }
 
-static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
-{
-	BUG_ON(tag >= tags->nr_tags);
-
-	bt_clear_tag(&tags->bitmap_tags, tag);
-}
-
-static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
-				      unsigned int tag)
-{
-	BUG_ON(tag >= tags->nr_reserved_tags);
-
-	bt_clear_tag(&tags->breserved_tags, tag);
-}
-
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		    unsigned int *last_tag)
 {
@@ -383,10 +382,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 	if (tag >= tags->nr_reserved_tags) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
-		__blk_mq_put_tag(tags, real_tag);
+		BUG_ON(real_tag >= tags->nr_tags);
+		bt_clear_tag(&tags->bitmap_tags, real_tag);
 		*last_tag = real_tag;
-	} else
-		__blk_mq_put_reserved_tag(tags, tag);
+	} else {
+		BUG_ON(tag >= tags->nr_reserved_tags);
+		bt_clear_tag(&tags->breserved_tags, tag);
+	}
 }
 
 static void bt_for_each(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 92ceef0d2ab9..da1ab5641227 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q = rq->q;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	__blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q = rq->q;
+
+	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+	blk_mq_free_hctx_request(hctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv)
 		 * If not software queues are currently mapped to this
 		 * hardware queue, there's nothing to check
 		 */
-		if (!hctx->nr_ctx || !hctx->tags)
+		if (!blk_mq_hw_queue_mapped(hctx))
 			continue;
 
 		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
 	LIST_HEAD(rq_list);
+	LIST_HEAD(driver_list);
+	struct list_head *dptr;
 	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -716,16 +726,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 
 	/*
+	 * Start off with dptr being NULL, so we start the first request
+	 * immediately, even if we have more pending.
+	 */
+	dptr = NULL;
+
+	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
 	queued = 0;
 	while (!list_empty(&rq_list)) {
+		struct blk_mq_queue_data bd;
 		int ret;
 
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
-		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+		bd.rq = rq;
+		bd.list = dptr;
+		bd.last = list_empty(&rq_list);
+
+		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
 		case BLK_MQ_RQ_QUEUE_OK:
 			queued++;
@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 			break;
+
+		/*
+		 * We've done the first request. If we have more than 1
+		 * left in the list, set dptr to defer issue.
+		 */
+		if (!dptr && rq_list.next != rq_list.prev)
+			dptr = &driver_list;
 	}
 
 	if (!queued)
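
As a hedged sketch of the driver side of the queue_rq change above (not part of this series): a ->queue_rq() implementation now receives a struct blk_mq_queue_data rather than a (request, last) pair, and can use bd->last to batch hardware notifications. The driver name and its submit/doorbell helpers below are hypothetical.

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
			  const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	if (!mydrv_submit(hctx->driver_data, rq))	/* hypothetical submit */
		return BLK_MQ_RQ_QUEUE_BUSY;

	/*
	 * bd->last says no further requests follow in this batch, so the
	 * driver can coalesce its doorbell/notify writes until then.
	 */
	if (bd->last)
		mydrv_notify_hw(hctx->driver_data);	/* hypothetical doorbell */

	return BLK_MQ_RQ_QUEUE_OK;
}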
@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-	int cpu = hctx->next_cpu;
+	if (hctx->queue->nr_hw_queues == 1)
+		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
-		int next_cpu;
+		int cpu = hctx->next_cpu, next_cpu;
 
 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 		if (next_cpu >= nr_cpu_ids)
@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+		return cpu;
 	}
 
-	return cpu;
+	return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+	    !blk_mq_hw_queue_mapped(hctx)))
 		return;
 
-	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-		__blk_mq_run_hw_queue(hctx);
-	else if (hctx->queue->nr_hw_queues == 1)
-		kblockd_schedule_delayed_work(&hctx->run_work, 0);
-	else {
-		unsigned int cpu;
+	if (!async) {
+		int cpu = get_cpu();
+		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+			__blk_mq_run_hw_queue(hctx);
+			put_cpu();
+			return;
+		}
 
-		cpu = blk_mq_hctx_next_cpu(hctx);
-		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+		put_cpu();
 	}
+
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+			&hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
-		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-	preempt_disable();
 	blk_mq_run_hw_queue(hctx, false);
-	preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 			continue;
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
-		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-	unsigned long tmo = msecs_to_jiffies(msecs);
-
-	if (hctx->queue->nr_hw_queues == 1)
-		kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-	else {
-		unsigned int cpu;
+	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+		return;
 
-		cpu = blk_mq_hctx_next_cpu(hctx);
-		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-	}
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+			&hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
-	if (is_sync) {
+	/*
+	 * If the driver supports defer issued based on 'last', then
+	 * queue it up like normal since we can potentially save some
+	 * CPU this way.
+	 */
+	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct blk_mq_queue_data bd = {
+			.rq = rq,
+			.list = NULL,
+			.last = 1
+		};
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 * error (busy), just add it to our list as we previously
 		 * would have done
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+		ret = q->mq_ops->queue_rq(data.hctx, &bd);
 		if (ret == BLK_MQ_RQ_QUEUE_OK)
 			goto done;
 		else {
@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * If a crashdump is active, then we are potentially in a very
-	 * memory constrained environment. Limit us to 1 queue and
-	 * 64 tags to prevent using too much memory.
-	 */
-	if (is_kdump_kernel()) {
-		set->nr_hw_queues = 1;
-		set->queue_depth = min(64U, set->queue_depth);
-	}
-
 	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
 			set->numa_node);
 
@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->queue_depth = BLK_MQ_MAX_DEPTH;
 	}
 
+	/*
+	 * If a crashdump is active, then we are potentially in a very
+	 * memory constrained environment. Limit us to 1 queue and
+	 * 64 tags to prevent using too much memory.
+	 */
+	if (is_kdump_kernel()) {
+		set->nr_hw_queues = 1;
+		set->queue_depth = min(64U, set->queue_depth);
+	}
+
 	set->tags = kmalloc_node(set->nr_hw_queues *
 				 sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
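
A likely reading of the two hunks above (my interpretation, not stated in the diff itself): the kdump clamp has to run in blk_mq_alloc_tag_set(), because that is where the tag sets and their requests are sized and allocated per hardware queue; clamping nr_hw_queues and queue_depth later, in blk_mq_init_queue(), came after that allocation and so did not actually reduce memory use in a crashdump kernel.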
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d567d5283ffa..206230e64f79 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 	data->hctx = hctx;
 }
 
+static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
+{
+	return hctx->nr_ctx && hctx->tags;
+}
+
 #endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index aa02247d227e..6ed2cbe5e8c9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
 			__func__, max_hw_sectors);
 	}
 
-	limits->max_hw_sectors = max_hw_sectors;
-	limits->max_sectors = min_t(unsigned int, max_hw_sectors,
-				    BLK_DEF_MAX_SECTORS);
+	limits->max_sectors = limits->max_hw_sectors = max_hw_sectors;
 }
 EXPORT_SYMBOL(blk_limits_max_hw_sectors);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1fac43408911..935ea2aa0730 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
  * Currently, its primary task it to free all the &struct request
  * structures that were allocated to the queue and the queue itself.
  *
- * Caveat:
- *     Hopefully the low level driver will have finished any
- *     outstanding requests first...
+ * Note:
+ *     The low level driver must have finished any outstanding requests first
+ *     via blk_cleanup_queue().
  **/
 static void blk_release_queue(struct kobject *kobj)
 {
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
-	blk_sync_queue(q);
-
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
@@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	if (q->mq_ops)
-		blk_mq_free_queue(q);
-	else
+	if (!q->mq_ops)
 		blk_free_flush_queue(q->fq);
 
 	blk_trace_shutdown(q);
diff --git a/block/genhd.c b/block/genhd.c
index bd3060684ab2..0a536dc05f3b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
 	struct disk_part_tbl *new_ptbl;
 	int len = old_ptbl ? old_ptbl->len : 0;
-	int target = partno + 1;
+	int i, target;
 	size_t size;
-	int i;
+
+	/*
+	 * check for int overflow, since we can get here from blkpg_ioctl()
+	 * with a user passed 'partno'.
+	 */
+	target = partno + 1;
+	if (target < 0)
+		return -EINVAL;
 
 	/* disk_max_parts() is zero during initialization, ignore if so */
 	if (disk_max_parts(disk) && target > disk_max_parts(disk))
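
For example, a blkpg_ioctl() caller passing partno = INT_MAX makes the old target = partno + 1 wrap to a negative value, which slips past the target > disk_max_parts(disk) check above before being used to size the new partition table; the added test rejects the overflowed value with -EINVAL first.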