author:    Jens Axboe <axboe@fb.com>    2014-05-28 11:50:26 -0400
committer: Jens Axboe <axboe@fb.com>    2014-05-28 11:50:26 -0400
commit:    6178976500ae61fa7b12ebb2d3de816dc9396388 (patch)
tree:      143df1479f56458801b676d038e6a7157a472981
parent:    6314a108ec19aefa5160535b2bfe1ca9c38efe37 (diff)
parent:    d852564f8c88b0604490234fdeeb6fb47e4bcc7a (diff)
Merge branch 'for-3.16/core' into for-3.16/drivers
mtip32xx uses blk_mq_alloc_reserved_request(), so pull in the
core changes so we have a properly merged end result.
Signed-off-by: Jens Axboe <axboe@fb.com>
33 files changed, 1471 insertions, 559 deletions
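
The commit message refers to blk_mq_alloc_reserved_request(); the merged core changes below fold reserved allocation into blk_mq_alloc_request() through an extra `reserved` argument (see the block/blk-mq.c hunks). A minimal, hypothetical driver-side sketch of the new call pattern — the helper name, the `0` read direction and GFP_KERNEL are placeholders, not code from this patch:

```c
#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/*
 * Hypothetical driver helper: grab one of the queue's reserved tags for an
 * internal command. Before this merge the equivalent call was
 * blk_mq_alloc_reserved_request(q, 0, GFP_KERNEL); the reserved pool is now
 * selected by the final boolean argument.
 */
static struct request *alloc_internal_cmd(struct request_queue *q)
{
	return blk_mq_alloc_request(q, 0, GFP_KERNEL, true);
}
```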
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 4f676838da06..bcdfdb9a9277 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -62,7 +62,7 @@ | |||
62 | !Efs/mpage.c | 62 | !Efs/mpage.c |
63 | !Efs/namei.c | 63 | !Efs/namei.c |
64 | !Efs/buffer.c | 64 | !Efs/buffer.c |
65 | !Efs/bio.c | 65 | !Eblock/bio.c |
66 | !Efs/seq_file.c | 66 | !Efs/seq_file.c |
67 | !Efs/filesystems.c | 67 | !Efs/filesystems.c |
68 | !Efs/fs-writeback.c | 68 | !Efs/fs-writeback.c |
diff --git a/block/Makefile b/block/Makefile
index 20645e88fb57..a2ce6ac935ec 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,13 +2,15 @@ | |||
2 | # Makefile for the kernel block layer | 2 | # Makefile for the kernel block layer |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 5 | obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ |
6 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ | 6 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ |
7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
8 | blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ | 8 | blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ |
9 | blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ | 9 | blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ |
10 | genhd.o scsi_ioctl.o partition-generic.o partitions/ | 10 | genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ |
11 | partitions/ | ||
11 | 12 | ||
13 | obj-$(CONFIG_BOUNCE) += bounce.o | ||
12 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 14 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
13 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o | 15 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o |
14 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 16 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | |||
20 | obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o | 22 | obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
21 | obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o | 23 | obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o |
22 | obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o | 24 | obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o |
25 | obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o | ||
diff --git a/fs/bio-integrity.c b/block/bio-integrity.c
index 1c2ce0c87711..9e241063a616 100644
--- a/fs/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) | |||
617 | if (!bs->bio_integrity_pool) | 617 | if (!bs->bio_integrity_pool) |
618 | return -1; | 618 | return -1; |
619 | 619 | ||
620 | bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); | 620 | bs->bvec_integrity_pool = biovec_create_pool(pool_size); |
621 | if (!bs->bvec_integrity_pool) { | 621 | if (!bs->bvec_integrity_pool) { |
622 | mempool_destroy(bs->bio_integrity_pool); | 622 | mempool_destroy(bs->bio_integrity_pool); |
623 | return -1; | 623 | return -1; |
diff --git a/fs/bio.c b/block/bio.c
index 6f0362b77806..96d28eee8a1e 100644
--- a/fs/bio.c
+++ b/block/bio.c
@@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error) | |||
305 | 305 | ||
306 | /** | 306 | /** |
307 | * bio_chain - chain bio completions | 307 | * bio_chain - chain bio completions |
308 | * @bio: the target bio | ||
309 | * @parent: the @bio's parent bio | ||
308 | * | 310 | * |
309 | * The caller won't have a bi_end_io called when @bio completes - instead, | 311 | * The caller won't have a bi_end_io called when @bio completes - instead, |
310 | * @parent's bi_end_io won't be called until both @parent and @bio have | 312 | * @parent's bi_end_io won't be called until both @parent and @bio have |
@@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, | |||
1011 | bio->bi_private = bmd; | 1013 | bio->bi_private = bmd; |
1012 | } | 1014 | } |
1013 | 1015 | ||
1014 | static struct bio_map_data *bio_alloc_map_data(int nr_segs, | 1016 | static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, |
1015 | unsigned int iov_count, | ||
1016 | gfp_t gfp_mask) | 1017 | gfp_t gfp_mask) |
1017 | { | 1018 | { |
1018 | if (iov_count > UIO_MAXIOV) | 1019 | if (iov_count > UIO_MAXIOV) |
@@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, | |||
1154 | if (offset) | 1155 | if (offset) |
1155 | nr_pages++; | 1156 | nr_pages++; |
1156 | 1157 | ||
1157 | bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); | 1158 | bmd = bio_alloc_map_data(iov_count, gfp_mask); |
1158 | if (!bmd) | 1159 | if (!bmd) |
1159 | return ERR_PTR(-ENOMEM); | 1160 | return ERR_PTR(-ENOMEM); |
1160 | 1161 | ||
@@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim); | |||
1859 | * create memory pools for biovec's in a bio_set. | 1860 | * create memory pools for biovec's in a bio_set. |
1860 | * use the global biovec slabs created for general use. | 1861 | * use the global biovec slabs created for general use. |
1861 | */ | 1862 | */ |
1862 | mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) | 1863 | mempool_t *biovec_create_pool(int pool_entries) |
1863 | { | 1864 | { |
1864 | struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; | 1865 | struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; |
1865 | 1866 | ||
@@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) | |||
1922 | if (!bs->bio_pool) | 1923 | if (!bs->bio_pool) |
1923 | goto bad; | 1924 | goto bad; |
1924 | 1925 | ||
1925 | bs->bvec_pool = biovec_create_pool(bs, pool_size); | 1926 | bs->bvec_pool = biovec_create_pool(pool_size); |
1926 | if (!bs->bvec_pool) | 1927 | if (!bs->bvec_pool) |
1927 | goto bad; | 1928 | goto bad; |
1928 | 1929 | ||
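
The block/bio.c hunk above adds kernel-doc for bio_chain() describing its completion ordering. A short illustrative sketch of a caller follows; the function, variable names and the READ direction are invented for illustration, and only the bio_chain() semantics come from the documentation in the hunk:

```c
#include <linux/bio.h>
#include <linux/fs.h>

/*
 * Illustrative only: 'extra' carries part of the I/O, but completion is
 * reported once, through 'parent'. parent->bi_end_io does not run until
 * both bios have completed, and 'extra' is freed when it completes.
 */
static void submit_with_chained_child(struct bio *parent, struct bio *extra)
{
	bio_chain(extra, parent);
	submit_bio(READ, extra);
	submit_bio(READ, parent);
}
```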
diff --git a/block/blk-core.c b/block/blk-core.c
index c4269701cb4f..d87be5b4e554 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -576,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
576 | if (!q) | 576 | if (!q) |
577 | return NULL; | 577 | return NULL; |
578 | 578 | ||
579 | if (percpu_counter_init(&q->mq_usage_counter, 0)) | ||
580 | goto fail_q; | ||
581 | |||
582 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); | 579 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); |
583 | if (q->id < 0) | 580 | if (q->id < 0) |
584 | goto fail_c; | 581 | goto fail_q; |
585 | 582 | ||
586 | q->backing_dev_info.ra_pages = | 583 | q->backing_dev_info.ra_pages = |
587 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 584 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
@@ -639,8 +636,6 @@ fail_bdi: | |||
639 | bdi_destroy(&q->backing_dev_info); | 636 | bdi_destroy(&q->backing_dev_info); |
640 | fail_id: | 637 | fail_id: |
641 | ida_simple_remove(&blk_queue_ida, q->id); | 638 | ida_simple_remove(&blk_queue_ida, q->id); |
642 | fail_c: | ||
643 | percpu_counter_destroy(&q->mq_usage_counter); | ||
644 | fail_q: | 639 | fail_q: |
645 | kmem_cache_free(blk_requestq_cachep, q); | 640 | kmem_cache_free(blk_requestq_cachep, q); |
646 | return NULL; | 641 | return NULL; |
@@ -848,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) | |||
848 | __freed_request(rl, sync ^ 1); | 843 | __freed_request(rl, sync ^ 1); |
849 | } | 844 | } |
850 | 845 | ||
846 | int blk_update_nr_requests(struct request_queue *q, unsigned int nr) | ||
847 | { | ||
848 | struct request_list *rl; | ||
849 | |||
850 | spin_lock_irq(q->queue_lock); | ||
851 | q->nr_requests = nr; | ||
852 | blk_queue_congestion_threshold(q); | ||
853 | |||
854 | /* congestion isn't cgroup aware and follows root blkcg for now */ | ||
855 | rl = &q->root_rl; | ||
856 | |||
857 | if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) | ||
858 | blk_set_queue_congested(q, BLK_RW_SYNC); | ||
859 | else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) | ||
860 | blk_clear_queue_congested(q, BLK_RW_SYNC); | ||
861 | |||
862 | if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) | ||
863 | blk_set_queue_congested(q, BLK_RW_ASYNC); | ||
864 | else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) | ||
865 | blk_clear_queue_congested(q, BLK_RW_ASYNC); | ||
866 | |||
867 | blk_queue_for_each_rl(rl, q) { | ||
868 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | ||
869 | blk_set_rl_full(rl, BLK_RW_SYNC); | ||
870 | } else { | ||
871 | blk_clear_rl_full(rl, BLK_RW_SYNC); | ||
872 | wake_up(&rl->wait[BLK_RW_SYNC]); | ||
873 | } | ||
874 | |||
875 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | ||
876 | blk_set_rl_full(rl, BLK_RW_ASYNC); | ||
877 | } else { | ||
878 | blk_clear_rl_full(rl, BLK_RW_ASYNC); | ||
879 | wake_up(&rl->wait[BLK_RW_ASYNC]); | ||
880 | } | ||
881 | } | ||
882 | |||
883 | spin_unlock_irq(q->queue_lock); | ||
884 | return 0; | ||
885 | } | ||
886 | |||
851 | /* | 887 | /* |
852 | * Determine if elevator data should be initialized when allocating the | 888 | * Determine if elevator data should be initialized when allocating the |
853 | * request associated with @bio. | 889 | * request associated with @bio. |
@@ -1137,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, | |||
1137 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | 1173 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) |
1138 | { | 1174 | { |
1139 | if (q->mq_ops) | 1175 | if (q->mq_ops) |
1140 | return blk_mq_alloc_request(q, rw, gfp_mask); | 1176 | return blk_mq_alloc_request(q, rw, gfp_mask, false); |
1141 | else | 1177 | else |
1142 | return blk_old_get_request(q, rw, gfp_mask); | 1178 | return blk_old_get_request(q, rw, gfp_mask); |
1143 | } | 1179 | } |
@@ -1233,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, | |||
1233 | static void part_round_stats_single(int cpu, struct hd_struct *part, | 1269 | static void part_round_stats_single(int cpu, struct hd_struct *part, |
1234 | unsigned long now) | 1270 | unsigned long now) |
1235 | { | 1271 | { |
1272 | int inflight; | ||
1273 | |||
1236 | if (now == part->stamp) | 1274 | if (now == part->stamp) |
1237 | return; | 1275 | return; |
1238 | 1276 | ||
1239 | if (part_in_flight(part)) { | 1277 | inflight = part_in_flight(part); |
1278 | if (inflight) { | ||
1240 | __part_stat_add(cpu, part, time_in_queue, | 1279 | __part_stat_add(cpu, part, time_in_queue, |
1241 | part_in_flight(part) * (now - part->stamp)); | 1280 | inflight * (now - part->stamp)); |
1242 | __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); | 1281 | __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); |
1243 | } | 1282 | } |
1244 | part->stamp = now; | 1283 | part->stamp = now; |
@@ -1427,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | |||
1427 | * added on the elevator at this point. In addition, we don't have | 1466 | * added on the elevator at this point. In addition, we don't have |
1428 | * reliable access to the elevator outside queue lock. Only check basic | 1467 | * reliable access to the elevator outside queue lock. Only check basic |
1429 | * merging parameters without querying the elevator. | 1468 | * merging parameters without querying the elevator. |
1469 | * | ||
1470 | * Caller must ensure !blk_queue_nomerges(q) beforehand. | ||
1430 | */ | 1471 | */ |
1431 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | 1472 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
1432 | unsigned int *request_count) | 1473 | unsigned int *request_count) |
@@ -1436,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
1436 | bool ret = false; | 1477 | bool ret = false; |
1437 | struct list_head *plug_list; | 1478 | struct list_head *plug_list; |
1438 | 1479 | ||
1439 | if (blk_queue_nomerges(q)) | ||
1440 | goto out; | ||
1441 | |||
1442 | plug = current->plug; | 1480 | plug = current->plug; |
1443 | if (!plug) | 1481 | if (!plug) |
1444 | goto out; | 1482 | goto out; |
@@ -1517,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) | |||
1517 | * Check if we can merge with the plugged list before grabbing | 1555 | * Check if we can merge with the plugged list before grabbing |
1518 | * any locks. | 1556 | * any locks. |
1519 | */ | 1557 | */ |
1520 | if (blk_attempt_plug_merge(q, bio, &request_count)) | 1558 | if (!blk_queue_nomerges(q) && |
1559 | blk_attempt_plug_merge(q, bio, &request_count)) | ||
1521 | return; | 1560 | return; |
1522 | 1561 | ||
1523 | spin_lock_irq(q->queue_lock); | 1562 | spin_lock_irq(q->queue_lock); |
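
blk_update_nr_requests(), added in the blk-core.c hunk above, centralizes the congestion and request-list-full bookkeeping when the request pool depth changes. Its caller is not part of the hunks shown here; the sketch below is a hedged guess at how a store-style handler could use it, and the handler name, kstrtoul parsing and BLKDEV_MIN_RQ bound are assumptions:

```c
#include <linux/blkdev.h>
#include <linux/kernel.h>

/* Hypothetical "nr_requests" store handler for a non-mq queue. */
static ssize_t queue_requests_store_example(struct request_queue *q,
					    const char *page, size_t count)
{
	unsigned long nr;
	int err;

	err = kstrtoul(page, 10, &nr);
	if (err || nr < BLKDEV_MIN_RQ)
		return -EINVAL;

	err = blk_update_nr_requests(q, nr);	/* declared in block/blk.h */
	if (err)
		return err;

	return count;
}
```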
diff --git a/block/blk-flush.c b/block/blk-flush.c
index ec7a224d6733..ef608b35d9be 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq) | |||
130 | blk_clear_rq_complete(rq); | 130 | blk_clear_rq_complete(rq); |
131 | } | 131 | } |
132 | 132 | ||
133 | static void mq_flush_run(struct work_struct *work) | ||
134 | { | ||
135 | struct request *rq; | ||
136 | |||
137 | rq = container_of(work, struct request, requeue_work); | ||
138 | |||
139 | memset(&rq->csd, 0, sizeof(rq->csd)); | ||
140 | blk_mq_insert_request(rq, false, true, false); | ||
141 | } | ||
142 | |||
143 | static bool blk_flush_queue_rq(struct request *rq, bool add_front) | 133 | static bool blk_flush_queue_rq(struct request *rq, bool add_front) |
144 | { | 134 | { |
145 | if (rq->q->mq_ops) { | 135 | if (rq->q->mq_ops) { |
146 | INIT_WORK(&rq->requeue_work, mq_flush_run); | 136 | struct request_queue *q = rq->q; |
147 | kblockd_schedule_work(&rq->requeue_work); | 137 | |
138 | blk_mq_add_to_requeue_list(rq, add_front); | ||
139 | blk_mq_kick_requeue_list(q); | ||
148 | return false; | 140 | return false; |
149 | } else { | 141 | } else { |
150 | if (add_front) | 142 | if (add_front) |
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index c11d24e379e2..d828b44a404b 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete); | |||
64 | * iopoll handler will not be invoked again before blk_iopoll_sched_prep() | 64 | * iopoll handler will not be invoked again before blk_iopoll_sched_prep() |
65 | * is called. | 65 | * is called. |
66 | **/ | 66 | **/ |
67 | void blk_iopoll_complete(struct blk_iopoll *iopoll) | 67 | void blk_iopoll_complete(struct blk_iopoll *iop) |
68 | { | 68 | { |
69 | unsigned long flags; | 69 | unsigned long flags; |
70 | 70 | ||
71 | local_irq_save(flags); | 71 | local_irq_save(flags); |
72 | __blk_iopoll_complete(iopoll); | 72 | __blk_iopoll_complete(iop); |
73 | local_irq_restore(flags); | 73 | local_irq_restore(flags); |
74 | } | 74 | } |
75 | EXPORT_SYMBOL(blk_iopoll_complete); | 75 | EXPORT_SYMBOL(blk_iopoll_complete); |
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 97a733cf3d5f..8411be3c19d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); | |||
226 | * Generate and issue number of bios with zerofiled pages. | 226 | * Generate and issue number of bios with zerofiled pages. |
227 | */ | 227 | */ |
228 | 228 | ||
229 | int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 229 | static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
230 | sector_t nr_sects, gfp_t gfp_mask) | 230 | sector_t nr_sects, gfp_t gfp_mask) |
231 | { | 231 | { |
232 | int ret; | 232 | int ret; |
233 | struct bio *bio; | 233 | struct bio *bio; |
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..d2c253f71b86 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -18,14 +18,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, | |||
18 | { | 18 | { |
19 | unsigned int cpu = (unsigned long) hcpu; | 19 | unsigned int cpu = (unsigned long) hcpu; |
20 | struct blk_mq_cpu_notifier *notify; | 20 | struct blk_mq_cpu_notifier *notify; |
21 | int ret = NOTIFY_OK; | ||
21 | 22 | ||
22 | raw_spin_lock(&blk_mq_cpu_notify_lock); | 23 | raw_spin_lock(&blk_mq_cpu_notify_lock); |
23 | 24 | ||
24 | list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) | 25 | list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { |
25 | notify->notify(notify->data, action, cpu); | 26 | ret = notify->notify(notify->data, action, cpu); |
27 | if (ret != NOTIFY_OK) | ||
28 | break; | ||
29 | } | ||
26 | 30 | ||
27 | raw_spin_unlock(&blk_mq_cpu_notify_lock); | 31 | raw_spin_unlock(&blk_mq_cpu_notify_lock); |
28 | return NOTIFY_OK; | 32 | return ret; |
29 | } | 33 | } |
30 | 34 | ||
31 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) | 35 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) |
@@ -45,7 +49,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) | |||
45 | } | 49 | } |
46 | 50 | ||
47 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, | 51 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, |
48 | void (*fn)(void *, unsigned long, unsigned int), | 52 | int (*fn)(void *, unsigned long, unsigned int), |
49 | void *data) | 53 | void *data) |
50 | { | 54 | { |
51 | notifier->notify = fn; | 55 | notifier->notify = fn; |
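
The blk-mq-cpu.c hunk above changes the per-hctx CPU notifier callback from returning void to returning an int, and blk_mq_main_cpu_notify() now stops walking the list as soon as a callback returns something other than NOTIFY_OK. A sketch of a callback under the new signature; the name and the placeholder action handling are assumptions, not code from this patch:

```c
#include <linux/cpu.h>
#include <linux/notifier.h>

/* Hypothetical callback registered via blk_mq_init_cpu_notifier(). */
static int example_hctx_cpu_notify(void *data, unsigned long action,
				   unsigned int cpu)
{
	/* 'data' is whatever pointer was handed to blk_mq_init_cpu_notifier() */
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		/* ... migrate pending requests off 'cpu' here ... */
	}

	return NOTIFY_OK;	/* any other value stops blk_mq_main_cpu_notify() */
}
```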
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5d0f93cf358c..0daacb927be1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -96,3 +96,19 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) | |||
96 | kfree(map); | 96 | kfree(map); |
97 | return NULL; | 97 | return NULL; |
98 | } | 98 | } |
99 | |||
100 | /* | ||
101 | * We have no quick way of doing reverse lookups. This is only used at | ||
102 | * queue init time, so runtime isn't important. | ||
103 | */ | ||
104 | int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) | ||
105 | { | ||
106 | int i; | ||
107 | |||
108 | for_each_possible_cpu(i) { | ||
109 | if (index == mq_map[i]) | ||
110 | return cpu_to_node(i); | ||
111 | } | ||
112 | |||
113 | return NUMA_NO_NODE; | ||
114 | } | ||
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 9176a6984857..99a60a829e69 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,45 +203,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, | |||
203 | return ret; | 203 | return ret; |
204 | } | 204 | } |
205 | 205 | ||
206 | static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) | 206 | static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) |
207 | { | ||
208 | ssize_t ret; | ||
209 | |||
210 | spin_lock(&hctx->lock); | ||
211 | ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); | ||
212 | spin_unlock(&hctx->lock); | ||
213 | |||
214 | return ret; | ||
215 | } | ||
216 | |||
217 | static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, | ||
218 | const char *page, size_t len) | ||
219 | { | 207 | { |
220 | struct blk_mq_ctx *ctx; | 208 | return blk_mq_tag_sysfs_show(hctx->tags, page); |
221 | unsigned long ret; | ||
222 | unsigned int i; | ||
223 | |||
224 | if (kstrtoul(page, 10, &ret)) { | ||
225 | pr_err("blk-mq-sysfs: invalid input '%s'\n", page); | ||
226 | return -EINVAL; | ||
227 | } | ||
228 | |||
229 | spin_lock(&hctx->lock); | ||
230 | if (ret) | ||
231 | hctx->flags |= BLK_MQ_F_SHOULD_IPI; | ||
232 | else | ||
233 | hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; | ||
234 | spin_unlock(&hctx->lock); | ||
235 | |||
236 | hctx_for_each_ctx(hctx, ctx, i) | ||
237 | ctx->ipi_redirect = !!ret; | ||
238 | |||
239 | return len; | ||
240 | } | 209 | } |
241 | 210 | ||
242 | static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) | 211 | static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) |
243 | { | 212 | { |
244 | return blk_mq_tag_sysfs_show(hctx->tags, page); | 213 | return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); |
245 | } | 214 | } |
246 | 215 | ||
247 | static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) | 216 | static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) |
@@ -303,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { | |||
303 | .attr = {.name = "dispatched", .mode = S_IRUGO }, | 272 | .attr = {.name = "dispatched", .mode = S_IRUGO }, |
304 | .show = blk_mq_hw_sysfs_dispatched_show, | 273 | .show = blk_mq_hw_sysfs_dispatched_show, |
305 | }; | 274 | }; |
275 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { | ||
276 | .attr = {.name = "active", .mode = S_IRUGO }, | ||
277 | .show = blk_mq_hw_sysfs_active_show, | ||
278 | }; | ||
306 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { | 279 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { |
307 | .attr = {.name = "pending", .mode = S_IRUGO }, | 280 | .attr = {.name = "pending", .mode = S_IRUGO }, |
308 | .show = blk_mq_hw_sysfs_rq_list_show, | 281 | .show = blk_mq_hw_sysfs_rq_list_show, |
309 | }; | 282 | }; |
310 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { | ||
311 | .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, | ||
312 | .show = blk_mq_hw_sysfs_ipi_show, | ||
313 | .store = blk_mq_hw_sysfs_ipi_store, | ||
314 | }; | ||
315 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { | 283 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { |
316 | .attr = {.name = "tags", .mode = S_IRUGO }, | 284 | .attr = {.name = "tags", .mode = S_IRUGO }, |
317 | .show = blk_mq_hw_sysfs_tags_show, | 285 | .show = blk_mq_hw_sysfs_tags_show, |
@@ -326,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = { | |||
326 | &blk_mq_hw_sysfs_run.attr, | 294 | &blk_mq_hw_sysfs_run.attr, |
327 | &blk_mq_hw_sysfs_dispatched.attr, | 295 | &blk_mq_hw_sysfs_dispatched.attr, |
328 | &blk_mq_hw_sysfs_pending.attr, | 296 | &blk_mq_hw_sysfs_pending.attr, |
329 | &blk_mq_hw_sysfs_ipi.attr, | ||
330 | &blk_mq_hw_sysfs_tags.attr, | 297 | &blk_mq_hw_sysfs_tags.attr, |
331 | &blk_mq_hw_sysfs_cpus.attr, | 298 | &blk_mq_hw_sysfs_cpus.attr, |
299 | &blk_mq_hw_sysfs_active.attr, | ||
332 | NULL, | 300 | NULL, |
333 | }; | 301 | }; |
334 | 302 | ||
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7a799c46c32d..0d0640d38a06 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,64 +1,333 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/module.h> | 2 | #include <linux/module.h> |
3 | #include <linux/random.h> | ||
3 | 4 | ||
4 | #include <linux/blk-mq.h> | 5 | #include <linux/blk-mq.h> |
5 | #include "blk.h" | 6 | #include "blk.h" |
6 | #include "blk-mq.h" | 7 | #include "blk-mq.h" |
7 | #include "blk-mq-tag.h" | 8 | #include "blk-mq-tag.h" |
8 | 9 | ||
9 | void blk_mq_wait_for_tags(struct blk_mq_tags *tags) | 10 | static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) |
10 | { | 11 | { |
11 | int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); | 12 | int i; |
12 | blk_mq_put_tag(tags, tag); | 13 | |
14 | for (i = 0; i < bt->map_nr; i++) { | ||
15 | struct blk_align_bitmap *bm = &bt->map[i]; | ||
16 | int ret; | ||
17 | |||
18 | ret = find_first_zero_bit(&bm->word, bm->depth); | ||
19 | if (ret < bm->depth) | ||
20 | return true; | ||
21 | } | ||
22 | |||
23 | return false; | ||
13 | } | 24 | } |
14 | 25 | ||
15 | bool blk_mq_has_free_tags(struct blk_mq_tags *tags) | 26 | bool blk_mq_has_free_tags(struct blk_mq_tags *tags) |
16 | { | 27 | { |
17 | return !tags || | 28 | if (!tags) |
18 | percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; | 29 | return true; |
30 | |||
31 | return bt_has_free_tags(&tags->bitmap_tags); | ||
32 | } | ||
33 | |||
34 | static inline void bt_index_inc(unsigned int *index) | ||
35 | { | ||
36 | *index = (*index + 1) & (BT_WAIT_QUEUES - 1); | ||
37 | } | ||
38 | |||
39 | /* | ||
40 | * If a previously inactive queue goes active, bump the active user count. | ||
41 | */ | ||
42 | bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) | ||
43 | { | ||
44 | if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && | ||
45 | !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) | ||
46 | atomic_inc(&hctx->tags->active_queues); | ||
47 | |||
48 | return true; | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | * Wakeup all potentially sleeping on normal (non-reserved) tags | ||
53 | */ | ||
54 | static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) | ||
55 | { | ||
56 | struct blk_mq_bitmap_tags *bt; | ||
57 | int i, wake_index; | ||
58 | |||
59 | bt = &tags->bitmap_tags; | ||
60 | wake_index = bt->wake_index; | ||
61 | for (i = 0; i < BT_WAIT_QUEUES; i++) { | ||
62 | struct bt_wait_state *bs = &bt->bs[wake_index]; | ||
63 | |||
64 | if (waitqueue_active(&bs->wait)) | ||
65 | wake_up(&bs->wait); | ||
66 | |||
67 | bt_index_inc(&wake_index); | ||
68 | } | ||
19 | } | 69 | } |
20 | 70 | ||
21 | static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) | 71 | /* |
72 | * If a previously busy queue goes inactive, potential waiters could now | ||
73 | * be allowed to queue. Wake them up and check. | ||
74 | */ | ||
75 | void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) | ||
22 | { | 76 | { |
77 | struct blk_mq_tags *tags = hctx->tags; | ||
78 | |||
79 | if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) | ||
80 | return; | ||
81 | |||
82 | atomic_dec(&tags->active_queues); | ||
83 | |||
84 | blk_mq_tag_wakeup_all(tags); | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * For shared tag users, we track the number of currently active users | ||
89 | * and attempt to provide a fair share of the tag depth for each of them. | ||
90 | */ | ||
91 | static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, | ||
92 | struct blk_mq_bitmap_tags *bt) | ||
93 | { | ||
94 | unsigned int depth, users; | ||
95 | |||
96 | if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) | ||
97 | return true; | ||
98 | if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) | ||
99 | return true; | ||
100 | |||
101 | /* | ||
102 | * Don't try dividing an ant | ||
103 | */ | ||
104 | if (bt->depth == 1) | ||
105 | return true; | ||
106 | |||
107 | users = atomic_read(&hctx->tags->active_queues); | ||
108 | if (!users) | ||
109 | return true; | ||
110 | |||
111 | /* | ||
112 | * Allow at least some tags | ||
113 | */ | ||
114 | depth = max((bt->depth + users - 1) / users, 4U); | ||
115 | return atomic_read(&hctx->nr_active) < depth; | ||
116 | } | ||
117 | |||
118 | static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) | ||
119 | { | ||
120 | int tag, org_last_tag, end; | ||
121 | |||
122 | org_last_tag = last_tag; | ||
123 | end = bm->depth; | ||
124 | do { | ||
125 | restart: | ||
126 | tag = find_next_zero_bit(&bm->word, end, last_tag); | ||
127 | if (unlikely(tag >= end)) { | ||
128 | /* | ||
129 | * We started with an offset, start from 0 to | ||
130 | * exhaust the map. | ||
131 | */ | ||
132 | if (org_last_tag && last_tag) { | ||
133 | end = last_tag; | ||
134 | last_tag = 0; | ||
135 | goto restart; | ||
136 | } | ||
137 | return -1; | ||
138 | } | ||
139 | last_tag = tag + 1; | ||
140 | } while (test_and_set_bit_lock(tag, &bm->word)); | ||
141 | |||
142 | return tag; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Straight forward bitmap tag implementation, where each bit is a tag | ||
147 | * (cleared == free, and set == busy). The small twist is using per-cpu | ||
148 | * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue | ||
149 | * contexts. This enables us to drastically limit the space searched, | ||
150 | * without dirtying an extra shared cacheline like we would if we stored | ||
151 | * the cache value inside the shared blk_mq_bitmap_tags structure. On top | ||
152 | * of that, each word of tags is in a separate cacheline. This means that | ||
153 | * multiple users will tend to stick to different cachelines, at least | ||
154 | * until the map is exhausted. | ||
155 | */ | ||
156 | static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, | ||
157 | unsigned int *tag_cache) | ||
158 | { | ||
159 | unsigned int last_tag, org_last_tag; | ||
160 | int index, i, tag; | ||
161 | |||
162 | if (!hctx_may_queue(hctx, bt)) | ||
163 | return -1; | ||
164 | |||
165 | last_tag = org_last_tag = *tag_cache; | ||
166 | index = TAG_TO_INDEX(bt, last_tag); | ||
167 | |||
168 | for (i = 0; i < bt->map_nr; i++) { | ||
169 | tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); | ||
170 | if (tag != -1) { | ||
171 | tag += (index << bt->bits_per_word); | ||
172 | goto done; | ||
173 | } | ||
174 | |||
175 | last_tag = 0; | ||
176 | if (++index >= bt->map_nr) | ||
177 | index = 0; | ||
178 | } | ||
179 | |||
180 | *tag_cache = 0; | ||
181 | return -1; | ||
182 | |||
183 | /* | ||
184 | * Only update the cache from the allocation path, if we ended | ||
185 | * up using the specific cached tag. | ||
186 | */ | ||
187 | done: | ||
188 | if (tag == org_last_tag) { | ||
189 | last_tag = tag + 1; | ||
190 | if (last_tag >= bt->depth - 1) | ||
191 | last_tag = 0; | ||
192 | |||
193 | *tag_cache = last_tag; | ||
194 | } | ||
195 | |||
196 | return tag; | ||
197 | } | ||
198 | |||
199 | static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, | ||
200 | struct blk_mq_hw_ctx *hctx) | ||
201 | { | ||
202 | struct bt_wait_state *bs; | ||
203 | |||
204 | if (!hctx) | ||
205 | return &bt->bs[0]; | ||
206 | |||
207 | bs = &bt->bs[hctx->wait_index]; | ||
208 | bt_index_inc(&hctx->wait_index); | ||
209 | return bs; | ||
210 | } | ||
211 | |||
212 | static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, | ||
213 | unsigned int *last_tag, gfp_t gfp) | ||
214 | { | ||
215 | struct bt_wait_state *bs; | ||
216 | DEFINE_WAIT(wait); | ||
23 | int tag; | 217 | int tag; |
24 | 218 | ||
25 | tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? | 219 | tag = __bt_get(hctx, bt, last_tag); |
26 | TASK_UNINTERRUPTIBLE : TASK_RUNNING); | 220 | if (tag != -1) |
27 | if (tag < 0) | 221 | return tag; |
28 | return BLK_MQ_TAG_FAIL; | 222 | |
29 | return tag + tags->nr_reserved_tags; | 223 | if (!(gfp & __GFP_WAIT)) |
224 | return -1; | ||
225 | |||
226 | bs = bt_wait_ptr(bt, hctx); | ||
227 | do { | ||
228 | bool was_empty; | ||
229 | |||
230 | was_empty = list_empty(&wait.task_list); | ||
231 | prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
232 | |||
233 | tag = __bt_get(hctx, bt, last_tag); | ||
234 | if (tag != -1) | ||
235 | break; | ||
236 | |||
237 | if (was_empty) | ||
238 | atomic_set(&bs->wait_cnt, bt->wake_cnt); | ||
239 | |||
240 | io_schedule(); | ||
241 | } while (1); | ||
242 | |||
243 | finish_wait(&bs->wait, &wait); | ||
244 | return tag; | ||
245 | } | ||
246 | |||
247 | static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, | ||
248 | struct blk_mq_hw_ctx *hctx, | ||
249 | unsigned int *last_tag, gfp_t gfp) | ||
250 | { | ||
251 | int tag; | ||
252 | |||
253 | tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp); | ||
254 | if (tag >= 0) | ||
255 | return tag + tags->nr_reserved_tags; | ||
256 | |||
257 | return BLK_MQ_TAG_FAIL; | ||
30 | } | 258 | } |
31 | 259 | ||
32 | static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, | 260 | static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, |
33 | gfp_t gfp) | 261 | gfp_t gfp) |
34 | { | 262 | { |
35 | int tag; | 263 | int tag, zero = 0; |
36 | 264 | ||
37 | if (unlikely(!tags->nr_reserved_tags)) { | 265 | if (unlikely(!tags->nr_reserved_tags)) { |
38 | WARN_ON_ONCE(1); | 266 | WARN_ON_ONCE(1); |
39 | return BLK_MQ_TAG_FAIL; | 267 | return BLK_MQ_TAG_FAIL; |
40 | } | 268 | } |
41 | 269 | ||
42 | tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? | 270 | tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp); |
43 | TASK_UNINTERRUPTIBLE : TASK_RUNNING); | ||
44 | if (tag < 0) | 271 | if (tag < 0) |
45 | return BLK_MQ_TAG_FAIL; | 272 | return BLK_MQ_TAG_FAIL; |
273 | |||
46 | return tag; | 274 | return tag; |
47 | } | 275 | } |
48 | 276 | ||
49 | unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) | 277 | unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, |
278 | gfp_t gfp, bool reserved) | ||
50 | { | 279 | { |
51 | if (!reserved) | 280 | if (!reserved) |
52 | return __blk_mq_get_tag(tags, gfp); | 281 | return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); |
53 | 282 | ||
54 | return __blk_mq_get_reserved_tag(tags, gfp); | 283 | return __blk_mq_get_reserved_tag(hctx->tags, gfp); |
284 | } | ||
285 | |||
286 | static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) | ||
287 | { | ||
288 | int i, wake_index; | ||
289 | |||
290 | wake_index = bt->wake_index; | ||
291 | for (i = 0; i < BT_WAIT_QUEUES; i++) { | ||
292 | struct bt_wait_state *bs = &bt->bs[wake_index]; | ||
293 | |||
294 | if (waitqueue_active(&bs->wait)) { | ||
295 | if (wake_index != bt->wake_index) | ||
296 | bt->wake_index = wake_index; | ||
297 | |||
298 | return bs; | ||
299 | } | ||
300 | |||
301 | bt_index_inc(&wake_index); | ||
302 | } | ||
303 | |||
304 | return NULL; | ||
305 | } | ||
306 | |||
307 | static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) | ||
308 | { | ||
309 | const int index = TAG_TO_INDEX(bt, tag); | ||
310 | struct bt_wait_state *bs; | ||
311 | |||
312 | /* | ||
313 | * The unlock memory barrier need to order access to req in free | ||
314 | * path and clearing tag bit | ||
315 | */ | ||
316 | clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); | ||
317 | |||
318 | bs = bt_wake_ptr(bt); | ||
319 | if (bs && atomic_dec_and_test(&bs->wait_cnt)) { | ||
320 | atomic_set(&bs->wait_cnt, bt->wake_cnt); | ||
321 | bt_index_inc(&bt->wake_index); | ||
322 | wake_up(&bs->wait); | ||
323 | } | ||
55 | } | 324 | } |
56 | 325 | ||
57 | static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) | 326 | static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) |
58 | { | 327 | { |
59 | BUG_ON(tag >= tags->nr_tags); | 328 | BUG_ON(tag >= tags->nr_tags); |
60 | 329 | ||
61 | percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); | 330 | bt_clear_tag(&tags->bitmap_tags, tag); |
62 | } | 331 | } |
63 | 332 | ||
64 | static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, | 333 | static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, |
@@ -66,22 +335,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, | |||
66 | { | 335 | { |
67 | BUG_ON(tag >= tags->nr_reserved_tags); | 336 | BUG_ON(tag >= tags->nr_reserved_tags); |
68 | 337 | ||
69 | percpu_ida_free(&tags->reserved_tags, tag); | 338 | bt_clear_tag(&tags->breserved_tags, tag); |
70 | } | 339 | } |
71 | 340 | ||
72 | void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) | 341 | void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, |
342 | unsigned int *last_tag) | ||
73 | { | 343 | { |
74 | if (tag >= tags->nr_reserved_tags) | 344 | struct blk_mq_tags *tags = hctx->tags; |
75 | __blk_mq_put_tag(tags, tag); | 345 | |
76 | else | 346 | if (tag >= tags->nr_reserved_tags) { |
347 | const int real_tag = tag - tags->nr_reserved_tags; | ||
348 | |||
349 | __blk_mq_put_tag(tags, real_tag); | ||
350 | *last_tag = real_tag; | ||
351 | } else | ||
77 | __blk_mq_put_reserved_tag(tags, tag); | 352 | __blk_mq_put_reserved_tag(tags, tag); |
78 | } | 353 | } |
79 | 354 | ||
80 | static int __blk_mq_tag_iter(unsigned id, void *data) | 355 | static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, |
356 | unsigned long *free_map, unsigned int off) | ||
81 | { | 357 | { |
82 | unsigned long *tag_map = data; | 358 | int i; |
83 | __set_bit(id, tag_map); | 359 | |
84 | return 0; | 360 | for (i = 0; i < bt->map_nr; i++) { |
361 | struct blk_align_bitmap *bm = &bt->map[i]; | ||
362 | int bit = 0; | ||
363 | |||
364 | do { | ||
365 | bit = find_next_zero_bit(&bm->word, bm->depth, bit); | ||
366 | if (bit >= bm->depth) | ||
367 | break; | ||
368 | |||
369 | __set_bit(bit + off, free_map); | ||
370 | bit++; | ||
371 | } while (1); | ||
372 | |||
373 | off += (1 << bt->bits_per_word); | ||
374 | } | ||
85 | } | 375 | } |
86 | 376 | ||
87 | void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, | 377 | void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, |
@@ -95,21 +385,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, | |||
95 | if (!tag_map) | 385 | if (!tag_map) |
96 | return; | 386 | return; |
97 | 387 | ||
98 | percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); | 388 | bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); |
99 | if (tags->nr_reserved_tags) | 389 | if (tags->nr_reserved_tags) |
100 | percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, | 390 | bt_for_each_free(&tags->breserved_tags, tag_map, 0); |
101 | tag_map); | ||
102 | 391 | ||
103 | fn(data, tag_map); | 392 | fn(data, tag_map); |
104 | kfree(tag_map); | 393 | kfree(tag_map); |
105 | } | 394 | } |
395 | EXPORT_SYMBOL(blk_mq_tag_busy_iter); | ||
396 | |||
397 | static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) | ||
398 | { | ||
399 | unsigned int i, used; | ||
400 | |||
401 | for (i = 0, used = 0; i < bt->map_nr; i++) { | ||
402 | struct blk_align_bitmap *bm = &bt->map[i]; | ||
403 | |||
404 | used += bitmap_weight(&bm->word, bm->depth); | ||
405 | } | ||
406 | |||
407 | return bt->depth - used; | ||
408 | } | ||
409 | |||
410 | static void bt_update_count(struct blk_mq_bitmap_tags *bt, | ||
411 | unsigned int depth) | ||
412 | { | ||
413 | unsigned int tags_per_word = 1U << bt->bits_per_word; | ||
414 | unsigned int map_depth = depth; | ||
415 | |||
416 | if (depth) { | ||
417 | int i; | ||
418 | |||
419 | for (i = 0; i < bt->map_nr; i++) { | ||
420 | bt->map[i].depth = min(map_depth, tags_per_word); | ||
421 | map_depth -= bt->map[i].depth; | ||
422 | } | ||
423 | } | ||
424 | |||
425 | bt->wake_cnt = BT_WAIT_BATCH; | ||
426 | if (bt->wake_cnt > depth / 4) | ||
427 | bt->wake_cnt = max(1U, depth / 4); | ||
428 | |||
429 | bt->depth = depth; | ||
430 | } | ||
431 | |||
432 | static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, | ||
433 | int node, bool reserved) | ||
434 | { | ||
435 | int i; | ||
436 | |||
437 | bt->bits_per_word = ilog2(BITS_PER_LONG); | ||
438 | |||
439 | /* | ||
440 | * Depth can be zero for reserved tags, that's not a failure | ||
441 | * condition. | ||
442 | */ | ||
443 | if (depth) { | ||
444 | unsigned int nr, tags_per_word; | ||
445 | |||
446 | tags_per_word = (1 << bt->bits_per_word); | ||
447 | |||
448 | /* | ||
449 | * If the tag space is small, shrink the number of tags | ||
450 | * per word so we spread over a few cachelines, at least. | ||
451 | * If less than 4 tags, just forget about it, it's not | ||
452 | * going to work optimally anyway. | ||
453 | */ | ||
454 | if (depth >= 4) { | ||
455 | while (tags_per_word * 4 > depth) { | ||
456 | bt->bits_per_word--; | ||
457 | tags_per_word = (1 << bt->bits_per_word); | ||
458 | } | ||
459 | } | ||
460 | |||
461 | nr = ALIGN(depth, tags_per_word) / tags_per_word; | ||
462 | bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), | ||
463 | GFP_KERNEL, node); | ||
464 | if (!bt->map) | ||
465 | return -ENOMEM; | ||
466 | |||
467 | bt->map_nr = nr; | ||
468 | } | ||
469 | |||
470 | bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); | ||
471 | if (!bt->bs) { | ||
472 | kfree(bt->map); | ||
473 | return -ENOMEM; | ||
474 | } | ||
475 | |||
476 | for (i = 0; i < BT_WAIT_QUEUES; i++) | ||
477 | init_waitqueue_head(&bt->bs[i].wait); | ||
478 | |||
479 | bt_update_count(bt, depth); | ||
480 | return 0; | ||
481 | } | ||
482 | |||
483 | static void bt_free(struct blk_mq_bitmap_tags *bt) | ||
484 | { | ||
485 | kfree(bt->map); | ||
486 | kfree(bt->bs); | ||
487 | } | ||
488 | |||
489 | static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, | ||
490 | int node) | ||
491 | { | ||
492 | unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; | ||
493 | |||
494 | if (bt_alloc(&tags->bitmap_tags, depth, node, false)) | ||
495 | goto enomem; | ||
496 | if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) | ||
497 | goto enomem; | ||
498 | |||
499 | return tags; | ||
500 | enomem: | ||
501 | bt_free(&tags->bitmap_tags); | ||
502 | kfree(tags); | ||
503 | return NULL; | ||
504 | } | ||
106 | 505 | ||
107 | struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, | 506 | struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, |
108 | unsigned int reserved_tags, int node) | 507 | unsigned int reserved_tags, int node) |
109 | { | 508 | { |
110 | unsigned int nr_tags, nr_cache; | ||
111 | struct blk_mq_tags *tags; | 509 | struct blk_mq_tags *tags; |
112 | int ret; | ||
113 | 510 | ||
114 | if (total_tags > BLK_MQ_TAG_MAX) { | 511 | if (total_tags > BLK_MQ_TAG_MAX) { |
115 | pr_err("blk-mq: tag depth too large\n"); | 512 | pr_err("blk-mq: tag depth too large\n"); |
@@ -120,73 +517,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, | |||
120 | if (!tags) | 517 | if (!tags) |
121 | return NULL; | 518 | return NULL; |
122 | 519 | ||
123 | nr_tags = total_tags - reserved_tags; | ||
124 | nr_cache = nr_tags / num_possible_cpus(); | ||
125 | |||
126 | if (nr_cache < BLK_MQ_TAG_CACHE_MIN) | ||
127 | nr_cache = BLK_MQ_TAG_CACHE_MIN; | ||
128 | else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) | ||
129 | nr_cache = BLK_MQ_TAG_CACHE_MAX; | ||
130 | |||
131 | tags->nr_tags = total_tags; | 520 | tags->nr_tags = total_tags; |
132 | tags->nr_reserved_tags = reserved_tags; | 521 | tags->nr_reserved_tags = reserved_tags; |
133 | tags->nr_max_cache = nr_cache; | ||
134 | tags->nr_batch_move = max(1u, nr_cache / 2); | ||
135 | 522 | ||
136 | ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - | 523 | return blk_mq_init_bitmap_tags(tags, node); |
137 | tags->nr_reserved_tags, | 524 | } |
138 | tags->nr_max_cache, | ||
139 | tags->nr_batch_move); | ||
140 | if (ret) | ||
141 | goto err_free_tags; | ||
142 | 525 | ||
143 | if (reserved_tags) { | 526 | void blk_mq_free_tags(struct blk_mq_tags *tags) |
144 | /* | 527 | { |
145 | * With max_cahe and batch set to 1, the allocator fallbacks to | 528 | bt_free(&tags->bitmap_tags); |
146 | * no cached. It's fine reserved tags allocation is slow. | 529 | bt_free(&tags->breserved_tags); |
147 | */ | 530 | kfree(tags); |
148 | ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, | 531 | } |
149 | 1, 1); | ||
150 | if (ret) | ||
151 | goto err_reserved_tags; | ||
152 | } | ||
153 | 532 | ||
154 | return tags; | 533 | void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) |
534 | { | ||
535 | unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; | ||
155 | 536 | ||
156 | err_reserved_tags: | 537 | *tag = prandom_u32() % depth; |
157 | percpu_ida_destroy(&tags->free_tags); | ||
158 | err_free_tags: | ||
159 | kfree(tags); | ||
160 | return NULL; | ||
161 | } | 538 | } |
162 | 539 | ||
163 | void blk_mq_free_tags(struct blk_mq_tags *tags) | 540 | int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) |
164 | { | 541 | { |
165 | percpu_ida_destroy(&tags->free_tags); | 542 | tdepth -= tags->nr_reserved_tags; |
166 | percpu_ida_destroy(&tags->reserved_tags); | 543 | if (tdepth > tags->nr_tags) |
167 | kfree(tags); | 544 | return -EINVAL; |
545 | |||
546 | /* | ||
547 | * Don't need (or can't) update reserved tags here, they remain | ||
548 | * static and should never need resizing. | ||
549 | */ | ||
550 | bt_update_count(&tags->bitmap_tags, tdepth); | ||
551 | blk_mq_tag_wakeup_all(tags); | ||
552 | return 0; | ||
168 | } | 553 | } |
169 | 554 | ||
170 | ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) | 555 | ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) |
171 | { | 556 | { |
172 | char *orig_page = page; | 557 | char *orig_page = page; |
173 | unsigned int cpu; | 558 | unsigned int free, res; |
174 | 559 | ||
175 | if (!tags) | 560 | if (!tags) |
176 | return 0; | 561 | return 0; |
177 | 562 | ||
178 | page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," | 563 | page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " |
179 | " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, | 564 | "bits_per_word=%u\n", |
180 | tags->nr_batch_move, tags->nr_max_cache); | 565 | tags->nr_tags, tags->nr_reserved_tags, |
566 | tags->bitmap_tags.bits_per_word); | ||
181 | 567 | ||
182 | page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", | 568 | free = bt_unused_tags(&tags->bitmap_tags); |
183 | percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), | 569 | res = bt_unused_tags(&tags->breserved_tags); |
184 | percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); | ||
185 | 570 | ||
186 | for_each_possible_cpu(cpu) { | 571 | page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); |
187 | page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, | 572 | page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); |
188 | percpu_ida_free_tags(&tags->free_tags, cpu)); | ||
189 | } | ||
190 | 573 | ||
191 | return page - orig_page; | 574 | return page - orig_page; |
192 | } | 575 | } |
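
hctx_may_queue() in the blk-mq-tag.c hunk above caps each hardware queue at a share of the tag space once BLK_MQ_F_TAG_SHARED is set. A worked example of that arithmetic with made-up numbers, not drawn from any particular driver:

```c
/*
 * Illustrative numbers only: bt->depth = 128 tags shared by
 * users = atomic_read(&tags->active_queues) = 3 active queues.
 *
 *   depth = max((128 + 3 - 1) / 3, 4U) = max(43, 4) = 43
 *
 * A queue whose hctx->nr_active has reached 43 is refused further tags
 * until some of its requests complete; with a single active queue the
 * earlier returns in hctx_may_queue() let it use all 128 tags.
 */
```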
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index b602e3fa66ea..c959de58d2a5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,7 +1,32 @@ | |||
1 | #ifndef INT_BLK_MQ_TAG_H | 1 | #ifndef INT_BLK_MQ_TAG_H |
2 | #define INT_BLK_MQ_TAG_H | 2 | #define INT_BLK_MQ_TAG_H |
3 | 3 | ||
4 | #include <linux/percpu_ida.h> | 4 | #include "blk-mq.h" |
5 | |||
6 | enum { | ||
7 | BT_WAIT_QUEUES = 8, | ||
8 | BT_WAIT_BATCH = 8, | ||
9 | }; | ||
10 | |||
11 | struct bt_wait_state { | ||
12 | atomic_t wait_cnt; | ||
13 | wait_queue_head_t wait; | ||
14 | } ____cacheline_aligned_in_smp; | ||
15 | |||
16 | #define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) | ||
17 | #define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) | ||
18 | |||
19 | struct blk_mq_bitmap_tags { | ||
20 | unsigned int depth; | ||
21 | unsigned int wake_cnt; | ||
22 | unsigned int bits_per_word; | ||
23 | |||
24 | unsigned int map_nr; | ||
25 | struct blk_align_bitmap *map; | ||
26 | |||
27 | unsigned int wake_index; | ||
28 | struct bt_wait_state *bs; | ||
29 | }; | ||
5 | 30 | ||
6 | /* | 31 | /* |
7 | * Tag address space map. | 32 | * Tag address space map. |
@@ -9,11 +34,11 @@ | |||
9 | struct blk_mq_tags { | 34 | struct blk_mq_tags { |
10 | unsigned int nr_tags; | 35 | unsigned int nr_tags; |
11 | unsigned int nr_reserved_tags; | 36 | unsigned int nr_reserved_tags; |
12 | unsigned int nr_batch_move; | ||
13 | unsigned int nr_max_cache; | ||
14 | 37 | ||
15 | struct percpu_ida free_tags; | 38 | atomic_t active_queues; |
16 | struct percpu_ida reserved_tags; | 39 | |
40 | struct blk_mq_bitmap_tags bitmap_tags; | ||
41 | struct blk_mq_bitmap_tags breserved_tags; | ||
17 | 42 | ||
18 | struct request **rqs; | 43 | struct request **rqs; |
19 | struct list_head page_list; | 44 | struct list_head page_list; |
@@ -23,12 +48,12 @@ struct blk_mq_tags { | |||
23 | extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); | 48 | extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); |
24 | extern void blk_mq_free_tags(struct blk_mq_tags *tags); | 49 | extern void blk_mq_free_tags(struct blk_mq_tags *tags); |
25 | 50 | ||
26 | extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); | 51 | extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); |
27 | extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); | 52 | extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); |
28 | extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); | ||
29 | extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); | ||
30 | extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); | 53 | extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); |
31 | extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); | 54 | extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); |
55 | extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); | ||
56 | extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); | ||
32 | 57 | ||
33 | enum { | 58 | enum { |
34 | BLK_MQ_TAG_CACHE_MIN = 1, | 59 | BLK_MQ_TAG_CACHE_MIN = 1, |
@@ -41,4 +66,23 @@ enum { | |||
41 | BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, | 66 | BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, |
42 | }; | 67 | }; |
43 | 68 | ||
69 | extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); | ||
70 | extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); | ||
71 | |||
72 | static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) | ||
73 | { | ||
74 | if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) | ||
75 | return false; | ||
76 | |||
77 | return __blk_mq_tag_busy(hctx); | ||
78 | } | ||
79 | |||
80 | static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) | ||
81 | { | ||
82 | if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) | ||
83 | return; | ||
84 | |||
85 | __blk_mq_tag_idle(hctx); | ||
86 | } | ||
87 | |||
44 | #endif | 88 | #endif |
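
TAG_TO_INDEX() and TAG_TO_BIT() in the header above split a tag number into a word index and a bit offset within blk_mq_bitmap_tags. A quick worked example with assumed sizes; the values are illustrative, not taken from the patch:

```c
/*
 * Assume bits_per_word = 6, i.e. 64 tags per word (ilog2(BITS_PER_LONG)
 * on a 64-bit build, before bt_alloc() shrinks it for small depths):
 *
 *   tag = 70
 *   TAG_TO_INDEX(bt, 70) = 70 >> 6        = 1  (second blk_align_bitmap word)
 *   TAG_TO_BIT(bt, 70)   = 70 & (64 - 1)  = 6  (bit 6 inside that word)
 *
 * For a small depth such as 32 tags, bt_alloc() lowers bits_per_word until
 * tags_per_word * 4 <= depth, ending at bits_per_word = 3 (8 tags per word)
 * and map_nr = 32 / 8 = 4, so the tags spread across four cachelines.
 */
```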
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ee225cc312b8..ae14749b530c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -56,39 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) | |||
56 | { | 56 | { |
57 | unsigned int i; | 57 | unsigned int i; |
58 | 58 | ||
59 | for (i = 0; i < hctx->nr_ctx_map; i++) | 59 | for (i = 0; i < hctx->ctx_map.map_size; i++) |
60 | if (hctx->ctx_map[i]) | 60 | if (hctx->ctx_map.map[i].word) |
61 | return true; | 61 | return true; |
62 | 62 | ||
63 | return false; | 63 | return false; |
64 | } | 64 | } |
65 | 65 | ||
66 | static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, | ||
67 | struct blk_mq_ctx *ctx) | ||
68 | { | ||
69 | return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; | ||
70 | } | ||
71 | |||
72 | #define CTX_TO_BIT(hctx, ctx) \ | ||
73 | ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) | ||
74 | |||
66 | /* | 75 | /* |
67 | * Mark this ctx as having pending work in this hardware queue | 76 | * Mark this ctx as having pending work in this hardware queue |
68 | */ | 77 | */ |
69 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, | 78 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, |
70 | struct blk_mq_ctx *ctx) | 79 | struct blk_mq_ctx *ctx) |
71 | { | 80 | { |
72 | if (!test_bit(ctx->index_hw, hctx->ctx_map)) | 81 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); |
73 | set_bit(ctx->index_hw, hctx->ctx_map); | 82 | |
83 | if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) | ||
84 | set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); | ||
74 | } | 85 | } |
75 | 86 | ||
76 | static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, | 87 | static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, |
77 | gfp_t gfp, bool reserved) | 88 | struct blk_mq_ctx *ctx) |
78 | { | 89 | { |
79 | struct request *rq; | 90 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); |
80 | unsigned int tag; | ||
81 | |||
82 | tag = blk_mq_get_tag(hctx->tags, gfp, reserved); | ||
83 | if (tag != BLK_MQ_TAG_FAIL) { | ||
84 | rq = hctx->tags->rqs[tag]; | ||
85 | blk_rq_init(hctx->queue, rq); | ||
86 | rq->tag = tag; | ||
87 | |||
88 | return rq; | ||
89 | } | ||
90 | 91 | ||
91 | return NULL; | 92 | clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); |
92 | } | 93 | } |
93 | 94 | ||
94 | static int blk_mq_queue_enter(struct request_queue *q) | 95 | static int blk_mq_queue_enter(struct request_queue *q) |
@@ -187,70 +188,109 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, | |||
187 | if (blk_queue_io_stat(q)) | 188 | if (blk_queue_io_stat(q)) |
188 | rw_flags |= REQ_IO_STAT; | 189 | rw_flags |= REQ_IO_STAT; |
189 | 190 | ||
191 | INIT_LIST_HEAD(&rq->queuelist); | ||
192 | /* csd/requeue_work/fifo_time is initialized before use */ | ||
193 | rq->q = q; | ||
190 | rq->mq_ctx = ctx; | 194 | rq->mq_ctx = ctx; |
191 | rq->cmd_flags = rw_flags; | 195 | rq->cmd_flags |= rw_flags; |
196 | rq->cmd_type = 0; | ||
197 | /* do not touch atomic flags, it needs atomic ops against the timer */ | ||
198 | rq->cpu = -1; | ||
199 | rq->__data_len = 0; | ||
200 | rq->__sector = (sector_t) -1; | ||
201 | rq->bio = NULL; | ||
202 | rq->biotail = NULL; | ||
203 | INIT_HLIST_NODE(&rq->hash); | ||
204 | RB_CLEAR_NODE(&rq->rb_node); | ||
205 | memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv))); | ||
206 | rq->rq_disk = NULL; | ||
207 | rq->part = NULL; | ||
192 | rq->start_time = jiffies; | 208 | rq->start_time = jiffies; |
209 | #ifdef CONFIG_BLK_CGROUP | ||
210 | rq->rl = NULL; | ||
193 | set_start_time_ns(rq); | 211 | set_start_time_ns(rq); |
212 | rq->io_start_time_ns = 0; | ||
213 | #endif | ||
214 | rq->nr_phys_segments = 0; | ||
215 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | ||
216 | rq->nr_integrity_segments = 0; | ||
217 | #endif | ||
218 | rq->ioprio = 0; | ||
219 | rq->special = NULL; | ||
220 | /* tag was already set */ | ||
221 | rq->errors = 0; | ||
222 | memset(rq->__cmd, 0, sizeof(rq->__cmd)); | ||
223 | rq->cmd = rq->__cmd; | ||
224 | rq->cmd_len = BLK_MAX_CDB; | ||
225 | |||
226 | rq->extra_len = 0; | ||
227 | rq->sense_len = 0; | ||
228 | rq->resid_len = 0; | ||
229 | rq->sense = NULL; | ||
230 | |||
231 | rq->deadline = 0; | ||
232 | INIT_LIST_HEAD(&rq->timeout_list); | ||
233 | rq->timeout = 0; | ||
234 | rq->retries = 0; | ||
235 | rq->end_io = NULL; | ||
236 | rq->end_io_data = NULL; | ||
237 | rq->next_rq = NULL; | ||
238 | |||
194 | ctx->rq_dispatched[rw_is_sync(rw_flags)]++; | 239 | ctx->rq_dispatched[rw_is_sync(rw_flags)]++; |
195 | } | 240 | } |
196 | 241 | ||
197 | static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, | 242 | static struct request * |
198 | int rw, gfp_t gfp, | 243 | __blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, |
199 | bool reserved) | 244 | struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved) |
200 | { | 245 | { |
201 | struct request *rq; | 246 | struct request *rq; |
247 | unsigned int tag; | ||
202 | 248 | ||
203 | do { | 249 | tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); |
204 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); | 250 | if (tag != BLK_MQ_TAG_FAIL) { |
205 | struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); | 251 | rq = hctx->tags->rqs[tag]; |
206 | |||
207 | rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); | ||
208 | if (rq) { | ||
209 | blk_mq_rq_ctx_init(q, ctx, rq, rw); | ||
210 | break; | ||
211 | } | ||
212 | 252 | ||
213 | if (gfp & __GFP_WAIT) { | 253 | rq->cmd_flags = 0; |
214 | __blk_mq_run_hw_queue(hctx); | 254 | if (blk_mq_tag_busy(hctx)) { |
215 | blk_mq_put_ctx(ctx); | 255 | rq->cmd_flags = REQ_MQ_INFLIGHT; |
216 | } else { | 256 | atomic_inc(&hctx->nr_active); |
217 | blk_mq_put_ctx(ctx); | ||
218 | break; | ||
219 | } | 257 | } |
220 | 258 | ||
221 | blk_mq_wait_for_tags(hctx->tags); | 259 | rq->tag = tag; |
222 | } while (1); | 260 | blk_mq_rq_ctx_init(q, ctx, rq, rw); |
261 | return rq; | ||
262 | } | ||
223 | 263 | ||
224 | return rq; | 264 | return NULL; |
225 | } | 265 | } |
226 | 266 | ||
227 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) | 267 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, |
268 | bool reserved) | ||
228 | { | 269 | { |
270 | struct blk_mq_ctx *ctx; | ||
271 | struct blk_mq_hw_ctx *hctx; | ||
229 | struct request *rq; | 272 | struct request *rq; |
230 | 273 | ||
231 | if (blk_mq_queue_enter(q)) | 274 | if (blk_mq_queue_enter(q)) |
232 | return NULL; | 275 | return NULL; |
233 | 276 | ||
234 | rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); | 277 | ctx = blk_mq_get_ctx(q); |
235 | if (rq) | 278 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
236 | blk_mq_put_ctx(rq->mq_ctx); | ||
237 | return rq; | ||
238 | } | ||
239 | |||
240 | struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, | ||
241 | gfp_t gfp) | ||
242 | { | ||
243 | struct request *rq; | ||
244 | 279 | ||
245 | if (blk_mq_queue_enter(q)) | 280 | rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, |
246 | return NULL; | 281 | reserved); |
282 | if (!rq && (gfp & __GFP_WAIT)) { | ||
283 | __blk_mq_run_hw_queue(hctx); | ||
284 | blk_mq_put_ctx(ctx); | ||
247 | 285 | ||
248 | rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); | 286 | ctx = blk_mq_get_ctx(q); |
249 | if (rq) | 287 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
250 | blk_mq_put_ctx(rq->mq_ctx); | 288 | rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved); |
289 | } | ||
290 | blk_mq_put_ctx(ctx); | ||
251 | return rq; | 291 | return rq; |
252 | } | 292 | } |
253 | EXPORT_SYMBOL(blk_mq_alloc_reserved_request); | 293 | EXPORT_SYMBOL(blk_mq_alloc_request); |
254 | 294 | ||
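The reserved-tag allocator is folded into blk_mq_alloc_request() above, which now takes an explicit reserved flag. A minimal driver-side sketch of the consolidated call, assuming a tag set registered with reserved_tags > 0 (the mydrv_* names are illustrative, not part of the patch):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Allocate a request from the reserved pool for an internal command,
 * sleeping until a tag is available; GFP_KERNEL implies __GFP_WAIT here. */
static struct request *mydrv_get_internal_rq(struct request_queue *q)
{
	return blk_mq_alloc_request(q, READ, GFP_KERNEL, true);
}

/* Hand the tag back to the reserved pool when the command is done. */
static void mydrv_put_internal_rq(struct request *rq)
{
	blk_mq_free_request(rq);
}
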
255 | static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, | 295 | static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, |
256 | struct blk_mq_ctx *ctx, struct request *rq) | 296 | struct blk_mq_ctx *ctx, struct request *rq) |
@@ -258,7 +298,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, | |||
258 | const int tag = rq->tag; | 298 | const int tag = rq->tag; |
259 | struct request_queue *q = rq->q; | 299 | struct request_queue *q = rq->q; |
260 | 300 | ||
261 | blk_mq_put_tag(hctx->tags, tag); | 301 | if (rq->cmd_flags & REQ_MQ_INFLIGHT) |
302 | atomic_dec(&hctx->nr_active); | ||
303 | |||
304 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | ||
305 | blk_mq_put_tag(hctx, tag, &ctx->last_tag); | ||
262 | blk_mq_queue_exit(q); | 306 | blk_mq_queue_exit(q); |
263 | } | 307 | } |
264 | 308 | ||
@@ -326,15 +370,19 @@ static void __blk_mq_complete_request_remote(void *data) | |||
326 | void __blk_mq_complete_request(struct request *rq) | 370 | void __blk_mq_complete_request(struct request *rq) |
327 | { | 371 | { |
328 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 372 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
373 | bool shared = false; | ||
329 | int cpu; | 374 | int cpu; |
330 | 375 | ||
331 | if (!ctx->ipi_redirect) { | 376 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { |
332 | rq->q->softirq_done_fn(rq); | 377 | rq->q->softirq_done_fn(rq); |
333 | return; | 378 | return; |
334 | } | 379 | } |
335 | 380 | ||
336 | cpu = get_cpu(); | 381 | cpu = get_cpu(); |
337 | if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { | 382 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) |
383 | shared = cpus_share_cache(cpu, ctx->cpu); | ||
384 | |||
385 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { | ||
338 | rq->csd.func = __blk_mq_complete_request_remote; | 386 | rq->csd.func = __blk_mq_complete_request_remote; |
339 | rq->csd.info = rq; | 387 | rq->csd.info = rq; |
340 | rq->csd.flags = 0; | 388 | rq->csd.flags = 0; |
@@ -355,10 +403,16 @@ void __blk_mq_complete_request(struct request *rq) | |||
355 | **/ | 403 | **/ |
356 | void blk_mq_complete_request(struct request *rq) | 404 | void blk_mq_complete_request(struct request *rq) |
357 | { | 405 | { |
358 | if (unlikely(blk_should_fake_timeout(rq->q))) | 406 | struct request_queue *q = rq->q; |
407 | |||
408 | if (unlikely(blk_should_fake_timeout(q))) | ||
359 | return; | 409 | return; |
360 | if (!blk_mark_rq_complete(rq)) | 410 | if (!blk_mark_rq_complete(rq)) { |
361 | __blk_mq_complete_request(rq); | 411 | if (q->softirq_done_fn) |
412 | __blk_mq_complete_request(rq); | ||
413 | else | ||
414 | blk_mq_end_io(rq, rq->errors); | ||
415 | } | ||
362 | } | 416 | } |
363 | EXPORT_SYMBOL(blk_mq_complete_request); | 417 | EXPORT_SYMBOL(blk_mq_complete_request); |
364 | 418 | ||
@@ -375,10 +429,22 @@ static void blk_mq_start_request(struct request *rq, bool last) | |||
375 | /* | 429 | /* |
376 | * Just mark start time and set the started bit. Due to memory | 430 | * Just mark start time and set the started bit. Due to memory |
377 | * ordering, we know we'll see the correct deadline as long as | 431 | * ordering, we know we'll see the correct deadline as long as |
378 | * REQ_ATOM_STARTED is seen. | 432 | * REQ_ATOM_STARTED is seen. Use the default queue timeout,
433 | * unless one has been set in the request. | ||
434 | */ | ||
435 | if (!rq->timeout) | ||
436 | rq->deadline = jiffies + q->rq_timeout; | ||
437 | else | ||
438 | rq->deadline = jiffies + rq->timeout; | ||
439 | |||
440 | /* | ||
441 | * Mark us as started and clear complete. Complete might have been | ||
442 | * set if requeue raced with timeout, which then marked it as | ||
443 | * complete. So be sure to clear complete again when we start | ||
444 | * the request, otherwise we'll ignore the completion event. | ||
379 | */ | 445 | */ |
380 | rq->deadline = jiffies + q->rq_timeout; | ||
381 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 446 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
447 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | ||
382 | 448 | ||
383 | if (q->dma_drain_size && blk_rq_bytes(rq)) { | 449 | if (q->dma_drain_size && blk_rq_bytes(rq)) { |
384 | /* | 450 | /* |
@@ -415,18 +481,72 @@ static void __blk_mq_requeue_request(struct request *rq) | |||
415 | 481 | ||
416 | void blk_mq_requeue_request(struct request *rq) | 482 | void blk_mq_requeue_request(struct request *rq) |
417 | { | 483 | { |
418 | struct request_queue *q = rq->q; | ||
419 | |||
420 | __blk_mq_requeue_request(rq); | 484 | __blk_mq_requeue_request(rq); |
421 | blk_clear_rq_complete(rq); | 485 | blk_clear_rq_complete(rq); |
422 | 486 | ||
423 | trace_block_rq_requeue(q, rq); | ||
424 | |||
425 | BUG_ON(blk_queued_rq(rq)); | 487 | BUG_ON(blk_queued_rq(rq)); |
426 | blk_mq_insert_request(rq, true, true, false); | 488 | blk_mq_add_to_requeue_list(rq, true); |
427 | } | 489 | } |
428 | EXPORT_SYMBOL(blk_mq_requeue_request); | 490 | EXPORT_SYMBOL(blk_mq_requeue_request); |
429 | 491 | ||
492 | static void blk_mq_requeue_work(struct work_struct *work) | ||
493 | { | ||
494 | struct request_queue *q = | ||
495 | container_of(work, struct request_queue, requeue_work); | ||
496 | LIST_HEAD(rq_list); | ||
497 | struct request *rq, *next; | ||
498 | unsigned long flags; | ||
499 | |||
500 | spin_lock_irqsave(&q->requeue_lock, flags); | ||
501 | list_splice_init(&q->requeue_list, &rq_list); | ||
502 | spin_unlock_irqrestore(&q->requeue_lock, flags); | ||
503 | |||
504 | list_for_each_entry_safe(rq, next, &rq_list, queuelist) { | ||
505 | if (!(rq->cmd_flags & REQ_SOFTBARRIER)) | ||
506 | continue; | ||
507 | |||
508 | rq->cmd_flags &= ~REQ_SOFTBARRIER; | ||
509 | list_del_init(&rq->queuelist); | ||
510 | blk_mq_insert_request(rq, true, false, false); | ||
511 | } | ||
512 | |||
513 | while (!list_empty(&rq_list)) { | ||
514 | rq = list_entry(rq_list.next, struct request, queuelist); | ||
515 | list_del_init(&rq->queuelist); | ||
516 | blk_mq_insert_request(rq, false, false, false); | ||
517 | } | ||
518 | |||
519 | blk_mq_run_queues(q, false); | ||
520 | } | ||
521 | |||
522 | void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) | ||
523 | { | ||
524 | struct request_queue *q = rq->q; | ||
525 | unsigned long flags; | ||
526 | |||
527 | /* | ||
528 | * We abuse this flag that is otherwise used by the I/O scheduler to | ||
529 | * request head insertion from the workqueue. | ||
530 | */ | ||
531 | BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); | ||
532 | |||
533 | spin_lock_irqsave(&q->requeue_lock, flags); | ||
534 | if (at_head) { | ||
535 | rq->cmd_flags |= REQ_SOFTBARRIER; | ||
536 | list_add(&rq->queuelist, &q->requeue_list); | ||
537 | } else { | ||
538 | list_add_tail(&rq->queuelist, &q->requeue_list); | ||
539 | } | ||
540 | spin_unlock_irqrestore(&q->requeue_lock, flags); | ||
541 | } | ||
542 | EXPORT_SYMBOL(blk_mq_add_to_requeue_list); | ||
543 | |||
544 | void blk_mq_kick_requeue_list(struct request_queue *q) | ||
545 | { | ||
546 | kblockd_schedule_work(&q->requeue_work); | ||
547 | } | ||
548 | EXPORT_SYMBOL(blk_mq_kick_requeue_list); | ||
549 | |||
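The requeue list added above is meant to be driven from driver completion context: park the request with blk_mq_requeue_request(), then kick the work item so it is reinserted from process context. A hedged sketch of that calling pattern, where mydrv_is_transient_error() stands in for a driver-specific status check:

/* Illustrative completion handler for a blk-mq driver. */
static void mydrv_complete_rq(struct request *rq, int error)
{
	if (mydrv_is_transient_error(error)) {
		/* unprepare the request and queue it on q->requeue_list */
		blk_mq_requeue_request(rq);
		/* schedule requeue_work to reinsert and rerun the queues */
		blk_mq_kick_requeue_list(rq->q);
		return;
	}

	blk_mq_end_io(rq, error);
}
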
430 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) | 550 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) |
431 | { | 551 | { |
432 | return tags->rqs[tag]; | 552 | return tags->rqs[tag]; |
@@ -485,6 +605,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, | |||
485 | blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); | 605 | blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); |
486 | } | 606 | } |
487 | 607 | ||
608 | static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) | ||
609 | { | ||
610 | struct request_queue *q = rq->q; | ||
611 | |||
612 | /* | ||
613 | * We know that complete is set at this point. If STARTED isn't set | ||
614 | * anymore, then the request isn't active and the "timeout" should | ||
615 | * just be ignored. This can happen due to the bitflag ordering. | ||
616 | * Timeout first checks if STARTED is set, and if it is, assumes | ||
617 | * the request is active. But if we race with completion, then | ||
618 | * both flags will get cleared. So check here again, and ignore | ||
619 | * a timeout event with a request that isn't active. | ||
620 | */ | ||
621 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | ||
622 | return BLK_EH_NOT_HANDLED; | ||
623 | |||
624 | if (!q->mq_ops->timeout) | ||
625 | return BLK_EH_RESET_TIMER; | ||
626 | |||
627 | return q->mq_ops->timeout(rq); | ||
628 | } | ||
629 | |||
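blk_mq_rq_timed_out() defers to the driver's ->timeout hook when one exists and otherwise just rearms the timer. A driver handler returns one of the blk_eh_timer_return values; the sketch below assumes a hypothetical mydrv_abort_cmd() helper that tries to cancel the command in hardware:

static enum blk_eh_timer_return mydrv_timeout(struct request *rq)
{
	if (mydrv_abort_cmd(rq) == 0) {
		rq->errors = -ETIMEDOUT;
		return BLK_EH_HANDLED;		/* core completes the request */
	}

	/* still in flight: give it another rq_timeout window */
	return BLK_EH_RESET_TIMER;
}
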
488 | static void blk_mq_rq_timer(unsigned long data) | 630 | static void blk_mq_rq_timer(unsigned long data) |
489 | { | 631 | { |
490 | struct request_queue *q = (struct request_queue *) data; | 632 | struct request_queue *q = (struct request_queue *) data; |
@@ -492,11 +634,24 @@ static void blk_mq_rq_timer(unsigned long data) | |||
492 | unsigned long next = 0; | 634 | unsigned long next = 0; |
493 | int i, next_set = 0; | 635 | int i, next_set = 0; |
494 | 636 | ||
495 | queue_for_each_hw_ctx(q, hctx, i) | 637 | queue_for_each_hw_ctx(q, hctx, i) { |
638 | /* | ||
639 | * If no software queues are currently mapped to this | ||
640 | * hardware queue, there's nothing to check | ||
641 | */ | ||
642 | if (!hctx->nr_ctx || !hctx->tags) | ||
643 | continue; | ||
644 | |||
496 | blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); | 645 | blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); |
646 | } | ||
497 | 647 | ||
498 | if (next_set) | 648 | if (next_set) { |
499 | mod_timer(&q->timeout, round_jiffies_up(next)); | 649 | next = blk_rq_timeout(round_jiffies_up(next)); |
650 | mod_timer(&q->timeout, next); | ||
651 | } else { | ||
652 | queue_for_each_hw_ctx(q, hctx, i) | ||
653 | blk_mq_tag_idle(hctx); | ||
654 | } | ||
500 | } | 655 | } |
501 | 656 | ||
502 | /* | 657 | /* |
@@ -538,9 +693,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q, | |||
538 | return false; | 693 | return false; |
539 | } | 694 | } |
540 | 695 | ||
541 | void blk_mq_add_timer(struct request *rq) | 696 | /* |
697 | * Process software queues that have been marked busy, splicing them | ||
698 | * to the for-dispatch list | ||
699 | */ | ||
700 | static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) | ||
542 | { | 701 | { |
543 | __blk_add_timer(rq, NULL); | 702 | struct blk_mq_ctx *ctx; |
703 | int i; | ||
704 | |||
705 | for (i = 0; i < hctx->ctx_map.map_size; i++) { | ||
706 | struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; | ||
707 | unsigned int off, bit; | ||
708 | |||
709 | if (!bm->word) | ||
710 | continue; | ||
711 | |||
712 | bit = 0; | ||
713 | off = i * hctx->ctx_map.bits_per_word; | ||
714 | do { | ||
715 | bit = find_next_bit(&bm->word, bm->depth, bit); | ||
716 | if (bit >= bm->depth) | ||
717 | break; | ||
718 | |||
719 | ctx = hctx->ctxs[bit + off]; | ||
720 | clear_bit(bit, &bm->word); | ||
721 | spin_lock(&ctx->lock); | ||
722 | list_splice_tail_init(&ctx->rq_list, list); | ||
723 | spin_unlock(&ctx->lock); | ||
724 | |||
725 | bit++; | ||
726 | } while (1); | ||
727 | } | ||
544 | } | 728 | } |
545 | 729 | ||
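flush_busy_ctxs() replaces the old flat ctx_map scan with a walk over per-word blk_align_bitmap entries, clearing each bit as the matching software queue is spliced. The standalone toy below mirrors that walk in userspace (using __builtin_ctzl() in place of find_next_bit()); it only illustrates the indexing and is not kernel code:

#include <stdio.h>

/* Toy model of one ctx_map word: covers bits_per_word software queues. */
struct toy_bitmap {
	unsigned long word;
	unsigned long depth;
};

static void toy_flush(struct toy_bitmap *map, int nr_words, int bits_per_word)
{
	for (int i = 0; i < nr_words; i++) {
		while (map[i].word) {
			int bit = __builtin_ctzl(map[i].word);	/* lowest set bit */
			int ctx = i * bits_per_word + bit;

			map[i].word &= ~(1UL << bit);		/* clear_bit() */
			printf("would splice ctx %d\n", ctx);
		}
	}
}

int main(void)
{
	struct toy_bitmap map[2] = { { 0x5, 8 }, { 0x81, 8 } };

	toy_flush(map, 2, 8);	/* prints ctx 0, 2, 8, 15 */
	return 0;
}
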
546 | /* | 730 | /* |
@@ -552,10 +736,9 @@ void blk_mq_add_timer(struct request *rq) | |||
552 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | 736 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) |
553 | { | 737 | { |
554 | struct request_queue *q = hctx->queue; | 738 | struct request_queue *q = hctx->queue; |
555 | struct blk_mq_ctx *ctx; | ||
556 | struct request *rq; | 739 | struct request *rq; |
557 | LIST_HEAD(rq_list); | 740 | LIST_HEAD(rq_list); |
558 | int bit, queued; | 741 | int queued; |
559 | 742 | ||
560 | WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); | 743 | WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); |
561 | 744 | ||
@@ -567,15 +750,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
567 | /* | 750 | /* |
568 | * Touch any software queue that has pending entries. | 751 | * Touch any software queue that has pending entries. |
569 | */ | 752 | */ |
570 | for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { | 753 | flush_busy_ctxs(hctx, &rq_list); |
571 | clear_bit(bit, hctx->ctx_map); | ||
572 | ctx = hctx->ctxs[bit]; | ||
573 | BUG_ON(bit != ctx->index_hw); | ||
574 | |||
575 | spin_lock(&ctx->lock); | ||
576 | list_splice_tail_init(&ctx->rq_list, &rq_list); | ||
577 | spin_unlock(&ctx->lock); | ||
578 | } | ||
579 | 754 | ||
580 | /* | 755 | /* |
581 | * If we have previous entries on our dispatch list, grab them | 756 | * If we have previous entries on our dispatch list, grab them |
@@ -589,13 +764,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
589 | } | 764 | } |
590 | 765 | ||
591 | /* | 766 | /* |
592 | * Delete and return all entries from our dispatch list | ||
593 | */ | ||
594 | queued = 0; | ||
595 | |||
596 | /* | ||
597 | * Now process all the entries, sending them to the driver. | 767 | * Now process all the entries, sending them to the driver. |
598 | */ | 768 | */ |
769 | queued = 0; | ||
599 | while (!list_empty(&rq_list)) { | 770 | while (!list_empty(&rq_list)) { |
600 | int ret; | 771 | int ret; |
601 | 772 | ||
@@ -610,11 +781,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
610 | queued++; | 781 | queued++; |
611 | continue; | 782 | continue; |
612 | case BLK_MQ_RQ_QUEUE_BUSY: | 783 | case BLK_MQ_RQ_QUEUE_BUSY: |
613 | /* | ||
614 | * FIXME: we should have a mechanism to stop the queue | ||
615 | * like blk_stop_queue, otherwise we will waste cpu | ||
616 | * time | ||
617 | */ | ||
618 | list_add(&rq->queuelist, &rq_list); | 784 | list_add(&rq->queuelist, &rq_list); |
619 | __blk_mq_requeue_request(rq); | 785 | __blk_mq_requeue_request(rq); |
620 | break; | 786 | break; |
@@ -646,6 +812,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
646 | } | 812 | } |
647 | } | 813 | } |
648 | 814 | ||
815 | /* | ||
816 | * It'd be great if the workqueue API had a way to pass | ||
817 | * in a mask and had some smarts for more clever placement. | ||
818 | * For now we just round-robin here, switching after every | ||
819 | * BLK_MQ_CPU_WORK_BATCH queued items. | ||
820 | */ | ||
821 | static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) | ||
822 | { | ||
823 | int cpu = hctx->next_cpu; | ||
824 | |||
825 | if (--hctx->next_cpu_batch <= 0) { | ||
826 | int next_cpu; | ||
827 | |||
828 | next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); | ||
829 | if (next_cpu >= nr_cpu_ids) | ||
830 | next_cpu = cpumask_first(hctx->cpumask); | ||
831 | |||
832 | hctx->next_cpu = next_cpu; | ||
833 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | ||
834 | } | ||
835 | |||
836 | return cpu; | ||
837 | } | ||
838 | |||
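blk_mq_hctx_next_cpu() hands back the same CPU until BLK_MQ_CPU_WORK_BATCH items have been queued, then advances round-robin through the hctx cpumask. The runnable toy below isolates that counter pattern, modelling the mask as a plain array (the batch size of 8 is an assumption for the demo):

#include <stdio.h>

#define WORK_BATCH 8	/* stands in for BLK_MQ_CPU_WORK_BATCH */

static int next_idx;
static int next_batch = WORK_BATCH;

/* Return the CPU to use now; advance only after WORK_BATCH calls. */
static int toy_next_cpu(const int *cpus, int nr_cpus)
{
	int cpu = cpus[next_idx];

	if (--next_batch <= 0) {
		next_idx = (next_idx + 1) % nr_cpus;
		next_batch = WORK_BATCH;
	}
	return cpu;
}

int main(void)
{
	int cpus[] = { 0, 2, 4 };

	for (int i = 0; i < 20; i++)
		printf("%d ", toy_next_cpu(cpus, 3));
	printf("\n");	/* eight 0s, eight 2s, four 4s */
	return 0;
}
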
649 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | 839 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) |
650 | { | 840 | { |
651 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) | 841 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) |
@@ -658,13 +848,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | |||
658 | else { | 848 | else { |
659 | unsigned int cpu; | 849 | unsigned int cpu; |
660 | 850 | ||
661 | /* | 851 | cpu = blk_mq_hctx_next_cpu(hctx); |
662 | * It'd be great if the workqueue API had a way to pass | ||
663 | * in a mask and had some smarts for more clever placement | ||
664 | * than the first CPU. Or we could round-robin here. For now, | ||
665 | * just queue on the first CPU. | ||
666 | */ | ||
667 | cpu = cpumask_first(hctx->cpumask); | ||
668 | kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); | 852 | kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); |
669 | } | 853 | } |
670 | } | 854 | } |
@@ -771,13 +955,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | |||
771 | else { | 955 | else { |
772 | unsigned int cpu; | 956 | unsigned int cpu; |
773 | 957 | ||
774 | /* | 958 | cpu = blk_mq_hctx_next_cpu(hctx); |
775 | * It'd be great if the workqueue API had a way to pass | ||
776 | * in a mask and had some smarts for more clever placement | ||
777 | * than the first CPU. Or we could round-robin here. For now, | ||
778 | * just queue on the first CPU. | ||
779 | */ | ||
780 | cpu = cpumask_first(hctx->cpumask); | ||
781 | kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); | 959 | kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); |
782 | } | 960 | } |
783 | } | 961 | } |
@@ -794,12 +972,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, | |||
794 | list_add(&rq->queuelist, &ctx->rq_list); | 972 | list_add(&rq->queuelist, &ctx->rq_list); |
795 | else | 973 | else |
796 | list_add_tail(&rq->queuelist, &ctx->rq_list); | 974 | list_add_tail(&rq->queuelist, &ctx->rq_list); |
975 | |||
797 | blk_mq_hctx_mark_pending(hctx, ctx); | 976 | blk_mq_hctx_mark_pending(hctx, ctx); |
798 | 977 | ||
799 | /* | 978 | /* |
800 | * We do this early, to ensure we are on the right CPU. | 979 | * We do this early, to ensure we are on the right CPU. |
801 | */ | 980 | */ |
802 | blk_mq_add_timer(rq); | 981 | blk_add_timer(rq); |
803 | } | 982 | } |
804 | 983 | ||
805 | void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, | 984 | void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, |
@@ -930,21 +1109,161 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | |||
930 | blk_account_io_start(rq, 1); | 1109 | blk_account_io_start(rq, 1); |
931 | } | 1110 | } |
932 | 1111 | ||
933 | static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | 1112 | static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, |
1113 | struct blk_mq_ctx *ctx, | ||
1114 | struct request *rq, struct bio *bio) | ||
1115 | { | ||
1116 | struct request_queue *q = hctx->queue; | ||
1117 | |||
1118 | if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { | ||
1119 | blk_mq_bio_to_request(rq, bio); | ||
1120 | spin_lock(&ctx->lock); | ||
1121 | insert_rq: | ||
1122 | __blk_mq_insert_request(hctx, rq, false); | ||
1123 | spin_unlock(&ctx->lock); | ||
1124 | return false; | ||
1125 | } else { | ||
1126 | spin_lock(&ctx->lock); | ||
1127 | if (!blk_mq_attempt_merge(q, ctx, bio)) { | ||
1128 | blk_mq_bio_to_request(rq, bio); | ||
1129 | goto insert_rq; | ||
1130 | } | ||
1131 | |||
1132 | spin_unlock(&ctx->lock); | ||
1133 | __blk_mq_free_request(hctx, ctx, rq); | ||
1134 | return true; | ||
1135 | } | ||
1136 | } | ||
1137 | |||
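Whether blk_mq_merge_queue_io() attempts bio merging at all is governed by BLK_MQ_F_SHOULD_MERGE in the hardware context flags, which a driver opts into when filling out its tag set. A hedged example of such a tag set; the ops table, depths and command size are placeholders:

static struct blk_mq_tag_set mydrv_tag_set = {
	.ops		= &mydrv_mq_ops,		/* assumed driver ops table */
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.numa_node	= NUMA_NO_NODE,
	.cmd_size	= sizeof(struct mydrv_cmd),	/* per-request driver data */
	.flags		= BLK_MQ_F_SHOULD_MERGE,
};
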
1138 | struct blk_map_ctx { | ||
1139 | struct blk_mq_hw_ctx *hctx; | ||
1140 | struct blk_mq_ctx *ctx; | ||
1141 | }; | ||
1142 | |||
1143 | static struct request *blk_mq_map_request(struct request_queue *q, | ||
1144 | struct bio *bio, | ||
1145 | struct blk_map_ctx *data) | ||
934 | { | 1146 | { |
935 | struct blk_mq_hw_ctx *hctx; | 1147 | struct blk_mq_hw_ctx *hctx; |
936 | struct blk_mq_ctx *ctx; | 1148 | struct blk_mq_ctx *ctx; |
1149 | struct request *rq; | ||
1150 | int rw = bio_data_dir(bio); | ||
1151 | |||
1152 | if (unlikely(blk_mq_queue_enter(q))) { | ||
1153 | bio_endio(bio, -EIO); | ||
1154 | return NULL; | ||
1155 | } | ||
1156 | |||
1157 | ctx = blk_mq_get_ctx(q); | ||
1158 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
1159 | |||
1160 | if (rw_is_sync(bio->bi_rw)) | ||
1161 | rw |= REQ_SYNC; | ||
1162 | |||
1163 | trace_block_getrq(q, bio, rw); | ||
1164 | rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); | ||
1165 | if (unlikely(!rq)) { | ||
1166 | __blk_mq_run_hw_queue(hctx); | ||
1167 | blk_mq_put_ctx(ctx); | ||
1168 | trace_block_sleeprq(q, bio, rw); | ||
1169 | |||
1170 | ctx = blk_mq_get_ctx(q); | ||
1171 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
1172 | rq = __blk_mq_alloc_request(q, hctx, ctx, rw, | ||
1173 | __GFP_WAIT|GFP_ATOMIC, false); | ||
1174 | } | ||
1175 | |||
1176 | hctx->queued++; | ||
1177 | data->hctx = hctx; | ||
1178 | data->ctx = ctx; | ||
1179 | return rq; | ||
1180 | } | ||
1181 | |||
1182 | /* | ||
1183 | * Multiple hardware queue variant. This will not use per-process plugs, | ||
1184 | * but will attempt to bypass the hctx queueing if we can go straight to | ||
1185 | * hardware for SYNC IO. | ||
1186 | */ | ||
1187 | static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | ||
1188 | { | ||
937 | const int is_sync = rw_is_sync(bio->bi_rw); | 1189 | const int is_sync = rw_is_sync(bio->bi_rw); |
938 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); | 1190 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); |
939 | int rw = bio_data_dir(bio); | 1191 | struct blk_map_ctx data; |
940 | struct request *rq; | 1192 | struct request *rq; |
1193 | |||
1194 | blk_queue_bounce(q, &bio); | ||
1195 | |||
1196 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { | ||
1197 | bio_endio(bio, -EIO); | ||
1198 | return; | ||
1199 | } | ||
1200 | |||
1201 | rq = blk_mq_map_request(q, bio, &data); | ||
1202 | if (unlikely(!rq)) | ||
1203 | return; | ||
1204 | |||
1205 | if (unlikely(is_flush_fua)) { | ||
1206 | blk_mq_bio_to_request(rq, bio); | ||
1207 | blk_insert_flush(rq); | ||
1208 | goto run_queue; | ||
1209 | } | ||
1210 | |||
1211 | if (is_sync) { | ||
1212 | int ret; | ||
1213 | |||
1214 | blk_mq_bio_to_request(rq, bio); | ||
1215 | blk_mq_start_request(rq, true); | ||
1216 | |||
1217 | /* | ||
1218 | * For OK queue, we are done. For error, kill it. Any other | ||
1219 | * error (busy), just add it to our list as we previously | ||
1220 | * would have done | ||
1221 | */ | ||
1222 | ret = q->mq_ops->queue_rq(data.hctx, rq); | ||
1223 | if (ret == BLK_MQ_RQ_QUEUE_OK) | ||
1224 | goto done; | ||
1225 | else { | ||
1226 | __blk_mq_requeue_request(rq); | ||
1227 | |||
1228 | if (ret == BLK_MQ_RQ_QUEUE_ERROR) { | ||
1229 | rq->errors = -EIO; | ||
1230 | blk_mq_end_io(rq, rq->errors); | ||
1231 | goto done; | ||
1232 | } | ||
1233 | } | ||
1234 | } | ||
1235 | |||
1236 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | ||
1237 | /* | ||
1238 | * For a SYNC request, send it to the hardware immediately. For | ||
1239 | * an ASYNC request, just ensure that we run it later on. The | ||
1240 | * latter allows for merging opportunities and more efficient | ||
1241 | * dispatching. | ||
1242 | */ | ||
1243 | run_queue: | ||
1244 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | ||
1245 | } | ||
1246 | done: | ||
1247 | blk_mq_put_ctx(data.ctx); | ||
1248 | } | ||
1249 | |||
1250 | /* | ||
1251 | * Single hardware queue variant. This will attempt to use any per-process | ||
1252 | * plug for merging and IO deferral. | ||
1253 | */ | ||
1254 | static void blk_sq_make_request(struct request_queue *q, struct bio *bio) | ||
1255 | { | ||
1256 | const int is_sync = rw_is_sync(bio->bi_rw); | ||
1257 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); | ||
941 | unsigned int use_plug, request_count = 0; | 1258 | unsigned int use_plug, request_count = 0; |
1259 | struct blk_map_ctx data; | ||
1260 | struct request *rq; | ||
942 | 1261 | ||
943 | /* | 1262 | /* |
944 | * If we have multiple hardware queues, just go directly to | 1263 | * If we have multiple hardware queues, just go directly to |
945 | * one of those for sync IO. | 1264 | * one of those for sync IO. |
946 | */ | 1265 | */ |
947 | use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); | 1266 | use_plug = !is_flush_fua && !is_sync; |
948 | 1267 | ||
949 | blk_queue_bounce(q, &bio); | 1268 | blk_queue_bounce(q, &bio); |
950 | 1269 | ||
@@ -953,33 +1272,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
953 | return; | 1272 | return; |
954 | } | 1273 | } |
955 | 1274 | ||
956 | if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) | 1275 | if (use_plug && !blk_queue_nomerges(q) && |
1276 | blk_attempt_plug_merge(q, bio, &request_count)) | ||
957 | return; | 1277 | return; |
958 | 1278 | ||
959 | if (blk_mq_queue_enter(q)) { | 1279 | rq = blk_mq_map_request(q, bio, &data); |
960 | bio_endio(bio, -EIO); | ||
961 | return; | ||
962 | } | ||
963 | |||
964 | ctx = blk_mq_get_ctx(q); | ||
965 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
966 | |||
967 | if (is_sync) | ||
968 | rw |= REQ_SYNC; | ||
969 | trace_block_getrq(q, bio, rw); | ||
970 | rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); | ||
971 | if (likely(rq)) | ||
972 | blk_mq_rq_ctx_init(q, ctx, rq, rw); | ||
973 | else { | ||
974 | blk_mq_put_ctx(ctx); | ||
975 | trace_block_sleeprq(q, bio, rw); | ||
976 | rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, | ||
977 | false); | ||
978 | ctx = rq->mq_ctx; | ||
979 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
980 | } | ||
981 | |||
982 | hctx->queued++; | ||
983 | 1280 | ||
984 | if (unlikely(is_flush_fua)) { | 1281 | if (unlikely(is_flush_fua)) { |
985 | blk_mq_bio_to_request(rq, bio); | 1282 | blk_mq_bio_to_request(rq, bio); |
@@ -1004,31 +1301,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1004 | trace_block_plug(q); | 1301 | trace_block_plug(q); |
1005 | } | 1302 | } |
1006 | list_add_tail(&rq->queuelist, &plug->mq_list); | 1303 | list_add_tail(&rq->queuelist, &plug->mq_list); |
1007 | blk_mq_put_ctx(ctx); | 1304 | blk_mq_put_ctx(data.ctx); |
1008 | return; | 1305 | return; |
1009 | } | 1306 | } |
1010 | } | 1307 | } |
1011 | 1308 | ||
1012 | spin_lock(&ctx->lock); | 1309 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { |
1013 | 1310 | /* | |
1014 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | 1311 | * For a SYNC request, send it to the hardware immediately. For |
1015 | blk_mq_attempt_merge(q, ctx, bio)) | 1312 | * an ASYNC request, just ensure that we run it later on. The |
1016 | __blk_mq_free_request(hctx, ctx, rq); | 1313 | * latter allows for merging opportunities and more efficient |
1017 | else { | 1314 | * dispatching. |
1018 | blk_mq_bio_to_request(rq, bio); | 1315 | */ |
1019 | __blk_mq_insert_request(hctx, rq, false); | 1316 | run_queue: |
1317 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | ||
1020 | } | 1318 | } |
1021 | 1319 | ||
1022 | spin_unlock(&ctx->lock); | 1320 | blk_mq_put_ctx(data.ctx); |
1023 | |||
1024 | /* | ||
1025 | * For a SYNC request, send it to the hardware immediately. For an | ||
1026 | * ASYNC request, just ensure that we run it later on. The latter | ||
1027 | * allows for merging opportunities and more efficient dispatching. | ||
1028 | */ | ||
1029 | run_queue: | ||
1030 | blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); | ||
1031 | blk_mq_put_ctx(ctx); | ||
1032 | } | 1321 | } |
1033 | 1322 | ||
1034 | /* | 1323 | /* |
@@ -1041,10 +1330,10 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) | |||
1041 | EXPORT_SYMBOL(blk_mq_map_queue); | 1330 | EXPORT_SYMBOL(blk_mq_map_queue); |
1042 | 1331 | ||
1043 | struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set, | 1332 | struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set, |
1044 | unsigned int hctx_index) | 1333 | unsigned int hctx_index, |
1334 | int node) | ||
1045 | { | 1335 | { |
1046 | return kmalloc_node(sizeof(struct blk_mq_hw_ctx), | 1336 | return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); |
1047 | GFP_KERNEL | __GFP_ZERO, set->numa_node); | ||
1048 | } | 1337 | } |
1049 | EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); | 1338 | EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); |
1050 | 1339 | ||
@@ -1055,52 +1344,6 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, | |||
1055 | } | 1344 | } |
1056 | EXPORT_SYMBOL(blk_mq_free_single_hw_queue); | 1345 | EXPORT_SYMBOL(blk_mq_free_single_hw_queue); |
1057 | 1346 | ||
1058 | static void blk_mq_hctx_notify(void *data, unsigned long action, | ||
1059 | unsigned int cpu) | ||
1060 | { | ||
1061 | struct blk_mq_hw_ctx *hctx = data; | ||
1062 | struct request_queue *q = hctx->queue; | ||
1063 | struct blk_mq_ctx *ctx; | ||
1064 | LIST_HEAD(tmp); | ||
1065 | |||
1066 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) | ||
1067 | return; | ||
1068 | |||
1069 | /* | ||
1070 | * Move ctx entries to new CPU, if this one is going away. | ||
1071 | */ | ||
1072 | ctx = __blk_mq_get_ctx(q, cpu); | ||
1073 | |||
1074 | spin_lock(&ctx->lock); | ||
1075 | if (!list_empty(&ctx->rq_list)) { | ||
1076 | list_splice_init(&ctx->rq_list, &tmp); | ||
1077 | clear_bit(ctx->index_hw, hctx->ctx_map); | ||
1078 | } | ||
1079 | spin_unlock(&ctx->lock); | ||
1080 | |||
1081 | if (list_empty(&tmp)) | ||
1082 | return; | ||
1083 | |||
1084 | ctx = blk_mq_get_ctx(q); | ||
1085 | spin_lock(&ctx->lock); | ||
1086 | |||
1087 | while (!list_empty(&tmp)) { | ||
1088 | struct request *rq; | ||
1089 | |||
1090 | rq = list_first_entry(&tmp, struct request, queuelist); | ||
1091 | rq->mq_ctx = ctx; | ||
1092 | list_move_tail(&rq->queuelist, &ctx->rq_list); | ||
1093 | } | ||
1094 | |||
1095 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
1096 | blk_mq_hctx_mark_pending(hctx, ctx); | ||
1097 | |||
1098 | spin_unlock(&ctx->lock); | ||
1099 | |||
1100 | blk_mq_run_hw_queue(hctx, true); | ||
1101 | blk_mq_put_ctx(ctx); | ||
1102 | } | ||
1103 | |||
1104 | static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, | 1347 | static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, |
1105 | struct blk_mq_tags *tags, unsigned int hctx_idx) | 1348 | struct blk_mq_tags *tags, unsigned int hctx_idx) |
1106 | { | 1349 | { |
@@ -1130,12 +1373,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, | |||
1130 | 1373 | ||
1131 | static size_t order_to_size(unsigned int order) | 1374 | static size_t order_to_size(unsigned int order) |
1132 | { | 1375 | { |
1133 | size_t ret = PAGE_SIZE; | 1376 | return (size_t)PAGE_SIZE << order; |
1134 | |||
1135 | while (order--) | ||
1136 | ret *= 2; | ||
1137 | |||
1138 | return ret; | ||
1139 | } | 1377 | } |
1140 | 1378 | ||
1141 | static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, | 1379 | static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, |
@@ -1219,17 +1457,147 @@ fail: | |||
1219 | return NULL; | 1457 | return NULL; |
1220 | } | 1458 | } |
1221 | 1459 | ||
1460 | static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) | ||
1461 | { | ||
1462 | kfree(bitmap->map); | ||
1463 | } | ||
1464 | |||
1465 | static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) | ||
1466 | { | ||
1467 | unsigned int bpw = 8, total, num_maps, i; | ||
1468 | |||
1469 | bitmap->bits_per_word = bpw; | ||
1470 | |||
1471 | num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; | ||
1472 | bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), | ||
1473 | GFP_KERNEL, node); | ||
1474 | if (!bitmap->map) | ||
1475 | return -ENOMEM; | ||
1476 | |||
1477 | bitmap->map_size = num_maps; | ||
1478 | |||
1479 | total = nr_cpu_ids; | ||
1480 | for (i = 0; i < num_maps; i++) { | ||
1481 | bitmap->map[i].depth = min(total, bitmap->bits_per_word); | ||
1482 | total -= bitmap->map[i].depth; | ||
1483 | } | ||
1484 | |||
1485 | return 0; | ||
1486 | } | ||
1487 | |||
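blk_mq_alloc_bitmap() spreads nr_cpu_ids across words of bits_per_word (8) bits, so only the last word may be shallower. With 10 possible CPUs, for example, the map gets depths 8 and 2; the small standalone program below just replays that split:

#include <stdio.h>

int main(void)
{
	unsigned int nr_cpu_ids = 10, bpw = 8;
	unsigned int num_maps = (nr_cpu_ids + bpw - 1) / bpw;	/* ALIGN()/bpw */
	unsigned int total = nr_cpu_ids;

	for (unsigned int i = 0; i < num_maps; i++) {
		unsigned int depth = total < bpw ? total : bpw;	/* min() */

		printf("word %u: depth %u\n", i, depth);
		total -= depth;
	}
	return 0;
}
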
1488 | static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) | ||
1489 | { | ||
1490 | struct request_queue *q = hctx->queue; | ||
1491 | struct blk_mq_ctx *ctx; | ||
1492 | LIST_HEAD(tmp); | ||
1493 | |||
1494 | /* | ||
1495 | * Move ctx entries to new CPU, if this one is going away. | ||
1496 | */ | ||
1497 | ctx = __blk_mq_get_ctx(q, cpu); | ||
1498 | |||
1499 | spin_lock(&ctx->lock); | ||
1500 | if (!list_empty(&ctx->rq_list)) { | ||
1501 | list_splice_init(&ctx->rq_list, &tmp); | ||
1502 | blk_mq_hctx_clear_pending(hctx, ctx); | ||
1503 | } | ||
1504 | spin_unlock(&ctx->lock); | ||
1505 | |||
1506 | if (list_empty(&tmp)) | ||
1507 | return NOTIFY_OK; | ||
1508 | |||
1509 | ctx = blk_mq_get_ctx(q); | ||
1510 | spin_lock(&ctx->lock); | ||
1511 | |||
1512 | while (!list_empty(&tmp)) { | ||
1513 | struct request *rq; | ||
1514 | |||
1515 | rq = list_first_entry(&tmp, struct request, queuelist); | ||
1516 | rq->mq_ctx = ctx; | ||
1517 | list_move_tail(&rq->queuelist, &ctx->rq_list); | ||
1518 | } | ||
1519 | |||
1520 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
1521 | blk_mq_hctx_mark_pending(hctx, ctx); | ||
1522 | |||
1523 | spin_unlock(&ctx->lock); | ||
1524 | |||
1525 | blk_mq_run_hw_queue(hctx, true); | ||
1526 | blk_mq_put_ctx(ctx); | ||
1527 | return NOTIFY_OK; | ||
1528 | } | ||
1529 | |||
1530 | static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) | ||
1531 | { | ||
1532 | struct request_queue *q = hctx->queue; | ||
1533 | struct blk_mq_tag_set *set = q->tag_set; | ||
1534 | |||
1535 | if (set->tags[hctx->queue_num]) | ||
1536 | return NOTIFY_OK; | ||
1537 | |||
1538 | set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); | ||
1539 | if (!set->tags[hctx->queue_num]) | ||
1540 | return NOTIFY_STOP; | ||
1541 | |||
1542 | hctx->tags = set->tags[hctx->queue_num]; | ||
1543 | return NOTIFY_OK; | ||
1544 | } | ||
1545 | |||
1546 | static int blk_mq_hctx_notify(void *data, unsigned long action, | ||
1547 | unsigned int cpu) | ||
1548 | { | ||
1549 | struct blk_mq_hw_ctx *hctx = data; | ||
1550 | |||
1551 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) | ||
1552 | return blk_mq_hctx_cpu_offline(hctx, cpu); | ||
1553 | else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | ||
1554 | return blk_mq_hctx_cpu_online(hctx, cpu); | ||
1555 | |||
1556 | return NOTIFY_OK; | ||
1557 | } | ||
1558 | |||
1559 | static void blk_mq_exit_hw_queues(struct request_queue *q, | ||
1560 | struct blk_mq_tag_set *set, int nr_queue) | ||
1561 | { | ||
1562 | struct blk_mq_hw_ctx *hctx; | ||
1563 | unsigned int i; | ||
1564 | |||
1565 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1566 | if (i == nr_queue) | ||
1567 | break; | ||
1568 | |||
1569 | if (set->ops->exit_hctx) | ||
1570 | set->ops->exit_hctx(hctx, i); | ||
1571 | |||
1572 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | ||
1573 | kfree(hctx->ctxs); | ||
1574 | blk_mq_free_bitmap(&hctx->ctx_map); | ||
1575 | } | ||
1576 | |||
1577 | } | ||
1578 | |||
1579 | static void blk_mq_free_hw_queues(struct request_queue *q, | ||
1580 | struct blk_mq_tag_set *set) | ||
1581 | { | ||
1582 | struct blk_mq_hw_ctx *hctx; | ||
1583 | unsigned int i; | ||
1584 | |||
1585 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1586 | free_cpumask_var(hctx->cpumask); | ||
1587 | set->ops->free_hctx(hctx, i); | ||
1588 | } | ||
1589 | } | ||
1590 | |||
1222 | static int blk_mq_init_hw_queues(struct request_queue *q, | 1591 | static int blk_mq_init_hw_queues(struct request_queue *q, |
1223 | struct blk_mq_tag_set *set) | 1592 | struct blk_mq_tag_set *set) |
1224 | { | 1593 | { |
1225 | struct blk_mq_hw_ctx *hctx; | 1594 | struct blk_mq_hw_ctx *hctx; |
1226 | unsigned int i, j; | 1595 | unsigned int i; |
1227 | 1596 | ||
1228 | /* | 1597 | /* |
1229 | * Initialize hardware queues | 1598 | * Initialize hardware queues |
1230 | */ | 1599 | */ |
1231 | queue_for_each_hw_ctx(q, hctx, i) { | 1600 | queue_for_each_hw_ctx(q, hctx, i) { |
1232 | unsigned int num_maps; | ||
1233 | int node; | 1601 | int node; |
1234 | 1602 | ||
1235 | node = hctx->numa_node; | 1603 | node = hctx->numa_node; |
@@ -1260,13 +1628,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, | |||
1260 | if (!hctx->ctxs) | 1628 | if (!hctx->ctxs) |
1261 | break; | 1629 | break; |
1262 | 1630 | ||
1263 | num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; | 1631 | if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) |
1264 | hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), | ||
1265 | GFP_KERNEL, node); | ||
1266 | if (!hctx->ctx_map) | ||
1267 | break; | 1632 | break; |
1268 | 1633 | ||
1269 | hctx->nr_ctx_map = num_maps; | ||
1270 | hctx->nr_ctx = 0; | 1634 | hctx->nr_ctx = 0; |
1271 | 1635 | ||
1272 | if (set->ops->init_hctx && | 1636 | if (set->ops->init_hctx && |
@@ -1280,16 +1644,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, | |||
1280 | /* | 1644 | /* |
1281 | * Init failed | 1645 | * Init failed |
1282 | */ | 1646 | */ |
1283 | queue_for_each_hw_ctx(q, hctx, j) { | 1647 | blk_mq_exit_hw_queues(q, set, i); |
1284 | if (i == j) | ||
1285 | break; | ||
1286 | |||
1287 | if (set->ops->exit_hctx) | ||
1288 | set->ops->exit_hctx(hctx, j); | ||
1289 | |||
1290 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | ||
1291 | kfree(hctx->ctxs); | ||
1292 | } | ||
1293 | 1648 | ||
1294 | return 1; | 1649 | return 1; |
1295 | } | 1650 | } |
@@ -1350,6 +1705,79 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
1350 | ctx->index_hw = hctx->nr_ctx; | 1705 | ctx->index_hw = hctx->nr_ctx; |
1351 | hctx->ctxs[hctx->nr_ctx++] = ctx; | 1706 | hctx->ctxs[hctx->nr_ctx++] = ctx; |
1352 | } | 1707 | } |
1708 | |||
1709 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1710 | /* | ||
1711 | * If no software queues are mapped to this hardware queue, | ||
1712 | * disable it and free the request entries | ||
1713 | */ | ||
1714 | if (!hctx->nr_ctx) { | ||
1715 | struct blk_mq_tag_set *set = q->tag_set; | ||
1716 | |||
1717 | if (set->tags[i]) { | ||
1718 | blk_mq_free_rq_map(set, set->tags[i], i); | ||
1719 | set->tags[i] = NULL; | ||
1720 | hctx->tags = NULL; | ||
1721 | } | ||
1722 | continue; | ||
1723 | } | ||
1724 | |||
1725 | /* | ||
1726 | * Initialize batch roundrobin counts | ||
1727 | */ | ||
1728 | hctx->next_cpu = cpumask_first(hctx->cpumask); | ||
1729 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | ||
1730 | } | ||
1731 | } | ||
1732 | |||
1733 | static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) | ||
1734 | { | ||
1735 | struct blk_mq_hw_ctx *hctx; | ||
1736 | struct request_queue *q; | ||
1737 | bool shared; | ||
1738 | int i; | ||
1739 | |||
1740 | if (set->tag_list.next == set->tag_list.prev) | ||
1741 | shared = false; | ||
1742 | else | ||
1743 | shared = true; | ||
1744 | |||
1745 | list_for_each_entry(q, &set->tag_list, tag_set_list) { | ||
1746 | blk_mq_freeze_queue(q); | ||
1747 | |||
1748 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1749 | if (shared) | ||
1750 | hctx->flags |= BLK_MQ_F_TAG_SHARED; | ||
1751 | else | ||
1752 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; | ||
1753 | } | ||
1754 | blk_mq_unfreeze_queue(q); | ||
1755 | } | ||
1756 | } | ||
1757 | |||
1758 | static void blk_mq_del_queue_tag_set(struct request_queue *q) | ||
1759 | { | ||
1760 | struct blk_mq_tag_set *set = q->tag_set; | ||
1761 | |||
1762 | blk_mq_freeze_queue(q); | ||
1763 | |||
1764 | mutex_lock(&set->tag_list_lock); | ||
1765 | list_del_init(&q->tag_set_list); | ||
1766 | blk_mq_update_tag_set_depth(set); | ||
1767 | mutex_unlock(&set->tag_list_lock); | ||
1768 | |||
1769 | blk_mq_unfreeze_queue(q); | ||
1770 | } | ||
1771 | |||
1772 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | ||
1773 | struct request_queue *q) | ||
1774 | { | ||
1775 | q->tag_set = set; | ||
1776 | |||
1777 | mutex_lock(&set->tag_list_lock); | ||
1778 | list_add_tail(&q->tag_set_list, &set->tag_list); | ||
1779 | blk_mq_update_tag_set_depth(set); | ||
1780 | mutex_unlock(&set->tag_list_lock); | ||
1353 | } | 1781 | } |
1354 | 1782 | ||
1355 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | 1783 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) |
@@ -1357,6 +1785,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1357 | struct blk_mq_hw_ctx **hctxs; | 1785 | struct blk_mq_hw_ctx **hctxs; |
1358 | struct blk_mq_ctx *ctx; | 1786 | struct blk_mq_ctx *ctx; |
1359 | struct request_queue *q; | 1787 | struct request_queue *q; |
1788 | unsigned int *map; | ||
1360 | int i; | 1789 | int i; |
1361 | 1790 | ||
1362 | ctx = alloc_percpu(struct blk_mq_ctx); | 1791 | ctx = alloc_percpu(struct blk_mq_ctx); |
@@ -1369,15 +1798,22 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1369 | if (!hctxs) | 1798 | if (!hctxs) |
1370 | goto err_percpu; | 1799 | goto err_percpu; |
1371 | 1800 | ||
1801 | map = blk_mq_make_queue_map(set); | ||
1802 | if (!map) | ||
1803 | goto err_map; | ||
1804 | |||
1372 | for (i = 0; i < set->nr_hw_queues; i++) { | 1805 | for (i = 0; i < set->nr_hw_queues; i++) { |
1373 | hctxs[i] = set->ops->alloc_hctx(set, i); | 1806 | int node = blk_mq_hw_queue_to_node(map, i); |
1807 | |||
1808 | hctxs[i] = set->ops->alloc_hctx(set, i, node); | ||
1374 | if (!hctxs[i]) | 1809 | if (!hctxs[i]) |
1375 | goto err_hctxs; | 1810 | goto err_hctxs; |
1376 | 1811 | ||
1377 | if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) | 1812 | if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) |
1378 | goto err_hctxs; | 1813 | goto err_hctxs; |
1379 | 1814 | ||
1380 | hctxs[i]->numa_node = NUMA_NO_NODE; | 1815 | atomic_set(&hctxs[i]->nr_active, 0); |
1816 | hctxs[i]->numa_node = node; | ||
1381 | hctxs[i]->queue_num = i; | 1817 | hctxs[i]->queue_num = i; |
1382 | } | 1818 | } |
1383 | 1819 | ||
@@ -1385,8 +1821,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1385 | if (!q) | 1821 | if (!q) |
1386 | goto err_hctxs; | 1822 | goto err_hctxs; |
1387 | 1823 | ||
1388 | q->mq_map = blk_mq_make_queue_map(set); | 1824 | if (percpu_counter_init(&q->mq_usage_counter, 0)) |
1389 | if (!q->mq_map) | ||
1390 | goto err_map; | 1825 | goto err_map; |
1391 | 1826 | ||
1392 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); | 1827 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); |
@@ -1394,6 +1829,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1394 | 1829 | ||
1395 | q->nr_queues = nr_cpu_ids; | 1830 | q->nr_queues = nr_cpu_ids; |
1396 | q->nr_hw_queues = set->nr_hw_queues; | 1831 | q->nr_hw_queues = set->nr_hw_queues; |
1832 | q->mq_map = map; | ||
1397 | 1833 | ||
1398 | q->queue_ctx = ctx; | 1834 | q->queue_ctx = ctx; |
1399 | q->queue_hw_ctx = hctxs; | 1835 | q->queue_hw_ctx = hctxs; |
@@ -1403,11 +1839,24 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1403 | 1839 | ||
1404 | q->sg_reserved_size = INT_MAX; | 1840 | q->sg_reserved_size = INT_MAX; |
1405 | 1841 | ||
1406 | blk_queue_make_request(q, blk_mq_make_request); | 1842 | INIT_WORK(&q->requeue_work, blk_mq_requeue_work); |
1407 | blk_queue_rq_timed_out(q, set->ops->timeout); | 1843 | INIT_LIST_HEAD(&q->requeue_list); |
1844 | spin_lock_init(&q->requeue_lock); | ||
1845 | |||
1846 | if (q->nr_hw_queues > 1) | ||
1847 | blk_queue_make_request(q, blk_mq_make_request); | ||
1848 | else | ||
1849 | blk_queue_make_request(q, blk_sq_make_request); | ||
1850 | |||
1851 | blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); | ||
1408 | if (set->timeout) | 1852 | if (set->timeout) |
1409 | blk_queue_rq_timeout(q, set->timeout); | 1853 | blk_queue_rq_timeout(q, set->timeout); |
1410 | 1854 | ||
1855 | /* | ||
1856 | * Do this after blk_queue_make_request() overrides it... | ||
1857 | */ | ||
1858 | q->nr_requests = set->queue_depth; | ||
1859 | |||
1411 | if (set->ops->complete) | 1860 | if (set->ops->complete) |
1412 | blk_queue_softirq_done(q, set->ops->complete); | 1861 | blk_queue_softirq_done(q, set->ops->complete); |
1413 | 1862 | ||
@@ -1423,27 +1872,29 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1423 | if (blk_mq_init_hw_queues(q, set)) | 1872 | if (blk_mq_init_hw_queues(q, set)) |
1424 | goto err_flush_rq; | 1873 | goto err_flush_rq; |
1425 | 1874 | ||
1426 | blk_mq_map_swqueue(q); | ||
1427 | |||
1428 | mutex_lock(&all_q_mutex); | 1875 | mutex_lock(&all_q_mutex); |
1429 | list_add_tail(&q->all_q_node, &all_q_list); | 1876 | list_add_tail(&q->all_q_node, &all_q_list); |
1430 | mutex_unlock(&all_q_mutex); | 1877 | mutex_unlock(&all_q_mutex); |
1431 | 1878 | ||
1879 | blk_mq_add_queue_tag_set(set, q); | ||
1880 | |||
1881 | blk_mq_map_swqueue(q); | ||
1882 | |||
1432 | return q; | 1883 | return q; |
1433 | 1884 | ||
1434 | err_flush_rq: | 1885 | err_flush_rq: |
1435 | kfree(q->flush_rq); | 1886 | kfree(q->flush_rq); |
1436 | err_hw: | 1887 | err_hw: |
1437 | kfree(q->mq_map); | ||
1438 | err_map: | ||
1439 | blk_cleanup_queue(q); | 1888 | blk_cleanup_queue(q); |
1440 | err_hctxs: | 1889 | err_hctxs: |
1890 | kfree(map); | ||
1441 | for (i = 0; i < set->nr_hw_queues; i++) { | 1891 | for (i = 0; i < set->nr_hw_queues; i++) { |
1442 | if (!hctxs[i]) | 1892 | if (!hctxs[i]) |
1443 | break; | 1893 | break; |
1444 | free_cpumask_var(hctxs[i]->cpumask); | 1894 | free_cpumask_var(hctxs[i]->cpumask); |
1445 | set->ops->free_hctx(hctxs[i], i); | 1895 | set->ops->free_hctx(hctxs[i], i); |
1446 | } | 1896 | } |
1897 | err_map: | ||
1447 | kfree(hctxs); | 1898 | kfree(hctxs); |
1448 | err_percpu: | 1899 | err_percpu: |
1449 | free_percpu(ctx); | 1900 | free_percpu(ctx); |
@@ -1453,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue); | |||
1453 | 1904 | ||
1454 | void blk_mq_free_queue(struct request_queue *q) | 1905 | void blk_mq_free_queue(struct request_queue *q) |
1455 | { | 1906 | { |
1456 | struct blk_mq_hw_ctx *hctx; | 1907 | struct blk_mq_tag_set *set = q->tag_set; |
1457 | int i; | ||
1458 | 1908 | ||
1459 | queue_for_each_hw_ctx(q, hctx, i) { | 1909 | blk_mq_del_queue_tag_set(q); |
1460 | kfree(hctx->ctx_map); | 1910 | |
1461 | kfree(hctx->ctxs); | 1911 | blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); |
1462 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | 1912 | blk_mq_free_hw_queues(q, set); |
1463 | if (q->mq_ops->exit_hctx) | 1913 | |
1464 | q->mq_ops->exit_hctx(hctx, i); | 1914 | percpu_counter_destroy(&q->mq_usage_counter); |
1465 | free_cpumask_var(hctx->cpumask); | ||
1466 | q->mq_ops->free_hctx(hctx, i); | ||
1467 | } | ||
1468 | 1915 | ||
1469 | free_percpu(q->queue_ctx); | 1916 | free_percpu(q->queue_ctx); |
1470 | kfree(q->queue_hw_ctx); | 1917 | kfree(q->queue_hw_ctx); |
@@ -1503,10 +1950,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, | |||
1503 | struct request_queue *q; | 1950 | struct request_queue *q; |
1504 | 1951 | ||
1505 | /* | 1952 | /* |
1506 | * Before new mapping is established, hotadded cpu might already start | 1953 | * Before new mappings are established, hotadded cpu might already |
1507 | * handling requests. This doesn't break anything as we map offline | 1954 | * start handling requests. This doesn't break anything as we map |
1508 | * CPUs to first hardware queue. We will re-init queue below to get | 1955 | * offline CPUs to first hardware queue. We will re-init the queue |
1509 | * optimal settings. | 1956 | * below to get optimal settings. |
1510 | */ | 1957 | */ |
1511 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && | 1958 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && |
1512 | action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) | 1959 | action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) |
@@ -1536,7 +1983,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
1536 | return -EINVAL; | 1983 | return -EINVAL; |
1537 | 1984 | ||
1538 | 1985 | ||
1539 | set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags), | 1986 | set->tags = kmalloc_node(set->nr_hw_queues * |
1987 | sizeof(struct blk_mq_tags *), | ||
1540 | GFP_KERNEL, set->numa_node); | 1988 | GFP_KERNEL, set->numa_node); |
1541 | if (!set->tags) | 1989 | if (!set->tags) |
1542 | goto out; | 1990 | goto out; |
@@ -1547,6 +1995,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
1547 | goto out_unwind; | 1995 | goto out_unwind; |
1548 | } | 1996 | } |
1549 | 1997 | ||
1998 | mutex_init(&set->tag_list_lock); | ||
1999 | INIT_LIST_HEAD(&set->tag_list); | ||
2000 | |||
1550 | return 0; | 2001 | return 0; |
1551 | 2002 | ||
1552 | out_unwind: | 2003 | out_unwind: |
@@ -1561,11 +2012,37 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) | |||
1561 | { | 2012 | { |
1562 | int i; | 2013 | int i; |
1563 | 2014 | ||
1564 | for (i = 0; i < set->nr_hw_queues; i++) | 2015 | for (i = 0; i < set->nr_hw_queues; i++) { |
1565 | blk_mq_free_rq_map(set, set->tags[i], i); | 2016 | if (set->tags[i]) |
2017 | blk_mq_free_rq_map(set, set->tags[i], i); | ||
2018 | } | ||
2019 | |||
2020 | kfree(set->tags); | ||
1566 | } | 2021 | } |
1567 | EXPORT_SYMBOL(blk_mq_free_tag_set); | 2022 | EXPORT_SYMBOL(blk_mq_free_tag_set); |
1568 | 2023 | ||
2024 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) | ||
2025 | { | ||
2026 | struct blk_mq_tag_set *set = q->tag_set; | ||
2027 | struct blk_mq_hw_ctx *hctx; | ||
2028 | int i, ret; | ||
2029 | |||
2030 | if (!set || nr > set->queue_depth) | ||
2031 | return -EINVAL; | ||
2032 | |||
2033 | ret = 0; | ||
2034 | queue_for_each_hw_ctx(q, hctx, i) { | ||
2035 | ret = blk_mq_tag_update_depth(hctx->tags, nr); | ||
2036 | if (ret) | ||
2037 | break; | ||
2038 | } | ||
2039 | |||
2040 | if (!ret) | ||
2041 | q->nr_requests = nr; | ||
2042 | |||
2043 | return ret; | ||
2044 | } | ||
2045 | |||
1569 | void blk_mq_disable_hotplug(void) | 2046 | void blk_mq_disable_hotplug(void) |
1570 | { | 2047 | { |
1571 | mutex_lock(&all_q_mutex); | 2048 | mutex_lock(&all_q_mutex); |
diff --git a/block/blk-mq.h b/block/blk-mq.h index 5fa14f19f752..ff5e6bf0f691 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
@@ -11,7 +11,8 @@ struct blk_mq_ctx { | |||
11 | 11 | ||
12 | unsigned int cpu; | 12 | unsigned int cpu; |
13 | unsigned int index_hw; | 13 | unsigned int index_hw; |
14 | unsigned int ipi_redirect; | 14 | |
15 | unsigned int last_tag ____cacheline_aligned_in_smp; | ||
15 | 16 | ||
16 | /* incremented at dispatch time */ | 17 | /* incremented at dispatch time */ |
17 | unsigned long rq_dispatched[2]; | 18 | unsigned long rq_dispatched[2]; |
@@ -22,7 +23,7 @@ struct blk_mq_ctx { | |||
22 | 23 | ||
23 | struct request_queue *queue; | 24 | struct request_queue *queue; |
24 | struct kobject kobj; | 25 | struct kobject kobj; |
25 | }; | 26 | } ____cacheline_aligned_in_smp; |
26 | 27 | ||
27 | void __blk_mq_complete_request(struct request *rq); | 28 | void __blk_mq_complete_request(struct request *rq); |
28 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); | 29 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); |
@@ -31,13 +32,14 @@ void blk_mq_drain_queue(struct request_queue *q); | |||
31 | void blk_mq_free_queue(struct request_queue *q); | 32 | void blk_mq_free_queue(struct request_queue *q); |
32 | void blk_mq_clone_flush_request(struct request *flush_rq, | 33 | void blk_mq_clone_flush_request(struct request *flush_rq, |
33 | struct request *orig_rq); | 34 | struct request *orig_rq); |
35 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); | ||
34 | 36 | ||
35 | /* | 37 | /* |
36 | * CPU hotplug helpers | 38 | * CPU hotplug helpers |
37 | */ | 39 | */ |
38 | struct blk_mq_cpu_notifier; | 40 | struct blk_mq_cpu_notifier; |
39 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, | 41 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, |
40 | void (*fn)(void *, unsigned long, unsigned int), | 42 | int (*fn)(void *, unsigned long, unsigned int), |
41 | void *data); | 43 | void *data); |
42 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); | 44 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); |
43 | void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); | 45 | void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); |
@@ -50,7 +52,15 @@ void blk_mq_disable_hotplug(void); | |||
50 | */ | 52 | */ |
51 | extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); | 53 | extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); |
52 | extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); | 54 | extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); |
55 | extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); | ||
53 | 56 | ||
54 | void blk_mq_add_timer(struct request *rq); | 57 | /* |
58 | * Basic implementation of sparser bitmap, allowing the user to spread | ||
59 | * the bits over more cachelines. | ||
60 | */ | ||
61 | struct blk_align_bitmap { | ||
62 | unsigned long word; | ||
63 | unsigned long depth; | ||
64 | } ____cacheline_aligned_in_smp; | ||
55 | 65 | ||
56 | #endif | 66 | #endif |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f876dae4..23321fbab293 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) | |||
48 | static ssize_t | 48 | static ssize_t |
49 | queue_requests_store(struct request_queue *q, const char *page, size_t count) | 49 | queue_requests_store(struct request_queue *q, const char *page, size_t count) |
50 | { | 50 | { |
51 | struct request_list *rl; | ||
52 | unsigned long nr; | 51 | unsigned long nr; |
53 | int ret; | 52 | int ret, err; |
54 | 53 | ||
55 | if (!q->request_fn) | 54 | if (!q->request_fn && !q->mq_ops) |
56 | return -EINVAL; | 55 | return -EINVAL; |
57 | 56 | ||
58 | ret = queue_var_store(&nr, page, count); | 57 | ret = queue_var_store(&nr, page, count); |
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
62 | if (nr < BLKDEV_MIN_RQ) | 61 | if (nr < BLKDEV_MIN_RQ) |
63 | nr = BLKDEV_MIN_RQ; | 62 | nr = BLKDEV_MIN_RQ; |
64 | 63 | ||
65 | spin_lock_irq(q->queue_lock); | 64 | if (q->request_fn) |
66 | q->nr_requests = nr; | 65 | err = blk_update_nr_requests(q, nr); |
67 | blk_queue_congestion_threshold(q); | 66 | else |
68 | 67 | err = blk_mq_update_nr_requests(q, nr); | |
69 | /* congestion isn't cgroup aware and follows root blkcg for now */ | 68 | |
70 | rl = &q->root_rl; | 69 | if (err) |
71 | 70 | return err; | |
72 | if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) | ||
73 | blk_set_queue_congested(q, BLK_RW_SYNC); | ||
74 | else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) | ||
75 | blk_clear_queue_congested(q, BLK_RW_SYNC); | ||
76 | |||
77 | if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) | ||
78 | blk_set_queue_congested(q, BLK_RW_ASYNC); | ||
79 | else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) | ||
80 | blk_clear_queue_congested(q, BLK_RW_ASYNC); | ||
81 | |||
82 | blk_queue_for_each_rl(rl, q) { | ||
83 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | ||
84 | blk_set_rl_full(rl, BLK_RW_SYNC); | ||
85 | } else { | ||
86 | blk_clear_rl_full(rl, BLK_RW_SYNC); | ||
87 | wake_up(&rl->wait[BLK_RW_SYNC]); | ||
88 | } | ||
89 | |||
90 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | ||
91 | blk_set_rl_full(rl, BLK_RW_ASYNC); | ||
92 | } else { | ||
93 | blk_clear_rl_full(rl, BLK_RW_ASYNC); | ||
94 | wake_up(&rl->wait[BLK_RW_ASYNC]); | ||
95 | } | ||
96 | } | ||
97 | 71 | ||
98 | spin_unlock_irq(q->queue_lock); | ||
99 | return ret; | 72 | return ret; |
100 | } | 73 | } |
101 | 74 | ||
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj) | |||
544 | if (q->queue_tags) | 517 | if (q->queue_tags) |
545 | __blk_queue_free_tags(q); | 518 | __blk_queue_free_tags(q); |
546 | 519 | ||
547 | percpu_counter_destroy(&q->mq_usage_counter); | ||
548 | |||
549 | if (q->mq_ops) | 520 | if (q->mq_ops) |
550 | blk_mq_free_queue(q); | 521 | blk_mq_free_queue(q); |
551 | 522 | ||
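The congestion-threshold and request_list bookkeeping deleted from queue_requests_store() above does not go away; the natural reading is that it moves behind blk_update_nr_requests() (declared in the block/blk.h hunk further down), with blk_mq_update_nr_requests() as the multiqueue counterpart. A condensed, reconstructed sketch of the non-mq helper, abbreviated in the BLK_RW_ASYNC direction -- the real body lives in blk-core.c and is not shown in this diff:

/* Sketch reconstructed from the lines removed above; not the committed code. */
int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct request_list *rl;

	spin_lock_irq(q->queue_lock);
	q->nr_requests = nr;
	blk_queue_congestion_threshold(q);

	/* congestion isn't cgroup aware and follows root blkcg for now */
	rl = &q->root_rl;
	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
		blk_set_queue_congested(q, BLK_RW_SYNC);
	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
		blk_clear_queue_congested(q, BLK_RW_SYNC);
	/* ...and the same test for BLK_RW_ASYNC... */

	blk_queue_for_each_rl(rl, q) {
		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
			blk_set_rl_full(rl, BLK_RW_SYNC);
		} else {
			blk_clear_rl_full(rl, BLK_RW_SYNC);
			wake_up(&rl->wait[BLK_RW_SYNC]);
		}
		/* ...and the same full/wakeup handling for BLK_RW_ASYNC... */
	}

	spin_unlock_irq(q->queue_lock);
	return 0;
}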
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..9353b4683359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, | |||
744 | static bool throtl_slice_used(struct throtl_grp *tg, bool rw) | 744 | static bool throtl_slice_used(struct throtl_grp *tg, bool rw) |
745 | { | 745 | { |
746 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) | 746 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) |
747 | return 0; | 747 | return false; |
748 | 748 | ||
749 | return 1; | 749 | return 1; |
750 | } | 750 | } |
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |||
842 | if (tg->io_disp[rw] + 1 <= io_allowed) { | 842 | if (tg->io_disp[rw] + 1 <= io_allowed) { |
843 | if (wait) | 843 | if (wait) |
844 | *wait = 0; | 844 | *wait = 0; |
845 | return 1; | 845 | return true; |
846 | } | 846 | } |
847 | 847 | ||
848 | /* Calc approx time to dispatch */ | 848 | /* Calc approx time to dispatch */ |
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, | |||
880 | if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { | 880 | if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { |
881 | if (wait) | 881 | if (wait) |
882 | *wait = 0; | 882 | *wait = 0; |
883 | return 1; | 883 | return true; |
884 | } | 884 | } |
885 | 885 | ||
886 | /* Calc approx time to dispatch */ | 886 | /* Calc approx time to dispatch */ |
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, | |||
923 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | 923 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { |
924 | if (wait) | 924 | if (wait) |
925 | *wait = 0; | 925 | *wait = 0; |
926 | return 1; | 926 | return true; |
927 | } | 927 | } |
928 | 928 | ||
929 | /* | 929 | /* |
@@ -1258,7 +1258,7 @@ out_unlock: | |||
1258 | * of throtl_data->service_queue. Those bio's are ready and issued by this | 1258 | * of throtl_data->service_queue. Those bio's are ready and issued by this |
1259 | * function. | 1259 | * function. |
1260 | */ | 1260 | */ |
1261 | void blk_throtl_dispatch_work_fn(struct work_struct *work) | 1261 | static void blk_throtl_dispatch_work_fn(struct work_struct *work) |
1262 | { | 1262 | { |
1263 | struct throtl_data *td = container_of(work, struct throtl_data, | 1263 | struct throtl_data *td = container_of(work, struct throtl_data, |
1264 | dispatch_work); | 1264 | dispatch_work); |
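For reference on the bool conversions above: time_in_range() from <linux/jiffies.h> is the wrap-safe range test

	time_in_range(a, b, c) == time_after_eq(a, b) && time_before_eq(a, c)

so throtl_slice_used() keeps returning false while jiffies is still inside [slice_start, slice_end] and reports the slice as used once jiffies falls outside that window; the remaining "return 1" statements are equivalent to "return true" for a bool-returning function.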
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a09e8af8186c..43e8b515806f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req) | |||
96 | __blk_complete_request(req); | 96 | __blk_complete_request(req); |
97 | break; | 97 | break; |
98 | case BLK_EH_RESET_TIMER: | 98 | case BLK_EH_RESET_TIMER: |
99 | if (q->mq_ops) | 99 | blk_add_timer(req); |
100 | blk_mq_add_timer(req); | ||
101 | else | ||
102 | blk_add_timer(req); | ||
103 | |||
104 | blk_clear_rq_complete(req); | 100 | blk_clear_rq_complete(req); |
105 | break; | 101 | break; |
106 | case BLK_EH_NOT_HANDLED: | 102 | case BLK_EH_NOT_HANDLED: |
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req) | |||
170 | } | 166 | } |
171 | EXPORT_SYMBOL_GPL(blk_abort_request); | 167 | EXPORT_SYMBOL_GPL(blk_abort_request); |
172 | 168 | ||
173 | void __blk_add_timer(struct request *req, struct list_head *timeout_list) | 169 | unsigned long blk_rq_timeout(unsigned long timeout) |
170 | { | ||
171 | unsigned long maxt; | ||
172 | |||
173 | maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); | ||
174 | if (time_after(timeout, maxt)) | ||
175 | timeout = maxt; | ||
176 | |||
177 | return timeout; | ||
178 | } | ||
179 | |||
180 | /** | ||
181 | * blk_add_timer - Start timeout timer for a single request | ||
182 | * @req: request that is about to start running. | ||
183 | * | ||
184 | * Notes: | ||
185 | * Each request has its own timer, and as it is added to the queue, we | ||
186 | * set up the timer. When the request completes, we cancel the timer. | ||
187 | */ | ||
188 | void blk_add_timer(struct request *req) | ||
174 | { | 189 | { |
175 | struct request_queue *q = req->q; | 190 | struct request_queue *q = req->q; |
176 | unsigned long expiry; | 191 | unsigned long expiry; |
@@ -188,15 +203,15 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) | |||
188 | req->timeout = q->rq_timeout; | 203 | req->timeout = q->rq_timeout; |
189 | 204 | ||
190 | req->deadline = jiffies + req->timeout; | 205 | req->deadline = jiffies + req->timeout; |
191 | if (timeout_list) | 206 | if (!q->mq_ops) |
192 | list_add_tail(&req->timeout_list, timeout_list); | 207 | list_add_tail(&req->timeout_list, &req->q->timeout_list); |
193 | 208 | ||
194 | /* | 209 | /* |
195 | * If the timer isn't already pending or this timeout is earlier | 210 | * If the timer isn't already pending or this timeout is earlier |
196 | * than an existing one, modify the timer. Round up to next nearest | 211 | * than an existing one, modify the timer. Round up to next nearest |
197 | * second. | 212 | * second. |
198 | */ | 213 | */ |
199 | expiry = round_jiffies_up(req->deadline); | 214 | expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); |
200 | 215 | ||
201 | if (!timer_pending(&q->timeout) || | 216 | if (!timer_pending(&q->timeout) || |
202 | time_before(expiry, q->timeout.expires)) { | 217 | time_before(expiry, q->timeout.expires)) { |
@@ -214,17 +229,3 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) | |||
214 | } | 229 | } |
215 | 230 | ||
216 | } | 231 | } |
217 | |||
218 | /** | ||
219 | * blk_add_timer - Start timeout timer for a single request | ||
220 | * @req: request that is about to start running. | ||
221 | * | ||
222 | * Notes: | ||
223 | * Each request has its own timer, and as it is added to the queue, we | ||
224 | * set up the timer. When the request completes, we cancel the timer. | ||
225 | */ | ||
226 | void blk_add_timer(struct request *req) | ||
227 | { | ||
228 | __blk_add_timer(req, &req->q->timeout_list); | ||
229 | } | ||
230 | |||
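Not part of the patch text: a worked example of the clamp blk_rq_timeout() introduces, using the BLK_MAX_TIMEOUT of 5 * HZ added in the block/blk.h hunk below.

/*
 * Suppose a driver sets a 60 second command timeout:
 *
 *	req->timeout  = 60 * HZ;
 *	req->deadline = jiffies + 60 * HZ;
 *	expiry        = blk_rq_timeout(round_jiffies_up(req->deadline))
 *	              = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT), i.e. ~5s out
 *
 * So q->timeout never fires more than about BLK_MAX_TIMEOUT into the future;
 * when it fires before req->deadline, the expiry check finds the request not
 * yet due and re-arms the timer, and the request itself still only times out
 * at its real deadline.
 */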
diff --git a/block/blk.h b/block/blk.h index 1d880f1f957f..45385e9abf6f 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -9,6 +9,9 @@ | |||
9 | /* Number of requests a "batching" process may submit */ | 9 | /* Number of requests a "batching" process may submit */ |
10 | #define BLK_BATCH_REQ 32 | 10 | #define BLK_BATCH_REQ 32 |
11 | 11 | ||
12 | /* Max future timer expiry for timeouts */ | ||
13 | #define BLK_MAX_TIMEOUT (5 * HZ) | ||
14 | |||
12 | extern struct kmem_cache *blk_requestq_cachep; | 15 | extern struct kmem_cache *blk_requestq_cachep; |
13 | extern struct kmem_cache *request_cachep; | 16 | extern struct kmem_cache *request_cachep; |
14 | extern struct kobj_type blk_queue_ktype; | 17 | extern struct kobj_type blk_queue_ktype; |
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error, | |||
37 | void blk_rq_timed_out_timer(unsigned long data); | 40 | void blk_rq_timed_out_timer(unsigned long data); |
38 | void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, | 41 | void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, |
39 | unsigned int *next_set); | 42 | unsigned int *next_set); |
40 | void __blk_add_timer(struct request *req, struct list_head *timeout_list); | 43 | unsigned long blk_rq_timeout(unsigned long timeout); |
44 | void blk_add_timer(struct request *req); | ||
41 | void blk_delete_timer(struct request *); | 45 | void blk_delete_timer(struct request *); |
42 | void blk_add_timer(struct request *); | ||
43 | 46 | ||
44 | 47 | ||
45 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | 48 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, |
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) | |||
185 | return q->nr_congestion_off; | 188 | return q->nr_congestion_off; |
186 | } | 189 | } |
187 | 190 | ||
191 | extern int blk_update_nr_requests(struct request_queue *, unsigned int); | ||
192 | |||
188 | /* | 193 | /* |
189 | * Contribute to IO statistics IFF: | 194 | * Contribute to IO statistics IFF: |
190 | * | 195 | * |
diff --git a/mm/bounce.c b/block/bounce.c index 523918b8c6dc..523918b8c6dc 100644 --- a/mm/bounce.c +++ b/block/bounce.c | |||
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5063a0bd831a..22dffebc7c73 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -4460,7 +4460,7 @@ out_free: | |||
4460 | static ssize_t | 4460 | static ssize_t |
4461 | cfq_var_show(unsigned int var, char *page) | 4461 | cfq_var_show(unsigned int var, char *page) |
4462 | { | 4462 | { |
4463 | return sprintf(page, "%d\n", var); | 4463 | return sprintf(page, "%u\n", var); |
4464 | } | 4464 | } |
4465 | 4465 | ||
4466 | static ssize_t | 4466 | static ssize_t |
diff --git a/fs/ioprio.c b/block/ioprio.c index e50170ca7c33..e50170ca7c33 100644 --- a/fs/ioprio.c +++ b/block/ioprio.c | |||
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index ae331ab4a451..ea323e91903b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
@@ -178,7 +178,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) | |||
178 | { | 178 | { |
179 | struct request *rq; | 179 | struct request *rq; |
180 | 180 | ||
181 | rq = blk_mq_alloc_reserved_request(dd->queue, 0, __GFP_WAIT); | 181 | rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); |
182 | return blk_mq_rq_to_pdu(rq); | 182 | return blk_mq_rq_to_pdu(rq); |
183 | } | 183 | } |
184 | 184 | ||
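blk_mq_alloc_reserved_request() is folded into blk_mq_alloc_request(), which now takes an explicit "reserved" flag (see the include/linux/blk-mq.h hunk below). A hypothetical driver helper pulling an internal command from the reserved pool sized by blk_mq_tag_set.reserved_tags -- the names around the call are made up, only the allocator call itself comes from the patch:

static struct my_cmd *my_get_internal_cmd(struct my_device *dd)
{
	struct request *rq;

	/* last argument: reserved = true, so the tag comes from reserved_tags */
	rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
	if (!rq)
		return NULL;

	/* driver per-request data sits right behind the request */
	return blk_mq_rq_to_pdu(rq);
}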
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index e932398588aa..5a8081114df6 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c | |||
@@ -322,39 +322,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) | |||
322 | } | 322 | } |
323 | 323 | ||
324 | static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set, | 324 | static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set, |
325 | unsigned int hctx_index) | 325 | unsigned int hctx_index, |
326 | int node) | ||
326 | { | 327 | { |
327 | int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes); | 328 | return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); |
328 | int tip = (set->nr_hw_queues % nr_online_nodes); | ||
329 | int node = 0, i, n; | ||
330 | |||
331 | /* | ||
332 | * Split submit queues evenly wrt to the number of nodes. If uneven, | ||
333 | * fill the first buckets with one extra, until the rest is filled with | ||
334 | * no extra. | ||
335 | */ | ||
336 | for (i = 0, n = 1; i < hctx_index; i++, n++) { | ||
337 | if (n % b_size == 0) { | ||
338 | n = 0; | ||
339 | node++; | ||
340 | |||
341 | tip--; | ||
342 | if (!tip) | ||
343 | b_size = set->nr_hw_queues / nr_online_nodes; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * A node might not be online, therefore map the relative node id to the | ||
349 | * real node id. | ||
350 | */ | ||
351 | for_each_online_node(n) { | ||
352 | if (!node) | ||
353 | break; | ||
354 | node--; | ||
355 | } | ||
356 | |||
357 | return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); | ||
358 | } | 329 | } |
359 | 330 | ||
360 | static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) | 331 | static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) |
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 1dcf9067cffa..608532d3f8c9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c | |||
@@ -743,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) | |||
743 | break; | 743 | break; |
744 | } | 744 | } |
745 | skreq->discard_page = 1; | 745 | skreq->discard_page = 1; |
746 | req->completion_data = page; | ||
746 | skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); | 747 | skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); |
747 | 748 | ||
748 | } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { | 749 | } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { |
@@ -855,10 +856,9 @@ static void skd_end_request(struct skd_device *skdev, | |||
855 | 856 | ||
856 | if ((io_flags & REQ_DISCARD) && | 857 | if ((io_flags & REQ_DISCARD) && |
857 | (skreq->discard_page == 1)) { | 858 | (skreq->discard_page == 1)) { |
858 | struct bio *bio = req->bio; | ||
859 | pr_debug("%s:%s:%d, free the page!", | 859 | pr_debug("%s:%s:%d, free the page!", |
860 | skdev->name, __func__, __LINE__); | 860 | skdev->name, __func__, __LINE__); |
861 | __free_page(bio->bi_io_vec->bv_page); | 861 | __free_page(req->completion_data); |
862 | } | 862 | } |
863 | 863 | ||
864 | if (unlikely(error)) { | 864 | if (unlikely(error)) { |
diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b75713d953a..0a19d866a153 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c | |||
@@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk) | |||
902 | add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); | 902 | add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); |
903 | trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); | 903 | trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); |
904 | } | 904 | } |
905 | EXPORT_SYMBOL_GPL(add_disk_randomness); | ||
905 | #endif | 906 | #endif |
906 | 907 | ||
907 | /********************************************************************* | 908 | /********************************************************************* |
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 06d154d20faa..96af195224f2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c | |||
@@ -737,6 +737,7 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) | |||
737 | goto out; | 737 | goto out; |
738 | } | 738 | } |
739 | 739 | ||
740 | rq->completion_data = page; | ||
740 | blk_add_request_payload(rq, page, len); | 741 | blk_add_request_payload(rq, page, len); |
741 | ret = scsi_setup_blk_pc_cmnd(sdp, rq); | 742 | ret = scsi_setup_blk_pc_cmnd(sdp, rq); |
742 | rq->__data_len = nr_bytes; | 743 | rq->__data_len = nr_bytes; |
@@ -839,11 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq) | |||
839 | { | 840 | { |
840 | struct scsi_cmnd *SCpnt = rq->special; | 841 | struct scsi_cmnd *SCpnt = rq->special; |
841 | 842 | ||
842 | if (rq->cmd_flags & REQ_DISCARD) { | 843 | if (rq->cmd_flags & REQ_DISCARD) |
843 | struct bio *bio = rq->bio; | 844 | __free_page(rq->completion_data); |
844 | 845 | ||
845 | __free_page(bio->bi_io_vec->bv_page); | ||
846 | } | ||
847 | if (SCpnt->cmnd != rq->cmd) { | 846 | if (SCpnt->cmnd != rq->cmd) { |
848 | mempool_free(SCpnt->cmnd, sd_cdb_pool); | 847 | mempool_free(SCpnt->cmnd, sd_cdb_pool); |
849 | SCpnt->cmnd = NULL; | 848 | SCpnt->cmnd = NULL; |
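Both skd and sd stop digging the discard payload page out of the request's bio at completion time; the page is remembered in rq->completion_data when the payload is attached and freed from there. The pattern, sketched with hypothetical driver names and the descriptor fill elided:

/* prep side: attach a zeroed payload page and remember it for completion */
static int my_setup_discard(struct request *rq, unsigned int len)
{
	struct page *page = alloc_page(GFP_ATOMIC | __GFP_ZERO);

	if (!page)
		return BLKPREP_DEFER;

	/* ...write the discard/unmap range descriptor into the page... */

	rq->completion_data = page;
	blk_add_request_payload(rq, page, len);
	return BLKPREP_OK;
}

/* completion/unprep side: the page comes straight back out of the request */
static void my_teardown_discard(struct request *rq)
{
	if (rq->cmd_flags & REQ_DISCARD)
		__free_page(rq->completion_data);
}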
diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ | |||
14 | stack.o fs_struct.o statfs.o | 14 | stack.o fs_struct.o statfs.o |
15 | 15 | ||
16 | ifeq ($(CONFIG_BLOCK),y) | 16 | ifeq ($(CONFIG_BLOCK),y) |
17 | obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o | 17 | obj-y += buffer.o block_dev.o direct-io.o mpage.o |
18 | else | 18 | else |
19 | obj-y += no-block.o | 19 | obj-y += no-block.o |
20 | endif | 20 | endif |
21 | 21 | ||
22 | obj-$(CONFIG_PROC_FS) += proc_namespace.o | 22 | obj-$(CONFIG_PROC_FS) += proc_namespace.o |
23 | 23 | ||
24 | obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o | ||
25 | obj-y += notify/ | 24 | obj-y += notify/ |
26 | obj-$(CONFIG_EPOLL) += eventpoll.o | 25 | obj-$(CONFIG_EPOLL) += eventpoll.o |
27 | obj-$(CONFIG_ANON_INODES) += anon_inodes.o | 26 | obj-$(CONFIG_ANON_INODES) += anon_inodes.o |
diff --git a/include/linux/bio.h b/include/linux/bio.h index bba550826921..5a645769f020 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
@@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, | |||
333 | 333 | ||
334 | extern struct bio_set *bioset_create(unsigned int, unsigned int); | 334 | extern struct bio_set *bioset_create(unsigned int, unsigned int); |
335 | extern void bioset_free(struct bio_set *); | 335 | extern void bioset_free(struct bio_set *); |
336 | extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); | 336 | extern mempool_t *biovec_create_pool(int pool_entries); |
337 | 337 | ||
338 | extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); | 338 | extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); |
339 | extern void bio_put(struct bio *); | 339 | extern void bio_put(struct bio *); |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ab469d525894..2bd82f399128 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
@@ -8,7 +8,13 @@ struct blk_mq_tags; | |||
8 | struct blk_mq_cpu_notifier { | 8 | struct blk_mq_cpu_notifier { |
9 | struct list_head list; | 9 | struct list_head list; |
10 | void *data; | 10 | void *data; |
11 | void (*notify)(void *data, unsigned long action, unsigned int cpu); | 11 | int (*notify)(void *data, unsigned long action, unsigned int cpu); |
12 | }; | ||
13 | |||
14 | struct blk_mq_ctxmap { | ||
15 | unsigned int map_size; | ||
16 | unsigned int bits_per_word; | ||
17 | struct blk_align_bitmap *map; | ||
12 | }; | 18 | }; |
13 | 19 | ||
14 | struct blk_mq_hw_ctx { | 20 | struct blk_mq_hw_ctx { |
@@ -21,6 +27,8 @@ struct blk_mq_hw_ctx { | |||
21 | struct delayed_work run_work; | 27 | struct delayed_work run_work; |
22 | struct delayed_work delay_work; | 28 | struct delayed_work delay_work; |
23 | cpumask_var_t cpumask; | 29 | cpumask_var_t cpumask; |
30 | int next_cpu; | ||
31 | int next_cpu_batch; | ||
24 | 32 | ||
25 | unsigned long flags; /* BLK_MQ_F_* flags */ | 33 | unsigned long flags; /* BLK_MQ_F_* flags */ |
26 | 34 | ||
@@ -29,10 +37,12 @@ struct blk_mq_hw_ctx { | |||
29 | 37 | ||
30 | void *driver_data; | 38 | void *driver_data; |
31 | 39 | ||
40 | struct blk_mq_ctxmap ctx_map; | ||
41 | |||
32 | unsigned int nr_ctx; | 42 | unsigned int nr_ctx; |
33 | struct blk_mq_ctx **ctxs; | 43 | struct blk_mq_ctx **ctxs; |
34 | unsigned int nr_ctx_map; | 44 | |
35 | unsigned long *ctx_map; | 45 | unsigned int wait_index; |
36 | 46 | ||
37 | struct blk_mq_tags *tags; | 47 | struct blk_mq_tags *tags; |
38 | 48 | ||
@@ -44,6 +54,8 @@ struct blk_mq_hw_ctx { | |||
44 | unsigned int numa_node; | 54 | unsigned int numa_node; |
45 | unsigned int cmd_size; /* per-request extra data */ | 55 | unsigned int cmd_size; /* per-request extra data */ |
46 | 56 | ||
57 | atomic_t nr_active; | ||
58 | |||
47 | struct blk_mq_cpu_notifier cpu_notifier; | 59 | struct blk_mq_cpu_notifier cpu_notifier; |
48 | struct kobject kobj; | 60 | struct kobject kobj; |
49 | }; | 61 | }; |
@@ -51,7 +63,7 @@ struct blk_mq_hw_ctx { | |||
51 | struct blk_mq_tag_set { | 63 | struct blk_mq_tag_set { |
52 | struct blk_mq_ops *ops; | 64 | struct blk_mq_ops *ops; |
53 | unsigned int nr_hw_queues; | 65 | unsigned int nr_hw_queues; |
54 | unsigned int queue_depth; | 66 | unsigned int queue_depth; /* max hw supported */ |
55 | unsigned int reserved_tags; | 67 | unsigned int reserved_tags; |
56 | unsigned int cmd_size; /* per-request extra data */ | 68 | unsigned int cmd_size; /* per-request extra data */ |
57 | int numa_node; | 69 | int numa_node; |
@@ -60,12 +72,15 @@ struct blk_mq_tag_set { | |||
60 | void *driver_data; | 72 | void *driver_data; |
61 | 73 | ||
62 | struct blk_mq_tags **tags; | 74 | struct blk_mq_tags **tags; |
75 | |||
76 | struct mutex tag_list_lock; | ||
77 | struct list_head tag_list; | ||
63 | }; | 78 | }; |
64 | 79 | ||
65 | typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); | 80 | typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); |
66 | typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); | 81 | typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); |
67 | typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *, | 82 | typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *, |
68 | unsigned int); | 83 | unsigned int, int); |
69 | typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); | 84 | typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); |
70 | typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); | 85 | typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); |
71 | typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); | 86 | typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); |
@@ -122,11 +137,14 @@ enum { | |||
122 | 137 | ||
123 | BLK_MQ_F_SHOULD_MERGE = 1 << 0, | 138 | BLK_MQ_F_SHOULD_MERGE = 1 << 0, |
124 | BLK_MQ_F_SHOULD_SORT = 1 << 1, | 139 | BLK_MQ_F_SHOULD_SORT = 1 << 1, |
125 | BLK_MQ_F_SHOULD_IPI = 1 << 2, | 140 | BLK_MQ_F_TAG_SHARED = 1 << 2, |
126 | 141 | ||
127 | BLK_MQ_S_STOPPED = 0, | 142 | BLK_MQ_S_STOPPED = 0, |
143 | BLK_MQ_S_TAG_ACTIVE = 1, | ||
128 | 144 | ||
129 | BLK_MQ_MAX_DEPTH = 2048, | 145 | BLK_MQ_MAX_DEPTH = 2048, |
146 | |||
147 | BLK_MQ_CPU_WORK_BATCH = 8, | ||
130 | }; | 148 | }; |
131 | 149 | ||
132 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); | 150 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); |
@@ -142,19 +160,20 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); | |||
142 | void blk_mq_run_queues(struct request_queue *q, bool async); | 160 | void blk_mq_run_queues(struct request_queue *q, bool async); |
143 | void blk_mq_free_request(struct request *rq); | 161 | void blk_mq_free_request(struct request *rq); |
144 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *); | 162 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *); |
145 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); | 163 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, |
146 | struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); | 164 | gfp_t gfp, bool reserved); |
147 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); | 165 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); |
148 | 166 | ||
149 | struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); | 167 | struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); |
150 | struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int); | 168 | struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); |
151 | void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); | 169 | void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); |
152 | 170 | ||
153 | void blk_mq_end_io(struct request *rq, int error); | 171 | void blk_mq_end_io(struct request *rq, int error); |
154 | void __blk_mq_end_io(struct request *rq, int error); | 172 | void __blk_mq_end_io(struct request *rq, int error); |
155 | 173 | ||
156 | void blk_mq_requeue_request(struct request *rq); | 174 | void blk_mq_requeue_request(struct request *rq); |
157 | 175 | void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); | |
176 | void blk_mq_kick_requeue_list(struct request_queue *q); | ||
158 | void blk_mq_complete_request(struct request *rq); | 177 | void blk_mq_complete_request(struct request *rq); |
159 | 178 | ||
160 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); | 179 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); |
@@ -163,6 +182,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); | |||
163 | void blk_mq_start_hw_queues(struct request_queue *q); | 182 | void blk_mq_start_hw_queues(struct request_queue *q); |
164 | void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); | 183 | void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); |
165 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); | 184 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); |
185 | void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); | ||
166 | 186 | ||
167 | /* | 187 | /* |
168 | * Driver command data is immediately after the request. So subtract request | 188 | * Driver command data is immediately after the request. So subtract request |
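For context on the blk_mq_tag_set changes: the tag set is what a driver now registers once and may share across request queues, hence the new tag_list/tag_list_lock bookkeeping. A minimal, hypothetical setup against the 3.16 interface; blk_mq_alloc_tag_set()/blk_mq_free_tag_set() and the tag set's timeout/flags members come from earlier patches in this series and are not visible in this hunk:

static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	/* a real driver would issue rq to hardware here */
	blk_mq_end_io(rq, 0);
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.map_queue	= blk_mq_map_queue,
};

static int my_init_queue(struct my_device *dd)
{
	memset(&dd->tag_set, 0, sizeof(dd->tag_set));
	dd->tag_set.ops			= &my_mq_ops;
	dd->tag_set.nr_hw_queues	= 1;
	dd->tag_set.queue_depth		= 64;	/* max hw supported */
	dd->tag_set.reserved_tags	= 1;	/* e.g. one internal command */
	dd->tag_set.cmd_size		= sizeof(struct my_cmd);
	dd->tag_set.numa_node		= NUMA_NO_NODE;
	dd->tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;
	dd->tag_set.driver_data		= dd;

	if (blk_mq_alloc_tag_set(&dd->tag_set))
		return -ENOMEM;

	dd->queue = blk_mq_init_queue(&dd->tag_set);
	if (IS_ERR_OR_NULL(dd->queue)) {
		blk_mq_free_tag_set(&dd->tag_set);
		return -ENOMEM;
	}
	return 0;
}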
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2d0bd8..d8e4cea23a25 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -190,6 +190,7 @@ enum rq_flag_bits { | |||
190 | __REQ_PM, /* runtime pm request */ | 190 | __REQ_PM, /* runtime pm request */ |
191 | __REQ_END, /* last of chain of requests */ | 191 | __REQ_END, /* last of chain of requests */ |
192 | __REQ_HASHED, /* on IO scheduler merge hash */ | 192 | __REQ_HASHED, /* on IO scheduler merge hash */ |
193 | __REQ_MQ_INFLIGHT, /* track inflight for MQ */ | ||
193 | __REQ_NR_BITS, /* stops here */ | 194 | __REQ_NR_BITS, /* stops here */ |
194 | }; | 195 | }; |
195 | 196 | ||
@@ -243,5 +244,6 @@ enum rq_flag_bits { | |||
243 | #define REQ_PM (1ULL << __REQ_PM) | 244 | #define REQ_PM (1ULL << __REQ_PM) |
244 | #define REQ_END (1ULL << __REQ_END) | 245 | #define REQ_END (1ULL << __REQ_END) |
245 | #define REQ_HASHED (1ULL << __REQ_HASHED) | 246 | #define REQ_HASHED (1ULL << __REQ_HASHED) |
247 | #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) | ||
246 | 248 | ||
247 | #endif /* __LINUX_BLK_TYPES_H */ | 249 | #endif /* __LINUX_BLK_TYPES_H */ |
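REQ_MQ_INFLIGHT pairs with the nr_active counter added to blk_mq_hw_ctx earlier in this diff: the flag presumably marks requests that were counted against a shared tag set on dispatch, so that only those decrement the counter when they complete. Roughly, as a sketch rather than the patch's actual accounting code:

static void my_account_start(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		rq->cmd_flags |= REQ_MQ_INFLIGHT;
		atomic_inc(&hctx->nr_active);
	}
}

static void my_account_done(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);
}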
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 74ee55fefcf0..e90e1692e052 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -90,15 +90,15 @@ enum rq_cmd_type_bits { | |||
90 | #define BLK_MAX_CDB 16 | 90 | #define BLK_MAX_CDB 16 |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * try to put the fields that are referenced together in the same cacheline. | 93 | * Try to put the fields that are referenced together in the same cacheline. |
94 | * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() | 94 | * |
95 | * as well! | 95 | * If you modify this structure, make sure to update blk_rq_init() and |
96 | * especially blk_mq_rq_ctx_init() to take care of the added fields. | ||
96 | */ | 97 | */ |
97 | struct request { | 98 | struct request { |
98 | struct list_head queuelist; | 99 | struct list_head queuelist; |
99 | union { | 100 | union { |
100 | struct call_single_data csd; | 101 | struct call_single_data csd; |
101 | struct work_struct requeue_work; | ||
102 | unsigned long fifo_time; | 102 | unsigned long fifo_time; |
103 | }; | 103 | }; |
104 | 104 | ||
@@ -462,6 +462,10 @@ struct request_queue { | |||
462 | struct request *flush_rq; | 462 | struct request *flush_rq; |
463 | spinlock_t mq_flush_lock; | 463 | spinlock_t mq_flush_lock; |
464 | 464 | ||
465 | struct list_head requeue_list; | ||
466 | spinlock_t requeue_lock; | ||
467 | struct work_struct requeue_work; | ||
468 | |||
465 | struct mutex sysfs_lock; | 469 | struct mutex sysfs_lock; |
466 | 470 | ||
467 | int bypass_depth; | 471 | int bypass_depth; |
@@ -480,6 +484,9 @@ struct request_queue { | |||
480 | wait_queue_head_t mq_freeze_wq; | 484 | wait_queue_head_t mq_freeze_wq; |
481 | struct percpu_counter mq_usage_counter; | 485 | struct percpu_counter mq_usage_counter; |
482 | struct list_head all_q_node; | 486 | struct list_head all_q_node; |
487 | |||
488 | struct blk_mq_tag_set *tag_set; | ||
489 | struct list_head tag_set_list; | ||
483 | }; | 490 | }; |
484 | 491 | ||
485 | #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ | 492 | #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ |
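The requeue_list/requeue_lock/requeue_work trio added to struct request_queue here backs the blk_mq_add_to_requeue_list()/blk_mq_kick_requeue_list() pair declared in the blk-mq.h hunk above: requests are parked on q->requeue_list and reinserted from q->requeue_work instead of from whatever (possibly atomic) context decided to requeue them. A hypothetical caller; how much of this blk_mq_requeue_request() already does on its own is not visible in these hunks:

/*
 * Hand an un-issued request back to the core from a context that must not
 * block (e.g. an IRQ handler); the actual reinsertion happens later from
 * q->requeue_work.
 */
static void my_park_request(struct request *rq, bool at_head)
{
	blk_mq_add_to_requeue_list(rq, at_head);
	blk_mq_kick_requeue_list(rq->q);
}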
diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..0173940407f6 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -30,7 +30,6 @@ endif | |||
30 | 30 | ||
31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
32 | 32 | ||
33 | obj-$(CONFIG_BOUNCE) += bounce.o | ||
34 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 33 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
35 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | 34 | obj-$(CONFIG_FRONTSWAP) += frontswap.o |
36 | obj-$(CONFIG_ZSWAP) += zswap.o | 35 | obj-$(CONFIG_ZSWAP) += zswap.o |