author		Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
committer	Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
commit		d06efebf0c37d438fcf07057be00dd40fcfce08d (patch)
tree		31a0786d132aadf4cbb9725f3f444ef6e1052128 /block/blk-mq.c
parent		bb2e226b3bef596dd56be97df655d857b4603923 (diff)
parent		0a30288da1aec914e158c2d7a3482a85f632750f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into for-3.18
This is to receive 0a30288da1ae ("blk-mq, percpu_ref: implement a
kludge for SCSI blk-mq stall during probe"), which implements
__percpu_ref_kill_expedited() to work around the SCSI blk-mq stall.
The commit will be reverted and patches to implement a proper fix
will be added.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
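
For context, the stall comes from the probe pattern sketched below: SCSI allocates and tears down a blk-mq queue for every address it scans, and each teardown freezes the queue and waits out an RCU grace period. This is an illustrative sketch only, not code from the merged branches; example_device_present() and the tag-set handling are placeholders, and real SCSI scanning goes through the scsi_scan machinery rather than calling blk-mq directly.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

static bool example_device_present(unsigned int lun)
{
	return false;	/* placeholder: pretend every scanned LUN is absent */
}

/* Illustrative only: why back-to-back queue create/destroy stalls probe. */
static void example_scan_target(struct blk_mq_tag_set *set, unsigned int max_lun)
{
	unsigned int lun;

	for (lun = 0; lun < max_lun; lun++) {
		struct request_queue *q = blk_mq_init_queue(set);

		if (IS_ERR(q))
			continue;

		/*
		 * A miss tears the queue straight back down.  The teardown
		 * freezes the queue, which kills q->mq_usage_counter and
		 * waits for an RCU grace period before the ref can drain;
		 * hundreds of these in a row add up to a lengthy stall.
		 */
		if (!example_device_present(lun)) {
			blk_cleanup_queue(q);
			continue;
		}

		/* a present device would keep q and register a gendisk here */
	}
}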
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--	block/blk-mq.c	162
1 file changed, 119 insertions(+), 43 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 702df07b980d..255d79c14dc1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -112,18 +112,31 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
  */
 void blk_mq_freeze_queue(struct request_queue *q)
 {
+	bool freeze;
+
 	spin_lock_irq(q->queue_lock);
-	q->mq_freeze_depth++;
+	freeze = !q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
-	percpu_ref_kill(&q->mq_usage_counter);
-	blk_mq_run_queues(q, false);
+	if (freeze) {
+		/*
+		 * XXX: Temporary kludge to work around SCSI blk-mq stall.
+		 * SCSI synchronously creates and destroys many queues
+		 * back-to-back during probe leading to lengthy stalls.
+		 * This will be fixed by keeping ->mq_usage_counter in
+		 * atomic mode until genhd registration, but, for now,
+		 * let's work around using expedited synchronization.
+		 */
+		__percpu_ref_kill_expedited(&q->mq_usage_counter);
+
+		blk_mq_run_queues(q, false);
+	}
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake = false;
+	bool wake;
 
 	spin_lock_irq(q->queue_lock);
 	wake = !--q->mq_freeze_depth;
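
With this hunk, only the 0->1 transition of mq_freeze_depth kills the usage counter, so freezing now nests cheaply. A minimal sketch of the intended pairing, assuming the unfreeze side keeps its reinit-and-wake behaviour on the final drop (illustrative only; blk_mq_unfreeze_queue() is still static to blk-mq.c at this point):

static void example_nested_freeze(struct request_queue *q)
{
	blk_mq_freeze_queue(q);		/* depth 0 -> 1: kills mq_usage_counter */
	blk_mq_freeze_queue(q);		/* depth 1 -> 2: counter already dead */

	/* queue is quiesced here: safe to change queue-wide state */

	blk_mq_unfreeze_queue(q);	/* depth 2 -> 1: still frozen */
	blk_mq_unfreeze_queue(q);	/* depth 1 -> 0: reinit the ref, wake waiters */
}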
@@ -172,6 +185,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* tag was already set */
 	rq->errors = 0;
 
+	rq->cmd = rq->__cmd;
+
 	rq->extra_len = 0;
 	rq->sense_len = 0;
 	rq->resid_len = 0;
@@ -197,7 +212,6 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = data->hctx->tags->rqs[tag];
 
-		rq->cmd_flags = 0;
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->cmd_flags = REQ_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
@@ -252,6 +266,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
+	rq->cmd_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
@@ -387,6 +402,12 @@ static void blk_mq_start_request(struct request *rq, bool last)
 	blk_add_timer(rq);
 
 	/*
+	 * Ensure that ->deadline is visible before set the started
+	 * flag and clear the completed flag.
+	 */
+	smp_mb__before_atomic();
+
+	/*
 	 * Mark us as started and clear complete. Complete might have been
 	 * set if requeue raced with timeout, which then marked it as
 	 * complete. So be sure to clear complete again when we start
@@ -467,7 +488,11 @@ static void blk_mq_requeue_work(struct work_struct *work)
 		blk_mq_insert_request(rq, false, false, false);
 	}
 
-	blk_mq_run_queues(q, false);
+	/*
+	 * Use the start variant of queue running here, so that running
+	 * the requeue work will kick stopped queues.
+	 */
+	blk_mq_start_hw_queues(q);
 }
 
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
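
The comment above explains the switch to the start variant: a driver that stopped its hardware queues on a busy condition and then requeued a request needs the requeue work to restart those queues, or the request would sit there until something else ran the queue. A hedged sketch of that driver-side pattern, modelled loosely on how scsi-mq requeues; the surrounding error handling is assumed, not taken from this patch:

static void example_handle_busy(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_stop_hw_queues(q);	/* hold off further dispatch */
	blk_mq_requeue_request(rq);	/* park rq on the queue's requeue list */
	blk_mq_kick_requeue_list(q);	/* requeue work re-inserts rq and, with
					 * this change, restarts the stopped
					 * hardware queues as well */
}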
@@ -951,14 +976,9 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
-	    !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
-		blk_insert_flush(rq);
-	} else {
-		spin_lock(&ctx->lock);
-		__blk_mq_insert_request(hctx, rq, at_head);
-		spin_unlock(&ctx->lock);
-	}
+	spin_lock(&ctx->lock);
+	__blk_mq_insert_request(hctx, rq, at_head);
+	spin_unlock(&ctx->lock);
 
 	if (run_queue)
 		blk_mq_run_hw_queue(hctx, async);
@@ -1068,13 +1088,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 	blk_account_io_start(rq, 1);
 }
 
+static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
+{
+	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+		!blk_queue_nomerges(hctx->queue);
+}
+
 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 					 struct blk_mq_ctx *ctx,
 					 struct request *rq, struct bio *bio)
 {
-	struct request_queue *q = hctx->queue;
-
-	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
+	if (!hctx_allow_merges(hctx)) {
 		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:
@@ -1082,6 +1106,8 @@ insert_rq:
 		spin_unlock(&ctx->lock);
 		return false;
 	} else {
+		struct request_queue *q = hctx->queue;
+
 		spin_lock(&ctx->lock);
 		if (!blk_mq_attempt_merge(q, ctx, bio)) {
 			blk_mq_bio_to_request(rq, bio);
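
hctx_allow_merges() means blk-mq bio merging now honours the queue-level no-merge setting as well as the driver's BLK_MQ_F_SHOULD_MERGE flag. A small illustrative sketch of that second knob, which is the same flag userspace toggles with 'echo 2 > /sys/block/<dev>/queue/nomerges'; not part of the patch:

static void example_disable_merging(struct request_queue *q)
{
	/* after this, hctx_allow_merges() is false for every hctx of q */
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, q);
}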
@@ -1309,6 +1335,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 				continue;
 			set->ops->exit_request(set->driver_data, tags->rqs[i],
 						hctx_idx, i);
+			tags->rqs[i] = NULL;
 		}
 	}
 
@@ -1342,8 +1369,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 
 	INIT_LIST_HEAD(&tags->page_list);
 
-	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
-				 GFP_KERNEL, set->numa_node);
+	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				 set->numa_node);
 	if (!tags->rqs) {
 		blk_mq_free_tags(tags);
 		return NULL;
@@ -1367,8 +1395,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
-						this_order);
+			page = alloc_pages_node(set->numa_node,
+				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				this_order);
 			if (page)
 				break;
 			if (!this_order--)
@@ -1389,11 +1418,15 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
 			tags->rqs[i] = p;
+			tags->rqs[i]->atomic_flags = 0;
+			tags->rqs[i]->cmd_flags = 0;
 			if (set->ops->init_request) {
 				if (set->ops->init_request(set->driver_data,
 						tags->rqs[i], hctx_idx, i,
-						set->numa_node))
+						set->numa_node)) {
+					tags->rqs[i] = NULL;
 					goto fail;
+				}
 			}
 
 			p += rq_size;
@@ -1404,7 +1437,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 
 fail:
-	pr_warn("%s: failed to allocate requests\n", __func__);
 	blk_mq_free_rq_map(set, tags, hctx_idx);
 	return NULL;
 }
@@ -1574,7 +1606,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->tags = set->tags[i];
 
 		/*
-		 * Allocate space for all possible cpus to avoid allocation in
+		 * Allocate space for all possible cpus to avoid allocation at
 		 * runtime
 		 */
 		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
@@ -1662,8 +1694,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		/*
-		 * If not software queues are mapped to this hardware queue,
-		 * disable it and free the request entries
+		 * If no software queues are mapped to this hardware queue,
+		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
 			struct blk_mq_tag_set *set = q->tag_set;
@@ -1713,14 +1745,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
-	blk_mq_freeze_queue(q);
-
 	mutex_lock(&set->tag_list_lock);
 	list_del_init(&q->tag_set_list);
 	blk_mq_update_tag_set_depth(set);
 	mutex_unlock(&set->tag_list_lock);
-
-	blk_mq_unfreeze_queue(q);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -1929,6 +1957,60 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	return 0;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+
+	return -ENOMEM;
+}
+
+/*
+ * Allocate the request maps associated with this tag_set. Note that this
+ * may reduce the depth asked for, if memory is tight. set->queue_depth
+ * will be updated to reflect the allocated depth.
+ */
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	unsigned int depth;
+	int err;
+
+	depth = set->queue_depth;
+	do {
+		err = __blk_mq_alloc_rq_maps(set);
+		if (!err)
+			break;
+
+		set->queue_depth >>= 1;
+		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
+			err = -ENOMEM;
+			break;
+		}
+	} while (set->queue_depth);
+
+	if (!set->queue_depth || err) {
+		pr_err("blk-mq: failed to allocate request map\n");
+		return -ENOMEM;
+	}
+
+	if (depth != set->queue_depth)
+		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
+						depth, set->queue_depth);
+
+	return 0;
+}
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -1937,8 +2019,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int i;
-
 	if (!set->nr_hw_queues)
 		return -EINVAL;
 	if (!set->queue_depth)
@@ -1959,23 +2039,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 				 sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
-		goto out;
+		return -ENOMEM;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_init_rq_map(set, i);
-		if (!set->tags[i])
-			goto out_unwind;
-	}
+	if (blk_mq_alloc_rq_maps(set))
+		goto enomem;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
-
-out_unwind:
-	while (--i >= 0)
-		blk_mq_free_rq_map(set, set->tags[i], i);
-out:
+enomem:
+	kfree(set->tags);
+	set->tags = NULL;
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
@@ -1990,6 +2065,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	}
 
 	kfree(set->tags);
+	set->tags = NULL;
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
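
Taken together, the tag-map changes are mostly visible to drivers through blk_mq_alloc_tag_set(): it can now hand back a smaller set->queue_depth instead of failing outright when memory is tight, and blk_mq_free_tag_set() clears set->tags so a later re-registration starts clean. A minimal driver-side sketch under those assumptions; example_mq_ops, the depth of 128 and the pr_info() are placeholders, not code from this series:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

static struct blk_mq_ops example_mq_ops;	/* a real driver fills in .queue_rq etc. */
static struct blk_mq_tag_set example_set;

static struct request_queue *example_create_queue(void)
{
	struct request_queue *q;

	example_set.ops		 = &example_mq_ops;
	example_set.nr_hw_queues = 1;
	example_set.queue_depth	 = 128;		/* may come back reduced */
	example_set.numa_node	 = NUMA_NO_NODE;
	example_set.flags	 = BLK_MQ_F_SHOULD_MERGE;

	if (blk_mq_alloc_tag_set(&example_set))
		return NULL;

	pr_info("example: allocated tag depth %u\n", example_set.queue_depth);

	q = blk_mq_init_queue(&example_set);
	if (IS_ERR(q)) {
		blk_mq_free_tag_set(&example_set);	/* also NULLs set->tags now */
		return NULL;
	}

	return q;
}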