author    Keith Busch <keith.busch@intel.com>  2015-12-17 19:08:14 -0500
committer Jens Axboe <axboe@fb.com>            2016-02-09 14:42:17 -0500
commit    868f2f0b72068a097508b6e8870a8950fd8eb7ef
tree      f88aed0e80d7b08c7a7907d8bcf4ff6df0150b66
parent    3984aa55204e2c3f423a70b013c44c64261788df
blk-mq: dynamic h/w context count
The hardware's provided queue count may change at runtime with resource
provisioning. This patch allows a block driver to alter the number of h/w
queues available when its resource count changes.

The main part is a new blk-mq API to request a new number of h/w queues for a
given live tag set. The new API freezes all queues using that set, then
adjusts the allocated count prior to remapping these to CPUs. The bulk of the
rest just shifts where h/w contexts and all their artifacts are allocated and
freed.

The maximum number of h/w contexts is capped to the number of possible CPUs,
since there is no use for more than that. As such, all pre-allocated memory
for pointers needs to account for the maximum possible rather than the
initial number of queues.

A side effect of this is that blk-mq will proceed successfully as long as it
can allocate at least one h/w context. Previously it would fail request queue
initialization if fewer than the requested number were allocated.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Jon Derrick <jonathan.derrick@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
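For illustration, here is a minimal driver-side sketch of how the new
interface might be used. This is hypothetical: the handler name and the
"resource count changed" trigger are assumptions; only
blk_mq_update_nr_hw_queues() itself comes from this patch.

	/*
	 * Hypothetical driver callback, invoked after the device reports a
	 * changed queue count (e.g. following resource re-provisioning).
	 * blk_mq_update_nr_hw_queues() freezes every request queue sharing
	 * the tag set, reallocates the h/w contexts, remaps them to CPUs and
	 * unfreezes the queues, so the caller does not quiesce I/O itself.
	 */
	static void example_queue_count_changed(struct blk_mq_tag_set *set,
						unsigned int new_count)
	{
		/* blk-mq clamps the request to nr_cpu_ids and ignores no-ops */
		blk_mq_update_nr_hw_queues(set, new_count);
	}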
Diffstat (limited to 'block')
-rw-r--r--  block/blk-mq-sysfs.c |   9
-rw-r--r--  block/blk-mq.c       | 173
-rw-r--r--  block/blk-mq.h       |   1
3 files changed, 110 insertions(+), 73 deletions(-)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1cf18784c5cf..431fdda21737 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	blk_mq_enable_hotplug();
 }
 
+void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
+{
+	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+}
+
 static void blk_mq_sysfs_init(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	int i;
 
 	kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-	queue_for_each_hw_ctx(q, hctx, i)
-		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-
 	queue_for_each_ctx(q, ctx, i)
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c0622fae413..645eb9e716d0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1742,31 +1742,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	return -1;
 }
 
-static int blk_mq_init_hw_queues(struct request_queue *q,
-		struct blk_mq_tag_set *set)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-
-	/*
-	 * Initialize hardware queues
-	 */
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (blk_mq_init_hctx(q, set, hctx, i))
-			break;
-	}
-
-	if (i == q->nr_hw_queues)
-		return 0;
-
-	/*
-	 * Init failed
-	 */
-	blk_mq_exit_hw_queues(q, set, i);
-
-	return 1;
-}
-
 static void blk_mq_init_cpu_queues(struct request_queue *q,
 				   unsigned int nr_hw_queues)
 {
@@ -1824,6 +1799,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
+
 		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
@@ -1972,54 +1948,89 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-		struct request_queue *q)
-{
-	struct blk_mq_hw_ctx **hctxs;
-	struct blk_mq_ctx __percpu *ctx;
-	unsigned int *map;
-	int i;
-
-	ctx = alloc_percpu(struct blk_mq_ctx);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-
-	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-			set->numa_node);
-
-	if (!hctxs)
-		goto err_percpu;
-
-	map = blk_mq_make_queue_map(set);
-	if (!map)
-		goto err_map;
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+						struct request_queue *q)
+{
+	int i, j;
+	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
 
+	blk_mq_sysfs_unregister(q);
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		int node = blk_mq_hw_queue_to_node(map, i);
+		int node;
 
+		if (hctxs[i])
+			continue;
+
+		node = blk_mq_hw_queue_to_node(q->mq_map, i);
 		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
 					GFP_KERNEL, node);
 		if (!hctxs[i])
-			goto err_hctxs;
+			break;
 
 		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
-				node))
-			goto err_hctxs;
+				node)) {
+			kfree(hctxs[i]);
+			hctxs[i] = NULL;
+			break;
+		}
 
 		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = node;
 		hctxs[i]->queue_num = i;
+
+		if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
+			free_cpumask_var(hctxs[i]->cpumask);
+			kfree(hctxs[i]);
+			hctxs[i] = NULL;
+			break;
+		}
+		blk_mq_hctx_kobj_init(hctxs[i]);
 	}
+	for (j = i; j < q->nr_hw_queues; j++) {
+		struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+		if (hctx) {
+			if (hctx->tags) {
+				blk_mq_free_rq_map(set, hctx->tags, j);
+				set->tags[j] = NULL;
+			}
+			blk_mq_exit_hctx(q, set, hctx, j);
+			free_cpumask_var(hctx->cpumask);
+			kobject_put(&hctx->kobj);
+			kfree(hctx->ctxs);
+			kfree(hctx);
+			hctxs[j] = NULL;
+
+		}
+	}
+	q->nr_hw_queues = i;
+	blk_mq_sysfs_register(q);
+}
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q)
+{
+	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+	if (!q->queue_ctx)
+		return ERR_PTR(-ENOMEM);
+
+	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
+						GFP_KERNEL, set->numa_node);
+	if (!q->queue_hw_ctx)
+		goto err_percpu;
+
+	q->mq_map = blk_mq_make_queue_map(set);
+	if (!q->mq_map)
+		goto err_map;
+
+	blk_mq_realloc_hw_ctxs(set, q);
+	if (!q->nr_hw_queues)
+		goto err_hctxs;
 
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
 	q->nr_queues = nr_cpu_ids;
-	q->nr_hw_queues = set->nr_hw_queues;
-	q->mq_map = map;
-
-	q->queue_ctx = ctx;
-	q->queue_hw_ctx = hctxs;
 
 	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
@@ -2048,9 +2059,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-	if (blk_mq_init_hw_queues(q, set))
-		goto err_hctxs;
-
 	get_online_cpus();
 	mutex_lock(&all_q_mutex);
 
@@ -2064,17 +2072,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	return q;
 
 err_hctxs:
-	kfree(map);
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (!hctxs[i])
-			break;
-		free_cpumask_var(hctxs[i]->cpumask);
-		kfree(hctxs[i]);
-	}
+	kfree(q->mq_map);
 err_map:
-	kfree(hctxs);
+	kfree(q->queue_hw_ctx);
 err_percpu:
-	free_percpu(ctx);
+	free_percpu(q->queue_ctx);
 	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -2282,9 +2284,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->nr_hw_queues = 1;
 		set->queue_depth = min(64U, set->queue_depth);
 	}
+	/*
+	 * There is no use for more h/w queues than cpus.
+	 */
+	if (set->nr_hw_queues > nr_cpu_ids)
+		set->nr_hw_queues = nr_cpu_ids;
 
-	set->tags = kmalloc_node(set->nr_hw_queues *
-				 sizeof(struct blk_mq_tags *),
+	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
 		return -ENOMEM;
@@ -2307,7 +2313,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 		if (set->tags[i])
 			blk_mq_free_rq_map(set, set->tags[i], i);
 	}
@@ -2339,6 +2345,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	return ret;
 }
 
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+{
+	struct request_queue *q;
+
+	if (nr_hw_queues > nr_cpu_ids)
+		nr_hw_queues = nr_cpu_ids;
+	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+		return;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_freeze_queue(q);
+
+	set->nr_hw_queues = nr_hw_queues;
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_realloc_hw_ctxs(set, q);
+
+		if (q->nr_hw_queues > 1)
+			blk_queue_make_request(q, blk_mq_make_request);
+		else
+			blk_queue_make_request(q, blk_sq_make_request);
+
+		blk_mq_queue_reinit(q, cpu_online_mask);
+	}
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eaede8e45c9c..9087b11037b7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
  */
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
+extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
 extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 