author:    Akinobu Mita <akinobu.mita@gmail.com>    2015-09-26 13:09:23 -0400
committer: Jens Axboe <axboe@fb.com>                2015-09-29 13:32:50 -0400
commit:    5778322e67ed34dc9f391a4a5cbcbb856071ceba
tree:      142b9e7ac2a85dc3ee7cfd403f53b56cb2921957
parent:    0e6263682014d480b8d7b8c10287f4536066b54f
blk-mq: avoid inserting requests before establishing new mapping
Notifier callbacks for the CPU_ONLINE action can run on a CPU other than the one that was just onlined. It is therefore possible for a process running on the just-onlined CPU to insert requests and run the hw queue before the new mapping is established by blk_mq_queue_reinit_notify().

This causes a problem the first time the CPU is onlined after the request queue was initialized: at that point, ctx->index_hw for the CPU (the index into hctx->ctxs[] for this ctx) is still zero, because blk_mq_queue_reinit_notify() has not yet been invoked by the CPU_ONLINE notifier callbacks.

For example, suppose there is a single hw queue (hctx) and two CPU queues (ctx0 for CPU0, and ctx1 for CPU1). CPU1 is just onlined and a request is inserted into ctx1->rq_list; bit 0 is set in the pending bitmap because ctx1->index_hw is still zero.

Then, while running the hw queue, flush_busy_ctxs() finds bit 0 set in the pending bitmap and tries to retrieve requests from hctx->ctxs[0]->rq_list. But hctx->ctxs[0] points to ctx0, so the request in ctx1->rq_list is ignored.

Fix this by ensuring that the new mapping is established before the onlined CPU starts running.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
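The following is a minimal user-space sketch of the failure mode described above. The structures, field names (index_hw, pending, rq_list stand-in) and flush_busy_ctxs() here are deliberately simplified illustrations, not the actual blk-mq code: a request queued on the just-onlined CPU1 is flagged under a stale index_hw of zero, so the flush drains ctx0 instead and the request is never dispatched.

/*
 * Simplified stand-in for the blk-mq software/hardware queue mapping;
 * not the real kernel structures.
 */
#include <stdio.h>

struct ctx {
	int index_hw;   /* index into hctx->ctxs[], set by the remap */
	int nr_queued;  /* stand-in for the ctx->rq_list length */
};

struct hctx {
	struct ctx *ctxs[2];
	unsigned long pending;  /* one bit per software queue */
};

/* Insert: mark this ctx pending using its (possibly stale) index_hw. */
static void insert_request(struct hctx *h, struct ctx *c)
{
	c->nr_queued++;
	h->pending |= 1UL << c->index_hw;
}

/* Flush: for every pending bit, drain the ctx registered at that slot. */
static void flush_busy_ctxs(struct hctx *h)
{
	int bit;

	for (bit = 0; bit < 2; bit++) {
		if (!(h->pending & (1UL << bit)))
			continue;
		h->pending &= ~(1UL << bit);
		printf("draining ctxs[%d]: %d request(s)\n",
		       bit, h->ctxs[bit]->nr_queued);
		h->ctxs[bit]->nr_queued = 0;
	}
}

int main(void)
{
	struct ctx ctx0 = { .index_hw = 0, .nr_queued = 0 };
	struct ctx ctx1 = { .index_hw = 0, .nr_queued = 0 }; /* stale: remap has not run yet */
	struct hctx h = { .ctxs = { &ctx0, &ctx1 }, .pending = 0 };

	/* A request inserted on the just-onlined CPU1 sets bit 0, not bit 1. */
	insert_request(&h, &ctx1);

	/* The flush therefore drains ctx0 (which is empty) and never sees ctx1. */
	flush_busy_ctxs(&h);

	printf("requests still stuck in ctx1: %d\n", ctx1.nr_queued);
	return 0;
}

Running this prints that ctxs[0] is drained with zero requests while one request remains stuck in ctx1, which is exactly why the mapping must be established before the onlined CPU may insert requests, as the patch below does.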
Diffstat (limited to 'block')
-rw-r--r--   block/blk-mq-cpumap.c |  9
-rw-r--r--   block/blk-mq.c        | 59
-rw-r--r--   block/blk-mq.h        |  3
3 files changed, 52 insertions, 19 deletions
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1e28ddb656b8..8764c241e5bb 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
 	return cpu;
 }
 
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+			    const struct cpumask *online_mask)
 {
 	unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
 	cpumask_var_t cpus;
@@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
 
 	cpumask_clear(cpus);
 	nr_cpus = nr_uniq_cpus = 0;
-	for_each_online_cpu(i) {
+	for_each_cpu(i, online_mask) {
 		nr_cpus++;
 		first_sibling = get_first_sibling(i);
 		if (!cpumask_test_cpu(first_sibling, cpus))
@@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
 
 	queue = 0;
 	for_each_possible_cpu(i) {
-		if (!cpu_online(i)) {
+		if (!cpumask_test_cpu(i, online_mask)) {
 			map[i] = 0;
 			continue;
 		}
@@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 	if (!map)
 		return NULL;
 
-	if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
+	if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
 		return map;
 
 	kfree(map);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3a39184e82e5..a5dbd069c9da 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1789,7 +1789,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	}
 }
 
-static void blk_mq_map_swqueue(struct request_queue *q)
+static void blk_mq_map_swqueue(struct request_queue *q,
+			       const struct cpumask *online_mask)
 {
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
@@ -1806,7 +1807,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 */
 	queue_for_each_ctx(q, ctx, i) {
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		if (!cpu_online(i))
+		if (!cpumask_test_cpu(i, online_mask))
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
@@ -1852,7 +1853,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 
 	queue_for_each_ctx(q, ctx, i) {
-		if (!cpu_online(i))
+		if (!cpumask_test_cpu(i, online_mask))
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
@@ -2037,13 +2038,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (blk_mq_init_hw_queues(q, set))
 		goto err_hctxs;
 
+	get_online_cpus();
 	mutex_lock(&all_q_mutex);
 
 	list_add_tail(&q->all_q_node, &all_q_list);
 	blk_mq_add_queue_tag_set(set, q);
-	blk_mq_map_swqueue(q);
+	blk_mq_map_swqueue(q, cpu_online_mask);
 
 	mutex_unlock(&all_q_mutex);
+	put_online_cpus();
 
 	return q;
 
@@ -2080,13 +2083,14 @@ void blk_mq_free_queue(struct request_queue *q)
 }
 
 /* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
+static void blk_mq_queue_reinit(struct request_queue *q,
+				const struct cpumask *online_mask)
 {
 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
-	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
 
 	/*
 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
@@ -2094,7 +2098,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 	 * involves free and re-allocate memory, worthy doing?)
 	 */
 
-	blk_mq_map_swqueue(q);
+	blk_mq_map_swqueue(q, online_mask);
 
 	blk_mq_sysfs_register(q);
 }
@@ -2103,16 +2107,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 				      unsigned long action, void *hcpu)
 {
 	struct request_queue *q;
+	int cpu = (unsigned long)hcpu;
+	/*
+	 * New online cpumask which is going to be set in this hotplug event.
+	 * Declare this cpumasks as global as cpu-hotplug operation is invoked
+	 * one-by-one and dynamically allocating this could result in a failure.
+	 */
+	static struct cpumask online_new;
 
 	/*
-	 * Before new mappings are established, hotadded cpu might already
-	 * start handling requests. This doesn't break anything as we map
-	 * offline CPUs to first hardware queue. We will re-init the queue
-	 * below to get optimal settings.
+	 * Before hotadded cpu starts handling requests, new mappings must
+	 * be established.  Otherwise, these requests in hw queue might
+	 * never be dispatched.
+	 *
+	 * For example, there is a single hw queue (hctx) and two CPU queues
+	 * (ctx0 for CPU0, and ctx1 for CPU1).
+	 *
+	 * Now CPU1 is just onlined and a request is inserted into
+	 * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
+	 * still zero.
+	 *
+	 * And then while running hw queue, flush_busy_ctxs() finds bit0 is
+	 * set in pending bitmap and tries to retrieve requests in
+	 * hctx->ctxs[0]->rq_list.  But htx->ctxs[0] is a pointer to ctx0,
+	 * so the request in ctx1->rq_list is ignored.
 	 */
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
-	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		cpumask_copy(&online_new, cpu_online_mask);
+		break;
+	case CPU_UP_PREPARE:
+		cpumask_copy(&online_new, cpu_online_mask);
+		cpumask_set_cpu(cpu, &online_new);
+		break;
+	default:
 		return NOTIFY_OK;
+	}
 
 	mutex_lock(&all_q_mutex);
 
@@ -2136,7 +2167,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	}
 
 	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_queue_reinit(q);
+		blk_mq_queue_reinit(q, &online_new);
 
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_unfreeze_queue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6a48c4c0d8a2..f4fea7964910 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
  * CPU -> queue mappings
  */
 extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+				   const struct cpumask *online_mask);
 extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
 /*