Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--	block/blk-mq.c	228
1 file changed, 141 insertions(+), 87 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..f53779692c77 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv)
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		queue_for_each_hw_ctx(q, hctx, i)
-			blk_mq_tag_idle(hctx);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			/* the hctx may be unmapped, so check it here */
+			if (blk_mq_hw_queue_mapped(hctx))
+				blk_mq_tag_idle(hctx);
+		}
 	}
 }
 
@@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_lock(&hctx->lock);
 		list_splice(&rq_list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
+		/*
+		 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
+		 * it's possible the queue is stopped and restarted again
+		 * before this. Queue restart will dispatch requests. And since
+		 * requests in rq_list aren't added into hctx->dispatch yet,
+		 * the requests in rq_list might get lost.
+		 *
+		 * blk_mq_run_hw_queue() already checks the STOPPED bit
+		 **/
+		blk_mq_run_hw_queue(hctx, true);
 	}
 }
 
@@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
  * Multiple hardware queue variant. This will not use per-process plugs,
  * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
 
-		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
-		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
-			}
-		}
+		/*
+		 * we do limited pluging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
+		 */
+		if (plug) {
+			/*
+			 * The plug list might get flushed before this. If that
+			 * happens, same_queue_rq is invalid and plug list is empty
+			 **/
+			if (same_queue_rq && !list_empty(&plug->mq_list)) {
+				old_rq = same_queue_rq;
+				list_del_init(&old_rq->queuelist);
+			}
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
+			return;
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }
 
@@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
 
-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	if (use_plug && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
@@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
-		}
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
+		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	return NOTIFY_OK;
 }
 
-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
-	struct request_queue *q = hctx->queue;
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	if (set->tags[hctx->queue_num])
-		return NOTIFY_OK;
-
-	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
-	if (!set->tags[hctx->queue_num])
-		return NOTIFY_STOP;
-
-	hctx->tags = set->tags[hctx->queue_num];
-	return NOTIFY_OK;
-}
-
 static int blk_mq_hctx_notify(void *data, unsigned long action,
 			      unsigned int cpu)
 {
@@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		return blk_mq_hctx_cpu_offline(hctx, cpu);
-	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	/*
+	 * In case of CPU online, tags may be reallocated
+	 * in blk_mq_map_swqueue() after mapping is updated.
+	 */
 
 	return NOTIFY_OK;
 }
 
+/* hctx->ctxs will be freed in queue's release handler */
 static void blk_mq_exit_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 
 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 	blk_free_flush_queue(hctx->fq);
-	kfree(hctx->ctxs);
 	blk_mq_free_bitmap(&hctx->ctx_map);
 }
 
@@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
+	struct blk_mq_tag_set *set = q->tag_set;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
@@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
-			struct blk_mq_tag_set *set = q->tag_set;
-
 			if (set->tags[i]) {
 				blk_mq_free_rq_map(set, set->tags[i], i);
 				set->tags[i] = NULL;
-				hctx->tags = NULL;
 			}
+			hctx->tags = NULL;
 			continue;
 		}
 
+		/* unmapped hw queue can be remapped after CPU topo changed */
+		if (!set->tags[i])
+			set->tags[i] = blk_mq_init_rq_map(set, i);
+		hctx->tags = set->tags[i];
+		WARN_ON(!hctx->tags);
+
 		/*
 		 * Set the map size to the number of mapped software queues.
 		 * This is more accurate and more efficient than looping
@@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q)
 	unsigned int i;
 
 	/* hctx kobj stays in hctx */
-	queue_for_each_hw_ctx(q, hctx, i)
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx)
+			continue;
+		kfree(hctx->ctxs);
 		kfree(hctx);
+	}
 
 	kfree(q->queue_hw_ctx);
 
@@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
@@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node)
+	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_mq_freeze_queue_wait(q);
 
+		/*
+		 * timeout handler can't touch hw queue during the
+		 * reinitialization
+		 */
+		del_timer_sync(&q->timeout);
+	}
+
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_queue_reinit(q);
 
@@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);