aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMing Lei <ming.lei@canonical.com>2015-08-09 03:41:51 -0400
committerJens Axboe <axboe@fb.com>2015-08-15 11:45:21 -0400
commit0048b4837affd153897ed1222283492070027aa9 (patch)
treed0616b460662b3262502894493c09d194148bd9f
parent596f5aad2a704b72934e5abec1b1b4114c16f45b (diff)
blk-mq: fix race between timeout and freeing request
Inside timeout handler, blk_mq_tag_to_rq() is called to retrieve the request from one tag. This way is obviously wrong because the request can be freed any time and some fiedds of the request can't be trusted, then kernel oops might be triggered[1]. Currently wrt. blk_mq_tag_to_rq(), the only special case is that the flush request can share same tag with the request cloned from, and the two requests can't be active at the same time, so this patch fixes the above issue by updating tags->rqs[tag] with the active request(either flush rq or the request cloned from) of the tag. Also blk_mq_tag_to_rq() gets much simplified with this patch. Given blk_mq_tag_to_rq() is mainly for drivers and the caller must make sure the request can't be freed, so in bt_for_each() this helper is replaced with tags->rqs[tag]. [1] kernel oops log [ 439.696220] BUG: unable to handle kernel NULL pointer dereference at 0000000000000158^M [ 439.697162] IP: [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.700653] PGD 7ef765067 PUD 7ef764067 PMD 0 ^M [ 439.700653] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ^M [ 439.700653] Dumping ftrace buffer:^M [ 439.700653] (ftrace buffer empty)^M [ 439.700653] Modules linked in: nbd ipv6 kvm_intel kvm serio_raw^M [ 439.700653] CPU: 6 PID: 2779 Comm: stress-ng-sigfd Not tainted 4.2.0-rc5-next-20150805+ #265^M [ 439.730500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011^M [ 439.730500] task: ffff880605308000 ti: ffff88060530c000 task.ti: ffff88060530c000^M [ 439.730500] RIP: 0010:[<ffffffff812d89ba>] [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.730500] RSP: 0018:ffff880819203da0 EFLAGS: 00010283^M [ 439.730500] RAX: ffff880811b0e000 RBX: ffff8800bb465f00 RCX: 0000000000000002^M [ 439.730500] RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000^M [ 439.730500] RBP: ffff880819203db0 R08: 0000000000000002 R09: 0000000000000000^M [ 439.730500] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000202^M [ 439.730500] R13: ffff880814104800 R14: 0000000000000002 R15: ffff880811a2ea00^M [ 439.730500] FS: 00007f165b3f5740(0000) GS:ffff880819200000(0000) knlGS:0000000000000000^M [ 439.730500] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b^M [ 439.730500] CR2: 0000000000000158 CR3: 00000007ef766000 CR4: 00000000000006e0^M [ 439.730500] Stack:^M [ 439.730500] 0000000000000008 ffff8808114eed90 ffff880819203e00 ffffffff812dc104^M [ 439.755663] ffff880819203e40 ffffffff812d9f5e 0000020000000000 ffff8808114eed80^M [ 439.755663] Call Trace:^M [ 439.755663] <IRQ> ^M [ 439.755663] [<ffffffff812dc104>] bt_for_each+0x6e/0xc8^M [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M [ 439.755663] [<ffffffff812dc1b3>] blk_mq_tag_busy_iter+0x55/0x5e^M [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M [ 439.755663] [<ffffffff812d8911>] blk_mq_rq_timer+0x5d/0xd4^M [ 439.755663] [<ffffffff810a3e10>] call_timer_fn+0xf7/0x284^M [ 439.755663] [<ffffffff810a3d1e>] ? call_timer_fn+0x5/0x284^M [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M [ 439.755663] [<ffffffff810a46d6>] run_timer_softirq+0x1ce/0x1f8^M [ 439.755663] [<ffffffff8104c367>] __do_softirq+0x181/0x3a4^M [ 439.755663] [<ffffffff8104c76e>] irq_exit+0x40/0x94^M [ 439.755663] [<ffffffff81031482>] smp_apic_timer_interrupt+0x33/0x3e^M [ 439.755663] [<ffffffff815559a4>] apic_timer_interrupt+0x84/0x90^M [ 439.755663] <EOI> ^M [ 439.755663] [<ffffffff81554350>] ? _raw_spin_unlock_irq+0x32/0x4a^M [ 439.755663] [<ffffffff8106a98b>] finish_task_switch+0xe0/0x163^M [ 439.755663] [<ffffffff8106a94d>] ? finish_task_switch+0xa2/0x163^M [ 439.755663] [<ffffffff81550066>] __schedule+0x469/0x6cd^M [ 439.755663] [<ffffffff8155039b>] schedule+0x82/0x9a^M [ 439.789267] [<ffffffff8119b28b>] signalfd_read+0x186/0x49a^M [ 439.790911] [<ffffffff8106d86a>] ? wake_up_q+0x47/0x47^M [ 439.790911] [<ffffffff811618c2>] __vfs_read+0x28/0x9f^M [ 439.790911] [<ffffffff8117a289>] ? __fget_light+0x4d/0x74^M [ 439.790911] [<ffffffff811620a7>] vfs_read+0x7a/0xc6^M [ 439.790911] [<ffffffff8116292b>] SyS_read+0x49/0x7f^M [ 439.790911] [<ffffffff81554c17>] entry_SYSCALL_64_fastpath+0x12/0x6f^M [ 439.790911] Code: 48 89 e5 e8 a9 b8 e7 ff 5d c3 0f 1f 44 00 00 55 89 f2 48 89 e5 41 54 41 89 f4 53 48 8b 47 60 48 8b 1c d0 48 8b 7b 30 48 8b 53 38 <48> 8b 87 58 01 00 00 48 85 c0 75 09 48 8b 97 88 0c 00 00 eb 10 ^M [ 439.790911] RIP [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.790911] RSP <ffff880819203da0>^M [ 439.790911] CR2: 0000000000000158^M [ 439.790911] ---[ end trace d40af58949325661 ]---^M Cc: <stable@vger.kernel.org> Signed-off-by: Ming Lei <ming.lei@canonical.com> Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--block/blk-flush.c15
-rw-r--r--block/blk-mq-tag.c4
-rw-r--r--block/blk-mq-tag.h12
-rw-r--r--block/blk-mq.c16
-rw-r--r--block/blk.h6
5 files changed, 35 insertions, 18 deletions
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20badd7b9d1b..9c423e53324a 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -73,6 +73,7 @@
73 73
74#include "blk.h" 74#include "blk.h"
75#include "blk-mq.h" 75#include "blk-mq.h"
76#include "blk-mq-tag.h"
76 77
77/* FLUSH/FUA sequences */ 78/* FLUSH/FUA sequences */
78enum { 79enum {
@@ -226,7 +227,12 @@ static void flush_end_io(struct request *flush_rq, int error)
226 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); 227 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
227 228
228 if (q->mq_ops) { 229 if (q->mq_ops) {
230 struct blk_mq_hw_ctx *hctx;
231
232 /* release the tag's ownership to the req cloned from */
229 spin_lock_irqsave(&fq->mq_flush_lock, flags); 233 spin_lock_irqsave(&fq->mq_flush_lock, flags);
234 hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu);
235 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
230 flush_rq->tag = -1; 236 flush_rq->tag = -1;
231 } 237 }
232 238
@@ -308,11 +314,18 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
308 314
309 /* 315 /*
310 * Borrow tag from the first request since they can't 316 * Borrow tag from the first request since they can't
311 * be in flight at the same time. 317 * be in flight at the same time. And acquire the tag's
318 * ownership for flush req.
312 */ 319 */
313 if (q->mq_ops) { 320 if (q->mq_ops) {
321 struct blk_mq_hw_ctx *hctx;
322
314 flush_rq->mq_ctx = first_rq->mq_ctx; 323 flush_rq->mq_ctx = first_rq->mq_ctx;
315 flush_rq->tag = first_rq->tag; 324 flush_rq->tag = first_rq->tag;
325 fq->orig_rq = first_rq;
326
327 hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu);
328 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
316 } 329 }
317 330
318 flush_rq->cmd_type = REQ_TYPE_FS; 331 flush_rq->cmd_type = REQ_TYPE_FS;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9b6e28830b82..9115c6d59948 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -429,7 +429,7 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
429 for (bit = find_first_bit(&bm->word, bm->depth); 429 for (bit = find_first_bit(&bm->word, bm->depth);
430 bit < bm->depth; 430 bit < bm->depth;
431 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { 431 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
432 rq = blk_mq_tag_to_rq(hctx->tags, off + bit); 432 rq = hctx->tags->rqs[off + bit];
433 if (rq->q == hctx->queue) 433 if (rq->q == hctx->queue)
434 fn(hctx, rq, data, reserved); 434 fn(hctx, rq, data, reserved);
435 } 435 }
@@ -453,7 +453,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags,
453 for (bit = find_first_bit(&bm->word, bm->depth); 453 for (bit = find_first_bit(&bm->word, bm->depth);
454 bit < bm->depth; 454 bit < bm->depth;
455 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { 455 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
456 rq = blk_mq_tag_to_rq(tags, off + bit); 456 rq = tags->rqs[off + bit];
457 fn(rq, data, reserved); 457 fn(rq, data, reserved);
458 } 458 }
459 459
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 75893a34237d..9eb2cf4f01cb 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -89,4 +89,16 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
89 __blk_mq_tag_idle(hctx); 89 __blk_mq_tag_idle(hctx);
90} 90}
91 91
92/*
93 * This helper should only be used for flush request to share tag
94 * with the request cloned from, and both the two requests can't be
95 * in flight at the same time. The caller has to make sure the tag
96 * can't be freed.
97 */
98static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
99 unsigned int tag, struct request *rq)
100{
101 hctx->tags->rqs[tag] = rq;
102}
103
92#endif 104#endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 81edbd95bda8..f2d67b4047a0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -559,23 +559,9 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
559} 559}
560EXPORT_SYMBOL(blk_mq_abort_requeue_list); 560EXPORT_SYMBOL(blk_mq_abort_requeue_list);
561 561
562static inline bool is_flush_request(struct request *rq,
563 struct blk_flush_queue *fq, unsigned int tag)
564{
565 return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
566 fq->flush_rq->tag == tag);
567}
568
569struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 562struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
570{ 563{
571 struct request *rq = tags->rqs[tag]; 564 return tags->rqs[tag];
572 /* mq_ctx of flush rq is always cloned from the corresponding req */
573 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
574
575 if (!is_flush_request(rq, fq, tag))
576 return rq;
577
578 return fq->flush_rq;
579} 565}
580EXPORT_SYMBOL(blk_mq_tag_to_rq); 566EXPORT_SYMBOL(blk_mq_tag_to_rq);
581 567
diff --git a/block/blk.h b/block/blk.h
index 026d9594142b..838188b35a83 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -22,6 +22,12 @@ struct blk_flush_queue {
22 struct list_head flush_queue[2]; 22 struct list_head flush_queue[2];
23 struct list_head flush_data_in_flight; 23 struct list_head flush_data_in_flight;
24 struct request *flush_rq; 24 struct request *flush_rq;
25
26 /*
27 * flush_rq shares tag with this rq, both can't be active
28 * at the same time
29 */
30 struct request *orig_rq;
25 spinlock_t mq_flush_lock; 31 spinlock_t mq_flush_lock;
26}; 32};
27 33