aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJens Axboe <axboe@fb.com>2014-04-09 22:27:01 -0400
committerJens Axboe <axboe@fb.com>2014-04-09 23:54:06 -0400
commit360f92c2443073143467a0088daffec96a17910b (patch)
treef27d17bfd3645586ffc7d5fc43d05a2200aa08c9
parent2bfad21ecc6f837de29743f4419f47dee3fac9e2 (diff)
block: fix regression with block enabled tagging
Martin reported that his test system would not boot with current git, it oopsed with this: BUG: unable to handle kernel paging request at ffff88046c6c9e80 IP: [<ffffffff812971e0>] blk_queue_start_tag+0x90/0x150 PGD 1ddf067 PUD 1de2067 PMD 47fc7d067 PTE 800000046c6c9060 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC Modules linked in: sd_mod lpfc(+) scsi_transport_fc scsi_tgt oracleasm rpcsec_gss_krb5 ipv6 igb dca i2c_algo_bit i2c_core hwmon CPU: 3 PID: 87 Comm: kworker/u17:1 Not tainted 3.14.0+ #246 Hardware name: Supermicro X9DRX+-F/X9DRX+-F, BIOS 3.00 07/09/2013 Workqueue: events_unbound async_run_entry_fn task: ffff8802743c2150 ti: ffff880273d02000 task.ti: ffff880273d02000 RIP: 0010:[<ffffffff812971e0>] [<ffffffff812971e0>] blk_queue_start_tag+0x90/0x150 RSP: 0018:ffff880273d03a58 EFLAGS: 00010092 RAX: ffff88046c6c9e78 RBX: ffff880077208e78 RCX: 00000000fffc8da6 RDX: 00000000fffc186d RSI: 0000000000000009 RDI: 00000000fffc8d9d RBP: ffff880273d03a88 R08: 0000000000000001 R09: ffff8800021c2410 R10: 0000000000000005 R11: 0000000000015b30 R12: ffff88046c5bb8a0 R13: ffff88046c5c0890 R14: 000000000000001e R15: 000000000000001e FS: 0000000000000000(0000) GS:ffff880277b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88046c6c9e80 CR3: 00000000018f6000 CR4: 00000000000407e0 Stack: ffff880273d03a98 ffff880474b18800 0000000000000000 ffff880474157000 ffff88046c5c0890 ffff880077208e78 ffff880273d03ae8 ffffffff813b9e62 ffff880200000010 ffff880474b18968 ffff880474b18848 ffff88046c5c0cd8 Call Trace: [<ffffffff813b9e62>] scsi_request_fn+0xf2/0x510 [<ffffffff81293167>] __blk_run_queue+0x37/0x50 [<ffffffff8129ac43>] blk_execute_rq_nowait+0xb3/0x130 [<ffffffff8129ad24>] blk_execute_rq+0x64/0xf0 [<ffffffff8108d2b0>] ? bit_waitqueue+0xd0/0xd0 [<ffffffff813bba35>] scsi_execute+0xe5/0x180 [<ffffffff813bbe4a>] scsi_execute_req_flags+0x9a/0x110 [<ffffffffa01b1304>] sd_spinup_disk+0x94/0x460 [sd_mod] [<ffffffff81160000>] ? __unmap_hugepage_range+0x200/0x2f0 [<ffffffffa01b2b9a>] sd_revalidate_disk+0xaa/0x3f0 [sd_mod] [<ffffffffa01b2fb8>] sd_probe_async+0xd8/0x200 [sd_mod] [<ffffffff8107703f>] async_run_entry_fn+0x3f/0x140 [<ffffffff8106a1c5>] process_one_work+0x175/0x410 [<ffffffff8106b373>] worker_thread+0x123/0x400 [<ffffffff8106b250>] ? manage_workers+0x160/0x160 [<ffffffff8107104e>] kthread+0xce/0xf0 [<ffffffff81070f80>] ? kthread_freezable_should_stop+0x70/0x70 [<ffffffff815f0bac>] ret_from_fork+0x7c/0xb0 [<ffffffff81070f80>] ? kthread_freezable_should_stop+0x70/0x70 Code: 48 0f ab 11 72 db 48 81 4b 40 00 00 10 00 89 83 08 01 00 00 48 89 df 49 8b 04 24 48 89 1c d0 e8 f7 a8 ff ff 49 8b 85 28 05 00 00 <48> 89 58 08 48 89 03 49 8d 85 28 05 00 00 48 89 43 08 49 89 9d RIP [<ffffffff812971e0>] blk_queue_start_tag+0x90/0x150 RSP <ffff880273d03a58> CR2: ffff88046c6c9e80 Martin bisected and found this to be the problem patch; commit 6d113398dcf4dfcd9787a4ead738b186f7b7ff0f Author: Jan Kara <jack@suse.cz> Date: Mon Feb 24 16:39:54 2014 +0100 block: Stop abusing rq->csd.list in blk-softirq and the problem was immediately apparent. The patch states that it is safe to reuse queuelist at completion time, since it is no longer used. However, that is not true if a device is using block enabled tagging. If that is the case, then the queuelist is reused to keep track of busy tags. If a device also ended up using softirq completions, we'd reuse ->queuelist for the IPI handling while block tagging was still using it. Boom. Fix this by adding a new ipi_list list head, and share the memory used with the request hash table. The hash table is never used after the request is moved to the dispatch list, which happens long before any potential completion of the request. Add a new request bit for this, so we don't have cases that check rq->hash while it could potentially have been reused for the IPI completion. Reported-by: Martin K. Petersen <martin.petersen@oracle.com> Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--block/blk-core.c2
-rw-r--r--block/blk-softirq.c17
-rw-r--r--block/blk.h2
-rw-r--r--block/elevator.c2
-rw-r--r--include/linux/blk_types.h2
-rw-r--r--include/linux/blkdev.h13
6 files changed, 24 insertions, 14 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 34d7c196338b..a0e3096c4bb5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1307,7 +1307,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1307 struct request_list *rl = blk_rq_rl(req); 1307 struct request_list *rl = blk_rq_rl(req);
1308 1308
1309 BUG_ON(!list_empty(&req->queuelist)); 1309 BUG_ON(!list_empty(&req->queuelist));
1310 BUG_ON(!hlist_unhashed(&req->hash)); 1310 BUG_ON(ELV_ON_HASH(req));
1311 1311
1312 blk_free_request(rl, req); 1312 blk_free_request(rl, req);
1313 freed_request(rl, flags); 1313 freed_request(rl, flags);
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ebd6b6f1bdeb..53b1737e978d 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -30,8 +30,8 @@ static void blk_done_softirq(struct softirq_action *h)
30 while (!list_empty(&local_list)) { 30 while (!list_empty(&local_list)) {
31 struct request *rq; 31 struct request *rq;
32 32
33 rq = list_entry(local_list.next, struct request, queuelist); 33 rq = list_entry(local_list.next, struct request, ipi_list);
34 list_del_init(&rq->queuelist); 34 list_del_init(&rq->ipi_list);
35 rq->q->softirq_done_fn(rq); 35 rq->q->softirq_done_fn(rq);
36 } 36 }
37} 37}
@@ -45,14 +45,9 @@ static void trigger_softirq(void *data)
45 45
46 local_irq_save(flags); 46 local_irq_save(flags);
47 list = this_cpu_ptr(&blk_cpu_done); 47 list = this_cpu_ptr(&blk_cpu_done);
48 /* 48 list_add_tail(&rq->ipi_list, list);
49 * We reuse queuelist for a list of requests to process. Since the
50 * queuelist is used by the block layer only for requests waiting to be
51 * submitted to the device it is unused now.
52 */
53 list_add_tail(&rq->queuelist, list);
54 49
55 if (list->next == &rq->queuelist) 50 if (list->next == &rq->ipi_list)
56 raise_softirq_irqoff(BLOCK_SOFTIRQ); 51 raise_softirq_irqoff(BLOCK_SOFTIRQ);
57 52
58 local_irq_restore(flags); 53 local_irq_restore(flags);
@@ -141,7 +136,7 @@ void __blk_complete_request(struct request *req)
141 struct list_head *list; 136 struct list_head *list;
142do_local: 137do_local:
143 list = this_cpu_ptr(&blk_cpu_done); 138 list = this_cpu_ptr(&blk_cpu_done);
144 list_add_tail(&req->queuelist, list); 139 list_add_tail(&req->ipi_list, list);
145 140
146 /* 141 /*
147 * if the list only contains our just added request, 142 * if the list only contains our just added request,
@@ -149,7 +144,7 @@ do_local:
149 * entries there, someone already raised the irq but it 144 * entries there, someone already raised the irq but it
150 * hasn't run yet. 145 * hasn't run yet.
151 */ 146 */
152 if (list->next == &req->queuelist) 147 if (list->next == &req->ipi_list)
153 raise_softirq_irqoff(BLOCK_SOFTIRQ); 148 raise_softirq_irqoff(BLOCK_SOFTIRQ);
154 } else if (raise_blk_irq(ccpu, req)) 149 } else if (raise_blk_irq(ccpu, req))
155 goto do_local; 150 goto do_local;
diff --git a/block/blk.h b/block/blk.h
index d23b415b8a28..1d880f1f957f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
78/* 78/*
79 * Internal elevator interface 79 * Internal elevator interface
80 */ 80 */
81#define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash) 81#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
82 82
83void blk_insert_flush(struct request *rq); 83void blk_insert_flush(struct request *rq);
84void blk_abort_flushes(struct request_queue *q); 84void blk_abort_flushes(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index 42c45a7d6714..1e01b66a0b92 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -247,6 +247,7 @@ EXPORT_SYMBOL(elevator_exit);
247static inline void __elv_rqhash_del(struct request *rq) 247static inline void __elv_rqhash_del(struct request *rq)
248{ 248{
249 hash_del(&rq->hash); 249 hash_del(&rq->hash);
250 rq->cmd_flags &= ~REQ_HASHED;
250} 251}
251 252
252static void elv_rqhash_del(struct request_queue *q, struct request *rq) 253static void elv_rqhash_del(struct request_queue *q, struct request *rq)
@@ -261,6 +262,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
261 262
262 BUG_ON(ELV_ON_HASH(rq)); 263 BUG_ON(ELV_ON_HASH(rq));
263 hash_add(e->hash, &rq->hash, rq_hash_key(rq)); 264 hash_add(e->hash, &rq->hash, rq_hash_key(rq));
265 rq->cmd_flags |= REQ_HASHED;
264} 266}
265 267
266static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 268static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index bbc3a6c88fce..aa0eaa2d0bd8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -189,6 +189,7 @@ enum rq_flag_bits {
189 __REQ_KERNEL, /* direct IO to kernel pages */ 189 __REQ_KERNEL, /* direct IO to kernel pages */
190 __REQ_PM, /* runtime pm request */ 190 __REQ_PM, /* runtime pm request */
191 __REQ_END, /* last of chain of requests */ 191 __REQ_END, /* last of chain of requests */
192 __REQ_HASHED, /* on IO scheduler merge hash */
192 __REQ_NR_BITS, /* stops here */ 193 __REQ_NR_BITS, /* stops here */
193}; 194};
194 195
@@ -241,5 +242,6 @@ enum rq_flag_bits {
241#define REQ_KERNEL (1ULL << __REQ_KERNEL) 242#define REQ_KERNEL (1ULL << __REQ_KERNEL)
242#define REQ_PM (1ULL << __REQ_PM) 243#define REQ_PM (1ULL << __REQ_PM)
243#define REQ_END (1ULL << __REQ_END) 244#define REQ_END (1ULL << __REQ_END)
245#define REQ_HASHED (1ULL << __REQ_HASHED)
244 246
245#endif /* __LINUX_BLK_TYPES_H */ 247#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1e1fa3f93d5f..99617cf7dd1a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,7 +118,18 @@ struct request {
118 struct bio *bio; 118 struct bio *bio;
119 struct bio *biotail; 119 struct bio *biotail;
120 120
121 struct hlist_node hash; /* merge hash */ 121 /*
122 * The hash is used inside the scheduler, and killed once the
123 * request reaches the dispatch list. The ipi_list is only used
124 * to queue the request for softirq completion, which is long
125 * after the request has been unhashed (and even removed from
126 * the dispatch list).
127 */
128 union {
129 struct hlist_node hash; /* merge hash */
130 struct list_head ipi_list;
131 };
132
122 /* 133 /*
123 * The rb_node is only used inside the io scheduler, requests 134 * The rb_node is only used inside the io scheduler, requests
124 * are pruned when moved to the dispatch queue. So let the 135 * are pruned when moved to the dispatch queue. So let the