author     Matias Bjørling <m@bjorling.me>    2014-11-04 10:20:14 -0500
committer  Jens Axboe <axboe@fb.com>          2014-11-04 15:18:52 -0500
commit     a4aea5623d4a54682b6ff5c18196d7802f3e478f
tree       33e0eaeac20a6d534de58552b5f6d530691de74b
parent     9dbbfab7d54109626031bf3bc476fb1804113970
NVMe: Convert to blk-mq
This converts the NVMe driver to a blk-mq request-based driver. The
NVMe driver is currently bio-based and implements its queue logic
within itself. By using blk-mq, a lot of these responsibilities can be
moved out of the driver and simplified.

The patch is divided into the following blocks:

 * Per-command data and the command id have been moved into struct
   request. The per-command data can be retrieved using
   blk_mq_rq_to_pdu(), and id maintenance is now handled by blk-mq
   through the rq->tag field (a short sketch of this pattern follows
   the message).

 * The logic for splitting bios has been moved into the blk-mq layer.
   The driver instead notifies the block layer about its limited gap
   support in SG lists.

 * Timeout handling is taken over by blk-mq and reimplemented within
   nvme_timeout(); this includes both abort handling and command
   cancellation.

 * Assignment of nvme queues to CPUs is replaced with the blk-mq
   version. The current blk-mq strategy is to assign the number of
   mapped queues and CPUs to provide synergy, while the nvme driver
   assigns as many nvme hw queues as possible. This can be implemented
   in blk-mq if needed.

 * NVMe queues are merged with the tags structure of blk-mq.

 * blk-mq takes care of setup/teardown of nvme queues and guards
   against invalid accesses. Therefore, RCU usage for nvme queues can
   be removed.

 * IO tracing and accounting are handled by blk-mq and therefore
   removed from the driver.

 * Queue suspension logic is replaced with the logic from the block
   layer.

Contributions in this patch from:

  Sam Bradshaw <sbradshaw@micron.com>
  Jens Axboe <axboe@fb.com>
  Keith Busch <keith.busch@intel.com>
  Robert Nelson <rlnelson@google.com>

Acked-by: Keith Busch <keith.busch@intel.com>
Acked-by: Jens Axboe <axboe@fb.com>

Updated for new ->queue_rq() prototype.

Signed-off-by: Jens Axboe <axboe@fb.com>
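To make the first block concrete, here is a minimal sketch (not part of the commit) of how per-command data lives in the request pdu under blk-mq. The my_* names are illustrative; blk_mq_rq_to_pdu(), the tag set's cmd_size field and req->tag are the interfaces the diff below actually relies on.

#include <linux/blk-mq.h>
#include <linux/nvme.h>

/* Per-request driver data; blk-mq reserves one of these next to every
 * struct request because of the cmd_size setting below. */
struct my_cmd_info {
	void *ctx;		/* completion context */
	int aborted;
};

/* Tell blk-mq how much per-request pdu space to reserve. */
static void my_setup_tagset(struct blk_mq_tag_set *set)
{
	set->cmd_size = sizeof(struct my_cmd_info);
}

/* Inside ->queue_rq(): the pdu replaces the old cmdid_data array and
 * rq->tag replaces the driver-allocated command id. */
static void my_prep_cmd(struct request *req, struct nvme_command *cmnd)
{
	struct my_cmd_info *cmd = blk_mq_rq_to_pdu(req);

	cmd->ctx = NULL;
	cmd->aborted = 0;
	cmnd->rw.command_id = req->tag;
}

The old cmdid bitmap and its alloc_cmdid()/free_cmdid() helpers can go away because blk-mq already guarantees a unique rq->tag per in-flight request on a queue.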
-rw-r--r--  drivers/block/nvme-core.c  1366
-rw-r--r--  drivers/block/nvme-scsi.c     8
-rw-r--r--  include/linux/nvme.h         15
3 files changed, 570 insertions(+), 819 deletions(-)
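Before the diff itself, a hedged sketch of the registration shape the driver moves to: a blk_mq_tag_set describes queue count, depth and pdu size, a request queue is built on top of it, and all submission funnels through ->queue_rq(). The my_* names and the depth value are illustrative; the blk-mq calls and the BLK_MQ_RQ_QUEUE_* return code mirror what nvme_alloc_admin_tags() and nvme_queue_rq() do in the patch below.

#include <linux/string.h>
#include <linux/blk-mq.h>

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct request *req = bd->rq;

	blk_mq_start_request(req);
	/* map the request, build the hardware command and ring the
	 * doorbell here; busy/error cases return the other codes */
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.map_queue	= blk_mq_map_queue,
};

static struct request_queue *my_create_queue(struct blk_mq_tag_set *set,
					     void *driver_data)
{
	struct request_queue *q;

	memset(set, 0, sizeof(*set));
	set->ops	  = &my_mq_ops;
	set->nr_hw_queues = 1;
	set->queue_depth  = 64;	/* illustrative; the patch uses NVME_AQ_DEPTH - 1 */
	set->cmd_size	  = 0;	/* or the size of a per-command struct, see above */
	set->driver_data  = driver_data;

	if (blk_mq_alloc_tag_set(set))
		return NULL;

	q = blk_mq_init_queue(set);
	if (!q)
		blk_mq_free_tag_set(set);
	return q;
}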
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index c70eff3673d0..39050a3d10fd 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
13 */ 13 */
14 14
15#include <linux/nvme.h> 15#include <linux/nvme.h>
16#include <linux/bio.h>
17#include <linux/bitops.h> 16#include <linux/bitops.h>
18#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/blk-mq.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -33,7 +33,6 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/pci.h> 35#include <linux/pci.h>
36#include <linux/percpu.h>
37#include <linux/poison.h> 36#include <linux/poison.h>
38#include <linux/ptrace.h> 37#include <linux/ptrace.h>
39#include <linux/sched.h> 38#include <linux/sched.h>
@@ -42,9 +41,8 @@
42#include <scsi/sg.h> 41#include <scsi/sg.h>
43#include <asm-generic/io-64-nonatomic-lo-hi.h> 42#include <asm-generic/io-64-nonatomic-lo-hi.h>
44 43
45#include <trace/events/block.h>
46
47#define NVME_Q_DEPTH 1024 44#define NVME_Q_DEPTH 1024
45#define NVME_AQ_DEPTH 64
48#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 46#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
49#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 47#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
50#define ADMIN_TIMEOUT (admin_timeout * HZ) 48#define ADMIN_TIMEOUT (admin_timeout * HZ)
@@ -81,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait;
81static struct notifier_block nvme_nb; 79static struct notifier_block nvme_nb;
82 80
83static void nvme_reset_failed_dev(struct work_struct *ws); 81static void nvme_reset_failed_dev(struct work_struct *ws);
82static int nvme_process_cq(struct nvme_queue *nvmeq);
84 83
85struct async_cmd_info { 84struct async_cmd_info {
86 struct kthread_work work; 85 struct kthread_work work;
87 struct kthread_worker *worker; 86 struct kthread_worker *worker;
87 struct request *req;
88 u32 result; 88 u32 result;
89 int status; 89 int status;
90 void *ctx; 90 void *ctx;
@@ -104,10 +104,6 @@ struct nvme_queue {
104 volatile struct nvme_completion *cqes; 104 volatile struct nvme_completion *cqes;
105 dma_addr_t sq_dma_addr; 105 dma_addr_t sq_dma_addr;
106 dma_addr_t cq_dma_addr; 106 dma_addr_t cq_dma_addr;
107 wait_queue_head_t sq_full;
108 wait_queue_t sq_cong_wait;
109 struct bio_list sq_cong;
110 struct list_head iod_bio;
111 u32 __iomem *q_db; 107 u32 __iomem *q_db;
112 u16 q_depth; 108 u16 q_depth;
113 u16 cq_vector; 109 u16 cq_vector;
@@ -117,10 +113,8 @@ struct nvme_queue {
117 u16 qid; 113 u16 qid;
118 u8 cq_phase; 114 u8 cq_phase;
119 u8 cqe_seen; 115 u8 cqe_seen;
120 u8 q_suspended;
121 cpumask_var_t cpu_mask;
122 struct async_cmd_info cmdinfo; 116 struct async_cmd_info cmdinfo;
123 unsigned long cmdid_data[]; 117 struct blk_mq_hw_ctx *hctx;
124}; 118};
125 119
126/* 120/*
@@ -148,62 +142,72 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
148struct nvme_cmd_info { 142struct nvme_cmd_info {
149 nvme_completion_fn fn; 143 nvme_completion_fn fn;
150 void *ctx; 144 void *ctx;
151 unsigned long timeout;
152 int aborted; 145 int aborted;
146 struct nvme_queue *nvmeq;
153}; 147};
154 148
155static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) 149static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
150 unsigned int hctx_idx)
156{ 151{
157 return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; 152 struct nvme_dev *dev = data;
153 struct nvme_queue *nvmeq = dev->queues[0];
154
155 WARN_ON(nvmeq->hctx);
156 nvmeq->hctx = hctx;
157 hctx->driver_data = nvmeq;
158 return 0;
158} 159}
159 160
160static unsigned nvme_queue_extra(int depth) 161static int nvme_admin_init_request(void *data, struct request *req,
162 unsigned int hctx_idx, unsigned int rq_idx,
163 unsigned int numa_node)
161{ 164{
162 return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); 165 struct nvme_dev *dev = data;
166 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
167 struct nvme_queue *nvmeq = dev->queues[0];
168
169 BUG_ON(!nvmeq);
170 cmd->nvmeq = nvmeq;
171 return 0;
163} 172}
164 173
165/** 174static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
166 * alloc_cmdid() - Allocate a Command ID 175 unsigned int hctx_idx)
167 * @nvmeq: The queue that will be used for this command
168 * @ctx: A pointer that will be passed to the handler
169 * @handler: The function to call on completion
170 *
171 * Allocate a Command ID for a queue. The data passed in will
172 * be passed to the completion handler. This is implemented by using
173 * the bottom two bits of the ctx pointer to store the handler ID.
174 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
175 * We can change this if it becomes a problem.
176 *
177 * May be called with local interrupts disabled and the q_lock held,
178 * or with interrupts enabled and no locks held.
179 */
180static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
181 nvme_completion_fn handler, unsigned timeout)
182{ 176{
183 int depth = nvmeq->q_depth - 1; 177 struct nvme_dev *dev = data;
184 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 178 struct nvme_queue *nvmeq = dev->queues[
185 int cmdid; 179 (hctx_idx % dev->queue_count) + 1];
186 180
187 do { 181 if (!nvmeq->hctx)
188 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); 182 nvmeq->hctx = hctx;
189 if (cmdid >= depth) 183
190 return -EBUSY; 184 /* nvmeq queues are shared between namespaces. We assume here that
191 } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); 185 * blk-mq map the tags so they match up with the nvme queue tags. */
186 WARN_ON(nvmeq->hctx->tags != hctx->tags);
192 187
193 info[cmdid].fn = handler; 188 hctx->driver_data = nvmeq;
194 info[cmdid].ctx = ctx; 189 return 0;
195 info[cmdid].timeout = jiffies + timeout;
196 info[cmdid].aborted = 0;
197 return cmdid;
198} 190}
199 191
200static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, 192static int nvme_init_request(void *data, struct request *req,
201 nvme_completion_fn handler, unsigned timeout) 193 unsigned int hctx_idx, unsigned int rq_idx,
194 unsigned int numa_node)
202{ 195{
203 int cmdid; 196 struct nvme_dev *dev = data;
204 wait_event_killable(nvmeq->sq_full, 197 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
205 (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); 198 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
206 return (cmdid < 0) ? -EINTR : cmdid; 199
200 BUG_ON(!nvmeq);
201 cmd->nvmeq = nvmeq;
202 return 0;
203}
204
205static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
206 nvme_completion_fn handler)
207{
208 cmd->fn = handler;
209 cmd->ctx = ctx;
210 cmd->aborted = 0;
207} 211}
208 212
209/* Special values must be less than 0x1000 */ 213/* Special values must be less than 0x1000 */
@@ -211,18 +215,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
211#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 215#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
212#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 216#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
213#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 217#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
214#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE)
215#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE)
216 218
217static void special_completion(struct nvme_queue *nvmeq, void *ctx, 219static void special_completion(struct nvme_queue *nvmeq, void *ctx,
218 struct nvme_completion *cqe) 220 struct nvme_completion *cqe)
219{ 221{
220 if (ctx == CMD_CTX_CANCELLED) 222 if (ctx == CMD_CTX_CANCELLED)
221 return; 223 return;
222 if (ctx == CMD_CTX_ABORT) {
223 ++nvmeq->dev->abort_limit;
224 return;
225 }
226 if (ctx == CMD_CTX_COMPLETED) { 224 if (ctx == CMD_CTX_COMPLETED) {
227 dev_warn(nvmeq->q_dmadev, 225 dev_warn(nvmeq->q_dmadev,
228 "completed id %d twice on queue %d\n", 226 "completed id %d twice on queue %d\n",
@@ -235,110 +233,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx,
235 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 233 cqe->command_id, le16_to_cpup(&cqe->sq_id));
236 return; 234 return;
237 } 235 }
238 if (ctx == CMD_CTX_ASYNC) {
239 u32 result = le32_to_cpup(&cqe->result);
240 u16 status = le16_to_cpup(&cqe->status) >> 1;
241
242 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
243 ++nvmeq->dev->event_limit;
244 if (status == NVME_SC_SUCCESS)
245 dev_warn(nvmeq->q_dmadev,
246 "async event result %08x\n", result);
247 return;
248 }
249
250 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); 236 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
251} 237}
252 238
253static void async_completion(struct nvme_queue *nvmeq, void *ctx, 239static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
254 struct nvme_completion *cqe)
255{
256 struct async_cmd_info *cmdinfo = ctx;
257 cmdinfo->result = le32_to_cpup(&cqe->result);
258 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
259 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
260}
261
262/*
263 * Called with local interrupts disabled and the q_lock held. May not sleep.
264 */
265static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
266 nvme_completion_fn *fn)
267{ 240{
268 void *ctx; 241 void *ctx;
269 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
270 242
271 if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
272 if (fn)
273 *fn = special_completion;
274 return CMD_CTX_INVALID;
275 }
276 if (fn) 243 if (fn)
277 *fn = info[cmdid].fn; 244 *fn = cmd->fn;
278 ctx = info[cmdid].ctx; 245 ctx = cmd->ctx;
279 info[cmdid].fn = special_completion; 246 cmd->fn = special_completion;
280 info[cmdid].ctx = CMD_CTX_COMPLETED; 247 cmd->ctx = CMD_CTX_CANCELLED;
281 clear_bit(cmdid, nvmeq->cmdid_data);
282 wake_up(&nvmeq->sq_full);
283 return ctx; 248 return ctx;
284} 249}
285 250
286static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, 251static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
287 nvme_completion_fn *fn) 252 struct nvme_completion *cqe)
288{ 253{
289 void *ctx; 254 struct request *req = ctx;
290 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
291 if (fn)
292 *fn = info[cmdid].fn;
293 ctx = info[cmdid].ctx;
294 info[cmdid].fn = special_completion;
295 info[cmdid].ctx = CMD_CTX_CANCELLED;
296 return ctx;
297}
298 255
299static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) 256 u32 result = le32_to_cpup(&cqe->result);
300{ 257 u16 status = le16_to_cpup(&cqe->status) >> 1;
301 return rcu_dereference_raw(dev->queues[qid]); 258
259 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
260 ++nvmeq->dev->event_limit;
261 if (status == NVME_SC_SUCCESS)
262 dev_warn(nvmeq->q_dmadev,
263 "async event result %08x\n", result);
264
265 blk_put_request(req);
302} 266}
303 267
304static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) 268static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
269 struct nvme_completion *cqe)
305{ 270{
306 struct nvme_queue *nvmeq; 271 struct request *req = ctx;
307 unsigned queue_id = get_cpu_var(*dev->io_queue); 272
273 u16 status = le16_to_cpup(&cqe->status) >> 1;
274 u32 result = le32_to_cpup(&cqe->result);
308 275
309 rcu_read_lock(); 276 blk_put_request(req);
310 nvmeq = rcu_dereference(dev->queues[queue_id]);
311 if (nvmeq)
312 return nvmeq;
313 277
314 rcu_read_unlock(); 278 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
315 put_cpu_var(*dev->io_queue); 279 ++nvmeq->dev->abort_limit;
316 return NULL;
317} 280}
318 281
319static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) 282static void async_completion(struct nvme_queue *nvmeq, void *ctx,
283 struct nvme_completion *cqe)
320{ 284{
321 rcu_read_unlock(); 285 struct async_cmd_info *cmdinfo = ctx;
322 put_cpu_var(nvmeq->dev->io_queue); 286 cmdinfo->result = le32_to_cpup(&cqe->result);
287 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
288 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
289 blk_put_request(cmdinfo->req);
323} 290}
324 291
325static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) 292static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
326 __acquires(RCU) 293 unsigned int tag)
327{ 294{
328 struct nvme_queue *nvmeq; 295 struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
329 296 struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
330 rcu_read_lock();
331 nvmeq = rcu_dereference(dev->queues[q_idx]);
332 if (nvmeq)
333 return nvmeq;
334 297
335 rcu_read_unlock(); 298 return blk_mq_rq_to_pdu(req);
336 return NULL;
337} 299}
338 300
339static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) 301/*
302 * Called with local interrupts disabled and the q_lock held. May not sleep.
303 */
304static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
305 nvme_completion_fn *fn)
340{ 306{
341 rcu_read_unlock(); 307 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
308 void *ctx;
309 if (tag >= nvmeq->q_depth) {
310 *fn = special_completion;
311 return CMD_CTX_INVALID;
312 }
313 if (fn)
314 *fn = cmd->fn;
315 ctx = cmd->ctx;
316 cmd->fn = special_completion;
317 cmd->ctx = CMD_CTX_COMPLETED;
318 return ctx;
342} 319}
343 320
344/** 321/**
@@ -348,26 +325,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
348 * 325 *
349 * Safe to use from interrupt context 326 * Safe to use from interrupt context
350 */ 327 */
351static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 328static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
352{ 329{
353 unsigned long flags; 330 u16 tail = nvmeq->sq_tail;
354 u16 tail; 331
355 spin_lock_irqsave(&nvmeq->q_lock, flags);
356 if (nvmeq->q_suspended) {
357 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
358 return -EBUSY;
359 }
360 tail = nvmeq->sq_tail;
361 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 332 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
362 if (++tail == nvmeq->q_depth) 333 if (++tail == nvmeq->q_depth)
363 tail = 0; 334 tail = 0;
364 writel(tail, nvmeq->q_db); 335 writel(tail, nvmeq->q_db);
365 nvmeq->sq_tail = tail; 336 nvmeq->sq_tail = tail;
366 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
367 337
368 return 0; 338 return 0;
369} 339}
370 340
341static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
342{
343 unsigned long flags;
344 int ret;
345 spin_lock_irqsave(&nvmeq->q_lock, flags);
346 ret = __nvme_submit_cmd(nvmeq, cmd);
347 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
348 return ret;
349}
350
371static __le64 **iod_list(struct nvme_iod *iod) 351static __le64 **iod_list(struct nvme_iod *iod)
372{ 352{
373 return ((void *)iod) + iod->offset; 353 return ((void *)iod) + iod->offset;
@@ -397,7 +377,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
397 iod->length = nbytes; 377 iod->length = nbytes;
398 iod->nents = 0; 378 iod->nents = 0;
399 iod->first_dma = 0ULL; 379 iod->first_dma = 0ULL;
400 iod->start_time = jiffies;
401 } 380 }
402 381
403 return iod; 382 return iod;
@@ -421,35 +400,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
421 kfree(iod); 400 kfree(iod);
422} 401}
423 402
424static void nvme_start_io_acct(struct bio *bio)
425{
426 struct gendisk *disk = bio->bi_bdev->bd_disk;
427 if (blk_queue_io_stat(disk->queue)) {
428 const int rw = bio_data_dir(bio);
429 int cpu = part_stat_lock();
430 part_round_stats(cpu, &disk->part0);
431 part_stat_inc(cpu, &disk->part0, ios[rw]);
432 part_stat_add(cpu, &disk->part0, sectors[rw],
433 bio_sectors(bio));
434 part_inc_in_flight(&disk->part0, rw);
435 part_stat_unlock();
436 }
437}
438
439static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
440{
441 struct gendisk *disk = bio->bi_bdev->bd_disk;
442 if (blk_queue_io_stat(disk->queue)) {
443 const int rw = bio_data_dir(bio);
444 unsigned long duration = jiffies - start_time;
445 int cpu = part_stat_lock();
446 part_stat_add(cpu, &disk->part0, ticks[rw], duration);
447 part_round_stats(cpu, &disk->part0);
448 part_dec_in_flight(&disk->part0, rw);
449 part_stat_unlock();
450 }
451}
452
453static int nvme_error_status(u16 status) 403static int nvme_error_status(u16 status)
454{ 404{
455 switch (status & 0x7ff) { 405 switch (status & 0x7ff) {
@@ -462,36 +412,37 @@ static int nvme_error_status(u16 status)
462 } 412 }
463} 413}
464 414
465static void bio_completion(struct nvme_queue *nvmeq, void *ctx, 415static void req_completion(struct nvme_queue *nvmeq, void *ctx,
466 struct nvme_completion *cqe) 416 struct nvme_completion *cqe)
467{ 417{
468 struct nvme_iod *iod = ctx; 418 struct nvme_iod *iod = ctx;
469 struct bio *bio = iod->private; 419 struct request *req = iod->private;
420 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
421
470 u16 status = le16_to_cpup(&cqe->status) >> 1; 422 u16 status = le16_to_cpup(&cqe->status) >> 1;
471 int error = 0;
472 423
473 if (unlikely(status)) { 424 if (unlikely(status)) {
474 if (!(status & NVME_SC_DNR || 425 if (!(status & NVME_SC_DNR || blk_noretry_request(req))
475 bio->bi_rw & REQ_FAILFAST_MASK) && 426 && (jiffies - req->start_time) < req->timeout) {
476 (jiffies - iod->start_time) < IOD_TIMEOUT) { 427 blk_mq_requeue_request(req);
477 if (!waitqueue_active(&nvmeq->sq_full)) 428 blk_mq_kick_requeue_list(req->q);
478 add_wait_queue(&nvmeq->sq_full,
479 &nvmeq->sq_cong_wait);
480 list_add_tail(&iod->node, &nvmeq->iod_bio);
481 wake_up(&nvmeq->sq_full);
482 return; 429 return;
483 } 430 }
484 error = nvme_error_status(status); 431 req->errors = nvme_error_status(status);
485 } 432 } else
486 if (iod->nents) { 433 req->errors = 0;
487 dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, 434
488 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 435 if (cmd_rq->aborted)
489 nvme_end_io_acct(bio, iod->start_time); 436 dev_warn(&nvmeq->dev->pci_dev->dev,
490 } 437 "completing aborted command with status:%04x\n",
438 status);
439
440 if (iod->nents)
441 dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
442 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
491 nvme_free_iod(nvmeq->dev, iod); 443 nvme_free_iod(nvmeq->dev, iod);
492 444
493 trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); 445 blk_mq_complete_request(req);
494 bio_endio(bio, error);
495} 446}
496 447
497/* length is in bytes. gfp flags indicates whether we may sleep. */ 448/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -574,88 +525,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
574 return total_len; 525 return total_len;
575} 526}
576 527
577static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, 528/*
578 int len) 529 * We reuse the small pool to allocate the 16-byte range here as it is not
579{ 530 * worth having a special pool for these or additional cases to handle freeing
580 struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); 531 * the iod.
581 if (!split) 532 */
582 return -ENOMEM; 533static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
583 534 struct request *req, struct nvme_iod *iod)
584 trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
585 split->bi_iter.bi_sector);
586 bio_chain(split, bio);
587
588 if (!waitqueue_active(&nvmeq->sq_full))
589 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
590 bio_list_add(&nvmeq->sq_cong, split);
591 bio_list_add(&nvmeq->sq_cong, bio);
592 wake_up(&nvmeq->sq_full);
593
594 return 0;
595}
596
597/* NVMe scatterlists require no holes in the virtual address */
598#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
599 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
600
601static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
602 struct bio *bio, enum dma_data_direction dma_dir, int psegs)
603{
604 struct bio_vec bvec, bvprv;
605 struct bvec_iter iter;
606 struct scatterlist *sg = NULL;
607 int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
608 int first = 1;
609
610 if (nvmeq->dev->stripe_size)
611 split_len = nvmeq->dev->stripe_size -
612 ((bio->bi_iter.bi_sector << 9) &
613 (nvmeq->dev->stripe_size - 1));
614
615 sg_init_table(iod->sg, psegs);
616 bio_for_each_segment(bvec, bio, iter) {
617 if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
618 sg->length += bvec.bv_len;
619 } else {
620 if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
621 return nvme_split_and_submit(bio, nvmeq,
622 length);
623
624 sg = sg ? sg + 1 : iod->sg;
625 sg_set_page(sg, bvec.bv_page,
626 bvec.bv_len, bvec.bv_offset);
627 nsegs++;
628 }
629
630 if (split_len - length < bvec.bv_len)
631 return nvme_split_and_submit(bio, nvmeq, split_len);
632 length += bvec.bv_len;
633 bvprv = bvec;
634 first = 0;
635 }
636 iod->nents = nsegs;
637 sg_mark_end(sg);
638 if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
639 return -ENOMEM;
640
641 BUG_ON(length != bio->bi_iter.bi_size);
642 return length;
643}
644
645static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
646 struct bio *bio, struct nvme_iod *iod, int cmdid)
647{ 535{
648 struct nvme_dsm_range *range = 536 struct nvme_dsm_range *range =
649 (struct nvme_dsm_range *)iod_list(iod)[0]; 537 (struct nvme_dsm_range *)iod_list(iod)[0];
650 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 538 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
651 539
652 range->cattr = cpu_to_le32(0); 540 range->cattr = cpu_to_le32(0);
653 range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); 541 range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
654 range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 542 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
655 543
656 memset(cmnd, 0, sizeof(*cmnd)); 544 memset(cmnd, 0, sizeof(*cmnd));
657 cmnd->dsm.opcode = nvme_cmd_dsm; 545 cmnd->dsm.opcode = nvme_cmd_dsm;
658 cmnd->dsm.command_id = cmdid; 546 cmnd->dsm.command_id = req->tag;
659 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 547 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
660 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); 548 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
661 cmnd->dsm.nr = 0; 549 cmnd->dsm.nr = 0;
@@ -664,11 +552,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
664 if (++nvmeq->sq_tail == nvmeq->q_depth) 552 if (++nvmeq->sq_tail == nvmeq->q_depth)
665 nvmeq->sq_tail = 0; 553 nvmeq->sq_tail = 0;
666 writel(nvmeq->sq_tail, nvmeq->q_db); 554 writel(nvmeq->sq_tail, nvmeq->q_db);
667
668 return 0;
669} 555}
670 556
671static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 557static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
672 int cmdid) 558 int cmdid)
673{ 559{
674 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 560 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
@@ -681,49 +567,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
681 if (++nvmeq->sq_tail == nvmeq->q_depth) 567 if (++nvmeq->sq_tail == nvmeq->q_depth)
682 nvmeq->sq_tail = 0; 568 nvmeq->sq_tail = 0;
683 writel(nvmeq->sq_tail, nvmeq->q_db); 569 writel(nvmeq->sq_tail, nvmeq->q_db);
684
685 return 0;
686} 570}
687 571
688static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) 572static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
573 struct nvme_ns *ns)
689{ 574{
690 struct bio *bio = iod->private; 575 struct request *req = iod->private;
691 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
692 struct nvme_command *cmnd; 576 struct nvme_command *cmnd;
693 int cmdid; 577 u16 control = 0;
694 u16 control; 578 u32 dsmgmt = 0;
695 u32 dsmgmt;
696 579
697 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); 580 if (req->cmd_flags & REQ_FUA)
698 if (unlikely(cmdid < 0))
699 return cmdid;
700
701 if (bio->bi_rw & REQ_DISCARD)
702 return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
703 if (bio->bi_rw & REQ_FLUSH)
704 return nvme_submit_flush(nvmeq, ns, cmdid);
705
706 control = 0;
707 if (bio->bi_rw & REQ_FUA)
708 control |= NVME_RW_FUA; 581 control |= NVME_RW_FUA;
709 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 582 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
710 control |= NVME_RW_LR; 583 control |= NVME_RW_LR;
711 584
712 dsmgmt = 0; 585 if (req->cmd_flags & REQ_RAHEAD)
713 if (bio->bi_rw & REQ_RAHEAD)
714 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 586 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
715 587
716 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 588 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
717 memset(cmnd, 0, sizeof(*cmnd)); 589 memset(cmnd, 0, sizeof(*cmnd));
718 590
719 cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; 591 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
720 cmnd->rw.command_id = cmdid; 592 cmnd->rw.command_id = req->tag;
721 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 593 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
722 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 594 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
723 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); 595 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
724 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 596 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
725 cmnd->rw.length = 597 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
726 cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
727 cmnd->rw.control = cpu_to_le16(control); 598 cmnd->rw.control = cpu_to_le16(control);
728 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 599 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
729 600
@@ -734,47 +605,37 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
734 return 0; 605 return 0;
735} 606}
736 607
737static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) 608static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
738{ 609 const struct blk_mq_queue_data *bd)
739 struct bio *split = bio_clone(bio, GFP_ATOMIC);
740 if (!split)
741 return -ENOMEM;
742
743 split->bi_iter.bi_size = 0;
744 split->bi_phys_segments = 0;
745 bio->bi_rw &= ~REQ_FLUSH;
746 bio_chain(split, bio);
747
748 if (!waitqueue_active(&nvmeq->sq_full))
749 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
750 bio_list_add(&nvmeq->sq_cong, split);
751 bio_list_add(&nvmeq->sq_cong, bio);
752 wake_up_process(nvme_thread);
753
754 return 0;
755}
756
757/*
758 * Called with local interrupts disabled and the q_lock held. May not sleep.
759 */
760static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
761 struct bio *bio)
762{ 610{
611 struct nvme_ns *ns = hctx->queue->queuedata;
612 struct nvme_queue *nvmeq = hctx->driver_data;
613 struct request *req = bd->rq;
614 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
763 struct nvme_iod *iod; 615 struct nvme_iod *iod;
764 int psegs = bio_phys_segments(ns->queue, bio); 616 int psegs = req->nr_phys_segments;
765 int result; 617 int result = BLK_MQ_RQ_QUEUE_BUSY;
766 unsigned size = !(bio->bi_rw & REQ_DISCARD) ? bio->bi_iter.bi_size : 618 enum dma_data_direction dma_dir;
619 unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
767 sizeof(struct nvme_dsm_range); 620 sizeof(struct nvme_dsm_range);
768 621
769 if ((bio->bi_rw & REQ_FLUSH) && psegs) 622 /*
770 return nvme_split_flush_data(nvmeq, bio); 623 * Requeued IO has already been prepped
624 */
625 iod = req->special;
626 if (iod)
627 goto submit_iod;
771 628
772 iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); 629 iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
773 if (!iod) 630 if (!iod)
774 return -ENOMEM; 631 return result;
632
633 iod->private = req;
634 req->special = iod;
775 635
776 iod->private = bio; 636 nvme_set_info(cmd, iod, req_completion);
777 if (bio->bi_rw & REQ_DISCARD) { 637
638 if (req->cmd_flags & REQ_DISCARD) {
778 void *range; 639 void *range;
779 /* 640 /*
780 * We reuse the small pool to allocate the 16-byte range here 641 * We reuse the small pool to allocate the 16-byte range here
@@ -784,33 +645,45 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
784 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, 645 range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
785 GFP_ATOMIC, 646 GFP_ATOMIC,
786 &iod->first_dma); 647 &iod->first_dma);
787 if (!range) { 648 if (!range)
788 result = -ENOMEM; 649 goto finish_cmd;
789 goto free_iod;
790 }
791 iod_list(iod)[0] = (__le64 *)range; 650 iod_list(iod)[0] = (__le64 *)range;
792 iod->npages = 0; 651 iod->npages = 0;
793 } else if (psegs) { 652 } else if (psegs) {
794 result = nvme_map_bio(nvmeq, iod, bio, 653 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
795 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, 654
796 psegs); 655 sg_init_table(iod->sg, psegs);
797 if (result <= 0) 656 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
798 goto free_iod; 657 if (!iod->nents) {
799 if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != 658 result = BLK_MQ_RQ_QUEUE_ERROR;
800 result) { 659 goto finish_cmd;
801 result = -ENOMEM;
802 goto free_iod;
803 } 660 }
804 nvme_start_io_acct(bio); 661
805 } 662 if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
806 if (unlikely(nvme_submit_iod(nvmeq, iod))) { 663 goto finish_cmd;
807 if (!waitqueue_active(&nvmeq->sq_full)) 664
808 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 665 if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
809 list_add_tail(&iod->node, &nvmeq->iod_bio); 666 blk_rq_bytes(req), GFP_ATOMIC))
667 goto finish_cmd;
810 } 668 }
811 return 0;
812 669
813 free_iod: 670 blk_mq_start_request(req);
671
672 submit_iod:
673 spin_lock_irq(&nvmeq->q_lock);
674 if (req->cmd_flags & REQ_DISCARD)
675 nvme_submit_discard(nvmeq, ns, req, iod);
676 else if (req->cmd_flags & REQ_FLUSH)
677 nvme_submit_flush(nvmeq, ns, req->tag);
678 else
679 nvme_submit_iod(nvmeq, iod, ns);
680
681 nvme_process_cq(nvmeq);
682 spin_unlock_irq(&nvmeq->q_lock);
683 return BLK_MQ_RQ_QUEUE_OK;
684
685 finish_cmd:
686 nvme_finish_cmd(nvmeq, req->tag, NULL);
814 nvme_free_iod(nvmeq->dev, iod); 687 nvme_free_iod(nvmeq->dev, iod);
815 return result; 688 return result;
816} 689}
@@ -833,8 +706,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
833 head = 0; 706 head = 0;
834 phase = !phase; 707 phase = !phase;
835 } 708 }
836 709 ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
837 ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
838 fn(nvmeq, ctx, &cqe); 710 fn(nvmeq, ctx, &cqe);
839 } 711 }
840 712
@@ -855,29 +727,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
855 return 1; 727 return 1;
856} 728}
857 729
858static void nvme_make_request(struct request_queue *q, struct bio *bio) 730/* Admin queue isn't initialized as a request queue. If at some point this
731 * happens anyway, make sure to notify the user */
732static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
733 const struct blk_mq_queue_data *bd)
859{ 734{
860 struct nvme_ns *ns = q->queuedata; 735 WARN_ON_ONCE(1);
861 struct nvme_queue *nvmeq = get_nvmeq(ns->dev); 736 return BLK_MQ_RQ_QUEUE_ERROR;
862 int result = -EBUSY;
863
864 if (!nvmeq) {
865 bio_endio(bio, -EIO);
866 return;
867 }
868
869 spin_lock_irq(&nvmeq->q_lock);
870 if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
871 result = nvme_submit_bio_queue(nvmeq, ns, bio);
872 if (unlikely(result)) {
873 if (!waitqueue_active(&nvmeq->sq_full))
874 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
875 bio_list_add(&nvmeq->sq_cong, bio);
876 }
877
878 nvme_process_cq(nvmeq);
879 spin_unlock_irq(&nvmeq->q_lock);
880 put_nvmeq(nvmeq);
881} 737}
882 738
883static irqreturn_t nvme_irq(int irq, void *data) 739static irqreturn_t nvme_irq(int irq, void *data)
@@ -901,10 +757,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
901 return IRQ_WAKE_THREAD; 757 return IRQ_WAKE_THREAD;
902} 758}
903 759
904static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 760static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info *
761 cmd_info)
905{ 762{
906 spin_lock_irq(&nvmeq->q_lock); 763 spin_lock_irq(&nvmeq->q_lock);
907 cancel_cmdid(nvmeq, cmdid, NULL); 764 cancel_cmd_info(cmd_info, NULL);
908 spin_unlock_irq(&nvmeq->q_lock); 765 spin_unlock_irq(&nvmeq->q_lock);
909} 766}
910 767
@@ -927,45 +784,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
927 * Returns 0 on success. If the result is negative, it's a Linux error code; 784 * Returns 0 on success. If the result is negative, it's a Linux error code;
928 * if the result is positive, it's an NVM Express status code 785 * if the result is positive, it's an NVM Express status code
929 */ 786 */
930static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, 787static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
931 struct nvme_command *cmd,
932 u32 *result, unsigned timeout) 788 u32 *result, unsigned timeout)
933{ 789{
934 int cmdid, ret; 790 int ret;
935 struct sync_cmd_info cmdinfo; 791 struct sync_cmd_info cmdinfo;
936 struct nvme_queue *nvmeq; 792 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
937 793 struct nvme_queue *nvmeq = cmd_rq->nvmeq;
938 nvmeq = lock_nvmeq(dev, q_idx);
939 if (!nvmeq)
940 return -ENODEV;
941 794
942 cmdinfo.task = current; 795 cmdinfo.task = current;
943 cmdinfo.status = -EINTR; 796 cmdinfo.status = -EINTR;
944 797
945 cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); 798 cmd->common.command_id = req->tag;
946 if (cmdid < 0) { 799
947 unlock_nvmeq(nvmeq); 800 nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
948 return cmdid;
949 }
950 cmd->common.command_id = cmdid;
951 801
952 set_current_state(TASK_KILLABLE); 802 set_current_state(TASK_KILLABLE);
953 ret = nvme_submit_cmd(nvmeq, cmd); 803 ret = nvme_submit_cmd(nvmeq, cmd);
954 if (ret) { 804 if (ret) {
955 free_cmdid(nvmeq, cmdid, NULL); 805 nvme_finish_cmd(nvmeq, req->tag, NULL);
956 unlock_nvmeq(nvmeq);
957 set_current_state(TASK_RUNNING); 806 set_current_state(TASK_RUNNING);
958 return ret;
959 } 807 }
960 unlock_nvmeq(nvmeq);
961 schedule_timeout(timeout); 808 schedule_timeout(timeout);
962 809
963 if (cmdinfo.status == -EINTR) { 810 if (cmdinfo.status == -EINTR) {
964 nvmeq = lock_nvmeq(dev, q_idx); 811 nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
965 if (nvmeq) {
966 nvme_abort_command(nvmeq, cmdid);
967 unlock_nvmeq(nvmeq);
968 }
969 return -EINTR; 812 return -EINTR;
970 } 813 }
971 814
@@ -975,59 +818,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
975 return cmdinfo.status; 818 return cmdinfo.status;
976} 819}
977 820
978static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, 821static int nvme_submit_async_admin_req(struct nvme_dev *dev)
822{
823 struct nvme_queue *nvmeq = dev->queues[0];
824 struct nvme_command c;
825 struct nvme_cmd_info *cmd_info;
826 struct request *req;
827
828 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
829 if (!req)
830 return -ENOMEM;
831
832 cmd_info = blk_mq_rq_to_pdu(req);
833 nvme_set_info(cmd_info, req, async_req_completion);
834
835 memset(&c, 0, sizeof(c));
836 c.common.opcode = nvme_admin_async_event;
837 c.common.command_id = req->tag;
838
839 return __nvme_submit_cmd(nvmeq, &c);
840}
841
842static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
979 struct nvme_command *cmd, 843 struct nvme_command *cmd,
980 struct async_cmd_info *cmdinfo, unsigned timeout) 844 struct async_cmd_info *cmdinfo, unsigned timeout)
981{ 845{
982 int cmdid; 846 struct nvme_queue *nvmeq = dev->queues[0];
847 struct request *req;
848 struct nvme_cmd_info *cmd_rq;
983 849
984 cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); 850 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
985 if (cmdid < 0) 851 if (!req)
986 return cmdid; 852 return -ENOMEM;
853
854 req->timeout = timeout;
855 cmd_rq = blk_mq_rq_to_pdu(req);
856 cmdinfo->req = req;
857 nvme_set_info(cmd_rq, cmdinfo, async_completion);
987 cmdinfo->status = -EINTR; 858 cmdinfo->status = -EINTR;
988 cmd->common.command_id = cmdid; 859
860 cmd->common.command_id = req->tag;
861
989 return nvme_submit_cmd(nvmeq, cmd); 862 return nvme_submit_cmd(nvmeq, cmd);
990} 863}
991 864
992int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 865int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
993 u32 *result) 866 u32 *result, unsigned timeout)
994{ 867{
995 return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); 868 int res;
869 struct request *req;
870
871 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
872 if (!req)
873 return -ENOMEM;
874 res = nvme_submit_sync_cmd(req, cmd, result, timeout);
875 blk_put_request(req);
876 return res;
996} 877}
997 878
998int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 879int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
999 u32 *result) 880 u32 *result)
1000{ 881{
1001 return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, 882 return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
1002 result, NVME_IO_TIMEOUT);
1003} 883}
1004 884
1005static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, 885int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1006 struct nvme_command *cmd, struct async_cmd_info *cmdinfo) 886 struct nvme_command *cmd, u32 *result)
1007{ 887{
1008 return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, 888 int res;
1009 ADMIN_TIMEOUT); 889 struct request *req;
890
891 req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
892 false);
893 if (!req)
894 return -ENOMEM;
895 res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
896 blk_put_request(req);
897 return res;
1010} 898}
1011 899
1012static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 900static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1013{ 901{
1014 int status;
1015 struct nvme_command c; 902 struct nvme_command c;
1016 903
1017 memset(&c, 0, sizeof(c)); 904 memset(&c, 0, sizeof(c));
1018 c.delete_queue.opcode = opcode; 905 c.delete_queue.opcode = opcode;
1019 c.delete_queue.qid = cpu_to_le16(id); 906 c.delete_queue.qid = cpu_to_le16(id);
1020 907
1021 status = nvme_submit_admin_cmd(dev, &c, NULL); 908 return nvme_submit_admin_cmd(dev, &c, NULL);
1022 if (status)
1023 return -EIO;
1024 return 0;
1025} 909}
1026 910
1027static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 911static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1028 struct nvme_queue *nvmeq) 912 struct nvme_queue *nvmeq)
1029{ 913{
1030 int status;
1031 struct nvme_command c; 914 struct nvme_command c;
1032 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 915 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
1033 916
@@ -1039,16 +922,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1039 c.create_cq.cq_flags = cpu_to_le16(flags); 922 c.create_cq.cq_flags = cpu_to_le16(flags);
1040 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 923 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
1041 924
1042 status = nvme_submit_admin_cmd(dev, &c, NULL); 925 return nvme_submit_admin_cmd(dev, &c, NULL);
1043 if (status)
1044 return -EIO;
1045 return 0;
1046} 926}
1047 927
1048static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 928static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1049 struct nvme_queue *nvmeq) 929 struct nvme_queue *nvmeq)
1050{ 930{
1051 int status;
1052 struct nvme_command c; 931 struct nvme_command c;
1053 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 932 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
1054 933
@@ -1060,10 +939,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1060 c.create_sq.sq_flags = cpu_to_le16(flags); 939 c.create_sq.sq_flags = cpu_to_le16(flags);
1061 c.create_sq.cqid = cpu_to_le16(qid); 940 c.create_sq.cqid = cpu_to_le16(qid);
1062 941
1063 status = nvme_submit_admin_cmd(dev, &c, NULL); 942 return nvme_submit_admin_cmd(dev, &c, NULL);
1064 if (status)
1065 return -EIO;
1066 return 0;
1067} 943}
1068 944
1069static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 945static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1119,28 +995,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1119} 995}
1120 996
1121/** 997/**
1122 * nvme_abort_cmd - Attempt aborting a command 998 * nvme_abort_req - Attempt aborting a request
1123 * @cmdid: Command id of a timed out IO
1124 * @queue: The queue with timed out IO
1125 * 999 *
1126 * Schedule controller reset if the command was already aborted once before and 1000 * Schedule controller reset if the command was already aborted once before and
1127 * still hasn't been returned to the driver, or if this is the admin queue. 1001 * still hasn't been returned to the driver, or if this is the admin queue.
1128 */ 1002 */
1129static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) 1003static void nvme_abort_req(struct request *req)
1130{ 1004{
1131 int a_cmdid; 1005 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
1132 struct nvme_command cmd; 1006 struct nvme_queue *nvmeq = cmd_rq->nvmeq;
1133 struct nvme_dev *dev = nvmeq->dev; 1007 struct nvme_dev *dev = nvmeq->dev;
1134 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 1008 struct request *abort_req;
1135 struct nvme_queue *adminq; 1009 struct nvme_cmd_info *abort_cmd;
1010 struct nvme_command cmd;
1136 1011
1137 if (!nvmeq->qid || info[cmdid].aborted) { 1012 if (!nvmeq->qid || cmd_rq->aborted) {
1138 if (work_busy(&dev->reset_work)) 1013 if (work_busy(&dev->reset_work))
1139 return; 1014 return;
1140 list_del_init(&dev->node); 1015 list_del_init(&dev->node);
1141 dev_warn(&dev->pci_dev->dev, 1016 dev_warn(&dev->pci_dev->dev,
1142 "I/O %d QID %d timeout, reset controller\n", cmdid, 1017 "I/O %d QID %d timeout, reset controller\n",
1143 nvmeq->qid); 1018 req->tag, nvmeq->qid);
1144 dev->reset_workfn = nvme_reset_failed_dev; 1019 dev->reset_workfn = nvme_reset_failed_dev;
1145 queue_work(nvme_workq, &dev->reset_work); 1020 queue_work(nvme_workq, &dev->reset_work);
1146 return; 1021 return;
@@ -1149,89 +1024,79 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
1149 if (!dev->abort_limit) 1024 if (!dev->abort_limit)
1150 return; 1025 return;
1151 1026
1152 adminq = rcu_dereference(dev->queues[0]); 1027 abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
1153 a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, 1028 false);
1154 ADMIN_TIMEOUT); 1029 if (!abort_req)
1155 if (a_cmdid < 0)
1156 return; 1030 return;
1157 1031
1032 abort_cmd = blk_mq_rq_to_pdu(abort_req);
1033 nvme_set_info(abort_cmd, abort_req, abort_completion);
1034
1158 memset(&cmd, 0, sizeof(cmd)); 1035 memset(&cmd, 0, sizeof(cmd));
1159 cmd.abort.opcode = nvme_admin_abort_cmd; 1036 cmd.abort.opcode = nvme_admin_abort_cmd;
1160 cmd.abort.cid = cmdid; 1037 cmd.abort.cid = req->tag;
1161 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 1038 cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
1162 cmd.abort.command_id = a_cmdid; 1039 cmd.abort.command_id = abort_req->tag;
1163 1040
1164 --dev->abort_limit; 1041 --dev->abort_limit;
1165 info[cmdid].aborted = 1; 1042 cmd_rq->aborted = 1;
1166 info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
1167 1043
1168 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, 1044 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
1169 nvmeq->qid); 1045 nvmeq->qid);
1170 nvme_submit_cmd(adminq, &cmd); 1046 if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
1047 dev_warn(nvmeq->q_dmadev,
1048 "Could not abort I/O %d QID %d",
1049 req->tag, nvmeq->qid);
1050 blk_put_request(req);
1051 }
1171} 1052}
1172 1053
1173/** 1054static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
1174 * nvme_cancel_ios - Cancel outstanding I/Os 1055 struct request *req, void *data, bool reserved)
1175 * @queue: The queue to cancel I/Os on
1176 * @timeout: True to only cancel I/Os which have timed out
1177 */
1178static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
1179{ 1056{
1180 int depth = nvmeq->q_depth - 1; 1057 struct nvme_queue *nvmeq = data;
1181 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 1058 void *ctx;
1182 unsigned long now = jiffies; 1059 nvme_completion_fn fn;
1183 int cmdid; 1060 struct nvme_cmd_info *cmd;
1061 static struct nvme_completion cqe = {
1062 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
1063 };
1184 1064
1185 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { 1065 cmd = blk_mq_rq_to_pdu(req);
1186 void *ctx;
1187 nvme_completion_fn fn;
1188 static struct nvme_completion cqe = {
1189 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
1190 };
1191 1066
1192 if (timeout && !time_after(now, info[cmdid].timeout)) 1067 if (cmd->ctx == CMD_CTX_CANCELLED)
1193 continue; 1068 return;
1194 if (info[cmdid].ctx == CMD_CTX_CANCELLED) 1069
1195 continue; 1070 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
1196 if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) 1071 req->tag, nvmeq->qid);
1197 continue; 1072 ctx = cancel_cmd_info(cmd, &fn);
1198 if (timeout && nvmeq->dev->initialized) { 1073 fn(nvmeq, ctx, &cqe);
1199 nvme_abort_cmd(cmdid, nvmeq);
1200 continue;
1201 }
1202 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
1203 nvmeq->qid);
1204 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
1205 fn(nvmeq, ctx, &cqe);
1206 }
1207} 1074}
1208 1075
1209static void nvme_free_queue(struct nvme_queue *nvmeq) 1076static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1210{ 1077{
1211 spin_lock_irq(&nvmeq->q_lock); 1078 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
1212 while (bio_list_peek(&nvmeq->sq_cong)) { 1079 struct nvme_queue *nvmeq = cmd->nvmeq;
1213 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1080
1214 bio_endio(bio, -EIO); 1081 dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
1215 } 1082 nvmeq->qid);
1216 while (!list_empty(&nvmeq->iod_bio)) { 1083 if (nvmeq->dev->initialized)
1217 static struct nvme_completion cqe = { 1084 nvme_abort_req(req);
1218 .status = cpu_to_le16( 1085
1219 (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), 1086 /*
1220 }; 1087 * The aborted req will be completed on receiving the abort req.
1221 struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, 1088 * We enable the timer again. If hit twice, it'll cause a device reset,
1222 struct nvme_iod, 1089 * as the device then is in a faulty state.
1223 node); 1090 */
1224 list_del(&iod->node); 1091 return BLK_EH_RESET_TIMER;
1225 bio_completion(nvmeq, iod, &cqe); 1092}
1226 }
1227 spin_unlock_irq(&nvmeq->q_lock);
1228 1093
1094static void nvme_free_queue(struct nvme_queue *nvmeq)
1095{
1229 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1096 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1230 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1097 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1231 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1098 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1232 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1099 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1233 if (nvmeq->qid)
1234 free_cpumask_var(nvmeq->cpu_mask);
1235 kfree(nvmeq); 1100 kfree(nvmeq);
1236} 1101}
1237 1102
@@ -1243,10 +1108,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1243 int i; 1108 int i;
1244 1109
1245 for (i = dev->queue_count - 1; i >= lowest; i--) { 1110 for (i = dev->queue_count - 1; i >= lowest; i--) {
1246 nvmeq = raw_nvmeq(dev, i); 1111 struct nvme_queue *nvmeq = dev->queues[i];
1247 RCU_INIT_POINTER(dev->queues[i], NULL);
1248 llist_add(&nvmeq->node, &q_list); 1112 llist_add(&nvmeq->node, &q_list);
1249 dev->queue_count--; 1113 dev->queue_count--;
1114 dev->queues[i] = NULL;
1250 } 1115 }
1251 synchronize_rcu(); 1116 synchronize_rcu();
1252 entry = llist_del_all(&q_list); 1117 entry = llist_del_all(&q_list);
@@ -1257,19 +1122,12 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1257/** 1122/**
1258 * nvme_suspend_queue - put queue into suspended state 1123 * nvme_suspend_queue - put queue into suspended state
1259 * @nvmeq - queue to suspend 1124 * @nvmeq - queue to suspend
1260 *
1261 * Returns 1 if already suspended, 0 otherwise.
1262 */ 1125 */
1263static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1126static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1264{ 1127{
1265 int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; 1128 int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
1266 1129
1267 spin_lock_irq(&nvmeq->q_lock); 1130 spin_lock_irq(&nvmeq->q_lock);
1268 if (nvmeq->q_suspended) {
1269 spin_unlock_irq(&nvmeq->q_lock);
1270 return 1;
1271 }
1272 nvmeq->q_suspended = 1;
1273 nvmeq->dev->online_queues--; 1131 nvmeq->dev->online_queues--;
1274 spin_unlock_irq(&nvmeq->q_lock); 1132 spin_unlock_irq(&nvmeq->q_lock);
1275 1133
@@ -1281,15 +1139,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1281 1139
1282static void nvme_clear_queue(struct nvme_queue *nvmeq) 1140static void nvme_clear_queue(struct nvme_queue *nvmeq)
1283{ 1141{
1142 struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
1143
1284 spin_lock_irq(&nvmeq->q_lock); 1144 spin_lock_irq(&nvmeq->q_lock);
1285 nvme_process_cq(nvmeq); 1145 nvme_process_cq(nvmeq);
1286 nvme_cancel_ios(nvmeq, false); 1146 if (hctx && hctx->tags)
1147 blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
1287 spin_unlock_irq(&nvmeq->q_lock); 1148 spin_unlock_irq(&nvmeq->q_lock);
1288} 1149}
1289 1150
1290static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1151static void nvme_disable_queue(struct nvme_dev *dev, int qid)
1291{ 1152{
1292 struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); 1153 struct nvme_queue *nvmeq = dev->queues[qid];
1293 1154
1294 if (!nvmeq) 1155 if (!nvmeq)
1295 return; 1156 return;
@@ -1309,8 +1170,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1309 int depth, int vector) 1170 int depth, int vector)
1310{ 1171{
1311 struct device *dmadev = &dev->pci_dev->dev; 1172 struct device *dmadev = &dev->pci_dev->dev;
1312 unsigned extra = nvme_queue_extra(depth); 1173 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
1313 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
1314 if (!nvmeq) 1174 if (!nvmeq)
1315 return NULL; 1175 return NULL;
1316 1176
@@ -1324,9 +1184,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1324 if (!nvmeq->sq_cmds) 1184 if (!nvmeq->sq_cmds)
1325 goto free_cqdma; 1185 goto free_cqdma;
1326 1186
1327 if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
1328 goto free_sqdma;
1329
1330 nvmeq->q_dmadev = dmadev; 1187 nvmeq->q_dmadev = dmadev;
1331 nvmeq->dev = dev; 1188 nvmeq->dev = dev;
1332 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1189 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
@@ -1334,22 +1191,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1334 spin_lock_init(&nvmeq->q_lock); 1191 spin_lock_init(&nvmeq->q_lock);
1335 nvmeq->cq_head = 0; 1192 nvmeq->cq_head = 0;
1336 nvmeq->cq_phase = 1; 1193 nvmeq->cq_phase = 1;
1337 init_waitqueue_head(&nvmeq->sq_full);
1338 bio_list_init(&nvmeq->sq_cong);
1339 INIT_LIST_HEAD(&nvmeq->iod_bio);
1340 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1194 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1341 nvmeq->q_depth = depth; 1195 nvmeq->q_depth = depth;
1342 nvmeq->cq_vector = vector; 1196 nvmeq->cq_vector = vector;
1343 nvmeq->qid = qid; 1197 nvmeq->qid = qid;
1344 nvmeq->q_suspended = 1;
1345 dev->queue_count++; 1198 dev->queue_count++;
1346 rcu_assign_pointer(dev->queues[qid], nvmeq); 1199 dev->queues[qid] = nvmeq;
1347 1200
1348 return nvmeq; 1201 return nvmeq;
1349 1202
1350 free_sqdma:
1351 dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
1352 nvmeq->sq_dma_addr);
1353 free_cqdma: 1203 free_cqdma:
1354 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1204 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1355 nvmeq->cq_dma_addr); 1205 nvmeq->cq_dma_addr);
@@ -1372,18 +1222,13 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1372static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1222static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
1373{ 1223{
1374 struct nvme_dev *dev = nvmeq->dev; 1224 struct nvme_dev *dev = nvmeq->dev;
1375 unsigned extra = nvme_queue_extra(nvmeq->q_depth);
1376 1225
1377 spin_lock_irq(&nvmeq->q_lock); 1226 spin_lock_irq(&nvmeq->q_lock);
1378 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
1379 nvmeq->sq_tail = 0; 1227 nvmeq->sq_tail = 0;
1380 nvmeq->cq_head = 0; 1228 nvmeq->cq_head = 0;
1381 nvmeq->cq_phase = 1; 1229 nvmeq->cq_phase = 1;
1382 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1230 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1383 memset(nvmeq->cmdid_data, 0, extra);
1384 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1231 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1385 nvme_cancel_ios(nvmeq, false);
1386 nvmeq->q_suspended = 0;
1387 dev->online_queues++; 1232 dev->online_queues++;
1388 spin_unlock_irq(&nvmeq->q_lock); 1233 spin_unlock_irq(&nvmeq->q_lock);
1389} 1234}
@@ -1486,6 +1331,52 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1486 return 0; 1331 return 0;
1487} 1332}
1488 1333
1334static struct blk_mq_ops nvme_mq_admin_ops = {
1335 .queue_rq = nvme_admin_queue_rq,
1336 .map_queue = blk_mq_map_queue,
1337 .init_hctx = nvme_admin_init_hctx,
1338 .init_request = nvme_admin_init_request,
1339 .timeout = nvme_timeout,
1340};
1341
1342static struct blk_mq_ops nvme_mq_ops = {
1343 .queue_rq = nvme_queue_rq,
1344 .map_queue = blk_mq_map_queue,
1345 .init_hctx = nvme_init_hctx,
1346 .init_request = nvme_init_request,
1347 .timeout = nvme_timeout,
1348};
1349
1350static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1351{
1352 if (!dev->admin_q) {
1353 dev->admin_tagset.ops = &nvme_mq_admin_ops;
1354 dev->admin_tagset.nr_hw_queues = 1;
1355 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
1356 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1357 dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
1358 dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
1359 dev->admin_tagset.driver_data = dev;
1360
1361 if (blk_mq_alloc_tag_set(&dev->admin_tagset))
1362 return -ENOMEM;
1363
1364 dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
1365 if (!dev->admin_q) {
1366 blk_mq_free_tag_set(&dev->admin_tagset);
1367 return -ENOMEM;
1368 }
1369 }
1370
1371 return 0;
1372}
1373
1374static void nvme_free_admin_tags(struct nvme_dev *dev)
1375{
1376 if (dev->admin_q)
1377 blk_mq_free_tag_set(&dev->admin_tagset);
1378}
1379
1489static int nvme_configure_admin_queue(struct nvme_dev *dev) 1380static int nvme_configure_admin_queue(struct nvme_dev *dev)
1490{ 1381{
1491 int result; 1382 int result;
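The two blk_mq_ops tables and nvme_alloc_admin_tags() above are the heart of the conversion: the admin queue becomes an ordinary blk-mq queue whose tag set reserves per-request driver data through cmd_size. Below is a minimal sketch of the same tag-set setup, using hypothetical my_* names and leaving the ops table (queue_rq, timeout, ...) to be filled in elsewhere, since those prototypes track the blk-mq version being built against.

	#include <linux/blk-mq.h>
	#include <linux/blkdev.h>
	#include <linux/err.h>
	#include <linux/string.h>

	/* Hypothetical per-command context; blk-mq allocates it per request. */
	struct my_cmd_ctx {
		u32 opcode;
	};

	/* Hypothetical driver state. */
	struct my_dev {
		struct blk_mq_tag_set	admin_tagset;
		struct request_queue	*admin_q;
	};

	/* Assumed to be populated elsewhere with queue_rq/timeout handlers. */
	static struct blk_mq_ops my_admin_mq_ops;

	static int my_alloc_admin_tags(struct my_dev *dev)
	{
		memset(&dev->admin_tagset, 0, sizeof(dev->admin_tagset));
		dev->admin_tagset.ops = &my_admin_mq_ops;
		dev->admin_tagset.nr_hw_queues = 1;	/* one admin hw queue */
		dev->admin_tagset.queue_depth = 63;	/* one tag held back, as in the hunk above */
		dev->admin_tagset.cmd_size = sizeof(struct my_cmd_ctx);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR_OR_NULL(dev->admin_q)) {	/* failure convention differs across kernel versions */
			blk_mq_free_tag_set(&dev->admin_tagset);
			dev->admin_q = NULL;
			return -ENOMEM;
		}
		return 0;
	}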
@@ -1515,9 +1406,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1515 if (result < 0) 1406 if (result < 0)
1516 return result; 1407 return result;
1517 1408
1518 nvmeq = raw_nvmeq(dev, 0); 1409 nvmeq = dev->queues[0];
1519 if (!nvmeq) { 1410 if (!nvmeq) {
1520 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1411 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
1521 if (!nvmeq) 1412 if (!nvmeq)
1522 return -ENOMEM; 1413 return -ENOMEM;
1523 } 1414 }
@@ -1538,13 +1429,23 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1538 1429
1539 result = nvme_enable_ctrl(dev, cap); 1430 result = nvme_enable_ctrl(dev, cap);
1540 if (result) 1431 if (result)
1541 return result; 1432 goto free_nvmeq;
1433
1434 result = nvme_alloc_admin_tags(dev);
1435 if (result)
1436 goto free_nvmeq;
1542 1437
1543 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1438 result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1544 if (result) 1439 if (result)
1545 return result; 1440 goto free_tags;
1546 1441
1547 return result; 1442 return result;
1443
1444 free_tags:
1445 nvme_free_admin_tags(dev);
1446 free_nvmeq:
1447 nvme_free_queues(dev, 0);
1448 return result;
1548} 1449}
1549 1450
1550struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 1451struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
@@ -1702,7 +1603,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1702 if (length != (io.nblocks + 1) << ns->lba_shift) 1603 if (length != (io.nblocks + 1) << ns->lba_shift)
1703 status = -ENOMEM; 1604 status = -ENOMEM;
1704 else 1605 else
1705 status = nvme_submit_io_cmd(dev, &c, NULL); 1606 status = nvme_submit_io_cmd(dev, ns, &c, NULL);
1706 1607
1707 if (meta_len) { 1608 if (meta_len) {
1708 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { 1609 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1734,8 +1635,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1734 return status; 1635 return status;
1735} 1636}
1736 1637
1737static int nvme_user_cmd(struct nvme_dev *dev, 1638static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1738 struct nvme_passthru_cmd __user *ucmd, bool ioq) 1639 struct nvme_passthru_cmd __user *ucmd)
1739{ 1640{
1740 struct nvme_passthru_cmd cmd; 1641 struct nvme_passthru_cmd cmd;
1741 struct nvme_command c; 1642 struct nvme_command c;
@@ -1774,13 +1675,23 @@ static int nvme_user_cmd(struct nvme_dev *dev,
1774 1675
1775 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : 1676 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
1776 ADMIN_TIMEOUT; 1677 ADMIN_TIMEOUT;
1678
1777 if (length != cmd.data_len) 1679 if (length != cmd.data_len)
1778 status = -ENOMEM; 1680 status = -ENOMEM;
1779 else if (ioq) 1681 else if (ns) {
1780 status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, 1682 struct request *req;
1781 &cmd.result, timeout); 1683
1782 else 1684 req = blk_mq_alloc_request(ns->queue, WRITE,
1783 status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); 1685 (GFP_KERNEL|__GFP_WAIT), false);
1686 if (!req)
1687 status = -ENOMEM;
1688 else {
1689 status = nvme_submit_sync_cmd(req, &c, &cmd.result,
1690 timeout);
1691 blk_put_request(req);
1692 }
1693 } else
1694 status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
1784 1695
1785 if (cmd.data_len) { 1696 if (cmd.data_len) {
1786 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1697 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
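Passthrough I/O commands from user space now travel the same request-based path: a request is taken from the namespace queue purely to obtain a tag and its per-command context, the command is issued synchronously against it, and the request is put back. A rough sketch of that flow, with a hypothetical my_submit_sync() standing in for the driver's own synchronous submission helper:

	#include <linux/blk-mq.h>
	#include <linux/blkdev.h>
	#include <linux/err.h>

	/* Hypothetical stand-in for the driver's synchronous submission helper. */
	static int my_submit_sync(struct request *req, void *cmd, u32 *result,
				  unsigned int timeout)
	{
		/* a real driver would issue *cmd on req->tag and wait for completion */
		return 0;
	}

	static int my_user_io_cmd(struct request_queue *q, void *cmd, u32 *result,
				  unsigned int timeout)
	{
		struct request *req;
		int status;

		/* borrow a tag (and its per-command pdu) from the I/O queue */
		req = blk_mq_alloc_request(q, WRITE, GFP_KERNEL | __GFP_WAIT, false);
		if (IS_ERR_OR_NULL(req))	/* failure convention differs across kernel versions */
			return -ENOMEM;

		status = my_submit_sync(req, cmd, result, timeout);

		/* hand the tag back once the command has completed */
		blk_put_request(req);
		return status;
	}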
@@ -1804,9 +1715,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1804 force_successful_syscall_return(); 1715 force_successful_syscall_return();
1805 return ns->ns_id; 1716 return ns->ns_id;
1806 case NVME_IOCTL_ADMIN_CMD: 1717 case NVME_IOCTL_ADMIN_CMD:
1807 return nvme_user_cmd(ns->dev, (void __user *)arg, false); 1718 return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
1808 case NVME_IOCTL_IO_CMD: 1719 case NVME_IOCTL_IO_CMD:
1809 return nvme_user_cmd(ns->dev, (void __user *)arg, true); 1720 return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
1810 case NVME_IOCTL_SUBMIT_IO: 1721 case NVME_IOCTL_SUBMIT_IO:
1811 return nvme_submit_io(ns, (void __user *)arg); 1722 return nvme_submit_io(ns, (void __user *)arg);
1812 case SG_GET_VERSION_NUM: 1723 case SG_GET_VERSION_NUM:
@@ -1906,62 +1817,6 @@ static const struct block_device_operations nvme_fops = {
1906 .revalidate_disk= nvme_revalidate_disk, 1817 .revalidate_disk= nvme_revalidate_disk,
1907}; 1818};
1908 1819
1909static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
1910{
1911 struct nvme_iod *iod, *next;
1912
1913 list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
1914 if (unlikely(nvme_submit_iod(nvmeq, iod)))
1915 break;
1916 list_del(&iod->node);
1917 if (bio_list_empty(&nvmeq->sq_cong) &&
1918 list_empty(&nvmeq->iod_bio))
1919 remove_wait_queue(&nvmeq->sq_full,
1920 &nvmeq->sq_cong_wait);
1921 }
1922}
1923
1924static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1925{
1926 while (bio_list_peek(&nvmeq->sq_cong)) {
1927 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1928 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
1929
1930 if (bio_list_empty(&nvmeq->sq_cong) &&
1931 list_empty(&nvmeq->iod_bio))
1932 remove_wait_queue(&nvmeq->sq_full,
1933 &nvmeq->sq_cong_wait);
1934 if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
1935 if (!waitqueue_active(&nvmeq->sq_full))
1936 add_wait_queue(&nvmeq->sq_full,
1937 &nvmeq->sq_cong_wait);
1938 bio_list_add_head(&nvmeq->sq_cong, bio);
1939 break;
1940 }
1941 }
1942}
1943
1944static int nvme_submit_async_req(struct nvme_queue *nvmeq)
1945{
1946 struct nvme_command *c;
1947 int cmdid;
1948
1949 cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0);
1950 if (cmdid < 0)
1951 return cmdid;
1952
1953 c = &nvmeq->sq_cmds[nvmeq->sq_tail];
1954 memset(c, 0, sizeof(*c));
1955 c->common.opcode = nvme_admin_async_event;
1956 c->common.command_id = cmdid;
1957
1958 if (++nvmeq->sq_tail == nvmeq->q_depth)
1959 nvmeq->sq_tail = 0;
1960 writel(nvmeq->sq_tail, nvmeq->q_db);
1961
1962 return 0;
1963}
1964
1965static int nvme_kthread(void *data) 1820static int nvme_kthread(void *data)
1966{ 1821{
1967 struct nvme_dev *dev, *next; 1822 struct nvme_dev *dev, *next;
@@ -1977,34 +1832,26 @@ static int nvme_kthread(void *data)
1977 continue; 1832 continue;
1978 list_del_init(&dev->node); 1833 list_del_init(&dev->node);
1979 dev_warn(&dev->pci_dev->dev, 1834 dev_warn(&dev->pci_dev->dev,
1980 "Failed status, reset controller\n"); 1835 "Failed status: %x, reset controller\n",
1836 readl(&dev->bar->csts));
1981 dev->reset_workfn = nvme_reset_failed_dev; 1837 dev->reset_workfn = nvme_reset_failed_dev;
1982 queue_work(nvme_workq, &dev->reset_work); 1838 queue_work(nvme_workq, &dev->reset_work);
1983 continue; 1839 continue;
1984 } 1840 }
1985 rcu_read_lock();
1986 for (i = 0; i < dev->queue_count; i++) { 1841 for (i = 0; i < dev->queue_count; i++) {
1987 struct nvme_queue *nvmeq = 1842 struct nvme_queue *nvmeq = dev->queues[i];
1988 rcu_dereference(dev->queues[i]);
1989 if (!nvmeq) 1843 if (!nvmeq)
1990 continue; 1844 continue;
1991 spin_lock_irq(&nvmeq->q_lock); 1845 spin_lock_irq(&nvmeq->q_lock);
1992 if (nvmeq->q_suspended)
1993 goto unlock;
1994 nvme_process_cq(nvmeq); 1846 nvme_process_cq(nvmeq);
1995 nvme_cancel_ios(nvmeq, true);
1996 nvme_resubmit_bios(nvmeq);
1997 nvme_resubmit_iods(nvmeq);
1998 1847
1999 while ((i == 0) && (dev->event_limit > 0)) { 1848 while ((i == 0) && (dev->event_limit > 0)) {
2000 if (nvme_submit_async_req(nvmeq)) 1849 if (nvme_submit_async_admin_req(dev))
2001 break; 1850 break;
2002 dev->event_limit--; 1851 dev->event_limit--;
2003 } 1852 }
2004 unlock:
2005 spin_unlock_irq(&nvmeq->q_lock); 1853 spin_unlock_irq(&nvmeq->q_lock);
2006 } 1854 }
2007 rcu_read_unlock();
2008 } 1855 }
2009 spin_unlock(&dev_list_lock); 1856 spin_unlock(&dev_list_lock);
2010 schedule_timeout(round_jiffies_relative(HZ)); 1857 schedule_timeout(round_jiffies_relative(HZ));
@@ -2027,29 +1874,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
2027{ 1874{
2028 struct nvme_ns *ns; 1875 struct nvme_ns *ns;
2029 struct gendisk *disk; 1876 struct gendisk *disk;
1877 int node = dev_to_node(&dev->pci_dev->dev);
2030 int lbaf; 1878 int lbaf;
2031 1879
2032 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1880 if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
2033 return NULL; 1881 return NULL;
2034 1882
2035 ns = kzalloc(sizeof(*ns), GFP_KERNEL); 1883 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2036 if (!ns) 1884 if (!ns)
2037 return NULL; 1885 return NULL;
2038 ns->queue = blk_alloc_queue(GFP_KERNEL); 1886 ns->queue = blk_mq_init_queue(&dev->tagset);
2039 if (!ns->queue) 1887 if (!ns->queue)
2040 goto out_free_ns; 1888 goto out_free_ns;
2041 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
2042 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue);
2043 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1889 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
2044 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1890 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2045 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); 1891 queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
2046 blk_queue_make_request(ns->queue, nvme_make_request); 1892 queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue);
2047 ns->dev = dev; 1893 ns->dev = dev;
2048 ns->queue->queuedata = ns; 1894 ns->queue->queuedata = ns;
2049 1895
2050 disk = alloc_disk(0); 1896 disk = alloc_disk_node(0, node);
2051 if (!disk) 1897 if (!disk)
2052 goto out_free_queue; 1898 goto out_free_queue;
1899
2053 ns->ns_id = nsid; 1900 ns->ns_id = nsid;
2054 ns->disk = disk; 1901 ns->disk = disk;
2055 lbaf = id->flbas & 0xf; 1902 lbaf = id->flbas & 0xf;
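Namespace block devices are no longer bio-based: each one gets its request_queue by instancing the device-wide tag set, and the queue flags now describe the hardware (non-rotational, no gaps between SG elements) rather than driver plumbing. A rough sketch of that shape, assuming a tag set that has already been allocated as in the nvme_dev_add() hunk further down:

	#include <linux/blk-mq.h>
	#include <linux/blkdev.h>
	#include <linux/err.h>

	/*
	 * Assumes 'set' was prepared with blk_mq_alloc_tag_set(); every queue
	 * created from it shares the same hardware queues and tag space.
	 */
	static struct request_queue *my_alloc_ns_queue(struct blk_mq_tag_set *set)
	{
		struct request_queue *q = blk_mq_init_queue(set);

		if (IS_ERR_OR_NULL(q))	/* failure convention differs across kernel versions */
			return NULL;

		queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, q);	/* skip request merging */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);		/* flash, not rotating media */
		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);		/* device cannot take gaps in SG lists */
		return q;
	}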
@@ -2058,6 +1905,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
2058 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1905 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2059 if (dev->max_hw_sectors) 1906 if (dev->max_hw_sectors)
2060 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1907 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
1908 if (dev->stripe_size)
1909 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
2061 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 1910 if (dev->vwc & NVME_CTRL_VWC_PRESENT)
2062 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 1911 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
2063 1912
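blk_queue_chunk_sectors() expects the boundary in 512-byte sectors, hence the >> 9 on the byte-sized stripe; the block layer then splits any request that would straddle a stripe. A tiny sketch with an assumed 128 KiB stripe (the value must be a power of two):

	#include <linux/blkdev.h>

	/* Hypothetical helper: keep requests inside the device's stripe. */
	static void my_set_stripe_boundary(struct request_queue *q)
	{
		unsigned int stripe_bytes = 128 * 1024;	/* assumed stripe size */

		/* 131072 bytes >> 9 == 256 sectors of 512 bytes each */
		blk_queue_chunk_sectors(q, stripe_bytes >> 9);
	}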
@@ -2083,143 +1932,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
2083 return NULL; 1932 return NULL;
2084} 1933}
2085 1934
2086static int nvme_find_closest_node(int node)
2087{
2088 int n, val, min_val = INT_MAX, best_node = node;
2089
2090 for_each_online_node(n) {
2091 if (n == node)
2092 continue;
2093 val = node_distance(node, n);
2094 if (val < min_val) {
2095 min_val = val;
2096 best_node = n;
2097 }
2098 }
2099 return best_node;
2100}
2101
2102static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
2103 int count)
2104{
2105 int cpu;
2106 for_each_cpu(cpu, qmask) {
2107 if (cpumask_weight(nvmeq->cpu_mask) >= count)
2108 break;
2109 if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
2110 *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
2111 }
2112}
2113
2114static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
2115 const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
2116{
2117 int next_cpu;
2118 for_each_cpu(next_cpu, new_mask) {
2119 cpumask_or(mask, mask, get_cpu_mask(next_cpu));
2120 cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
2121 cpumask_and(mask, mask, unassigned_cpus);
2122 nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
2123 }
2124}
2125
2126static void nvme_create_io_queues(struct nvme_dev *dev) 1935static void nvme_create_io_queues(struct nvme_dev *dev)
2127{ 1936{
2128 unsigned i, max; 1937 unsigned i;
2129 1938
2130 max = min(dev->max_qid, num_online_cpus()); 1939 for (i = dev->queue_count; i <= dev->max_qid; i++)
2131 for (i = dev->queue_count; i <= max; i++)
2132 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) 1940 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
2133 break; 1941 break;
2134 1942
2135 max = min(dev->queue_count - 1, num_online_cpus()); 1943 for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
2136 for (i = dev->online_queues; i <= max; i++) 1944 if (nvme_create_queue(dev->queues[i], i))
2137 if (nvme_create_queue(raw_nvmeq(dev, i), i))
2138 break; 1945 break;
2139} 1946}
2140 1947
2141/*
2142 * If there are fewer queues than online cpus, this will try to optimally
2143 * assign a queue to multiple cpus by grouping cpus that are "close" together:
2144 * thread siblings, core, socket, closest node, then whatever else is
2145 * available.
2146 */
2147static void nvme_assign_io_queues(struct nvme_dev *dev)
2148{
2149 unsigned cpu, cpus_per_queue, queues, remainder, i;
2150 cpumask_var_t unassigned_cpus;
2151
2152 nvme_create_io_queues(dev);
2153
2154 queues = min(dev->online_queues - 1, num_online_cpus());
2155 if (!queues)
2156 return;
2157
2158 cpus_per_queue = num_online_cpus() / queues;
2159 remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
2160
2161 if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
2162 return;
2163
2164 cpumask_copy(unassigned_cpus, cpu_online_mask);
2165 cpu = cpumask_first(unassigned_cpus);
2166 for (i = 1; i <= queues; i++) {
2167 struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
2168 cpumask_t mask;
2169
2170 cpumask_clear(nvmeq->cpu_mask);
2171 if (!cpumask_weight(unassigned_cpus)) {
2172 unlock_nvmeq(nvmeq);
2173 break;
2174 }
2175
2176 mask = *get_cpu_mask(cpu);
2177 nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
2178 if (cpus_weight(mask) < cpus_per_queue)
2179 nvme_add_cpus(&mask, unassigned_cpus,
2180 topology_thread_cpumask(cpu),
2181 nvmeq, cpus_per_queue);
2182 if (cpus_weight(mask) < cpus_per_queue)
2183 nvme_add_cpus(&mask, unassigned_cpus,
2184 topology_core_cpumask(cpu),
2185 nvmeq, cpus_per_queue);
2186 if (cpus_weight(mask) < cpus_per_queue)
2187 nvme_add_cpus(&mask, unassigned_cpus,
2188 cpumask_of_node(cpu_to_node(cpu)),
2189 nvmeq, cpus_per_queue);
2190 if (cpus_weight(mask) < cpus_per_queue)
2191 nvme_add_cpus(&mask, unassigned_cpus,
2192 cpumask_of_node(
2193 nvme_find_closest_node(
2194 cpu_to_node(cpu))),
2195 nvmeq, cpus_per_queue);
2196 if (cpus_weight(mask) < cpus_per_queue)
2197 nvme_add_cpus(&mask, unassigned_cpus,
2198 unassigned_cpus,
2199 nvmeq, cpus_per_queue);
2200
2201 WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
2202 "nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
2203 dev->instance, i);
2204
2205 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2206 nvmeq->cpu_mask);
2207 cpumask_andnot(unassigned_cpus, unassigned_cpus,
2208 nvmeq->cpu_mask);
2209 cpu = cpumask_next(cpu, unassigned_cpus);
2210 if (remainder && !--remainder)
2211 cpus_per_queue++;
2212 unlock_nvmeq(nvmeq);
2213 }
2214 WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
2215 dev->instance);
2216 i = 0;
2217 cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
2218 for_each_cpu(cpu, unassigned_cpus)
2219 *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
2220 free_cpumask_var(unassigned_cpus);
2221}
2222
2223static int set_queue_count(struct nvme_dev *dev, int count) 1948static int set_queue_count(struct nvme_dev *dev, int count)
2224{ 1949{
2225 int status; 1950 int status;
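All of the removed nvme_assign_io_queues()/closest-node machinery existed to spread CPUs across hardware queues by hand; with .map_queue pointed at blk_mq_map_queue, blk-mq now owns that mapping. Purely as an illustration of the idea, and not the actual blk-mq algorithm (which also keeps thread siblings on the same queue), an even spread looks like this:

	/*
	 * Illustration only: assign contiguous blocks of CPUs to hardware
	 * queues. The real blk-mq default map is built inside the block layer
	 * and additionally groups hyperthread siblings together.
	 */
	static unsigned int my_cpu_to_hw_queue(unsigned int cpu, unsigned int nr_cpus,
					       unsigned int nr_hw_queues)
	{
		return (cpu * nr_hw_queues) / nr_cpus;
	}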
@@ -2243,33 +1968,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
2243 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 1968 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
2244} 1969}
2245 1970
2246static void nvme_cpu_workfn(struct work_struct *work)
2247{
2248 struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);
2249 if (dev->initialized)
2250 nvme_assign_io_queues(dev);
2251}
2252
2253static int nvme_cpu_notify(struct notifier_block *self,
2254 unsigned long action, void *hcpu)
2255{
2256 struct nvme_dev *dev;
2257
2258 switch (action) {
2259 case CPU_ONLINE:
2260 case CPU_DEAD:
2261 spin_lock(&dev_list_lock);
2262 list_for_each_entry(dev, &dev_list, node)
2263 schedule_work(&dev->cpu_work);
2264 spin_unlock(&dev_list_lock);
2265 break;
2266 }
2267 return NOTIFY_OK;
2268}
2269
2270static int nvme_setup_io_queues(struct nvme_dev *dev) 1971static int nvme_setup_io_queues(struct nvme_dev *dev)
2271{ 1972{
2272 struct nvme_queue *adminq = raw_nvmeq(dev, 0); 1973 struct nvme_queue *adminq = dev->queues[0];
2273 struct pci_dev *pdev = dev->pci_dev; 1974 struct pci_dev *pdev = dev->pci_dev;
2274 int result, i, vecs, nr_io_queues, size; 1975 int result, i, vecs, nr_io_queues, size;
2275 1976
@@ -2321,14 +2022,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2321 dev->max_qid = nr_io_queues; 2022 dev->max_qid = nr_io_queues;
2322 2023
2323 result = queue_request_irq(dev, adminq, adminq->irqname); 2024 result = queue_request_irq(dev, adminq, adminq->irqname);
2324 if (result) { 2025 if (result)
2325 adminq->q_suspended = 1;
2326 goto free_queues; 2026 goto free_queues;
2327 }
2328 2027
2329 /* Free previously allocated queues that are no longer usable */ 2028 /* Free previously allocated queues that are no longer usable */
2330 nvme_free_queues(dev, nr_io_queues + 1); 2029 nvme_free_queues(dev, nr_io_queues + 1);
2331 nvme_assign_io_queues(dev); 2030 nvme_create_io_queues(dev);
2332 2031
2333 return 0; 2032 return 0;
2334 2033
@@ -2378,8 +2077,30 @@ static int nvme_dev_add(struct nvme_dev *dev)
2378 if (ctrl->mdts) 2077 if (ctrl->mdts)
2379 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2078 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
2380 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2079 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
2381 (pdev->device == 0x0953) && ctrl->vs[3]) 2080 (pdev->device == 0x0953) && ctrl->vs[3]) {
2081 unsigned int max_hw_sectors;
2082
2382 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2083 dev->stripe_size = 1 << (ctrl->vs[3] + shift);
2084 max_hw_sectors = dev->stripe_size >> (shift - 9);
2085 if (dev->max_hw_sectors) {
2086 dev->max_hw_sectors = min(max_hw_sectors,
2087 dev->max_hw_sectors);
2088 } else
2089 dev->max_hw_sectors = max_hw_sectors;
2090 }
2091
2092 dev->tagset.ops = &nvme_mq_ops;
2093 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2094 dev->tagset.timeout = NVME_IO_TIMEOUT;
2095 dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
2096 dev->tagset.queue_depth =
2097 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
2098 dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
2099 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2100 dev->tagset.driver_data = dev;
2101
2102 if (blk_mq_alloc_tag_set(&dev->tagset))
2103 goto out;
2383 2104
2384 id_ns = mem; 2105 id_ns = mem;
2385 for (i = 1; i <= nn; i++) { 2106 for (i = 1; i <= nn; i++) {
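cmd_size is what makes blk-mq co-allocate the driver's per-command context directly behind each struct request, replacing the old cmdid bookkeeping; the driver reaches it through blk_mq_rq_to_pdu() and uses req->tag as the command identifier. A small sketch with a hypothetical context type:

	#include <linux/blk-mq.h>

	/* Hypothetical per-command context; sizeof() of this goes into cmd_size. */
	struct my_cmd_info {
		u16 status;
	};

	static void my_prep_request(struct request *req)
	{
		/* blk-mq placed the pdu immediately after the request it belongs to */
		struct my_cmd_info *info = blk_mq_rq_to_pdu(req);

		info->status = 0;
		/* ... build the hardware command, using req->tag as its identifier ... */
	}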
@@ -2529,7 +2250,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2529 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2250 c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2530 2251
2531 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2252 init_kthread_work(&nvmeq->cmdinfo.work, fn);
2532 return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); 2253 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
2254 ADMIN_TIMEOUT);
2533} 2255}
2534 2256
2535static void nvme_del_cq_work_handler(struct kthread_work *work) 2257static void nvme_del_cq_work_handler(struct kthread_work *work)
@@ -2592,7 +2314,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
2592 atomic_set(&dq.refcount, 0); 2314 atomic_set(&dq.refcount, 0);
2593 dq.worker = &worker; 2315 dq.worker = &worker;
2594 for (i = dev->queue_count - 1; i > 0; i--) { 2316 for (i = dev->queue_count - 1; i > 0; i--) {
2595 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2317 struct nvme_queue *nvmeq = dev->queues[i];
2596 2318
2597 if (nvme_suspend_queue(nvmeq)) 2319 if (nvme_suspend_queue(nvmeq))
2598 continue; 2320 continue;
@@ -2637,7 +2359,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
2637 csts = readl(&dev->bar->csts); 2359 csts = readl(&dev->bar->csts);
2638 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 2360 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
2639 for (i = dev->queue_count - 1; i >= 0; i--) { 2361 for (i = dev->queue_count - 1; i >= 0; i--) {
2640 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2362 struct nvme_queue *nvmeq = dev->queues[i];
2641 nvme_suspend_queue(nvmeq); 2363 nvme_suspend_queue(nvmeq);
2642 nvme_clear_queue(nvmeq); 2364 nvme_clear_queue(nvmeq);
2643 } 2365 }
@@ -2649,6 +2371,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
2649 nvme_dev_unmap(dev); 2371 nvme_dev_unmap(dev);
2650} 2372}
2651 2373
2374static void nvme_dev_remove_admin(struct nvme_dev *dev)
2375{
2376 if (dev->admin_q && !blk_queue_dying(dev->admin_q))
2377 blk_cleanup_queue(dev->admin_q);
2378}
2379
2652static void nvme_dev_remove(struct nvme_dev *dev) 2380static void nvme_dev_remove(struct nvme_dev *dev)
2653{ 2381{
2654 struct nvme_ns *ns; 2382 struct nvme_ns *ns;
@@ -2736,7 +2464,7 @@ static void nvme_free_dev(struct kref *kref)
2736 2464
2737 pci_dev_put(dev->pci_dev); 2465 pci_dev_put(dev->pci_dev);
2738 nvme_free_namespaces(dev); 2466 nvme_free_namespaces(dev);
2739 free_percpu(dev->io_queue); 2467 blk_mq_free_tag_set(&dev->tagset);
2740 kfree(dev->queues); 2468 kfree(dev->queues);
2741 kfree(dev->entry); 2469 kfree(dev->entry);
2742 kfree(dev); 2470 kfree(dev);
@@ -2761,11 +2489,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f)
2761static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2489static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
2762{ 2490{
2763 struct nvme_dev *dev = f->private_data; 2491 struct nvme_dev *dev = f->private_data;
2492 struct nvme_ns *ns;
2493
2764 switch (cmd) { 2494 switch (cmd) {
2765 case NVME_IOCTL_ADMIN_CMD: 2495 case NVME_IOCTL_ADMIN_CMD:
2766 return nvme_user_cmd(dev, (void __user *)arg, false); 2496 return nvme_user_cmd(dev, NULL, (void __user *)arg);
2767 case NVME_IOCTL_IO_CMD: 2497 case NVME_IOCTL_IO_CMD:
2768 return nvme_user_cmd(dev, (void __user *)arg, true); 2498 if (list_empty(&dev->namespaces))
2499 return -ENOTTY;
2500 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
2501 return nvme_user_cmd(dev, ns, (void __user *)arg);
2769 default: 2502 default:
2770 return -ENOTTY; 2503 return -ENOTTY;
2771 } 2504 }
@@ -2779,6 +2512,22 @@ static const struct file_operations nvme_dev_fops = {
2779 .compat_ioctl = nvme_dev_ioctl, 2512 .compat_ioctl = nvme_dev_ioctl,
2780}; 2513};
2781 2514
2515static void nvme_set_irq_hints(struct nvme_dev *dev)
2516{
2517 struct nvme_queue *nvmeq;
2518 int i;
2519
2520 for (i = 0; i < dev->online_queues; i++) {
2521 nvmeq = dev->queues[i];
2522
2523 if (!nvmeq->hctx)
2524 continue;
2525
2526 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2527 nvmeq->hctx->cpumask);
2528 }
2529}
2530
2782static int nvme_dev_start(struct nvme_dev *dev) 2531static int nvme_dev_start(struct nvme_dev *dev)
2783{ 2532{
2784 int result; 2533 int result;
@@ -2810,12 +2559,15 @@ static int nvme_dev_start(struct nvme_dev *dev)
2810 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; 2559 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
2811 goto disable; 2560 goto disable;
2812 } 2561 }
2813 nvme_init_queue(raw_nvmeq(dev, 0), 0); 2562
2563 nvme_init_queue(dev->queues[0], 0);
2814 2564
2815 result = nvme_setup_io_queues(dev); 2565 result = nvme_setup_io_queues(dev);
2816 if (result) 2566 if (result)
2817 goto disable; 2567 goto disable;
2818 2568
2569 nvme_set_irq_hints(dev);
2570
2819 return result; 2571 return result;
2820 2572
2821 disable: 2573 disable:
@@ -2866,7 +2618,7 @@ static void nvme_dev_reset(struct nvme_dev *dev)
2866{ 2618{
2867 nvme_dev_shutdown(dev); 2619 nvme_dev_shutdown(dev);
2868 if (nvme_dev_resume(dev)) { 2620 if (nvme_dev_resume(dev)) {
2869 dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); 2621 dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
2870 kref_get(&dev->kref); 2622 kref_get(&dev->kref);
2871 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 2623 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
2872 dev->instance))) { 2624 dev->instance))) {
@@ -2891,28 +2643,28 @@ static void nvme_reset_workfn(struct work_struct *work)
2891 2643
2892static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2644static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2893{ 2645{
2894 int result = -ENOMEM; 2646 int node, result = -ENOMEM;
2895 struct nvme_dev *dev; 2647 struct nvme_dev *dev;
2896 2648
2897 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2649 node = dev_to_node(&pdev->dev);
2650 if (node == NUMA_NO_NODE)
2651 set_dev_node(&pdev->dev, 0);
2652
2653 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
2898 if (!dev) 2654 if (!dev)
2899 return -ENOMEM; 2655 return -ENOMEM;
2900 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 2656 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
2901 GFP_KERNEL); 2657 GFP_KERNEL, node);
2902 if (!dev->entry) 2658 if (!dev->entry)
2903 goto free; 2659 goto free;
2904 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 2660 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
2905 GFP_KERNEL); 2661 GFP_KERNEL, node);
2906 if (!dev->queues) 2662 if (!dev->queues)
2907 goto free; 2663 goto free;
2908 dev->io_queue = alloc_percpu(unsigned short);
2909 if (!dev->io_queue)
2910 goto free;
2911 2664
2912 INIT_LIST_HEAD(&dev->namespaces); 2665 INIT_LIST_HEAD(&dev->namespaces);
2913 dev->reset_workfn = nvme_reset_failed_dev; 2666 dev->reset_workfn = nvme_reset_failed_dev;
2914 INIT_WORK(&dev->reset_work, nvme_reset_workfn); 2667 INIT_WORK(&dev->reset_work, nvme_reset_workfn);
2915 INIT_WORK(&dev->cpu_work, nvme_cpu_workfn);
2916 dev->pci_dev = pci_dev_get(pdev); 2668 dev->pci_dev = pci_dev_get(pdev);
2917 pci_set_drvdata(pdev, dev); 2669 pci_set_drvdata(pdev, dev);
2918 result = nvme_set_instance(dev); 2670 result = nvme_set_instance(dev);
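probe() now pins its allocations to the controller's NUMA node, falling back to node 0 when the platform reports none, so the driver's bookkeeping stays local to the device. A sketch of that pattern with a hypothetical state structure:

	#include <linux/pci.h>
	#include <linux/slab.h>

	/* Hypothetical driver state allocated at probe time. */
	struct my_dev_state {
		int instance;
	};

	static struct my_dev_state *my_alloc_state(struct pci_dev *pdev)
	{
		int node = dev_to_node(&pdev->dev);

		/* some platforms report no node for the device; pin it to node 0 */
		if (node == NUMA_NO_NODE)
			set_dev_node(&pdev->dev, 0);

		/* keep the state on (or near) the controller's memory node */
		return kzalloc_node(sizeof(struct my_dev_state), GFP_KERNEL, node);
	}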
@@ -2942,11 +2694,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2942 if (result) 2694 if (result)
2943 goto remove; 2695 goto remove;
2944 2696
2697 nvme_set_irq_hints(dev);
2698
2945 dev->initialized = 1; 2699 dev->initialized = 1;
2946 return 0; 2700 return 0;
2947 2701
2948 remove: 2702 remove:
2949 nvme_dev_remove(dev); 2703 nvme_dev_remove(dev);
2704 nvme_dev_remove_admin(dev);
2950 nvme_free_namespaces(dev); 2705 nvme_free_namespaces(dev);
2951 shutdown: 2706 shutdown:
2952 nvme_dev_shutdown(dev); 2707 nvme_dev_shutdown(dev);
@@ -2958,7 +2713,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2958 put_pci: 2713 put_pci:
2959 pci_dev_put(dev->pci_dev); 2714 pci_dev_put(dev->pci_dev);
2960 free: 2715 free:
2961 free_percpu(dev->io_queue);
2962 kfree(dev->queues); 2716 kfree(dev->queues);
2963 kfree(dev->entry); 2717 kfree(dev->entry);
2964 kfree(dev); 2718 kfree(dev);
@@ -2991,11 +2745,12 @@ static void nvme_remove(struct pci_dev *pdev)
2991 2745
2992 pci_set_drvdata(pdev, NULL); 2746 pci_set_drvdata(pdev, NULL);
2993 flush_work(&dev->reset_work); 2747 flush_work(&dev->reset_work);
2994 flush_work(&dev->cpu_work);
2995 misc_deregister(&dev->miscdev); 2748 misc_deregister(&dev->miscdev);
2749 nvme_dev_remove(dev);
2996 nvme_dev_shutdown(dev); 2750 nvme_dev_shutdown(dev);
2751 nvme_dev_remove_admin(dev);
2997 nvme_free_queues(dev, 0); 2752 nvme_free_queues(dev, 0);
2998 nvme_dev_remove(dev); 2753 nvme_free_admin_tags(dev);
2999 nvme_release_instance(dev); 2754 nvme_release_instance(dev);
3000 nvme_release_prp_pools(dev); 2755 nvme_release_prp_pools(dev);
3001 kref_put(&dev->kref, nvme_free_dev); 2756 kref_put(&dev->kref, nvme_free_dev);
@@ -3079,18 +2834,11 @@ static int __init nvme_init(void)
3079 else if (result > 0) 2834 else if (result > 0)
3080 nvme_major = result; 2835 nvme_major = result;
3081 2836
3082 nvme_nb.notifier_call = &nvme_cpu_notify;
3083 result = register_hotcpu_notifier(&nvme_nb);
3084 if (result)
3085 goto unregister_blkdev;
3086
3087 result = pci_register_driver(&nvme_driver); 2837 result = pci_register_driver(&nvme_driver);
3088 if (result) 2838 if (result)
3089 goto unregister_hotcpu; 2839 goto unregister_blkdev;
3090 return 0; 2840 return 0;
3091 2841
3092 unregister_hotcpu:
3093 unregister_hotcpu_notifier(&nvme_nb);
3094 unregister_blkdev: 2842 unregister_blkdev:
3095 unregister_blkdev(nvme_major, "nvme"); 2843 unregister_blkdev(nvme_major, "nvme");
3096 kill_workq: 2844 kill_workq:
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 046ae3321c5e..49f86d1a5aa2 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2105 2105
2106 nvme_offset += unit_num_blocks; 2106 nvme_offset += unit_num_blocks;
2107 2107
2108 nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); 2108 nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
2109 if (nvme_sc != NVME_SC_SUCCESS) { 2109 if (nvme_sc != NVME_SC_SUCCESS) {
2110 nvme_unmap_user_pages(dev, 2110 nvme_unmap_user_pages(dev,
2111 (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, 2111 (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2658 c.common.opcode = nvme_cmd_flush; 2658 c.common.opcode = nvme_cmd_flush;
2659 c.common.nsid = cpu_to_le32(ns->ns_id); 2659 c.common.nsid = cpu_to_le32(ns->ns_id);
2660 2660
2661 nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); 2661 nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
2662 res = nvme_trans_status_code(hdr, nvme_sc); 2662 res = nvme_trans_status_code(hdr, nvme_sc);
2663 if (res) 2663 if (res)
2664 goto out; 2664 goto out;
@@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
2686 c.common.opcode = nvme_cmd_flush; 2686 c.common.opcode = nvme_cmd_flush;
2687 c.common.nsid = cpu_to_le32(ns->ns_id); 2687 c.common.nsid = cpu_to_le32(ns->ns_id);
2688 2688
2689 nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); 2689 nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
2690 2690
2691 res = nvme_trans_status_code(hdr, nvme_sc); 2691 res = nvme_trans_status_code(hdr, nvme_sc);
2692 if (res) 2692 if (res)
@@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2894 c.dsm.nr = cpu_to_le32(ndesc - 1); 2894 c.dsm.nr = cpu_to_le32(ndesc - 1);
2895 c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 2895 c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
2896 2896
2897 nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); 2897 nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
2898 res = nvme_trans_status_code(hdr, nvme_sc); 2898 res = nvme_trans_status_code(hdr, nvme_sc);
2899 2899
2900 dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), 2900 dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index ed09074e5554..258945fcabf1 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
19#include <linux/pci.h> 19#include <linux/pci.h>
20#include <linux/miscdevice.h> 20#include <linux/miscdevice.h>
21#include <linux/kref.h> 21#include <linux/kref.h>
22#include <linux/blk-mq.h>
22 23
23struct nvme_bar { 24struct nvme_bar {
24 __u64 cap; /* Controller Capabilities */ 25 __u64 cap; /* Controller Capabilities */
@@ -71,8 +72,10 @@ extern unsigned char nvme_io_timeout;
71 */ 72 */
72struct nvme_dev { 73struct nvme_dev {
73 struct list_head node; 74 struct list_head node;
74 struct nvme_queue __rcu **queues; 75 struct nvme_queue **queues;
75 unsigned short __percpu *io_queue; 76 struct request_queue *admin_q;
77 struct blk_mq_tag_set tagset;
78 struct blk_mq_tag_set admin_tagset;
76 u32 __iomem *dbs; 79 u32 __iomem *dbs;
77 struct pci_dev *pci_dev; 80 struct pci_dev *pci_dev;
78 struct dma_pool *prp_page_pool; 81 struct dma_pool *prp_page_pool;
@@ -91,7 +94,6 @@ struct nvme_dev {
91 struct miscdevice miscdev; 94 struct miscdevice miscdev;
92 work_func_t reset_workfn; 95 work_func_t reset_workfn;
93 struct work_struct reset_work; 96 struct work_struct reset_work;
94 struct work_struct cpu_work;
95 char name[12]; 97 char name[12];
96 char serial[20]; 98 char serial[20];
97 char model[40]; 99 char model[40];
@@ -135,7 +137,6 @@ struct nvme_iod {
135 int offset; /* Of PRP list */ 137 int offset; /* Of PRP list */
136 int nents; /* Used in scatterlist */ 138 int nents; /* Used in scatterlist */
137 int length; /* Of data, in bytes */ 139 int length; /* Of data, in bytes */
138 unsigned long start_time;
139 dma_addr_t first_dma; 140 dma_addr_t first_dma;
140 struct list_head node; 141 struct list_head node;
141 struct scatterlist sg[0]; 142 struct scatterlist sg[0];
@@ -153,12 +154,14 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
153 */ 154 */
154void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); 155void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
155 156
156int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t); 157int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
157struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 158struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
158 unsigned long addr, unsigned length); 159 unsigned long addr, unsigned length);
159void nvme_unmap_user_pages(struct nvme_dev *dev, int write, 160void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
160 struct nvme_iod *iod); 161 struct nvme_iod *iod);
161int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); 162int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *,
163 struct nvme_command *, u32 *);
164int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
162int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, 165int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
163 u32 *result); 166 u32 *result);
164int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, 167int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,