path: root/drivers/nvme
author    Linus Torvalds <torvalds@linux-foundation.org>  2016-01-21 22:58:02 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-01-21 22:58:02 -0500
commit    3e1e21c7bfcfa9bf06c07f48a13faca2f62b3339 (patch)
tree      b26e480594c8e978c48118e2e3d624d1386f51df /drivers/nvme
parent    0a13daedf7ffc71b0c374a036355da7fddb20d6d (diff)
parent    a9cf8284b45110a4d98aea180a89c857e53bf850 (diff)
Merge branch 'for-4.5/nvme' of git://git.kernel.dk/linux-block
Pull NVMe updates from Jens Axboe:
 "Last branch for this series is the nvme changes. It's in a separate
  branch to avoid splitting too much between core and NVMe changes,
  since NVMe is still helping drive some blk-mq changes. That said,
  not a huge amount of core changes in here.

  The grunt of the work is the continued split of the code"

* 'for-4.5/nvme' of git://git.kernel.dk/linux-block: (67 commits)
  uapi: update install list after nvme.h rename
  NVMe: Export NVMe attributes to sysfs group
  NVMe: Shutdown controller only for power-off
  NVMe: IO queue deletion re-write
  NVMe: Remove queue freezing on resets
  NVMe: Use a retryable error code on reset
  NVMe: Fix admin queue ring wrap
  nvme: make SG_IO support optional
  nvme: fixes for NVME_IOCTL_IO_CMD on the char device
  nvme: synchronize access to ctrl->namespaces
  nvme: Move nvme_freeze/unfreeze_queues to nvme core
  PCI/AER: include header file
  NVMe: Export namespace attributes to sysfs
  NVMe: Add pci error handlers
  block: remove REQ_NO_TIMEOUT flag
  nvme: merge iod and cmd_info
  nvme: meta_sg doesn't have to be an array
  nvme: properly free resources for cancelled command
  nvme: simplify completion handling
  nvme: special case AEN requests
  ...
Diffstat (limited to 'drivers/nvme')
-rw-r--r--  drivers/nvme/host/Kconfig    |   11
-rw-r--r--  drivers/nvme/host/Makefile   |    5
-rw-r--r--  drivers/nvme/host/core.c     | 1472
-rw-r--r--  drivers/nvme/host/lightnvm.c |   35
-rw-r--r--  drivers/nvme/host/nvme.h     |  242
-rw-r--r--  drivers/nvme/host/pci.c      | 2700
-rw-r--r--  drivers/nvme/host/scsi.c     |  212
7 files changed, 2527 insertions, 2150 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 002a94abdbc4..5d6237391dcd 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -8,3 +8,14 @@ config BLK_DEV_NVME
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called nvme.
+
+config BLK_DEV_NVME_SCSI
+	bool "SCSI emulation for NVMe device nodes"
+	depends on BLK_DEV_NVME
+	---help---
+	  This adds support for the SG_IO ioctl on the NVMe character
+	  and block device nodes, as well as a translation for a small
+	  number of selected SCSI commands to NVMe commands to the NVMe
+	  driver.  If you don't know what this means you probably want
+	  to say N here, and if you know what it means you probably
+	  want to say N as well.
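
For illustration (not part of the patch): with CONFIG_BLK_DEV_NVME_SCSI=y the block node accepts ordinary SG_IO requests and scsi.c translates a handful of SCSI commands to NVMe; with it disabled the same ioctl falls through to the driver's -ENOTTY default. A minimal user-space sketch, assuming a /dev/nvme0n1 node exists:

/*
 * Illustrative only: issue a SCSI INQUIRY through the generic SG_IO ioctl
 * and rely on the driver's SCSI-to-NVMe translation layer (scsi.c).
 * The device path is an assumption; error handling is minimal.
 */
#include <fcntl.h>
#include <scsi/sg.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };  /* INQUIRY, 96 bytes */
        unsigned char inq[96], sense[32];
        sg_io_hdr_t io;
        int fd = open("/dev/nvme0n1", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&io, 0, sizeof(io));
        io.interface_id = 'S';
        io.dxfer_direction = SG_DXFER_FROM_DEV;
        io.cmd_len = sizeof(cdb);
        io.cmdp = cdb;
        io.dxferp = inq;
        io.dxfer_len = sizeof(inq);
        io.sbp = sense;
        io.mx_sb_len = sizeof(sense);
        io.timeout = 5000;                      /* milliseconds */

        if (ioctl(fd, SG_IO, &io) == 0)
                printf("vendor/product: %.24s\n", (char *)&inq[8]);
        close(fd);
        return 0;
}

Building the kernel without the option makes the ioctl() call above fail, which is exactly the behaviour the help text describes.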
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a5fe23952586..51bf90871549 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,5 +1,6 @@
 
 obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
 
 lightnvm-$(CONFIG_NVM)	:= lightnvm.o
-nvme-y		+= pci.o scsi.o $(lightnvm-y)
+nvme-y		+= core.o pci.o $(lightnvm-y)
+nvme-$(CONFIG_BLK_DEV_NVME_SCSI)	+= scsi.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
new file mode 100644
index 000000000000..c5bf001af559
--- /dev/null
+++ b/drivers/nvme/host/core.c
@@ -0,0 +1,1472 @@
1/*
2 * NVM Express device driver
3 * Copyright (c) 2011-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/blkdev.h>
16#include <linux/blk-mq.h>
17#include <linux/delay.h>
18#include <linux/errno.h>
19#include <linux/hdreg.h>
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/list_sort.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/pr.h>
26#include <linux/ptrace.h>
27#include <linux/nvme_ioctl.h>
28#include <linux/t10-pi.h>
29#include <scsi/sg.h>
30#include <asm/unaligned.h>
31
32#include "nvme.h"
33
34#define NVME_MINORS (1U << MINORBITS)
35
36static int nvme_major;
37module_param(nvme_major, int, 0);
38
39static int nvme_char_major;
40module_param(nvme_char_major, int, 0);
41
42static LIST_HEAD(nvme_ctrl_list);
43DEFINE_SPINLOCK(dev_list_lock);
44
45static struct class *nvme_class;
46
47static void nvme_free_ns(struct kref *kref)
48{
49 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
50
51 if (ns->type == NVME_NS_LIGHTNVM)
52 nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
53
54 spin_lock(&dev_list_lock);
55 ns->disk->private_data = NULL;
56 spin_unlock(&dev_list_lock);
57
58 nvme_put_ctrl(ns->ctrl);
59 put_disk(ns->disk);
60 kfree(ns);
61}
62
63static void nvme_put_ns(struct nvme_ns *ns)
64{
65 kref_put(&ns->kref, nvme_free_ns);
66}
67
68static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
69{
70 struct nvme_ns *ns;
71
72 spin_lock(&dev_list_lock);
73 ns = disk->private_data;
74 if (ns && !kref_get_unless_zero(&ns->kref))
75 ns = NULL;
76 spin_unlock(&dev_list_lock);
77
78 return ns;
79}
80
81void nvme_requeue_req(struct request *req)
82{
83 unsigned long flags;
84
85 blk_mq_requeue_request(req);
86 spin_lock_irqsave(req->q->queue_lock, flags);
87 if (!blk_queue_stopped(req->q))
88 blk_mq_kick_requeue_list(req->q);
89 spin_unlock_irqrestore(req->q->queue_lock, flags);
90}
91
92struct request *nvme_alloc_request(struct request_queue *q,
93 struct nvme_command *cmd, unsigned int flags)
94{
95 bool write = cmd->common.opcode & 1;
96 struct request *req;
97
98 req = blk_mq_alloc_request(q, write, flags);
99 if (IS_ERR(req))
100 return req;
101
102 req->cmd_type = REQ_TYPE_DRV_PRIV;
103 req->cmd_flags |= REQ_FAILFAST_DRIVER;
104 req->__data_len = 0;
105 req->__sector = (sector_t) -1;
106 req->bio = req->biotail = NULL;
107
108 req->cmd = (unsigned char *)cmd;
109 req->cmd_len = sizeof(struct nvme_command);
110 req->special = (void *)0;
111
112 return req;
113}
114
115/*
116 * Returns 0 on success. If the result is negative, it's a Linux error code;
117 * if the result is positive, it's an NVM Express status code
118 */
119int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
120 void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
121{
122 struct request *req;
123 int ret;
124
125 req = nvme_alloc_request(q, cmd, 0);
126 if (IS_ERR(req))
127 return PTR_ERR(req);
128
129 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
130
131 if (buffer && bufflen) {
132 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
133 if (ret)
134 goto out;
135 }
136
137 blk_execute_rq(req->q, NULL, req, 0);
138 if (result)
139 *result = (u32)(uintptr_t)req->special;
140 ret = req->errors;
141 out:
142 blk_mq_free_request(req);
143 return ret;
144}
145
146int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
147 void *buffer, unsigned bufflen)
148{
149 return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
150}
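
The comment above __nvme_submit_sync_cmd() is the contract the rest of the core is written against: zero means success, a negative value is a Linux errno, a positive value is an NVM Express status code. A minimal caller honouring that contract might look like the sketch below (illustrative only; nvme_get_smart_page() is a hypothetical name, and the command setup simply mirrors nvme_get_log_page() later in this file):

/*
 * Hypothetical caller, for illustration: distinguish transport errors
 * (negative errno) from NVMe status codes (positive) returned by the
 * synchronous submission helpers above.
 */
static int nvme_get_smart_page(struct nvme_ctrl *ctrl, struct nvme_smart_log *log)
{
        struct nvme_command c = { };
        int ret;

        c.common.opcode = nvme_admin_get_log_page;
        c.common.nsid = cpu_to_le32(0xFFFFFFFF);
        c.common.cdw10[0] = cpu_to_le32(
                (((sizeof(*log) / 4) - 1) << 16) | NVME_LOG_SMART);

        ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log));
        if (ret < 0)
                return ret;     /* allocation/transport failure: Linux errno */
        if (ret > 0)
                return -EIO;    /* the controller returned an NVMe status code */
        return 0;
}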
151
152int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
153 void __user *ubuffer, unsigned bufflen,
154 void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
155 u32 *result, unsigned timeout)
156{
157 bool write = cmd->common.opcode & 1;
158 struct nvme_ns *ns = q->queuedata;
159 struct gendisk *disk = ns ? ns->disk : NULL;
160 struct request *req;
161 struct bio *bio = NULL;
162 void *meta = NULL;
163 int ret;
164
165 req = nvme_alloc_request(q, cmd, 0);
166 if (IS_ERR(req))
167 return PTR_ERR(req);
168
169 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
170
171 if (ubuffer && bufflen) {
172 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
173 GFP_KERNEL);
174 if (ret)
175 goto out;
176 bio = req->bio;
177
178 if (!disk)
179 goto submit;
180 bio->bi_bdev = bdget_disk(disk, 0);
181 if (!bio->bi_bdev) {
182 ret = -ENODEV;
183 goto out_unmap;
184 }
185
186 if (meta_buffer) {
187 struct bio_integrity_payload *bip;
188
189 meta = kmalloc(meta_len, GFP_KERNEL);
190 if (!meta) {
191 ret = -ENOMEM;
192 goto out_unmap;
193 }
194
195 if (write) {
196 if (copy_from_user(meta, meta_buffer,
197 meta_len)) {
198 ret = -EFAULT;
199 goto out_free_meta;
200 }
201 }
202
203 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
204 if (IS_ERR(bip)) {
205 ret = PTR_ERR(bip);
206 goto out_free_meta;
207 }
208
209 bip->bip_iter.bi_size = meta_len;
210 bip->bip_iter.bi_sector = meta_seed;
211
212 ret = bio_integrity_add_page(bio, virt_to_page(meta),
213 meta_len, offset_in_page(meta));
214 if (ret != meta_len) {
215 ret = -ENOMEM;
216 goto out_free_meta;
217 }
218 }
219 }
220 submit:
221 blk_execute_rq(req->q, disk, req, 0);
222 ret = req->errors;
223 if (result)
224 *result = (u32)(uintptr_t)req->special;
225 if (meta && !ret && !write) {
226 if (copy_to_user(meta_buffer, meta, meta_len))
227 ret = -EFAULT;
228 }
229 out_free_meta:
230 kfree(meta);
231 out_unmap:
232 if (bio) {
233 if (disk && bio->bi_bdev)
234 bdput(bio->bi_bdev);
235 blk_rq_unmap_user(bio);
236 }
237 out:
238 blk_mq_free_request(req);
239 return ret;
240}
241
242int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
243 void __user *ubuffer, unsigned bufflen, u32 *result,
244 unsigned timeout)
245{
246 return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
247 result, timeout);
248}
249
250int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
251{
252 struct nvme_command c = { };
253 int error;
254
255 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
256 c.identify.opcode = nvme_admin_identify;
257 c.identify.cns = cpu_to_le32(1);
258
259 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
260 if (!*id)
261 return -ENOMEM;
262
263 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
264 sizeof(struct nvme_id_ctrl));
265 if (error)
266 kfree(*id);
267 return error;
268}
269
270static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
271{
272 struct nvme_command c = { };
273
274 c.identify.opcode = nvme_admin_identify;
275 c.identify.cns = cpu_to_le32(2);
276 c.identify.nsid = cpu_to_le32(nsid);
277 return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
278}
279
280int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
281 struct nvme_id_ns **id)
282{
283 struct nvme_command c = { };
284 int error;
285
286 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
287 c.identify.opcode = nvme_admin_identify,
288 c.identify.nsid = cpu_to_le32(nsid),
289
290 *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
291 if (!*id)
292 return -ENOMEM;
293
294 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
295 sizeof(struct nvme_id_ns));
296 if (error)
297 kfree(*id);
298 return error;
299}
300
301int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
302 dma_addr_t dma_addr, u32 *result)
303{
304 struct nvme_command c;
305
306 memset(&c, 0, sizeof(c));
307 c.features.opcode = nvme_admin_get_features;
308 c.features.nsid = cpu_to_le32(nsid);
309 c.features.prp1 = cpu_to_le64(dma_addr);
310 c.features.fid = cpu_to_le32(fid);
311
312 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
313}
314
315int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
316 dma_addr_t dma_addr, u32 *result)
317{
318 struct nvme_command c;
319
320 memset(&c, 0, sizeof(c));
321 c.features.opcode = nvme_admin_set_features;
322 c.features.prp1 = cpu_to_le64(dma_addr);
323 c.features.fid = cpu_to_le32(fid);
324 c.features.dword11 = cpu_to_le32(dword11);
325
326 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
327}
328
329int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
330{
331 struct nvme_command c = { };
332 int error;
333
334 c.common.opcode = nvme_admin_get_log_page,
335 c.common.nsid = cpu_to_le32(0xFFFFFFFF),
336 c.common.cdw10[0] = cpu_to_le32(
337 (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
338 NVME_LOG_SMART),
339
340 *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
341 if (!*log)
342 return -ENOMEM;
343
344 error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
345 sizeof(struct nvme_smart_log));
346 if (error)
347 kfree(*log);
348 return error;
349}
350
351int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
352{
353 u32 q_count = (*count - 1) | ((*count - 1) << 16);
354 u32 result;
355 int status, nr_io_queues;
356
357 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
358 &result);
359 if (status)
360 return status;
361
362 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
363 *count = min(*count, nr_io_queues);
364 return 0;
365}
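
A worked example of the zero-based encoding used here (numbers are mine, not from the patch): asking for 8 I/O queues sets both NSQR and NCQR to 7, i.e. q_count = 0x00070007; if the controller only grants 4 of each, the Set Features result is 0x00030003, so min(3, 3) + 1 = 4 and *count is clamped from 8 down to 4.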
366
367static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
368{
369 struct nvme_user_io io;
370 struct nvme_command c;
371 unsigned length, meta_len;
372 void __user *metadata;
373
374 if (copy_from_user(&io, uio, sizeof(io)))
375 return -EFAULT;
376
377 switch (io.opcode) {
378 case nvme_cmd_write:
379 case nvme_cmd_read:
380 case nvme_cmd_compare:
381 break;
382 default:
383 return -EINVAL;
384 }
385
386 length = (io.nblocks + 1) << ns->lba_shift;
387 meta_len = (io.nblocks + 1) * ns->ms;
388 metadata = (void __user *)(uintptr_t)io.metadata;
389
390 if (ns->ext) {
391 length += meta_len;
392 meta_len = 0;
393 } else if (meta_len) {
394 if ((io.metadata & 3) || !io.metadata)
395 return -EINVAL;
396 }
397
398 memset(&c, 0, sizeof(c));
399 c.rw.opcode = io.opcode;
400 c.rw.flags = io.flags;
401 c.rw.nsid = cpu_to_le32(ns->ns_id);
402 c.rw.slba = cpu_to_le64(io.slba);
403 c.rw.length = cpu_to_le16(io.nblocks);
404 c.rw.control = cpu_to_le16(io.control);
405 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
406 c.rw.reftag = cpu_to_le32(io.reftag);
407 c.rw.apptag = cpu_to_le16(io.apptag);
408 c.rw.appmask = cpu_to_le16(io.appmask);
409
410 return __nvme_submit_user_cmd(ns->queue, &c,
411 (void __user *)(uintptr_t)io.addr, length,
412 metadata, meta_len, io.slba, NULL, 0);
413}
414
415static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
416 struct nvme_passthru_cmd __user *ucmd)
417{
418 struct nvme_passthru_cmd cmd;
419 struct nvme_command c;
420 unsigned timeout = 0;
421 int status;
422
423 if (!capable(CAP_SYS_ADMIN))
424 return -EACCES;
425 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
426 return -EFAULT;
427
428 memset(&c, 0, sizeof(c));
429 c.common.opcode = cmd.opcode;
430 c.common.flags = cmd.flags;
431 c.common.nsid = cpu_to_le32(cmd.nsid);
432 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
433 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
434 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
435 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
436 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
437 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
438 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
439 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
440
441 if (cmd.timeout_ms)
442 timeout = msecs_to_jiffies(cmd.timeout_ms);
443
444 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
445 (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
446 &cmd.result, timeout);
447 if (status >= 0) {
448 if (put_user(cmd.result, &ucmd->result))
449 return -EFAULT;
450 }
451
452 return status;
453}
454
455static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
456 unsigned int cmd, unsigned long arg)
457{
458 struct nvme_ns *ns = bdev->bd_disk->private_data;
459
460 switch (cmd) {
461 case NVME_IOCTL_ID:
462 force_successful_syscall_return();
463 return ns->ns_id;
464 case NVME_IOCTL_ADMIN_CMD:
465 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
466 case NVME_IOCTL_IO_CMD:
467 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
468 case NVME_IOCTL_SUBMIT_IO:
469 return nvme_submit_io(ns, (void __user *)arg);
470#ifdef CONFIG_BLK_DEV_NVME_SCSI
471 case SG_GET_VERSION_NUM:
472 return nvme_sg_get_version_num((void __user *)arg);
473 case SG_IO:
474 return nvme_sg_io(ns, (void __user *)arg);
475#endif
476 default:
477 return -ENOTTY;
478 }
479}
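
nvme_ioctl() is also where the native passthrough interface built by nvme_user_cmd() surfaces on the block node. A hedged user-space sketch of NVME_IOCTL_ADMIN_CMD, assuming a /dev/nvme0n1 node and the uapi <linux/nvme_ioctl.h> header; it issues Identify Controller (opcode 0x06, CNS=1) and reads the model string from the returned 4 KiB structure:

/*
 * User-space sketch (illustrative, device path assumed): send an
 * Identify Controller admin command through the passthrough ioctl
 * handled by nvme_user_cmd() above.
 */
#include <fcntl.h>
#include <linux/nvme_ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        static uint8_t id[4096];                /* Identify data is 4 KiB */
        struct nvme_admin_cmd cmd;
        int fd = open("/dev/nvme0n1", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&cmd, 0, sizeof(cmd));
        cmd.opcode = 0x06;                      /* Identify */
        cmd.addr = (uintptr_t)id;
        cmd.data_len = sizeof(id);
        cmd.cdw10 = 1;                          /* CNS=1: Identify Controller */

        if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0)
                printf("model: %.40s\n", (char *)&id[24]);  /* MN at byte 24 */
        close(fd);
        return 0;
}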
480
481#ifdef CONFIG_COMPAT
482static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
483 unsigned int cmd, unsigned long arg)
484{
485 switch (cmd) {
486 case SG_IO:
487 return -ENOIOCTLCMD;
488 }
489 return nvme_ioctl(bdev, mode, cmd, arg);
490}
491#else
492#define nvme_compat_ioctl NULL
493#endif
494
495static int nvme_open(struct block_device *bdev, fmode_t mode)
496{
497 return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
498}
499
500static void nvme_release(struct gendisk *disk, fmode_t mode)
501{
502 nvme_put_ns(disk->private_data);
503}
504
505static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
506{
507 /* some standard values */
508 geo->heads = 1 << 6;
509 geo->sectors = 1 << 5;
510 geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
511 return 0;
512}
513
514#ifdef CONFIG_BLK_DEV_INTEGRITY
515static void nvme_init_integrity(struct nvme_ns *ns)
516{
517 struct blk_integrity integrity;
518
519 switch (ns->pi_type) {
520 case NVME_NS_DPS_PI_TYPE3:
521 integrity.profile = &t10_pi_type3_crc;
522 break;
523 case NVME_NS_DPS_PI_TYPE1:
524 case NVME_NS_DPS_PI_TYPE2:
525 integrity.profile = &t10_pi_type1_crc;
526 break;
527 default:
528 integrity.profile = NULL;
529 break;
530 }
531 integrity.tuple_size = ns->ms;
532 blk_integrity_register(ns->disk, &integrity);
533 blk_queue_max_integrity_segments(ns->queue, 1);
534}
535#else
536static void nvme_init_integrity(struct nvme_ns *ns)
537{
538}
539#endif /* CONFIG_BLK_DEV_INTEGRITY */
540
541static void nvme_config_discard(struct nvme_ns *ns)
542{
543 u32 logical_block_size = queue_logical_block_size(ns->queue);
544 ns->queue->limits.discard_zeroes_data = 0;
545 ns->queue->limits.discard_alignment = logical_block_size;
546 ns->queue->limits.discard_granularity = logical_block_size;
547 blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
548 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
549}
550
551static int nvme_revalidate_disk(struct gendisk *disk)
552{
553 struct nvme_ns *ns = disk->private_data;
554 struct nvme_id_ns *id;
555 u8 lbaf, pi_type;
556 u16 old_ms;
557 unsigned short bs;
558
559 if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
560 dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
561 __func__, ns->ctrl->instance, ns->ns_id);
562 return -ENODEV;
563 }
564 if (id->ncap == 0) {
565 kfree(id);
566 return -ENODEV;
567 }
568
569 if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
570 if (nvme_nvm_register(ns->queue, disk->disk_name)) {
571 dev_warn(ns->ctrl->dev,
572 "%s: LightNVM init failure\n", __func__);
573 kfree(id);
574 return -ENODEV;
575 }
576 ns->type = NVME_NS_LIGHTNVM;
577 }
578
579 if (ns->ctrl->vs >= NVME_VS(1, 1))
580 memcpy(ns->eui, id->eui64, sizeof(ns->eui));
581 if (ns->ctrl->vs >= NVME_VS(1, 2))
582 memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
583
584 old_ms = ns->ms;
585 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
586 ns->lba_shift = id->lbaf[lbaf].ds;
587 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
588 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
589
590 /*
 591 * If identify namespace failed, use a default 512 byte block size so
 592 * the block layer can use it before failing read/write for 0 capacity.
593 */
594 if (ns->lba_shift == 0)
595 ns->lba_shift = 9;
596 bs = 1 << ns->lba_shift;
597 /* XXX: PI implementation requires metadata equal t10 pi tuple size */
598 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
599 id->dps & NVME_NS_DPS_PI_MASK : 0;
600
601 blk_mq_freeze_queue(disk->queue);
602 if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
603 ns->ms != old_ms ||
604 bs != queue_logical_block_size(disk->queue) ||
605 (ns->ms && ns->ext)))
606 blk_integrity_unregister(disk);
607
608 ns->pi_type = pi_type;
609 blk_queue_logical_block_size(ns->queue, bs);
610
611 if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
612 nvme_init_integrity(ns);
613 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
614 set_capacity(disk, 0);
615 else
616 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
617
618 if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
619 nvme_config_discard(ns);
620 blk_mq_unfreeze_queue(disk->queue);
621
622 kfree(id);
623 return 0;
624}
625
626static char nvme_pr_type(enum pr_type type)
627{
628 switch (type) {
629 case PR_WRITE_EXCLUSIVE:
630 return 1;
631 case PR_EXCLUSIVE_ACCESS:
632 return 2;
633 case PR_WRITE_EXCLUSIVE_REG_ONLY:
634 return 3;
635 case PR_EXCLUSIVE_ACCESS_REG_ONLY:
636 return 4;
637 case PR_WRITE_EXCLUSIVE_ALL_REGS:
638 return 5;
639 case PR_EXCLUSIVE_ACCESS_ALL_REGS:
640 return 6;
641 default:
642 return 0;
643 }
644};
645
646static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
647 u64 key, u64 sa_key, u8 op)
648{
649 struct nvme_ns *ns = bdev->bd_disk->private_data;
650 struct nvme_command c;
651 u8 data[16] = { 0, };
652
653 put_unaligned_le64(key, &data[0]);
654 put_unaligned_le64(sa_key, &data[8]);
655
656 memset(&c, 0, sizeof(c));
657 c.common.opcode = op;
658 c.common.nsid = cpu_to_le32(ns->ns_id);
659 c.common.cdw10[0] = cpu_to_le32(cdw10);
660
661 return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
662}
663
664static int nvme_pr_register(struct block_device *bdev, u64 old,
665 u64 new, unsigned flags)
666{
667 u32 cdw10;
668
669 if (flags & ~PR_FL_IGNORE_KEY)
670 return -EOPNOTSUPP;
671
672 cdw10 = old ? 2 : 0;
673 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
674 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
675 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
676}
677
678static int nvme_pr_reserve(struct block_device *bdev, u64 key,
679 enum pr_type type, unsigned flags)
680{
681 u32 cdw10;
682
683 if (flags & ~PR_FL_IGNORE_KEY)
684 return -EOPNOTSUPP;
685
686 cdw10 = nvme_pr_type(type) << 8;
687 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
688 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
689}
690
691static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
692 enum pr_type type, bool abort)
693{
694 u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
695 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
696}
697
698static int nvme_pr_clear(struct block_device *bdev, u64 key)
699{
700 u32 cdw10 = 1 | (key ? 1 << 3 : 0);
701 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
702}
703
704static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
705{
706 u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
707 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
708}
709
710static const struct pr_ops nvme_pr_ops = {
711 .pr_register = nvme_pr_register,
712 .pr_reserve = nvme_pr_reserve,
713 .pr_release = nvme_pr_release,
714 .pr_preempt = nvme_pr_preempt,
715 .pr_clear = nvme_pr_clear,
716};
717
718static const struct block_device_operations nvme_fops = {
719 .owner = THIS_MODULE,
720 .ioctl = nvme_ioctl,
721 .compat_ioctl = nvme_compat_ioctl,
722 .open = nvme_open,
723 .release = nvme_release,
724 .getgeo = nvme_getgeo,
725 .revalidate_disk= nvme_revalidate_disk,
726 .pr_ops = &nvme_pr_ops,
727};
728
729static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
730{
731 unsigned long timeout =
732 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
733 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
734 int ret;
735
736 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
737 if ((csts & NVME_CSTS_RDY) == bit)
738 break;
739
740 msleep(100);
741 if (fatal_signal_pending(current))
742 return -EINTR;
743 if (time_after(jiffies, timeout)) {
744 dev_err(ctrl->dev,
745 "Device not ready; aborting %s\n", enabled ?
746 "initialisation" : "reset");
747 return -ENODEV;
748 }
749 }
750
751 return ret;
752}
753
754/*
755 * If the device has been passed off to us in an enabled state, just clear
756 * the enabled bit. The spec says we should set the 'shutdown notification
757 * bits', but doing so may cause the device to complete commands to the
758 * admin queue ... and we don't know what memory that might be pointing at!
759 */
760int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
761{
762 int ret;
763
764 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
765 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
766
767 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
768 if (ret)
769 return ret;
770 return nvme_wait_ready(ctrl, cap, false);
771}
772
773int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
774{
775 /*
776 * Default to a 4K page size, with the intention to update this
 777 * path in the future to accommodate architectures with differing
778 * kernel and IO page sizes.
779 */
780 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
781 int ret;
782
783 if (page_shift < dev_page_min) {
784 dev_err(ctrl->dev,
785 "Minimum device page size %u too large for host (%u)\n",
786 1 << dev_page_min, 1 << page_shift);
787 return -ENODEV;
788 }
789
790 ctrl->page_size = 1 << page_shift;
791
792 ctrl->ctrl_config = NVME_CC_CSS_NVM;
793 ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
794 ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
795 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
796 ctrl->ctrl_config |= NVME_CC_ENABLE;
797
798 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
799 if (ret)
800 return ret;
801 return nvme_wait_ready(ctrl, cap, true);
802}
803
804int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
805{
806 unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
807 u32 csts;
808 int ret;
809
810 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
811 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
812
813 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
814 if (ret)
815 return ret;
816
817 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
818 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
819 break;
820
821 msleep(100);
822 if (fatal_signal_pending(current))
823 return -EINTR;
824 if (time_after(jiffies, timeout)) {
825 dev_err(ctrl->dev,
826 "Device shutdown incomplete; abort shutdown\n");
827 return -ENODEV;
828 }
829 }
830
831 return ret;
832}
833
834/*
835 * Initialize the cached copies of the Identify data and various controller
 836 * registers in our nvme_ctrl structure. This should be called as soon as
837 * the admin queue is fully up and running.
838 */
839int nvme_init_identify(struct nvme_ctrl *ctrl)
840{
841 struct nvme_id_ctrl *id;
842 u64 cap;
843 int ret, page_shift;
844
845 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
846 if (ret) {
847 dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
848 return ret;
849 }
850
851 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
852 if (ret) {
853 dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
854 return ret;
855 }
856 page_shift = NVME_CAP_MPSMIN(cap) + 12;
857
858 if (ctrl->vs >= NVME_VS(1, 1))
859 ctrl->subsystem = NVME_CAP_NSSRC(cap);
860
861 ret = nvme_identify_ctrl(ctrl, &id);
862 if (ret) {
863 dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
864 return -EIO;
865 }
866
867 ctrl->oncs = le16_to_cpup(&id->oncs);
868 atomic_set(&ctrl->abort_limit, id->acl + 1);
869 ctrl->vwc = id->vwc;
870 memcpy(ctrl->serial, id->sn, sizeof(id->sn));
871 memcpy(ctrl->model, id->mn, sizeof(id->mn));
872 memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
873 if (id->mdts)
874 ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
875 else
876 ctrl->max_hw_sectors = UINT_MAX;
877
878 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
879 unsigned int max_hw_sectors;
880
881 ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
882 max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
883 if (ctrl->max_hw_sectors) {
884 ctrl->max_hw_sectors = min(max_hw_sectors,
885 ctrl->max_hw_sectors);
886 } else {
887 ctrl->max_hw_sectors = max_hw_sectors;
888 }
889 }
890
891 kfree(id);
892 return 0;
893}
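
Taken together, nvme_disable_ctrl(), nvme_enable_ctrl() and nvme_init_identify() encode the bring-up order the comments above describe: clear CC.EN before touching queue memory, only then enable, and cache the Identify data once the admin queue is live. The sketch below is illustrative rather than code from this series; nvme_my_configure_admin_queue() stands in for the transport-specific admin queue setup that pci.c implements for PCIe.

/*
 * Illustrative bring-up order for a transport driver; the admin-queue
 * setup step is transport specific and hypothetical here.
 */
static int nvme_my_reset(struct nvme_ctrl *ctrl)
{
        u64 cap;
        int ret;

        ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
        if (ret)
                return ret;

        /* 1) Clear CC.EN and wait for CSTS.RDY to drop. */
        ret = nvme_disable_ctrl(ctrl, cap);
        if (ret)
                return ret;

        /* 2) Program admin queue addresses/sizes (transport specific). */
        ret = nvme_my_configure_admin_queue(ctrl);      /* hypothetical */
        if (ret)
                return ret;

        /* 3) Set CC.EN and wait for CSTS.RDY. */
        ret = nvme_enable_ctrl(ctrl, cap);
        if (ret)
                return ret;

        /* 4) Cache Identify data and controller registers. */
        return nvme_init_identify(ctrl);
}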
894
895static int nvme_dev_open(struct inode *inode, struct file *file)
896{
897 struct nvme_ctrl *ctrl;
898 int instance = iminor(inode);
899 int ret = -ENODEV;
900
901 spin_lock(&dev_list_lock);
902 list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
903 if (ctrl->instance != instance)
904 continue;
905
906 if (!ctrl->admin_q) {
907 ret = -EWOULDBLOCK;
908 break;
909 }
910 if (!kref_get_unless_zero(&ctrl->kref))
911 break;
912 file->private_data = ctrl;
913 ret = 0;
914 break;
915 }
916 spin_unlock(&dev_list_lock);
917
918 return ret;
919}
920
921static int nvme_dev_release(struct inode *inode, struct file *file)
922{
923 nvme_put_ctrl(file->private_data);
924 return 0;
925}
926
927static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
928{
929 struct nvme_ns *ns;
930 int ret;
931
932 mutex_lock(&ctrl->namespaces_mutex);
933 if (list_empty(&ctrl->namespaces)) {
934 ret = -ENOTTY;
935 goto out_unlock;
936 }
937
938 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
939 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
940 dev_warn(ctrl->dev,
941 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
942 ret = -EINVAL;
943 goto out_unlock;
944 }
945
946 dev_warn(ctrl->dev,
947 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
948 kref_get(&ns->kref);
949 mutex_unlock(&ctrl->namespaces_mutex);
950
951 ret = nvme_user_cmd(ctrl, ns, argp);
952 nvme_put_ns(ns);
953 return ret;
954
955out_unlock:
956 mutex_unlock(&ctrl->namespaces_mutex);
957 return ret;
958}
959
960static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
961 unsigned long arg)
962{
963 struct nvme_ctrl *ctrl = file->private_data;
964 void __user *argp = (void __user *)arg;
965
966 switch (cmd) {
967 case NVME_IOCTL_ADMIN_CMD:
968 return nvme_user_cmd(ctrl, NULL, argp);
969 case NVME_IOCTL_IO_CMD:
970 return nvme_dev_user_cmd(ctrl, argp);
971 case NVME_IOCTL_RESET:
972 dev_warn(ctrl->dev, "resetting controller\n");
973 return ctrl->ops->reset_ctrl(ctrl);
974 case NVME_IOCTL_SUBSYS_RESET:
975 return nvme_reset_subsystem(ctrl);
976 default:
977 return -ENOTTY;
978 }
979}
980
981static const struct file_operations nvme_dev_fops = {
982 .owner = THIS_MODULE,
983 .open = nvme_dev_open,
984 .release = nvme_dev_release,
985 .unlocked_ioctl = nvme_dev_ioctl,
986 .compat_ioctl = nvme_dev_ioctl,
987};
988
989static ssize_t nvme_sysfs_reset(struct device *dev,
990 struct device_attribute *attr, const char *buf,
991 size_t count)
992{
993 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
994 int ret;
995
996 ret = ctrl->ops->reset_ctrl(ctrl);
997 if (ret < 0)
998 return ret;
999 return count;
1000}
1001static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
1002
1003static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
1004 char *buf)
1005{
1006 struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1007 return sprintf(buf, "%pU\n", ns->uuid);
1008}
1009static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
1010
1011static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
1012 char *buf)
1013{
1014 struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1015 return sprintf(buf, "%8phd\n", ns->eui);
1016}
1017static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
1018
1019static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
1020 char *buf)
1021{
1022 struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1023 return sprintf(buf, "%d\n", ns->ns_id);
1024}
1025static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
1026
1027static struct attribute *nvme_ns_attrs[] = {
1028 &dev_attr_uuid.attr,
1029 &dev_attr_eui.attr,
1030 &dev_attr_nsid.attr,
1031 NULL,
1032};
1033
1034static umode_t nvme_attrs_are_visible(struct kobject *kobj,
1035 struct attribute *a, int n)
1036{
1037 struct device *dev = container_of(kobj, struct device, kobj);
1038 struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1039
1040 if (a == &dev_attr_uuid.attr) {
1041 if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1042 return 0;
1043 }
1044 if (a == &dev_attr_eui.attr) {
1045 if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1046 return 0;
1047 }
1048 return a->mode;
1049}
1050
1051static const struct attribute_group nvme_ns_attr_group = {
1052 .attrs = nvme_ns_attrs,
1053 .is_visible = nvme_attrs_are_visible,
1054};
1055
1056#define nvme_show_function(field) \
1057static ssize_t field##_show(struct device *dev, \
1058 struct device_attribute *attr, char *buf) \
1059{ \
1060 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
1061 return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \
1062} \
1063static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1064
1065nvme_show_function(model);
1066nvme_show_function(serial);
1067nvme_show_function(firmware_rev);
1068
1069static struct attribute *nvme_dev_attrs[] = {
1070 &dev_attr_reset_controller.attr,
1071 &dev_attr_model.attr,
1072 &dev_attr_serial.attr,
1073 &dev_attr_firmware_rev.attr,
1074 NULL
1075};
1076
1077static struct attribute_group nvme_dev_attrs_group = {
1078 .attrs = nvme_dev_attrs,
1079};
1080
1081static const struct attribute_group *nvme_dev_attr_groups[] = {
1082 &nvme_dev_attrs_group,
1083 NULL,
1084};
1085
1086static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
1087{
1088 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
1089 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
1090
1091 return nsa->ns_id - nsb->ns_id;
1092}
1093
1094static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1095{
1096 struct nvme_ns *ns;
1097
1098 lockdep_assert_held(&ctrl->namespaces_mutex);
1099
1100 list_for_each_entry(ns, &ctrl->namespaces, list) {
1101 if (ns->ns_id == nsid)
1102 return ns;
1103 if (ns->ns_id > nsid)
1104 break;
1105 }
1106 return NULL;
1107}
1108
1109static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1110{
1111 struct nvme_ns *ns;
1112 struct gendisk *disk;
1113 int node = dev_to_node(ctrl->dev);
1114
1115 lockdep_assert_held(&ctrl->namespaces_mutex);
1116
1117 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
1118 if (!ns)
1119 return;
1120
1121 ns->queue = blk_mq_init_queue(ctrl->tagset);
1122 if (IS_ERR(ns->queue))
1123 goto out_free_ns;
1124 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1125 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1126 ns->queue->queuedata = ns;
1127 ns->ctrl = ctrl;
1128
1129 disk = alloc_disk_node(0, node);
1130 if (!disk)
1131 goto out_free_queue;
1132
1133 kref_init(&ns->kref);
1134 ns->ns_id = nsid;
1135 ns->disk = disk;
1136 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
1137
1138 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1139 if (ctrl->max_hw_sectors) {
1140 blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
1141 blk_queue_max_segments(ns->queue,
1142 (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
1143 }
1144 if (ctrl->stripe_size)
1145 blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
1146 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1147 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
1148 blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
1149
1150 disk->major = nvme_major;
1151 disk->first_minor = 0;
1152 disk->fops = &nvme_fops;
1153 disk->private_data = ns;
1154 disk->queue = ns->queue;
1155 disk->driverfs_dev = ctrl->device;
1156 disk->flags = GENHD_FL_EXT_DEVT;
1157 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
1158
1159 if (nvme_revalidate_disk(ns->disk))
1160 goto out_free_disk;
1161
1162 list_add_tail(&ns->list, &ctrl->namespaces);
1163 kref_get(&ctrl->kref);
1164 if (ns->type == NVME_NS_LIGHTNVM)
1165 return;
1166
1167 add_disk(ns->disk);
1168 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
1169 &nvme_ns_attr_group))
1170 pr_warn("%s: failed to create sysfs group for identification\n",
1171 ns->disk->disk_name);
1172 return;
1173 out_free_disk:
1174 kfree(disk);
1175 out_free_queue:
1176 blk_cleanup_queue(ns->queue);
1177 out_free_ns:
1178 kfree(ns);
1179}
1180
1181static void nvme_ns_remove(struct nvme_ns *ns)
1182{
1183 bool kill = nvme_io_incapable(ns->ctrl) &&
1184 !blk_queue_dying(ns->queue);
1185
1186 lockdep_assert_held(&ns->ctrl->namespaces_mutex);
1187
1188 if (kill) {
1189 blk_set_queue_dying(ns->queue);
1190
1191 /*
1192 * The controller was shutdown first if we got here through
1193 * device removal. The shutdown may requeue outstanding
1194 * requests. These need to be aborted immediately so
1195 * del_gendisk doesn't block indefinitely for their completion.
1196 */
1197 blk_mq_abort_requeue_list(ns->queue);
1198 }
1199 if (ns->disk->flags & GENHD_FL_UP) {
1200 if (blk_get_integrity(ns->disk))
1201 blk_integrity_unregister(ns->disk);
1202 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
1203 &nvme_ns_attr_group);
1204 del_gendisk(ns->disk);
1205 }
1206 if (kill || !blk_queue_dying(ns->queue)) {
1207 blk_mq_abort_requeue_list(ns->queue);
1208 blk_cleanup_queue(ns->queue);
1209 }
1210 list_del_init(&ns->list);
1211 nvme_put_ns(ns);
1212}
1213
1214static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1215{
1216 struct nvme_ns *ns;
1217
1218 ns = nvme_find_ns(ctrl, nsid);
1219 if (ns) {
1220 if (revalidate_disk(ns->disk))
1221 nvme_ns_remove(ns);
1222 } else
1223 nvme_alloc_ns(ctrl, nsid);
1224}
1225
1226static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1227{
1228 struct nvme_ns *ns;
1229 __le32 *ns_list;
1230 unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
1231 int ret = 0;
1232
1233 ns_list = kzalloc(0x1000, GFP_KERNEL);
1234 if (!ns_list)
1235 return -ENOMEM;
1236
1237 for (i = 0; i < num_lists; i++) {
1238 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
1239 if (ret)
1240 goto out;
1241
1242 for (j = 0; j < min(nn, 1024U); j++) {
1243 nsid = le32_to_cpu(ns_list[j]);
1244 if (!nsid)
1245 goto out;
1246
1247 nvme_validate_ns(ctrl, nsid);
1248
1249 while (++prev < nsid) {
1250 ns = nvme_find_ns(ctrl, prev);
1251 if (ns)
1252 nvme_ns_remove(ns);
1253 }
1254 }
1255 nn -= j;
1256 }
1257 out:
1258 kfree(ns_list);
1259 return ret;
1260}
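
The constants above follow from the Identify Namespace List format: the 0x1000-byte (4 KiB) buffer holds 0x1000 / sizeof(__le32) = 1024 namespace IDs per command, hence the DIV_ROUND_UP(nn, 1024) passes, with prev carrying the last NSID seen so each subsequent list starts just after it.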
1261
1262static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
1263{
1264 struct nvme_ns *ns, *next;
1265 unsigned i;
1266
1267 lockdep_assert_held(&ctrl->namespaces_mutex);
1268
1269 for (i = 1; i <= nn; i++)
1270 nvme_validate_ns(ctrl, i);
1271
1272 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1273 if (ns->ns_id > nn)
1274 nvme_ns_remove(ns);
1275 }
1276}
1277
1278void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
1279{
1280 struct nvme_id_ctrl *id;
1281 unsigned nn;
1282
1283 if (nvme_identify_ctrl(ctrl, &id))
1284 return;
1285
1286 mutex_lock(&ctrl->namespaces_mutex);
1287 nn = le32_to_cpu(id->nn);
1288 if (ctrl->vs >= NVME_VS(1, 1) &&
1289 !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
1290 if (!nvme_scan_ns_list(ctrl, nn))
1291 goto done;
1292 }
1293 __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
1294 done:
1295 list_sort(NULL, &ctrl->namespaces, ns_cmp);
1296 mutex_unlock(&ctrl->namespaces_mutex);
1297 kfree(id);
1298}
1299
1300void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
1301{
1302 struct nvme_ns *ns, *next;
1303
1304 mutex_lock(&ctrl->namespaces_mutex);
1305 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
1306 nvme_ns_remove(ns);
1307 mutex_unlock(&ctrl->namespaces_mutex);
1308}
1309
1310static DEFINE_IDA(nvme_instance_ida);
1311
1312static int nvme_set_instance(struct nvme_ctrl *ctrl)
1313{
1314 int instance, error;
1315
1316 do {
1317 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1318 return -ENODEV;
1319
1320 spin_lock(&dev_list_lock);
1321 error = ida_get_new(&nvme_instance_ida, &instance);
1322 spin_unlock(&dev_list_lock);
1323 } while (error == -EAGAIN);
1324
1325 if (error)
1326 return -ENODEV;
1327
1328 ctrl->instance = instance;
1329 return 0;
1330}
1331
1332static void nvme_release_instance(struct nvme_ctrl *ctrl)
1333{
1334 spin_lock(&dev_list_lock);
1335 ida_remove(&nvme_instance_ida, ctrl->instance);
1336 spin_unlock(&dev_list_lock);
1337}
1338
1339void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
1340 {
1341 device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1342
1343 spin_lock(&dev_list_lock);
1344 list_del(&ctrl->node);
1345 spin_unlock(&dev_list_lock);
1346}
1347
1348static void nvme_free_ctrl(struct kref *kref)
1349{
1350 struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
1351
1352 put_device(ctrl->device);
1353 nvme_release_instance(ctrl);
1354
1355 ctrl->ops->free_ctrl(ctrl);
1356}
1357
1358void nvme_put_ctrl(struct nvme_ctrl *ctrl)
1359{
1360 kref_put(&ctrl->kref, nvme_free_ctrl);
1361}
1362
1363/*
 1364 * Initialize an NVMe controller structure. This needs to be called during
 1365 * earliest initialization so that we have the initialized structures around
1366 * during probing.
1367 */
1368int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
1369 const struct nvme_ctrl_ops *ops, unsigned long quirks)
1370{
1371 int ret;
1372
1373 INIT_LIST_HEAD(&ctrl->namespaces);
1374 mutex_init(&ctrl->namespaces_mutex);
1375 kref_init(&ctrl->kref);
1376 ctrl->dev = dev;
1377 ctrl->ops = ops;
1378 ctrl->quirks = quirks;
1379
1380 ret = nvme_set_instance(ctrl);
1381 if (ret)
1382 goto out;
1383
1384 ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
1385 MKDEV(nvme_char_major, ctrl->instance),
1386 dev, nvme_dev_attr_groups,
1387 "nvme%d", ctrl->instance);
1388 if (IS_ERR(ctrl->device)) {
1389 ret = PTR_ERR(ctrl->device);
1390 goto out_release_instance;
1391 }
1392 get_device(ctrl->device);
1393 dev_set_drvdata(ctrl->device, ctrl);
1394
1395 spin_lock(&dev_list_lock);
1396 list_add_tail(&ctrl->node, &nvme_ctrl_list);
1397 spin_unlock(&dev_list_lock);
1398
1399 return 0;
1400out_release_instance:
1401 nvme_release_instance(ctrl);
1402out:
1403 return ret;
1404}
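
nvme_init_ctrl() is the seam the whole core/transport split hangs on: a transport fills in an nvme_ctrl_ops (see the nvme.h hunk further down) and registers its nvme_ctrl here early in probe, after which the core owns the char device, sysfs groups and namespace scanning. The following is a condensed, hypothetical sketch of that wiring; the my_* names are placeholders, not symbols from this series, and the real PCIe implementation is in pci.c.

/*
 * Hypothetical transport glue, for illustration only.  A real transport
 * embeds struct nvme_ctrl in its own device structure and fills in all
 * of the callbacks; the remaining ops are omitted here for brevity.
 */
struct my_nvme_dev {
        struct nvme_ctrl ctrl;
        void __iomem *bar;
};

static int my_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
        struct my_nvme_dev *dev = container_of(ctrl, struct my_nvme_dev, ctrl);

        *val = readl(dev->bar + off);
        return 0;
}

static const struct nvme_ctrl_ops my_nvme_ctrl_ops = {
        .reg_read32     = my_reg_read32,
        /* .reg_write32, .reg_read64, .io_incapable, .reset_ctrl, .free_ctrl */
};

static int my_probe(struct device *parent, struct my_nvme_dev *dev)
{
        int ret;

        /* Register with the core before touching queues or namespaces. */
        ret = nvme_init_ctrl(&dev->ctrl, parent, &my_nvme_ctrl_ops, 0);
        if (ret)
                return ret;

        /* ... bring the controller up, then populate the namespaces ... */
        nvme_scan_namespaces(&dev->ctrl);
        return 0;
}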
1405
1406void nvme_stop_queues(struct nvme_ctrl *ctrl)
1407{
1408 struct nvme_ns *ns;
1409
1410 mutex_lock(&ctrl->namespaces_mutex);
1411 list_for_each_entry(ns, &ctrl->namespaces, list) {
1412 spin_lock_irq(ns->queue->queue_lock);
1413 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
1414 spin_unlock_irq(ns->queue->queue_lock);
1415
1416 blk_mq_cancel_requeue_work(ns->queue);
1417 blk_mq_stop_hw_queues(ns->queue);
1418 }
1419 mutex_unlock(&ctrl->namespaces_mutex);
1420}
1421
1422void nvme_start_queues(struct nvme_ctrl *ctrl)
1423{
1424 struct nvme_ns *ns;
1425
1426 mutex_lock(&ctrl->namespaces_mutex);
1427 list_for_each_entry(ns, &ctrl->namespaces, list) {
1428 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
1429 blk_mq_start_stopped_hw_queues(ns->queue, true);
1430 blk_mq_kick_requeue_list(ns->queue);
1431 }
1432 mutex_unlock(&ctrl->namespaces_mutex);
1433}
1434
1435int __init nvme_core_init(void)
1436{
1437 int result;
1438
1439 result = register_blkdev(nvme_major, "nvme");
1440 if (result < 0)
1441 return result;
1442 else if (result > 0)
1443 nvme_major = result;
1444
1445 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
1446 &nvme_dev_fops);
1447 if (result < 0)
1448 goto unregister_blkdev;
1449 else if (result > 0)
1450 nvme_char_major = result;
1451
1452 nvme_class = class_create(THIS_MODULE, "nvme");
1453 if (IS_ERR(nvme_class)) {
1454 result = PTR_ERR(nvme_class);
1455 goto unregister_chrdev;
1456 }
1457
1458 return 0;
1459
1460 unregister_chrdev:
1461 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1462 unregister_blkdev:
1463 unregister_blkdev(nvme_major, "nvme");
1464 return result;
1465}
1466
1467void nvme_core_exit(void)
1468{
1469 unregister_blkdev(nvme_major, "nvme");
1470 class_destroy(nvme_class);
1471 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1472}
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 71f2bbc865cf..5cd3725e2fa4 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -294,7 +294,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
294static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) 294static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
295{ 295{
296 struct nvme_ns *ns = nvmdev->q->queuedata; 296 struct nvme_ns *ns = nvmdev->q->queuedata;
297 struct nvme_dev *dev = ns->dev;
298 struct nvme_nvm_id *nvme_nvm_id; 297 struct nvme_nvm_id *nvme_nvm_id;
299 struct nvme_nvm_command c = {}; 298 struct nvme_nvm_command c = {};
300 int ret; 299 int ret;
@@ -307,7 +306,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
307 if (!nvme_nvm_id) 306 if (!nvme_nvm_id)
308 return -ENOMEM; 307 return -ENOMEM;
309 308
310 ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, 309 ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
311 nvme_nvm_id, sizeof(struct nvme_nvm_id)); 310 nvme_nvm_id, sizeof(struct nvme_nvm_id));
312 if (ret) { 311 if (ret) {
313 ret = -EIO; 312 ret = -EIO;
@@ -332,9 +331,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
332 nvm_l2p_update_fn *update_l2p, void *priv) 331 nvm_l2p_update_fn *update_l2p, void *priv)
333{ 332{
334 struct nvme_ns *ns = nvmdev->q->queuedata; 333 struct nvme_ns *ns = nvmdev->q->queuedata;
335 struct nvme_dev *dev = ns->dev;
336 struct nvme_nvm_command c = {}; 334 struct nvme_nvm_command c = {};
337 u32 len = queue_max_hw_sectors(dev->admin_q) << 9; 335 u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
338 u32 nlb_pr_rq = len / sizeof(u64); 336 u32 nlb_pr_rq = len / sizeof(u64);
339 u64 cmd_slba = slba; 337 u64 cmd_slba = slba;
340 void *entries; 338 void *entries;
@@ -352,10 +350,10 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
352 c.l2p.slba = cpu_to_le64(cmd_slba); 350 c.l2p.slba = cpu_to_le64(cmd_slba);
353 c.l2p.nlb = cpu_to_le32(cmd_nlb); 351 c.l2p.nlb = cpu_to_le32(cmd_nlb);
354 352
355 ret = nvme_submit_sync_cmd(dev->admin_q, 353 ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
356 (struct nvme_command *)&c, entries, len); 354 (struct nvme_command *)&c, entries, len);
357 if (ret) { 355 if (ret) {
358 dev_err(dev->dev, "L2P table transfer failed (%d)\n", 356 dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n",
359 ret); 357 ret);
360 ret = -EIO; 358 ret = -EIO;
361 goto out; 359 goto out;
@@ -381,7 +379,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
381{ 379{
382 struct request_queue *q = nvmdev->q; 380 struct request_queue *q = nvmdev->q;
383 struct nvme_ns *ns = q->queuedata; 381 struct nvme_ns *ns = q->queuedata;
384 struct nvme_dev *dev = ns->dev; 382 struct nvme_ctrl *ctrl = ns->ctrl;
385 struct nvme_nvm_command c = {}; 383 struct nvme_nvm_command c = {};
386 struct nvme_nvm_bb_tbl *bb_tbl; 384 struct nvme_nvm_bb_tbl *bb_tbl;
387 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; 385 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks;
@@ -395,30 +393,30 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
395 if (!bb_tbl) 393 if (!bb_tbl)
396 return -ENOMEM; 394 return -ENOMEM;
397 395
398 ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, 396 ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c,
399 bb_tbl, tblsz); 397 bb_tbl, tblsz);
400 if (ret) { 398 if (ret) {
401 dev_err(dev->dev, "get bad block table failed (%d)\n", ret); 399 dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret);
402 ret = -EIO; 400 ret = -EIO;
403 goto out; 401 goto out;
404 } 402 }
405 403
406 if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || 404 if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' ||
407 bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { 405 bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') {
408 dev_err(dev->dev, "bbt format mismatch\n"); 406 dev_err(ctrl->dev, "bbt format mismatch\n");
409 ret = -EINVAL; 407 ret = -EINVAL;
410 goto out; 408 goto out;
411 } 409 }
412 410
413 if (le16_to_cpu(bb_tbl->verid) != 1) { 411 if (le16_to_cpu(bb_tbl->verid) != 1) {
414 ret = -EINVAL; 412 ret = -EINVAL;
415 dev_err(dev->dev, "bbt version not supported\n"); 413 dev_err(ctrl->dev, "bbt version not supported\n");
416 goto out; 414 goto out;
417 } 415 }
418 416
419 if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { 417 if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) {
420 ret = -EINVAL; 418 ret = -EINVAL;
421 dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", 419 dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)",
422 le32_to_cpu(bb_tbl->tblks), nr_blocks); 420 le32_to_cpu(bb_tbl->tblks), nr_blocks);
423 goto out; 421 goto out;
424 } 422 }
@@ -434,7 +432,6 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
434 int type) 432 int type)
435{ 433{
436 struct nvme_ns *ns = nvmdev->q->queuedata; 434 struct nvme_ns *ns = nvmdev->q->queuedata;
437 struct nvme_dev *dev = ns->dev;
438 struct nvme_nvm_command c = {}; 435 struct nvme_nvm_command c = {};
439 int ret = 0; 436 int ret = 0;
440 437
@@ -444,10 +441,10 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
444 c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); 441 c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1);
445 c.set_bb.value = type; 442 c.set_bb.value = type;
446 443
447 ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, 444 ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
448 NULL, 0); 445 NULL, 0);
449 if (ret) 446 if (ret)
450 dev_err(dev->dev, "set bad block table failed (%d)\n", ret); 447 dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret);
451 return ret; 448 return ret;
452} 449}
453 450
@@ -532,9 +529,8 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
532static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) 529static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
533{ 530{
534 struct nvme_ns *ns = nvmdev->q->queuedata; 531 struct nvme_ns *ns = nvmdev->q->queuedata;
535 struct nvme_dev *dev = ns->dev;
536 532
537 return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); 533 return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0);
538} 534}
539 535
540static void nvme_nvm_destroy_dma_pool(void *pool) 536static void nvme_nvm_destroy_dma_pool(void *pool)
@@ -592,8 +588,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name)
592 588
593int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) 589int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
594{ 590{
595 struct nvme_dev *dev = ns->dev; 591 struct nvme_ctrl *ctrl = ns->ctrl;
596 struct pci_dev *pdev = to_pci_dev(dev->dev); 592 /* XXX: this is poking into PCI structures from generic code! */
593 struct pci_dev *pdev = to_pci_dev(ctrl->dev);
597 594
598 /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ 595 /* QEMU NVMe simulator - PCI ID + Vendor specific bit */
599 if (pdev->vendor == PCI_VENDOR_ID_CNEX && 596 if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 044253dca30a..4fb5bb737868 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,58 +19,77 @@
19#include <linux/kref.h> 19#include <linux/kref.h>
20#include <linux/blk-mq.h> 20#include <linux/blk-mq.h>
21 21
22enum {
23 /*
24 * Driver internal status code for commands that were cancelled due
25 * to timeouts or controller shutdown. The value is negative so
26 * that it a) doesn't overlap with the unsigned hardware error codes,
27 * and b) can easily be tested for.
28 */
29 NVME_SC_CANCELLED = -EINTR,
30};
31
22extern unsigned char nvme_io_timeout; 32extern unsigned char nvme_io_timeout;
23#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) 33#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
24 34
35extern unsigned char admin_timeout;
36#define ADMIN_TIMEOUT (admin_timeout * HZ)
37
38extern unsigned char shutdown_timeout;
39#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
40
25enum { 41enum {
26 NVME_NS_LBA = 0, 42 NVME_NS_LBA = 0,
27 NVME_NS_LIGHTNVM = 1, 43 NVME_NS_LIGHTNVM = 1,
28}; 44};
29 45
30/* 46/*
31 * Represents an NVM Express device. Each nvme_dev is a PCI function. 47 * List of workarounds for devices that required behavior not specified in
48 * the standard.
32 */ 49 */
33struct nvme_dev { 50enum nvme_quirks {
34 struct list_head node; 51 /*
35 struct nvme_queue **queues; 52 * Prefers I/O aligned to a stripe size specified in a vendor
53 * specific Identify field.
54 */
55 NVME_QUIRK_STRIPE_SIZE = (1 << 0),
56
57 /*
58 * The controller doesn't handle Identify value others than 0 or 1
59 * correctly.
60 */
61 NVME_QUIRK_IDENTIFY_CNS = (1 << 1),
62};
63
64struct nvme_ctrl {
65 const struct nvme_ctrl_ops *ops;
36 struct request_queue *admin_q; 66 struct request_queue *admin_q;
37 struct blk_mq_tag_set tagset;
38 struct blk_mq_tag_set admin_tagset;
39 u32 __iomem *dbs;
40 struct device *dev; 67 struct device *dev;
41 struct dma_pool *prp_page_pool; 68 struct kref kref;
42 struct dma_pool *prp_small_pool;
43 int instance; 69 int instance;
44 unsigned queue_count; 70 struct blk_mq_tag_set *tagset;
45 unsigned online_queues;
46 unsigned max_qid;
47 int q_depth;
48 u32 db_stride;
49 u32 ctrl_config;
50 struct msix_entry *entry;
51 struct nvme_bar __iomem *bar;
52 struct list_head namespaces; 71 struct list_head namespaces;
53 struct kref kref; 72 struct mutex namespaces_mutex;
54 struct device *device; 73 struct device *device; /* char device */
55 struct work_struct reset_work; 74 struct list_head node;
56 struct work_struct probe_work; 75
57 struct work_struct scan_work;
58 char name[12]; 76 char name[12];
59 char serial[20]; 77 char serial[20];
60 char model[40]; 78 char model[40];
61 char firmware_rev[8]; 79 char firmware_rev[8];
62 bool subsystem; 80
81 u32 ctrl_config;
82
83 u32 page_size;
63 u32 max_hw_sectors; 84 u32 max_hw_sectors;
64 u32 stripe_size; 85 u32 stripe_size;
65 u32 page_size;
66 void __iomem *cmb;
67 dma_addr_t cmb_dma_addr;
68 u64 cmb_size;
69 u32 cmbsz;
70 u16 oncs; 86 u16 oncs;
71 u16 abort_limit; 87 atomic_t abort_limit;
72 u8 event_limit; 88 u8 event_limit;
73 u8 vwc; 89 u8 vwc;
90 u32 vs;
91 bool subsystem;
92 unsigned long quirks;
74}; 93};
75 94
76/* 95/*
@@ -79,11 +98,14 @@ struct nvme_dev {
79struct nvme_ns { 98struct nvme_ns {
80 struct list_head list; 99 struct list_head list;
81 100
82 struct nvme_dev *dev; 101 struct nvme_ctrl *ctrl;
83 struct request_queue *queue; 102 struct request_queue *queue;
84 struct gendisk *disk; 103 struct gendisk *disk;
85 struct kref kref; 104 struct kref kref;
86 105
106 u8 eui[8];
107 u8 uuid[16];
108
87 unsigned ns_id; 109 unsigned ns_id;
88 int lba_shift; 110 int lba_shift;
89 u16 ms; 111 u16 ms;
@@ -94,41 +116,156 @@ struct nvme_ns {
94 u32 mode_select_block_len; 116 u32 mode_select_block_len;
95}; 117};
96 118
97/* 119struct nvme_ctrl_ops {
98 * The nvme_iod describes the data in an I/O, including the list of PRP 120 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
99 * entries. You can't see it in this data structure because C doesn't let 121 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
100 * me express that. Use nvme_alloc_iod to ensure there's enough space 122 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
101 * allocated to store the PRP list. 123 bool (*io_incapable)(struct nvme_ctrl *ctrl);
102 */ 124 int (*reset_ctrl)(struct nvme_ctrl *ctrl);
103struct nvme_iod { 125 void (*free_ctrl)(struct nvme_ctrl *ctrl);
104 unsigned long private; /* For the use of the submitter of the I/O */
105 int npages; /* In the PRP list. 0 means small pool in use */
106 int offset; /* Of PRP list */
107 int nents; /* Used in scatterlist */
108 int length; /* Of data, in bytes */
109 dma_addr_t first_dma;
110 struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
111 struct scatterlist sg[0];
112}; 126};
113 127
128static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
129{
130 u32 val = 0;
131
132 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
133 return false;
134 return val & NVME_CSTS_RDY;
135}
136
137static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl)
138{
139 u32 val = 0;
140
141 if (ctrl->ops->io_incapable(ctrl))
142 return false;
143 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
144 return false;
145 return val & NVME_CSTS_CFS;
146}
147
148static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
149{
150 if (!ctrl->subsystem)
151 return -ENOTTY;
152 return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
153}
154
114static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) 155static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
115{ 156{
116 return (sector >> (ns->lba_shift - 9)); 157 return (sector >> (ns->lba_shift - 9));
117} 158}
118 159
160static inline void nvme_setup_flush(struct nvme_ns *ns,
161 struct nvme_command *cmnd)
162{
163 memset(cmnd, 0, sizeof(*cmnd));
164 cmnd->common.opcode = nvme_cmd_flush;
165 cmnd->common.nsid = cpu_to_le32(ns->ns_id);
166}
167
168static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
169 struct nvme_command *cmnd)
170{
171 u16 control = 0;
172 u32 dsmgmt = 0;
173
174 if (req->cmd_flags & REQ_FUA)
175 control |= NVME_RW_FUA;
176 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
177 control |= NVME_RW_LR;
178
179 if (req->cmd_flags & REQ_RAHEAD)
180 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
181
182 memset(cmnd, 0, sizeof(*cmnd));
183 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
184 cmnd->rw.command_id = req->tag;
185 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
186 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
187 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
188
189 if (ns->ms) {
190 switch (ns->pi_type) {
191 case NVME_NS_DPS_PI_TYPE3:
192 control |= NVME_RW_PRINFO_PRCHK_GUARD;
193 break;
194 case NVME_NS_DPS_PI_TYPE1:
195 case NVME_NS_DPS_PI_TYPE2:
196 control |= NVME_RW_PRINFO_PRCHK_GUARD |
197 NVME_RW_PRINFO_PRCHK_REF;
198 cmnd->rw.reftag = cpu_to_le32(
199 nvme_block_nr(ns, blk_rq_pos(req)));
200 break;
201 }
202 if (!blk_integrity_rq(req))
203 control |= NVME_RW_PRINFO_PRACT;
204 }
205
206 cmnd->rw.control = cpu_to_le16(control);
207 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
208}
209
210
211static inline int nvme_error_status(u16 status)
212{
213 switch (status & 0x7ff) {
214 case NVME_SC_SUCCESS:
215 return 0;
216 case NVME_SC_CAP_EXCEEDED:
217 return -ENOSPC;
218 default:
219 return -EIO;
220 }
221}
222
223static inline bool nvme_req_needs_retry(struct request *req, u16 status)
224{
225 return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
226 (jiffies - req->start_time) < req->timeout;
227}
228
229int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
230int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
231int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
232int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
233 const struct nvme_ctrl_ops *ops, unsigned long quirks);
234void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
235void nvme_put_ctrl(struct nvme_ctrl *ctrl);
236int nvme_init_identify(struct nvme_ctrl *ctrl);
237
238void nvme_scan_namespaces(struct nvme_ctrl *ctrl);
239void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
240
241void nvme_stop_queues(struct nvme_ctrl *ctrl);
242void nvme_start_queues(struct nvme_ctrl *ctrl);
243
244struct request *nvme_alloc_request(struct request_queue *q,
245 struct nvme_command *cmd, unsigned int flags);
246void nvme_requeue_req(struct request *req);
119int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 247int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
120 void *buf, unsigned bufflen); 248 void *buf, unsigned bufflen);
121int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 249int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
122 void *buffer, void __user *ubuffer, unsigned bufflen, 250 void *buffer, unsigned bufflen, u32 *result, unsigned timeout);
251int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
252 void __user *ubuffer, unsigned bufflen, u32 *result,
253 unsigned timeout);
254int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
255 void __user *ubuffer, unsigned bufflen,
256 void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
123 u32 *result, unsigned timeout); 257 u32 *result, unsigned timeout);
124int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); 258int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id);
125int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, 259int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
126 struct nvme_id_ns **id); 260 struct nvme_id_ns **id);
127int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); 261int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log);
128int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 262int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
129 dma_addr_t dma_addr, u32 *result); 263 dma_addr_t dma_addr, u32 *result);
130int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, 264int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
131 dma_addr_t dma_addr, u32 *result); 265 dma_addr_t dma_addr, u32 *result);
266int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
267
268extern spinlock_t dev_list_lock;
132 269
133struct sg_io_hdr; 270struct sg_io_hdr;
134 271
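The nvme_setup_rw() helper added in the hunk above converts the block layer's 512-byte sectors into namespace blocks and fills in the zero-based length field. A minimal userspace sketch of that arithmetic, assuming a hypothetical 4K-formatted namespace (the demo values below are not taken from the patch):

/*
 * Illustrative stand-alone sketch (not part of the patch): mirrors the
 * arithmetic nvme_setup_rw() uses for the slba and length fields.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t block_nr(unsigned lba_shift, uint64_t sector)
{
	/* same shift as nvme_block_nr(): 512-byte sectors -> namespace blocks */
	return sector >> (lba_shift - 9);
}

int main(void)
{
	unsigned lba_shift = 12;	/* hypothetical 4K-formatted namespace */
	uint64_t sector = 2048;		/* request starts 1 MiB into the device */
	uint32_t bytes = 64 * 1024;	/* 64 KiB transfer */

	uint64_t slba = block_nr(lba_shift, sector);
	/* zero-based block count, as the patch writes into cmnd->rw.length */
	uint16_t length = (uint16_t)((bytes >> lba_shift) - 1);

	printf("slba=%llu length=%u (%u blocks)\n",
	       (unsigned long long)slba, (unsigned)length, (unsigned)length + 1);
	return 0;
}

Running this prints slba=256 length=15, i.e. a 16-block transfer starting 1 MiB into the namespace.
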
@@ -154,4 +291,7 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i
154} 291}
155#endif /* CONFIG_NVM */ 292#endif /* CONFIG_NVM */
156 293
294int __init nvme_core_init(void);
295void nvme_core_exit(void);
296
157#endif /* _NVME_H */ 297#endif /* _NVME_H */
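With register access routed through struct nvme_ctrl_ops as declared above, helpers such as nvme_ctrl_ready() no longer touch the PCI BAR directly; each transport supplies its own accessors. A minimal userspace sketch of that indirection, using made-up demo_* names (only the CSTS offset and RDY bit mirror the values the driver uses; everything else is invented for the demo):

/*
 * Illustrative stand-alone sketch (not part of the patch): the ops-based
 * register indirection that lets generic helpers stay transport-agnostic.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_ctrl;

struct demo_ctrl_ops {
	int (*reg_read32)(struct demo_ctrl *ctrl, uint32_t off, uint32_t *val);
};

struct demo_ctrl {
	const struct demo_ctrl_ops *ops;
	uint32_t fake_bar[16];		/* stands in for the mapped PCI BAR */
};

#define DEMO_REG_CSTS	0x1c		/* controller status register offset */
#define DEMO_CSTS_RDY	(1 << 0)	/* controller ready bit */

static int demo_pci_reg_read32(struct demo_ctrl *ctrl, uint32_t off, uint32_t *val)
{
	/* a real transport would do readl(dev->bar + off) here */
	*val = ctrl->fake_bar[off / 4];
	return 0;
}

static const struct demo_ctrl_ops demo_pci_ops = {
	.reg_read32 = demo_pci_reg_read32,
};

/* Same logic as the nvme_ctrl_ready() helper added in this patch. */
static bool demo_ctrl_ready(struct demo_ctrl *ctrl)
{
	uint32_t val = 0;

	if (ctrl->ops->reg_read32(ctrl, DEMO_REG_CSTS, &val))
		return false;
	return val & DEMO_CSTS_RDY;
}

int main(void)
{
	struct demo_ctrl ctrl = { .ops = &demo_pci_ops };

	ctrl.fake_bar[DEMO_REG_CSTS / 4] = DEMO_CSTS_RDY;
	printf("ready: %d\n", demo_ctrl_ready(&ctrl));
	return 0;
}

The same shape is what allows nvme_shutdown_ctrl(), nvme_enable_ctrl() and friends, declared above, to live in core.c while pci.c only fills in the ops table.
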
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5c0e2613c7c..72ef8322d32a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -12,6 +12,7 @@
12 * more details. 12 * more details.
13 */ 13 */
14 14
15#include <linux/aer.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
16#include <linux/blkdev.h> 17#include <linux/blkdev.h>
17#include <linux/blk-mq.h> 18#include <linux/blk-mq.h>
@@ -28,10 +29,10 @@
28#include <linux/kdev_t.h> 29#include <linux/kdev_t.h>
29#include <linux/kthread.h> 30#include <linux/kthread.h>
30#include <linux/kernel.h> 31#include <linux/kernel.h>
31#include <linux/list_sort.h>
32#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/mutex.h>
35#include <linux/pci.h> 36#include <linux/pci.h>
36#include <linux/poison.h> 37#include <linux/poison.h>
37#include <linux/ptrace.h> 38#include <linux/ptrace.h>
@@ -39,23 +40,24 @@
39#include <linux/slab.h> 40#include <linux/slab.h>
40#include <linux/t10-pi.h> 41#include <linux/t10-pi.h>
41#include <linux/types.h> 42#include <linux/types.h>
42#include <linux/pr.h>
43#include <scsi/sg.h>
44#include <linux/io-64-nonatomic-lo-hi.h> 43#include <linux/io-64-nonatomic-lo-hi.h>
45#include <asm/unaligned.h> 44#include <asm/unaligned.h>
46 45
47#include <uapi/linux/nvme_ioctl.h>
48#include "nvme.h" 46#include "nvme.h"
49 47
50#define NVME_MINORS (1U << MINORBITS)
51#define NVME_Q_DEPTH 1024 48#define NVME_Q_DEPTH 1024
52#define NVME_AQ_DEPTH 256 49#define NVME_AQ_DEPTH 256
53#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 50#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
54#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 51#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
55#define ADMIN_TIMEOUT (admin_timeout * HZ) 52
56#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) 53/*
54 * We handle AEN commands ourselves and don't even let the
55 * block layer know about them.
56 */
57#define NVME_NR_AEN_COMMANDS 1
58#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
57 59
58static unsigned char admin_timeout = 60; 60unsigned char admin_timeout = 60;
59module_param(admin_timeout, byte, 0644); 61module_param(admin_timeout, byte, 0644);
60MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 62MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
61 63
@@ -63,16 +65,10 @@ unsigned char nvme_io_timeout = 30;
63module_param_named(io_timeout, nvme_io_timeout, byte, 0644); 65module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
64MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 66MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
65 67
66static unsigned char shutdown_timeout = 5; 68unsigned char shutdown_timeout = 5;
67module_param(shutdown_timeout, byte, 0644); 69module_param(shutdown_timeout, byte, 0644);
68MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 70MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
69 71
70static int nvme_major;
71module_param(nvme_major, int, 0);
72
73static int nvme_char_major;
74module_param(nvme_char_major, int, 0);
75
76static int use_threaded_interrupts; 72static int use_threaded_interrupts;
77module_param(use_threaded_interrupts, int, 0); 73module_param(use_threaded_interrupts, int, 0);
78 74
@@ -80,28 +76,60 @@ static bool use_cmb_sqes = true;
80module_param(use_cmb_sqes, bool, 0644); 76module_param(use_cmb_sqes, bool, 0644);
81MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); 77MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
82 78
83static DEFINE_SPINLOCK(dev_list_lock);
84static LIST_HEAD(dev_list); 79static LIST_HEAD(dev_list);
85static struct task_struct *nvme_thread; 80static struct task_struct *nvme_thread;
86static struct workqueue_struct *nvme_workq; 81static struct workqueue_struct *nvme_workq;
87static wait_queue_head_t nvme_kthread_wait; 82static wait_queue_head_t nvme_kthread_wait;
88 83
89static struct class *nvme_class; 84struct nvme_dev;
85struct nvme_queue;
90 86
91static int __nvme_reset(struct nvme_dev *dev);
92static int nvme_reset(struct nvme_dev *dev); 87static int nvme_reset(struct nvme_dev *dev);
93static void nvme_process_cq(struct nvme_queue *nvmeq); 88static void nvme_process_cq(struct nvme_queue *nvmeq);
94static void nvme_dead_ctrl(struct nvme_dev *dev); 89static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
90static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
95 91
96struct async_cmd_info { 92/*
97 struct kthread_work work; 93 * Represents an NVM Express device. Each nvme_dev is a PCI function.
98 struct kthread_worker *worker; 94 */
99 struct request *req; 95struct nvme_dev {
100 u32 result; 96 struct list_head node;
101 int status; 97 struct nvme_queue **queues;
102 void *ctx; 98 struct blk_mq_tag_set tagset;
99 struct blk_mq_tag_set admin_tagset;
100 u32 __iomem *dbs;
101 struct device *dev;
102 struct dma_pool *prp_page_pool;
103 struct dma_pool *prp_small_pool;
104 unsigned queue_count;
105 unsigned online_queues;
106 unsigned max_qid;
107 int q_depth;
108 u32 db_stride;
109 struct msix_entry *entry;
110 void __iomem *bar;
111 struct work_struct reset_work;
112 struct work_struct scan_work;
113 struct work_struct remove_work;
114 struct mutex shutdown_lock;
115 bool subsystem;
116 void __iomem *cmb;
117 dma_addr_t cmb_dma_addr;
118 u64 cmb_size;
119 u32 cmbsz;
120 unsigned long flags;
121
122#define NVME_CTRL_RESETTING 0
123
124 struct nvme_ctrl ctrl;
125 struct completion ioq_wait;
103}; 126};
104 127
128static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
129{
130 return container_of(ctrl, struct nvme_dev, ctrl);
131}
132
105/* 133/*
106 * An NVM Express queue. Each device has at least two (one for admin 134 * An NVM Express queue. Each device has at least two (one for admin
107 * commands and one for I/O commands). 135 * commands and one for I/O commands).
@@ -126,7 +154,24 @@ struct nvme_queue {
126 u16 qid; 154 u16 qid;
127 u8 cq_phase; 155 u8 cq_phase;
128 u8 cqe_seen; 156 u8 cqe_seen;
129 struct async_cmd_info cmdinfo; 157};
158
159/*
160 * The nvme_iod describes the data in an I/O, including the list of PRP
161 * entries. You can't see it in this data structure because C doesn't let
162 * me express that. Use nvme_init_iod to ensure there's enough space
163 * allocated to store the PRP list.
164 */
165struct nvme_iod {
166 struct nvme_queue *nvmeq;
167 int aborted;
168 int npages; /* In the PRP list. 0 means small pool in use */
169 int nents; /* Used in scatterlist */
170 int length; /* Of data, in bytes */
171 dma_addr_t first_dma;
172 struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
173 struct scatterlist *sg;
174 struct scatterlist inline_sg[0];
130}; 175};
131 176
132/* 177/*
@@ -148,23 +193,11 @@ static inline void _nvme_check_size(void)
148 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); 193 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
149} 194}
150 195
151typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
152 struct nvme_completion *);
153
154struct nvme_cmd_info {
155 nvme_completion_fn fn;
156 void *ctx;
157 int aborted;
158 struct nvme_queue *nvmeq;
159 struct nvme_iod iod[0];
160};
161
162/* 196/*
163 * Max size of iod being embedded in the request payload 197 * Max size of iod being embedded in the request payload
164 */ 198 */
165#define NVME_INT_PAGES 2 199#define NVME_INT_PAGES 2
166#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) 200#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size)
167#define NVME_INT_MASK 0x01
168 201
169/* 202/*
170 * Will slightly overestimate the number of pages needed. This is OK 203 * Will slightly overestimate the number of pages needed. This is OK
@@ -173,19 +206,22 @@ struct nvme_cmd_info {
173 */ 206 */
174static int nvme_npages(unsigned size, struct nvme_dev *dev) 207static int nvme_npages(unsigned size, struct nvme_dev *dev)
175{ 208{
176 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); 209 unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
210 dev->ctrl.page_size);
177 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 211 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
178} 212}
179 213
180static unsigned int nvme_cmd_size(struct nvme_dev *dev) 214static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
215 unsigned int size, unsigned int nseg)
181{ 216{
182 unsigned int ret = sizeof(struct nvme_cmd_info); 217 return sizeof(__le64 *) * nvme_npages(size, dev) +
183 218 sizeof(struct scatterlist) * nseg;
184 ret += sizeof(struct nvme_iod); 219}
185 ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
186 ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
187 220
188 return ret; 221static unsigned int nvme_cmd_size(struct nvme_dev *dev)
222{
223 return sizeof(struct nvme_iod) +
224 nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
189} 225}
190 226
191static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 227static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -215,11 +251,11 @@ static int nvme_admin_init_request(void *data, struct request *req,
215 unsigned int numa_node) 251 unsigned int numa_node)
216{ 252{
217 struct nvme_dev *dev = data; 253 struct nvme_dev *dev = data;
218 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 254 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
219 struct nvme_queue *nvmeq = dev->queues[0]; 255 struct nvme_queue *nvmeq = dev->queues[0];
220 256
221 BUG_ON(!nvmeq); 257 BUG_ON(!nvmeq);
222 cmd->nvmeq = nvmeq; 258 iod->nvmeq = nvmeq;
223 return 0; 259 return 0;
224} 260}
225 261
@@ -242,148 +278,36 @@ static int nvme_init_request(void *data, struct request *req,
242 unsigned int numa_node) 278 unsigned int numa_node)
243{ 279{
244 struct nvme_dev *dev = data; 280 struct nvme_dev *dev = data;
245 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 281 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
246 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 282 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
247 283
248 BUG_ON(!nvmeq); 284 BUG_ON(!nvmeq);
249 cmd->nvmeq = nvmeq; 285 iod->nvmeq = nvmeq;
250 return 0; 286 return 0;
251} 287}
252 288
253static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, 289static void nvme_complete_async_event(struct nvme_dev *dev,
254 nvme_completion_fn handler) 290 struct nvme_completion *cqe)
255{
256 cmd->fn = handler;
257 cmd->ctx = ctx;
258 cmd->aborted = 0;
259 blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
260}
261
262static void *iod_get_private(struct nvme_iod *iod)
263{
264 return (void *) (iod->private & ~0x1UL);
265}
266
267/*
268 * If bit 0 is set, the iod is embedded in the request payload.
269 */
270static bool iod_should_kfree(struct nvme_iod *iod)
271{
272 return (iod->private & NVME_INT_MASK) == 0;
273}
274
275/* Special values must be less than 0x1000 */
276#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
277#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
278#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
279#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
280
281static void special_completion(struct nvme_queue *nvmeq, void *ctx,
282 struct nvme_completion *cqe)
283{
284 if (ctx == CMD_CTX_CANCELLED)
285 return;
286 if (ctx == CMD_CTX_COMPLETED) {
287 dev_warn(nvmeq->q_dmadev,
288 "completed id %d twice on queue %d\n",
289 cqe->command_id, le16_to_cpup(&cqe->sq_id));
290 return;
291 }
292 if (ctx == CMD_CTX_INVALID) {
293 dev_warn(nvmeq->q_dmadev,
294 "invalid id %d completed on queue %d\n",
295 cqe->command_id, le16_to_cpup(&cqe->sq_id));
296 return;
297 }
298 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
299}
300
301static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
302{
303 void *ctx;
304
305 if (fn)
306 *fn = cmd->fn;
307 ctx = cmd->ctx;
308 cmd->fn = special_completion;
309 cmd->ctx = CMD_CTX_CANCELLED;
310 return ctx;
311}
312
313static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
314 struct nvme_completion *cqe)
315{ 291{
316 u32 result = le32_to_cpup(&cqe->result); 292 u16 status = le16_to_cpu(cqe->status) >> 1;
317 u16 status = le16_to_cpup(&cqe->status) >> 1; 293 u32 result = le32_to_cpu(cqe->result);
318 294
319 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) 295 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
320 ++nvmeq->dev->event_limit; 296 ++dev->ctrl.event_limit;
321 if (status != NVME_SC_SUCCESS) 297 if (status != NVME_SC_SUCCESS)
322 return; 298 return;
323 299
324 switch (result & 0xff07) { 300 switch (result & 0xff07) {
325 case NVME_AER_NOTICE_NS_CHANGED: 301 case NVME_AER_NOTICE_NS_CHANGED:
326 dev_info(nvmeq->q_dmadev, "rescanning\n"); 302 dev_info(dev->dev, "rescanning\n");
327 schedule_work(&nvmeq->dev->scan_work); 303 queue_work(nvme_workq, &dev->scan_work);
328 default: 304 default:
329 dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); 305 dev_warn(dev->dev, "async event result %08x\n", result);
330 } 306 }
331} 307}
332 308
333static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
334 struct nvme_completion *cqe)
335{
336 struct request *req = ctx;
337
338 u16 status = le16_to_cpup(&cqe->status) >> 1;
339 u32 result = le32_to_cpup(&cqe->result);
340
341 blk_mq_free_request(req);
342
343 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
344 ++nvmeq->dev->abort_limit;
345}
346
347static void async_completion(struct nvme_queue *nvmeq, void *ctx,
348 struct nvme_completion *cqe)
349{
350 struct async_cmd_info *cmdinfo = ctx;
351 cmdinfo->result = le32_to_cpup(&cqe->result);
352 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
353 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
354 blk_mq_free_request(cmdinfo->req);
355}
356
357static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
358 unsigned int tag)
359{
360 struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
361
362 return blk_mq_rq_to_pdu(req);
363}
364
365/*
366 * Called with local interrupts disabled and the q_lock held. May not sleep.
367 */
368static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
369 nvme_completion_fn *fn)
370{
371 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
372 void *ctx;
373 if (tag >= nvmeq->q_depth) {
374 *fn = special_completion;
375 return CMD_CTX_INVALID;
376 }
377 if (fn)
378 *fn = cmd->fn;
379 ctx = cmd->ctx;
380 cmd->fn = special_completion;
381 cmd->ctx = CMD_CTX_COMPLETED;
382 return ctx;
383}
384
385/** 309/**
386 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 310 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
387 * @nvmeq: The queue to use 311 * @nvmeq: The queue to use
388 * @cmd: The command to send 312 * @cmd: The command to send
389 * 313 *
@@ -405,69 +329,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
405 nvmeq->sq_tail = tail; 329 nvmeq->sq_tail = tail;
406} 330}
407 331
408static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 332static __le64 **iod_list(struct request *req)
409{
410 unsigned long flags;
411 spin_lock_irqsave(&nvmeq->q_lock, flags);
412 __nvme_submit_cmd(nvmeq, cmd);
413 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
414}
415
416static __le64 **iod_list(struct nvme_iod *iod)
417{ 333{
418 return ((void *)iod) + iod->offset; 334 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
335 return (__le64 **)(iod->sg + req->nr_phys_segments);
419} 336}
420 337
421static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, 338static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
422 unsigned nseg, unsigned long private)
423{ 339{
424 iod->private = private; 340 struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
425 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 341 int nseg = rq->nr_phys_segments;
426 iod->npages = -1; 342 unsigned size;
427 iod->length = nbytes;
428 iod->nents = 0;
429}
430
431static struct nvme_iod *
432__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
433 unsigned long priv, gfp_t gfp)
434{
435 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
436 sizeof(__le64 *) * nvme_npages(bytes, dev) +
437 sizeof(struct scatterlist) * nseg, gfp);
438
439 if (iod)
440 iod_init(iod, bytes, nseg, priv);
441
442 return iod;
443}
444
445static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
446 gfp_t gfp)
447{
448 unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
449 sizeof(struct nvme_dsm_range);
450 struct nvme_iod *iod;
451 343
452 if (rq->nr_phys_segments <= NVME_INT_PAGES && 344 if (rq->cmd_flags & REQ_DISCARD)
453 size <= NVME_INT_BYTES(dev)) { 345 size = sizeof(struct nvme_dsm_range);
454 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); 346 else
347 size = blk_rq_bytes(rq);
455 348
456 iod = cmd->iod; 349 if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
457 iod_init(iod, size, rq->nr_phys_segments, 350 iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
458 (unsigned long) rq | NVME_INT_MASK); 351 if (!iod->sg)
459 return iod; 352 return BLK_MQ_RQ_QUEUE_BUSY;
353 } else {
354 iod->sg = iod->inline_sg;
460 } 355 }
461 356
462 return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, 357 iod->aborted = 0;
463 (unsigned long) rq, gfp); 358 iod->npages = -1;
359 iod->nents = 0;
360 iod->length = size;
361 return 0;
464} 362}
465 363
466static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 364static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
467{ 365{
468 const int last_prp = dev->page_size / 8 - 1; 366 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
367 const int last_prp = dev->ctrl.page_size / 8 - 1;
469 int i; 368 int i;
470 __le64 **list = iod_list(iod); 369 __le64 **list = iod_list(req);
471 dma_addr_t prp_dma = iod->first_dma; 370 dma_addr_t prp_dma = iod->first_dma;
472 371
473 if (iod->npages == 0) 372 if (iod->npages == 0)
@@ -479,20 +378,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
479 prp_dma = next_prp_dma; 378 prp_dma = next_prp_dma;
480 } 379 }
481 380
482 if (iod_should_kfree(iod)) 381 if (iod->sg != iod->inline_sg)
483 kfree(iod); 382 kfree(iod->sg);
484}
485
486static int nvme_error_status(u16 status)
487{
488 switch (status & 0x7ff) {
489 case NVME_SC_SUCCESS:
490 return 0;
491 case NVME_SC_CAP_EXCEEDED:
492 return -ENOSPC;
493 default:
494 return -EIO;
495 }
496} 383}
497 384
498#ifdef CONFIG_BLK_DEV_INTEGRITY 385#ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -549,27 +436,6 @@ static void nvme_dif_remap(struct request *req,
549 } 436 }
550 kunmap_atomic(pmap); 437 kunmap_atomic(pmap);
551} 438}
552
553static void nvme_init_integrity(struct nvme_ns *ns)
554{
555 struct blk_integrity integrity;
556
557 switch (ns->pi_type) {
558 case NVME_NS_DPS_PI_TYPE3:
559 integrity.profile = &t10_pi_type3_crc;
560 break;
561 case NVME_NS_DPS_PI_TYPE1:
562 case NVME_NS_DPS_PI_TYPE2:
563 integrity.profile = &t10_pi_type1_crc;
564 break;
565 default:
566 integrity.profile = NULL;
567 break;
568 }
569 integrity.tuple_size = ns->ms;
570 blk_integrity_register(ns->disk, &integrity);
571 blk_queue_max_integrity_segments(ns->queue, 1);
572}
573#else /* CONFIG_BLK_DEV_INTEGRITY */ 439#else /* CONFIG_BLK_DEV_INTEGRITY */
574static void nvme_dif_remap(struct request *req, 440static void nvme_dif_remap(struct request *req,
575 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) 441 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
@@ -581,91 +447,27 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
581static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) 447static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
582{ 448{
583} 449}
584static void nvme_init_integrity(struct nvme_ns *ns)
585{
586}
587#endif 450#endif
588 451
589static void req_completion(struct nvme_queue *nvmeq, void *ctx, 452static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
590 struct nvme_completion *cqe) 453 int total_len)
591{
592 struct nvme_iod *iod = ctx;
593 struct request *req = iod_get_private(iod);
594 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
595 u16 status = le16_to_cpup(&cqe->status) >> 1;
596 bool requeue = false;
597 int error = 0;
598
599 if (unlikely(status)) {
600 if (!(status & NVME_SC_DNR || blk_noretry_request(req))
601 && (jiffies - req->start_time) < req->timeout) {
602 unsigned long flags;
603
604 requeue = true;
605 blk_mq_requeue_request(req);
606 spin_lock_irqsave(req->q->queue_lock, flags);
607 if (!blk_queue_stopped(req->q))
608 blk_mq_kick_requeue_list(req->q);
609 spin_unlock_irqrestore(req->q->queue_lock, flags);
610 goto release_iod;
611 }
612
613 if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
614 if (cmd_rq->ctx == CMD_CTX_CANCELLED)
615 error = -EINTR;
616 else
617 error = status;
618 } else {
619 error = nvme_error_status(status);
620 }
621 }
622
623 if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
624 u32 result = le32_to_cpup(&cqe->result);
625 req->special = (void *)(uintptr_t)result;
626 }
627
628 if (cmd_rq->aborted)
629 dev_warn(nvmeq->dev->dev,
630 "completing aborted command with status:%04x\n",
631 error);
632
633release_iod:
634 if (iod->nents) {
635 dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
636 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
637 if (blk_integrity_rq(req)) {
638 if (!rq_data_dir(req))
639 nvme_dif_remap(req, nvme_dif_complete);
640 dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
641 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
642 }
643 }
644 nvme_free_iod(nvmeq->dev, iod);
645
646 if (likely(!requeue))
647 blk_mq_complete_request(req, error);
648}
649
650/* length is in bytes. gfp flags indicates whether we may sleep. */
651static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
652 int total_len, gfp_t gfp)
653{ 454{
455 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
654 struct dma_pool *pool; 456 struct dma_pool *pool;
655 int length = total_len; 457 int length = total_len;
656 struct scatterlist *sg = iod->sg; 458 struct scatterlist *sg = iod->sg;
657 int dma_len = sg_dma_len(sg); 459 int dma_len = sg_dma_len(sg);
658 u64 dma_addr = sg_dma_address(sg); 460 u64 dma_addr = sg_dma_address(sg);
659 u32 page_size = dev->page_size; 461 u32 page_size = dev->ctrl.page_size;
660 int offset = dma_addr & (page_size - 1); 462 int offset = dma_addr & (page_size - 1);
661 __le64 *prp_list; 463 __le64 *prp_list;
662 __le64 **list = iod_list(iod); 464 __le64 **list = iod_list(req);
663 dma_addr_t prp_dma; 465 dma_addr_t prp_dma;
664 int nprps, i; 466 int nprps, i;
665 467
666 length -= (page_size - offset); 468 length -= (page_size - offset);
667 if (length <= 0) 469 if (length <= 0)
668 return total_len; 470 return true;
669 471
670 dma_len -= (page_size - offset); 472 dma_len -= (page_size - offset);
671 if (dma_len) { 473 if (dma_len) {
@@ -678,7 +480,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
678 480
679 if (length <= page_size) { 481 if (length <= page_size) {
680 iod->first_dma = dma_addr; 482 iod->first_dma = dma_addr;
681 return total_len; 483 return true;
682 } 484 }
683 485
684 nprps = DIV_ROUND_UP(length, page_size); 486 nprps = DIV_ROUND_UP(length, page_size);
@@ -690,11 +492,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
690 iod->npages = 1; 492 iod->npages = 1;
691 } 493 }
692 494
693 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 495 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
694 if (!prp_list) { 496 if (!prp_list) {
695 iod->first_dma = dma_addr; 497 iod->first_dma = dma_addr;
696 iod->npages = -1; 498 iod->npages = -1;
697 return (total_len - length) + page_size; 499 return false;
698 } 500 }
699 list[0] = prp_list; 501 list[0] = prp_list;
700 iod->first_dma = prp_dma; 502 iod->first_dma = prp_dma;
@@ -702,9 +504,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
702 for (;;) { 504 for (;;) {
703 if (i == page_size >> 3) { 505 if (i == page_size >> 3) {
704 __le64 *old_prp_list = prp_list; 506 __le64 *old_prp_list = prp_list;
705 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 507 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
706 if (!prp_list) 508 if (!prp_list)
707 return total_len - length; 509 return false;
708 list[iod->npages++] = prp_list; 510 list[iod->npages++] = prp_list;
709 prp_list[0] = old_prp_list[i - 1]; 511 prp_list[0] = old_prp_list[i - 1];
710 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 512 old_prp_list[i - 1] = cpu_to_le64(prp_dma);
@@ -724,115 +526,105 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
724 dma_len = sg_dma_len(sg); 526 dma_len = sg_dma_len(sg);
725 } 527 }
726 528
727 return total_len; 529 return true;
728} 530}
729 531
730static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, 532static int nvme_map_data(struct nvme_dev *dev, struct request *req,
731 struct nvme_iod *iod) 533 struct nvme_command *cmnd)
732{ 534{
733 struct nvme_command cmnd; 535 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
536 struct request_queue *q = req->q;
537 enum dma_data_direction dma_dir = rq_data_dir(req) ?
538 DMA_TO_DEVICE : DMA_FROM_DEVICE;
539 int ret = BLK_MQ_RQ_QUEUE_ERROR;
734 540
735 memcpy(&cmnd, req->cmd, sizeof(cmnd)); 541 sg_init_table(iod->sg, req->nr_phys_segments);
736 cmnd.rw.command_id = req->tag; 542 iod->nents = blk_rq_map_sg(q, req, iod->sg);
737 if (req->nr_phys_segments) { 543 if (!iod->nents)
738 cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 544 goto out;
739 cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
740 }
741 545
742 __nvme_submit_cmd(nvmeq, &cmnd); 546 ret = BLK_MQ_RQ_QUEUE_BUSY;
743} 547 if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
548 goto out;
744 549
745/* 550 if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
746 * We reuse the small pool to allocate the 16-byte range here as it is not 551 goto out_unmap;
747 * worth having a special pool for these or additional cases to handle freeing
748 * the iod.
749 */
750static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
751 struct request *req, struct nvme_iod *iod)
752{
753 struct nvme_dsm_range *range =
754 (struct nvme_dsm_range *)iod_list(iod)[0];
755 struct nvme_command cmnd;
756 552
757 range->cattr = cpu_to_le32(0); 553 ret = BLK_MQ_RQ_QUEUE_ERROR;
758 range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); 554 if (blk_integrity_rq(req)) {
759 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 555 if (blk_rq_count_integrity_sg(q, req->bio) != 1)
556 goto out_unmap;
760 557
761 memset(&cmnd, 0, sizeof(cmnd)); 558 sg_init_table(&iod->meta_sg, 1);
762 cmnd.dsm.opcode = nvme_cmd_dsm; 559 if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
763 cmnd.dsm.command_id = req->tag; 560 goto out_unmap;
764 cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
765 cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
766 cmnd.dsm.nr = 0;
767 cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
768 561
769 __nvme_submit_cmd(nvmeq, &cmnd); 562 if (rq_data_dir(req))
770} 563 nvme_dif_remap(req, nvme_dif_prep);
771 564
772static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 565 if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
773 int cmdid) 566 goto out_unmap;
774{ 567 }
775 struct nvme_command cmnd;
776 568
777 memset(&cmnd, 0, sizeof(cmnd)); 569 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
778 cmnd.common.opcode = nvme_cmd_flush; 570 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
779 cmnd.common.command_id = cmdid; 571 if (blk_integrity_rq(req))
780 cmnd.common.nsid = cpu_to_le32(ns->ns_id); 572 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
573 return BLK_MQ_RQ_QUEUE_OK;
781 574
782 __nvme_submit_cmd(nvmeq, &cmnd); 575out_unmap:
576 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
577out:
578 return ret;
783} 579}
784 580
785static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, 581static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
786 struct nvme_ns *ns)
787{ 582{
788 struct request *req = iod_get_private(iod); 583 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
789 struct nvme_command cmnd; 584 enum dma_data_direction dma_dir = rq_data_dir(req) ?
790 u16 control = 0; 585 DMA_TO_DEVICE : DMA_FROM_DEVICE;
791 u32 dsmgmt = 0; 586
792 587 if (iod->nents) {
793 if (req->cmd_flags & REQ_FUA) 588 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
794 control |= NVME_RW_FUA; 589 if (blk_integrity_rq(req)) {
795 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 590 if (!rq_data_dir(req))
796 control |= NVME_RW_LR; 591 nvme_dif_remap(req, nvme_dif_complete);
797 592 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
798 if (req->cmd_flags & REQ_RAHEAD)
799 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
800
801 memset(&cmnd, 0, sizeof(cmnd));
802 cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
803 cmnd.rw.command_id = req->tag;
804 cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
805 cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
806 cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
807 cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
808 cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
809
810 if (ns->ms) {
811 switch (ns->pi_type) {
812 case NVME_NS_DPS_PI_TYPE3:
813 control |= NVME_RW_PRINFO_PRCHK_GUARD;
814 break;
815 case NVME_NS_DPS_PI_TYPE1:
816 case NVME_NS_DPS_PI_TYPE2:
817 control |= NVME_RW_PRINFO_PRCHK_GUARD |
818 NVME_RW_PRINFO_PRCHK_REF;
819 cmnd.rw.reftag = cpu_to_le32(
820 nvme_block_nr(ns, blk_rq_pos(req)));
821 break;
822 } 593 }
823 if (blk_integrity_rq(req))
824 cmnd.rw.metadata =
825 cpu_to_le64(sg_dma_address(iod->meta_sg));
826 else
827 control |= NVME_RW_PRINFO_PRACT;
828 } 594 }
829 595
830 cmnd.rw.control = cpu_to_le16(control); 596 nvme_free_iod(dev, req);
831 cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); 597}
832 598
833 __nvme_submit_cmd(nvmeq, &cmnd); 599/*
600 * We reuse the small pool to allocate the 16-byte range here as it is not
601 * worth having a special pool for these or additional cases to handle freeing
602 * the iod.
603 */
604static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
605 struct request *req, struct nvme_command *cmnd)
606{
607 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
608 struct nvme_dsm_range *range;
834 609
835 return 0; 610 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
611 &iod->first_dma);
612 if (!range)
613 return BLK_MQ_RQ_QUEUE_BUSY;
614 iod_list(req)[0] = (__le64 *)range;
615 iod->npages = 0;
616
617 range->cattr = cpu_to_le32(0);
618 range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
619 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
620
621 memset(cmnd, 0, sizeof(*cmnd));
622 cmnd->dsm.opcode = nvme_cmd_dsm;
623 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
624 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
625 cmnd->dsm.nr = 0;
626 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
627 return BLK_MQ_RQ_QUEUE_OK;
836} 628}
837 629
838/* 630/*
@@ -845,9 +637,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
845 struct nvme_queue *nvmeq = hctx->driver_data; 637 struct nvme_queue *nvmeq = hctx->driver_data;
846 struct nvme_dev *dev = nvmeq->dev; 638 struct nvme_dev *dev = nvmeq->dev;
847 struct request *req = bd->rq; 639 struct request *req = bd->rq;
848 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 640 struct nvme_command cmnd;
849 struct nvme_iod *iod; 641 int ret = BLK_MQ_RQ_QUEUE_OK;
850 enum dma_data_direction dma_dir;
851 642
852 /* 643 /*
853 * If formated with metadata, require the block layer provide a buffer 644 * If formated with metadata, require the block layer provide a buffer
@@ -857,91 +648,72 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
857 if (ns && ns->ms && !blk_integrity_rq(req)) { 648 if (ns && ns->ms && !blk_integrity_rq(req)) {
858 if (!(ns->pi_type && ns->ms == 8) && 649 if (!(ns->pi_type && ns->ms == 8) &&
859 req->cmd_type != REQ_TYPE_DRV_PRIV) { 650 req->cmd_type != REQ_TYPE_DRV_PRIV) {
860 blk_mq_complete_request(req, -EFAULT); 651 blk_mq_end_request(req, -EFAULT);
861 return BLK_MQ_RQ_QUEUE_OK; 652 return BLK_MQ_RQ_QUEUE_OK;
862 } 653 }
863 } 654 }
864 655
865 iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); 656 ret = nvme_init_iod(req, dev);
866 if (!iod) 657 if (ret)
867 return BLK_MQ_RQ_QUEUE_BUSY; 658 return ret;
868 659
869 if (req->cmd_flags & REQ_DISCARD) { 660 if (req->cmd_flags & REQ_DISCARD) {
870 void *range; 661 ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
871 /* 662 } else {
872 * We reuse the small pool to allocate the 16-byte range here 663 if (req->cmd_type == REQ_TYPE_DRV_PRIV)
873 * as it is not worth having a special pool for these or 664 memcpy(&cmnd, req->cmd, sizeof(cmnd));
874 * additional cases to handle freeing the iod. 665 else if (req->cmd_flags & REQ_FLUSH)
875 */ 666 nvme_setup_flush(ns, &cmnd);
876 range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, 667 else
877 &iod->first_dma); 668 nvme_setup_rw(ns, req, &cmnd);
878 if (!range)
879 goto retry_cmd;
880 iod_list(iod)[0] = (__le64 *)range;
881 iod->npages = 0;
882 } else if (req->nr_phys_segments) {
883 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
884 669
885 sg_init_table(iod->sg, req->nr_phys_segments); 670 if (req->nr_phys_segments)
886 iod->nents = blk_rq_map_sg(req->q, req, iod->sg); 671 ret = nvme_map_data(dev, req, &cmnd);
887 if (!iod->nents) 672 }
888 goto error_cmd;
889 673
890 if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) 674 if (ret)
891 goto retry_cmd; 675 goto out;
892 676
893 if (blk_rq_bytes(req) != 677 cmnd.common.command_id = req->tag;
894 nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { 678 blk_mq_start_request(req);
895 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
896 goto retry_cmd;
897 }
898 if (blk_integrity_rq(req)) {
899 if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
900 dma_unmap_sg(dev->dev, iod->sg, iod->nents,
901 dma_dir);
902 goto error_cmd;
903 }
904 679
905 sg_init_table(iod->meta_sg, 1); 680 spin_lock_irq(&nvmeq->q_lock);
906 if (blk_rq_map_integrity_sg( 681 __nvme_submit_cmd(nvmeq, &cmnd);
907 req->q, req->bio, iod->meta_sg) != 1) { 682 nvme_process_cq(nvmeq);
908 dma_unmap_sg(dev->dev, iod->sg, iod->nents, 683 spin_unlock_irq(&nvmeq->q_lock);
909 dma_dir); 684 return BLK_MQ_RQ_QUEUE_OK;
910 goto error_cmd; 685out:
911 } 686 nvme_free_iod(dev, req);
687 return ret;
688}
912 689
913 if (rq_data_dir(req)) 690static void nvme_complete_rq(struct request *req)
914 nvme_dif_remap(req, nvme_dif_prep); 691{
692 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
693 struct nvme_dev *dev = iod->nvmeq->dev;
694 int error = 0;
915 695
916 if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) { 696 nvme_unmap_data(dev, req);
917 dma_unmap_sg(dev->dev, iod->sg, iod->nents, 697
918 dma_dir); 698 if (unlikely(req->errors)) {
919 goto error_cmd; 699 if (nvme_req_needs_retry(req, req->errors)) {
920 } 700 nvme_requeue_req(req);
701 return;
921 } 702 }
922 }
923 703
924 nvme_set_info(cmd, iod, req_completion); 704 if (req->cmd_type == REQ_TYPE_DRV_PRIV)
925 spin_lock_irq(&nvmeq->q_lock); 705 error = req->errors;
926 if (req->cmd_type == REQ_TYPE_DRV_PRIV) 706 else
927 nvme_submit_priv(nvmeq, req, iod); 707 error = nvme_error_status(req->errors);
928 else if (req->cmd_flags & REQ_DISCARD) 708 }
929 nvme_submit_discard(nvmeq, ns, req, iod);
930 else if (req->cmd_flags & REQ_FLUSH)
931 nvme_submit_flush(nvmeq, ns, req->tag);
932 else
933 nvme_submit_iod(nvmeq, iod, ns);
934 709
935 nvme_process_cq(nvmeq); 710 if (unlikely(iod->aborted)) {
936 spin_unlock_irq(&nvmeq->q_lock); 711 dev_warn(dev->dev,
937 return BLK_MQ_RQ_QUEUE_OK; 712 "completing aborted command with status: %04x\n",
713 req->errors);
714 }
938 715
939 error_cmd: 716 blk_mq_end_request(req, error);
940 nvme_free_iod(dev, iod);
941 return BLK_MQ_RQ_QUEUE_ERROR;
942 retry_cmd:
943 nvme_free_iod(dev, iod);
944 return BLK_MQ_RQ_QUEUE_BUSY;
945} 717}
946 718
947static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) 719static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
@@ -952,20 +724,47 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
952 phase = nvmeq->cq_phase; 724 phase = nvmeq->cq_phase;
953 725
954 for (;;) { 726 for (;;) {
955 void *ctx;
956 nvme_completion_fn fn;
957 struct nvme_completion cqe = nvmeq->cqes[head]; 727 struct nvme_completion cqe = nvmeq->cqes[head];
958 if ((le16_to_cpu(cqe.status) & 1) != phase) 728 u16 status = le16_to_cpu(cqe.status);
729 struct request *req;
730
731 if ((status & 1) != phase)
959 break; 732 break;
960 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 733 nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
961 if (++head == nvmeq->q_depth) { 734 if (++head == nvmeq->q_depth) {
962 head = 0; 735 head = 0;
963 phase = !phase; 736 phase = !phase;
964 } 737 }
738
965 if (tag && *tag == cqe.command_id) 739 if (tag && *tag == cqe.command_id)
966 *tag = -1; 740 *tag = -1;
967 ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); 741
968 fn(nvmeq, ctx, &cqe); 742 if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
743 dev_warn(nvmeq->q_dmadev,
744 "invalid id %d completed on queue %d\n",
745 cqe.command_id, le16_to_cpu(cqe.sq_id));
746 continue;
747 }
748
749 /*
750 * AEN requests are special as they don't time out and can
751 * survive any kind of queue freeze and often don't respond to
752 * aborts. We don't even bother to allocate a struct request
753 * for them but rather special case them here.
754 */
755 if (unlikely(nvmeq->qid == 0 &&
756 cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
757 nvme_complete_async_event(nvmeq->dev, &cqe);
758 continue;
759 }
760
761 req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
762 if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
763 u32 result = le32_to_cpu(cqe.result);
764 req->special = (void *)(uintptr_t)result;
765 }
766 blk_mq_complete_request(req, status >> 1);
767
969 } 768 }
970 769
971 /* If the controller ignores the cq head doorbell and continuously 770 /* If the controller ignores the cq head doorbell and continuously
@@ -1028,112 +827,15 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1028 return 0; 827 return 0;
1029} 828}
1030 829
1031/* 830static void nvme_submit_async_event(struct nvme_dev *dev)
1032 * Returns 0 on success. If the result is negative, it's a Linux error code;
1033 * if the result is positive, it's an NVM Express status code
1034 */
1035int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1036 void *buffer, void __user *ubuffer, unsigned bufflen,
1037 u32 *result, unsigned timeout)
1038{
1039 bool write = cmd->common.opcode & 1;
1040 struct bio *bio = NULL;
1041 struct request *req;
1042 int ret;
1043
1044 req = blk_mq_alloc_request(q, write, 0);
1045 if (IS_ERR(req))
1046 return PTR_ERR(req);
1047
1048 req->cmd_type = REQ_TYPE_DRV_PRIV;
1049 req->cmd_flags |= REQ_FAILFAST_DRIVER;
1050 req->__data_len = 0;
1051 req->__sector = (sector_t) -1;
1052 req->bio = req->biotail = NULL;
1053
1054 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1055
1056 req->cmd = (unsigned char *)cmd;
1057 req->cmd_len = sizeof(struct nvme_command);
1058 req->special = (void *)0;
1059
1060 if (buffer && bufflen) {
1061 ret = blk_rq_map_kern(q, req, buffer, bufflen,
1062 __GFP_DIRECT_RECLAIM);
1063 if (ret)
1064 goto out;
1065 } else if (ubuffer && bufflen) {
1066 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
1067 __GFP_DIRECT_RECLAIM);
1068 if (ret)
1069 goto out;
1070 bio = req->bio;
1071 }
1072
1073 blk_execute_rq(req->q, NULL, req, 0);
1074 if (bio)
1075 blk_rq_unmap_user(bio);
1076 if (result)
1077 *result = (u32)(uintptr_t)req->special;
1078 ret = req->errors;
1079 out:
1080 blk_mq_free_request(req);
1081 return ret;
1082}
1083
1084int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1085 void *buffer, unsigned bufflen)
1086{
1087 return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
1088}
1089
1090static int nvme_submit_async_admin_req(struct nvme_dev *dev)
1091{ 831{
1092 struct nvme_queue *nvmeq = dev->queues[0];
1093 struct nvme_command c; 832 struct nvme_command c;
1094 struct nvme_cmd_info *cmd_info;
1095 struct request *req;
1096
1097 req = blk_mq_alloc_request(dev->admin_q, WRITE,
1098 BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
1099 if (IS_ERR(req))
1100 return PTR_ERR(req);
1101
1102 req->cmd_flags |= REQ_NO_TIMEOUT;
1103 cmd_info = blk_mq_rq_to_pdu(req);
1104 nvme_set_info(cmd_info, NULL, async_req_completion);
1105 833
1106 memset(&c, 0, sizeof(c)); 834 memset(&c, 0, sizeof(c));
1107 c.common.opcode = nvme_admin_async_event; 835 c.common.opcode = nvme_admin_async_event;
1108 c.common.command_id = req->tag; 836 c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;
1109
1110 blk_mq_free_request(req);
1111 __nvme_submit_cmd(nvmeq, &c);
1112 return 0;
1113}
1114
1115static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
1116 struct nvme_command *cmd,
1117 struct async_cmd_info *cmdinfo, unsigned timeout)
1118{
1119 struct nvme_queue *nvmeq = dev->queues[0];
1120 struct request *req;
1121 struct nvme_cmd_info *cmd_rq;
1122
1123 req = blk_mq_alloc_request(dev->admin_q, WRITE, 0);
1124 if (IS_ERR(req))
1125 return PTR_ERR(req);
1126
1127 req->timeout = timeout;
1128 cmd_rq = blk_mq_rq_to_pdu(req);
1129 cmdinfo->req = req;
1130 nvme_set_info(cmd_rq, cmdinfo, async_completion);
1131 cmdinfo->status = -EINTR;
1132 837
1133 cmd->common.command_id = req->tag; 838 __nvme_submit_cmd(dev->queues[0], &c);
1134
1135 nvme_submit_cmd(nvmeq, cmd);
1136 return 0;
1137} 839}
1138 840
1139static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 841static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1144,7 +846,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1144 c.delete_queue.opcode = opcode; 846 c.delete_queue.opcode = opcode;
1145 c.delete_queue.qid = cpu_to_le16(id); 847 c.delete_queue.qid = cpu_to_le16(id);
1146 848
1147 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 849 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1148} 850}
1149 851
1150static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 852static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1165,7 +867,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1165 c.create_cq.cq_flags = cpu_to_le16(flags); 867 c.create_cq.cq_flags = cpu_to_le16(flags);
1166 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 868 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
1167 869
1168 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 870 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1169} 871}
1170 872
1171static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 873static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1186,7 +888,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1186 c.create_sq.sq_flags = cpu_to_le16(flags); 888 c.create_sq.sq_flags = cpu_to_le16(flags);
1187 c.create_sq.cqid = cpu_to_le16(qid); 889 c.create_sq.cqid = cpu_to_le16(qid);
1188 890
1189 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 891 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1190} 892}
1191 893
1192static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 894static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1199,195 +901,111 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
1199 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 901 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
1200} 902}
1201 903
1202int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) 904static void abort_endio(struct request *req, int error)
1203{
1204 struct nvme_command c = { };
1205 int error;
1206
1207 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1208 c.identify.opcode = nvme_admin_identify;
1209 c.identify.cns = cpu_to_le32(1);
1210
1211 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1212 if (!*id)
1213 return -ENOMEM;
1214
1215 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1216 sizeof(struct nvme_id_ctrl));
1217 if (error)
1218 kfree(*id);
1219 return error;
1220}
1221
1222int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
1223 struct nvme_id_ns **id)
1224{
1225 struct nvme_command c = { };
1226 int error;
1227
1228 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1229 c.identify.opcode = nvme_admin_identify,
1230 c.identify.nsid = cpu_to_le32(nsid),
1231
1232 *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
1233 if (!*id)
1234 return -ENOMEM;
1235
1236 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1237 sizeof(struct nvme_id_ns));
1238 if (error)
1239 kfree(*id);
1240 return error;
1241}
1242
1243int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
1244 dma_addr_t dma_addr, u32 *result)
1245{
1246 struct nvme_command c;
1247
1248 memset(&c, 0, sizeof(c));
1249 c.features.opcode = nvme_admin_get_features;
1250 c.features.nsid = cpu_to_le32(nsid);
1251 c.features.prp1 = cpu_to_le64(dma_addr);
1252 c.features.fid = cpu_to_le32(fid);
1253
1254 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1255 result, 0);
1256}
1257
1258int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1259 dma_addr_t dma_addr, u32 *result)
1260{
1261 struct nvme_command c;
1262
1263 memset(&c, 0, sizeof(c));
1264 c.features.opcode = nvme_admin_set_features;
1265 c.features.prp1 = cpu_to_le64(dma_addr);
1266 c.features.fid = cpu_to_le32(fid);
1267 c.features.dword11 = cpu_to_le32(dword11);
1268
1269 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1270 result, 0);
1271}
1272
1273int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
1274{ 905{
1275 struct nvme_command c = { }; 906 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
1276 int error; 907 struct nvme_queue *nvmeq = iod->nvmeq;
1277 908 u32 result = (u32)(uintptr_t)req->special;
1278 c.common.opcode = nvme_admin_get_log_page, 909 u16 status = req->errors;
1279 c.common.nsid = cpu_to_le32(0xFFFFFFFF),
1280 c.common.cdw10[0] = cpu_to_le32(
1281 (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
1282 NVME_LOG_SMART),
1283 910
1284 *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); 911 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
1285 if (!*log) 912 atomic_inc(&nvmeq->dev->ctrl.abort_limit);
1286 return -ENOMEM;
1287 913
1288 error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, 914 blk_mq_free_request(req);
1289 sizeof(struct nvme_smart_log));
1290 if (error)
1291 kfree(*log);
1292 return error;
1293} 915}
1294 916
1295/** 917static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1296 * nvme_abort_req - Attempt aborting a request
1297 *
1298 * Schedule controller reset if the command was already aborted once before and
1299 * still hasn't been returned to the driver, or if this is the admin queue.
1300 */
1301static void nvme_abort_req(struct request *req)
1302{ 918{
1303 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 919 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
1304 struct nvme_queue *nvmeq = cmd_rq->nvmeq; 920 struct nvme_queue *nvmeq = iod->nvmeq;
1305 struct nvme_dev *dev = nvmeq->dev; 921 struct nvme_dev *dev = nvmeq->dev;
1306 struct request *abort_req; 922 struct request *abort_req;
1307 struct nvme_cmd_info *abort_cmd;
1308 struct nvme_command cmd; 923 struct nvme_command cmd;
1309 924
1310 if (!nvmeq->qid || cmd_rq->aborted) { 925 /*
1311 spin_lock(&dev_list_lock); 926 * Shutdown immediately if controller times out while starting. The
1312 if (!__nvme_reset(dev)) { 927 * reset work will see the pci device disabled when it gets the forced
1313 dev_warn(dev->dev, 928 * cancellation error. All outstanding requests are completed on
1314 "I/O %d QID %d timeout, reset controller\n", 929 * shutdown, so we return BLK_EH_HANDLED.
1315 req->tag, nvmeq->qid); 930 */
1316 } 931 if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
1317 spin_unlock(&dev_list_lock); 932 dev_warn(dev->dev,
1318 return; 933 "I/O %d QID %d timeout, disable controller\n",
934 req->tag, nvmeq->qid);
935 nvme_dev_disable(dev, false);
936 req->errors = NVME_SC_CANCELLED;
937 return BLK_EH_HANDLED;
1319 } 938 }
1320 939
1321 if (!dev->abort_limit) 940 /*
1322 return; 941 * Shutdown the controller immediately and schedule a reset if the
942 * command was already aborted once before and still hasn't been
943 * returned to the driver, or if this is the admin queue.
944 */
945 if (!nvmeq->qid || iod->aborted) {
946 dev_warn(dev->dev,
947 "I/O %d QID %d timeout, reset controller\n",
948 req->tag, nvmeq->qid);
949 nvme_dev_disable(dev, false);
950 queue_work(nvme_workq, &dev->reset_work);
1323 951
1324 abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, 952 /*
1325 BLK_MQ_REQ_NOWAIT); 953 * Mark the request as handled, since the inline shutdown
1326 if (IS_ERR(abort_req)) 954 * forces all outstanding requests to complete.
1327 return; 955 */
956 req->errors = NVME_SC_CANCELLED;
957 return BLK_EH_HANDLED;
958 }
1328 959
1329 abort_cmd = blk_mq_rq_to_pdu(abort_req); 960 iod->aborted = 1;
1330 nvme_set_info(abort_cmd, abort_req, abort_completion); 961
962 if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
963 atomic_inc(&dev->ctrl.abort_limit);
964 return BLK_EH_RESET_TIMER;
965 }
1331 966
1332 memset(&cmd, 0, sizeof(cmd)); 967 memset(&cmd, 0, sizeof(cmd));
1333 cmd.abort.opcode = nvme_admin_abort_cmd; 968 cmd.abort.opcode = nvme_admin_abort_cmd;
1334 cmd.abort.cid = req->tag; 969 cmd.abort.cid = req->tag;
1335 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 970 cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
1336 cmd.abort.command_id = abort_req->tag;
1337 971
1338 --dev->abort_limit; 972 dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
1339 cmd_rq->aborted = 1; 973 req->tag, nvmeq->qid);
1340 974
1341 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, 975 abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
1342 nvmeq->qid); 976 BLK_MQ_REQ_NOWAIT);
1343 nvme_submit_cmd(dev->queues[0], &cmd); 977 if (IS_ERR(abort_req)) {
978 atomic_inc(&dev->ctrl.abort_limit);
979 return BLK_EH_RESET_TIMER;
980 }
981
982 abort_req->timeout = ADMIN_TIMEOUT;
983 abort_req->end_io_data = NULL;
984 blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
985
986 /*
987 * The aborted req will be completed on receiving the abort req.
988 * We enable the timer again. If hit twice, it'll cause a device reset,
989 * as the device then is in a faulty state.
990 */
991 return BLK_EH_RESET_TIMER;
1344} 992}
1345 993
1346static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) 994static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
1347{ 995{
1348 struct nvme_queue *nvmeq = data; 996 struct nvme_queue *nvmeq = data;
1349 void *ctx; 997 int status;
1350 nvme_completion_fn fn;
1351 struct nvme_cmd_info *cmd;
1352 struct nvme_completion cqe;
1353 998
1354 if (!blk_mq_request_started(req)) 999 if (!blk_mq_request_started(req))
1355 return; 1000 return;
1356 1001
1357 cmd = blk_mq_rq_to_pdu(req); 1002 dev_warn(nvmeq->q_dmadev,
1358 1003 "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
1359 if (cmd->ctx == CMD_CTX_CANCELLED)
1360 return;
1361 1004
1005 status = NVME_SC_ABORT_REQ;
1362 if (blk_queue_dying(req->q)) 1006 if (blk_queue_dying(req->q))
1363 cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); 1007 status |= NVME_SC_DNR;
1364 else 1008 blk_mq_complete_request(req, status);
1365 cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
1366
1367
1368 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
1369 req->tag, nvmeq->qid);
1370 ctx = cancel_cmd_info(cmd, &fn);
1371 fn(nvmeq, ctx, &cqe);
1372}
1373
1374static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1375{
1376 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
1377 struct nvme_queue *nvmeq = cmd->nvmeq;
1378
1379 dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
1380 nvmeq->qid);
1381 spin_lock_irq(&nvmeq->q_lock);
1382 nvme_abort_req(req);
1383 spin_unlock_irq(&nvmeq->q_lock);
1384
1385 /*
1386 * The aborted req will be completed on receiving the abort req.
1387 * We enable the timer again. If hit twice, it'll cause a device reset,
1388 * as the device then is in a faulty state.
1389 */
1390 return BLK_EH_RESET_TIMER;
1391} 1009}
1392 1010
1393static void nvme_free_queue(struct nvme_queue *nvmeq) 1011static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1430,8 +1048,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1430 nvmeq->cq_vector = -1; 1048 nvmeq->cq_vector = -1;
1431 spin_unlock_irq(&nvmeq->q_lock); 1049 spin_unlock_irq(&nvmeq->q_lock);
1432 1050
1433 if (!nvmeq->qid && nvmeq->dev->admin_q) 1051 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1434 blk_mq_freeze_queue_start(nvmeq->dev->admin_q); 1052 blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q);
1435 1053
1436 irq_set_affinity_hint(vector, NULL); 1054 irq_set_affinity_hint(vector, NULL);
1437 free_irq(vector, nvmeq); 1055 free_irq(vector, nvmeq);
@@ -1447,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
1447 spin_unlock_irq(&nvmeq->q_lock); 1065 spin_unlock_irq(&nvmeq->q_lock);
1448} 1066}
1449 1067
1450static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1068static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
1451{ 1069{
1452 struct nvme_queue *nvmeq = dev->queues[qid]; 1070 struct nvme_queue *nvmeq = dev->queues[0];
1453 1071
1454 if (!nvmeq) 1072 if (!nvmeq)
1455 return; 1073 return;
1456 if (nvme_suspend_queue(nvmeq)) 1074 if (nvme_suspend_queue(nvmeq))
1457 return; 1075 return;
1458 1076
1459 /* Don't tell the adapter to delete the admin queue. 1077 if (shutdown)
1460 * Don't tell a removed adapter to delete IO queues. */ 1078 nvme_shutdown_ctrl(&dev->ctrl);
1461 if (qid && readl(&dev->bar->csts) != -1) { 1079 else
1462 adapter_delete_sq(dev, qid); 1080 nvme_disable_ctrl(&dev->ctrl, lo_hi_readq(
1463 adapter_delete_cq(dev, qid); 1081 dev->bar + NVME_REG_CAP));
1464 }
1465 1082
1466 spin_lock_irq(&nvmeq->q_lock); 1083 spin_lock_irq(&nvmeq->q_lock);
1467 nvme_process_cq(nvmeq); 1084 nvme_process_cq(nvmeq);
@@ -1472,11 +1089,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1472 int entry_size) 1089 int entry_size)
1473{ 1090{
1474 int q_depth = dev->q_depth; 1091 int q_depth = dev->q_depth;
1475 unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); 1092 unsigned q_size_aligned = roundup(q_depth * entry_size,
1093 dev->ctrl.page_size);
1476 1094
1477 if (q_size_aligned * nr_io_queues > dev->cmb_size) { 1095 if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1478 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); 1096 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1479 mem_per_q = round_down(mem_per_q, dev->page_size); 1097 mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
1480 q_depth = div_u64(mem_per_q, entry_size); 1098 q_depth = div_u64(mem_per_q, entry_size);
1481 1099
1482 /* 1100 /*
@@ -1495,8 +1113,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1495 int qid, int depth) 1113 int qid, int depth)
1496{ 1114{
1497 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { 1115 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
1498 unsigned offset = (qid - 1) * 1116 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
1499 roundup(SQ_SIZE(depth), dev->page_size); 1117 dev->ctrl.page_size);
1500 nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; 1118 nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
1501 nvmeq->sq_cmds_io = dev->cmb + offset; 1119 nvmeq->sq_cmds_io = dev->cmb + offset;
1502 } else { 1120 } else {
@@ -1527,7 +1145,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1527 nvmeq->q_dmadev = dev->dev; 1145 nvmeq->q_dmadev = dev->dev;
1528 nvmeq->dev = dev; 1146 nvmeq->dev = dev;
1529 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1147 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
1530 dev->instance, qid); 1148 dev->ctrl.instance, qid);
1531 spin_lock_init(&nvmeq->q_lock); 1149 spin_lock_init(&nvmeq->q_lock);
1532 nvmeq->cq_head = 0; 1150 nvmeq->cq_head = 0;
1533 nvmeq->cq_phase = 1; 1151 nvmeq->cq_phase = 1;
@@ -1604,79 +1222,9 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1604 return result; 1222 return result;
1605} 1223}
1606 1224
1607static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
1608{
1609 unsigned long timeout;
1610 u32 bit = enabled ? NVME_CSTS_RDY : 0;
1611
1612 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1613
1614 while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
1615 msleep(100);
1616 if (fatal_signal_pending(current))
1617 return -EINTR;
1618 if (time_after(jiffies, timeout)) {
1619 dev_err(dev->dev,
1620 "Device not ready; aborting %s\n", enabled ?
1621 "initialisation" : "reset");
1622 return -ENODEV;
1623 }
1624 }
1625
1626 return 0;
1627}
1628
1629/*
1630 * If the device has been passed off to us in an enabled state, just clear
1631 * the enabled bit. The spec says we should set the 'shutdown notification
1632 * bits', but doing so may cause the device to complete commands to the
1633 * admin queue ... and we don't know what memory that might be pointing at!
1634 */
1635static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
1636{
1637 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1638 dev->ctrl_config &= ~NVME_CC_ENABLE;
1639 writel(dev->ctrl_config, &dev->bar->cc);
1640
1641 return nvme_wait_ready(dev, cap, false);
1642}
1643
1644static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
1645{
1646 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1647 dev->ctrl_config |= NVME_CC_ENABLE;
1648 writel(dev->ctrl_config, &dev->bar->cc);
1649
1650 return nvme_wait_ready(dev, cap, true);
1651}
1652
1653static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1654{
1655 unsigned long timeout;
1656
1657 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1658 dev->ctrl_config |= NVME_CC_SHN_NORMAL;
1659
1660 writel(dev->ctrl_config, &dev->bar->cc);
1661
1662 timeout = SHUTDOWN_TIMEOUT + jiffies;
1663 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
1664 NVME_CSTS_SHST_CMPLT) {
1665 msleep(100);
1666 if (fatal_signal_pending(current))
1667 return -EINTR;
1668 if (time_after(jiffies, timeout)) {
1669 dev_err(dev->dev,
1670 "Device shutdown incomplete; abort shutdown\n");
1671 return -ENODEV;
1672 }
1673 }
1674
1675 return 0;
1676}
1677
1678static struct blk_mq_ops nvme_mq_admin_ops = { 1225static struct blk_mq_ops nvme_mq_admin_ops = {
1679 .queue_rq = nvme_queue_rq, 1226 .queue_rq = nvme_queue_rq,
1227 .complete = nvme_complete_rq,
1680 .map_queue = blk_mq_map_queue, 1228 .map_queue = blk_mq_map_queue,
1681 .init_hctx = nvme_admin_init_hctx, 1229 .init_hctx = nvme_admin_init_hctx,
1682 .exit_hctx = nvme_admin_exit_hctx, 1230 .exit_hctx = nvme_admin_exit_hctx,
@@ -1686,6 +1234,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
1686 1234
1687static struct blk_mq_ops nvme_mq_ops = { 1235static struct blk_mq_ops nvme_mq_ops = {
1688 .queue_rq = nvme_queue_rq, 1236 .queue_rq = nvme_queue_rq,
1237 .complete = nvme_complete_rq,
1689 .map_queue = blk_mq_map_queue, 1238 .map_queue = blk_mq_map_queue,
1690 .init_hctx = nvme_init_hctx, 1239 .init_hctx = nvme_init_hctx,
1691 .init_request = nvme_init_request, 1240 .init_request = nvme_init_request,
@@ -1695,19 +1244,23 @@ static struct blk_mq_ops nvme_mq_ops = {
1695 1244
1696static void nvme_dev_remove_admin(struct nvme_dev *dev) 1245static void nvme_dev_remove_admin(struct nvme_dev *dev)
1697{ 1246{
1698 if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { 1247 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1699 blk_cleanup_queue(dev->admin_q); 1248 blk_cleanup_queue(dev->ctrl.admin_q);
1700 blk_mq_free_tag_set(&dev->admin_tagset); 1249 blk_mq_free_tag_set(&dev->admin_tagset);
1701 } 1250 }
1702} 1251}
1703 1252
1704static int nvme_alloc_admin_tags(struct nvme_dev *dev) 1253static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1705{ 1254{
1706 if (!dev->admin_q) { 1255 if (!dev->ctrl.admin_q) {
1707 dev->admin_tagset.ops = &nvme_mq_admin_ops; 1256 dev->admin_tagset.ops = &nvme_mq_admin_ops;
1708 dev->admin_tagset.nr_hw_queues = 1; 1257 dev->admin_tagset.nr_hw_queues = 1;
1709 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; 1258
1710 dev->admin_tagset.reserved_tags = 1; 1259 /*
1260 * Subtract one to leave an empty queue entry for 'Full Queue'
1261 * condition. See NVM-Express 1.2 specification, section 4.1.2.
1262 */
1263 dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
1711 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1264 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1712 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1265 dev->admin_tagset.numa_node = dev_to_node(dev->dev);
1713 dev->admin_tagset.cmd_size = nvme_cmd_size(dev); 1266 dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
@@ -1716,18 +1269,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1716 if (blk_mq_alloc_tag_set(&dev->admin_tagset)) 1269 if (blk_mq_alloc_tag_set(&dev->admin_tagset))
1717 return -ENOMEM; 1270 return -ENOMEM;
1718 1271
1719 dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); 1272 dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
1720 if (IS_ERR(dev->admin_q)) { 1273 if (IS_ERR(dev->ctrl.admin_q)) {
1721 blk_mq_free_tag_set(&dev->admin_tagset); 1274 blk_mq_free_tag_set(&dev->admin_tagset);
1722 return -ENOMEM; 1275 return -ENOMEM;
1723 } 1276 }
1724 if (!blk_get_queue(dev->admin_q)) { 1277 if (!blk_get_queue(dev->ctrl.admin_q)) {
1725 nvme_dev_remove_admin(dev); 1278 nvme_dev_remove_admin(dev);
1726 dev->admin_q = NULL; 1279 dev->ctrl.admin_q = NULL;
1727 return -ENODEV; 1280 return -ENODEV;
1728 } 1281 }
1729 } else 1282 } else
1730 blk_mq_unfreeze_queue(dev->admin_q); 1283 blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
1731 1284
1732 return 0; 1285 return 0;
1733} 1286}
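The queue_depth arithmetic above follows the NVMe queue model: a hardware queue with N slots can only ever carry N - 1 outstanding entries, because head == tail must unambiguously mean "empty" (NVM Express 1.2, section 4.1.2), so one tag is subtracted from the blk-mq depth. A minimal sketch of that sizing rule follows; the 256-entry figure is an assumed admin queue size for illustration, not necessarily the driver's NVME_AQ_DEPTH.

/* Illustrative queue-sizing helper; not driver code. */
#include <stdio.h>

static unsigned int usable_entries(unsigned int hw_queue_slots)
{
        /*
         * One slot must stay empty so that head == tail always means
         * "empty" and never "full" (NVM Express 1.2, section 4.1.2).
         */
        return hw_queue_slots - 1;
}

int main(void)
{
        unsigned int aq_depth = 256;    /* assumed admin queue size */

        printf("hardware slots: %u, usable blk-mq tags: %u\n",
               aq_depth, usable_entries(aq_depth));
        return 0;
}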
@@ -1736,31 +1289,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1736{ 1289{
1737 int result; 1290 int result;
1738 u32 aqa; 1291 u32 aqa;
1739 u64 cap = lo_hi_readq(&dev->bar->cap); 1292 u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
1740 struct nvme_queue *nvmeq; 1293 struct nvme_queue *nvmeq;
1741 /*
1742 * default to a 4K page size, with the intention to update this
1743 	 * path in the future to accommodate architectures with differing
1744 * kernel and IO page sizes.
1745 */
1746 unsigned page_shift = 12;
1747 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
1748
1749 if (page_shift < dev_page_min) {
1750 dev_err(dev->dev,
1751 "Minimum device page size (%u) too large for "
1752 "host (%u)\n", 1 << dev_page_min,
1753 1 << page_shift);
1754 return -ENODEV;
1755 }
1756 1294
1757 dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? 1295 dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
1758 NVME_CAP_NSSRC(cap) : 0; 1296 NVME_CAP_NSSRC(cap) : 0;
1759 1297
1760 if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) 1298 if (dev->subsystem &&
1761 writel(NVME_CSTS_NSSRO, &dev->bar->csts); 1299 (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
1300 writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
1762 1301
1763 result = nvme_disable_ctrl(dev, cap); 1302 result = nvme_disable_ctrl(&dev->ctrl, cap);
1764 if (result < 0) 1303 if (result < 0)
1765 return result; 1304 return result;
1766 1305
@@ -1774,18 +1313,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1774 aqa = nvmeq->q_depth - 1; 1313 aqa = nvmeq->q_depth - 1;
1775 aqa |= aqa << 16; 1314 aqa |= aqa << 16;
1776 1315
1777 dev->page_size = 1 << page_shift; 1316 writel(aqa, dev->bar + NVME_REG_AQA);
1778 1317 lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
1779 dev->ctrl_config = NVME_CC_CSS_NVM; 1318 lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
1780 dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1781 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1782 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1783
1784 writel(aqa, &dev->bar->aqa);
1785 lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1786 lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1787 1319
1788 result = nvme_enable_ctrl(dev, cap); 1320 result = nvme_enable_ctrl(&dev->ctrl, cap);
1789 if (result) 1321 if (result)
1790 goto free_nvmeq; 1322 goto free_nvmeq;
1791 1323
@@ -1803,406 +1335,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1803 return result; 1335 return result;
1804} 1336}
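nvme_configure_admin_queue() above programs the admin queue geometry through the AQA register: the submission queue size lands in the low half and the completion queue size in the high half, both as zero-based values, which is exactly what the "aqa |= aqa << 16" idiom produces before the ASQ/ACQ base addresses are written. A small sketch of that packing, assuming the 12-bit ASQS/ACQS field widths defined by the specification:

/* Illustrative AQA packing (ASQS in bits 11:0, ACQS in bits 27:16). */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_aqa(uint16_t sq_depth, uint16_t cq_depth)
{
        uint32_t asqs = (uint32_t)(sq_depth - 1) & 0xfff;  /* zero-based size */
        uint32_t acqs = (uint32_t)(cq_depth - 1) & 0xfff;

        return asqs | (acqs << 16);
}

int main(void)
{
        /* With equal depths this matches the driver's "aqa |= aqa << 16". */
        printf("AQA for a 32-entry admin queue: 0x%08" PRIx32 "\n",
               pack_aqa(32, 32));
        return 0;
}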
1805 1337
1806static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1807{
1808 struct nvme_dev *dev = ns->dev;
1809 struct nvme_user_io io;
1810 struct nvme_command c;
1811 unsigned length, meta_len;
1812 int status, write;
1813 dma_addr_t meta_dma = 0;
1814 void *meta = NULL;
1815 void __user *metadata;
1816
1817 if (copy_from_user(&io, uio, sizeof(io)))
1818 return -EFAULT;
1819
1820 switch (io.opcode) {
1821 case nvme_cmd_write:
1822 case nvme_cmd_read:
1823 case nvme_cmd_compare:
1824 break;
1825 default:
1826 return -EINVAL;
1827 }
1828
1829 length = (io.nblocks + 1) << ns->lba_shift;
1830 meta_len = (io.nblocks + 1) * ns->ms;
1831 metadata = (void __user *)(uintptr_t)io.metadata;
1832 write = io.opcode & 1;
1833
1834 if (ns->ext) {
1835 length += meta_len;
1836 meta_len = 0;
1837 }
1838 if (meta_len) {
1839 if (((io.metadata & 3) || !io.metadata) && !ns->ext)
1840 return -EINVAL;
1841
1842 meta = dma_alloc_coherent(dev->dev, meta_len,
1843 &meta_dma, GFP_KERNEL);
1844
1845 if (!meta) {
1846 status = -ENOMEM;
1847 goto unmap;
1848 }
1849 if (write) {
1850 if (copy_from_user(meta, metadata, meta_len)) {
1851 status = -EFAULT;
1852 goto unmap;
1853 }
1854 }
1855 }
1856
1857 memset(&c, 0, sizeof(c));
1858 c.rw.opcode = io.opcode;
1859 c.rw.flags = io.flags;
1860 c.rw.nsid = cpu_to_le32(ns->ns_id);
1861 c.rw.slba = cpu_to_le64(io.slba);
1862 c.rw.length = cpu_to_le16(io.nblocks);
1863 c.rw.control = cpu_to_le16(io.control);
1864 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1865 c.rw.reftag = cpu_to_le32(io.reftag);
1866 c.rw.apptag = cpu_to_le16(io.apptag);
1867 c.rw.appmask = cpu_to_le16(io.appmask);
1868 c.rw.metadata = cpu_to_le64(meta_dma);
1869
1870 status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
1871 (void __user *)(uintptr_t)io.addr, length, NULL, 0);
1872 unmap:
1873 if (meta) {
1874 if (status == NVME_SC_SUCCESS && !write) {
1875 if (copy_to_user(metadata, meta, meta_len))
1876 status = -EFAULT;
1877 }
1878 dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
1879 }
1880 return status;
1881}
1882
1883static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1884 struct nvme_passthru_cmd __user *ucmd)
1885{
1886 struct nvme_passthru_cmd cmd;
1887 struct nvme_command c;
1888 unsigned timeout = 0;
1889 int status;
1890
1891 if (!capable(CAP_SYS_ADMIN))
1892 return -EACCES;
1893 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1894 return -EFAULT;
1895
1896 memset(&c, 0, sizeof(c));
1897 c.common.opcode = cmd.opcode;
1898 c.common.flags = cmd.flags;
1899 c.common.nsid = cpu_to_le32(cmd.nsid);
1900 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1901 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1902 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1903 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1904 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1905 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1906 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1907 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1908
1909 if (cmd.timeout_ms)
1910 timeout = msecs_to_jiffies(cmd.timeout_ms);
1911
1912 status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
1913 NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1914 &cmd.result, timeout);
1915 if (status >= 0) {
1916 if (put_user(cmd.result, &ucmd->result))
1917 return -EFAULT;
1918 }
1919
1920 return status;
1921}
1922
1923static int nvme_subsys_reset(struct nvme_dev *dev)
1924{
1925 if (!dev->subsystem)
1926 return -ENOTTY;
1927
1928 writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
1929 return 0;
1930}
1931
1932static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1933 unsigned long arg)
1934{
1935 struct nvme_ns *ns = bdev->bd_disk->private_data;
1936
1937 switch (cmd) {
1938 case NVME_IOCTL_ID:
1939 force_successful_syscall_return();
1940 return ns->ns_id;
1941 case NVME_IOCTL_ADMIN_CMD:
1942 return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
1943 case NVME_IOCTL_IO_CMD:
1944 return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
1945 case NVME_IOCTL_SUBMIT_IO:
1946 return nvme_submit_io(ns, (void __user *)arg);
1947 case SG_GET_VERSION_NUM:
1948 return nvme_sg_get_version_num((void __user *)arg);
1949 case SG_IO:
1950 return nvme_sg_io(ns, (void __user *)arg);
1951 default:
1952 return -ENOTTY;
1953 }
1954}
1955
1956#ifdef CONFIG_COMPAT
1957static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1958 unsigned int cmd, unsigned long arg)
1959{
1960 switch (cmd) {
1961 case SG_IO:
1962 return -ENOIOCTLCMD;
1963 }
1964 return nvme_ioctl(bdev, mode, cmd, arg);
1965}
1966#else
1967#define nvme_compat_ioctl NULL
1968#endif
1969
1970static void nvme_free_dev(struct kref *kref);
1971static void nvme_free_ns(struct kref *kref)
1972{
1973 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
1974
1975 if (ns->type == NVME_NS_LIGHTNVM)
1976 nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
1977
1978 spin_lock(&dev_list_lock);
1979 ns->disk->private_data = NULL;
1980 spin_unlock(&dev_list_lock);
1981
1982 kref_put(&ns->dev->kref, nvme_free_dev);
1983 put_disk(ns->disk);
1984 kfree(ns);
1985}
1986
1987static int nvme_open(struct block_device *bdev, fmode_t mode)
1988{
1989 int ret = 0;
1990 struct nvme_ns *ns;
1991
1992 spin_lock(&dev_list_lock);
1993 ns = bdev->bd_disk->private_data;
1994 if (!ns)
1995 ret = -ENXIO;
1996 else if (!kref_get_unless_zero(&ns->kref))
1997 ret = -ENXIO;
1998 spin_unlock(&dev_list_lock);
1999
2000 return ret;
2001}
2002
2003static void nvme_release(struct gendisk *disk, fmode_t mode)
2004{
2005 struct nvme_ns *ns = disk->private_data;
2006 kref_put(&ns->kref, nvme_free_ns);
2007}
2008
2009static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
2010{
2011 /* some standard values */
2012 geo->heads = 1 << 6;
2013 geo->sectors = 1 << 5;
2014 geo->cylinders = get_capacity(bd->bd_disk) >> 11;
2015 return 0;
2016}
2017
2018static void nvme_config_discard(struct nvme_ns *ns)
2019{
2020 u32 logical_block_size = queue_logical_block_size(ns->queue);
2021 ns->queue->limits.discard_zeroes_data = 0;
2022 ns->queue->limits.discard_alignment = logical_block_size;
2023 ns->queue->limits.discard_granularity = logical_block_size;
2024 blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
2025 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
2026}
2027
2028static int nvme_revalidate_disk(struct gendisk *disk)
2029{
2030 struct nvme_ns *ns = disk->private_data;
2031 struct nvme_dev *dev = ns->dev;
2032 struct nvme_id_ns *id;
2033 u8 lbaf, pi_type;
2034 u16 old_ms;
2035 unsigned short bs;
2036
2037 if (nvme_identify_ns(dev, ns->ns_id, &id)) {
2038 dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
2039 dev->instance, ns->ns_id);
2040 return -ENODEV;
2041 }
2042 if (id->ncap == 0) {
2043 kfree(id);
2044 return -ENODEV;
2045 }
2046
2047 if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
2048 if (nvme_nvm_register(ns->queue, disk->disk_name)) {
2049 dev_warn(dev->dev,
2050 "%s: LightNVM init failure\n", __func__);
2051 kfree(id);
2052 return -ENODEV;
2053 }
2054 ns->type = NVME_NS_LIGHTNVM;
2055 }
2056
2057 old_ms = ns->ms;
2058 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2059 ns->lba_shift = id->lbaf[lbaf].ds;
2060 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
2061 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
2062
2063 /*
2064 	 * If identify namespace failed, use a default 512 byte block size so
2065 	 * the block layer can be used before failing read/write for 0 capacity.
2066 */
2067 if (ns->lba_shift == 0)
2068 ns->lba_shift = 9;
2069 bs = 1 << ns->lba_shift;
2070
2071 	/* XXX: PI implementation requires metadata equal to the t10 pi tuple size */
2072 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
2073 id->dps & NVME_NS_DPS_PI_MASK : 0;
2074
2075 blk_mq_freeze_queue(disk->queue);
2076 if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
2077 ns->ms != old_ms ||
2078 bs != queue_logical_block_size(disk->queue) ||
2079 (ns->ms && ns->ext)))
2080 blk_integrity_unregister(disk);
2081
2082 ns->pi_type = pi_type;
2083 blk_queue_logical_block_size(ns->queue, bs);
2084
2085 if (ns->ms && !ns->ext)
2086 nvme_init_integrity(ns);
2087
2088 if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
2089 !blk_get_integrity(disk)) ||
2090 ns->type == NVME_NS_LIGHTNVM)
2091 set_capacity(disk, 0);
2092 else
2093 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
2094
2095 if (dev->oncs & NVME_CTRL_ONCS_DSM)
2096 nvme_config_discard(ns);
2097 blk_mq_unfreeze_queue(disk->queue);
2098
2099 kfree(id);
2100 return 0;
2101}
2102
2103static char nvme_pr_type(enum pr_type type)
2104{
2105 switch (type) {
2106 case PR_WRITE_EXCLUSIVE:
2107 return 1;
2108 case PR_EXCLUSIVE_ACCESS:
2109 return 2;
2110 case PR_WRITE_EXCLUSIVE_REG_ONLY:
2111 return 3;
2112 case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2113 return 4;
2114 case PR_WRITE_EXCLUSIVE_ALL_REGS:
2115 return 5;
2116 case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2117 return 6;
2118 default:
2119 return 0;
2120 }
2121};
2122
2123static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2124 u64 key, u64 sa_key, u8 op)
2125{
2126 struct nvme_ns *ns = bdev->bd_disk->private_data;
2127 struct nvme_command c;
2128 u8 data[16] = { 0, };
2129
2130 put_unaligned_le64(key, &data[0]);
2131 put_unaligned_le64(sa_key, &data[8]);
2132
2133 memset(&c, 0, sizeof(c));
2134 c.common.opcode = op;
2135 c.common.nsid = cpu_to_le32(ns->ns_id);
2136 c.common.cdw10[0] = cpu_to_le32(cdw10);
2137
2138 return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
2139}
2140
2141static int nvme_pr_register(struct block_device *bdev, u64 old,
2142 u64 new, unsigned flags)
2143{
2144 u32 cdw10;
2145
2146 if (flags & ~PR_FL_IGNORE_KEY)
2147 return -EOPNOTSUPP;
2148
2149 cdw10 = old ? 2 : 0;
2150 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2151 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2152 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2153}
2154
2155static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2156 enum pr_type type, unsigned flags)
2157{
2158 u32 cdw10;
2159
2160 if (flags & ~PR_FL_IGNORE_KEY)
2161 return -EOPNOTSUPP;
2162
2163 cdw10 = nvme_pr_type(type) << 8;
2164 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2165 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2166}
2167
2168static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2169 enum pr_type type, bool abort)
2170{
2171 u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
2172 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2173}
2174
2175static int nvme_pr_clear(struct block_device *bdev, u64 key)
2176{
2177 u32 cdw10 = 1 | (key ? 1 << 3 : 0);
2178 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
2179}
2180
2181static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2182{
2183 u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
2184 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2185}
2186
2187static const struct pr_ops nvme_pr_ops = {
2188 .pr_register = nvme_pr_register,
2189 .pr_reserve = nvme_pr_reserve,
2190 .pr_release = nvme_pr_release,
2191 .pr_preempt = nvme_pr_preempt,
2192 .pr_clear = nvme_pr_clear,
2193};
2194
2195static const struct block_device_operations nvme_fops = {
2196 .owner = THIS_MODULE,
2197 .ioctl = nvme_ioctl,
2198 .compat_ioctl = nvme_compat_ioctl,
2199 .open = nvme_open,
2200 .release = nvme_release,
2201 .getgeo = nvme_getgeo,
2202 .revalidate_disk= nvme_revalidate_disk,
2203 .pr_ops = &nvme_pr_ops,
2204};
2205
2206static int nvme_kthread(void *data) 1338static int nvme_kthread(void *data)
2207{ 1339{
2208 struct nvme_dev *dev, *next; 1340 struct nvme_dev *dev, *next;
@@ -2212,14 +1344,20 @@ static int nvme_kthread(void *data)
2212 spin_lock(&dev_list_lock); 1344 spin_lock(&dev_list_lock);
2213 list_for_each_entry_safe(dev, next, &dev_list, node) { 1345 list_for_each_entry_safe(dev, next, &dev_list, node) {
2214 int i; 1346 int i;
2215 u32 csts = readl(&dev->bar->csts); 1347 u32 csts = readl(dev->bar + NVME_REG_CSTS);
1348
1349 /*
1350 * Skip controllers currently under reset.
1351 */
1352 if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
1353 continue;
2216 1354
2217 if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || 1355 if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
2218 csts & NVME_CSTS_CFS) { 1356 csts & NVME_CSTS_CFS) {
2219 if (!__nvme_reset(dev)) { 1357 if (queue_work(nvme_workq, &dev->reset_work)) {
2220 dev_warn(dev->dev, 1358 dev_warn(dev->dev,
2221 "Failed status: %x, reset controller\n", 1359 "Failed status: %x, reset controller\n",
2222 readl(&dev->bar->csts)); 1360 readl(dev->bar + NVME_REG_CSTS));
2223 } 1361 }
2224 continue; 1362 continue;
2225 } 1363 }
@@ -2230,11 +1368,8 @@ static int nvme_kthread(void *data)
2230 spin_lock_irq(&nvmeq->q_lock); 1368 spin_lock_irq(&nvmeq->q_lock);
2231 nvme_process_cq(nvmeq); 1369 nvme_process_cq(nvmeq);
2232 1370
2233 while ((i == 0) && (dev->event_limit > 0)) { 1371 while (i == 0 && dev->ctrl.event_limit > 0)
2234 if (nvme_submit_async_admin_req(dev)) 1372 nvme_submit_async_event(dev);
2235 break;
2236 dev->event_limit--;
2237 }
2238 spin_unlock_irq(&nvmeq->q_lock); 1373 spin_unlock_irq(&nvmeq->q_lock);
2239 } 1374 }
2240 } 1375 }
@@ -2244,127 +1379,33 @@ static int nvme_kthread(void *data)
2244 return 0; 1379 return 0;
2245} 1380}
2246 1381
2247static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) 1382static int nvme_create_io_queues(struct nvme_dev *dev)
2248{
2249 struct nvme_ns *ns;
2250 struct gendisk *disk;
2251 int node = dev_to_node(dev->dev);
2252
2253 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2254 if (!ns)
2255 return;
2256
2257 ns->queue = blk_mq_init_queue(&dev->tagset);
2258 if (IS_ERR(ns->queue))
2259 goto out_free_ns;
2260 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
2261 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2262 ns->dev = dev;
2263 ns->queue->queuedata = ns;
2264
2265 disk = alloc_disk_node(0, node);
2266 if (!disk)
2267 goto out_free_queue;
2268
2269 kref_init(&ns->kref);
2270 ns->ns_id = nsid;
2271 ns->disk = disk;
2272 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2273 list_add_tail(&ns->list, &dev->namespaces);
2274
2275 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2276 if (dev->max_hw_sectors) {
2277 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
2278 blk_queue_max_segments(ns->queue,
2279 (dev->max_hw_sectors / (dev->page_size >> 9)) + 1);
2280 }
2281 if (dev->stripe_size)
2282 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
2283 if (dev->vwc & NVME_CTRL_VWC_PRESENT)
2284 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
2285 blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
2286
2287 disk->major = nvme_major;
2288 disk->first_minor = 0;
2289 disk->fops = &nvme_fops;
2290 disk->private_data = ns;
2291 disk->queue = ns->queue;
2292 disk->driverfs_dev = dev->device;
2293 disk->flags = GENHD_FL_EXT_DEVT;
2294 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
2295
2296 /*
2297 * Initialize capacity to 0 until we establish the namespace format and
2298 	 * set up integrity extensions if necessary. The revalidate_disk after
2299 * add_disk allows the driver to register with integrity if the format
2300 * requires it.
2301 */
2302 set_capacity(disk, 0);
2303 if (nvme_revalidate_disk(ns->disk))
2304 goto out_free_disk;
2305
2306 kref_get(&dev->kref);
2307 if (ns->type != NVME_NS_LIGHTNVM) {
2308 add_disk(ns->disk);
2309 if (ns->ms) {
2310 struct block_device *bd = bdget_disk(ns->disk, 0);
2311 if (!bd)
2312 return;
2313 if (blkdev_get(bd, FMODE_READ, NULL)) {
2314 bdput(bd);
2315 return;
2316 }
2317 blkdev_reread_part(bd);
2318 blkdev_put(bd, FMODE_READ);
2319 }
2320 }
2321 return;
2322 out_free_disk:
2323 kfree(disk);
2324 list_del(&ns->list);
2325 out_free_queue:
2326 blk_cleanup_queue(ns->queue);
2327 out_free_ns:
2328 kfree(ns);
2329}
2330
2331/*
2332 * Create I/O queues. Failing to create an I/O queue is not an issue,
2333 * we can continue with less than the desired amount of queues, and
2334 * even a controller without I/O queues an still be used to issue
2335 * admin commands. This might be useful to upgrade a buggy firmware
2336 * for example.
2337 */
2338static void nvme_create_io_queues(struct nvme_dev *dev)
2339{ 1383{
2340 unsigned i; 1384 unsigned i;
1385 int ret = 0;
2341 1386
2342 for (i = dev->queue_count; i <= dev->max_qid; i++) 1387 for (i = dev->queue_count; i <= dev->max_qid; i++) {
2343 if (!nvme_alloc_queue(dev, i, dev->q_depth)) 1388 if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
1389 ret = -ENOMEM;
2344 break; 1390 break;
1391 }
1392 }
2345 1393
2346 for (i = dev->online_queues; i <= dev->queue_count - 1; i++) 1394 for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
2347 if (nvme_create_queue(dev->queues[i], i)) { 1395 ret = nvme_create_queue(dev->queues[i], i);
1396 if (ret) {
2348 nvme_free_queues(dev, i); 1397 nvme_free_queues(dev, i);
2349 break; 1398 break;
2350 } 1399 }
2351}
2352
2353static int set_queue_count(struct nvme_dev *dev, int count)
2354{
2355 int status;
2356 u32 result;
2357 u32 q_count = (count - 1) | ((count - 1) << 16);
2358
2359 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
2360 &result);
2361 if (status < 0)
2362 return status;
2363 if (status > 0) {
2364 dev_err(dev->dev, "Could not set queue count (%d)\n", status);
2365 return 0;
2366 } 1400 }
2367 return min(result & 0xffff, result >> 16) + 1; 1401
1402 /*
1403 * Ignore failing Create SQ/CQ commands, we can continue with less
1404 	 * than the desired amount of queues, and even a controller without
1405 	 * I/O queues can still be used to issue admin commands. This might
1406 * be useful to upgrade a buggy firmware for example.
1407 */
1408 return ret >= 0 ? 0 : ret;
2368} 1409}
2369 1410
2370static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 1411static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
@@ -2379,11 +1420,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
2379 if (!use_cmb_sqes) 1420 if (!use_cmb_sqes)
2380 return NULL; 1421 return NULL;
2381 1422
2382 dev->cmbsz = readl(&dev->bar->cmbsz); 1423 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
2383 if (!(NVME_CMB_SZ(dev->cmbsz))) 1424 if (!(NVME_CMB_SZ(dev->cmbsz)))
2384 return NULL; 1425 return NULL;
2385 1426
2386 cmbloc = readl(&dev->bar->cmbloc); 1427 cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
2387 1428
2388 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 1429 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
2389 size = szu * NVME_CMB_SZ(dev->cmbsz); 1430 size = szu * NVME_CMB_SZ(dev->cmbsz);
@@ -2431,11 +1472,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2431 int result, i, vecs, nr_io_queues, size; 1472 int result, i, vecs, nr_io_queues, size;
2432 1473
2433 nr_io_queues = num_possible_cpus(); 1474 nr_io_queues = num_possible_cpus();
2434 result = set_queue_count(dev, nr_io_queues); 1475 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
2435 if (result <= 0) 1476 if (result < 0)
2436 return result; 1477 return result;
2437 if (result < nr_io_queues) 1478
2438 nr_io_queues = result; 1479 /*
1480 * Degraded controllers might return an error when setting the queue
1481 * count. We still want to be able to bring them online and offer
1482 	 * access to the admin queue, as that might be the only way to fix them up.
1483 */
1484 if (result > 0) {
1485 dev_err(dev->dev, "Could not set queue count (%d)\n", result);
1486 nr_io_queues = 0;
1487 result = 0;
1488 }
2439 1489
2440 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 1490 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
2441 result = nvme_cmb_qdepth(dev, nr_io_queues, 1491 result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2457,7 +1507,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2457 return -ENOMEM; 1507 return -ENOMEM;
2458 size = db_bar_size(dev, nr_io_queues); 1508 size = db_bar_size(dev, nr_io_queues);
2459 } while (1); 1509 } while (1);
2460 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1510 dev->dbs = dev->bar + 4096;
2461 adminq->q_db = dev->dbs; 1511 adminq->q_db = dev->dbs;
2462 } 1512 }
2463 1513
@@ -2501,115 +1551,115 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2501 1551
2502 /* Free previously allocated queues that are no longer usable */ 1552 /* Free previously allocated queues that are no longer usable */
2503 nvme_free_queues(dev, nr_io_queues + 1); 1553 nvme_free_queues(dev, nr_io_queues + 1);
2504 nvme_create_io_queues(dev); 1554 return nvme_create_io_queues(dev);
2505
2506 return 0;
2507 1555
2508 free_queues: 1556 free_queues:
2509 nvme_free_queues(dev, 1); 1557 nvme_free_queues(dev, 1);
2510 return result; 1558 return result;
2511} 1559}
2512 1560
2513static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 1561static void nvme_set_irq_hints(struct nvme_dev *dev)
2514{ 1562{
2515 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 1563 struct nvme_queue *nvmeq;
2516 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 1564 int i;
2517 1565
2518 return nsa->ns_id - nsb->ns_id; 1566 for (i = 0; i < dev->online_queues; i++) {
2519} 1567 nvmeq = dev->queues[i];
2520 1568
2521static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) 1569 if (!nvmeq->tags || !(*nvmeq->tags))
2522{ 1570 continue;
2523 struct nvme_ns *ns;
2524 1571
2525 list_for_each_entry(ns, &dev->namespaces, list) { 1572 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2526 if (ns->ns_id == nsid) 1573 blk_mq_tags_cpumask(*nvmeq->tags));
2527 return ns;
2528 if (ns->ns_id > nsid)
2529 break;
2530 } 1574 }
2531 return NULL;
2532} 1575}
2533 1576
2534static inline bool nvme_io_incapable(struct nvme_dev *dev) 1577static void nvme_dev_scan(struct work_struct *work)
2535{ 1578{
2536 return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || 1579 struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
2537 dev->online_queues < 2); 1580
1581 if (!dev->tagset.tags)
1582 return;
1583 nvme_scan_namespaces(&dev->ctrl);
1584 nvme_set_irq_hints(dev);
2538} 1585}
2539 1586
2540static void nvme_ns_remove(struct nvme_ns *ns) 1587static void nvme_del_queue_end(struct request *req, int error)
2541{ 1588{
2542 bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); 1589 struct nvme_queue *nvmeq = req->end_io_data;
2543 1590
2544 if (kill) { 1591 blk_mq_free_request(req);
2545 blk_set_queue_dying(ns->queue); 1592 complete(&nvmeq->dev->ioq_wait);
2546
2547 /*
2548 * The controller was shutdown first if we got here through
2549 * device removal. The shutdown may requeue outstanding
2550 * requests. These need to be aborted immediately so
2551 * del_gendisk doesn't block indefinitely for their completion.
2552 */
2553 blk_mq_abort_requeue_list(ns->queue);
2554 }
2555 if (ns->disk->flags & GENHD_FL_UP)
2556 del_gendisk(ns->disk);
2557 if (kill || !blk_queue_dying(ns->queue)) {
2558 blk_mq_abort_requeue_list(ns->queue);
2559 blk_cleanup_queue(ns->queue);
2560 }
2561 list_del_init(&ns->list);
2562 kref_put(&ns->kref, nvme_free_ns);
2563} 1593}
2564 1594
2565static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) 1595static void nvme_del_cq_end(struct request *req, int error)
2566{ 1596{
2567 struct nvme_ns *ns, *next; 1597 struct nvme_queue *nvmeq = req->end_io_data;
2568 unsigned i;
2569 1598
2570 for (i = 1; i <= nn; i++) { 1599 if (!error) {
2571 ns = nvme_find_ns(dev, i); 1600 unsigned long flags;
2572 if (ns) { 1601
2573 if (revalidate_disk(ns->disk)) 1602 spin_lock_irqsave(&nvmeq->q_lock, flags);
2574 nvme_ns_remove(ns); 1603 nvme_process_cq(nvmeq);
2575 } else 1604 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
2576 nvme_alloc_ns(dev, i);
2577 }
2578 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
2579 if (ns->ns_id > nn)
2580 nvme_ns_remove(ns);
2581 } 1605 }
2582 list_sort(NULL, &dev->namespaces, ns_cmp); 1606
1607 nvme_del_queue_end(req, error);
2583} 1608}
2584 1609
2585static void nvme_set_irq_hints(struct nvme_dev *dev) 1610static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2586{ 1611{
2587 struct nvme_queue *nvmeq; 1612 struct request_queue *q = nvmeq->dev->ctrl.admin_q;
2588 int i; 1613 struct request *req;
1614 struct nvme_command cmd;
2589 1615
2590 for (i = 0; i < dev->online_queues; i++) { 1616 memset(&cmd, 0, sizeof(cmd));
2591 nvmeq = dev->queues[i]; 1617 cmd.delete_queue.opcode = opcode;
1618 cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2592 1619
2593 if (!nvmeq->tags || !(*nvmeq->tags)) 1620 req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
2594 continue; 1621 if (IS_ERR(req))
1622 return PTR_ERR(req);
2595 1623
2596 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 1624 req->timeout = ADMIN_TIMEOUT;
2597 blk_mq_tags_cpumask(*nvmeq->tags)); 1625 req->end_io_data = nvmeq;
2598 } 1626
1627 blk_execute_rq_nowait(q, NULL, req, false,
1628 opcode == nvme_admin_delete_cq ?
1629 nvme_del_cq_end : nvme_del_queue_end);
1630 return 0;
2599} 1631}
2600 1632
2601static void nvme_dev_scan(struct work_struct *work) 1633static void nvme_disable_io_queues(struct nvme_dev *dev)
2602{ 1634{
2603 struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); 1635 int pass;
2604 struct nvme_id_ctrl *ctrl; 1636 unsigned long timeout;
1637 u8 opcode = nvme_admin_delete_sq;
2605 1638
2606 if (!dev->tagset.tags) 1639 for (pass = 0; pass < 2; pass++) {
2607 return; 1640 int sent = 0, i = dev->queue_count - 1;
2608 if (nvme_identify_ctrl(dev, &ctrl)) 1641
2609 return; 1642 reinit_completion(&dev->ioq_wait);
2610 nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); 1643 retry:
2611 kfree(ctrl); 1644 timeout = ADMIN_TIMEOUT;
2612 nvme_set_irq_hints(dev); 1645 for (; i > 0; i--) {
1646 struct nvme_queue *nvmeq = dev->queues[i];
1647
1648 if (!pass)
1649 nvme_suspend_queue(nvmeq);
1650 if (nvme_delete_queue(nvmeq, opcode))
1651 break;
1652 ++sent;
1653 }
1654 while (sent--) {
1655 timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
1656 if (timeout == 0)
1657 return;
1658 if (i)
1659 goto retry;
1660 }
1661 opcode = nvme_admin_delete_cq;
1662 }
2613} 1663}
2614 1664
2615/* 1665/*
@@ -2620,42 +1670,7 @@ static void nvme_dev_scan(struct work_struct *work)
2620 */ 1670 */
2621static int nvme_dev_add(struct nvme_dev *dev) 1671static int nvme_dev_add(struct nvme_dev *dev)
2622{ 1672{
2623 struct pci_dev *pdev = to_pci_dev(dev->dev); 1673 if (!dev->ctrl.tagset) {
2624 int res;
2625 struct nvme_id_ctrl *ctrl;
2626 int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12;
2627
2628 res = nvme_identify_ctrl(dev, &ctrl);
2629 if (res) {
2630 dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
2631 return -EIO;
2632 }
2633
2634 dev->oncs = le16_to_cpup(&ctrl->oncs);
2635 dev->abort_limit = ctrl->acl + 1;
2636 dev->vwc = ctrl->vwc;
2637 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
2638 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
2639 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
2640 if (ctrl->mdts)
2641 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
2642 else
2643 dev->max_hw_sectors = UINT_MAX;
2644 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
2645 (pdev->device == 0x0953) && ctrl->vs[3]) {
2646 unsigned int max_hw_sectors;
2647
2648 dev->stripe_size = 1 << (ctrl->vs[3] + shift);
2649 max_hw_sectors = dev->stripe_size >> (shift - 9);
2650 if (dev->max_hw_sectors) {
2651 dev->max_hw_sectors = min(max_hw_sectors,
2652 dev->max_hw_sectors);
2653 } else
2654 dev->max_hw_sectors = max_hw_sectors;
2655 }
2656 kfree(ctrl);
2657
2658 if (!dev->tagset.tags) {
2659 dev->tagset.ops = &nvme_mq_ops; 1674 dev->tagset.ops = &nvme_mq_ops;
2660 dev->tagset.nr_hw_queues = dev->online_queues - 1; 1675 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2661 dev->tagset.timeout = NVME_IO_TIMEOUT; 1676 dev->tagset.timeout = NVME_IO_TIMEOUT;
@@ -2668,8 +1683,9 @@ static int nvme_dev_add(struct nvme_dev *dev)
2668 1683
2669 if (blk_mq_alloc_tag_set(&dev->tagset)) 1684 if (blk_mq_alloc_tag_set(&dev->tagset))
2670 return 0; 1685 return 0;
1686 dev->ctrl.tagset = &dev->tagset;
2671 } 1687 }
2672 schedule_work(&dev->scan_work); 1688 queue_work(nvme_workq, &dev->scan_work);
2673 return 0; 1689 return 0;
2674} 1690}
2675 1691
@@ -2699,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
2699 if (!dev->bar) 1715 if (!dev->bar)
2700 goto disable; 1716 goto disable;
2701 1717
2702 if (readl(&dev->bar->csts) == -1) { 1718 if (readl(dev->bar + NVME_REG_CSTS) == -1) {
2703 result = -ENODEV; 1719 result = -ENODEV;
2704 goto unmap; 1720 goto unmap;
2705 } 1721 }
@@ -2714,10 +1730,11 @@ static int nvme_dev_map(struct nvme_dev *dev)
2714 goto unmap; 1730 goto unmap;
2715 } 1731 }
2716 1732
2717 cap = lo_hi_readq(&dev->bar->cap); 1733 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
1734
2718 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 1735 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
2719 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 1736 dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
2720 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1737 dev->dbs = dev->bar + 4096;
2721 1738
2722 /* 1739 /*
2723 * Temporary fix for the Apple controller found in the MacBook8,1 and 1740 * Temporary fix for the Apple controller found in the MacBook8,1 and
@@ -2730,9 +1747,11 @@ static int nvme_dev_map(struct nvme_dev *dev)
2730 dev->q_depth); 1747 dev->q_depth);
2731 } 1748 }
2732 1749
2733 if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) 1750 if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
2734 dev->cmb = nvme_map_cmb(dev); 1751 dev->cmb = nvme_map_cmb(dev);
2735 1752
1753 pci_enable_pcie_error_reporting(pdev);
1754 pci_save_state(pdev);
2736 return 0; 1755 return 0;
2737 1756
2738 unmap: 1757 unmap:
@@ -2760,152 +1779,34 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
2760 pci_release_regions(pdev); 1779 pci_release_regions(pdev);
2761 } 1780 }
2762 1781
2763 if (pci_is_enabled(pdev)) 1782 if (pci_is_enabled(pdev)) {
1783 pci_disable_pcie_error_reporting(pdev);
2764 pci_disable_device(pdev); 1784 pci_disable_device(pdev);
2765}
2766
2767struct nvme_delq_ctx {
2768 struct task_struct *waiter;
2769 struct kthread_worker *worker;
2770 atomic_t refcount;
2771};
2772
2773static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
2774{
2775 dq->waiter = current;
2776 mb();
2777
2778 for (;;) {
2779 set_current_state(TASK_KILLABLE);
2780 if (!atomic_read(&dq->refcount))
2781 break;
2782 if (!schedule_timeout(ADMIN_TIMEOUT) ||
2783 fatal_signal_pending(current)) {
2784 /*
2785 * Disable the controller first since we can't trust it
2786 * at this point, but leave the admin queue enabled
2787 * until all queue deletion requests are flushed.
2788 * FIXME: This may take a while if there are more h/w
2789 * queues than admin tags.
2790 */
2791 set_current_state(TASK_RUNNING);
2792 nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap));
2793 nvme_clear_queue(dev->queues[0]);
2794 flush_kthread_worker(dq->worker);
2795 nvme_disable_queue(dev, 0);
2796 return;
2797 }
2798 } 1785 }
2799 set_current_state(TASK_RUNNING);
2800}
2801
2802static void nvme_put_dq(struct nvme_delq_ctx *dq)
2803{
2804 atomic_dec(&dq->refcount);
2805 if (dq->waiter)
2806 wake_up_process(dq->waiter);
2807}
2808
2809static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
2810{
2811 atomic_inc(&dq->refcount);
2812 return dq;
2813}
2814
2815static void nvme_del_queue_end(struct nvme_queue *nvmeq)
2816{
2817 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
2818 nvme_put_dq(dq);
2819
2820 spin_lock_irq(&nvmeq->q_lock);
2821 nvme_process_cq(nvmeq);
2822 spin_unlock_irq(&nvmeq->q_lock);
2823}
2824
2825static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2826 kthread_work_func_t fn)
2827{
2828 struct nvme_command c;
2829
2830 memset(&c, 0, sizeof(c));
2831 c.delete_queue.opcode = opcode;
2832 c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2833
2834 init_kthread_work(&nvmeq->cmdinfo.work, fn);
2835 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
2836 ADMIN_TIMEOUT);
2837}
2838
2839static void nvme_del_cq_work_handler(struct kthread_work *work)
2840{
2841 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2842 cmdinfo.work);
2843 nvme_del_queue_end(nvmeq);
2844}
2845
2846static int nvme_delete_cq(struct nvme_queue *nvmeq)
2847{
2848 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
2849 nvme_del_cq_work_handler);
2850}
2851
2852static void nvme_del_sq_work_handler(struct kthread_work *work)
2853{
2854 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2855 cmdinfo.work);
2856 int status = nvmeq->cmdinfo.status;
2857
2858 if (!status)
2859 status = nvme_delete_cq(nvmeq);
2860 if (status)
2861 nvme_del_queue_end(nvmeq);
2862}
2863
2864static int nvme_delete_sq(struct nvme_queue *nvmeq)
2865{
2866 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
2867 nvme_del_sq_work_handler);
2868} 1786}
2869 1787
2870static void nvme_del_queue_start(struct kthread_work *work) 1788static int nvme_dev_list_add(struct nvme_dev *dev)
2871{ 1789{
2872 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 1790 bool start_thread = false;
2873 cmdinfo.work);
2874 if (nvme_delete_sq(nvmeq))
2875 nvme_del_queue_end(nvmeq);
2876}
2877 1791
2878static void nvme_disable_io_queues(struct nvme_dev *dev) 1792 spin_lock(&dev_list_lock);
2879{ 1793 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
2880 int i; 1794 start_thread = true;
2881 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 1795 nvme_thread = NULL;
2882 struct nvme_delq_ctx dq;
2883 struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
2884 &worker, "nvme%d", dev->instance);
2885
2886 if (IS_ERR(kworker_task)) {
2887 dev_err(dev->dev,
2888 "Failed to create queue del task\n");
2889 for (i = dev->queue_count - 1; i > 0; i--)
2890 nvme_disable_queue(dev, i);
2891 return;
2892 } 1796 }
1797 list_add(&dev->node, &dev_list);
1798 spin_unlock(&dev_list_lock);
2893 1799
2894 dq.waiter = NULL; 1800 if (start_thread) {
2895 atomic_set(&dq.refcount, 0); 1801 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
2896 dq.worker = &worker; 1802 wake_up_all(&nvme_kthread_wait);
2897 for (i = dev->queue_count - 1; i > 0; i--) { 1803 } else
2898 struct nvme_queue *nvmeq = dev->queues[i]; 1804 wait_event_killable(nvme_kthread_wait, nvme_thread);
2899 1805
2900 if (nvme_suspend_queue(nvmeq)) 1806 if (IS_ERR_OR_NULL(nvme_thread))
2901 continue; 1807 return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
2902 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 1808
2903 nvmeq->cmdinfo.worker = dq.worker; 1809 return 0;
2904 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
2905 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
2906 }
2907 nvme_wait_dq(&dq, dev);
2908 kthread_stop(kworker_task);
2909} 1810}
2910 1811
2911/* 1812/*
@@ -2928,44 +1829,17 @@ static void nvme_dev_list_remove(struct nvme_dev *dev)
2928 kthread_stop(tmp); 1829 kthread_stop(tmp);
2929} 1830}
2930 1831
2931static void nvme_freeze_queues(struct nvme_dev *dev) 1832static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2932{
2933 struct nvme_ns *ns;
2934
2935 list_for_each_entry(ns, &dev->namespaces, list) {
2936 blk_mq_freeze_queue_start(ns->queue);
2937
2938 spin_lock_irq(ns->queue->queue_lock);
2939 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
2940 spin_unlock_irq(ns->queue->queue_lock);
2941
2942 blk_mq_cancel_requeue_work(ns->queue);
2943 blk_mq_stop_hw_queues(ns->queue);
2944 }
2945}
2946
2947static void nvme_unfreeze_queues(struct nvme_dev *dev)
2948{
2949 struct nvme_ns *ns;
2950
2951 list_for_each_entry(ns, &dev->namespaces, list) {
2952 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
2953 blk_mq_unfreeze_queue(ns->queue);
2954 blk_mq_start_stopped_hw_queues(ns->queue, true);
2955 blk_mq_kick_requeue_list(ns->queue);
2956 }
2957}
2958
2959static void nvme_dev_shutdown(struct nvme_dev *dev)
2960{ 1833{
2961 int i; 1834 int i;
2962 u32 csts = -1; 1835 u32 csts = -1;
2963 1836
2964 nvme_dev_list_remove(dev); 1837 nvme_dev_list_remove(dev);
2965 1838
1839 mutex_lock(&dev->shutdown_lock);
2966 if (dev->bar) { 1840 if (dev->bar) {
2967 nvme_freeze_queues(dev); 1841 nvme_stop_queues(&dev->ctrl);
2968 csts = readl(&dev->bar->csts); 1842 csts = readl(dev->bar + NVME_REG_CSTS);
2969 } 1843 }
2970 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 1844 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
2971 for (i = dev->queue_count - 1; i >= 0; i--) { 1845 for (i = dev->queue_count - 1; i >= 0; i--) {
@@ -2974,30 +1848,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
2974 } 1848 }
2975 } else { 1849 } else {
2976 nvme_disable_io_queues(dev); 1850 nvme_disable_io_queues(dev);
2977 nvme_shutdown_ctrl(dev); 1851 nvme_disable_admin_queue(dev, shutdown);
2978 nvme_disable_queue(dev, 0);
2979 } 1852 }
2980 nvme_dev_unmap(dev); 1853 nvme_dev_unmap(dev);
2981 1854
2982 for (i = dev->queue_count - 1; i >= 0; i--) 1855 for (i = dev->queue_count - 1; i >= 0; i--)
2983 nvme_clear_queue(dev->queues[i]); 1856 nvme_clear_queue(dev->queues[i]);
2984} 1857 mutex_unlock(&dev->shutdown_lock);
2985
2986static void nvme_dev_remove(struct nvme_dev *dev)
2987{
2988 struct nvme_ns *ns, *next;
2989
2990 if (nvme_io_incapable(dev)) {
2991 /*
2992 * If the device is not capable of IO (surprise hot-removal,
2993 * for example), we need to quiesce prior to deleting the
2994 * namespaces. This will end outstanding requests and prevent
2995 * attempts to sync dirty data.
2996 */
2997 nvme_dev_shutdown(dev);
2998 }
2999 list_for_each_entry_safe(ns, next, &dev->namespaces, list)
3000 nvme_ns_remove(ns);
3001} 1858}
3002 1859
3003static int nvme_setup_prp_pools(struct nvme_dev *dev) 1860static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -3023,119 +1880,36 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
3023 dma_pool_destroy(dev->prp_small_pool); 1880 dma_pool_destroy(dev->prp_small_pool);
3024} 1881}
3025 1882
3026static DEFINE_IDA(nvme_instance_ida); 1883static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
3027
3028static int nvme_set_instance(struct nvme_dev *dev)
3029{
3030 int instance, error;
3031
3032 do {
3033 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
3034 return -ENODEV;
3035
3036 spin_lock(&dev_list_lock);
3037 error = ida_get_new(&nvme_instance_ida, &instance);
3038 spin_unlock(&dev_list_lock);
3039 } while (error == -EAGAIN);
3040
3041 if (error)
3042 return -ENODEV;
3043
3044 dev->instance = instance;
3045 return 0;
3046}
3047
3048static void nvme_release_instance(struct nvme_dev *dev)
3049{ 1884{
3050 spin_lock(&dev_list_lock); 1885 struct nvme_dev *dev = to_nvme_dev(ctrl);
3051 ida_remove(&nvme_instance_ida, dev->instance);
3052 spin_unlock(&dev_list_lock);
3053}
3054
3055static void nvme_free_dev(struct kref *kref)
3056{
3057 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
3058 1886
3059 put_device(dev->dev); 1887 put_device(dev->dev);
3060 put_device(dev->device);
3061 nvme_release_instance(dev);
3062 if (dev->tagset.tags) 1888 if (dev->tagset.tags)
3063 blk_mq_free_tag_set(&dev->tagset); 1889 blk_mq_free_tag_set(&dev->tagset);
3064 if (dev->admin_q) 1890 if (dev->ctrl.admin_q)
3065 blk_put_queue(dev->admin_q); 1891 blk_put_queue(dev->ctrl.admin_q);
3066 kfree(dev->queues); 1892 kfree(dev->queues);
3067 kfree(dev->entry); 1893 kfree(dev->entry);
3068 kfree(dev); 1894 kfree(dev);
3069} 1895}
3070 1896
3071static int nvme_dev_open(struct inode *inode, struct file *f) 1897static void nvme_reset_work(struct work_struct *work)
3072{ 1898{
3073 struct nvme_dev *dev; 1899 struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
3074 int instance = iminor(inode); 1900 int result;
3075 int ret = -ENODEV;
3076
3077 spin_lock(&dev_list_lock);
3078 list_for_each_entry(dev, &dev_list, node) {
3079 if (dev->instance == instance) {
3080 if (!dev->admin_q) {
3081 ret = -EWOULDBLOCK;
3082 break;
3083 }
3084 if (!kref_get_unless_zero(&dev->kref))
3085 break;
3086 f->private_data = dev;
3087 ret = 0;
3088 break;
3089 }
3090 }
3091 spin_unlock(&dev_list_lock);
3092
3093 return ret;
3094}
3095 1901
3096static int nvme_dev_release(struct inode *inode, struct file *f) 1902 if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
3097{ 1903 goto out;
3098 struct nvme_dev *dev = f->private_data;
3099 kref_put(&dev->kref, nvme_free_dev);
3100 return 0;
3101}
3102 1904
3103static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1905 /*
3104{ 1906 * If we're called to reset a live controller first shut it down before
3105 struct nvme_dev *dev = f->private_data; 1907 * moving on.
3106 struct nvme_ns *ns; 1908 */
3107 1909 if (dev->bar)
3108 switch (cmd) { 1910 nvme_dev_disable(dev, false);
3109 case NVME_IOCTL_ADMIN_CMD:
3110 return nvme_user_cmd(dev, NULL, (void __user *)arg);
3111 case NVME_IOCTL_IO_CMD:
3112 if (list_empty(&dev->namespaces))
3113 return -ENOTTY;
3114 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
3115 return nvme_user_cmd(dev, ns, (void __user *)arg);
3116 case NVME_IOCTL_RESET:
3117 dev_warn(dev->dev, "resetting controller\n");
3118 return nvme_reset(dev);
3119 case NVME_IOCTL_SUBSYS_RESET:
3120 return nvme_subsys_reset(dev);
3121 default:
3122 return -ENOTTY;
3123 }
3124}
3125 1911
3126static const struct file_operations nvme_dev_fops = { 1912 set_bit(NVME_CTRL_RESETTING, &dev->flags);
3127 .owner = THIS_MODULE,
3128 .open = nvme_dev_open,
3129 .release = nvme_dev_release,
3130 .unlocked_ioctl = nvme_dev_ioctl,
3131 .compat_ioctl = nvme_dev_ioctl,
3132};
3133
3134static void nvme_probe_work(struct work_struct *work)
3135{
3136 struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
3137 bool start_thread = false;
3138 int result;
3139 1913
3140 result = nvme_dev_map(dev); 1914 result = nvme_dev_map(dev);
3141 if (result) 1915 if (result)
@@ -3145,35 +1919,24 @@ static void nvme_probe_work(struct work_struct *work)
3145 if (result) 1919 if (result)
3146 goto unmap; 1920 goto unmap;
3147 1921
3148 spin_lock(&dev_list_lock);
3149 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
3150 start_thread = true;
3151 nvme_thread = NULL;
3152 }
3153 list_add(&dev->node, &dev_list);
3154 spin_unlock(&dev_list_lock);
3155
3156 if (start_thread) {
3157 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
3158 wake_up_all(&nvme_kthread_wait);
3159 } else
3160 wait_event_killable(nvme_kthread_wait, nvme_thread);
3161
3162 if (IS_ERR_OR_NULL(nvme_thread)) {
3163 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
3164 goto disable;
3165 }
3166
3167 nvme_init_queue(dev->queues[0], 0); 1922 nvme_init_queue(dev->queues[0], 0);
3168 result = nvme_alloc_admin_tags(dev); 1923 result = nvme_alloc_admin_tags(dev);
3169 if (result) 1924 if (result)
3170 goto disable; 1925 goto disable;
3171 1926
1927 result = nvme_init_identify(&dev->ctrl);
1928 if (result)
1929 goto free_tags;
1930
3172 result = nvme_setup_io_queues(dev); 1931 result = nvme_setup_io_queues(dev);
3173 if (result) 1932 if (result)
3174 goto free_tags; 1933 goto free_tags;
3175 1934
3176 dev->event_limit = 1; 1935 dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
1936
1937 result = nvme_dev_list_add(dev);
1938 if (result)
1939 goto remove;
3177 1940
3178 /* 1941 /*
3179 * Keep the controller around but remove all namespaces if we don't have 1942 * Keep the controller around but remove all namespaces if we don't have
@@ -3181,117 +1944,98 @@ static void nvme_probe_work(struct work_struct *work)
3181 */ 1944 */
3182 if (dev->online_queues < 2) { 1945 if (dev->online_queues < 2) {
3183 dev_warn(dev->dev, "IO queues not created\n"); 1946 dev_warn(dev->dev, "IO queues not created\n");
3184 nvme_dev_remove(dev); 1947 nvme_remove_namespaces(&dev->ctrl);
3185 } else { 1948 } else {
3186 nvme_unfreeze_queues(dev); 1949 nvme_start_queues(&dev->ctrl);
3187 nvme_dev_add(dev); 1950 nvme_dev_add(dev);
3188 } 1951 }
3189 1952
1953 clear_bit(NVME_CTRL_RESETTING, &dev->flags);
3190 return; 1954 return;
3191 1955
1956 remove:
1957 nvme_dev_list_remove(dev);
3192 free_tags: 1958 free_tags:
3193 nvme_dev_remove_admin(dev); 1959 nvme_dev_remove_admin(dev);
3194 blk_put_queue(dev->admin_q); 1960 blk_put_queue(dev->ctrl.admin_q);
3195 dev->admin_q = NULL; 1961 dev->ctrl.admin_q = NULL;
3196 dev->queues[0]->tags = NULL; 1962 dev->queues[0]->tags = NULL;
3197 disable: 1963 disable:
3198 nvme_disable_queue(dev, 0); 1964 nvme_disable_admin_queue(dev, false);
3199 nvme_dev_list_remove(dev);
3200 unmap: 1965 unmap:
3201 nvme_dev_unmap(dev); 1966 nvme_dev_unmap(dev);
3202 out: 1967 out:
3203 if (!work_busy(&dev->reset_work)) 1968 nvme_remove_dead_ctrl(dev);
3204 nvme_dead_ctrl(dev);
3205} 1969}
3206 1970
3207static int nvme_remove_dead_ctrl(void *arg) 1971static void nvme_remove_dead_ctrl_work(struct work_struct *work)
3208{ 1972{
3209 struct nvme_dev *dev = (struct nvme_dev *)arg; 1973 struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
3210 struct pci_dev *pdev = to_pci_dev(dev->dev); 1974 struct pci_dev *pdev = to_pci_dev(dev->dev);
3211 1975
3212 if (pci_get_drvdata(pdev)) 1976 if (pci_get_drvdata(pdev))
3213 pci_stop_and_remove_bus_device_locked(pdev); 1977 pci_stop_and_remove_bus_device_locked(pdev);
3214 kref_put(&dev->kref, nvme_free_dev); 1978 nvme_put_ctrl(&dev->ctrl);
3215 return 0;
3216} 1979}
3217 1980
3218static void nvme_dead_ctrl(struct nvme_dev *dev) 1981static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
3219{ 1982{
3220 dev_warn(dev->dev, "Device failed to resume\n"); 1983 dev_warn(dev->dev, "Removing after probe failure\n");
3221 kref_get(&dev->kref); 1984 kref_get(&dev->ctrl.kref);
3222 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 1985 if (!schedule_work(&dev->remove_work))
3223 dev->instance))) { 1986 nvme_put_ctrl(&dev->ctrl);
3224 dev_err(dev->dev,
3225 "Failed to start controller remove task\n");
3226 kref_put(&dev->kref, nvme_free_dev);
3227 }
3228} 1987}
3229 1988
3230static void nvme_reset_work(struct work_struct *ws) 1989static int nvme_reset(struct nvme_dev *dev)
3231{ 1990{
3232 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 1991 if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
3233 bool in_probe = work_busy(&dev->probe_work); 1992 return -ENODEV;
3234
3235 nvme_dev_shutdown(dev);
3236 1993
3237 /* Synchronize with device probe so that work will see failure status 1994 if (!queue_work(nvme_workq, &dev->reset_work))
3238 * and exit gracefully without trying to schedule another reset */ 1995 return -EBUSY;
3239 flush_work(&dev->probe_work);
3240 1996
3241 /* Fail this device if reset occurred during probe to avoid 1997 flush_work(&dev->reset_work);
3242 * infinite initialization loops. */ 1998 return 0;
3243 if (in_probe) {
3244 nvme_dead_ctrl(dev);
3245 return;
3246 }
3247 /* Schedule device resume asynchronously so the reset work is available
3248 * to cleanup errors that may occur during reinitialization */
3249 schedule_work(&dev->probe_work);
3250} 1999}
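The replacement reset path above turns the old synchronous flow into work queued on nvme_workq: nvme_reset() bails out with -ENODEV when the admin queue is gone, returns -EBUSY if a reset is already pending, and otherwise queues the work and flushes it. A minimal userspace model of that contract, with invented names and a single boolean standing in for the work_struct pending state (not the driver's code):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_dev {
    bool admin_q_alive;
    bool reset_queued;      /* models the pending state of reset_work */
};

static void reset_work(struct fake_dev *dev)
{
    printf("reset work runs\n");
    dev->reset_queued = false;
}

static bool queue_reset(struct fake_dev *dev)
{
    if (dev->reset_queued)
        return false;       /* already pending, like queue_work() returning false */
    dev->reset_queued = true;
    return true;
}

static void flush_reset(struct fake_dev *dev)
{
    if (dev->reset_queued)
        reset_work(dev);    /* stands in for flush_work() waiting for completion */
}

static int do_reset(struct fake_dev *dev)
{
    if (!dev->admin_q_alive)
        return -ENODEV;
    if (!queue_reset(dev))
        return -EBUSY;
    flush_reset(dev);
    return 0;
}

int main(void)
{
    struct fake_dev dev = { .admin_q_alive = true };

    printf("first reset: %d\n", do_reset(&dev));
    dev.admin_q_alive = false;
    printf("dead admin queue: %d\n", do_reset(&dev));
    return 0;
}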
3251 2000
3252static int __nvme_reset(struct nvme_dev *dev) 2001static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
3253{ 2002{
3254 if (work_pending(&dev->reset_work)) 2003 *val = readl(to_nvme_dev(ctrl)->bar + off);
3255 return -EBUSY;
3256 list_del_init(&dev->node);
3257 queue_work(nvme_workq, &dev->reset_work);
3258 return 0; 2004 return 0;
3259} 2005}
3260 2006
3261static int nvme_reset(struct nvme_dev *dev) 2007static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
3262{ 2008{
3263 int ret; 2009 writel(val, to_nvme_dev(ctrl)->bar + off);
3264 2010 return 0;
3265 if (!dev->admin_q || blk_queue_dying(dev->admin_q)) 2011}
3266 return -ENODEV;
3267
3268 spin_lock(&dev_list_lock);
3269 ret = __nvme_reset(dev);
3270 spin_unlock(&dev_list_lock);
3271
3272 if (!ret) {
3273 flush_work(&dev->reset_work);
3274 flush_work(&dev->probe_work);
3275 return 0;
3276 }
3277 2012
3278 return ret; 2013static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
2014{
2015 *val = readq(to_nvme_dev(ctrl)->bar + off);
2016 return 0;
3279} 2017}
3280 2018
3281static ssize_t nvme_sysfs_reset(struct device *dev, 2019static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
3282 struct device_attribute *attr, const char *buf,
3283 size_t count)
3284{ 2020{
3285 struct nvme_dev *ndev = dev_get_drvdata(dev); 2021 struct nvme_dev *dev = to_nvme_dev(ctrl);
3286 int ret;
3287 2022
3288 ret = nvme_reset(ndev); 2023 return !dev->bar || dev->online_queues < 2;
3289 if (ret < 0) 2024}
3290 return ret;
3291 2025
3292 return count; 2026static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
2027{
2028 return nvme_reset(to_nvme_dev(ctrl));
3293} 2029}
3294static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 2030
2031static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
2032 .reg_read32 = nvme_pci_reg_read32,
2033 .reg_write32 = nvme_pci_reg_write32,
2034 .reg_read64 = nvme_pci_reg_read64,
2035 .io_incapable = nvme_pci_io_incapable,
2036 .reset_ctrl = nvme_pci_reset_ctrl,
2037 .free_ctrl = nvme_pci_free_ctrl,
2038};
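nvme_pci_ctrl_ops above is the PCI transport's implementation of the new nvme_ctrl_ops interface: the core no longer touches the BAR directly but goes through reg_read32/reg_write32/reg_read64 callbacks. A rough standalone illustration of the same ops-table idea, with an in-memory array standing in for the BAR and invented names (the NVMe version register sits at offset 0x08, major version in bits 31:16, minor in bits 15:8):

#include <stdint.h>
#include <stdio.h>

struct demo_ctrl_ops {
    int (*reg_read32)(void *priv, uint32_t off, uint32_t *val);
    int (*reg_write32)(void *priv, uint32_t off, uint32_t val);
};

static int mem_reg_read32(void *priv, uint32_t off, uint32_t *val)
{
    *val = ((uint32_t *)priv)[off / 4];
    return 0;
}

static int mem_reg_write32(void *priv, uint32_t off, uint32_t val)
{
    ((uint32_t *)priv)[off / 4] = val;
    return 0;
}

static const struct demo_ctrl_ops mem_ops = {
    .reg_read32  = mem_reg_read32,
    .reg_write32 = mem_reg_write32,
};

int main(void)
{
    uint32_t bar[16] = { [2] = 0x00010200 };    /* pretend version register at 0x08 */
    uint32_t vs;

    mem_ops.reg_read32(bar, 0x08, &vs);
    printf("controller version %u.%u\n",
           (unsigned)((vs >> 16) & 0xffff), (unsigned)((vs >> 8) & 0xff));
    return 0;
}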
3295 2039
3296static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2040static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
3297{ 2041{
@@ -3314,46 +2058,30 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
3314 if (!dev->queues) 2058 if (!dev->queues)
3315 goto free; 2059 goto free;
3316 2060
3317 INIT_LIST_HEAD(&dev->namespaces);
3318 INIT_WORK(&dev->reset_work, nvme_reset_work);
3319 dev->dev = get_device(&pdev->dev); 2061 dev->dev = get_device(&pdev->dev);
3320 pci_set_drvdata(pdev, dev); 2062 pci_set_drvdata(pdev, dev);
3321 result = nvme_set_instance(dev); 2063
3322 if (result) 2064 INIT_LIST_HEAD(&dev->node);
3323 goto put_pci; 2065 INIT_WORK(&dev->scan_work, nvme_dev_scan);
2066 INIT_WORK(&dev->reset_work, nvme_reset_work);
2067 INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
2068 mutex_init(&dev->shutdown_lock);
2069 init_completion(&dev->ioq_wait);
3324 2070
3325 result = nvme_setup_prp_pools(dev); 2071 result = nvme_setup_prp_pools(dev);
3326 if (result) 2072 if (result)
3327 goto release; 2073 goto put_pci;
3328
3329 kref_init(&dev->kref);
3330 dev->device = device_create(nvme_class, &pdev->dev,
3331 MKDEV(nvme_char_major, dev->instance),
3332 dev, "nvme%d", dev->instance);
3333 if (IS_ERR(dev->device)) {
3334 result = PTR_ERR(dev->device);
3335 goto release_pools;
3336 }
3337 get_device(dev->device);
3338 dev_set_drvdata(dev->device, dev);
3339 2074
3340 result = device_create_file(dev->device, &dev_attr_reset_controller); 2075 result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
2076 id->driver_data);
3341 if (result) 2077 if (result)
3342 goto put_dev; 2078 goto release_pools;
3343 2079
3344 INIT_LIST_HEAD(&dev->node); 2080 queue_work(nvme_workq, &dev->reset_work);
3345 INIT_WORK(&dev->scan_work, nvme_dev_scan);
3346 INIT_WORK(&dev->probe_work, nvme_probe_work);
3347 schedule_work(&dev->probe_work);
3348 return 0; 2081 return 0;
3349 2082
3350 put_dev:
3351 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
3352 put_device(dev->device);
3353 release_pools: 2083 release_pools:
3354 nvme_release_prp_pools(dev); 2084 nvme_release_prp_pools(dev);
3355 release:
3356 nvme_release_instance(dev);
3357 put_pci: 2085 put_pci:
3358 put_device(dev->dev); 2086 put_device(dev->dev);
3359 free: 2087 free:
@@ -3368,15 +2096,15 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
3368 struct nvme_dev *dev = pci_get_drvdata(pdev); 2096 struct nvme_dev *dev = pci_get_drvdata(pdev);
3369 2097
3370 if (prepare) 2098 if (prepare)
3371 nvme_dev_shutdown(dev); 2099 nvme_dev_disable(dev, false);
3372 else 2100 else
3373 schedule_work(&dev->probe_work); 2101 queue_work(nvme_workq, &dev->reset_work);
3374} 2102}
3375 2103
3376static void nvme_shutdown(struct pci_dev *pdev) 2104static void nvme_shutdown(struct pci_dev *pdev)
3377{ 2105{
3378 struct nvme_dev *dev = pci_get_drvdata(pdev); 2106 struct nvme_dev *dev = pci_get_drvdata(pdev);
3379 nvme_dev_shutdown(dev); 2107 nvme_dev_disable(dev, true);
3380} 2108}
3381 2109
3382static void nvme_remove(struct pci_dev *pdev) 2110static void nvme_remove(struct pci_dev *pdev)
@@ -3388,34 +2116,25 @@ static void nvme_remove(struct pci_dev *pdev)
3388 spin_unlock(&dev_list_lock); 2116 spin_unlock(&dev_list_lock);
3389 2117
3390 pci_set_drvdata(pdev, NULL); 2118 pci_set_drvdata(pdev, NULL);
3391 flush_work(&dev->probe_work);
3392 flush_work(&dev->reset_work); 2119 flush_work(&dev->reset_work);
3393 flush_work(&dev->scan_work); 2120 flush_work(&dev->scan_work);
3394 device_remove_file(dev->device, &dev_attr_reset_controller); 2121 nvme_remove_namespaces(&dev->ctrl);
3395 nvme_dev_remove(dev); 2122 nvme_uninit_ctrl(&dev->ctrl);
3396 nvme_dev_shutdown(dev); 2123 nvme_dev_disable(dev, true);
3397 nvme_dev_remove_admin(dev); 2124 nvme_dev_remove_admin(dev);
3398 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
3399 nvme_free_queues(dev, 0); 2125 nvme_free_queues(dev, 0);
3400 nvme_release_cmb(dev); 2126 nvme_release_cmb(dev);
3401 nvme_release_prp_pools(dev); 2127 nvme_release_prp_pools(dev);
3402 kref_put(&dev->kref, nvme_free_dev); 2128 nvme_put_ctrl(&dev->ctrl);
3403} 2129}
3404 2130
3405/* These functions are yet to be implemented */
3406#define nvme_error_detected NULL
3407#define nvme_dump_registers NULL
3408#define nvme_link_reset NULL
3409#define nvme_slot_reset NULL
3410#define nvme_error_resume NULL
3411
3412#ifdef CONFIG_PM_SLEEP 2131#ifdef CONFIG_PM_SLEEP
3413static int nvme_suspend(struct device *dev) 2132static int nvme_suspend(struct device *dev)
3414{ 2133{
3415 struct pci_dev *pdev = to_pci_dev(dev); 2134 struct pci_dev *pdev = to_pci_dev(dev);
3416 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2135 struct nvme_dev *ndev = pci_get_drvdata(pdev);
3417 2136
3418 nvme_dev_shutdown(ndev); 2137 nvme_dev_disable(ndev, true);
3419 return 0; 2138 return 0;
3420} 2139}
3421 2140
@@ -3424,17 +2143,53 @@ static int nvme_resume(struct device *dev)
3424 struct pci_dev *pdev = to_pci_dev(dev); 2143 struct pci_dev *pdev = to_pci_dev(dev);
3425 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2144 struct nvme_dev *ndev = pci_get_drvdata(pdev);
3426 2145
3427 schedule_work(&ndev->probe_work); 2146 queue_work(nvme_workq, &ndev->reset_work);
3428 return 0; 2147 return 0;
3429} 2148}
3430#endif 2149#endif
3431 2150
3432static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 2151static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
3433 2152
2153static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
2154 pci_channel_state_t state)
2155{
2156 struct nvme_dev *dev = pci_get_drvdata(pdev);
2157
2158 /*
2159 * A frozen channel requires a reset. When detected, this method will
2160 * shutdown the controller to quiesce. The controller will be restarted
2161 * after the slot reset through driver's slot_reset callback.
2162 */
2163 dev_warn(&pdev->dev, "error detected: state:%d\n", state);
2164 switch (state) {
2165 case pci_channel_io_normal:
2166 return PCI_ERS_RESULT_CAN_RECOVER;
2167 case pci_channel_io_frozen:
2168 nvme_dev_disable(dev, false);
2169 return PCI_ERS_RESULT_NEED_RESET;
2170 case pci_channel_io_perm_failure:
2171 return PCI_ERS_RESULT_DISCONNECT;
2172 }
2173 return PCI_ERS_RESULT_NEED_RESET;
2174}
2175
2176static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
2177{
2178 struct nvme_dev *dev = pci_get_drvdata(pdev);
2179
2180 dev_info(&pdev->dev, "restart after slot reset\n");
2181 pci_restore_state(pdev);
2182 queue_work(nvme_workq, &dev->reset_work);
2183 return PCI_ERS_RESULT_RECOVERED;
2184}
2185
2186static void nvme_error_resume(struct pci_dev *pdev)
2187{
2188 pci_cleanup_aer_uncorrect_error_status(pdev);
2189}
2190
3434static const struct pci_error_handlers nvme_err_handler = { 2191static const struct pci_error_handlers nvme_err_handler = {
3435 .error_detected = nvme_error_detected, 2192 .error_detected = nvme_error_detected,
3436 .mmio_enabled = nvme_dump_registers,
3437 .link_reset = nvme_link_reset,
3438 .slot_reset = nvme_slot_reset, 2193 .slot_reset = nvme_slot_reset,
3439 .resume = nvme_error_resume, 2194 .resume = nvme_error_resume,
3440 .reset_notify = nvme_reset_notify, 2195 .reset_notify = nvme_reset_notify,
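The new AER callbacks above replace the former NULL stubs. nvme_error_detected() maps the reported channel state to a recovery action, disabling the controller before asking for a slot reset when the channel is frozen. A small sketch of that decision table, using made-up enum names in place of the PCI core's types:

#include <stdio.h>

enum channel_state { IO_NORMAL, IO_FROZEN, IO_PERM_FAILURE };
enum ers_result { CAN_RECOVER, NEED_RESET, DISCONNECT };

static enum ers_result error_detected(enum channel_state state)
{
    switch (state) {
    case IO_NORMAL:
        return CAN_RECOVER;
    case IO_FROZEN:
        /* the real handler calls nvme_dev_disable(dev, false) here */
        return NEED_RESET;
    case IO_PERM_FAILURE:
        return DISCONNECT;
    }
    return NEED_RESET;
}

int main(void)
{
    static const char *names[] = { "recover", "reset", "disconnect" };

    printf("frozen channel -> %s\n", names[error_detected(IO_FROZEN)]);
    return 0;
}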
@@ -3444,6 +2199,10 @@ static const struct pci_error_handlers nvme_err_handler = {
3444#define PCI_CLASS_STORAGE_EXPRESS 0x010802 2199#define PCI_CLASS_STORAGE_EXPRESS 0x010802
3445 2200
3446static const struct pci_device_id nvme_id_table[] = { 2201static const struct pci_device_id nvme_id_table[] = {
2202 { PCI_VDEVICE(INTEL, 0x0953),
2203 .driver_data = NVME_QUIRK_STRIPE_SIZE, },
2204 { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
2205 .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
3447 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 2206 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3448 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, 2207 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
3449 { 0, } 2208 { 0, }
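The new nvme_id_table entries attach per-device quirks through driver_data; NVME_QUIRK_STRIPE_SIZE and NVME_QUIRK_IDENTIFY_CNS are defined in nvme.h, which is not shown in this hunk. An illustrative lookup over such a table, with invented flag values:

#include <stdint.h>
#include <stdio.h>

#define QUIRK_STRIPE_SIZE   (1 << 0)    /* placeholder values */
#define QUIRK_IDENTIFY_CNS  (1 << 1)

struct id_entry {
    uint16_t vendor, device;
    unsigned long quirks;
};

static const struct id_entry id_table[] = {
    { 0x8086, 0x0953, QUIRK_STRIPE_SIZE },   /* Intel */
    { 0x8086, 0x5845, QUIRK_IDENTIFY_CNS },  /* QEMU emulated controller */
    { 0, 0, 0 },
};

static unsigned long lookup_quirks(uint16_t vendor, uint16_t device)
{
    const struct id_entry *e;

    for (e = id_table; e->vendor; e++)
        if (e->vendor == vendor && e->device == device)
            return e->quirks;
    return 0;
}

int main(void)
{
    printf("quirks for 8086:0953 = %#lx\n", lookup_quirks(0x8086, 0x0953));
    return 0;
}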
@@ -3468,40 +2227,21 @@ static int __init nvme_init(void)
3468 2227
3469 init_waitqueue_head(&nvme_kthread_wait); 2228 init_waitqueue_head(&nvme_kthread_wait);
3470 2229
3471 nvme_workq = create_singlethread_workqueue("nvme"); 2230 nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
3472 if (!nvme_workq) 2231 if (!nvme_workq)
3473 return -ENOMEM; 2232 return -ENOMEM;
3474 2233
3475 result = register_blkdev(nvme_major, "nvme"); 2234 result = nvme_core_init();
3476 if (result < 0) 2235 if (result < 0)
3477 goto kill_workq; 2236 goto kill_workq;
3478 else if (result > 0)
3479 nvme_major = result;
3480
3481 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
3482 &nvme_dev_fops);
3483 if (result < 0)
3484 goto unregister_blkdev;
3485 else if (result > 0)
3486 nvme_char_major = result;
3487
3488 nvme_class = class_create(THIS_MODULE, "nvme");
3489 if (IS_ERR(nvme_class)) {
3490 result = PTR_ERR(nvme_class);
3491 goto unregister_chrdev;
3492 }
3493 2237
3494 result = pci_register_driver(&nvme_driver); 2238 result = pci_register_driver(&nvme_driver);
3495 if (result) 2239 if (result)
3496 goto destroy_class; 2240 goto core_exit;
3497 return 0; 2241 return 0;
3498 2242
3499 destroy_class: 2243 core_exit:
3500 class_destroy(nvme_class); 2244 nvme_core_exit();
3501 unregister_chrdev:
3502 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
3503 unregister_blkdev:
3504 unregister_blkdev(nvme_major, "nvme");
3505 kill_workq: 2245 kill_workq:
3506 destroy_workqueue(nvme_workq); 2246 destroy_workqueue(nvme_workq);
3507 return result; 2247 return result;
@@ -3510,10 +2250,8 @@ static int __init nvme_init(void)
3510static void __exit nvme_exit(void) 2250static void __exit nvme_exit(void)
3511{ 2251{
3512 pci_unregister_driver(&nvme_driver); 2252 pci_unregister_driver(&nvme_driver);
3513 unregister_blkdev(nvme_major, "nvme"); 2253 nvme_core_exit();
3514 destroy_workqueue(nvme_workq); 2254 destroy_workqueue(nvme_workq);
3515 class_destroy(nvme_class);
3516 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
3517 BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); 2255 BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
3518 _nvme_check_size(); 2256 _nvme_check_size();
3519} 2257}
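nvme_init() above keeps the usual goto-based unwind: each failing step jumps to a label that undoes only the steps that already succeeded, in reverse order (core_exit, then kill_workq). A compact userspace sketch of the same pattern with placeholder setup and teardown functions:

#include <stdio.h>
#include <stdlib.h>

static void *setup_workqueue(void) { return malloc(1); }
static int   setup_core(void)      { return 0; }
static int   register_driver(void) { return -1; }   /* force the unwind path */
static void  teardown_core(void)   { puts("core teardown"); }
static void  destroy_wq(void *wq)  { puts("destroy workqueue"); free(wq); }

static int demo_init(void)
{
    void *wq;
    int result;

    wq = setup_workqueue();
    if (!wq)
        return -1;

    result = setup_core();
    if (result < 0)
        goto kill_workq;

    result = register_driver();
    if (result)
        goto core_exit;
    return 0;

core_exit:
    teardown_core();
kill_workq:
    destroy_wq(wq);
    return result;
}

int main(void)
{
    printf("demo_init() = %d\n", demo_init());
    return 0;
}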
diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c
index c3d8d3887a31..e947e298a737 100644
--- a/drivers/nvme/host/scsi.c
+++ b/drivers/nvme/host/scsi.c
@@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
524 struct sg_io_hdr *hdr, u8 *inq_response, 524 struct sg_io_hdr *hdr, u8 *inq_response,
525 int alloc_len) 525 int alloc_len)
526{ 526{
527 struct nvme_dev *dev = ns->dev; 527 struct nvme_ctrl *ctrl = ns->ctrl;
528 struct nvme_id_ns *id_ns; 528 struct nvme_id_ns *id_ns;
529 int res; 529 int res;
530 int nvme_sc; 530 int nvme_sc;
@@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
532 u8 resp_data_format = 0x02; 532 u8 resp_data_format = 0x02;
533 u8 protect; 533 u8 protect;
534 u8 cmdque = 0x01 << 1; 534 u8 cmdque = 0x01 << 1;
535 u8 fw_offset = sizeof(dev->firmware_rev); 535 u8 fw_offset = sizeof(ctrl->firmware_rev);
536 536
537 /* nvme ns identify - use DPS value for PROTECT field */ 537 /* nvme ns identify - use DPS value for PROTECT field */
538 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 538 nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns);
539 res = nvme_trans_status_code(hdr, nvme_sc); 539 res = nvme_trans_status_code(hdr, nvme_sc);
540 if (res) 540 if (res)
541 return res; 541 return res;
@@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
553 inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ 553 inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */
554 inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ 554 inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */
555 strncpy(&inq_response[8], "NVMe ", 8); 555 strncpy(&inq_response[8], "NVMe ", 8);
556 strncpy(&inq_response[16], dev->model, 16); 556 strncpy(&inq_response[16], ctrl->model, 16);
557 557
558 while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) 558 while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4)
559 fw_offset--; 559 fw_offset--;
560 fw_offset -= 4; 560 fw_offset -= 4;
561 strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); 561 strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4);
562 562
563 xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); 563 xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
564 return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); 564 return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
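The standard INQUIRY translation above copies the controller's model and firmware revision into the response; the firmware field is space padded, so trailing blanks are stripped before the last four characters become the 4-byte SCSI product revision. A standalone sketch of that trimming, assuming the 8-byte FR field from the identify data:

#include <stdio.h>
#include <string.h>

static void fill_product_rev(char out[4], const char fw[8])
{
    size_t off = 8;

    while (off > 4 && fw[off - 1] == ' ')
        off--;          /* drop trailing padding, keep at least 4 chars */
    off -= 4;
    memcpy(out, fw + off, 4);
}

int main(void)
{
    char rev[5] = "";
    const char fw_long[8]  = { '8', 'D', 'V', '1', '0', '1', '3', '1' };
    const char fw_short[8] = { '1', '.', '0', ' ', ' ', ' ', ' ', ' ' };

    fill_product_rev(rev, fw_long);
    printf("long rev  -> \"%.4s\"\n", rev);
    fill_product_rev(rev, fw_short);
    printf("short rev -> \"%.4s\"\n", rev);
    return 0;
}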
@@ -588,82 +588,113 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
588 struct sg_io_hdr *hdr, u8 *inq_response, 588 struct sg_io_hdr *hdr, u8 *inq_response,
589 int alloc_len) 589 int alloc_len)
590{ 590{
591 struct nvme_dev *dev = ns->dev;
592 int xfer_len; 591 int xfer_len;
593 592
594 memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); 593 memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
595 inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ 594 inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */
596 inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ 595 inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */
597 strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); 596 strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH);
598 597
599 xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); 598 xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
600 return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); 599 return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
601} 600}
602 601
603static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, 602static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr,
604 u8 *inq_response, int alloc_len) 603 u8 *inq_response, int alloc_len)
605{ 604{
606 struct nvme_dev *dev = ns->dev; 605 struct nvme_id_ns *id_ns;
607 int res; 606 int nvme_sc, res;
608 int nvme_sc; 607 size_t len;
609 int xfer_len; 608 void *eui;
610 __be32 tmp_id = cpu_to_be32(ns->ns_id);
611 609
612 memset(inq_response, 0, alloc_len); 610 nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
613 inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ 611 res = nvme_trans_status_code(hdr, nvme_sc);
614 if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { 612 if (res)
615 struct nvme_id_ns *id_ns; 613 return res;
616 void *eui;
617 int len;
618 614
619 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 615 eui = id_ns->eui64;
620 res = nvme_trans_status_code(hdr, nvme_sc); 616 len = sizeof(id_ns->eui64);
621 if (res)
622 return res;
623 617
624 eui = id_ns->eui64; 618 if (ns->ctrl->vs >= NVME_VS(1, 2)) {
625 len = sizeof(id_ns->eui64);
626 if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
627 if (bitmap_empty(eui, len * 8)) {
628 eui = id_ns->nguid;
629 len = sizeof(id_ns->nguid);
630 }
631 }
632 if (bitmap_empty(eui, len * 8)) { 619 if (bitmap_empty(eui, len * 8)) {
633 kfree(id_ns); 620 eui = id_ns->nguid;
634 goto scsi_string; 621 len = sizeof(id_ns->nguid);
635 } 622 }
623 }
636 624
637 inq_response[3] = 4 + len; /* Page Length */ 625 if (bitmap_empty(eui, len * 8)) {
638 /* Designation Descriptor start */ 626 res = -EOPNOTSUPP;
639 inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ 627 goto out_free_id;
640 inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */
641 inq_response[6] = 0x00; /* Rsvd */
642 inq_response[7] = len; /* Designator Length */
643 memcpy(&inq_response[8], eui, len);
644 kfree(id_ns);
645 } else {
646 scsi_string:
647 if (alloc_len < 72) {
648 return nvme_trans_completion(hdr,
649 SAM_STAT_CHECK_CONDITION,
650 ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
651 SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
652 }
653 inq_response[3] = 0x48; /* Page Length */
654 /* Designation Descriptor start */
655 inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */
656 inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */
657 inq_response[6] = 0x00; /* Rsvd */
658 inq_response[7] = 0x44; /* Designator Length */
659
660 sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor);
661 memcpy(&inq_response[12], dev->model, sizeof(dev->model));
662 sprintf(&inq_response[52], "%04x", tmp_id);
663 memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
664 } 628 }
665 xfer_len = alloc_len; 629
666 return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); 630 memset(inq_response, 0, alloc_len);
631 inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;
632 inq_response[3] = 4 + len; /* Page Length */
633
634 /* Designation Descriptor start */
635 inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */
636 inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */
637 inq_response[6] = 0x00; /* Rsvd */
638 inq_response[7] = len; /* Designator Length */
639 memcpy(&inq_response[8], eui, len);
640
641 res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len);
642out_free_id:
643 kfree(id_ns);
644 return res;
645}
646
647static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns,
648 struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len)
649{
650 struct nvme_ctrl *ctrl = ns->ctrl;
651 struct nvme_id_ctrl *id_ctrl;
652 int nvme_sc, res;
653
654 if (alloc_len < 72) {
655 return nvme_trans_completion(hdr,
656 SAM_STAT_CHECK_CONDITION,
657 ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
658 SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
659 }
660
661 nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl);
662 res = nvme_trans_status_code(hdr, nvme_sc);
663 if (res)
664 return res;
665
666 memset(inq_response, 0, alloc_len);
667 inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;
668 inq_response[3] = 0x48; /* Page Length */
669
670 /* Designation Descriptor start */
671 inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */
672 inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */
673 inq_response[6] = 0x00; /* Rsvd */
674 inq_response[7] = 0x44; /* Designator Length */
675
676 sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid));
677 memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model));
678 sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id));
679 memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial));
680
681 res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len);
682 kfree(id_ctrl);
683 return res;
684}
685
686static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
687 u8 *resp, int alloc_len)
688{
689 int res;
690
691 if (ns->ctrl->vs >= NVME_VS(1, 1)) {
692 res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len);
693 if (res != -EOPNOTSUPP)
694 return res;
695 }
696
697 return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len);
667} 698}
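The rewritten device identification page splits the old function in two: nvme_fill_device_id_eui64() prefers the namespace EUI-64, falls back to the NGUID on NVMe 1.2+ when the EUI-64 is all zeroes, and returns -EOPNOTSUPP so nvme_trans_device_id_page() can fall back to the SCSI name string built from the controller identify data. A small sketch of just that selection logic (buffer sizes and names are illustrative):

#include <stdio.h>
#include <string.h>

static int all_zero(const unsigned char *p, size_t len)
{
    while (len--)
        if (*p++)
            return 0;
    return 1;
}

static const char *pick_designator(const unsigned char eui64[8],
                                   const unsigned char nguid[16],
                                   int nvme_1_2,
                                   const unsigned char **id, size_t *len)
{
    *id = eui64;
    *len = 8;

    if (nvme_1_2 && all_zero(*id, *len)) {
        *id = nguid;
        *len = 16;
    }
    if (all_zero(*id, *len))
        return "scsi name string";  /* the -EOPNOTSUPP fallback above */
    return *len == 8 ? "eui64" : "nguid";
}

int main(void)
{
    unsigned char eui[8] = { 0 }, nguid[16] = { 0 };
    const unsigned char *id;
    size_t len;

    nguid[15] = 0x42;
    printf("chosen designator: %s\n", pick_designator(eui, nguid, 1, &id, &len));
    return 0;
}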
668 699
669static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, 700static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
@@ -672,7 +703,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
672 u8 *inq_response; 703 u8 *inq_response;
673 int res; 704 int res;
674 int nvme_sc; 705 int nvme_sc;
675 struct nvme_dev *dev = ns->dev; 706 struct nvme_ctrl *ctrl = ns->ctrl;
676 struct nvme_id_ctrl *id_ctrl; 707 struct nvme_id_ctrl *id_ctrl;
677 struct nvme_id_ns *id_ns; 708 struct nvme_id_ns *id_ns;
678 int xfer_len; 709 int xfer_len;
@@ -688,7 +719,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
688 if (inq_response == NULL) 719 if (inq_response == NULL)
689 return -ENOMEM; 720 return -ENOMEM;
690 721
691 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 722 nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns);
692 res = nvme_trans_status_code(hdr, nvme_sc); 723 res = nvme_trans_status_code(hdr, nvme_sc);
693 if (res) 724 if (res)
694 goto out_free_inq; 725 goto out_free_inq;
@@ -704,7 +735,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
704 app_chk = protect << 1; 735 app_chk = protect << 1;
705 ref_chk = protect; 736 ref_chk = protect;
706 737
707 nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); 738 nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl);
708 res = nvme_trans_status_code(hdr, nvme_sc); 739 res = nvme_trans_status_code(hdr, nvme_sc);
709 if (res) 740 if (res)
710 goto out_free_inq; 741 goto out_free_inq;
@@ -815,7 +846,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
815 int res; 846 int res;
816 int xfer_len; 847 int xfer_len;
817 u8 *log_response; 848 u8 *log_response;
818 struct nvme_dev *dev = ns->dev;
819 struct nvme_smart_log *smart_log; 849 struct nvme_smart_log *smart_log;
820 u8 temp_c; 850 u8 temp_c;
821 u16 temp_k; 851 u16 temp_k;
@@ -824,7 +854,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
824 if (log_response == NULL) 854 if (log_response == NULL)
825 return -ENOMEM; 855 return -ENOMEM;
826 856
827 res = nvme_get_log_page(dev, &smart_log); 857 res = nvme_get_log_page(ns->ctrl, &smart_log);
828 if (res < 0) 858 if (res < 0)
829 goto out_free_response; 859 goto out_free_response;
830 860
@@ -862,7 +892,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
862 int res; 892 int res;
863 int xfer_len; 893 int xfer_len;
864 u8 *log_response; 894 u8 *log_response;
865 struct nvme_dev *dev = ns->dev;
866 struct nvme_smart_log *smart_log; 895 struct nvme_smart_log *smart_log;
867 u32 feature_resp; 896 u32 feature_resp;
868 u8 temp_c_cur, temp_c_thresh; 897 u8 temp_c_cur, temp_c_thresh;
@@ -872,7 +901,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
872 if (log_response == NULL) 901 if (log_response == NULL)
873 return -ENOMEM; 902 return -ENOMEM;
874 903
875 res = nvme_get_log_page(dev, &smart_log); 904 res = nvme_get_log_page(ns->ctrl, &smart_log);
876 if (res < 0) 905 if (res < 0)
877 goto out_free_response; 906 goto out_free_response;
878 907
@@ -886,7 +915,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
886 kfree(smart_log); 915 kfree(smart_log);
887 916
888 /* Get Features for Temp Threshold */ 917 /* Get Features for Temp Threshold */
889 res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, 918 res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0,
890 &feature_resp); 919 &feature_resp);
891 if (res != NVME_SC_SUCCESS) 920 if (res != NVME_SC_SUCCESS)
892 temp_c_thresh = LOG_TEMP_UNKNOWN; 921 temp_c_thresh = LOG_TEMP_UNKNOWN;
@@ -948,7 +977,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
948{ 977{
949 int res; 978 int res;
950 int nvme_sc; 979 int nvme_sc;
951 struct nvme_dev *dev = ns->dev;
952 struct nvme_id_ns *id_ns; 980 struct nvme_id_ns *id_ns;
953 u8 flbas; 981 u8 flbas;
954 u32 lba_length; 982 u32 lba_length;
@@ -958,7 +986,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
958 else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) 986 else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
959 return -EINVAL; 987 return -EINVAL;
960 988
961 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 989 nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
962 res = nvme_trans_status_code(hdr, nvme_sc); 990 res = nvme_trans_status_code(hdr, nvme_sc);
963 if (res) 991 if (res)
964 return res; 992 return res;
@@ -1014,14 +1042,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
1014{ 1042{
1015 int res = 0; 1043 int res = 0;
1016 int nvme_sc; 1044 int nvme_sc;
1017 struct nvme_dev *dev = ns->dev;
1018 u32 feature_resp; 1045 u32 feature_resp;
1019 u8 vwc; 1046 u8 vwc;
1020 1047
1021 if (len < MODE_PAGE_CACHING_LEN) 1048 if (len < MODE_PAGE_CACHING_LEN)
1022 return -EINVAL; 1049 return -EINVAL;
1023 1050
1024 nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, 1051 nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0,
1025 &feature_resp); 1052 &feature_resp);
1026 res = nvme_trans_status_code(hdr, nvme_sc); 1053 res = nvme_trans_status_code(hdr, nvme_sc);
1027 if (res) 1054 if (res)
@@ -1207,12 +1234,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1207{ 1234{
1208 int res; 1235 int res;
1209 int nvme_sc; 1236 int nvme_sc;
1210 struct nvme_dev *dev = ns->dev;
1211 struct nvme_id_ctrl *id_ctrl; 1237 struct nvme_id_ctrl *id_ctrl;
1212 int lowest_pow_st; /* max npss = lowest power consumption */ 1238 int lowest_pow_st; /* max npss = lowest power consumption */
1213 unsigned ps_desired = 0; 1239 unsigned ps_desired = 0;
1214 1240
1215 nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); 1241 nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl);
1216 res = nvme_trans_status_code(hdr, nvme_sc); 1242 res = nvme_trans_status_code(hdr, nvme_sc);
1217 if (res) 1243 if (res)
1218 return res; 1244 return res;
@@ -1256,7 +1282,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1256 SCSI_ASCQ_CAUSE_NOT_REPORTABLE); 1282 SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
1257 break; 1283 break;
1258 } 1284 }
1259 nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, 1285 nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0,
1260 NULL); 1286 NULL);
1261 return nvme_trans_status_code(hdr, nvme_sc); 1287 return nvme_trans_status_code(hdr, nvme_sc);
1262} 1288}
@@ -1280,7 +1306,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
1280 u8 buffer_id) 1306 u8 buffer_id)
1281{ 1307{
1282 int nvme_sc; 1308 int nvme_sc;
1283 struct nvme_dev *dev = ns->dev;
1284 struct nvme_command c; 1309 struct nvme_command c;
1285 1310
1286 if (hdr->iovec_count > 0) { 1311 if (hdr->iovec_count > 0) {
@@ -1297,7 +1322,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
1297 c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); 1322 c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
1298 c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); 1323 c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
1299 1324
1300 nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 1325 nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c,
1301 hdr->dxferp, tot_len, NULL, 0); 1326 hdr->dxferp, tot_len, NULL, 0);
1302 return nvme_trans_status_code(hdr, nvme_sc); 1327 return nvme_trans_status_code(hdr, nvme_sc);
1303} 1328}
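The firmware download translation above converts byte quantities into the command's dword fields: NUMD is a zero-based count of dwords and the offset is expressed in dwords (BYTES_TO_DWORDS is 4 in this file). A tiny sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define BYTES_TO_DWORDS 4

struct fw_download {
    uint32_t numd;      /* number of dwords, zero-based */
    uint32_t offset;    /* offset in dwords */
};

static struct fw_download fw_fields(uint32_t tot_len, uint32_t byte_offset)
{
    struct fw_download d = {
        .numd   = tot_len / BYTES_TO_DWORDS - 1,
        .offset = byte_offset / BYTES_TO_DWORDS,
    };
    return d;
}

int main(void)
{
    struct fw_download d = fw_fields(4096, 8192);

    printf("numd=%u offset=%u\n", d.numd, d.offset);    /* 1023, 2048 */
    return 0;
}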
@@ -1364,14 +1389,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1364{ 1389{
1365 int res = 0; 1390 int res = 0;
1366 int nvme_sc; 1391 int nvme_sc;
1367 struct nvme_dev *dev = ns->dev;
1368 unsigned dword11; 1392 unsigned dword11;
1369 1393
1370 switch (page_code) { 1394 switch (page_code) {
1371 case MODE_PAGE_CACHING: 1395 case MODE_PAGE_CACHING:
1372 dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); 1396 dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0);
1373 nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, 1397 nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC,
1374 0, NULL); 1398 dword11, 0, NULL);
1375 res = nvme_trans_status_code(hdr, nvme_sc); 1399 res = nvme_trans_status_code(hdr, nvme_sc);
1376 break; 1400 break;
1377 case MODE_PAGE_CONTROL: 1401 case MODE_PAGE_CONTROL:
@@ -1473,7 +1497,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
1473{ 1497{
1474 int res = 0; 1498 int res = 0;
1475 int nvme_sc; 1499 int nvme_sc;
1476 struct nvme_dev *dev = ns->dev;
1477 u8 flbas; 1500 u8 flbas;
1478 1501
1479 /* 1502 /*
@@ -1486,7 +1509,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
1486 if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { 1509 if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
1487 struct nvme_id_ns *id_ns; 1510 struct nvme_id_ns *id_ns;
1488 1511
1489 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 1512 nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
1490 res = nvme_trans_status_code(hdr, nvme_sc); 1513 res = nvme_trans_status_code(hdr, nvme_sc);
1491 if (res) 1514 if (res)
1492 return res; 1515 return res;
@@ -1570,7 +1593,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1570{ 1593{
1571 int res; 1594 int res;
1572 int nvme_sc; 1595 int nvme_sc;
1573 struct nvme_dev *dev = ns->dev;
1574 struct nvme_id_ns *id_ns; 1596 struct nvme_id_ns *id_ns;
1575 u8 i; 1597 u8 i;
1576 u8 flbas, nlbaf; 1598 u8 flbas, nlbaf;
@@ -1579,7 +1601,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1579 struct nvme_command c; 1601 struct nvme_command c;
1580 1602
1581 /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ 1603 /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
1582 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 1604 nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
1583 res = nvme_trans_status_code(hdr, nvme_sc); 1605 res = nvme_trans_status_code(hdr, nvme_sc);
1584 if (res) 1606 if (res)
1585 return res; 1607 return res;
@@ -1611,7 +1633,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1611 c.format.nsid = cpu_to_le32(ns->ns_id); 1633 c.format.nsid = cpu_to_le32(ns->ns_id);
1612 c.format.cdw10 = cpu_to_le32(cdw10); 1634 c.format.cdw10 = cpu_to_le32(cdw10);
1613 1635
1614 nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 1636 nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0);
1615 res = nvme_trans_status_code(hdr, nvme_sc); 1637 res = nvme_trans_status_code(hdr, nvme_sc);
1616 1638
1617 kfree(id_ns); 1639 kfree(id_ns);
@@ -1704,7 +1726,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
1704 nvme_sc = NVME_SC_LBA_RANGE; 1726 nvme_sc = NVME_SC_LBA_RANGE;
1705 break; 1727 break;
1706 } 1728 }
1707 nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, 1729 nvme_sc = nvme_submit_user_cmd(ns->queue, &c,
1708 next_mapping_addr, unit_len, NULL, 0); 1730 next_mapping_addr, unit_len, NULL, 0);
1709 if (nvme_sc) 1731 if (nvme_sc)
1710 break; 1732 break;
@@ -2040,7 +2062,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2040 u32 alloc_len; 2062 u32 alloc_len;
2041 u32 resp_size; 2063 u32 resp_size;
2042 u32 xfer_len; 2064 u32 xfer_len;
2043 struct nvme_dev *dev = ns->dev;
2044 struct nvme_id_ns *id_ns; 2065 struct nvme_id_ns *id_ns;
2045 u8 *response; 2066 u8 *response;
2046 2067
@@ -2052,7 +2073,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2052 resp_size = READ_CAP_10_RESP_SIZE; 2073 resp_size = READ_CAP_10_RESP_SIZE;
2053 } 2074 }
2054 2075
2055 nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); 2076 nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
2056 res = nvme_trans_status_code(hdr, nvme_sc); 2077 res = nvme_trans_status_code(hdr, nvme_sc);
2057 if (res) 2078 if (res)
2058 return res; 2079 return res;
@@ -2080,7 +2101,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2080 int nvme_sc; 2101 int nvme_sc;
2081 u32 alloc_len, xfer_len, resp_size; 2102 u32 alloc_len, xfer_len, resp_size;
2082 u8 *response; 2103 u8 *response;
2083 struct nvme_dev *dev = ns->dev;
2084 struct nvme_id_ctrl *id_ctrl; 2104 struct nvme_id_ctrl *id_ctrl;
2085 u32 ll_length, lun_id; 2105 u32 ll_length, lun_id;
2086 u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; 2106 u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
@@ -2094,7 +2114,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2094 case ALL_LUNS_RETURNED: 2114 case ALL_LUNS_RETURNED:
2095 case ALL_WELL_KNOWN_LUNS_RETURNED: 2115 case ALL_WELL_KNOWN_LUNS_RETURNED:
2096 case RESTRICTED_LUNS_RETURNED: 2116 case RESTRICTED_LUNS_RETURNED:
2097 nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); 2117 nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl);
2098 res = nvme_trans_status_code(hdr, nvme_sc); 2118 res = nvme_trans_status_code(hdr, nvme_sc);
2099 if (res) 2119 if (res)
2100 return res; 2120 return res;
@@ -2295,9 +2315,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns,
2295 struct sg_io_hdr *hdr, 2315 struct sg_io_hdr *hdr,
2296 u8 *cmd) 2316 u8 *cmd)
2297{ 2317{
2298 struct nvme_dev *dev = ns->dev; 2318 if (nvme_ctrl_ready(ns->ctrl))
2299
2300 if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY))
2301 return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, 2319 return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
2302 NOT_READY, SCSI_ASC_LUN_NOT_READY, 2320 NOT_READY, SCSI_ASC_LUN_NOT_READY,
2303 SCSI_ASCQ_CAUSE_NOT_REPORTABLE); 2321 SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
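The old TEST UNIT READY code in this final hunk read the CSTS register and reported NOT READY when the RDY bit was clear; the replacement routes the check through the new nvme_ctrl_ready() helper instead of touching the BAR from scsi.c. A minimal sketch of the underlying bit test, assuming RDY is bit 0 of CSTS as defined by the NVMe spec:

#include <stdint.h>
#include <stdio.h>

#define CSTS_RDY (1u << 0)

static int ctrl_ready(uint32_t csts)
{
    return (csts & CSTS_RDY) != 0;
}

int main(void)
{
    printf("csts=0x0 ready? %d\n", ctrl_ready(0x0));
    printf("csts=0x1 ready? %d\n", ctrl_ready(0x1));
    return 0;
}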