Diffstat (limited to 'drivers/block')
-rw-r--r--	drivers/block/Kconfig	  11
-rw-r--r--	drivers/block/Makefile	   1
-rw-r--r--	drivers/block/nvme.c	1745
3 files changed, 1757 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a30aa103f95b..4e4c8a4a5fd3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -317,6 +317,17 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_NVME
+	tristate "NVM Express block device"
+	depends on PCI
+	---help---
+	  The NVM Express driver is for solid state drives directly
+	  connected to the PCI or PCI Express bus. If you know you
+	  don't have one of these, it is safe to answer N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called nvme.
+
 config BLK_DEV_OSD
 	tristate "OSD object-as-blkdev support"
 	depends on SCSI_OSD_ULD
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index ad7b74a44ef3..5b795059f8fb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
 obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
new file mode 100644
index 000000000000..f4996b0e4b1a
--- /dev/null
+++ b/drivers/block/nvme.c
@@ -0,0 +1,1745 @@
1/*
2 * NVM Express device driver
3 * Copyright (c) 2011, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19#include <linux/nvme.h>
20#include <linux/bio.h>
21#include <linux/bitops.h>
22#include <linux/blkdev.h>
23#include <linux/delay.h>
24#include <linux/errno.h>
25#include <linux/fs.h>
26#include <linux/genhd.h>
27#include <linux/idr.h>
28#include <linux/init.h>
29#include <linux/interrupt.h>
30#include <linux/io.h>
31#include <linux/kdev_t.h>
32#include <linux/kthread.h>
33#include <linux/kernel.h>
34#include <linux/mm.h>
35#include <linux/module.h>
36#include <linux/moduleparam.h>
37#include <linux/pci.h>
38#include <linux/poison.h>
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/types.h>
42#include <linux/version.h>
43
44#define NVME_Q_DEPTH 1024
45#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
46#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
47#define NVME_MINORS 64
48#define NVME_IO_TIMEOUT (5 * HZ)
49#define ADMIN_TIMEOUT (60 * HZ)
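/*
 * Illustrative arithmetic (editor's note, not part of the driver): with the
 * default NVME_Q_DEPTH of 1024, each submission queue occupies
 * 1024 * sizeof(struct nvme_command) = 1024 * 64 bytes = 64KiB and each
 * completion queue 1024 * 16-byte completion entries = 16KiB, both allocated
 * with dma_alloc_coherent() in nvme_alloc_queue() below.
 */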
50
51static int nvme_major;
52module_param(nvme_major, int, 0);
53
54static int use_threaded_interrupts;
55module_param(use_threaded_interrupts, int, 0);
56
57static DEFINE_SPINLOCK(dev_list_lock);
58static LIST_HEAD(dev_list);
59static struct task_struct *nvme_thread;
60
61/*
62 * Represents an NVM Express device. Each nvme_dev is a PCI function.
63 */
64struct nvme_dev {
65 struct list_head node;
66 struct nvme_queue **queues;
67 u32 __iomem *dbs;
68 struct pci_dev *pci_dev;
69 struct dma_pool *prp_page_pool;
70 struct dma_pool *prp_small_pool;
71 int instance;
72 int queue_count;
73 int db_stride;
74 u32 ctrl_config;
75 struct msix_entry *entry;
76 struct nvme_bar __iomem *bar;
77 struct list_head namespaces;
78 char serial[20];
79 char model[40];
80 char firmware_rev[8];
81};
82
83/*
84 * An NVM Express namespace is equivalent to a SCSI LUN
85 */
86struct nvme_ns {
87 struct list_head list;
88
89 struct nvme_dev *dev;
90 struct request_queue *queue;
91 struct gendisk *disk;
92
93 int ns_id;
94 int lba_shift;
95};
96
97/*
98 * An NVM Express queue. Each device has at least two (one for admin
99 * commands and one for I/O commands).
100 */
101struct nvme_queue {
102 struct device *q_dmadev;
103 struct nvme_dev *dev;
104 spinlock_t q_lock;
105 struct nvme_command *sq_cmds;
106 volatile struct nvme_completion *cqes;
107 dma_addr_t sq_dma_addr;
108 dma_addr_t cq_dma_addr;
109 wait_queue_head_t sq_full;
110 wait_queue_t sq_cong_wait;
111 struct bio_list sq_cong;
112 u32 __iomem *q_db;
113 u16 q_depth;
114 u16 cq_vector;
115 u16 sq_head;
116 u16 sq_tail;
117 u16 cq_head;
118 u16 cq_phase;
119 unsigned long cmdid_data[];
120};
121
122/*
123 * Check we didn't inadvertently grow the command struct
124 */
125static inline void _nvme_check_size(void)
126{
127 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
128 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
129 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
130 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
131 BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
132 BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
133 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
134 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
135 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
136}
137
138typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
139 struct nvme_completion *);
140
141struct nvme_cmd_info {
142 nvme_completion_fn fn;
143 void *ctx;
144 unsigned long timeout;
145};
146
147static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
148{
149 return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
150}
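/*
 * Layout sketch (editor's illustration): the flexible cmdid_data[] array at
 * the end of struct nvme_queue holds two things back to back: a bitmap of
 * q_depth command IDs followed by an array of q_depth struct nvme_cmd_info.
 * For q_depth == 1024 on a 64-bit machine, BITS_TO_LONGS(1024) == 16 longs
 * (128 bytes) of bitmap, then 1024 nvme_cmd_info entries; nvme_alloc_queue()
 * sizes this as extra = (depth / 8) + depth * sizeof(struct nvme_cmd_info).
 */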
151
152/**
153 * alloc_cmdid() - Allocate a Command ID
154 * @nvmeq: The queue that will be used for this command
155 * @ctx: A pointer that will be passed to the handler
156 * @handler: The function to call on completion
157 *
158 * Allocate a Command ID for a queue. The data passed in will
159 * be passed to the completion handler. This is implemented by using
160 * the bottom two bits of the ctx pointer to store the handler ID.
161 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
162 * We can change this if it becomes a problem.
163 *
164 * May be called with local interrupts disabled and the q_lock held,
165 * or with interrupts enabled and no locks held.
166 */
167static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
168 nvme_completion_fn handler, unsigned timeout)
169{
170 int depth = nvmeq->q_depth - 1;
171 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
172 int cmdid;
173
174 do {
175 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
176 if (cmdid >= depth)
177 return -EBUSY;
178 } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
179
180 info[cmdid].fn = handler;
181 info[cmdid].ctx = ctx;
182 info[cmdid].timeout = jiffies + timeout;
183 return cmdid;
184}
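/*
 * Typical lifecycle, shown for illustration only (it mirrors
 * nvme_submit_bio_queue() and nvme_process_cq() below):
 *
 *	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
 *	cmnd->rw.command_id = cmdid;
 *	... fill in the rest of the command and ring the SQ tail doorbell ...
 *
 *	// later, when the completion arrives:
 *	ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
 *	fn(nvmeq->dev, ctx, &cqe);	// e.g. bio_completion(dev, iod, &cqe)
 */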
185
186static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
187 nvme_completion_fn handler, unsigned timeout)
188{
189 int cmdid;
190 wait_event_killable(nvmeq->sq_full,
191 (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
192 return (cmdid < 0) ? -EINTR : cmdid;
193}
194
195/* Special values must be less than 0x1000 */
196#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
197#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
198#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
199#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
200#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
201
202static void special_completion(struct nvme_dev *dev, void *ctx,
203 struct nvme_completion *cqe)
204{
205 if (ctx == CMD_CTX_CANCELLED)
206 return;
207 if (ctx == CMD_CTX_FLUSH)
208 return;
209 if (ctx == CMD_CTX_COMPLETED) {
210 dev_warn(&dev->pci_dev->dev,
211 "completed id %d twice on queue %d\n",
212 cqe->command_id, le16_to_cpup(&cqe->sq_id));
213 return;
214 }
215 if (ctx == CMD_CTX_INVALID) {
216 dev_warn(&dev->pci_dev->dev,
217 "invalid id %d completed on queue %d\n",
218 cqe->command_id, le16_to_cpup(&cqe->sq_id));
219 return;
220 }
221
222 dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
223}
224
225/*
226 * Called with local interrupts disabled and the q_lock held. May not sleep.
227 */
228static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
229 nvme_completion_fn *fn)
230{
231 void *ctx;
232 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
233
234 if (cmdid >= nvmeq->q_depth) {
235 *fn = special_completion;
236 return CMD_CTX_INVALID;
237 }
238 *fn = info[cmdid].fn;
239 ctx = info[cmdid].ctx;
240 info[cmdid].fn = special_completion;
241 info[cmdid].ctx = CMD_CTX_COMPLETED;
242 clear_bit(cmdid, nvmeq->cmdid_data);
243 wake_up(&nvmeq->sq_full);
244 return ctx;
245}
246
247static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
248 nvme_completion_fn *fn)
249{
250 void *ctx;
251 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
252 if (fn)
253 *fn = info[cmdid].fn;
254 ctx = info[cmdid].ctx;
255 info[cmdid].fn = special_completion;
256 info[cmdid].ctx = CMD_CTX_CANCELLED;
257 return ctx;
258}
259
260static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
261{
262 return dev->queues[get_cpu() + 1];
263}
264
265static void put_nvmeq(struct nvme_queue *nvmeq)
266{
267 put_cpu();
268}
269
270/**
271 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
272 * @nvmeq: The queue to use
273 * @cmd: The command to send
274 *
275 * Safe to use from interrupt context
276 */
277static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
278{
279 unsigned long flags;
280 u16 tail;
281 spin_lock_irqsave(&nvmeq->q_lock, flags);
282 tail = nvmeq->sq_tail;
283 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
284 if (++tail == nvmeq->q_depth)
285 tail = 0;
286 writel(tail, nvmeq->q_db);
287 nvmeq->sq_tail = tail;
288 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
289
290 return 0;
291}
292
293/*
294 * The nvme_iod describes the data in an I/O, including the list of PRP
295 * entries. You can't see it in this data structure because C doesn't let
296 * me express that. Use nvme_alloc_iod to ensure there's enough space
297 * allocated to store the PRP list.
298 */
299struct nvme_iod {
300 void *private; /* For the use of the submitter of the I/O */
301 int npages; /* In the PRP list. 0 means small pool in use */
302 int offset; /* Of PRP list */
303 int nents; /* Used in scatterlist */
304 int length; /* Of data, in bytes */
305 dma_addr_t first_dma;
306 struct scatterlist sg[0];
307};
308
309static __le64 **iod_list(struct nvme_iod *iod)
310{
311 return ((void *)iod) + iod->offset;
312}
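/*
 * Memory layout of an iod as built by nvme_alloc_iod() (editor's sketch):
 *
 *	[ struct nvme_iod | sg[0..nseg-1] | __le64 *list[0..npages-1] ]
 *
 * iod->offset records offsetof(struct nvme_iod, sg[nseg]), so iod_list()
 * lands on the array of PRP-list page pointers that follows the scatterlist.
 */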
313
314/*
315 * Will slightly overestimate the number of pages needed. This is OK
316 * as it only leads to a small amount of wasted memory for the lifetime of
317 * the I/O.
318 */
319static int nvme_npages(unsigned size)
320{
321 unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
322 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
323}
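/*
 * Worked example (illustrative): with 4KiB pages, a 2MiB request needs
 * nprps = DIV_ROUND_UP(2MiB + 4KiB, 4KiB) = 513 PRP entries; each PRP-list
 * page holds PAGE_SIZE / 8 - 1 = 511 entries plus a chain pointer, so
 * DIV_ROUND_UP(8 * 513, 4096 - 8) = 2 pages are reserved.  A transfer that
 * happens to be page-aligned would fit in slightly less, hence "overestimate".
 */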
324
325static struct nvme_iod *
326nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
327{
328 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
329 sizeof(__le64 *) * nvme_npages(nbytes) +
330 sizeof(struct scatterlist) * nseg, gfp);
331
332 if (iod) {
333 iod->offset = offsetof(struct nvme_iod, sg[nseg]);
334 iod->npages = -1;
335 iod->length = nbytes;
336 }
337
338 return iod;
339}
340
341static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
342{
343 const int last_prp = PAGE_SIZE / 8 - 1;
344 int i;
345 __le64 **list = iod_list(iod);
346 dma_addr_t prp_dma = iod->first_dma;
347
348 if (iod->npages == 0)
349 dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
350 for (i = 0; i < iod->npages; i++) {
351 __le64 *prp_list = list[i];
352 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
353 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
354 prp_dma = next_prp_dma;
355 }
356 kfree(iod);
357}
358
359static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
360{
361 struct nvme_queue *nvmeq = get_nvmeq(dev);
362 if (bio_list_empty(&nvmeq->sq_cong))
363 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
364 bio_list_add(&nvmeq->sq_cong, bio);
365 put_nvmeq(nvmeq);
366 wake_up_process(nvme_thread);
367}
368
369static void bio_completion(struct nvme_dev *dev, void *ctx,
370 struct nvme_completion *cqe)
371{
372 struct nvme_iod *iod = ctx;
373 struct bio *bio = iod->private;
374 u16 status = le16_to_cpup(&cqe->status) >> 1;
375
376 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
377 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
378 nvme_free_iod(dev, iod);
379 if (status) {
380 bio_endio(bio, -EIO);
381 } else if (bio->bi_vcnt > bio->bi_idx) {
382 requeue_bio(dev, bio);
383 } else {
384 bio_endio(bio, 0);
385 }
386}
387
389/* length is in bytes. The gfp flags indicate whether we may sleep. */
389static int nvme_setup_prps(struct nvme_dev *dev,
390 struct nvme_common_command *cmd, struct nvme_iod *iod,
391 int total_len, gfp_t gfp)
392{
393 struct dma_pool *pool;
394 int length = total_len;
395 struct scatterlist *sg = iod->sg;
396 int dma_len = sg_dma_len(sg);
397 u64 dma_addr = sg_dma_address(sg);
398 int offset = offset_in_page(dma_addr);
399 __le64 *prp_list;
400 __le64 **list = iod_list(iod);
401 dma_addr_t prp_dma;
402 int nprps, i;
403
404 cmd->prp1 = cpu_to_le64(dma_addr);
405 length -= (PAGE_SIZE - offset);
406 if (length <= 0)
407 return total_len;
408
409 dma_len -= (PAGE_SIZE - offset);
410 if (dma_len) {
411 dma_addr += (PAGE_SIZE - offset);
412 } else {
413 sg = sg_next(sg);
414 dma_addr = sg_dma_address(sg);
415 dma_len = sg_dma_len(sg);
416 }
417
418 if (length <= PAGE_SIZE) {
419 cmd->prp2 = cpu_to_le64(dma_addr);
420 return total_len;
421 }
422
423 nprps = DIV_ROUND_UP(length, PAGE_SIZE);
424 if (nprps <= (256 / 8)) {
425 pool = dev->prp_small_pool;
426 iod->npages = 0;
427 } else {
428 pool = dev->prp_page_pool;
429 iod->npages = 1;
430 }
431
432 prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
433 if (!prp_list) {
434 cmd->prp2 = cpu_to_le64(dma_addr);
435 iod->npages = -1;
436 return (total_len - length) + PAGE_SIZE;
437 }
438 list[0] = prp_list;
439 iod->first_dma = prp_dma;
440 cmd->prp2 = cpu_to_le64(prp_dma);
441 i = 0;
442 for (;;) {
443 if (i == PAGE_SIZE / 8) {
444 __le64 *old_prp_list = prp_list;
445 prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
446 if (!prp_list)
447 return total_len - length;
448 list[iod->npages++] = prp_list;
449 prp_list[0] = old_prp_list[i - 1];
450 old_prp_list[i - 1] = cpu_to_le64(prp_dma);
451 i = 1;
452 }
453 prp_list[i++] = cpu_to_le64(dma_addr);
454 dma_len -= PAGE_SIZE;
455 dma_addr += PAGE_SIZE;
456 length -= PAGE_SIZE;
457 if (length <= 0)
458 break;
459 if (dma_len > 0)
460 continue;
461 BUG_ON(dma_len < 0);
462 sg = sg_next(sg);
463 dma_addr = sg_dma_address(sg);
464 dma_len = sg_dma_len(sg);
465 }
466
467 return total_len;
468}
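/*
 * Worked example (editor's illustration, 4KiB pages): a 16KiB transfer whose
 * first segment starts 512 bytes into a page touches five pages.  prp1 gets
 * the (unaligned) start address; the remaining 12800 bytes need
 * nprps = DIV_ROUND_UP(12800, 4096) = 4 entries, so a 256-byte list from
 * prp_small_pool is used (npages == 0), prp2 points at that list, and the
 * list holds the four page-aligned addresses of the rest of the data.
 * Only when a list page fills up does the code above chain to a new page,
 * rewriting the last slot of the old page to point at the new one.
 */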
469
470/* NVMe scatterlists require no holes in the virtual address */
471#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
472 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
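/*
 * Illustration (editor's note): PRPs can only describe data that is virtually
 * contiguous in page-sized chunks, so a new scatterlist entry may follow the
 * previous one only if the previous bvec ends exactly on a page boundary and
 * the new one starts at offset 0.  When that fails, nvme_map_bio() below
 * stops early and the remainder of the bio is resubmitted by bio_completion().
 */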
473
474static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
475 struct bio *bio, enum dma_data_direction dma_dir, int psegs)
476{
477 struct bio_vec *bvec, *bvprv = NULL;
478 struct scatterlist *sg = NULL;
479 int i, old_idx, length = 0, nsegs = 0;
480
481 sg_init_table(iod->sg, psegs);
482 old_idx = bio->bi_idx;
483 bio_for_each_segment(bvec, bio, i) {
484 if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
485 sg->length += bvec->bv_len;
486 } else {
487 if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
488 break;
489 sg = sg ? sg + 1 : iod->sg;
490 sg_set_page(sg, bvec->bv_page, bvec->bv_len,
491 bvec->bv_offset);
492 nsegs++;
493 }
494 length += bvec->bv_len;
495 bvprv = bvec;
496 }
497 bio->bi_idx = i;
498 iod->nents = nsegs;
499 sg_mark_end(sg);
500 if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
501 bio->bi_idx = old_idx;
502 return -ENOMEM;
503 }
504 return length;
505}
506
507static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
508 int cmdid)
509{
510 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
511
512 memset(cmnd, 0, sizeof(*cmnd));
513 cmnd->common.opcode = nvme_cmd_flush;
514 cmnd->common.command_id = cmdid;
515 cmnd->common.nsid = cpu_to_le32(ns->ns_id);
516
517 if (++nvmeq->sq_tail == nvmeq->q_depth)
518 nvmeq->sq_tail = 0;
519 writel(nvmeq->sq_tail, nvmeq->q_db);
520
521 return 0;
522}
523
524static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
525{
526 int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
527 special_completion, NVME_IO_TIMEOUT);
528 if (unlikely(cmdid < 0))
529 return cmdid;
530
531 return nvme_submit_flush(nvmeq, ns, cmdid);
532}
533
534/*
535 * Called with local interrupts disabled and the q_lock held. May not sleep.
536 */
537static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
538 struct bio *bio)
539{
540 struct nvme_command *cmnd;
541 struct nvme_iod *iod;
542 enum dma_data_direction dma_dir;
543 int cmdid, length, result = -ENOMEM;
544 u16 control;
545 u32 dsmgmt;
546 int psegs = bio_phys_segments(ns->queue, bio);
547
548 if ((bio->bi_rw & REQ_FLUSH) && psegs) {
549 result = nvme_submit_flush_data(nvmeq, ns);
550 if (result)
551 return result;
552 }
553
554 iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
555 if (!iod)
556 goto nomem;
557 iod->private = bio;
558
559 result = -EBUSY;
560 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
561 if (unlikely(cmdid < 0))
562 goto free_iod;
563
564 if ((bio->bi_rw & REQ_FLUSH) && !psegs)
565 return nvme_submit_flush(nvmeq, ns, cmdid);
566
567 control = 0;
568 if (bio->bi_rw & REQ_FUA)
569 control |= NVME_RW_FUA;
570 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
571 control |= NVME_RW_LR;
572
573 dsmgmt = 0;
574 if (bio->bi_rw & REQ_RAHEAD)
575 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
576
577 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
578
579 memset(cmnd, 0, sizeof(*cmnd));
580 if (bio_data_dir(bio)) {
581 cmnd->rw.opcode = nvme_cmd_write;
582 dma_dir = DMA_TO_DEVICE;
583 } else {
584 cmnd->rw.opcode = nvme_cmd_read;
585 dma_dir = DMA_FROM_DEVICE;
586 }
587
588 result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
589 if (result < 0)
590 goto free_iod;
591 length = result;
592
593 cmnd->rw.command_id = cmdid;
594 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
595 length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
596 GFP_ATOMIC);
597 cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
598 cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
599 cmnd->rw.control = cpu_to_le16(control);
600 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
601
602 bio->bi_sector += length >> 9;
603
604 if (++nvmeq->sq_tail == nvmeq->q_depth)
605 nvmeq->sq_tail = 0;
606 writel(nvmeq->sq_tail, nvmeq->q_db);
607
608 return 0;
609
610 free_iod:
611 nvme_free_iod(nvmeq->dev, iod);
612 nomem:
613 return result;
614}
615
616/*
617 * NB: return value of non-zero would mean that we were a stacking driver.
618 * make_request must always succeed.
619 */
620static int nvme_make_request(struct request_queue *q, struct bio *bio)
621{
622 struct nvme_ns *ns = q->queuedata;
623 struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
624 int result = -EBUSY;
625
626 spin_lock_irq(&nvmeq->q_lock);
627 if (bio_list_empty(&nvmeq->sq_cong))
628 result = nvme_submit_bio_queue(nvmeq, ns, bio);
629 if (unlikely(result)) {
630 if (bio_list_empty(&nvmeq->sq_cong))
631 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
632 bio_list_add(&nvmeq->sq_cong, bio);
633 }
634
635 spin_unlock_irq(&nvmeq->q_lock);
636 put_nvmeq(nvmeq);
637
638 return 0;
639}
640
641static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
642{
643 u16 head, phase;
644
645 head = nvmeq->cq_head;
646 phase = nvmeq->cq_phase;
647
648 for (;;) {
649 void *ctx;
650 nvme_completion_fn fn;
651 struct nvme_completion cqe = nvmeq->cqes[head];
652 if ((le16_to_cpu(cqe.status) & 1) != phase)
653 break;
654 nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
655 if (++head == nvmeq->q_depth) {
656 head = 0;
657 phase = !phase;
658 }
659
660 ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
661 fn(nvmeq->dev, ctx, &cqe);
662 }
663
664 /* If the controller ignores the cq head doorbell and continuously
665 * writes to the queue, it is theoretically possible to wrap around
666 * the queue twice and mistakenly return IRQ_NONE. Linux only
667 * requires that 0.1% of your interrupts are handled, so this isn't
668 * a big problem.
669 */
670 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
671 return IRQ_NONE;
672
673 writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
674 nvmeq->cq_head = head;
675 nvmeq->cq_phase = phase;
676
677 return IRQ_HANDLED;
678}
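/*
 * Phase-bit walkthrough (illustrative): cq_phase starts at 1, so a completion
 * entry is "new" while bit 0 of its status field equals 1.  Once head wraps
 * past q_depth the phase flips to 0, and stale entries from the previous pass
 * (which still carry phase 1) no longer match, so they are not re-consumed.
 * The new head is then written to the CQ head doorbell, which sits
 * (4 << db_stride) bytes after this queue's SQ tail doorbell, hence
 * q_db + (1 << db_stride) in units of u32.
 */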
679
680static irqreturn_t nvme_irq(int irq, void *data)
681{
682 irqreturn_t result;
683 struct nvme_queue *nvmeq = data;
684 spin_lock(&nvmeq->q_lock);
685 result = nvme_process_cq(nvmeq);
686 spin_unlock(&nvmeq->q_lock);
687 return result;
688}
689
690static irqreturn_t nvme_irq_check(int irq, void *data)
691{
692 struct nvme_queue *nvmeq = data;
693 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
694 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
695 return IRQ_NONE;
696 return IRQ_WAKE_THREAD;
697}
698
699static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
700{
701 spin_lock_irq(&nvmeq->q_lock);
702 cancel_cmdid(nvmeq, cmdid, NULL);
703 spin_unlock_irq(&nvmeq->q_lock);
704}
705
706struct sync_cmd_info {
707 struct task_struct *task;
708 u32 result;
709 int status;
710};
711
712static void sync_completion(struct nvme_dev *dev, void *ctx,
713 struct nvme_completion *cqe)
714{
715 struct sync_cmd_info *cmdinfo = ctx;
716 cmdinfo->result = le32_to_cpup(&cqe->result);
717 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
718 wake_up_process(cmdinfo->task);
719}
720
721/*
722 * Returns 0 on success. If the result is negative, it's a Linux error code;
723 * if the result is positive, it's an NVM Express status code
724 */
725static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
726 struct nvme_command *cmd, u32 *result, unsigned timeout)
727{
728 int cmdid;
729 struct sync_cmd_info cmdinfo;
730
731 cmdinfo.task = current;
732 cmdinfo.status = -EINTR;
733
734 cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
735 timeout);
736 if (cmdid < 0)
737 return cmdid;
738 cmd->common.command_id = cmdid;
739
740 set_current_state(TASK_KILLABLE);
741 nvme_submit_cmd(nvmeq, cmd);
742 schedule();
743
744 if (cmdinfo.status == -EINTR) {
745 nvme_abort_command(nvmeq, cmdid);
746 return -EINTR;
747 }
748
749 if (result)
750 *result = cmdinfo.result;
751
752 return cmdinfo.status;
753}
754
755static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
756 u32 *result)
757{
758 return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
759}
760
761static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
762{
763 int status;
764 struct nvme_command c;
765
766 memset(&c, 0, sizeof(c));
767 c.delete_queue.opcode = opcode;
768 c.delete_queue.qid = cpu_to_le16(id);
769
770 status = nvme_submit_admin_cmd(dev, &c, NULL);
771 if (status)
772 return -EIO;
773 return 0;
774}
775
776static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
777 struct nvme_queue *nvmeq)
778{
779 int status;
780 struct nvme_command c;
781 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
782
783 memset(&c, 0, sizeof(c));
784 c.create_cq.opcode = nvme_admin_create_cq;
785 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
786 c.create_cq.cqid = cpu_to_le16(qid);
787 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
788 c.create_cq.cq_flags = cpu_to_le16(flags);
789 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
790
791 status = nvme_submit_admin_cmd(dev, &c, NULL);
792 if (status)
793 return -EIO;
794 return 0;
795}
796
797static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
798 struct nvme_queue *nvmeq)
799{
800 int status;
801 struct nvme_command c;
802 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
803
804 memset(&c, 0, sizeof(c));
805 c.create_sq.opcode = nvme_admin_create_sq;
806 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
807 c.create_sq.sqid = cpu_to_le16(qid);
808 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
809 c.create_sq.sq_flags = cpu_to_le16(flags);
810 c.create_sq.cqid = cpu_to_le16(qid);
811
812 status = nvme_submit_admin_cmd(dev, &c, NULL);
813 if (status)
814 return -EIO;
815 return 0;
816}
817
818static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
819{
820 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
821}
822
823static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
824{
825 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
826}
827
828static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
829 dma_addr_t dma_addr)
830{
831 struct nvme_command c;
832
833 memset(&c, 0, sizeof(c));
834 c.identify.opcode = nvme_admin_identify;
835 c.identify.nsid = cpu_to_le32(nsid);
836 c.identify.prp1 = cpu_to_le64(dma_addr);
837 c.identify.cns = cpu_to_le32(cns);
838
839 return nvme_submit_admin_cmd(dev, &c, NULL);
840}
841
842static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
843 unsigned dword11, dma_addr_t dma_addr)
844{
845 struct nvme_command c;
846
847 memset(&c, 0, sizeof(c));
848 c.features.opcode = nvme_admin_get_features;
849 c.features.prp1 = cpu_to_le64(dma_addr);
850 c.features.fid = cpu_to_le32(fid);
851 c.features.dword11 = cpu_to_le32(dword11);
852
853 return nvme_submit_admin_cmd(dev, &c, NULL);
854}
855
856static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
857 unsigned dword11, dma_addr_t dma_addr, u32 *result)
858{
859 struct nvme_command c;
860
861 memset(&c, 0, sizeof(c));
862 c.features.opcode = nvme_admin_set_features;
863 c.features.prp1 = cpu_to_le64(dma_addr);
864 c.features.fid = cpu_to_le32(fid);
865 c.features.dword11 = cpu_to_le32(dword11);
866
867 return nvme_submit_admin_cmd(dev, &c, result);
868}
869
870static void nvme_free_queue(struct nvme_dev *dev, int qid)
871{
872 struct nvme_queue *nvmeq = dev->queues[qid];
873 int vector = dev->entry[nvmeq->cq_vector].vector;
874
875 irq_set_affinity_hint(vector, NULL);
876 free_irq(vector, nvmeq);
877
878 /* Don't tell the adapter to delete the admin queue */
879 if (qid) {
880 adapter_delete_sq(dev, qid);
881 adapter_delete_cq(dev, qid);
882 }
883
884 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
885 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
886 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
887 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
888 kfree(nvmeq);
889}
890
891static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
892 int depth, int vector)
893{
894 struct device *dmadev = &dev->pci_dev->dev;
895 unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
896 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
897 if (!nvmeq)
898 return NULL;
899
900 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
901 &nvmeq->cq_dma_addr, GFP_KERNEL);
902 if (!nvmeq->cqes)
903 goto free_nvmeq;
904 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
905
906 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
907 &nvmeq->sq_dma_addr, GFP_KERNEL);
908 if (!nvmeq->sq_cmds)
909 goto free_cqdma;
910
911 nvmeq->q_dmadev = dmadev;
912 nvmeq->dev = dev;
913 spin_lock_init(&nvmeq->q_lock);
914 nvmeq->cq_head = 0;
915 nvmeq->cq_phase = 1;
916 init_waitqueue_head(&nvmeq->sq_full);
917 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
918 bio_list_init(&nvmeq->sq_cong);
919 nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
920 nvmeq->q_depth = depth;
921 nvmeq->cq_vector = vector;
922
923 return nvmeq;
924
925 free_cqdma:
926 dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
927 nvmeq->cq_dma_addr);
928 free_nvmeq:
929 kfree(nvmeq);
930 return NULL;
931}
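/*
 * Doorbell arithmetic (editor's illustration): dev->dbs points at the
 * doorbell region, 4096 bytes into the BAR.  With a controller stride of 0
 * (db_stride == 0), queue qid's SQ tail doorbell is dbs[2 * qid] and its CQ
 * head doorbell is dbs[2 * qid + 1]; a non-zero stride spaces consecutive
 * doorbells (4 << db_stride) bytes apart, which is what the
 * qid << (dev->db_stride + 1) index above computes.
 */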
932
933static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
934 const char *name)
935{
936 if (use_threaded_interrupts)
937 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
938 nvme_irq_check, nvme_irq,
939 IRQF_DISABLED | IRQF_SHARED,
940 name, nvmeq);
941 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
942 IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
943}
944
945static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
946 int qid, int cq_size, int vector)
947{
948 int result;
949 struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
950
951 if (!nvmeq)
952 return ERR_PTR(-ENOMEM);
953
954 result = adapter_alloc_cq(dev, qid, nvmeq);
955 if (result < 0)
956 goto free_nvmeq;
957
958 result = adapter_alloc_sq(dev, qid, nvmeq);
959 if (result < 0)
960 goto release_cq;
961
962 result = queue_request_irq(dev, nvmeq, "nvme");
963 if (result < 0)
964 goto release_sq;
965
966 return nvmeq;
967
968 release_sq:
969 adapter_delete_sq(dev, qid);
970 release_cq:
971 adapter_delete_cq(dev, qid);
972 free_nvmeq:
973 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
974 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
975 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
976 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
977 kfree(nvmeq);
978 return ERR_PTR(result);
979}
980
981static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
982{
983 int result;
984 u32 aqa;
985 u64 cap;
986 unsigned long timeout;
987 struct nvme_queue *nvmeq;
988
989 dev->dbs = ((void __iomem *)dev->bar) + 4096;
990
991 nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
992 if (!nvmeq)
993 return -ENOMEM;
994
995 aqa = nvmeq->q_depth - 1;
996 aqa |= aqa << 16;
997
998 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
999 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
1000 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1001 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1002
1003 writel(0, &dev->bar->cc);
1004 writel(aqa, &dev->bar->aqa);
1005 writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1006 writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1007 writel(dev->ctrl_config, &dev->bar->cc);
1008
1009 cap = readq(&dev->bar->cap);
1010 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1011 dev->db_stride = NVME_CAP_STRIDE(cap);
1012
1013 while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
1014 msleep(100);
1015 if (fatal_signal_pending(current))
1016 return -EINTR;
1017 if (time_after(jiffies, timeout)) {
1018 dev_err(&dev->pci_dev->dev,
1019 "Device not ready; aborting initialisation\n");
1020 return -ENODEV;
1021 }
1022 }
1023
1024 result = queue_request_irq(dev, nvmeq, "nvme admin");
1025 dev->queues[0] = nvmeq;
1026 return result;
1027}
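/*
 * Register walkthrough (illustrative): the admin queue is created with a
 * depth of 64, so aqa = 63 | (63 << 16) programs both the submission and
 * completion queue sizes (they are zero-based in the AQA register).  CC is
 * then written with the enable bit, the NVM command set, the host page size
 * (PAGE_SHIFT - 12, i.e. 0 for 4KiB pages) and the I/O queue entry sizes,
 * after which the loop above polls CSTS.RDY until the controller reports
 * ready or CAP.TO ((TO + 1) * 500ms) expires.
 */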
1028
1029static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1030 unsigned long addr, unsigned length)
1031{
1032 int i, err, count, nents, offset;
1033 struct scatterlist *sg;
1034 struct page **pages;
1035 struct nvme_iod *iod;
1036
1037 if (addr & 3)
1038 return ERR_PTR(-EINVAL);
1039 if (!length)
1040 return ERR_PTR(-EINVAL);
1041
1042 offset = offset_in_page(addr);
1043 count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
1044 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
1045
1046 err = get_user_pages_fast(addr, count, 1, pages);
1047 if (err < count) {
1048 count = err;
1049 err = -EFAULT;
1050 goto put_pages;
1051 }
1052
1053 iod = nvme_alloc_iod(count, length, GFP_KERNEL);
1054 sg = iod->sg;
1055 sg_init_table(sg, count);
1056 for (i = 0; i < count; i++) {
1057 sg_set_page(&sg[i], pages[i],
1058 min_t(int, length, PAGE_SIZE - offset), offset);
1059 length -= (PAGE_SIZE - offset);
1060 offset = 0;
1061 }
1062 sg_mark_end(&sg[i - 1]);
1063 iod->nents = count;
1064
1065 err = -ENOMEM;
1066 nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
1067 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1068 if (!nents)
1069 goto free_iod;
1070
1071 kfree(pages);
1072 return iod;
1073
1074 free_iod:
1075 kfree(iod);
1076 put_pages:
1077 for (i = 0; i < count; i++)
1078 put_page(pages[i]);
1079 kfree(pages);
1080 return ERR_PTR(err);
1081}
1082
1083static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
1084 struct nvme_iod *iod)
1085{
1086 int i;
1087
1088 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
1089 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1090
1091 for (i = 0; i < iod->nents; i++)
1092 put_page(sg_page(&iod->sg[i]));
1093}
1094
1095static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1096{
1097 struct nvme_dev *dev = ns->dev;
1098 struct nvme_queue *nvmeq;
1099 struct nvme_user_io io;
1100 struct nvme_command c;
1101 unsigned length;
1102 int status;
1103 struct nvme_iod *iod;
1104
1105 if (copy_from_user(&io, uio, sizeof(io)))
1106 return -EFAULT;
1107 length = (io.nblocks + 1) << ns->lba_shift;
1108
1109 switch (io.opcode) {
1110 case nvme_cmd_write:
1111 case nvme_cmd_read:
1112 case nvme_cmd_compare:
1113 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
1114 break;
1115 default:
1116 return -EINVAL;
1117 }
1118
1119 if (IS_ERR(iod))
1120 return PTR_ERR(iod);
1121
1122 memset(&c, 0, sizeof(c));
1123 c.rw.opcode = io.opcode;
1124 c.rw.flags = io.flags;
1125 c.rw.nsid = cpu_to_le32(ns->ns_id);
1126 c.rw.slba = cpu_to_le64(io.slba);
1127 c.rw.length = cpu_to_le16(io.nblocks);
1128 c.rw.control = cpu_to_le16(io.control);
1129 c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
1130 c.rw.reftag = io.reftag;
1131 c.rw.apptag = io.apptag;
1132 c.rw.appmask = io.appmask;
1133 /* XXX: metadata */
1134 length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
1135
1136 nvmeq = get_nvmeq(dev);
1137 /*
1138 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
1139 * disabled. We may be preempted at any point, and be rescheduled
1140 * to a different CPU. That will cause cacheline bouncing, but no
1141 * additional races since q_lock already protects against other CPUs.
1142 */
1143 put_nvmeq(nvmeq);
1144 if (length != (io.nblocks + 1) << ns->lba_shift)
1145 status = -ENOMEM;
1146 else
1147 status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
1148
1149 nvme_unmap_user_pages(dev, io.opcode & 1, iod);
1150 nvme_free_iod(dev, iod);
1151 return status;
1152}
1153
1154static int nvme_user_admin_cmd(struct nvme_ns *ns,
1155 struct nvme_admin_cmd __user *ucmd)
1156{
1157 struct nvme_dev *dev = ns->dev;
1158 struct nvme_admin_cmd cmd;
1159 struct nvme_command c;
1160 int status, length;
1161 struct nvme_iod *iod;
1162
1163 if (!capable(CAP_SYS_ADMIN))
1164 return -EACCES;
1165 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1166 return -EFAULT;
1167
1168 memset(&c, 0, sizeof(c));
1169 c.common.opcode = cmd.opcode;
1170 c.common.flags = cmd.flags;
1171 c.common.nsid = cpu_to_le32(cmd.nsid);
1172 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1173 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1174 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1175 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1176 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1177 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1178 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1179 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1180
1181 length = cmd.data_len;
1182 if (cmd.data_len) {
1183 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
1184 length);
1185 if (IS_ERR(iod))
1186 return PTR_ERR(iod);
1187 length = nvme_setup_prps(dev, &c.common, iod, length,
1188 GFP_KERNEL);
1189 }
1190
1191 if (length != cmd.data_len)
1192 status = -ENOMEM;
1193 else
1194 status = nvme_submit_admin_cmd(dev, &c, NULL);
1195
1196 if (cmd.data_len) {
1197 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
1198 nvme_free_iod(dev, iod);
1199 }
1200 return status;
1201}
1202
1203static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1204 unsigned long arg)
1205{
1206 struct nvme_ns *ns = bdev->bd_disk->private_data;
1207
1208 switch (cmd) {
1209 case NVME_IOCTL_ID:
1210 return ns->ns_id;
1211 case NVME_IOCTL_ADMIN_CMD:
1212 return nvme_user_admin_cmd(ns, (void __user *)arg);
1213 case NVME_IOCTL_SUBMIT_IO:
1214 return nvme_submit_io(ns, (void __user *)arg);
1215 default:
1216 return -ENOTTY;
1217 }
1218}
1219
1220static const struct block_device_operations nvme_fops = {
1221 .owner = THIS_MODULE,
1222 .ioctl = nvme_ioctl,
1223 .compat_ioctl = nvme_ioctl,
1224};
1225
1226static void nvme_timeout_ios(struct nvme_queue *nvmeq)
1227{
1228 int depth = nvmeq->q_depth - 1;
1229 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
1230 unsigned long now = jiffies;
1231 int cmdid;
1232
1233 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
1234 void *ctx;
1235 nvme_completion_fn fn;
1236 static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
1237
1238 if (!time_after(now, info[cmdid].timeout))
1239 continue;
1240 dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
1241 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
1242 fn(nvmeq->dev, ctx, &cqe);
1243 }
1244}
1245
1246static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1247{
1248 while (bio_list_peek(&nvmeq->sq_cong)) {
1249 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1250 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
1251 if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
1252 bio_list_add_head(&nvmeq->sq_cong, bio);
1253 break;
1254 }
1255 if (bio_list_empty(&nvmeq->sq_cong))
1256 remove_wait_queue(&nvmeq->sq_full,
1257 &nvmeq->sq_cong_wait);
1258 }
1259}
1260
1261static int nvme_kthread(void *data)
1262{
1263 struct nvme_dev *dev;
1264
1265 while (!kthread_should_stop()) {
1266 __set_current_state(TASK_RUNNING);
1267 spin_lock(&dev_list_lock);
1268 list_for_each_entry(dev, &dev_list, node) {
1269 int i;
1270 for (i = 0; i < dev->queue_count; i++) {
1271 struct nvme_queue *nvmeq = dev->queues[i];
1272 if (!nvmeq)
1273 continue;
1274 spin_lock_irq(&nvmeq->q_lock);
1275 if (nvme_process_cq(nvmeq))
1276 printk("process_cq did something\n");
1277 nvme_timeout_ios(nvmeq);
1278 nvme_resubmit_bios(nvmeq);
1279 spin_unlock_irq(&nvmeq->q_lock);
1280 }
1281 }
1282 spin_unlock(&dev_list_lock);
1283 set_current_state(TASK_INTERRUPTIBLE);
1284 schedule_timeout(HZ);
1285 }
1286 return 0;
1287}
1288
1289static DEFINE_IDA(nvme_index_ida);
1290
1291static int nvme_get_ns_idx(void)
1292{
1293 int index, error;
1294
1295 do {
1296 if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
1297 return -1;
1298
1299 spin_lock(&dev_list_lock);
1300 error = ida_get_new(&nvme_index_ida, &index);
1301 spin_unlock(&dev_list_lock);
1302 } while (error == -EAGAIN);
1303
1304 if (error)
1305 index = -1;
1306 return index;
1307}
1308
1309static void nvme_put_ns_idx(int index)
1310{
1311 spin_lock(&dev_list_lock);
1312 ida_remove(&nvme_index_ida, index);
1313 spin_unlock(&dev_list_lock);
1314}
1315
1316static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
1317 struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
1318{
1319 struct nvme_ns *ns;
1320 struct gendisk *disk;
1321 int lbaf;
1322
1323 if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
1324 return NULL;
1325
1326 ns = kzalloc(sizeof(*ns), GFP_KERNEL);
1327 if (!ns)
1328 return NULL;
1329 ns->queue = blk_alloc_queue(GFP_KERNEL);
1330 if (!ns->queue)
1331 goto out_free_ns;
1332 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
1333 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1334 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1335/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */
1336 blk_queue_make_request(ns->queue, nvme_make_request);
1337 ns->dev = dev;
1338 ns->queue->queuedata = ns;
1339
1340 disk = alloc_disk(NVME_MINORS);
1341 if (!disk)
1342 goto out_free_queue;
1343 ns->ns_id = nsid;
1344 ns->disk = disk;
1345 lbaf = id->flbas & 0xf;
1346 ns->lba_shift = id->lbaf[lbaf].ds;
1347
1348 disk->major = nvme_major;
1349 disk->minors = NVME_MINORS;
1350 disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
1351 disk->fops = &nvme_fops;
1352 disk->private_data = ns;
1353 disk->queue = ns->queue;
1354 disk->driverfs_dev = &dev->pci_dev->dev;
1355 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
1356 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1357
1358 return ns;
1359
1360 out_free_queue:
1361 blk_cleanup_queue(ns->queue);
1362 out_free_ns:
1363 kfree(ns);
1364 return NULL;
1365}
1366
1367static void nvme_ns_free(struct nvme_ns *ns)
1368{
1369 int index = ns->disk->first_minor / NVME_MINORS;
1370 put_disk(ns->disk);
1371 nvme_put_ns_idx(index);
1372 blk_cleanup_queue(ns->queue);
1373 kfree(ns);
1374}
1375
1376static int set_queue_count(struct nvme_dev *dev, int count)
1377{
1378 int status;
1379 u32 result;
1380 u32 q_count = (count - 1) | ((count - 1) << 16);
1381
1382 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
1383 &result);
1384 if (status)
1385 return -EIO;
1386 return min(result & 0xffff, result >> 16) + 1;
1387}
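/*
 * Encoding example (illustrative): the Number of Queues feature takes
 * zero-based counts, so asking for 4 queues sends q_count = 3 | (3 << 16)
 * (submission queues requested in the low word, completion queues in the
 * high word).  The controller reports what it actually allocated the same
 * way; e.g. a result of 0x0007000f means 16 SQs and 8 CQs were granted,
 * and min(0xf, 0x7) + 1 = 8 usable I/O queue pairs.
 */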
1388
1389static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
1390{
1391 int result, cpu, i, nr_io_queues, db_bar_size;
1392
1393 nr_io_queues = num_online_cpus();
1394 result = set_queue_count(dev, nr_io_queues);
1395 if (result < 0)
1396 return result;
1397 if (result < nr_io_queues)
1398 nr_io_queues = result;
1399
1400 /* Deregister the admin queue's interrupt */
1401 free_irq(dev->entry[0].vector, dev->queues[0]);
1402
1403 db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
1404 if (db_bar_size > 8192) {
1405 iounmap(dev->bar);
1406 dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
1407 db_bar_size);
1408 dev->dbs = ((void __iomem *)dev->bar) + 4096;
1409 dev->queues[0]->q_db = dev->dbs;
1410 }
1411
1412 for (i = 0; i < nr_io_queues; i++)
1413 dev->entry[i].entry = i;
1414 for (;;) {
1415 result = pci_enable_msix(dev->pci_dev, dev->entry,
1416 nr_io_queues);
1417 if (result == 0) {
1418 break;
1419 } else if (result > 0) {
1420 nr_io_queues = result;
1421 continue;
1422 } else {
1423 nr_io_queues = 1;
1424 break;
1425 }
1426 }
1427
1428 result = queue_request_irq(dev, dev->queues[0], "nvme admin");
1429 /* XXX: handle failure here */
1430
1431 cpu = cpumask_first(cpu_online_mask);
1432 for (i = 0; i < nr_io_queues; i++) {
1433 irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
1434 cpu = cpumask_next(cpu, cpu_online_mask);
1435 }
1436
1437 for (i = 0; i < nr_io_queues; i++) {
1438 dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
1439 NVME_Q_DEPTH, i);
1440 if (IS_ERR(dev->queues[i + 1]))
1441 return PTR_ERR(dev->queues[i + 1]);
1442 dev->queue_count++;
1443 }
1444
1445 for (; i < num_possible_cpus(); i++) {
1446 int target = i % rounddown_pow_of_two(dev->queue_count - 1);
1447 dev->queues[i + 1] = dev->queues[target + 1];
1448 }
1449
1450 return 0;
1451}
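/*
 * Mapping example (editor's illustration): queues[0] is the admin queue and
 * queue i + 1 serves CPU i, which is what get_nvmeq()'s get_cpu() + 1 relies
 * on.  If fewer I/O queues than possible CPUs were created, the trailing
 * loop above aliases the extras: with 4 I/O queues and 8 possible CPUs,
 * CPUs 4..7 share queues 1..4 again via i % rounddown_pow_of_two(4).
 */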
1452
1453static void nvme_free_queues(struct nvme_dev *dev)
1454{
1455 int i;
1456
1457 for (i = dev->queue_count - 1; i >= 0; i--)
1458 nvme_free_queue(dev, i);
1459}
1460
1461static int __devinit nvme_dev_add(struct nvme_dev *dev)
1462{
1463 int res, nn, i;
1464 struct nvme_ns *ns, *next;
1465 struct nvme_id_ctrl *ctrl;
1466 struct nvme_id_ns *id_ns;
1467 void *mem;
1468 dma_addr_t dma_addr;
1469
1470 res = nvme_setup_io_queues(dev);
1471 if (res)
1472 return res;
1473
1474 mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
1475 GFP_KERNEL);
1476
1477 res = nvme_identify(dev, 0, 1, dma_addr);
1478 if (res) {
1479 res = -EIO;
1480 goto out_free;
1481 }
1482
1483 ctrl = mem;
1484 nn = le32_to_cpup(&ctrl->nn);
1485 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
1486 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
1487 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
1488
1489 id_ns = mem;
1490 for (i = 1; i <= nn; i++) {
1491 res = nvme_identify(dev, i, 0, dma_addr);
1492 if (res)
1493 continue;
1494
1495 if (id_ns->ncap == 0)
1496 continue;
1497
1498 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
1499 dma_addr + 4096);
1500 if (res)
1501 continue;
1502
1503 ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
1504 if (ns)
1505 list_add_tail(&ns->list, &dev->namespaces);
1506 }
1507 list_for_each_entry(ns, &dev->namespaces, list)
1508 add_disk(ns->disk);
1509
1510 goto out;
1511
1512 out_free:
1513 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1514 list_del(&ns->list);
1515 nvme_ns_free(ns);
1516 }
1517
1518 out:
1519 dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
1520 return res;
1521}
1522
1523static int nvme_dev_remove(struct nvme_dev *dev)
1524{
1525 struct nvme_ns *ns, *next;
1526
1527 spin_lock(&dev_list_lock);
1528 list_del(&dev->node);
1529 spin_unlock(&dev_list_lock);
1530
1531 /* TODO: wait for all outstanding I/O to finish, or cancel it */
1532
1533 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1534 list_del(&ns->list);
1535 del_gendisk(ns->disk);
1536 nvme_ns_free(ns);
1537 }
1538
1539 nvme_free_queues(dev);
1540
1541 return 0;
1542}
1543
1544static int nvme_setup_prp_pools(struct nvme_dev *dev)
1545{
1546 struct device *dmadev = &dev->pci_dev->dev;
1547 dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
1548 PAGE_SIZE, PAGE_SIZE, 0);
1549 if (!dev->prp_page_pool)
1550 return -ENOMEM;
1551
1552 /* Optimisation for I/Os between 4k and 128k */
1553 dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
1554 256, 256, 0);
1555 if (!dev->prp_small_pool) {
1556 dma_pool_destroy(dev->prp_page_pool);
1557 return -ENOMEM;
1558 }
1559 return 0;
1560}
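/*
 * Sizing note (illustrative): a 256-byte PRP list holds 256 / 8 = 32 entries,
 * enough for transfers of up to 32 pages (128KiB with 4KiB pages) beyond the
 * first, partial page described by prp1.  That matches the
 * "nprps <= (256 / 8)" test in nvme_setup_prps(), which picks this pool for
 * such I/Os and falls back to full-page lists for anything larger.
 */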
1561
1562static void nvme_release_prp_pools(struct nvme_dev *dev)
1563{
1564 dma_pool_destroy(dev->prp_page_pool);
1565 dma_pool_destroy(dev->prp_small_pool);
1566}
1567
1568/* XXX: Use an ida or something to let remove / add work correctly */
1569static void nvme_set_instance(struct nvme_dev *dev)
1570{
1571 static int instance;
1572 dev->instance = instance++;
1573}
1574
1575static void nvme_release_instance(struct nvme_dev *dev)
1576{
1577}
1578
1579static int __devinit nvme_probe(struct pci_dev *pdev,
1580 const struct pci_device_id *id)
1581{
1582 int bars, result = -ENOMEM;
1583 struct nvme_dev *dev;
1584
1585 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1586 if (!dev)
1587 return -ENOMEM;
1588 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
1589 GFP_KERNEL);
1590 if (!dev->entry)
1591 goto free;
1592 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
1593 GFP_KERNEL);
1594 if (!dev->queues)
1595 goto free;
1596
1597 if (pci_enable_device_mem(pdev))
1598 goto free;
1599 pci_set_master(pdev);
1600 bars = pci_select_bars(pdev, IORESOURCE_MEM);
1601 if (pci_request_selected_regions(pdev, bars, "nvme"))
1602 goto disable;
1603
1604 INIT_LIST_HEAD(&dev->namespaces);
1605 dev->pci_dev = pdev;
1606 pci_set_drvdata(pdev, dev);
1607 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
1608 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
1609 nvme_set_instance(dev);
1610 dev->entry[0].vector = pdev->irq;
1611
1612 result = nvme_setup_prp_pools(dev);
1613 if (result)
1614 goto disable_msix;
1615
1616 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
1617 if (!dev->bar) {
1618 result = -ENOMEM;
1619 goto disable_msix;
1620 }
1621
1622 result = nvme_configure_admin_queue(dev);
1623 if (result)
1624 goto unmap;
1625 dev->queue_count++;
1626
1627 spin_lock(&dev_list_lock);
1628 list_add(&dev->node, &dev_list);
1629 spin_unlock(&dev_list_lock);
1630
1631 result = nvme_dev_add(dev);
1632 if (result)
1633 goto delete;
1634
1635 return 0;
1636
1637 delete:
1638 spin_lock(&dev_list_lock);
1639 list_del(&dev->node);
1640 spin_unlock(&dev_list_lock);
1641
1642 nvme_free_queues(dev);
1643 unmap:
1644 iounmap(dev->bar);
1645 disable_msix:
1646 pci_disable_msix(pdev);
1647 nvme_release_instance(dev);
1648 nvme_release_prp_pools(dev);
1649 disable:
1650 pci_disable_device(pdev);
1651 pci_release_regions(pdev);
1652 free:
1653 kfree(dev->queues);
1654 kfree(dev->entry);
1655 kfree(dev);
1656 return result;
1657}
1658
1659static void __devexit nvme_remove(struct pci_dev *pdev)
1660{
1661 struct nvme_dev *dev = pci_get_drvdata(pdev);
1662 nvme_dev_remove(dev);
1663 pci_disable_msix(pdev);
1664 iounmap(dev->bar);
1665 nvme_release_instance(dev);
1666 nvme_release_prp_pools(dev);
1667 pci_disable_device(pdev);
1668 pci_release_regions(pdev);
1669 kfree(dev->queues);
1670 kfree(dev->entry);
1671 kfree(dev);
1672}
1673
1674/* These functions are yet to be implemented */
1675#define nvme_error_detected NULL
1676#define nvme_dump_registers NULL
1677#define nvme_link_reset NULL
1678#define nvme_slot_reset NULL
1679#define nvme_error_resume NULL
1680#define nvme_suspend NULL
1681#define nvme_resume NULL
1682
1683static struct pci_error_handlers nvme_err_handler = {
1684 .error_detected = nvme_error_detected,
1685 .mmio_enabled = nvme_dump_registers,
1686 .link_reset = nvme_link_reset,
1687 .slot_reset = nvme_slot_reset,
1688 .resume = nvme_error_resume,
1689};
1690
1691/* Move to pci_ids.h later */
1692#define PCI_CLASS_STORAGE_EXPRESS 0x010802
1693
1694static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
1695 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
1696 { 0, }
1697};
1698MODULE_DEVICE_TABLE(pci, nvme_id_table);
1699
1700static struct pci_driver nvme_driver = {
1701 .name = "nvme",
1702 .id_table = nvme_id_table,
1703 .probe = nvme_probe,
1704 .remove = __devexit_p(nvme_remove),
1705 .suspend = nvme_suspend,
1706 .resume = nvme_resume,
1707 .err_handler = &nvme_err_handler,
1708};
1709
1710static int __init nvme_init(void)
1711{
1712 int result = -EBUSY;
1713
1714 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
1715 if (IS_ERR(nvme_thread))
1716 return PTR_ERR(nvme_thread);
1717
1718 nvme_major = register_blkdev(nvme_major, "nvme");
1719 if (nvme_major <= 0)
1720 goto kill_kthread;
1721
1722 result = pci_register_driver(&nvme_driver);
1723 if (result)
1724 goto unregister_blkdev;
1725 return 0;
1726
1727 unregister_blkdev:
1728 unregister_blkdev(nvme_major, "nvme");
1729 kill_kthread:
1730 kthread_stop(nvme_thread);
1731 return result;
1732}
1733
1734static void __exit nvme_exit(void)
1735{
1736 pci_unregister_driver(&nvme_driver);
1737 unregister_blkdev(nvme_major, "nvme");
1738 kthread_stop(nvme_thread);
1739}
1740
1741MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
1742MODULE_LICENSE("GPL");
1743MODULE_VERSION("0.8");
1744module_init(nvme_init);
1745module_exit(nvme_exit);