author     Linus Torvalds <torvalds@linux-foundation.org>   2016-07-26 18:37:51 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-07-26 18:37:51 -0400
commit     3fc9d690936fb2e20e180710965ba2cc3a0881f8 (patch)
tree       eaf4d9f788ee4d17cd40a116413873b7f80f9aa7
parent     d05d7f40791ccbb6e543cc5dd6a6aa08fc71d635 (diff)
parent     13880f5b57adf34d050cf7e229a6326da45a7347 (diff)
Merge branch 'for-4.8/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "This branch also contains core changes. I've come to the conclusion
  that from 4.9 and forward, I'll be doing just a single branch. We
  often have dependencies between core and drivers, and it's hard to
  always split them up appropriately without pulling core into drivers
  when that happens.

  That said, this contains:

   - separate secure erase type for the core block layer, from Christoph.
   - set of discard fixes, from Christoph.
   - bio shrinking fixes from Christoph, as a followup to the op/flags
     change in the core branch.
   - map and append request fixes from Christoph.
   - NVMeF (NVMe over Fabrics) code from Christoph. This is pretty
     exciting!
   - nvme-loop fixes from Arnd.
   - removal of ->driverfs_dev from Dan, after providing a
     device_add_disk() helper.
   - bcache fixes from Bhaktipriya and Yijing.
   - cdrom subchannel read fix from Vchannaiah.
   - set of lightnvm updates from Wenwei, Matias, Johannes, and Javier.
   - set of drbd updates and fixes from Fabian, Lars, and Philipp.
   - mg_disk error path fix from Bart.
   - user notification for failed device add for loop, from Minfei.
   - NVMe in general:
      + NVMe delay quirk from Guilherme.
      + SR-IOV support and command retry limits from Keith.
      + fix for memory-less NUMA node from Masayoshi.
      + use UINT_MAX for discard sectors, from Minfei.
      + cancel IO fixes from Ming.
      + don't allocate unused major, from Neil.
      + error code fixup from Dan.
      + use constants for PSDT/FUSE from James.
      + variable init fix from Jay.
      + fabrics fixes from Ming, Sagi, and Wei.
      + various fixes"

* 'for-4.8/drivers' of git://git.kernel.dk/linux-block: (115 commits)
  nvme/pci: Provide SR-IOV support
  nvme: initialize variable before logical OR'ing it
  block: unexport various bio mapping helpers
  scsi/osd: open code blk_make_request
  target: stop using blk_make_request
  block: simplify and export blk_rq_append_bio
  block: ensure bios return from blk_get_request are properly initialized
  virtio_blk: use blk_rq_map_kern
  memstick: don't allow REQ_TYPE_BLOCK_PC requests
  block: shrink bio size again
  block: simplify and cleanup bvec pool handling
  block: get rid of bio_rw and READA
  block: don't ignore -EOPNOTSUPP blkdev_issue_write_same
  block: introduce BLKDEV_DISCARD_ZERO to fix zeroout
  NVMe: don't allocate unused nvme_major
  nvme: avoid crashes when node 0 is memoryless node.
  nvme: Limit command retries
  loop: Make user notify for adding loop device failed
  nvme-loop: fix nvme-loop Kconfig dependencies
  nvmet: fix return value check in nvmet_subsys_alloc()
  ...
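The ->driverfs_dev removal mentioned above amounts to a one-line conversion in each driver: instead of pointing the gendisk at its parent device and then calling add_disk(), the parent is passed to the new device_add_disk() helper. A minimal sketch of that conversion, following the axonram and ubd changes later in this diff (mydrv_probe() and the surrounding setup are hypothetical):

	/* Before this series:
	 *	disk->driverfs_dev = &pdev->dev;
	 *	add_disk(disk);
	 * After it, the parent goes straight into the new helper. */
	static int mydrv_probe(struct platform_device *pdev)
	{
		struct gendisk *disk;

		disk = alloc_disk(1);
		if (!disk)
			return -ENOMEM;

		/* ... set up disk->queue, fops and capacity as before ... */

		device_add_disk(&pdev->dev, disk);	/* replaces driverfs_dev + add_disk() */
		return 0;
	}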
-rw-r--r--  Documentation/ioctl/cdrom.txt | 3
-rw-r--r--  MAINTAINERS | 7
-rw-r--r--  arch/powerpc/sysdev/axonram.c | 3
-rw-r--r--  arch/um/drivers/ubd_kern.c | 5
-rw-r--r--  block/bio-integrity.c | 9
-rw-r--r--  block/bio.c | 35
-rw-r--r--  block/blk-core.c | 97
-rw-r--r--  block/blk-lib.c | 44
-rw-r--r--  block/blk-map.c | 25
-rw-r--r--  block/blk-merge.c | 6
-rw-r--r--  block/blk-mq-tag.c | 26
-rw-r--r--  block/blk-mq.c | 43
-rw-r--r--  block/blk.h | 2
-rw-r--r--  block/genhd.c | 18
-rw-r--r--  drivers/block/brd.c | 4
-rw-r--r--  drivers/block/cciss.c | 3
-rw-r--r--  drivers/block/drbd/drbd_actlog.c | 29
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 84
-rw-r--r--  drivers/block/drbd/drbd_debugfs.c | 13
-rw-r--r--  drivers/block/drbd/drbd_int.h | 49
-rw-r--r--  drivers/block/drbd/drbd_interval.h | 14
-rw-r--r--  drivers/block/drbd/drbd_main.c | 115
-rw-r--r--  drivers/block/drbd/drbd_nl.c | 282
-rw-r--r--  drivers/block/drbd/drbd_proc.c | 30
-rw-r--r--  drivers/block/drbd/drbd_protocol.h | 77
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 535
-rw-r--r--  drivers/block/drbd/drbd_req.c | 118
-rw-r--r--  drivers/block/drbd/drbd_req.h | 5
-rw-r--r--  drivers/block/drbd/drbd_state.c | 61
-rw-r--r--  drivers/block/drbd/drbd_state.h | 2
-rw-r--r--  drivers/block/drbd/drbd_strings.c | 8
-rw-r--r--  drivers/block/drbd/drbd_worker.c | 115
-rw-r--r--  drivers/block/floppy.c | 3
-rw-r--r--  drivers/block/loop.c | 1
-rw-r--r--  drivers/block/mg_disk.c | 9
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 5
-rw-r--r--  drivers/block/null_blk.c | 2
-rw-r--r--  drivers/block/ps3disk.c | 3
-rw-r--r--  drivers/block/ps3vram.c | 3
-rw-r--r--  drivers/block/rsxx/dev.c | 4
-rw-r--r--  drivers/block/skd_main.c | 8
-rw-r--r--  drivers/block/sunvdc.c | 3
-rw-r--r--  drivers/block/umem.c | 6
-rw-r--r--  drivers/block/virtio_blk.c | 24
-rw-r--r--  drivers/block/xen-blkback/xenbus.c | 2
-rw-r--r--  drivers/block/xen-blkfront.c | 18
-rw-r--r--  drivers/cdrom/cdrom.c | 28
-rw-r--r--  drivers/ide/ide-cd.c | 3
-rw-r--r--  drivers/ide/ide-gd.c | 3
-rw-r--r--  drivers/lightnvm/Kconfig | 10
-rw-r--r--  drivers/lightnvm/core.c | 242
-rw-r--r--  drivers/lightnvm/gennvm.c | 385
-rw-r--r--  drivers/lightnvm/gennvm.h | 10
-rw-r--r--  drivers/lightnvm/rrpc.c | 149
-rw-r--r--  drivers/lightnvm/rrpc.h | 13
-rw-r--r--  drivers/lightnvm/sysblk.c | 8
-rw-r--r--  drivers/md/bcache/closure.c | 2
-rw-r--r--  drivers/md/bcache/closure.h | 3
-rw-r--r--  drivers/md/bcache/io.c | 1
-rw-r--r--  drivers/md/bcache/super.c | 10
-rw-r--r--  drivers/md/dm-raid1.c | 8
-rw-r--r--  drivers/md/dm-snap.c | 13
-rw-r--r--  drivers/md/dm-zero.c | 15
-rw-r--r--  drivers/md/dm.c | 2
-rw-r--r--  drivers/md/raid1.c | 5
-rw-r--r--  drivers/md/raid10.c | 5
-rw-r--r--  drivers/md/raid5.c | 2
-rw-r--r--  drivers/memstick/core/ms_block.c | 6
-rw-r--r--  drivers/memstick/core/mspro_block.c | 6
-rw-r--r--  drivers/mmc/card/block.c | 15
-rw-r--r--  drivers/mmc/card/queue.c | 2
-rw-r--r--  drivers/mtd/mtd_blkdevs.c | 4
-rw-r--r--  drivers/nvdimm/blk.c | 3
-rw-r--r--  drivers/nvdimm/btt.c | 3
-rw-r--r--  drivers/nvdimm/bus.c | 2
-rw-r--r--  drivers/nvdimm/pmem.c | 3
-rw-r--r--  drivers/nvme/Kconfig | 1
-rw-r--r--  drivers/nvme/Makefile | 1
-rw-r--r--  drivers/nvme/host/Kconfig | 19
-rw-r--r--  drivers/nvme/host/Makefile | 6
-rw-r--r--  drivers/nvme/host/core.c | 318
-rw-r--r--  drivers/nvme/host/fabrics.c | 952
-rw-r--r--  drivers/nvme/host/fabrics.h | 132
-rw-r--r--  drivers/nvme/host/lightnvm.c | 4
-rw-r--r--  drivers/nvme/host/nvme.h | 48
-rw-r--r--  drivers/nvme/host/pci.c | 68
-rw-r--r--  drivers/nvme/host/rdma.c | 2018
-rw-r--r--  drivers/nvme/target/Kconfig | 36
-rw-r--r--  drivers/nvme/target/Makefile | 9
-rw-r--r--  drivers/nvme/target/admin-cmd.c | 465
-rw-r--r--  drivers/nvme/target/configfs.c | 917
-rw-r--r--  drivers/nvme/target/core.c | 964
-rw-r--r--  drivers/nvme/target/discovery.c | 221
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 240
-rw-r--r--  drivers/nvme/target/io-cmd.c | 215
-rw-r--r--  drivers/nvme/target/loop.c | 754
-rw-r--r--  drivers/nvme/target/nvmet.h | 331
-rw-r--r--  drivers/nvme/target/rdma.c | 1448
-rw-r--r--  drivers/s390/block/dasd_genhd.c | 3
-rw-r--r--  drivers/s390/block/dcssblk.c | 3
-rw-r--r--  drivers/s390/block/scm_blk.c | 3
-rw-r--r--  drivers/scsi/osd/osd_initiator.c | 25
-rw-r--r--  drivers/scsi/sd.c | 3
-rw-r--r--  drivers/scsi/sr.c | 3
-rw-r--r--  drivers/target/target_core_pscsi.c | 87
-rw-r--r--  fs/buffer.c | 12
-rw-r--r--  fs/f2fs/checkpoint.c | 2
-rw-r--r--  fs/f2fs/gc.c | 3
-rw-r--r--  fs/f2fs/node.c | 2
-rw-r--r--  fs/gfs2/bmap.c | 3
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 2
-rw-r--r--  fs/reiserfs/stree.c | 2
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/directory.c | 2
-rw-r--r--  include/linux/bio.h | 2
-rw-r--r--  include/linux/blk-mq.h | 5
-rw-r--r--  include/linux/blk_types.h | 31
-rw-r--r--  include/linux/blkdev.h | 30
-rw-r--r--  include/linux/drbd.h | 10
-rw-r--r--  include/linux/drbd_genl.h | 7
-rw-r--r--  include/linux/drbd_limits.h | 15
-rw-r--r--  include/linux/fs.h | 18
-rw-r--r--  include/linux/genhd.h | 8
-rw-r--r--  include/linux/lightnvm.h | 34
-rw-r--r--  include/linux/nvme-rdma.h | 71
-rw-r--r--  include/linux/nvme.h | 406
-rw-r--r--  include/trace/events/f2fs.h | 4
-rw-r--r--  kernel/trace/blktrace.c | 6
129 files changed, 11613 insertions, 1279 deletions
diff --git a/Documentation/ioctl/cdrom.txt b/Documentation/ioctl/cdrom.txt
index 59df81c8da2b..a4d62a9d6771 100644
--- a/Documentation/ioctl/cdrom.txt
+++ b/Documentation/ioctl/cdrom.txt
@@ -340,7 +340,8 @@ CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
340 EINVAL format not CDROM_MSF or CDROM_LBA 340 EINVAL format not CDROM_MSF or CDROM_LBA
341 341
342 notes: 342 notes:
343 Format is converted to CDROM_MSF on return 343 Format is converted to CDROM_MSF or CDROM_LBA
344 as per user request on return
344 345
345 346
346 347
diff --git a/MAINTAINERS b/MAINTAINERS
index 9987f8491c55..831258676578 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8184,6 +8184,13 @@ S: Supported
8184F: drivers/nvme/host/ 8184F: drivers/nvme/host/
8185F: include/linux/nvme.h 8185F: include/linux/nvme.h
8186 8186
8187NVM EXPRESS TARGET DRIVER
8188M: Christoph Hellwig <hch@lst.de>
8189M: Sagi Grimberg <sagi@grimberg.me>
8190L: linux-nvme@lists.infradead.org
8191S: Supported
8192F: drivers/nvme/target/
8193
8187NVMEM FRAMEWORK 8194NVMEM FRAMEWORK
8188M: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> 8195M: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
8189M: Maxime Ripard <maxime.ripard@free-electrons.com> 8196M: Maxime Ripard <maxime.ripard@free-electrons.com>
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index ff75d70f7285..f9af6461521a 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -223,7 +223,6 @@ static int axon_ram_probe(struct platform_device *device)
223 bank->disk->first_minor = azfs_minor; 223 bank->disk->first_minor = azfs_minor;
224 bank->disk->fops = &axon_ram_devops; 224 bank->disk->fops = &axon_ram_devops;
225 bank->disk->private_data = bank; 225 bank->disk->private_data = bank;
226 bank->disk->driverfs_dev = &device->dev;
227 226
228 sprintf(bank->disk->disk_name, "%s%d", 227 sprintf(bank->disk->disk_name, "%s%d",
229 AXON_RAM_DEVICE_NAME, axon_ram_bank_id); 228 AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
@@ -238,7 +237,7 @@ static int axon_ram_probe(struct platform_device *device)
238 set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT); 237 set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
239 blk_queue_make_request(bank->disk->queue, axon_ram_make_request); 238 blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
240 blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); 239 blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
241 add_disk(bank->disk); 240 device_add_disk(&device->dev, bank->disk);
242 241
243 bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0); 242 bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0);
244 if (bank->irq_id == NO_IRQ) { 243 if (bank->irq_id == NO_IRQ) {
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ef6b4d960bad..f3540270d096 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -801,6 +801,7 @@ static void ubd_device_release(struct device *dev)
801static int ubd_disk_register(int major, u64 size, int unit, 801static int ubd_disk_register(int major, u64 size, int unit,
802 struct gendisk **disk_out) 802 struct gendisk **disk_out)
803{ 803{
804 struct device *parent = NULL;
804 struct gendisk *disk; 805 struct gendisk *disk;
805 806
806 disk = alloc_disk(1 << UBD_SHIFT); 807 disk = alloc_disk(1 << UBD_SHIFT);
@@ -823,12 +824,12 @@ static int ubd_disk_register(int major, u64 size, int unit,
823 ubd_devs[unit].pdev.dev.release = ubd_device_release; 824 ubd_devs[unit].pdev.dev.release = ubd_device_release;
824 dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); 825 dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
825 platform_device_register(&ubd_devs[unit].pdev); 826 platform_device_register(&ubd_devs[unit].pdev);
826 disk->driverfs_dev = &ubd_devs[unit].pdev.dev; 827 parent = &ubd_devs[unit].pdev.dev;
827 } 828 }
828 829
829 disk->private_data = &ubd_devs[unit]; 830 disk->private_data = &ubd_devs[unit];
830 disk->queue = ubd_devs[unit].queue; 831 disk->queue = ubd_devs[unit].queue;
831 add_disk(disk); 832 device_add_disk(parent, disk);
832 833
833 *disk_out = disk; 834 *disk_out = disk;
834 return 0; 835 return 0;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 15d37b1cd500..f70cc3bdfd01 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -54,7 +54,6 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
54{ 54{
55 struct bio_integrity_payload *bip; 55 struct bio_integrity_payload *bip;
56 struct bio_set *bs = bio->bi_pool; 56 struct bio_set *bs = bio->bi_pool;
57 unsigned long idx = BIO_POOL_NONE;
58 unsigned inline_vecs; 57 unsigned inline_vecs;
59 58
60 if (!bs || !bs->bio_integrity_pool) { 59 if (!bs || !bs->bio_integrity_pool) {
@@ -72,17 +71,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
72 memset(bip, 0, sizeof(*bip)); 71 memset(bip, 0, sizeof(*bip));
73 72
74 if (nr_vecs > inline_vecs) { 73 if (nr_vecs > inline_vecs) {
74 unsigned long idx = 0;
75
75 bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, 76 bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
76 bs->bvec_integrity_pool); 77 bs->bvec_integrity_pool);
77 if (!bip->bip_vec) 78 if (!bip->bip_vec)
78 goto err; 79 goto err;
79 bip->bip_max_vcnt = bvec_nr_vecs(idx); 80 bip->bip_max_vcnt = bvec_nr_vecs(idx);
81 bip->bip_slab = idx;
80 } else { 82 } else {
81 bip->bip_vec = bip->bip_inline_vecs; 83 bip->bip_vec = bip->bip_inline_vecs;
82 bip->bip_max_vcnt = inline_vecs; 84 bip->bip_max_vcnt = inline_vecs;
83 } 85 }
84 86
85 bip->bip_slab = idx;
86 bip->bip_bio = bio; 87 bip->bip_bio = bio;
87 bio->bi_integrity = bip; 88 bio->bi_integrity = bip;
88 bio->bi_rw |= REQ_INTEGRITY; 89 bio->bi_rw |= REQ_INTEGRITY;
@@ -111,9 +112,7 @@ void bio_integrity_free(struct bio *bio)
111 bip->bip_vec->bv_offset); 112 bip->bip_vec->bv_offset);
112 113
113 if (bs && bs->bio_integrity_pool) { 114 if (bs && bs->bio_integrity_pool) {
114 if (bip->bip_slab != BIO_POOL_NONE) 115 bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
115 bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
116 bip->bip_slab);
117 116
118 mempool_free(bip, bs->bio_integrity_pool); 117 mempool_free(bip, bs->bio_integrity_pool);
119 } else { 118 } else {
diff --git a/block/bio.c b/block/bio.c
index 848cd351513b..54ee3846c3a5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -43,7 +43,7 @@
43 * unsigned short 43 * unsigned short
44 */ 44 */
45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
46static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 46static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
48}; 48};
49#undef BV 49#undef BV
@@ -160,11 +160,15 @@ unsigned int bvec_nr_vecs(unsigned short idx)
160 160
161void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) 161void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
162{ 162{
163 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); 163 if (!idx)
164 return;
165 idx--;
166
167 BIO_BUG_ON(idx >= BVEC_POOL_NR);
164 168
165 if (idx == BIOVEC_MAX_IDX) 169 if (idx == BVEC_POOL_MAX) {
166 mempool_free(bv, pool); 170 mempool_free(bv, pool);
167 else { 171 } else {
168 struct biovec_slab *bvs = bvec_slabs + idx; 172 struct biovec_slab *bvs = bvec_slabs + idx;
169 173
170 kmem_cache_free(bvs->slab, bv); 174 kmem_cache_free(bvs->slab, bv);
@@ -206,7 +210,7 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
206 * idx now points to the pool we want to allocate from. only the 210 * idx now points to the pool we want to allocate from. only the
207 * 1-vec entry pool is mempool backed. 211 * 1-vec entry pool is mempool backed.
208 */ 212 */
209 if (*idx == BIOVEC_MAX_IDX) { 213 if (*idx == BVEC_POOL_MAX) {
210fallback: 214fallback:
211 bvl = mempool_alloc(pool, gfp_mask); 215 bvl = mempool_alloc(pool, gfp_mask);
212 } else { 216 } else {
@@ -226,11 +230,12 @@ fallback:
226 */ 230 */
227 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); 231 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
228 if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { 232 if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
229 *idx = BIOVEC_MAX_IDX; 233 *idx = BVEC_POOL_MAX;
230 goto fallback; 234 goto fallback;
231 } 235 }
232 } 236 }
233 237
238 (*idx)++;
234 return bvl; 239 return bvl;
235} 240}
236 241
@@ -250,8 +255,7 @@ static void bio_free(struct bio *bio)
250 __bio_free(bio); 255 __bio_free(bio);
251 256
252 if (bs) { 257 if (bs) {
253 if (bio_flagged(bio, BIO_OWNS_VEC)) 258 bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
254 bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
255 259
256 /* 260 /*
257 * If we have front padding, adjust the bio pointer before freeing 261 * If we have front padding, adjust the bio pointer before freeing
@@ -420,7 +424,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
420 gfp_t saved_gfp = gfp_mask; 424 gfp_t saved_gfp = gfp_mask;
421 unsigned front_pad; 425 unsigned front_pad;
422 unsigned inline_vecs; 426 unsigned inline_vecs;
423 unsigned long idx = BIO_POOL_NONE;
424 struct bio_vec *bvl = NULL; 427 struct bio_vec *bvl = NULL;
425 struct bio *bio; 428 struct bio *bio;
426 void *p; 429 void *p;
@@ -480,6 +483,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
480 bio_init(bio); 483 bio_init(bio);
481 484
482 if (nr_iovecs > inline_vecs) { 485 if (nr_iovecs > inline_vecs) {
486 unsigned long idx = 0;
487
483 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); 488 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
484 if (!bvl && gfp_mask != saved_gfp) { 489 if (!bvl && gfp_mask != saved_gfp) {
485 punt_bios_to_rescuer(bs); 490 punt_bios_to_rescuer(bs);
@@ -490,13 +495,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
490 if (unlikely(!bvl)) 495 if (unlikely(!bvl))
491 goto err_free; 496 goto err_free;
492 497
493 bio_set_flag(bio, BIO_OWNS_VEC); 498 bio->bi_flags |= idx << BVEC_POOL_OFFSET;
494 } else if (nr_iovecs) { 499 } else if (nr_iovecs) {
495 bvl = bio->bi_inline_vecs; 500 bvl = bio->bi_inline_vecs;
496 } 501 }
497 502
498 bio->bi_pool = bs; 503 bio->bi_pool = bs;
499 bio->bi_flags |= idx << BIO_POOL_OFFSET;
500 bio->bi_max_vecs = nr_iovecs; 504 bio->bi_max_vecs = nr_iovecs;
501 bio->bi_io_vec = bvl; 505 bio->bi_io_vec = bvl;
502 return bio; 506 return bio;
@@ -568,7 +572,7 @@ EXPORT_SYMBOL(bio_phys_segments);
568 */ 572 */
569void __bio_clone_fast(struct bio *bio, struct bio *bio_src) 573void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
570{ 574{
571 BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); 575 BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
572 576
573 /* 577 /*
574 * most users will be overriding ->bi_bdev with a new target, 578 * most users will be overriding ->bi_bdev with a new target,
@@ -1097,7 +1101,6 @@ int bio_uncopy_user(struct bio *bio)
1097 bio_put(bio); 1101 bio_put(bio);
1098 return ret; 1102 return ret;
1099} 1103}
1100EXPORT_SYMBOL(bio_uncopy_user);
1101 1104
1102/** 1105/**
1103 * bio_copy_user_iov - copy user data to bio 1106 * bio_copy_user_iov - copy user data to bio
@@ -1392,7 +1395,6 @@ void bio_unmap_user(struct bio *bio)
1392 __bio_unmap_user(bio); 1395 __bio_unmap_user(bio);
1393 bio_put(bio); 1396 bio_put(bio);
1394} 1397}
1395EXPORT_SYMBOL(bio_unmap_user);
1396 1398
1397static void bio_map_kern_endio(struct bio *bio) 1399static void bio_map_kern_endio(struct bio *bio)
1398{ 1400{
@@ -1538,7 +1540,6 @@ cleanup:
1538 bio_put(bio); 1540 bio_put(bio);
1539 return ERR_PTR(-ENOMEM); 1541 return ERR_PTR(-ENOMEM);
1540} 1542}
1541EXPORT_SYMBOL(bio_copy_kern);
1542 1543
1543/* 1544/*
1544 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions 1545 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1832,7 +1833,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
1832 */ 1833 */
1833mempool_t *biovec_create_pool(int pool_entries) 1834mempool_t *biovec_create_pool(int pool_entries)
1834{ 1835{
1835 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1836 struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
1836 1837
1837 return mempool_create_slab_pool(pool_entries, bp->slab); 1838 return mempool_create_slab_pool(pool_entries, bp->slab);
1838} 1839}
@@ -2009,7 +2010,7 @@ static void __init biovec_init_slabs(void)
2009{ 2010{
2010 int i; 2011 int i;
2011 2012
2012 for (i = 0; i < BIOVEC_NR_POOLS; i++) { 2013 for (i = 0; i < BVEC_POOL_NR; i++) {
2013 int size; 2014 int size;
2014 struct biovec_slab *bvs = bvec_slabs + i; 2015 struct biovec_slab *bvs = bvec_slabs + i;
2015 2016
diff --git a/block/blk-core.c b/block/blk-core.c
index 3cfd67d006fb..a687e9cc16c2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1294,10 +1294,15 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1294 1294
1295 spin_lock_irq(q->queue_lock); 1295 spin_lock_irq(q->queue_lock);
1296 rq = get_request(q, rw, 0, NULL, gfp_mask); 1296 rq = get_request(q, rw, 0, NULL, gfp_mask);
1297 if (IS_ERR(rq)) 1297 if (IS_ERR(rq)) {
1298 spin_unlock_irq(q->queue_lock); 1298 spin_unlock_irq(q->queue_lock);
1299 /* q->queue_lock is unlocked at this point */ 1299 return rq;
1300 }
1300 1301
1302 /* q->queue_lock is unlocked at this point */
1303 rq->__data_len = 0;
1304 rq->__sector = (sector_t) -1;
1305 rq->bio = rq->biotail = NULL;
1301 return rq; 1306 return rq;
1302} 1307}
1303 1308
@@ -1313,63 +1318,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1313EXPORT_SYMBOL(blk_get_request); 1318EXPORT_SYMBOL(blk_get_request);
1314 1319
1315/** 1320/**
1316 * blk_make_request - given a bio, allocate a corresponding struct request.
1317 * @q: target request queue
1318 * @bio: The bio describing the memory mappings that will be submitted for IO.
1319 * It may be a chained-bio properly constructed by block/bio layer.
1320 * @gfp_mask: gfp flags to be used for memory allocation
1321 *
1322 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
1323 * type commands. Where the struct request needs to be farther initialized by
1324 * the caller. It is passed a &struct bio, which describes the memory info of
1325 * the I/O transfer.
1326 *
1327 * The caller of blk_make_request must make sure that bi_io_vec
1328 * are set to describe the memory buffers. That bio_data_dir() will return
1329 * the needed direction of the request. (And all bio's in the passed bio-chain
1330 * are properly set accordingly)
1331 *
1332 * If called under none-sleepable conditions, mapped bio buffers must not
1333 * need bouncing, by calling the appropriate masked or flagged allocator,
1334 * suitable for the target device. Otherwise the call to blk_queue_bounce will
1335 * BUG.
1336 *
1337 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
1338 * given to how you allocate bios. In particular, you cannot use
1339 * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
1340 * you risk waiting for IO completion of a bio that hasn't been submitted yet,
1341 * thus resulting in a deadlock. Alternatively bios should be allocated using
1342 * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
1343 * If possible a big IO should be split into smaller parts when allocation
1344 * fails. Partial allocation should not be an error, or you risk a live-lock.
1345 */
1346struct request *blk_make_request(struct request_queue *q, struct bio *bio,
1347 gfp_t gfp_mask)
1348{
1349 struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
1350
1351 if (IS_ERR(rq))
1352 return rq;
1353
1354 blk_rq_set_block_pc(rq);
1355
1356 for_each_bio(bio) {
1357 struct bio *bounce_bio = bio;
1358 int ret;
1359
1360 blk_queue_bounce(q, &bounce_bio);
1361 ret = blk_rq_append_bio(q, rq, bounce_bio);
1362 if (unlikely(ret)) {
1363 blk_put_request(rq);
1364 return ERR_PTR(ret);
1365 }
1366 }
1367
1368 return rq;
1369}
1370EXPORT_SYMBOL(blk_make_request);
1371
1372/**
1373 * blk_rq_set_block_pc - initialize a request to type BLOCK_PC 1321 * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
1374 * @rq: request to be initialized 1322 * @rq: request to be initialized
1375 * 1323 *
@@ -1377,9 +1325,6 @@ EXPORT_SYMBOL(blk_make_request);
1377void blk_rq_set_block_pc(struct request *rq) 1325void blk_rq_set_block_pc(struct request *rq)
1378{ 1326{
1379 rq->cmd_type = REQ_TYPE_BLOCK_PC; 1327 rq->cmd_type = REQ_TYPE_BLOCK_PC;
1380 rq->__data_len = 0;
1381 rq->__sector = (sector_t) -1;
1382 rq->bio = rq->biotail = NULL;
1383 memset(rq->__cmd, 0, sizeof(rq->__cmd)); 1328 memset(rq->__cmd, 0, sizeof(rq->__cmd));
1384} 1329}
1385EXPORT_SYMBOL(blk_rq_set_block_pc); 1330EXPORT_SYMBOL(blk_rq_set_block_pc);
@@ -1982,16 +1927,21 @@ generic_make_request_checks(struct bio *bio)
1982 } 1927 }
1983 } 1928 }
1984 1929
1985 if ((bio_op(bio) == REQ_OP_DISCARD) && 1930 switch (bio_op(bio)) {
1986 (!blk_queue_discard(q) || 1931 case REQ_OP_DISCARD:
1987 ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { 1932 if (!blk_queue_discard(q))
1988 err = -EOPNOTSUPP; 1933 goto not_supported;
1989 goto end_io; 1934 break;
1990 } 1935 case REQ_OP_SECURE_ERASE:
1991 1936 if (!blk_queue_secure_erase(q))
1992 if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { 1937 goto not_supported;
1993 err = -EOPNOTSUPP; 1938 break;
1994 goto end_io; 1939 case REQ_OP_WRITE_SAME:
1940 if (!bdev_write_same(bio->bi_bdev))
1941 goto not_supported;
1942 break;
1943 default:
1944 break;
1995 } 1945 }
1996 1946
1997 /* 1947 /*
@@ -2008,6 +1958,8 @@ generic_make_request_checks(struct bio *bio)
2008 trace_block_bio_queue(q, bio); 1958 trace_block_bio_queue(q, bio);
2009 return true; 1959 return true;
2010 1960
1961not_supported:
1962 err = -EOPNOTSUPP;
2011end_io: 1963end_io:
2012 bio->bi_error = err; 1964 bio->bi_error = err;
2013 bio_endio(bio); 1965 bio_endio(bio);
@@ -3383,6 +3335,7 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
3383 3335
3384 return false; 3336 return false;
3385} 3337}
3338EXPORT_SYMBOL_GPL(blk_poll);
3386 3339
3387#ifdef CONFIG_PM 3340#ifdef CONFIG_PM
3388/** 3341/**
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9031d2af0b47..083e56f72308 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -23,20 +23,32 @@ static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
23} 23}
24 24
25int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 25int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
26 sector_t nr_sects, gfp_t gfp_mask, int op_flags, 26 sector_t nr_sects, gfp_t gfp_mask, int flags,
27 struct bio **biop) 27 struct bio **biop)
28{ 28{
29 struct request_queue *q = bdev_get_queue(bdev); 29 struct request_queue *q = bdev_get_queue(bdev);
30 struct bio *bio = *biop; 30 struct bio *bio = *biop;
31 unsigned int granularity; 31 unsigned int granularity;
32 enum req_op op;
32 int alignment; 33 int alignment;
33 34
34 if (!q) 35 if (!q)
35 return -ENXIO; 36 return -ENXIO;
36 if (!blk_queue_discard(q)) 37
37 return -EOPNOTSUPP; 38 if (flags & BLKDEV_DISCARD_SECURE) {
38 if ((op_flags & REQ_SECURE) && !blk_queue_secdiscard(q)) 39 if (flags & BLKDEV_DISCARD_ZERO)
39 return -EOPNOTSUPP; 40 return -EOPNOTSUPP;
41 if (!blk_queue_secure_erase(q))
42 return -EOPNOTSUPP;
43 op = REQ_OP_SECURE_ERASE;
44 } else {
45 if (!blk_queue_discard(q))
46 return -EOPNOTSUPP;
47 if ((flags & BLKDEV_DISCARD_ZERO) &&
48 !q->limits.discard_zeroes_data)
49 return -EOPNOTSUPP;
50 op = REQ_OP_DISCARD;
51 }
40 52
41 /* Zero-sector (unknown) and one-sector granularities are the same. */ 53 /* Zero-sector (unknown) and one-sector granularities are the same. */
42 granularity = max(q->limits.discard_granularity >> 9, 1U); 54 granularity = max(q->limits.discard_granularity >> 9, 1U);
@@ -66,7 +78,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
66 bio = next_bio(bio, 1, gfp_mask); 78 bio = next_bio(bio, 1, gfp_mask);
67 bio->bi_iter.bi_sector = sector; 79 bio->bi_iter.bi_sector = sector;
68 bio->bi_bdev = bdev; 80 bio->bi_bdev = bdev;
69 bio_set_op_attrs(bio, REQ_OP_DISCARD, op_flags); 81 bio_set_op_attrs(bio, op, 0);
70 82
71 bio->bi_iter.bi_size = req_sects << 9; 83 bio->bi_iter.bi_size = req_sects << 9;
72 nr_sects -= req_sects; 84 nr_sects -= req_sects;
@@ -100,20 +112,16 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
100int blkdev_issue_discard(struct block_device *bdev, sector_t sector, 112int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
101 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) 113 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
102{ 114{
103 int op_flags = 0;
104 struct bio *bio = NULL; 115 struct bio *bio = NULL;
105 struct blk_plug plug; 116 struct blk_plug plug;
106 int ret; 117 int ret;
107 118
108 if (flags & BLKDEV_DISCARD_SECURE)
109 op_flags |= REQ_SECURE;
110
111 blk_start_plug(&plug); 119 blk_start_plug(&plug);
112 ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, op_flags, 120 ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
113 &bio); 121 &bio);
114 if (!ret && bio) { 122 if (!ret && bio) {
115 ret = submit_bio_wait(bio); 123 ret = submit_bio_wait(bio);
116 if (ret == -EOPNOTSUPP) 124 if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
117 ret = 0; 125 ret = 0;
118 bio_put(bio); 126 bio_put(bio);
119 } 127 }
@@ -173,7 +181,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
173 ret = submit_bio_wait(bio); 181 ret = submit_bio_wait(bio);
174 bio_put(bio); 182 bio_put(bio);
175 } 183 }
176 return ret != -EOPNOTSUPP ? ret : 0; 184 return ret;
177} 185}
178EXPORT_SYMBOL(blkdev_issue_write_same); 186EXPORT_SYMBOL(blkdev_issue_write_same);
179 187
@@ -244,11 +252,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
244int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 252int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
245 sector_t nr_sects, gfp_t gfp_mask, bool discard) 253 sector_t nr_sects, gfp_t gfp_mask, bool discard)
246{ 254{
247 struct request_queue *q = bdev_get_queue(bdev); 255 if (discard) {
248 256 if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
249 if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && 257 BLKDEV_DISCARD_ZERO))
250 blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) 258 return 0;
251 return 0; 259 }
252 260
253 if (bdev_write_same(bdev) && 261 if (bdev_write_same(bdev) &&
254 blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, 262 blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
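The reworked __blkdev_issue_discard() above maps the BLKDEV_DISCARD_* flags onto REQ_OP_DISCARD versus REQ_OP_SECURE_ERASE, and the new BLKDEV_DISCARD_ZERO flag only succeeds when the queue reports discard_zeroes_data. A minimal caller sketch under those assumptions (punch_range() is a hypothetical helper; error handling beyond -EOPNOTSUPP is trimmed):

	/* Sketch of a caller of the reworked discard flags; bdev, sector and
	 * nr_sects are assumed to be validated elsewhere. */
	int punch_range(struct block_device *bdev, sector_t sector, sector_t nr_sects)
	{
		int ret;

		/* Plain discard; contents afterwards are undefined. */
		ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, 0);
		if (ret == -EOPNOTSUPP)
			return ret;	/* device has no discard support */

		/* Only succeeds if discarded blocks are guaranteed to read back
		 * as zeroes (discard_zeroes_data); with BLKDEV_DISCARD_ZERO set,
		 * -EOPNOTSUPP is reported instead of being silently dropped. */
		return blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL,
					    BLKDEV_DISCARD_ZERO);
	}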
diff --git a/block/blk-map.c b/block/blk-map.c
index 61733a660c3a..b8657fa8dc9a 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -9,21 +9,26 @@
9 9
10#include "blk.h" 10#include "blk.h"
11 11
12int blk_rq_append_bio(struct request_queue *q, struct request *rq, 12/*
13 struct bio *bio) 13 * Append a bio to a passthrough request. Only works can be merged into
14 * the request based on the driver constraints.
15 */
16int blk_rq_append_bio(struct request *rq, struct bio *bio)
14{ 17{
15 if (!rq->bio) 18 if (!rq->bio) {
16 blk_rq_bio_prep(q, rq, bio); 19 blk_rq_bio_prep(rq->q, rq, bio);
17 else if (!ll_back_merge_fn(q, rq, bio)) 20 } else {
18 return -EINVAL; 21 if (!ll_back_merge_fn(rq->q, rq, bio))
19 else { 22 return -EINVAL;
23
20 rq->biotail->bi_next = bio; 24 rq->biotail->bi_next = bio;
21 rq->biotail = bio; 25 rq->biotail = bio;
22
23 rq->__data_len += bio->bi_iter.bi_size; 26 rq->__data_len += bio->bi_iter.bi_size;
24 } 27 }
28
25 return 0; 29 return 0;
26} 30}
31EXPORT_SYMBOL(blk_rq_append_bio);
27 32
28static int __blk_rq_unmap_user(struct bio *bio) 33static int __blk_rq_unmap_user(struct bio *bio)
29{ 34{
@@ -71,7 +76,7 @@ static int __blk_rq_map_user_iov(struct request *rq,
71 */ 76 */
72 bio_get(bio); 77 bio_get(bio);
73 78
74 ret = blk_rq_append_bio(q, rq, bio); 79 ret = blk_rq_append_bio(rq, bio);
75 if (ret) { 80 if (ret) {
76 bio_endio(bio); 81 bio_endio(bio);
77 __blk_rq_unmap_user(orig_bio); 82 __blk_rq_unmap_user(orig_bio);
@@ -229,7 +234,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
229 if (do_copy) 234 if (do_copy)
230 rq->cmd_flags |= REQ_COPY_USER; 235 rq->cmd_flags |= REQ_COPY_USER;
231 236
232 ret = blk_rq_append_bio(q, rq, bio); 237 ret = blk_rq_append_bio(rq, bio);
233 if (unlikely(ret)) { 238 if (unlikely(ret)) {
234 /* request is too big */ 239 /* request is too big */
235 bio_put(bio); 240 bio_put(bio);
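With blk_rq_append_bio() simplified and exported, former blk_make_request() users (see the osd and target conversions in this series) open code the same loop themselves. The sketch below mirrors the helper removed from blk-core.c, just with the new blk_rq_append_bio(rq, bio) signature; my_make_request() is a hypothetical name:

	/* Open-coded equivalent of the removed blk_make_request(). */
	static struct request *my_make_request(struct request_queue *q,
					       struct bio *bio, gfp_t gfp_mask)
	{
		struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);

		if (IS_ERR(rq))
			return rq;

		blk_rq_set_block_pc(rq);

		for_each_bio(bio) {
			struct bio *bounce_bio = bio;
			int ret;

			blk_queue_bounce(q, &bounce_bio);
			ret = blk_rq_append_bio(rq, bounce_bio);
			if (ret) {
				blk_put_request(rq);
				return ERR_PTR(ret);
			}
		}
		return rq;
	}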
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5e4d93edeaf7..41cbd4878958 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -649,8 +649,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
649 if (!rq_mergeable(req) || !rq_mergeable(next)) 649 if (!rq_mergeable(req) || !rq_mergeable(next))
650 return 0; 650 return 0;
651 651
652 if (!blk_check_merge_flags(req->cmd_flags, req_op(req), next->cmd_flags, 652 if (req_op(req) != req_op(next))
653 req_op(next)))
654 return 0; 653 return 0;
655 654
656 /* 655 /*
@@ -758,8 +757,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
758 if (!rq_mergeable(rq) || !bio_mergeable(bio)) 757 if (!rq_mergeable(rq) || !bio_mergeable(bio))
759 return false; 758 return false;
760 759
761 if (!blk_check_merge_flags(rq->cmd_flags, req_op(rq), bio->bi_rw, 760 if (req_op(rq) != bio_op(bio))
762 bio_op(bio)))
763 return false; 761 return false;
764 762
765 /* different data direction or already started, don't merge */ 763 /* different data direction or already started, don't merge */
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 56a0c37a3d06..729bac3a673b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -485,6 +485,32 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
485} 485}
486EXPORT_SYMBOL(blk_mq_tagset_busy_iter); 486EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
487 487
488int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
489{
490 int i, j, ret = 0;
491
492 if (!set->ops->reinit_request)
493 goto out;
494
495 for (i = 0; i < set->nr_hw_queues; i++) {
496 struct blk_mq_tags *tags = set->tags[i];
497
498 for (j = 0; j < tags->nr_tags; j++) {
499 if (!tags->rqs[j])
500 continue;
501
502 ret = set->ops->reinit_request(set->driver_data,
503 tags->rqs[j]);
504 if (ret)
505 goto out;
506 }
507 }
508
509out:
510 return ret;
511}
512EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset);
513
488void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, 514void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
489 void *priv) 515 void *priv)
490{ 516{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2a1920c6d6e5..576e7112f807 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -263,10 +263,53 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
263 blk_queue_exit(q); 263 blk_queue_exit(q);
264 return ERR_PTR(-EWOULDBLOCK); 264 return ERR_PTR(-EWOULDBLOCK);
265 } 265 }
266
267 rq->__data_len = 0;
268 rq->__sector = (sector_t) -1;
269 rq->bio = rq->biotail = NULL;
266 return rq; 270 return rq;
267} 271}
268EXPORT_SYMBOL(blk_mq_alloc_request); 272EXPORT_SYMBOL(blk_mq_alloc_request);
269 273
274struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
275 unsigned int flags, unsigned int hctx_idx)
276{
277 struct blk_mq_hw_ctx *hctx;
278 struct blk_mq_ctx *ctx;
279 struct request *rq;
280 struct blk_mq_alloc_data alloc_data;
281 int ret;
282
283 /*
284 * If the tag allocator sleeps we could get an allocation for a
285 * different hardware context. No need to complicate the low level
286 * allocator for this for the rare use case of a command tied to
287 * a specific queue.
288 */
289 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
290 return ERR_PTR(-EINVAL);
291
292 if (hctx_idx >= q->nr_hw_queues)
293 return ERR_PTR(-EIO);
294
295 ret = blk_queue_enter(q, true);
296 if (ret)
297 return ERR_PTR(ret);
298
299 hctx = q->queue_hw_ctx[hctx_idx];
300 ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
301
302 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
303 rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
304 if (!rq) {
305 blk_queue_exit(q);
306 return ERR_PTR(-EWOULDBLOCK);
307 }
308
309 return rq;
310}
311EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
312
270static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 313static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
271 struct blk_mq_ctx *ctx, struct request *rq) 314 struct blk_mq_ctx *ctx, struct request *rq)
272{ 315{
diff --git a/block/blk.h b/block/blk.h
index 70e4aee9cdcb..c37492f5edaa 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -64,8 +64,6 @@ void blk_exit_rl(struct request_list *rl);
64void init_request_from_bio(struct request *req, struct bio *bio); 64void init_request_from_bio(struct request *req, struct bio *bio);
65void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 65void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
66 struct bio *bio); 66 struct bio *bio);
67int blk_rq_append_bio(struct request_queue *q, struct request *rq,
68 struct bio *bio);
69void blk_queue_bypass_start(struct request_queue *q); 67void blk_queue_bypass_start(struct request_queue *q);
70void blk_queue_bypass_end(struct request_queue *q); 68void blk_queue_bypass_end(struct request_queue *q);
71void blk_dequeue_request(struct request *rq); 69void blk_dequeue_request(struct request *rq);
diff --git a/block/genhd.c b/block/genhd.c
index f06d7f3b075b..3c9dede4e04f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -506,7 +506,7 @@ static int exact_lock(dev_t devt, void *data)
506 return 0; 506 return 0;
507} 507}
508 508
509static void register_disk(struct gendisk *disk) 509static void register_disk(struct device *parent, struct gendisk *disk)
510{ 510{
511 struct device *ddev = disk_to_dev(disk); 511 struct device *ddev = disk_to_dev(disk);
512 struct block_device *bdev; 512 struct block_device *bdev;
@@ -514,7 +514,7 @@ static void register_disk(struct gendisk *disk)
514 struct hd_struct *part; 514 struct hd_struct *part;
515 int err; 515 int err;
516 516
517 ddev->parent = disk->driverfs_dev; 517 ddev->parent = parent;
518 518
519 dev_set_name(ddev, "%s", disk->disk_name); 519 dev_set_name(ddev, "%s", disk->disk_name);
520 520
@@ -573,7 +573,8 @@ exit:
573} 573}
574 574
575/** 575/**
576 * add_disk - add partitioning information to kernel list 576 * device_add_disk - add partitioning information to kernel list
577 * @parent: parent device for the disk
577 * @disk: per-device partitioning information 578 * @disk: per-device partitioning information
578 * 579 *
579 * This function registers the partitioning information in @disk 580 * This function registers the partitioning information in @disk
@@ -581,7 +582,7 @@ exit:
581 * 582 *
582 * FIXME: error handling 583 * FIXME: error handling
583 */ 584 */
584void add_disk(struct gendisk *disk) 585void device_add_disk(struct device *parent, struct gendisk *disk)
585{ 586{
586 struct backing_dev_info *bdi; 587 struct backing_dev_info *bdi;
587 dev_t devt; 588 dev_t devt;
@@ -617,7 +618,7 @@ void add_disk(struct gendisk *disk)
617 618
618 blk_register_region(disk_devt(disk), disk->minors, NULL, 619 blk_register_region(disk_devt(disk), disk->minors, NULL,
619 exact_match, exact_lock, disk); 620 exact_match, exact_lock, disk);
620 register_disk(disk); 621 register_disk(parent, disk);
621 blk_register_queue(disk); 622 blk_register_queue(disk);
622 623
623 /* 624 /*
@@ -633,7 +634,7 @@ void add_disk(struct gendisk *disk)
633 disk_add_events(disk); 634 disk_add_events(disk);
634 blk_integrity_add(disk); 635 blk_integrity_add(disk);
635} 636}
636EXPORT_SYMBOL(add_disk); 637EXPORT_SYMBOL(device_add_disk);
637 638
638void del_gendisk(struct gendisk *disk) 639void del_gendisk(struct gendisk *disk)
639{ 640{
@@ -799,10 +800,9 @@ void __init printk_all_partitions(void)
799 , disk_name(disk, part->partno, name_buf), 800 , disk_name(disk, part->partno, name_buf),
800 part->info ? part->info->uuid : ""); 801 part->info ? part->info->uuid : "");
801 if (is_part0) { 802 if (is_part0) {
802 if (disk->driverfs_dev != NULL && 803 if (dev->parent && dev->parent->driver)
803 disk->driverfs_dev->driver != NULL)
804 printk(" driver: %s\n", 804 printk(" driver: %s\n",
805 disk->driverfs_dev->driver->name); 805 dev->parent->driver->name);
806 else 806 else
807 printk(" (driver?)\n"); 807 printk(" (driver?)\n");
808 } else 808 } else
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index dd96a935fba0..ba5145d384d8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
347 goto out; 347 goto out;
348 } 348 }
349 349
350 rw = bio_rw(bio); 350 rw = bio_data_dir(bio);
351 if (rw == READA)
352 rw = READ;
353 351
354 bio_for_each_segment(bvec, bio, iter) { 352 bio_for_each_segment(bvec, bio, iter) {
355 unsigned int len = bvec.bv_len; 353 unsigned int len = bvec.bv_len;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 63c2064689f8..db9d6bb6352d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
1951 if (cciss_create_ld_sysfs_entry(h, drv_index)) 1951 if (cciss_create_ld_sysfs_entry(h, drv_index))
1952 goto cleanup_queue; 1952 goto cleanup_queue;
1953 disk->private_data = h->drv[drv_index]; 1953 disk->private_data = h->drv[drv_index];
1954 disk->driverfs_dev = &h->drv[drv_index]->dev;
1955 1954
1956 /* Set up queue information */ 1955 /* Set up queue information */
1957 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); 1956 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
1973 /* allows the interrupt handler to start the queue */ 1972 /* allows the interrupt handler to start the queue */
1974 wmb(); 1973 wmb();
1975 h->drv[drv_index]->queue = disk->queue; 1974 h->drv[drv_index]->queue = disk->queue;
1976 add_disk(disk); 1975 device_add_disk(&h->drv[drv_index]->dev, disk);
1977 return 0; 1976 return 0;
1978 1977
1979cleanup_queue: 1978cleanup_queue:
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d524973f94b3..0a1aaf8c24c4 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -258,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
258 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 258 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
259 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 259 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
260 260
261 D_ASSERT(device, (unsigned)(last - first) <= 1); 261 D_ASSERT(device, first <= last);
262 D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 262 D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
263 263
264 /* FIXME figure out a fast path for bios crossing AL extent boundaries */ 264 /* FIXME figure out a fast path for bios crossing AL extent boundaries */
@@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact
341 341
342 i = 0; 342 i = 0;
343 343
344 drbd_bm_reset_al_hints(device);
345
344 /* Even though no one can start to change this list 346 /* Even though no one can start to change this list
345 * once we set the LC_LOCKED -- from drbd_al_begin_io(), 347 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
346 * lc_try_lock_for_transaction() --, someone may still 348 * lc_try_lock_for_transaction() --, someone may still
@@ -770,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device)
770 772
771static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) 773static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
772{ 774{
773 if (rs_done) 775 if (rs_done) {
774 set_bit(RS_DONE, &device->flags); 776 struct drbd_connection *connection = first_peer_device(device)->connection;
775 /* and also set RS_PROGRESS below */ 777 if (connection->agreed_pro_version <= 95 ||
776 else if (!lazy_bitmap_update_due(device)) 778 is_sync_target_state(device->state.conn))
779 set_bit(RS_DONE, &device->flags);
780 /* and also set RS_PROGRESS below */
781
782 /* Else: rather wait for explicit notification via receive_state,
783 * to avoid uuids-rotated-too-fast causing full resync
784 * in next handshake, in case the replication link breaks
785 * at the most unfortunate time... */
786 } else if (!lazy_bitmap_update_due(device))
777 return; 787 return;
778 788
779 drbd_device_post_work(device, RS_PROGRESS); 789 drbd_device_post_work(device, RS_PROGRESS);
@@ -832,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device,
832 return count; 842 return count;
833} 843}
834 844
845static bool plausible_request_size(int size)
846{
847 return size > 0
848 && size <= DRBD_MAX_BATCH_BIO_SIZE
849 && IS_ALIGNED(size, 512);
850}
851
835/* clear the bit corresponding to the piece of storage in question: 852/* clear the bit corresponding to the piece of storage in question:
836 * size byte of data starting from sector. Only clear a bits of the affected 853 * size byte of data starting from sector. Only clear a bits of the affected
837 * one ore more _aligned_ BM_BLOCK_SIZE blocks. 854 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -851,7 +868,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
851 if ((mode == SET_OUT_OF_SYNC) && size == 0) 868 if ((mode == SET_OUT_OF_SYNC) && size == 0)
852 return 0; 869 return 0;
853 870
854 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 871 if (!plausible_request_size(size)) {
855 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", 872 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
856 drbd_change_sync_fname[mode], 873 drbd_change_sync_fname[mode],
857 (unsigned long long)sector, size); 874 (unsigned long long)sector, size);
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e5d89f623b90..ab62b81c2ca7 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -96,6 +96,13 @@ struct drbd_bitmap {
96 struct page **bm_pages; 96 struct page **bm_pages;
97 spinlock_t bm_lock; 97 spinlock_t bm_lock;
98 98
99 /* exclusively to be used by __al_write_transaction(),
100 * drbd_bm_mark_for_writeout() and
101 * and drbd_bm_write_hinted() -> bm_rw() called from there.
102 */
103 unsigned int n_bitmap_hints;
104 unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
105
99 /* see LIMITATIONS: above */ 106 /* see LIMITATIONS: above */
100 107
101 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ 108 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
@@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page)
242 set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); 249 set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
243} 250}
244 251
252void drbd_bm_reset_al_hints(struct drbd_device *device)
253{
254 device->bitmap->n_bitmap_hints = 0;
255}
256
245/** 257/**
246 * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout 258 * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
247 * @device: DRBD device. 259 * @device: DRBD device.
@@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page)
253 */ 265 */
254void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) 266void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
255{ 267{
268 struct drbd_bitmap *b = device->bitmap;
256 struct page *page; 269 struct page *page;
257 if (page_nr >= device->bitmap->bm_number_of_pages) { 270 if (page_nr >= device->bitmap->bm_number_of_pages) {
258 drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", 271 drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
@@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
260 return; 273 return;
261 } 274 }
262 page = device->bitmap->bm_pages[page_nr]; 275 page = device->bitmap->bm_pages[page_nr];
263 set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); 276 BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
277 if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
278 b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
264} 279}
265 280
266static int bm_test_page_unchanged(struct page *page) 281static int bm_test_page_unchanged(struct page *page)
@@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
427} 442}
428 443
429/* 444/*
430 * called on driver init only. TODO call when a device is created. 445 * allocates the drbd_bitmap and stores it in device->bitmap.
431 * allocates the drbd_bitmap, and stores it in device->bitmap.
432 */ 446 */
433int drbd_bm_init(struct drbd_device *device) 447int drbd_bm_init(struct drbd_device *device)
434{ 448{
@@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
633 unsigned long bits, words, owords, obits; 647 unsigned long bits, words, owords, obits;
634 unsigned long want, have, onpages; /* number of pages */ 648 unsigned long want, have, onpages; /* number of pages */
635 struct page **npages, **opages = NULL; 649 struct page **npages, **opages = NULL;
636 int err = 0, growing; 650 int err = 0;
651 bool growing;
637 652
638 if (!expect(b)) 653 if (!expect(b))
639 return -ENOMEM; 654 return -ENOMEM;
@@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
1030{ 1045{
1031 struct drbd_bm_aio_ctx *ctx; 1046 struct drbd_bm_aio_ctx *ctx;
1032 struct drbd_bitmap *b = device->bitmap; 1047 struct drbd_bitmap *b = device->bitmap;
1033 int num_pages, i, count = 0; 1048 unsigned int num_pages, i, count = 0;
1034 unsigned long now; 1049 unsigned long now;
1035 char ppb[10]; 1050 char ppb[10];
1036 int err = 0; 1051 int err = 0;
@@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
1078 now = jiffies; 1093 now = jiffies;
1079 1094
1080 /* let the layers below us try to merge these bios... */ 1095 /* let the layers below us try to merge these bios... */
1081 for (i = 0; i < num_pages; i++) {
1082 /* ignore completely unchanged pages */
1083 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1084 break;
1085 if (!(flags & BM_AIO_READ)) {
1086 if ((flags & BM_AIO_WRITE_HINTED) &&
1087 !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
1088 &page_private(b->bm_pages[i])))
1089 continue;
1090 1096
1097 if (flags & BM_AIO_READ) {
1098 for (i = 0; i < num_pages; i++) {
1099 atomic_inc(&ctx->in_flight);
1100 bm_page_io_async(ctx, i);
1101 ++count;
1102 cond_resched();
1103 }
1104 } else if (flags & BM_AIO_WRITE_HINTED) {
1105 /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
1106 unsigned int hint;
1107 for (hint = 0; hint < b->n_bitmap_hints; hint++) {
1108 i = b->al_bitmap_hints[hint];
1109 if (i >= num_pages) /* == -1U: no hint here. */
1110 continue;
1111 /* Several AL-extents may point to the same page. */
1112 if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
1113 &page_private(b->bm_pages[i])))
1114 continue;
1115 /* Has it even changed? */
1116 if (bm_test_page_unchanged(b->bm_pages[i]))
1117 continue;
1118 atomic_inc(&ctx->in_flight);
1119 bm_page_io_async(ctx, i);
1120 ++count;
1121 }
1122 } else {
1123 for (i = 0; i < num_pages; i++) {
1124 /* ignore completely unchanged pages */
1125 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1126 break;
1091 if (!(flags & BM_AIO_WRITE_ALL_PAGES) && 1127 if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
1092 bm_test_page_unchanged(b->bm_pages[i])) { 1128 bm_test_page_unchanged(b->bm_pages[i])) {
1093 dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); 1129 dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
@@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
1100 dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); 1136 dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
1101 continue; 1137 continue;
1102 } 1138 }
1139 atomic_inc(&ctx->in_flight);
1140 bm_page_io_async(ctx, i);
1141 ++count;
1142 cond_resched();
1103 } 1143 }
1104 atomic_inc(&ctx->in_flight);
1105 bm_page_io_async(ctx, i);
1106 ++count;
1107 cond_resched();
1108 } 1144 }
1109 1145
1110 /* 1146 /*
@@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
1121 kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); 1157 kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1122 1158
1123 /* summary for global bitmap IO */ 1159 /* summary for global bitmap IO */
1124 if (flags == 0) 1160 if (flags == 0) {
1125 drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", 1161 unsigned int ms = jiffies_to_msecs(jiffies - now);
1126 (flags & BM_AIO_READ) ? "READ" : "WRITE", 1162 if (ms > 5) {
1127 count, jiffies - now); 1163 drbd_info(device, "bitmap %s of %u pages took %u ms\n",
1164 (flags & BM_AIO_READ) ? "READ" : "WRITE",
1165 count, ms);
1166 }
1167 }
1128 1168
1129 if (ctx->error) { 1169 if (ctx->error) {
1130 drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); 1170 drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 4de95bbff486..be91a8d7c22a 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
237 seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); 237 seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
238 seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); 238 seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
239 239
240 if (f & EE_IS_TRIM) { 240 if (f & EE_IS_TRIM)
241 seq_putc(m, sep); 241 __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
242 sep = '|'; 242 seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
243 if (f & EE_IS_TRIM_USE_ZEROOUT)
244 seq_puts(m, "zero-out");
245 else
246 seq_puts(m, "trim");
247 }
248 seq_putc(m, '\n'); 243 seq_putc(m, '\n');
249} 244}
250 245
@@ -908,7 +903,7 @@ static int drbd_version_open(struct inode *inode, struct file *file)
908 return single_open(file, drbd_version_show, NULL); 903 return single_open(file, drbd_version_show, NULL);
909} 904}
910 905
911static struct file_operations drbd_version_fops = { 906static const struct file_operations drbd_version_fops = {
912 .owner = THIS_MODULE, 907 .owner = THIS_MODULE,
913 .open = drbd_version_open, 908 .open = drbd_version_open,
914 .llseek = seq_lseek, 909 .llseek = seq_lseek,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a64c645b4184..7b54354976a5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -468,9 +468,15 @@ enum {
468 /* this is/was a write request */ 468 /* this is/was a write request */
469 __EE_WRITE, 469 __EE_WRITE,
470 470
471 /* this is/was a write same request */
472 __EE_WRITE_SAME,
473
471 /* this originates from application on peer 474 /* this originates from application on peer
472 * (not some resync or verify or other DRBD internal request) */ 475 * (not some resync or verify or other DRBD internal request) */
473 __EE_APPLICATION, 476 __EE_APPLICATION,
477
478 /* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
479 __EE_RS_THIN_REQ,
474}; 480};
475#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 481#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
476#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 482#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
@@ -484,7 +490,9 @@ enum {
484#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) 490#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
485#define EE_SUBMITTED (1<<__EE_SUBMITTED) 491#define EE_SUBMITTED (1<<__EE_SUBMITTED)
486#define EE_WRITE (1<<__EE_WRITE) 492#define EE_WRITE (1<<__EE_WRITE)
493#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
487#define EE_APPLICATION (1<<__EE_APPLICATION) 494#define EE_APPLICATION (1<<__EE_APPLICATION)
495#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
488 496
489/* flag bits per device */ 497/* flag bits per device */
490enum { 498enum {
@@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
1123extern int drbd_send_bitmap(struct drbd_device *device); 1131extern int drbd_send_bitmap(struct drbd_device *device);
1124extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); 1132extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
1125extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); 1133extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
1134extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
1126extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); 1135extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
1127extern void drbd_device_cleanup(struct drbd_device *device); 1136extern void drbd_device_cleanup(struct drbd_device *device);
1128void drbd_print_uuids(struct drbd_device *device, const char *text); 1137void drbd_print_uuids(struct drbd_device *device, const char *text);
@@ -1342,11 +1351,11 @@ struct bm_extent {
1342#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ 1351#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
1343#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ 1352#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
1344 1353
1345/* For now, don't allow more than one activity log extent worth of data 1354/* For now, don't allow more than half of what we can "activate" in one
1346 * to be discarded in one go. We may need to rework drbd_al_begin_io() 1355 * activity log transaction to be discarded in one go. We may need to rework
1347 * to allow for even larger discard ranges */ 1356 * drbd_al_begin_io() to allow for even larger discard ranges */
1348#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE 1357#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
1349#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) 1358#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
1350 1359
1351extern int drbd_bm_init(struct drbd_device *device); 1360extern int drbd_bm_init(struct drbd_device *device);
1352extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); 1361extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
1369extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); 1378extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
1370extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); 1379extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
1371extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); 1380extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
1381extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
1372extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); 1382extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
1373extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); 1383extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
1374extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); 1384extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
@@ -1483,12 +1493,14 @@ enum determine_dev_size {
1483extern enum determine_dev_size 1493extern enum determine_dev_size
1484drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); 1494drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
1485extern void resync_after_online_grow(struct drbd_device *); 1495extern void resync_after_online_grow(struct drbd_device *);
1486extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); 1496extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
1497 struct drbd_backing_dev *bdev, struct o_qlim *o);
1487extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, 1498extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
1488 enum drbd_role new_role, 1499 enum drbd_role new_role,
1489 int force); 1500 int force);
1490extern bool conn_try_outdate_peer(struct drbd_connection *connection); 1501extern bool conn_try_outdate_peer(struct drbd_connection *connection);
1491extern void conn_try_outdate_peer_async(struct drbd_connection *connection); 1502extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
1503extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
1492extern int drbd_khelper(struct drbd_device *device, char *cmd); 1504extern int drbd_khelper(struct drbd_device *device, char *cmd);
1493 1505
1494/* drbd_worker.c */ 1506/* drbd_worker.c */
@@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data);
1548extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); 1560extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
1549 1561
1550/* drbd_receiver.c */ 1562/* drbd_receiver.c */
1563extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
1564 sector_t start, unsigned int nr_sectors, bool discard);
1551extern int drbd_receiver(struct drbd_thread *thi); 1565extern int drbd_receiver(struct drbd_thread *thi);
1552extern int drbd_ack_receiver(struct drbd_thread *thi); 1566extern int drbd_ack_receiver(struct drbd_thread *thi);
1553extern void drbd_send_ping_wf(struct work_struct *ws); 1567extern void drbd_send_ping_wf(struct work_struct *ws);
@@ -1561,7 +1575,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
1561extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); 1575extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
1562extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, 1576extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
1563 sector_t, unsigned int, 1577 sector_t, unsigned int,
1564 bool, 1578 unsigned int,
1565 gfp_t) __must_hold(local); 1579 gfp_t) __must_hold(local);
1566extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, 1580extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
1567 int); 1581 int);
@@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1635/* drbd_proc.c */ 1649/* drbd_proc.c */
1636extern struct proc_dir_entry *drbd_proc; 1650extern struct proc_dir_entry *drbd_proc;
1637extern const struct file_operations drbd_proc_fops; 1651extern const struct file_operations drbd_proc_fops;
1638extern const char *drbd_conn_str(enum drbd_conns s);
1639extern const char *drbd_role_str(enum drbd_role s);
1640 1652
1641/* drbd_actlog.c */ 1653/* drbd_actlog.c */
1642extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); 1654extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
@@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
2095 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 2107 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
2096} 2108}
2097 2109
2110static inline bool is_sync_target_state(enum drbd_conns connection_state)
2111{
2112 return connection_state == C_SYNC_TARGET ||
2113 connection_state == C_PAUSED_SYNC_T;
2114}
2115
2116static inline bool is_sync_source_state(enum drbd_conns connection_state)
2117{
2118 return connection_state == C_SYNC_SOURCE ||
2119 connection_state == C_PAUSED_SYNC_S;
2120}
2121
2098static inline bool is_sync_state(enum drbd_conns connection_state) 2122static inline bool is_sync_state(enum drbd_conns connection_state)
2099{ 2123{
2100 return 2124 return is_sync_source_state(connection_state) ||
2101 (connection_state == C_SYNC_SOURCE 2125 is_sync_target_state(connection_state);
2102 || connection_state == C_SYNC_TARGET
2103 || connection_state == C_PAUSED_SYNC_S
2104 || connection_state == C_PAUSED_SYNC_T);
2105} 2126}
2106 2127
2107/** 2128/**
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f210543f05f4..23c5a94428d2 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -6,13 +6,13 @@
6 6
7struct drbd_interval { 7struct drbd_interval {
8 struct rb_node rb; 8 struct rb_node rb;
9 sector_t sector; /* start sector of the interval */ 9 sector_t sector; /* start sector of the interval */
10 unsigned int size; /* size in bytes */ 10 unsigned int size; /* size in bytes */
11 sector_t end; /* highest interval end in subtree */ 11 sector_t end; /* highest interval end in subtree */
12 int local:1 /* local or remote request? */; 12 unsigned int local:1 /* local or remote request? */;
13 int waiting:1; /* someone is waiting for this to complete */ 13 unsigned int waiting:1; /* someone is waiting for completion */
14 int completed:1; /* this has been completed already; 14 unsigned int completed:1; /* this has been completed already;
15 * ignore for conflict detection */ 15 * ignore for conflict detection */
16}; 16};
17 17
18static inline void drbd_clear_interval(struct drbd_interval *i) 18static inline void drbd_clear_interval(struct drbd_interval *i)
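
The drbd_interval change above turns the three 1-bit flags from plain int into unsigned int bit-fields, presumably to avoid the signed bit-field pitfall: on compilers that treat a plain int bit-field as signed (gcc's default), a width-1 field can only hold 0 and -1, so a test such as "i->local == 1" never matches. A stand-alone sketch of that pitfall; the struct names are made up for illustration and are not DRBD code:

#include <stdio.h>

struct flags_signed   { int local:1; };			/* before the patch */
struct flags_unsigned { unsigned int local:1; };	/* after the patch  */

int main(void)
{
	struct flags_signed   s = { 0 };
	struct flags_unsigned u = { 0 };

	s.local = 1;	/* with gcc's default signed bit-fields this reads back as -1 */
	u.local = 1;	/* reads back as 1, as intended */

	printf("signed:   local=%d, (local == 1) is %d\n", s.local, s.local == 1);
	printf("unsigned: local=%d, (local == 1) is %d\n", u.local, u.local == 1);
	return 0;
}
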
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2b37744db0fa..0501ae0c517b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -31,7 +31,7 @@
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/jiffies.h> 32#include <linux/jiffies.h>
33#include <linux/drbd.h> 33#include <linux/drbd.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35#include <asm/types.h> 35#include <asm/types.h>
36#include <net/sock.h> 36#include <net/sock.h>
37#include <linux/ctype.h> 37#include <linux/ctype.h>
@@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
920 } 920 }
921} 921}
922 922
923/* communicated if (agreed_features & DRBD_FF_WSAME) */
924void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
925{
926 if (q) {
927 p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
928 p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
929 p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
930 p->qlim->io_min = cpu_to_be32(queue_io_min(q));
931 p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
932 p->qlim->discard_enabled = blk_queue_discard(q);
933 p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
934 p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
935 } else {
936 q = device->rq_queue;
937 p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
938 p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
939 p->qlim->alignment_offset = 0;
940 p->qlim->io_min = cpu_to_be32(queue_io_min(q));
941 p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
942 p->qlim->discard_enabled = 0;
943 p->qlim->discard_zeroes_data = 0;
944 p->qlim->write_same_capable = 0;
945 }
946}
947
923int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) 948int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
924{ 949{
925 struct drbd_device *device = peer_device->device; 950 struct drbd_device *device = peer_device->device;
@@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
928 sector_t d_size, u_size; 953 sector_t d_size, u_size;
929 int q_order_type; 954 int q_order_type;
930 unsigned int max_bio_size; 955 unsigned int max_bio_size;
956 unsigned int packet_size;
957
958 sock = &peer_device->connection->data;
959 p = drbd_prepare_command(peer_device, sock);
960 if (!p)
961 return -EIO;
931 962
963 packet_size = sizeof(*p);
964 if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
965 packet_size += sizeof(p->qlim[0]);
966
967 memset(p, 0, packet_size);
932 if (get_ldev_if_state(device, D_NEGOTIATING)) { 968 if (get_ldev_if_state(device, D_NEGOTIATING)) {
933 D_ASSERT(device, device->ldev->backing_bdev); 969 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
934 d_size = drbd_get_max_capacity(device->ldev); 970 d_size = drbd_get_max_capacity(device->ldev);
935 rcu_read_lock(); 971 rcu_read_lock();
936 u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; 972 u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
937 rcu_read_unlock(); 973 rcu_read_unlock();
938 q_order_type = drbd_queue_order_type(device); 974 q_order_type = drbd_queue_order_type(device);
939 max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; 975 max_bio_size = queue_max_hw_sectors(q) << 9;
940 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); 976 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
977 assign_p_sizes_qlim(device, p, q);
941 put_ldev(device); 978 put_ldev(device);
942 } else { 979 } else {
943 d_size = 0; 980 d_size = 0;
944 u_size = 0; 981 u_size = 0;
945 q_order_type = QUEUE_ORDERED_NONE; 982 q_order_type = QUEUE_ORDERED_NONE;
946 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ 983 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
984 assign_p_sizes_qlim(device, p, NULL);
947 } 985 }
948 986
949 sock = &peer_device->connection->data;
950 p = drbd_prepare_command(peer_device, sock);
951 if (!p)
952 return -EIO;
953
954 if (peer_device->connection->agreed_pro_version <= 94) 987 if (peer_device->connection->agreed_pro_version <= 94)
955 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); 988 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
956 else if (peer_device->connection->agreed_pro_version < 100) 989 else if (peer_device->connection->agreed_pro_version < 100)
@@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
962 p->max_bio_size = cpu_to_be32(max_bio_size); 995 p->max_bio_size = cpu_to_be32(max_bio_size);
963 p->queue_order_type = cpu_to_be16(q_order_type); 996 p->queue_order_type = cpu_to_be16(q_order_type);
964 p->dds_flags = cpu_to_be16(flags); 997 p->dds_flags = cpu_to_be16(flags);
965 return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); 998
999 return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
966} 1000}
967 1001
968/** 1002/**
@@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1377 cpu_to_be64(block_id)); 1411 cpu_to_be64(block_id));
1378} 1412}
1379 1413
1414int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
1415 struct drbd_peer_request *peer_req)
1416{
1417 struct drbd_socket *sock;
1418 struct p_block_desc *p;
1419
1420 sock = &peer_device->connection->data;
1421 p = drbd_prepare_command(peer_device, sock);
1422 if (!p)
1423 return -EIO;
1424 p->sector = cpu_to_be64(peer_req->i.sector);
1425 p->blksize = cpu_to_be32(peer_req->i.size);
1426 p->pad = 0;
1427 return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
1428}
1429
1380int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, 1430int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
1381 sector_t sector, int size, u64 block_id) 1431 sector_t sector, int size, u64 block_id)
1382{ 1432{
@@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
1561 ? 0 : MSG_MORE); 1611 ? 0 : MSG_MORE);
1562 if (err) 1612 if (err)
1563 return err; 1613 return err;
1614 /* REQ_OP_WRITE_SAME has only one segment */
1615 if (bio_op(bio) == REQ_OP_WRITE_SAME)
1616 break;
1564 } 1617 }
1565 return 0; 1618 return 0;
1566} 1619}
@@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
1579 bio_iter_last(bvec, iter) ? 0 : MSG_MORE); 1632 bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
1580 if (err) 1633 if (err)
1581 return err; 1634 return err;
1635 /* REQ_OP_WRITE_SAME has only one segment */
1636 if (bio_op(bio) == REQ_OP_WRITE_SAME)
1637 break;
1582 } 1638 }
1583 return 0; 1639 return 0;
1584} 1640}
@@ -1610,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
1610 return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | 1666 return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
1611 (bio->bi_rw & REQ_FUA ? DP_FUA : 0) | 1667 (bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
1612 (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) | 1668 (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
1669 (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
1613 (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0); 1670 (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
1614 else 1671 else
1615 return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; 1672 return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@@ -1623,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
1623 struct drbd_device *device = peer_device->device; 1680 struct drbd_device *device = peer_device->device;
1624 struct drbd_socket *sock; 1681 struct drbd_socket *sock;
1625 struct p_data *p; 1682 struct p_data *p;
1683 struct p_wsame *wsame = NULL;
1684 void *digest_out;
1626 unsigned int dp_flags = 0; 1685 unsigned int dp_flags = 0;
1627 int digest_size; 1686 int digest_size;
1628 int err; 1687 int err;
@@ -1658,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
1658 err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); 1717 err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
1659 goto out; 1718 goto out;
1660 } 1719 }
1720 if (dp_flags & DP_WSAME) {
1721 /* this will only work if DRBD_FF_WSAME is set AND the
1722 * handshake agreed that all nodes and backend devices are
1723 * WRITE_SAME capable and agree on logical_block_size */
1724 wsame = (struct p_wsame*)p;
1725 digest_out = wsame + 1;
1726 wsame->size = cpu_to_be32(req->i.size);
1727 } else
1728 digest_out = p + 1;
1661 1729
1662 /* our digest is still only over the payload. 1730 /* our digest is still only over the payload.
1663 * TRIM does not carry any payload. */ 1731 * TRIM does not carry any payload. */
1664 if (digest_size) 1732 if (digest_size)
1665 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); 1733 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
1666 err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); 1734 if (wsame) {
1735 err =
1736 __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
1737 sizeof(*wsame) + digest_size, NULL,
1738 bio_iovec(req->master_bio).bv_len);
1739 } else
1740 err =
1741 __send_command(peer_device->connection, device->vnr, sock, P_DATA,
1742 sizeof(*p) + digest_size, NULL, req->i.size);
1667 if (!err) { 1743 if (!err) {
1668 /* For protocol A, we have to memcpy the payload into 1744 /* For protocol A, we have to memcpy the payload into
1669 * socket buffers, as we may complete right away 1745 * socket buffers, as we may complete right away
@@ -3507,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
3507 struct bm_io_work *work = &device->bm_io_work; 3583 struct bm_io_work *work = &device->bm_io_work;
3508 int rv = -EIO; 3584 int rv = -EIO;
3509 3585
3510 D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); 3586 if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
3587 int cnt = atomic_read(&device->ap_bio_cnt);
3588 if (cnt)
3589 drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
3590 cnt, work->why);
3591 }
3511 3592
3512 if (get_ldev(device)) { 3593 if (get_ldev(device)) {
3513 drbd_bm_lock(device, work->why, work->flags); 3594 drbd_bm_lock(device, work->why, work->flags);
@@ -3587,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
3587int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), 3668int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
3588 char *why, enum bm_flag flags) 3669 char *why, enum bm_flag flags)
3589{ 3670{
3671 /* Only suspend io, if some operation is supposed to be locked out */
3672 const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
3590 int rv; 3673 int rv;
3591 3674
3592 D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); 3675 D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
3593 3676
3594 if ((flags & BM_LOCKED_SET_ALLOWED) == 0) 3677 if (do_suspend_io)
3595 drbd_suspend_io(device); 3678 drbd_suspend_io(device);
3596 3679
3597 drbd_bm_lock(device, why, flags); 3680 drbd_bm_lock(device, why, flags);
3598 rv = io_fn(device); 3681 rv = io_fn(device);
3599 drbd_bm_unlock(device); 3682 drbd_bm_unlock(device);
3600 3683
3601 if ((flags & BM_LOCKED_SET_ALLOWED) == 0) 3684 if (do_suspend_io)
3602 drbd_resume_io(device); 3685 drbd_resume_io(device);
3603 3686
3604 return rv; 3687 return rv;
@@ -3637,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
3637 * one PRO_VERSION */ 3720 * one PRO_VERSION */
3638 static const char *cmdnames[] = { 3721 static const char *cmdnames[] = {
3639 [P_DATA] = "Data", 3722 [P_DATA] = "Data",
3723 [P_WSAME] = "WriteSame",
3724 [P_TRIM] = "Trim",
3640 [P_DATA_REPLY] = "DataReply", 3725 [P_DATA_REPLY] = "DataReply",
3641 [P_RS_DATA_REPLY] = "RSDataReply", 3726 [P_RS_DATA_REPLY] = "RSDataReply",
3642 [P_BARRIER] = "Barrier", 3727 [P_BARRIER] = "Barrier",
@@ -3681,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd)
3681 [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", 3766 [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
3682 [P_RETRY_WRITE] = "retry_write", 3767 [P_RETRY_WRITE] = "retry_write",
3683 [P_PROTOCOL_UPDATE] = "protocol_update", 3768 [P_PROTOCOL_UPDATE] = "protocol_update",
3769 [P_RS_THIN_REQ] = "rs_thin_req",
3770 [P_RS_DEALLOCATED] = "rs_deallocated",
3684 3771
3685 /* enum drbd_packet, but not commands - obsoleted flags: 3772 /* enum drbd_packet, but not commands - obsoleted flags:
3686 * P_MAY_IGNORE 3773 * P_MAY_IGNORE
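
In drbd_send_sizes() above, the P_SIZES packet is now sized at run time: struct p_sizes is always sent, and the struct o_qlim trailer (declared as a zero-length array member) is appended only when the handshake agreed on DRBD_FF_WSAME. A reduced stand-alone sketch of that wire-layout pattern; the structures and names below are simplified stand-ins, not the driver's definitions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FF_WSAME 4	/* stand-in for DRBD_FF_WSAME */

struct qlim_trailer {			/* cut-down stand-in for struct o_qlim  */
	uint32_t logical_block_size;
	uint8_t  write_same_capable;
	uint8_t  pad[3];
} __attribute__((packed));

struct sizes_packet {			/* cut-down stand-in for struct p_sizes */
	uint64_t d_size;
	uint32_t max_bio_size;
	struct qlim_trailer qlim[];	/* optional trailer, not always on the wire */
} __attribute__((packed));

/* Returns how many bytes of *p are meaningful for this connection. */
static size_t fill_sizes_packet(struct sizes_packet *p, uint32_t agreed_features)
{
	size_t packet_size = sizeof(*p);

	if (agreed_features & FF_WSAME)		/* trailer was negotiated */
		packet_size += sizeof(p->qlim[0]);

	memset(p, 0, packet_size);
	/* ... fill d_size, max_bio_size, and qlim[0] if present ... */
	return packet_size;
}

int main(void)
{
	unsigned char buf[sizeof(struct sizes_packet) + sizeof(struct qlim_trailer)];

	printf("without DRBD_FF_WSAME: %zu bytes\n",
	       fill_sizes_packet((struct sizes_packet *)buf, 0));
	printf("with    DRBD_FF_WSAME: %zu bytes\n",
	       fill_sizes_packet((struct sizes_packet *)buf, FF_WSAME));
	return 0;
}
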
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0bac9c8246bc..f35db29cac76 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
343 (char[20]) { }, /* address family */ 343 (char[20]) { }, /* address family */
344 (char[60]) { }, /* address */ 344 (char[60]) { }, /* address */
345 NULL }; 345 NULL };
346 char mb[12]; 346 char mb[14];
347 char *argv[] = {usermode_helper, cmd, mb, NULL }; 347 char *argv[] = {usermode_helper, cmd, mb, NULL };
348 struct drbd_connection *connection = first_peer_device(device)->connection; 348 struct drbd_connection *connection = first_peer_device(device)->connection;
349 struct sib_info sib; 349 struct sib_info sib;
@@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
352 if (current == connection->worker.task) 352 if (current == connection->worker.task)
353 set_bit(CALLBACK_PENDING, &connection->flags); 353 set_bit(CALLBACK_PENDING, &connection->flags);
354 354
355 snprintf(mb, 12, "minor-%d", device_to_minor(device)); 355 snprintf(mb, 14, "minor-%d", device_to_minor(device));
356 setup_khelper_env(connection, envp); 356 setup_khelper_env(connection, envp);
357 357
358 /* The helper may take some time. 358 /* The helper may take some time.
@@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
387 return ret; 387 return ret;
388} 388}
389 389
390static int conn_khelper(struct drbd_connection *connection, char *cmd) 390enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd)
391{ 391{
392 char *envp[] = { "HOME=/", 392 char *envp[] = { "HOME=/",
393 "TERM=linux", 393 "TERM=linux",
@@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
442 } 442 }
443 rcu_read_unlock(); 443 rcu_read_unlock();
444 444
445 if (fp == FP_NOT_AVAIL) {
446 /* IO Suspending works on the whole resource.
447 Do it only for one device. */
448 vnr = 0;
449 peer_device = idr_get_next(&connection->peer_devices, &vnr);
450 drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
451 }
452
453 return fp; 445 return fp;
454} 446}
455 447
448static bool resource_is_supended(struct drbd_resource *resource)
449{
450 return resource->susp || resource->susp_fen || resource->susp_nod;
451}
452
456bool conn_try_outdate_peer(struct drbd_connection *connection) 453bool conn_try_outdate_peer(struct drbd_connection *connection)
457{ 454{
455 struct drbd_resource * const resource = connection->resource;
458 unsigned int connect_cnt; 456 unsigned int connect_cnt;
459 union drbd_state mask = { }; 457 union drbd_state mask = { };
460 union drbd_state val = { }; 458 union drbd_state val = { };
@@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
462 char *ex_to_string; 460 char *ex_to_string;
463 int r; 461 int r;
464 462
465 spin_lock_irq(&connection->resource->req_lock); 463 spin_lock_irq(&resource->req_lock);
466 if (connection->cstate >= C_WF_REPORT_PARAMS) { 464 if (connection->cstate >= C_WF_REPORT_PARAMS) {
467 drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); 465 drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
468 spin_unlock_irq(&connection->resource->req_lock); 466 spin_unlock_irq(&resource->req_lock);
469 return false; 467 return false;
470 } 468 }
471 469
472 connect_cnt = connection->connect_cnt; 470 connect_cnt = connection->connect_cnt;
473 spin_unlock_irq(&connection->resource->req_lock); 471 spin_unlock_irq(&resource->req_lock);
474 472
475 fp = highest_fencing_policy(connection); 473 fp = highest_fencing_policy(connection);
476 switch (fp) { 474 switch (fp) {
477 case FP_NOT_AVAIL: 475 case FP_NOT_AVAIL:
478 drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); 476 drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
479 goto out; 477 spin_lock_irq(&resource->req_lock);
478 if (connection->cstate < C_WF_REPORT_PARAMS) {
479 _conn_request_state(connection,
480 (union drbd_state) { { .susp_fen = 1 } },
481 (union drbd_state) { { .susp_fen = 0 } },
482 CS_VERBOSE | CS_HARD | CS_DC_SUSP);
483 /* We are no longer suspended due to the fencing policy.
484 * We may still be suspended due to the on-no-data-accessible policy.
485 * If that was OND_IO_ERROR, fail pending requests. */
486 if (!resource_is_supended(resource))
487 _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
488 }
489 /* Else: in case we raced with a connection handshake,
490 * let the handshake figure out if we maybe can RESEND,
491 * and do not resume/fail pending requests here.
492 * Worst case is we stay suspended for now, which may be
493 * resolved by either re-establishing the replication link, or
494 * the next link failure, or eventually the administrator. */
495 spin_unlock_irq(&resource->req_lock);
496 return false;
497
480 case FP_DONT_CARE: 498 case FP_DONT_CARE:
481 return true; 499 return true;
482 default: ; 500 default: ;
@@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
485 r = conn_khelper(connection, "fence-peer"); 503 r = conn_khelper(connection, "fence-peer");
486 504
487 switch ((r>>8) & 0xff) { 505 switch ((r>>8) & 0xff) {
488 case 3: /* peer is inconsistent */ 506 case P_INCONSISTENT: /* peer is inconsistent */
489 ex_to_string = "peer is inconsistent or worse"; 507 ex_to_string = "peer is inconsistent or worse";
490 mask.pdsk = D_MASK; 508 mask.pdsk = D_MASK;
491 val.pdsk = D_INCONSISTENT; 509 val.pdsk = D_INCONSISTENT;
492 break; 510 break;
493 case 4: /* peer got outdated, or was already outdated */ 511 case P_OUTDATED: /* peer got outdated, or was already outdated */
494 ex_to_string = "peer was fenced"; 512 ex_to_string = "peer was fenced";
495 mask.pdsk = D_MASK; 513 mask.pdsk = D_MASK;
496 val.pdsk = D_OUTDATED; 514 val.pdsk = D_OUTDATED;
497 break; 515 break;
498 case 5: /* peer was down */ 516 case P_DOWN: /* peer was down */
499 if (conn_highest_disk(connection) == D_UP_TO_DATE) { 517 if (conn_highest_disk(connection) == D_UP_TO_DATE) {
500 /* we will(have) create(d) a new UUID anyways... */ 518 /* we will(have) create(d) a new UUID anyways... */
501 ex_to_string = "peer is unreachable, assumed to be dead"; 519 ex_to_string = "peer is unreachable, assumed to be dead";
@@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
505 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; 523 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
506 } 524 }
507 break; 525 break;
508 case 6: /* Peer is primary, voluntarily outdate myself. 526 case P_PRIMARY: /* Peer is primary, voluntarily outdate myself.
509 * This is useful when an unconnected R_SECONDARY is asked to 527 * This is useful when an unconnected R_SECONDARY is asked to
510 * become R_PRIMARY, but finds the other peer being active. */ 528 * become R_PRIMARY, but finds the other peer being active. */
511 ex_to_string = "peer is active"; 529 ex_to_string = "peer is active";
@@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
513 mask.disk = D_MASK; 531 mask.disk = D_MASK;
514 val.disk = D_OUTDATED; 532 val.disk = D_OUTDATED;
515 break; 533 break;
516 case 7: 534 case P_FENCING:
535 /* THINK: do we need to handle this
536 * like case 4, or more like case 5? */
517 if (fp != FP_STONITH) 537 if (fp != FP_STONITH)
518 drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); 538 drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
519 ex_to_string = "peer was stonithed"; 539 ex_to_string = "peer was stonithed";
@@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
529 drbd_info(connection, "fence-peer helper returned %d (%s)\n", 549 drbd_info(connection, "fence-peer helper returned %d (%s)\n",
530 (r>>8) & 0xff, ex_to_string); 550 (r>>8) & 0xff, ex_to_string);
531 551
532 out:
533
534 /* Not using 552 /* Not using
535 conn_request_state(connection, mask, val, CS_VERBOSE); 553 conn_request_state(connection, mask, val, CS_VERBOSE);
536 here, because we might were able to re-establish the connection in the 554 here, because we might were able to re-establish the connection in the
537 meantime. */ 555 meantime. */
538 spin_lock_irq(&connection->resource->req_lock); 556 spin_lock_irq(&resource->req_lock);
539 if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { 557 if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
540 if (connection->connect_cnt != connect_cnt) 558 if (connection->connect_cnt != connect_cnt)
541 /* In case the connection was established and droped 559 /* In case the connection was established and droped
@@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
544 else 562 else
545 _conn_request_state(connection, mask, val, CS_VERBOSE); 563 _conn_request_state(connection, mask, val, CS_VERBOSE);
546 } 564 }
547 spin_unlock_irq(&connection->resource->req_lock); 565 spin_unlock_irq(&resource->req_lock);
548 566
549 return conn_highest_pdsk(connection) <= D_OUTDATED; 567 return conn_highest_pdsk(connection) <= D_OUTDATED;
550} 568}
@@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1154 return 0; 1172 return 0;
1155} 1173}
1156 1174
1175static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
1176{
1177 q->limits.discard_granularity = granularity;
1178}
1179
1180static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
1181{
1182 /* when we introduced REQ_WRITE_SAME support, we also bumped
1183 * our maximum supported batch bio size used for discards. */
1184 if (connection->agreed_features & DRBD_FF_WSAME)
1185 return DRBD_MAX_BBIO_SECTORS;
1186 /* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
1187 return AL_EXTENT_SIZE >> 9;
1188}
1189
1190static void decide_on_discard_support(struct drbd_device *device,
1191 struct request_queue *q,
1192 struct request_queue *b,
1193 bool discard_zeroes_if_aligned)
1194{
1195 /* q = drbd device queue (device->rq_queue)
1196 * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue),
1197 * or NULL if diskless
1198 */
1199 struct drbd_connection *connection = first_peer_device(device)->connection;
1200 bool can_do = b ? blk_queue_discard(b) : true;
1201
1202 if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
1203 can_do = false;
1204 drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
1205 }
1206 if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
1207 can_do = false;
1208 drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
1209 }
1210 if (can_do) {
1211 /* We don't care for the granularity, really.
1212 * Stacking limits below should fix it for the local
1213 * device. Whether or not it is a suitable granularity
1214 * on the remote device is not our problem, really. If
1215 * you care, you need to use devices with similar
1216 * topology on all peers. */
1217 blk_queue_discard_granularity(q, 512);
1218 q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
1219 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1220 } else {
1221 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1222 blk_queue_discard_granularity(q, 0);
1223 q->limits.max_discard_sectors = 0;
1224 }
1225}
1226
1227static void fixup_discard_if_not_supported(struct request_queue *q)
1228{
1229 /* To avoid confusion, if this queue does not support discard, clear
1230 * max_discard_sectors, which is what lsblk -D reports to the user.
1231 * Older kernels got this wrong in "stack limits".
1232 * */
1233 if (!blk_queue_discard(q)) {
1234 blk_queue_max_discard_sectors(q, 0);
1235 blk_queue_discard_granularity(q, 0);
1236 }
1237}
1238
1239static void decide_on_write_same_support(struct drbd_device *device,
1240 struct request_queue *q,
1241 struct request_queue *b, struct o_qlim *o)
1242{
1243 struct drbd_peer_device *peer_device = first_peer_device(device);
1244 struct drbd_connection *connection = peer_device->connection;
1245 bool can_do = b ? b->limits.max_write_same_sectors : true;
1246
1247 if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
1248 can_do = false;
1249 drbd_info(peer_device, "peer does not support WRITE_SAME\n");
1250 }
1251
1252 if (o) {
1253 /* logical block size; queue_logical_block_size(NULL) is 512 */
1254 unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
1255 unsigned int me_lbs_b = queue_logical_block_size(b);
1256 unsigned int me_lbs = queue_logical_block_size(q);
1257
1258 if (me_lbs_b != me_lbs) {
1259 drbd_warn(device,
1260 "logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
1261 me_lbs, me_lbs_b);
1262 /* rather disable write same than trigger some BUG_ON later in the scsi layer. */
1263 can_do = false;
1264 }
1265 if (me_lbs_b != peer_lbs) {
1266 drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
1267 me_lbs, peer_lbs);
1268 if (can_do) {
1269 drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
1270 can_do = false;
1271 }
1272 me_lbs = max(me_lbs, me_lbs_b);
1273 /* We cannot change the logical block size of an in-use queue.
1274 * We can only hope that access happens to be properly aligned.
1275 * If not, the peer will likely produce an IO error, and detach. */
1276 if (peer_lbs > me_lbs) {
1277 if (device->state.role != R_PRIMARY) {
1278 blk_queue_logical_block_size(q, peer_lbs);
1279 drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
1280 } else {
1281 drbd_warn(peer_device,
1282 "current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
1283 me_lbs, peer_lbs);
1284 }
1285 }
1286 }
1287 if (can_do && !o->write_same_capable) {
1288 /* If we introduce an open-coded write-same loop on the receiving side,
1289 * the peer would present itself as "capable". */
1290 drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
1291 can_do = false;
1292 }
1293 }
1294
1295 blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
1296}
1297
1157static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, 1298static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
1158 unsigned int max_bio_size) 1299 unsigned int max_bio_size, struct o_qlim *o)
1159{ 1300{
1160 struct request_queue * const q = device->rq_queue; 1301 struct request_queue * const q = device->rq_queue;
1161 unsigned int max_hw_sectors = max_bio_size >> 9; 1302 unsigned int max_hw_sectors = max_bio_size >> 9;
1162 unsigned int max_segments = 0; 1303 unsigned int max_segments = 0;
1163 struct request_queue *b = NULL; 1304 struct request_queue *b = NULL;
1305 struct disk_conf *dc;
1306 bool discard_zeroes_if_aligned = true;
1164 1307
1165 if (bdev) { 1308 if (bdev) {
1166 b = bdev->backing_bdev->bd_disk->queue; 1309 b = bdev->backing_bdev->bd_disk->queue;
1167 1310
1168 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 1311 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1169 rcu_read_lock(); 1312 rcu_read_lock();
1170 max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; 1313 dc = rcu_dereference(device->ldev->disk_conf);
1314 max_segments = dc->max_bio_bvecs;
1315 discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned;
1171 rcu_read_unlock(); 1316 rcu_read_unlock();
1172 1317
1173 blk_set_stacking_limits(&q->limits); 1318 blk_set_stacking_limits(&q->limits);
1174 blk_queue_max_write_same_sectors(q, 0);
1175 } 1319 }
1176 1320
1177 blk_queue_logical_block_size(q, 512);
1178 blk_queue_max_hw_sectors(q, max_hw_sectors); 1321 blk_queue_max_hw_sectors(q, max_hw_sectors);
1179 /* This is the workaround for "bio would need to, but cannot, be split" */ 1322 /* This is the workaround for "bio would need to, but cannot, be split" */
1180 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); 1323 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1181 blk_queue_segment_boundary(q, PAGE_SIZE-1); 1324 blk_queue_segment_boundary(q, PAGE_SIZE-1);
1325 decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
1326 decide_on_write_same_support(device, q, b, o);
1182 1327
1183 if (b) { 1328 if (b) {
1184 struct drbd_connection *connection = first_peer_device(device)->connection;
1185
1186 blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
1187
1188 if (blk_queue_discard(b) &&
1189 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
1190 /* We don't care, stacking below should fix it for the local device.
1191 * Whether or not it is a suitable granularity on the remote device
1192 * is not our problem, really. If you care, you need to
1193 * use devices with similar topology on all peers. */
1194 q->limits.discard_granularity = 512;
1195 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1196 } else {
1197 blk_queue_max_discard_sectors(q, 0);
1198 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1199 q->limits.discard_granularity = 0;
1200 }
1201
1202 blk_queue_stack_limits(q, b); 1329 blk_queue_stack_limits(q, b);
1203 1330
1204 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 1331 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1208 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1335 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1209 } 1336 }
1210 } 1337 }
1211 /* To avoid confusion, if this queue does not support discard, clear 1338 fixup_discard_if_not_supported(q);
1212 * max_discard_sectors, which is what lsblk -D reports to the user. */
1213 if (!blk_queue_discard(q)) {
1214 blk_queue_max_discard_sectors(q, 0);
1215 q->limits.discard_granularity = 0;
1216 }
1217} 1339}
1218 1340
1219void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) 1341void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
1220{ 1342{
1221 unsigned int now, new, local, peer; 1343 unsigned int now, new, local, peer;
1222 1344
@@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin
1259 if (new != now) 1381 if (new != now)
1260 drbd_info(device, "max BIO size = %u\n", new); 1382 drbd_info(device, "max BIO size = %u\n", new);
1261 1383
1262 drbd_setup_queue_param(device, bdev, new); 1384 drbd_setup_queue_param(device, bdev, new, o);
1263} 1385}
1264 1386
1265/* Starts the worker thread */ 1387/* Starts the worker thread */
@@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
1348 a->disk_drain != b->disk_drain; 1470 a->disk_drain != b->disk_drain;
1349} 1471}
1350 1472
1473static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf,
1474 struct drbd_backing_dev *nbc)
1475{
1476 struct request_queue * const q = nbc->backing_bdev->bd_disk->queue;
1477
1478 if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1479 disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1480 if (disk_conf->al_extents > drbd_al_extents_max(nbc))
1481 disk_conf->al_extents = drbd_al_extents_max(nbc);
1482
1483 if (!blk_queue_discard(q)
1484 || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
1485 if (disk_conf->rs_discard_granularity) {
1486 disk_conf->rs_discard_granularity = 0; /* disable feature */
1487 drbd_info(device, "rs_discard_granularity feature disabled\n");
1488 }
1489 }
1490
1491 if (disk_conf->rs_discard_granularity) {
1492 int orig_value = disk_conf->rs_discard_granularity;
1493 int remainder;
1494
1495 if (q->limits.discard_granularity > disk_conf->rs_discard_granularity)
1496 disk_conf->rs_discard_granularity = q->limits.discard_granularity;
1497
1498 remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity;
1499 disk_conf->rs_discard_granularity += remainder;
1500
1501 if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9)
1502 disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9;
1503
1504 if (disk_conf->rs_discard_granularity != orig_value)
1505 drbd_info(device, "rs_discard_granularity changed to %d\n",
1506 disk_conf->rs_discard_granularity);
1507 }
1508}
1509
1351int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1510int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1352{ 1511{
1353 struct drbd_config_context adm_ctx; 1512 struct drbd_config_context adm_ctx;
@@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1395 if (!expect(new_disk_conf->resync_rate >= 1)) 1554 if (!expect(new_disk_conf->resync_rate >= 1))
1396 new_disk_conf->resync_rate = 1; 1555 new_disk_conf->resync_rate = 1;
1397 1556
1398 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) 1557 sanitize_disk_conf(device, new_disk_conf, device->ldev);
1399 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1400 if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
1401 new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
1402 1558
1403 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) 1559 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1404 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; 1560 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
@@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1457 if (write_ordering_changed(old_disk_conf, new_disk_conf)) 1613 if (write_ordering_changed(old_disk_conf, new_disk_conf))
1458 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); 1614 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
1459 1615
1616 if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
1617 drbd_reconsider_queue_parameters(device, device->ldev, NULL);
1618
1460 drbd_md_sync(device); 1619 drbd_md_sync(device);
1461 1620
1462 if (device->state.conn >= C_CONNECTED) { 1621 if (device->state.conn >= C_CONNECTED) {
@@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1693 if (retcode != NO_ERROR) 1852 if (retcode != NO_ERROR)
1694 goto fail; 1853 goto fail;
1695 1854
1696 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) 1855 sanitize_disk_conf(device, new_disk_conf, nbc);
1697 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1698 if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1699 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1700 1856
1701 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { 1857 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1702 drbd_err(device, "max capacity %llu smaller than disk size %llu\n", 1858 drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
@@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1838 device->read_cnt = 0; 1994 device->read_cnt = 0;
1839 device->writ_cnt = 0; 1995 device->writ_cnt = 0;
1840 1996
1841 drbd_reconsider_max_bio_size(device, device->ldev); 1997 drbd_reconsider_queue_parameters(device, device->ldev, NULL);
1842 1998
1843 /* If I am currently not R_PRIMARY, 1999 /* If I am currently not R_PRIMARY,
1844 * but meta data primary indicator is set, 2000 * but meta data primary indicator is set,
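
decide_on_discard_support() above only enables discards on the DRBD queue when the backing device can discard, when the backend either discards to zeroes or the new discard_zeroes_if_aligned option is set, and when any connected peer has agreed on DRBD_FF_TRIM. A condensed restatement of that gating in plain C; this is illustrative only (the diskless case and the queue plumbing are left out, and the helper below is not the driver's code):

#define FF_TRIM 1	/* stand-in for DRBD_FF_TRIM */

/* Nonzero if discards may be enabled on the replicated device. */
int discard_allowed(int backend_can_discard,
		    int backend_discard_zeroes_data,
		    int discard_zeroes_if_aligned,
		    int peer_connected,
		    unsigned int agreed_features)
{
	if (!backend_can_discard)
		return 0;
	/* without one of these, zeroing semantics cannot be guaranteed */
	if (!backend_discard_zeroes_data && !discard_zeroes_if_aligned)
		return 0;
	/* a connected peer must understand P_TRIM */
	if (peer_connected && !(agreed_features & FF_TRIM))
		return 0;
	return 1;
}
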
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 6537b25db9c1..be2b93fd2c11 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -25,7 +25,7 @@
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27 27
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
@@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
122 122
123 x = res/50; 123 x = res/50;
124 y = 20-x; 124 y = 20-x;
125 seq_printf(seq, "\t["); 125 seq_puts(seq, "\t[");
126 for (i = 1; i < x; i++) 126 for (i = 1; i < x; i++)
127 seq_printf(seq, "="); 127 seq_putc(seq, '=');
128 seq_printf(seq, ">"); 128 seq_putc(seq, '>');
129 for (i = 0; i < y; i++) 129 for (i = 0; i < y; i++)
130 seq_printf(seq, "."); 130 seq_printf(seq, ".");
131 seq_printf(seq, "] "); 131 seq_puts(seq, "] ");
132 132
133 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) 133 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
134 seq_printf(seq, "verified:"); 134 seq_puts(seq, "verified:");
135 else 135 else
136 seq_printf(seq, "sync'ed:"); 136 seq_puts(seq, "sync'ed:");
137 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); 137 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
138 138
139 /* if more than a few GB, display in MB */ 139 /* if more than a few GB, display in MB */
@@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
146 (unsigned long) Bit2KB(rs_left), 146 (unsigned long) Bit2KB(rs_left),
147 (unsigned long) Bit2KB(rs_total)); 147 (unsigned long) Bit2KB(rs_total));
148 148
149 seq_printf(seq, "\n\t"); 149 seq_puts(seq, "\n\t");
150 150
151 /* see drivers/md/md.c 151 /* see drivers/md/md.c
152 * We do not want to overflow, so the order of operands and 152 * We do not want to overflow, so the order of operands and
@@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
175 rt / 3600, (rt % 3600) / 60, rt % 60); 175 rt / 3600, (rt % 3600) / 60, rt % 60);
176 176
177 dbdt = Bit2KB(db/dt); 177 dbdt = Bit2KB(db/dt);
178 seq_printf(seq, " speed: "); 178 seq_puts(seq, " speed: ");
179 seq_printf_with_thousands_grouping(seq, dbdt); 179 seq_printf_with_thousands_grouping(seq, dbdt);
180 seq_printf(seq, " ("); 180 seq_puts(seq, " (");
181 /* ------------------------- ~3s average ------------------------ */ 181 /* ------------------------- ~3s average ------------------------ */
182 if (proc_details >= 1) { 182 if (proc_details >= 1) {
183 /* this is what drbd_rs_should_slow_down() uses */ 183 /* this is what drbd_rs_should_slow_down() uses */
@@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
188 db = device->rs_mark_left[i] - rs_left; 188 db = device->rs_mark_left[i] - rs_left;
189 dbdt = Bit2KB(db/dt); 189 dbdt = Bit2KB(db/dt);
190 seq_printf_with_thousands_grouping(seq, dbdt); 190 seq_printf_with_thousands_grouping(seq, dbdt);
191 seq_printf(seq, " -- "); 191 seq_puts(seq, " -- ");
192 } 192 }
193 193
194 /* --------------------- long term average ---------------------- */ 194 /* --------------------- long term average ---------------------- */
@@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
200 db = rs_total - rs_left; 200 db = rs_total - rs_left;
201 dbdt = Bit2KB(db/dt); 201 dbdt = Bit2KB(db/dt);
202 seq_printf_with_thousands_grouping(seq, dbdt); 202 seq_printf_with_thousands_grouping(seq, dbdt);
203 seq_printf(seq, ")"); 203 seq_putc(seq, ')');
204 204
205 if (state.conn == C_SYNC_TARGET || 205 if (state.conn == C_SYNC_TARGET ||
206 state.conn == C_VERIFY_S) { 206 state.conn == C_VERIFY_S) {
207 seq_printf(seq, " want: "); 207 seq_puts(seq, " want: ");
208 seq_printf_with_thousands_grouping(seq, device->c_sync_rate); 208 seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
209 } 209 }
210 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); 210 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
@@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
231 (unsigned long long)bm_bits * BM_SECT_PER_BIT); 231 (unsigned long long)bm_bits * BM_SECT_PER_BIT);
232 if (stop_sector != 0 && stop_sector != ULLONG_MAX) 232 if (stop_sector != 0 && stop_sector != ULLONG_MAX)
233 seq_printf(seq, " stop sector: %llu", stop_sector); 233 seq_printf(seq, " stop sector: %llu", stop_sector);
234 seq_printf(seq, "\n"); 234 seq_putc(seq, '\n');
235 } 235 }
236} 236}
237 237
@@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
276 rcu_read_lock(); 276 rcu_read_lock();
277 idr_for_each_entry(&drbd_devices, device, i) { 277 idr_for_each_entry(&drbd_devices, device, i) {
278 if (prev_i != i - 1) 278 if (prev_i != i - 1)
279 seq_printf(seq, "\n"); 279 seq_putc(seq, '\n');
280 prev_i = i; 280 prev_i = i;
281 281
282 state = device->state; 282 state = device->state;
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 129f8c76c9b1..4d296800f706 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -60,6 +60,15 @@ enum drbd_packet {
60 * which is why I chose TRIM here, to disambiguate. */ 60 * which is why I chose TRIM here, to disambiguate. */
61 P_TRIM = 0x31, 61 P_TRIM = 0x31,
62 62
63 /* Only use these two if both support FF_THIN_RESYNC */
64 P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
65 P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
66
67 /* REQ_WRITE_SAME.
68 * On a receiving side without REQ_WRITE_SAME,
69 * we may fall back to an opencoded loop instead. */
70 P_WSAME = 0x34,
71
63 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 72 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
64 P_MAX_OPT_CMD = 0x101, 73 P_MAX_OPT_CMD = 0x101,
65 74
@@ -106,8 +115,11 @@ struct p_header100 {
106 u32 pad; 115 u32 pad;
107} __packed; 116} __packed;
108 117
109/* these defines must not be changed without changing the protocol version */ 118/* These defines must not be changed without changing the protocol version.
110#define DP_HARDBARRIER 1 /* depricated */ 119 * New defines may only be introduced together with protocol version bump or
120 * new protocol feature flags.
121 */
122#define DP_HARDBARRIER 1 /* no longer used */
111#define DP_RW_SYNC 2 /* equals REQ_SYNC */ 123#define DP_RW_SYNC 2 /* equals REQ_SYNC */
112#define DP_MAY_SET_IN_SYNC 4 124#define DP_MAY_SET_IN_SYNC 4
113#define DP_UNPLUG 8 /* not used anymore */ 125#define DP_UNPLUG 8 /* not used anymore */
@@ -116,6 +128,7 @@ struct p_header100 {
116#define DP_DISCARD 64 /* equals REQ_DISCARD */ 128#define DP_DISCARD 64 /* equals REQ_DISCARD */
117#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ 129#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
118#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ 130#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
131#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */
119 132
120struct p_data { 133struct p_data {
121 u64 sector; /* 64 bits sector number */ 134 u64 sector; /* 64 bits sector number */
@@ -129,6 +142,11 @@ struct p_trim {
129 u32 size; /* == bio->bi_size */ 142 u32 size; /* == bio->bi_size */
130} __packed; 143} __packed;
131 144
145struct p_wsame {
146 struct p_data p_data;
147 u32 size; /* == bio->bi_size */
148} __packed;
149
132/* 150/*
133 * commands which share a struct: 151 * commands which share a struct:
134 * p_block_ack: 152 * p_block_ack:
@@ -160,7 +178,23 @@ struct p_block_req {
160 * ReportParams 178 * ReportParams
161 */ 179 */
162 180
163#define FF_TRIM 1 181/* supports TRIM/DISCARD on the "wire" protocol */
182#define DRBD_FF_TRIM 1
183
184/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
185 * instead of fully allocate a supposedly thin volume on initial resync */
186#define DRBD_FF_THIN_RESYNC 2
187
188/* supports REQ_WRITE_SAME on the "wire" protocol.
189 * Note: this flag is overloaded,
190 * its presence also
191 * - indicates support for 128 MiB "batch bios",
192 * max discard size of 128 MiB
193 * instead of 4M before that.
194 * - indicates that we exchange additional settings in p_sizes
195 * drbd_send_sizes()/receive_sizes()
196 */
197#define DRBD_FF_WSAME 4
164 198
165struct p_connection_features { 199struct p_connection_features {
166 u32 protocol_min; 200 u32 protocol_min;
@@ -235,6 +269,40 @@ struct p_rs_uuid {
235 u64 uuid; 269 u64 uuid;
236} __packed; 270} __packed;
237 271
272/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
273 * see also struct queue_limits, as of late 2015 */
274struct o_qlim {
275 /* we don't need it yet, but we may as well communicate it now */
276 u32 physical_block_size;
277
278 /* so the original in struct queue_limits is unsigned short,
279 * but I'd have to put in padding anyways. */
280 u32 logical_block_size;
281
282 /* One incoming bio becomes one DRBD request,
283 * which may be translated to several bio on the receiving side.
284 * We don't need to communicate chunk/boundary/segment ... limits.
285 */
286
287 /* various IO hints may be useful with "diskless client" setups */
288 u32 alignment_offset;
289 u32 io_min;
290 u32 io_opt;
291
292 /* We may need to communicate integrity stuff at some point,
293 * but let's not get ahead of ourselves. */
294
295 /* Backend discard capabilities.
296 * Receiving side uses "blkdev_issue_discard()", no need to communicate
297 * more specifics. If the backend cannot do discards, the DRBD peer
298 * may fall back to blkdev_issue_zeroout().
299 */
300 u8 discard_enabled;
301 u8 discard_zeroes_data;
302 u8 write_same_capable;
303 u8 _pad;
304} __packed;
305
238struct p_sizes { 306struct p_sizes {
239 u64 d_size; /* size of disk */ 307 u64 d_size; /* size of disk */
240 u64 u_size; /* user requested size */ 308 u64 u_size; /* user requested size */
@@ -242,6 +310,9 @@ struct p_sizes {
242 u32 max_bio_size; /* Maximal size of a BIO */ 310 u32 max_bio_size; /* Maximal size of a BIO */
243 u16 queue_order_type; /* not yet implemented in DRBD*/ 311 u16 queue_order_type; /* not yet implemented in DRBD*/
244 u16 dds_flags; /* use enum dds_flags here. */ 312 u16 dds_flags; /* use enum dds_flags here. */
313
314 /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
315 struct o_qlim qlim[0];
245} __packed; 316} __packed;
246 317
247struct p_state { 318struct p_state {
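
Since struct p_sizes ends in a zero-length qlim[] array, the extra queue-limit block only travels on the wire when DRBD_FF_WSAME was agreed; drbdd() later accounts for that by adding sizeof(struct o_qlim) to the expected header size of P_SIZES. A simplified user-space sketch of that "optional trailing block" layout follows; the two structs and wire_sizes_len() are hypothetical stand-ins, not the real DRBD field layout, and the big-endian conversion the real protocol does is omitted.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FF_WSAME 4	/* mirrors DRBD_FF_WSAME */

/* Hypothetical, simplified wire structs -- NOT the real DRBD layout. */
struct wire_qlim {
	uint32_t logical_block_size;
	uint32_t physical_block_size;
} __attribute__((packed));

struct wire_sizes {
	uint64_t d_size;
	uint32_t max_bio_size;
	struct wire_qlim qlim[];	/* present only if FF_WSAME was agreed */
} __attribute__((packed));

static size_t wire_sizes_len(unsigned int agreed_features)
{
	size_t len = sizeof(struct wire_sizes);

	if (agreed_features & FF_WSAME)
		len += sizeof(struct wire_qlim);
	return len;
}

int main(void)
{
	unsigned int features = FF_WSAME;
	struct wire_sizes *p = calloc(1, wire_sizes_len(features));

	if (!p)
		return 1;
	p->d_size = 1 << 20;
	if (features & FF_WSAME)	/* only then is qlim[0] valid */
		p->qlim[0].logical_block_size = 512;
	printf("packet payload: %zu bytes\n", wire_sizes_len(features));
	free(p);
	return 0;
}
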
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1ee002352ea2..df45713dfbe8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -25,7 +25,7 @@
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27 27
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <net/sock.h> 29#include <net/sock.h>
30 30
31#include <linux/drbd.h> 31#include <linux/drbd.h>
@@ -48,7 +48,7 @@
48#include "drbd_req.h" 48#include "drbd_req.h"
49#include "drbd_vli.h" 49#include "drbd_vli.h"
50 50
51#define PRO_FEATURES (FF_TRIM) 51#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
52 52
53struct packet_info { 53struct packet_info {
54 enum drbd_packet cmd; 54 enum drbd_packet cmd;
@@ -361,14 +361,17 @@ You must not have the req_lock:
361 drbd_wait_ee_list_empty() 361 drbd_wait_ee_list_empty()
362*/ 362*/
363 363
364/* normal: payload_size == request size (bi_size)
365 * w_same: payload_size == logical_block_size
366 * trim: payload_size == 0 */
364struct drbd_peer_request * 367struct drbd_peer_request *
365drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 368drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
366 unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) 369 unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
367{ 370{
368 struct drbd_device *device = peer_device->device; 371 struct drbd_device *device = peer_device->device;
369 struct drbd_peer_request *peer_req; 372 struct drbd_peer_request *peer_req;
370 struct page *page = NULL; 373 struct page *page = NULL;
371 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 374 unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
372 375
373 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) 376 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
374 return NULL; 377 return NULL;
@@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
380 return NULL; 383 return NULL;
381 } 384 }
382 385
383 if (has_payload && data_size) { 386 if (nr_pages) {
384 page = drbd_alloc_pages(peer_device, nr_pages, 387 page = drbd_alloc_pages(peer_device, nr_pages,
385 gfpflags_allow_blocking(gfp_mask)); 388 gfpflags_allow_blocking(gfp_mask));
386 if (!page) 389 if (!page)
@@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
390 memset(peer_req, 0, sizeof(*peer_req)); 393 memset(peer_req, 0, sizeof(*peer_req));
391 INIT_LIST_HEAD(&peer_req->w.list); 394 INIT_LIST_HEAD(&peer_req->w.list);
392 drbd_clear_interval(&peer_req->i); 395 drbd_clear_interval(&peer_req->i);
393 peer_req->i.size = data_size; 396 peer_req->i.size = request_size;
394 peer_req->i.sector = sector; 397 peer_req->i.sector = sector;
395 peer_req->submit_jif = jiffies; 398 peer_req->submit_jif = jiffies;
396 peer_req->peer_device = peer_device; 399 peer_req->peer_device = peer_device;
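
With the new drbd_alloc_peer_req() signature the page allocation is driven by payload_size, while the request interval keeps request_size: per the comment above, a plain write carries its full bi_size as payload, a trim carries none, and a write-same carries exactly one logical block. A small sketch of that arithmetic; payload_pages() is a hypothetical helper and a 4 KiB PAGE_SIZE is assumed for the numbers.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)	/* 4 KiB assumed for this example */

/* same rounding as drbd_alloc_peer_req(): pages needed for the payload */
static unsigned int payload_pages(unsigned long payload_size)
{
	return (payload_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	/* normal 64 KiB write: payload == request size */
	printf("write      : request 65536, payload 65536 -> %u pages\n",
	       payload_pages(65536));
	/* 1 MiB trim: no payload at all */
	printf("trim       : request 1048576, payload 0   -> %u pages\n",
	       payload_pages(0));
	/* 1 MiB write-same: payload is one 512-byte logical block */
	printf("write-same : request 1048576, payload 512 -> %u pages\n",
	       payload_pages(512));
	return 0;
}
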
@@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in
1204 return err; 1207 return err;
1205} 1208}
1206 1209
1207static void drbd_flush(struct drbd_connection *connection) 1210/* This is blkdev_issue_flush, but asynchronous.
1211 * We want to submit to all component volumes in parallel,
1212 * then wait for all completions.
1213 */
1214struct issue_flush_context {
1215 atomic_t pending;
1216 int error;
1217 struct completion done;
1218};
1219struct one_flush_context {
1220 struct drbd_device *device;
1221 struct issue_flush_context *ctx;
1222};
1223
1224void one_flush_endio(struct bio *bio)
1208{ 1225{
1209 int rv; 1226 struct one_flush_context *octx = bio->bi_private;
1210 struct drbd_peer_device *peer_device; 1227 struct drbd_device *device = octx->device;
1211 int vnr; 1228 struct issue_flush_context *ctx = octx->ctx;
1212 1229
1230 if (bio->bi_error) {
1231 ctx->error = bio->bi_error;
1232 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
1233 }
1234 kfree(octx);
1235 bio_put(bio);
1236
1237 clear_bit(FLUSH_PENDING, &device->flags);
1238 put_ldev(device);
1239 kref_put(&device->kref, drbd_destroy_device);
1240
1241 if (atomic_dec_and_test(&ctx->pending))
1242 complete(&ctx->done);
1243}
1244
1245static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1246{
1247 struct bio *bio = bio_alloc(GFP_NOIO, 0);
1248 struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1249 if (!bio || !octx) {
1250 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
1251 /* FIXME: what else can I do now? disconnecting or detaching
1252 * really does not help to improve the state of the world, either.
1253 */
1254 kfree(octx);
1255 if (bio)
1256 bio_put(bio);
1257
1258 ctx->error = -ENOMEM;
1259 put_ldev(device);
1260 kref_put(&device->kref, drbd_destroy_device);
1261 return;
1262 }
1263
1264 octx->device = device;
1265 octx->ctx = ctx;
1266 bio->bi_bdev = device->ldev->backing_bdev;
1267 bio->bi_private = octx;
1268 bio->bi_end_io = one_flush_endio;
1269 bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH);
1270
1271 device->flush_jif = jiffies;
1272 set_bit(FLUSH_PENDING, &device->flags);
1273 atomic_inc(&ctx->pending);
1274 submit_bio(bio);
1275}
1276
1277static void drbd_flush(struct drbd_connection *connection)
1278{
1213 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { 1279 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1280 struct drbd_peer_device *peer_device;
1281 struct issue_flush_context ctx;
1282 int vnr;
1283
1284 atomic_set(&ctx.pending, 1);
1285 ctx.error = 0;
1286 init_completion(&ctx.done);
1287
1214 rcu_read_lock(); 1288 rcu_read_lock();
1215 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1289 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1216 struct drbd_device *device = peer_device->device; 1290 struct drbd_device *device = peer_device->device;
@@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection)
1220 kref_get(&device->kref); 1294 kref_get(&device->kref);
1221 rcu_read_unlock(); 1295 rcu_read_unlock();
1222 1296
1223 /* Right now, we have only this one synchronous code path 1297 submit_one_flush(device, &ctx);
1224 * for flushes between request epochs.
1225 * We may want to make those asynchronous,
1226 * or at least parallelize the flushes to the volume devices.
1227 */
1228 device->flush_jif = jiffies;
1229 set_bit(FLUSH_PENDING, &device->flags);
1230 rv = blkdev_issue_flush(device->ldev->backing_bdev,
1231 GFP_NOIO, NULL);
1232 clear_bit(FLUSH_PENDING, &device->flags);
1233 if (rv) {
1234 drbd_info(device, "local disk flush failed with status %d\n", rv);
1235 /* would rather check on EOPNOTSUPP, but that is not reliable.
1236 * don't try again for ANY return value != 0
1237 * if (rv == -EOPNOTSUPP) */
1238 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1239 }
1240 put_ldev(device);
1241 kref_put(&device->kref, drbd_destroy_device);
1242 1298
1243 rcu_read_lock(); 1299 rcu_read_lock();
1244 if (rv)
1245 break;
1246 } 1300 }
1247 rcu_read_unlock(); 1301 rcu_read_unlock();
1302
1303 /* Do we want to add a timeout,
1304 * if disk-timeout is set? */
1305 if (!atomic_dec_and_test(&ctx.pending))
1306 wait_for_completion(&ctx.done);
1307
1308 if (ctx.error) {
1309 /* would rather check on EOPNOTSUPP, but that is not reliable.
1310 * don't try again for ANY return value != 0
1311 * if (rv == -EOPNOTSUPP) */
1312 /* Any error is already reported by bio_endio callback. */
1313 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1314 }
1248 } 1315 }
1249} 1316}
1250 1317
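
The new drbd_flush() keeps one extra count on ctx.pending for the submitter itself, so a completion can only finish the context after every volume has been submitted; the last completion to drop the count signals the waiter, and if the submitter's own decrement already reaches zero it skips the wait entirely. A user-space sketch of that counting pattern with pthreads follows; the flush is faked with a sleep, and all names (issue_ctx, ctx_put, fake_flush) are illustrative only.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct issue_ctx {
	atomic_int pending;	/* starts at 1: the submitter's own reference */
	int error;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

/* drop one reference; the last one signals the waiter */
static void ctx_put(struct issue_ctx *ctx)
{
	if (atomic_fetch_sub(&ctx->pending, 1) == 1) {
		pthread_mutex_lock(&ctx->lock);
		pthread_cond_signal(&ctx->done);
		pthread_mutex_unlock(&ctx->lock);
	}
}

/* stand-in for one backing-device flush completing asynchronously;
 * a real completion would also record its error in ctx->error */
static void *fake_flush(void *arg)
{
	struct issue_ctx *ctx = arg;

	usleep(10000);		/* pretend the device takes 10 ms */
	ctx_put(ctx);		/* "endio": completion drops its reference */
	return NULL;
}

int main(void)
{
	struct issue_ctx ctx;
	pthread_t tid[3];
	int i;

	atomic_init(&ctx.pending, 1);	/* submitter's reference, as in drbd_flush() */
	ctx.error = 0;
	pthread_mutex_init(&ctx.lock, NULL);
	pthread_cond_init(&ctx.done, NULL);

	for (i = 0; i < 3; i++) {	/* submit to all "volumes" in parallel */
		atomic_fetch_add(&ctx.pending, 1);
		pthread_create(&tid[i], NULL, fake_flush, &ctx);
	}

	/* drop our own reference; wait only if completions are still pending */
	if (atomic_fetch_sub(&ctx.pending, 1) != 1) {
		pthread_mutex_lock(&ctx.lock);
		while (atomic_load(&ctx.pending) != 0)
			pthread_cond_wait(&ctx.done, &ctx.lock);
		pthread_mutex_unlock(&ctx.lock);
	}

	for (i = 0; i < 3; i++)
		pthread_join(tid[i], NULL);
	printf("all flushes completed, error=%d\n", ctx.error);
	return 0;
}
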
@@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1379 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1446 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1380} 1447}
1381 1448
1449/*
1450 * We *may* ignore the discard-zeroes-data setting, if so configured.
1451 *
1452 * Assumption is that "discard_zeroes_data=0" is only set because the backend
1453 * may ignore partial unaligned discards.
1454 *
1455 * LVM/DM thin as of at least
1456 * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
1457 * Library version: 1.02.93-RHEL7 (2015-01-28)
1458 * Driver version: 4.29.0
1459 * still behaves this way.
1460 *
1461 * For unaligned (wrt. alignment and granularity) or too small discards,
1462 * we zero-out the initial (and/or) trailing unaligned partial chunks,
1463 * but discard all the aligned full chunks.
1464 *
1465 * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
1466 */
1467int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
1468{
1469 struct block_device *bdev = device->ldev->backing_bdev;
1470 struct request_queue *q = bdev_get_queue(bdev);
1471 sector_t tmp, nr;
1472 unsigned int max_discard_sectors, granularity;
1473 int alignment;
1474 int err = 0;
1475
1476 if (!discard)
1477 goto zero_out;
1478
1479 /* Zero-sector (unknown) and one-sector granularities are the same. */
1480 granularity = max(q->limits.discard_granularity >> 9, 1U);
1481 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1482
1483 max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
1484 max_discard_sectors -= max_discard_sectors % granularity;
1485 if (unlikely(!max_discard_sectors))
1486 goto zero_out;
1487
1488 if (nr_sectors < granularity)
1489 goto zero_out;
1490
1491 tmp = start;
1492 if (sector_div(tmp, granularity) != alignment) {
1493 if (nr_sectors < 2*granularity)
1494 goto zero_out;
1495 /* start + gran - (start + gran - align) % gran */
1496 tmp = start + granularity - alignment;
1497 tmp = start + granularity - sector_div(tmp, granularity);
1498
1499 nr = tmp - start;
1500 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
1501 nr_sectors -= nr;
1502 start = tmp;
1503 }
1504 while (nr_sectors >= granularity) {
1505 nr = min_t(sector_t, nr_sectors, max_discard_sectors);
1506 err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
1507 nr_sectors -= nr;
1508 start += nr;
1509 }
1510 zero_out:
1511 if (nr_sectors) {
1512 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
1513 }
1514 return err != 0;
1515}
1516
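1517/*
1518 * Editor-style sketch of the boundary arithmetic above, in plain C so it can
1519 * be checked in isolation.  split_range() is a hypothetical helper that mirrors
1520 * drbd_issue_discard_or_zero_out(): zero the misaligned head and tail, discard
1521 * only the full, properly aligned chunks in between.  The max_discard_sectors
1522 * clamping loop and the actual blkdev_issue_*() calls are intentionally left out.
1523 *
1524 * #include <stdio.h>
1525 *
1526 * typedef unsigned long long sector_t;
1527 *
1528 * static void split_range(sector_t start, sector_t nr_sectors,
1529 *			   sector_t granularity, sector_t alignment,
1530 *			   sector_t *head, sector_t *body, sector_t *tail)
1531 * {
1532 *	*head = *body = 0;
1533 *	if (nr_sectors < granularity)
1534 *		goto zero_rest;		/• too small: zero out everything •/
1535 *	if (start % granularity != alignment) {
1536 *		if (nr_sectors < 2 * granularity)
1537 *			goto zero_rest;	/• not worth splitting: zero out everything •/
1538 *		/• first properly aligned sector strictly after start •/
1539 *		sector_t next = start + granularity -
1540 *				(start + granularity - alignment) % granularity;
1541 *		*head = next - start;	/• zeroed out •/
1542 *		start = next;
1543 *		nr_sectors -= *head;
1544 *	}
1545 *	*body = nr_sectors - nr_sectors % granularity;	/• discarded •/
1546 *	nr_sectors -= *body;
1547 * zero_rest:
1548 *	*tail = nr_sectors;				/• zeroed out •/
1549 * }
1550 *
1551 * int main(void)
1552 * {
1553 *	sector_t head, body, tail;
1554 *
1555 *	/• 8 MiB request starting 3 sectors past a 4 MiB discard boundary •/
1556 *	split_range(8192 + 3, 16384, 8192, 0, &head, &body, &tail);
1557 *	printf("zero %llu, discard %llu, zero %llu sectors\n", head, body, tail);
1558 *	return 0;
1559 * }
1560 */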
1517static bool can_do_reliable_discards(struct drbd_device *device)
1518{
1519 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
1520 struct disk_conf *dc;
1521 bool can_do;
1522
1523 if (!blk_queue_discard(q))
1524 return false;
1525
1526 if (q->limits.discard_zeroes_data)
1527 return true;
1528
1529 rcu_read_lock();
1530 dc = rcu_dereference(device->ldev->disk_conf);
1531 can_do = dc->discard_zeroes_if_aligned;
1532 rcu_read_unlock();
1533 return can_do;
1534}
1535
1536static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
1537{
1538 /* If the backend cannot discard, or does not guarantee
1539 * read-back zeroes in discarded ranges, we fall back to
1540 * zero-out. Unless configuration specifically requested
1541 * otherwise. */
1542 if (!can_do_reliable_discards(device))
1543 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
1544
1545 if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1546 peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
1547 peer_req->flags |= EE_WAS_ERROR;
1548 drbd_endio_write_sec_final(peer_req);
1549}
1550
1551static void drbd_issue_peer_wsame(struct drbd_device *device,
1552 struct drbd_peer_request *peer_req)
1553{
1554 struct block_device *bdev = device->ldev->backing_bdev;
1555 sector_t s = peer_req->i.sector;
1556 sector_t nr = peer_req->i.size >> 9;
1557 if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
1558 peer_req->flags |= EE_WAS_ERROR;
1559 drbd_endio_write_sec_final(peer_req);
1560}
1561
1562
1382/** 1563/**
1383 * drbd_submit_peer_request() 1564 * drbd_submit_peer_request()
1384 * @device: DRBD device. 1565 * @device: DRBD device.
@@ -1410,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
1410 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 1591 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
1411 int err = -ENOMEM; 1592 int err = -ENOMEM;
1412 1593
1413 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { 1594 /* TRIM/DISCARD: for now, always use the helper function
1595 * blkdev_issue_zeroout(..., discard=true).
1596 * It's synchronous, but it does the right thing wrt. bio splitting.
1597 * Correctness first, performance later. Next step is to code an
1598 * asynchronous variant of the same.
1599 */
1600 if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
1414 /* wait for all pending IO completions, before we start 1601 /* wait for all pending IO completions, before we start
1415 * zeroing things out. */ 1602 * zeroing things out. */
1416 conn_wait_active_ee_empty(peer_req->peer_device->connection); 1603 conn_wait_active_ee_empty(peer_req->peer_device->connection);
@@ -1418,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device,
1418 * so we can find it to present it in debugfs */ 1605 * so we can find it to present it in debugfs */
1419 peer_req->submit_jif = jiffies; 1606 peer_req->submit_jif = jiffies;
1420 peer_req->flags |= EE_SUBMITTED; 1607 peer_req->flags |= EE_SUBMITTED;
1421 spin_lock_irq(&device->resource->req_lock); 1608
1422 list_add_tail(&peer_req->w.list, &device->active_ee); 1609 /* If this was a resync request from receive_rs_deallocated(),
1423 spin_unlock_irq(&device->resource->req_lock); 1610 * it is already on the sync_ee list */
1424 if (blkdev_issue_zeroout(device->ldev->backing_bdev, 1611 if (list_empty(&peer_req->w.list)) {
1425 sector, data_size >> 9, GFP_NOIO, false)) 1612 spin_lock_irq(&device->resource->req_lock);
1426 peer_req->flags |= EE_WAS_ERROR; 1613 list_add_tail(&peer_req->w.list, &device->active_ee);
1427 drbd_endio_write_sec_final(peer_req); 1614 spin_unlock_irq(&device->resource->req_lock);
1615 }
1616
1617 if (peer_req->flags & EE_IS_TRIM)
1618 drbd_issue_peer_discard(device, peer_req);
1619 else /* EE_WRITE_SAME */
1620 drbd_issue_peer_wsame(device, peer_req);
1428 return 0; 1621 return 0;
1429 } 1622 }
1430 1623
1431 /* Discards don't have any payload.
1432 * But the scsi layer still expects a bio_vec it can use internally,
1433 * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
1434 if (peer_req->flags & EE_IS_TRIM)
1435 nr_pages = 1;
1436
1437 /* In most cases, we will only need one bio. But in case the lower 1624 /* In most cases, we will only need one bio. But in case the lower
1438 * level restrictions happen to be different at this offset on this 1625 * level restrictions happen to be different at this offset on this
1439 * side than those of the sending peer, we may need to submit the 1626 * side than those of the sending peer, we may need to submit the
@@ -1459,11 +1646,6 @@ next_bio:
1459 bios = bio; 1646 bios = bio;
1460 ++n_bios; 1647 ++n_bios;
1461 1648
1462 if (op == REQ_OP_DISCARD) {
1463 bio->bi_iter.bi_size = data_size;
1464 goto submit;
1465 }
1466
1467 page_chain_for_each(page) { 1649 page_chain_for_each(page) {
1468 unsigned len = min_t(unsigned, data_size, PAGE_SIZE); 1650 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1469 if (!bio_add_page(bio, page, len, 0)) { 1651 if (!bio_add_page(bio, page, len, 0)) {
@@ -1485,7 +1667,6 @@ next_bio:
1485 --nr_pages; 1667 --nr_pages;
1486 } 1668 }
1487 D_ASSERT(device, data_size == 0); 1669 D_ASSERT(device, data_size == 0);
1488submit:
1489 D_ASSERT(device, page == NULL); 1670 D_ASSERT(device, page == NULL);
1490 1671
1491 atomic_set(&peer_req->pending_bios, n_bios); 1672 atomic_set(&peer_req->pending_bios, n_bios);
@@ -1609,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1609 return 0; 1790 return 0;
1610} 1791}
1611 1792
1793/* quick wrapper in case payload size != request_size (write same) */
1794static void drbd_csum_ee_size(struct crypto_ahash *h,
1795 struct drbd_peer_request *r, void *d,
1796 unsigned int payload_size)
1797{
1798 unsigned int tmp = r->i.size;
1799 r->i.size = payload_size;
1800 drbd_csum_ee(h, r, d);
1801 r->i.size = tmp;
1802}
1803
1612/* used from receive_RSDataReply (recv_resync_read) 1804/* used from receive_RSDataReply (recv_resync_read)
1613 * and from receive_Data */ 1805 * and from receive_Data.
1806 * data_size: actual payload ("data in")
1807 * for normal writes that is bi_size.
1808 * for discards, that is zero.
1809 * for write same, it is logical_block_size.
1810 * both trim and write same have the bi_size ("data len to be affected")
1811 * as extra argument in the packet header.
1812 */
1614static struct drbd_peer_request * 1813static struct drbd_peer_request *
1615read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 1814read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1616 struct packet_info *pi) __must_hold(local) 1815 struct packet_info *pi) __must_hold(local)
@@ -1625,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1625 void *dig_vv = peer_device->connection->int_dig_vv; 1824 void *dig_vv = peer_device->connection->int_dig_vv;
1626 unsigned long *data; 1825 unsigned long *data;
1627 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; 1826 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1827 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
1628 1828
1629 digest_size = 0; 1829 digest_size = 0;
1630 if (!trim && peer_device->connection->peer_integrity_tfm) { 1830 if (!trim && peer_device->connection->peer_integrity_tfm) {
@@ -1639,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1639 data_size -= digest_size; 1839 data_size -= digest_size;
1640 } 1840 }
1641 1841
1842 /* assume request_size == data_size, but special case trim and wsame. */
1843 ds = data_size;
1642 if (trim) { 1844 if (trim) {
1643 D_ASSERT(peer_device, data_size == 0); 1845 if (!expect(data_size == 0))
1644 data_size = be32_to_cpu(trim->size); 1846 return NULL;
1847 ds = be32_to_cpu(trim->size);
1848 } else if (wsame) {
1849 if (data_size != queue_logical_block_size(device->rq_queue)) {
1850 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1851 data_size, queue_logical_block_size(device->rq_queue));
1852 return NULL;
1853 }
1854 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1855 drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1856 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1857 return NULL;
1858 }
1859 ds = be32_to_cpu(wsame->size);
1645 } 1860 }
1646 1861
1647 if (!expect(IS_ALIGNED(data_size, 512))) 1862 if (!expect(IS_ALIGNED(ds, 512)))
1648 return NULL; 1863 return NULL;
1649 /* prepare for larger trim requests. */ 1864 if (trim || wsame) {
1650 if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) 1865 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1866 return NULL;
1867 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
1651 return NULL; 1868 return NULL;
1652 1869
1653 /* even though we trust our peer, 1870 /* even though we trust our peer,
1654 * we sometimes have to double check. */ 1871 * we sometimes have to double check. */
1655 if (sector + (data_size>>9) > capacity) { 1872 if (sector + (ds>>9) > capacity) {
1656 drbd_err(device, "request from peer beyond end of local disk: " 1873 drbd_err(device, "request from peer beyond end of local disk: "
1657 "capacity: %llus < sector: %llus + size: %u\n", 1874 "capacity: %llus < sector: %llus + size: %u\n",
1658 (unsigned long long)capacity, 1875 (unsigned long long)capacity,
1659 (unsigned long long)sector, data_size); 1876 (unsigned long long)sector, ds);
1660 return NULL; 1877 return NULL;
1661 } 1878 }
1662 1879
1663 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1880 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1664 * "criss-cross" setup, that might cause write-out on some other DRBD, 1881 * "criss-cross" setup, that might cause write-out on some other DRBD,
1665 * which in turn might block on the other node at this very place. */ 1882 * which in turn might block on the other node at this very place. */
1666 peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); 1883 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1667 if (!peer_req) 1884 if (!peer_req)
1668 return NULL; 1885 return NULL;
1669 1886
1670 peer_req->flags |= EE_WRITE; 1887 peer_req->flags |= EE_WRITE;
1671 if (trim) 1888 if (trim) {
1889 peer_req->flags |= EE_IS_TRIM;
1672 return peer_req; 1890 return peer_req;
1891 }
1892 if (wsame)
1893 peer_req->flags |= EE_WRITE_SAME;
1673 1894
1895 /* receive payload size bytes into page chain */
1674 ds = data_size; 1896 ds = data_size;
1675 page = peer_req->pages; 1897 page = peer_req->pages;
1676 page_chain_for_each(page) { 1898 page_chain_for_each(page) {
@@ -1690,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1690 } 1912 }
1691 1913
1692 if (digest_size) { 1914 if (digest_size) {
1693 drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); 1915 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1694 if (memcmp(dig_in, dig_vv, digest_size)) { 1916 if (memcmp(dig_in, dig_vv, digest_size)) {
1695 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", 1917 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1696 (unsigned long long)sector, data_size); 1918 (unsigned long long)sector, data_size);
@@ -2067,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2067static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) 2289static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2068{ 2290{
2069 struct drbd_peer_request *rs_req; 2291 struct drbd_peer_request *rs_req;
2070 bool rv = 0; 2292 bool rv = false;
2071 2293
2072 spin_lock_irq(&device->resource->req_lock); 2294 spin_lock_irq(&device->resource->req_lock);
2073 list_for_each_entry(rs_req, &device->sync_ee, w.list) { 2295 list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2074 if (overlaps(peer_req->i.sector, peer_req->i.size, 2296 if (overlaps(peer_req->i.sector, peer_req->i.size,
2075 rs_req->i.sector, rs_req->i.size)) { 2297 rs_req->i.sector, rs_req->i.size)) {
2076 rv = 1; 2298 rv = true;
2077 break; 2299 break;
2078 } 2300 }
2079 } 2301 }
@@ -2354,10 +2576,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2354 op = wire_flags_to_bio_op(dp_flags); 2576 op = wire_flags_to_bio_op(dp_flags);
2355 op_flags = wire_flags_to_bio_flags(dp_flags); 2577 op_flags = wire_flags_to_bio_flags(dp_flags);
2356 if (pi->cmd == P_TRIM) { 2578 if (pi->cmd == P_TRIM) {
2357 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
2358 peer_req->flags |= EE_IS_TRIM;
2359 if (!blk_queue_discard(q))
2360 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
2361 D_ASSERT(peer_device, peer_req->i.size > 0); 2579 D_ASSERT(peer_device, peer_req->i.size > 0);
2362 D_ASSERT(peer_device, op == REQ_OP_DISCARD); 2580 D_ASSERT(peer_device, op == REQ_OP_DISCARD);
2363 D_ASSERT(peer_device, peer_req->pages == NULL); 2581 D_ASSERT(peer_device, peer_req->pages == NULL);
@@ -2424,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2424 update_peer_seq(peer_device, peer_seq); 2642 update_peer_seq(peer_device, peer_seq);
2425 spin_lock_irq(&device->resource->req_lock); 2643 spin_lock_irq(&device->resource->req_lock);
2426 } 2644 }
2427 /* if we use the zeroout fallback code, we process synchronously 2645 /* TRIM and WRITE_SAME are processed synchronously,
2428 * and we wait for all pending requests, respectively wait for 2646 * we wait for all pending requests, respectively wait for
2429 * active_ee to become empty in drbd_submit_peer_request(); 2647 * active_ee to become empty in drbd_submit_peer_request();
2430 * better not add ourselves here. */ 2648 * better not add ourselves here. */
2431 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) 2649 if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
2432 list_add_tail(&peer_req->w.list, &device->active_ee); 2650 list_add_tail(&peer_req->w.list, &device->active_ee);
2433 spin_unlock_irq(&device->resource->req_lock); 2651 spin_unlock_irq(&device->resource->req_lock);
2434 2652
@@ -2460,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2460 } 2678 }
2461 2679
2462out_interrupted: 2680out_interrupted:
2463 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); 2681 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
2464 put_ldev(device); 2682 put_ldev(device);
2465 drbd_free_peer_req(device, peer_req); 2683 drbd_free_peer_req(device, peer_req);
2466 return err; 2684 return err;
@@ -2585,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2585 case P_DATA_REQUEST: 2803 case P_DATA_REQUEST:
2586 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); 2804 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2587 break; 2805 break;
2806 case P_RS_THIN_REQ:
2588 case P_RS_DATA_REQUEST: 2807 case P_RS_DATA_REQUEST:
2589 case P_CSUM_RS_REQUEST: 2808 case P_CSUM_RS_REQUEST:
2590 case P_OV_REQUEST: 2809 case P_OV_REQUEST:
@@ -2610,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2610 * "criss-cross" setup, that might cause write-out on some other DRBD, 2829 * "criss-cross" setup, that might cause write-out on some other DRBD,
2611 * which in turn might block on the other node at this very place. */ 2830 * which in turn might block on the other node at this very place. */
2612 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, 2831 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2613 true /* has real payload */, GFP_NOIO); 2832 size, GFP_NOIO);
2614 if (!peer_req) { 2833 if (!peer_req) {
2615 put_ldev(device); 2834 put_ldev(device);
2616 return -ENOMEM; 2835 return -ENOMEM;
@@ -2624,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2624 peer_req->flags |= EE_APPLICATION; 2843 peer_req->flags |= EE_APPLICATION;
2625 goto submit; 2844 goto submit;
2626 2845
2846 case P_RS_THIN_REQ:
2847 /* If at some point in the future we have a smart way to
2848 find out if this data block is completely deallocated,
2849 then we would do something smarter here than reading
2850 the block... */
2851 peer_req->flags |= EE_RS_THIN_REQ;
2627 case P_RS_DATA_REQUEST: 2852 case P_RS_DATA_REQUEST:
2628 peer_req->w.cb = w_e_end_rsdata_req; 2853 peer_req->w.cb = w_e_end_rsdata_req;
2629 fault_type = DRBD_FAULT_RS_RD; 2854 fault_type = DRBD_FAULT_RS_RD;
@@ -2969,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
2969-1091 requires proto 91 3194-1091 requires proto 91
2970-1096 requires proto 96 3195-1096 requires proto 96
2971 */ 3196 */
2972static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) 3197
3198static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
2973{ 3199{
2974 struct drbd_peer_device *const peer_device = first_peer_device(device); 3200 struct drbd_peer_device *const peer_device = first_peer_device(device);
2975 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 3201 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
@@ -3049,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m
3049 * next bit (weight 2) is set when peer was primary */ 3275 * next bit (weight 2) is set when peer was primary */
3050 *rule_nr = 40; 3276 *rule_nr = 40;
3051 3277
3278 /* Neither has the "crashed primary" flag set,
3279 * only a replication link hiccup. */
3280 if (rct == 0)
3281 return 0;
3282
3283 /* Current UUID equal and no bitmap uuid; does not necessarily
3284 * mean this was a "simultaneous hard crash", maybe IO was
3285 * frozen, so no UUID-bump happened.
3286 * This is a protocol change, overload DRBD_FF_WSAME as flag
3287 * for "new-enough" peer DRBD version. */
3288 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
3289 *rule_nr = 41;
3290 if (!(connection->agreed_features & DRBD_FF_WSAME)) {
3291 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
3292 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
3293 }
3294 if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
3295 /* At least one has the "crashed primary" bit set,
3296 * both are primary now, but neither has rotated its UUIDs?
3297 * "Can not happen." */
3298 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
3299 return -100;
3300 }
3301 if (device->state.role == R_PRIMARY)
3302 return 1;
3303 return -1;
3304 }
3305
3306 /* Both are secondary.
3307 * Really looks like recovery from simultaneous hard crash.
3308 * Check which had been primary before, and arbitrate. */
3052 switch (rct) { 3309 switch (rct) {
3053 case 0: /* !self_pri && !peer_pri */ return 0; 3310 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
3054 case 1: /* self_pri && !peer_pri */ return 1; 3311 case 1: /* self_pri && !peer_pri */ return 1;
3055 case 2: /* !self_pri && peer_pri */ return -1; 3312 case 2: /* !self_pri && peer_pri */ return -1;
3056 case 3: /* self_pri && peer_pri */ 3313 case 3: /* self_pri && peer_pri */
@@ -3177,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3177 drbd_uuid_dump(device, "peer", device->p_uuid, 3434 drbd_uuid_dump(device, "peer", device->p_uuid,
3178 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3435 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3179 3436
3180 hg = drbd_uuid_compare(device, &rule_nr); 3437 hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3181 spin_unlock_irq(&device->ldev->md.uuid_lock); 3438 spin_unlock_irq(&device->ldev->md.uuid_lock);
3182 3439
3183 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); 3440 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
@@ -3186,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3186 drbd_alert(device, "Unrelated data, aborting!\n"); 3443 drbd_alert(device, "Unrelated data, aborting!\n");
3187 return C_MASK; 3444 return C_MASK;
3188 } 3445 }
3446 if (hg < -0x10000) {
3447 int proto, fflags;
3448 hg = -hg;
3449 proto = hg & 0xff;
3450 fflags = (hg >> 8) & 0xff;
3451 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3452 proto, fflags);
3453 return C_MASK;
3454 }
3189 if (hg < -1000) { 3455 if (hg < -1000) {
3190 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); 3456 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3191 return C_MASK; 3457 return C_MASK;
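
Rule 41 packs a "you need at least protocol X plus feature flags Y" hint into the negative return value of drbd_uuid_compare(); drbd_sync_handshake() recognizes anything below -0x10000 and unpacks it for the alert above. A tiny sketch of that encoding follows; encode_requirement() is hypothetical and the 101 used as the minimum protocol version is only a placeholder, not necessarily PRO_VERSION_MAX.

#include <stdio.h>

#define FF_WSAME 4	/* mirrors DRBD_FF_WSAME */

/* pack "need at least protocol <proto> and feature flags <fflags>"
 * into a single negative handshake result, as rule 41 does */
static int encode_requirement(int proto, int fflags)
{
	return -(0x10000 | proto | (fflags << 8));
}

int main(void)
{
	int hg = encode_requirement(101 /* placeholder protocol version */, FF_WSAME);

	if (hg < -0x10000) {		/* same test as drbd_sync_handshake() */
		int v = -hg;
		int proto = v & 0xff;
		int fflags = (v >> 8) & 0xff;

		printf("peer must support protocol >= %d, feature flags 0x%x\n",
		       proto, fflags);
	}
	return 0;
}
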
@@ -3415,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
3415 */ 3681 */
3416 3682
3417 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); 3683 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3418 if (!peer_integrity_tfm) { 3684 if (IS_ERR(peer_integrity_tfm)) {
3685 peer_integrity_tfm = NULL;
3419 drbd_err(connection, "peer data-integrity-alg %s not supported\n", 3686 drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3420 integrity_alg); 3687 integrity_alg);
3421 goto disconnect; 3688 goto disconnect;
@@ -3766,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3766 struct drbd_peer_device *peer_device; 4033 struct drbd_peer_device *peer_device;
3767 struct drbd_device *device; 4034 struct drbd_device *device;
3768 struct p_sizes *p = pi->data; 4035 struct p_sizes *p = pi->data;
4036 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
3769 enum determine_dev_size dd = DS_UNCHANGED; 4037 enum determine_dev_size dd = DS_UNCHANGED;
3770 sector_t p_size, p_usize, p_csize, my_usize; 4038 sector_t p_size, p_usize, p_csize, my_usize;
3771 int ldsc = 0; /* local disk size changed */ 4039 int ldsc = 0; /* local disk size changed */
@@ -3785,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3785 device->p_size = p_size; 4053 device->p_size = p_size;
3786 4054
3787 if (get_ldev(device)) { 4055 if (get_ldev(device)) {
4056 sector_t new_size, cur_size;
3788 rcu_read_lock(); 4057 rcu_read_lock();
3789 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; 4058 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
3790 rcu_read_unlock(); 4059 rcu_read_unlock();
@@ -3801,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3801 4070
3802 /* Never shrink a device with usable data during connect. 4071 /* Never shrink a device with usable data during connect.
3803 But allow online shrinking if we are connected. */ 4072 But allow online shrinking if we are connected. */
3804 if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < 4073 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
3805 drbd_get_capacity(device->this_bdev) && 4074 cur_size = drbd_get_capacity(device->this_bdev);
4075 if (new_size < cur_size &&
3806 device->state.disk >= D_OUTDATED && 4076 device->state.disk >= D_OUTDATED &&
3807 device->state.conn < C_CONNECTED) { 4077 device->state.conn < C_CONNECTED) {
3808 drbd_err(device, "The peer's disk size is too small!\n"); 4078 drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4079 (unsigned long long)new_size, (unsigned long long)cur_size);
3809 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4080 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3810 put_ldev(device); 4081 put_ldev(device);
3811 return -EIO; 4082 return -EIO;
@@ -3839,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3839 } 4110 }
3840 4111
3841 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 4112 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3842 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). 4113 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
3843 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 4114 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3844 drbd_reconsider_max_bio_size(), we can be sure that after 4115 drbd_reconsider_queue_parameters(), we can be sure that after
3845 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ 4116 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
3846 4117
3847 ddsf = be16_to_cpu(p->dds_flags); 4118 ddsf = be16_to_cpu(p->dds_flags);
3848 if (get_ldev(device)) { 4119 if (get_ldev(device)) {
3849 drbd_reconsider_max_bio_size(device, device->ldev); 4120 drbd_reconsider_queue_parameters(device, device->ldev, o);
3850 dd = drbd_determine_dev_size(device, ddsf, NULL); 4121 dd = drbd_determine_dev_size(device, ddsf, NULL);
3851 put_ldev(device); 4122 put_ldev(device);
3852 if (dd == DS_ERROR) 4123 if (dd == DS_ERROR)
@@ -3866,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3866 * However, if he sends a zero current size, 4137 * However, if he sends a zero current size,
3867 * take his (user-capped or) backing disk size anyways. 4138 * take his (user-capped or) backing disk size anyways.
3868 */ 4139 */
3869 drbd_reconsider_max_bio_size(device, NULL); 4140 drbd_reconsider_queue_parameters(device, NULL, o);
3870 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); 4141 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
3871 } 4142 }
3872 4143
@@ -4599,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
4599 return 0; 4870 return 0;
4600} 4871}
4601 4872
4873static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4874{
4875 struct drbd_peer_device *peer_device;
4876 struct p_block_desc *p = pi->data;
4877 struct drbd_device *device;
4878 sector_t sector;
4879 int size, err = 0;
4880
4881 peer_device = conn_peer_device(connection, pi->vnr);
4882 if (!peer_device)
4883 return -EIO;
4884 device = peer_device->device;
4885
4886 sector = be64_to_cpu(p->sector);
4887 size = be32_to_cpu(p->blksize);
4888
4889 dec_rs_pending(device);
4890
4891 if (get_ldev(device)) {
4892 struct drbd_peer_request *peer_req;
4893 const int op = REQ_OP_DISCARD;
4894
4895 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4896 size, 0, GFP_NOIO);
4897 if (!peer_req) {
4898 put_ldev(device);
4899 return -ENOMEM;
4900 }
4901
4902 peer_req->w.cb = e_end_resync_block;
4903 peer_req->submit_jif = jiffies;
4904 peer_req->flags |= EE_IS_TRIM;
4905
4906 spin_lock_irq(&device->resource->req_lock);
4907 list_add_tail(&peer_req->w.list, &device->sync_ee);
4908 spin_unlock_irq(&device->resource->req_lock);
4909
4910 atomic_add(pi->size >> 9, &device->rs_sect_ev);
4911 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
4912
4913 if (err) {
4914 spin_lock_irq(&device->resource->req_lock);
4915 list_del(&peer_req->w.list);
4916 spin_unlock_irq(&device->resource->req_lock);
4917
4918 drbd_free_peer_req(device, peer_req);
4919 put_ldev(device);
4920 err = 0;
4921 goto fail;
4922 }
4923
4924 inc_unacked(device);
4925
4926 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4927 as well as drbd_rs_complete_io() */
4928 } else {
4929 fail:
4930 drbd_rs_complete_io(device, sector);
4931 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
4932 }
4933
4934 atomic_add(size >> 9, &device->rs_sect_in);
4935
4936 return err;
4937}
4938
4602struct data_cmd { 4939struct data_cmd {
4603 int expect_payload; 4940 int expect_payload;
4604 size_t pkt_size; 4941 unsigned int pkt_size;
4605 int (*fn)(struct drbd_connection *, struct packet_info *); 4942 int (*fn)(struct drbd_connection *, struct packet_info *);
4606}; 4943};
4607 4944
@@ -4626,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = {
4626 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4963 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4627 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4964 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4628 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4965 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4966 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4629 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, 4967 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
4630 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, 4968 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4631 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 4969 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4632 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 4970 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4633 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, 4971 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
4972 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
4973 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
4634}; 4974};
4635 4975
4636static void drbdd(struct drbd_connection *connection) 4976static void drbdd(struct drbd_connection *connection)
@@ -4640,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection)
4640 int err; 4980 int err;
4641 4981
4642 while (get_t_state(&connection->receiver) == RUNNING) { 4982 while (get_t_state(&connection->receiver) == RUNNING) {
4643 struct data_cmd *cmd; 4983 struct data_cmd const *cmd;
4644 4984
4645 drbd_thread_current_set_cpu(&connection->receiver); 4985 drbd_thread_current_set_cpu(&connection->receiver);
4646 update_receiver_timing_details(connection, drbd_recv_header); 4986 update_receiver_timing_details(connection, drbd_recv_header);
@@ -4655,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection)
4655 } 4995 }
4656 4996
4657 shs = cmd->pkt_size; 4997 shs = cmd->pkt_size;
4998 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
4999 shs += sizeof(struct o_qlim);
4658 if (pi.size > shs && !cmd->expect_payload) { 5000 if (pi.size > shs && !cmd->expect_payload) {
4659 drbd_err(connection, "No payload expected %s l:%d\n", 5001 drbd_err(connection, "No payload expected %s l:%d\n",
4660 cmdname(pi.cmd), pi.size); 5002 cmdname(pi.cmd), pi.size);
4661 goto err_out; 5003 goto err_out;
4662 } 5004 }
5005 if (pi.size < shs) {
5006 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5007 cmdname(pi.cmd), (int)shs, pi.size);
5008 goto err_out;
5009 }
4663 5010
4664 if (shs) { 5011 if (shs) {
4665 update_receiver_timing_details(connection, drbd_recv_all_warn); 5012 update_receiver_timing_details(connection, drbd_recv_all_warn);
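
With the optional o_qlim appended to P_SIZES, the fixed header size drbdd() expects is no longer a constant, so both the existing "no payload expected" check and the new "packet too short" check compare against the adjusted shs. A compact sketch of that validation; check_p_sizes() is hypothetical and the struct sizes are made-up numbers, only the comparisons matter.

#include <stdio.h>

#define FF_WSAME 4	/* mirrors DRBD_FF_WSAME */

/* made-up sizes standing in for sizeof(struct p_sizes) / sizeof(struct o_qlim) */
enum { SIZEOF_P_SIZES = 40, SIZEOF_O_QLIM = 24 };

/* return 0 if the received size is acceptable, -1 otherwise */
static int check_p_sizes(unsigned int agreed_features, int expect_payload,
			 unsigned int received_size)
{
	unsigned int shs = SIZEOF_P_SIZES;

	if (agreed_features & FF_WSAME)
		shs += SIZEOF_O_QLIM;	/* peer appends its queue limits */
	if (received_size > shs && !expect_payload) {
		fprintf(stderr, "no payload expected, got %u > %u\n",
			received_size, shs);
		return -1;
	}
	if (received_size < shs) {
		fprintf(stderr, "packet too short: expected %u, got %u\n",
			shs, received_size);
		return -1;
	}
	return 0;
}

int main(void)
{
	/* old peer without WSAME: a plain p_sizes is fine */
	printf("%d\n", check_p_sizes(0, 0, SIZEOF_P_SIZES));
	/* WSAME agreed, but the qlim block is missing: rejected */
	printf("%d\n", check_p_sizes(FF_WSAME, 0, SIZEOF_P_SIZES));
	return 0;
}
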
@@ -4795,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
4795 5142
4796 drbd_md_sync(device); 5143 drbd_md_sync(device);
4797 5144
4798 /* serialize with bitmap writeout triggered by the state change, 5145 if (get_ldev(device)) {
4799 * if any. */ 5146 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
4800 wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); 5147 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5148 put_ldev(device);
5149 }
4801 5150
4802 /* tcp_close and release of sendpage pages can be deferred. I don't 5151 /* tcp_close and release of sendpage pages can be deferred. I don't
4803 * want to use SO_LINGER, because apparently it can be deferred for 5152 * want to use SO_LINGER, because apparently it can be deferred for
@@ -4904,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection)
4904 drbd_info(connection, "Handshake successful: " 5253 drbd_info(connection, "Handshake successful: "
4905 "Agreed network protocol version %d\n", connection->agreed_pro_version); 5254 "Agreed network protocol version %d\n", connection->agreed_pro_version);
4906 5255
4907 drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", 5256 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
4908 connection->agreed_features & FF_TRIM ? " " : " not "); 5257 connection->agreed_features,
5258 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5259 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5260 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
5261 connection->agreed_features ? "" : " none");
4909 5262
4910 return 1; 5263 return 1;
4911 5264
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index eef6e9575b4e..66b8e4bb74d8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
47 &device->vdisk->part0, req->start_jif); 47 &device->vdisk->part0, req->start_jif);
48} 48}
49 49
50static struct drbd_request *drbd_req_new(struct drbd_device *device, 50static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
51 struct bio *bio_src)
52{ 51{
53 struct drbd_request *req; 52 struct drbd_request *req;
54 53
@@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
58 memset(req, 0, sizeof(*req)); 57 memset(req, 0, sizeof(*req));
59 58
60 drbd_req_make_private_bio(req, bio_src); 59 drbd_req_make_private_bio(req, bio_src);
61 req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; 60 req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
62 req->device = device; 61 | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
63 req->master_bio = bio_src; 62 | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
64 req->epoch = 0; 63 req->device = device;
64 req->master_bio = bio_src;
65 req->epoch = 0;
65 66
66 drbd_clear_interval(&req->i); 67 drbd_clear_interval(&req->i);
67 req->i.sector = bio_src->bi_iter.bi_sector; 68 req->i.sector = bio_src->bi_iter.bi_sector;
@@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
218{ 219{
219 const unsigned s = req->rq_state; 220 const unsigned s = req->rq_state;
220 struct drbd_device *device = req->device; 221 struct drbd_device *device = req->device;
221 int rw;
222 int error, ok; 222 int error, ok;
223 223
224 /* we must not complete the master bio, while it is 224 /* we must not complete the master bio, while it is
@@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
242 return; 242 return;
243 } 243 }
244 244
245 rw = bio_rw(req->master_bio);
246
247 /* 245 /*
248 * figure out whether to report success or failure. 246 * figure out whether to report success or failure.
249 * 247 *
@@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
267 * epoch number. If they match, increase the current_tle_nr, 265 * epoch number. If they match, increase the current_tle_nr,
268 * and reset the transfer log epoch write_cnt. 266 * and reset the transfer log epoch write_cnt.
269 */ 267 */
270 if (rw == WRITE && 268 if (op_is_write(bio_op(req->master_bio)) &&
271 req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) 269 req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
272 start_new_tl_epoch(first_peer_device(device)->connection); 270 start_new_tl_epoch(first_peer_device(device)->connection);
273 271
@@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
284 * because no path was available, in which case 282 * because no path was available, in which case
285 * it was not even added to the transfer_log. 283 * it was not even added to the transfer_log.
286 * 284 *
287 * READA may fail, and will not be retried. 285 * read-ahead may fail, and will not be retried.
288 * 286 *
289 * WRITE should have used all available paths already. 287 * WRITE should have used all available paths already.
290 */ 288 */
291 if (!ok && rw == READ && !list_empty(&req->tl_requests)) 289 if (!ok &&
290 bio_op(req->master_bio) == REQ_OP_READ &&
291 !(req->master_bio->bi_rw & REQ_RAHEAD) &&
292 !list_empty(&req->tl_requests))
292 req->rq_state |= RQ_POSTPONED; 293 req->rq_state |= RQ_POSTPONED;
293 294
294 if (!(req->rq_state & RQ_POSTPONED)) { 295 if (!(req->rq_state & RQ_POSTPONED)) {
@@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
644 __drbd_chk_io_error(device, DRBD_READ_ERROR); 645 __drbd_chk_io_error(device, DRBD_READ_ERROR);
645 /* fall through. */ 646 /* fall through. */
646 case READ_AHEAD_COMPLETED_WITH_ERROR: 647 case READ_AHEAD_COMPLETED_WITH_ERROR:
647 /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ 648 /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */
648 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 649 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
649 break; 650 break;
650 651
@@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
656 break; 657 break;
657 658
658 case QUEUE_FOR_NET_READ: 659 case QUEUE_FOR_NET_READ:
659 /* READ or READA, and 660 /* READ, and
660 * no local disk, 661 * no local disk,
661 * or target area marked as invalid, 662 * or target area marked as invalid,
662 * or just got an io-error. */ 663 * or just got an io-error. */
@@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req)
977 sector_t sector = req->i.sector; 978 sector_t sector = req->i.sector;
978 int size = req->i.size; 979 int size = req->i.size;
979 980
980 i = drbd_find_overlap(&device->write_requests, sector, size);
981 if (!i)
982 return;
983
984 for (;;) { 981 for (;;) {
985 prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 982 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
986 i = drbd_find_overlap(&device->write_requests, sector, size); 983 /* Ignore, if already completed to upper layers. */
987 if (!i) 984 if (i->completed)
985 continue;
986 /* Handle the first found overlap. After the schedule
987 * we have to restart the tree walk. */
988 break;
989 }
990 if (!i) /* if any */
988 break; 991 break;
992
989 /* Indicate to wake up device->misc_wait on progress. */ 993 /* Indicate to wake up device->misc_wait on progress. */
994 prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
990 i->waiting = true; 995 i->waiting = true;
991 spin_unlock_irq(&device->resource->req_lock); 996 spin_unlock_irq(&device->resource->req_lock);
992 schedule(); 997 schedule();
@@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req)
995 finish_wait(&device->misc_wait, &wait); 1000 finish_wait(&device->misc_wait, &wait);
996} 1001}
997 1002
998/* called within req_lock and rcu_read_lock() */ 1003/* called within req_lock */
999static void maybe_pull_ahead(struct drbd_device *device) 1004static void maybe_pull_ahead(struct drbd_device *device)
1000{ 1005{
1001 struct drbd_connection *connection = first_peer_device(device)->connection; 1006 struct drbd_connection *connection = first_peer_device(device)->connection;
@@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req)
1152 return remote; 1157 return remote;
1153} 1158}
1154 1159
1160static void drbd_process_discard_req(struct drbd_request *req)
1161{
1162 int err = drbd_issue_discard_or_zero_out(req->device,
1163 req->i.sector, req->i.size >> 9, true);
1164
1165 if (err)
1166 req->private_bio->bi_error = -EIO;
1167 bio_endio(req->private_bio);
1168}
1169
1155static void 1170static void
1156drbd_submit_req_private_bio(struct drbd_request *req) 1171drbd_submit_req_private_bio(struct drbd_request *req)
1157{ 1172{
1158 struct drbd_device *device = req->device; 1173 struct drbd_device *device = req->device;
1159 struct bio *bio = req->private_bio; 1174 struct bio *bio = req->private_bio;
1160 const int rw = bio_rw(bio); 1175 unsigned int type;
1176
1177 if (bio_op(bio) != REQ_OP_READ)
1178 type = DRBD_FAULT_DT_WR;
1179 else if (bio->bi_rw & REQ_RAHEAD)
1180 type = DRBD_FAULT_DT_RA;
1181 else
1182 type = DRBD_FAULT_DT_RD;
1161 1183
1162 bio->bi_bdev = device->ldev->backing_bdev; 1184 bio->bi_bdev = device->ldev->backing_bdev;
1163 1185
@@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1167 * stable storage, and this is a WRITE, we may not even submit 1189 * stable storage, and this is a WRITE, we may not even submit
1168 * this bio. */ 1190 * this bio. */
1169 if (get_ldev(device)) { 1191 if (get_ldev(device)) {
1170 if (drbd_insert_fault(device, 1192 if (drbd_insert_fault(device, type))
1171 rw == WRITE ? DRBD_FAULT_DT_WR
1172 : rw == READ ? DRBD_FAULT_DT_RD
1173 : DRBD_FAULT_DT_RA))
1174 bio_io_error(bio); 1193 bio_io_error(bio);
1194 else if (bio_op(bio) == REQ_OP_DISCARD)
1195 drbd_process_discard_req(req);
1175 else 1196 else
1176 generic_make_request(bio); 1197 generic_make_request(bio);
1177 put_ldev(device); 1198 put_ldev(device);
@@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
1223 /* Update disk stats */ 1244 /* Update disk stats */
1224 _drbd_start_io_acct(device, req); 1245 _drbd_start_io_acct(device, req);
1225 1246
1247 /* process discards always from our submitter thread */
1248 if (bio_op(bio) & REQ_OP_DISCARD)
1249 goto queue_for_submitter_thread;
1250
1226 if (rw == WRITE && req->private_bio && req->i.size 1251 if (rw == WRITE && req->private_bio && req->i.size
1227 && !test_bit(AL_SUSPENDED, &device->flags)) { 1252 && !test_bit(AL_SUSPENDED, &device->flags)) {
1228 if (!drbd_al_begin_io_fastpath(device, &req->i)) { 1253 if (!drbd_al_begin_io_fastpath(device, &req->i))
1229 atomic_inc(&device->ap_actlog_cnt); 1254 goto queue_for_submitter_thread;
1230 drbd_queue_write(device, req);
1231 return NULL;
1232 }
1233 req->rq_state |= RQ_IN_ACT_LOG; 1255 req->rq_state |= RQ_IN_ACT_LOG;
1234 req->in_actlog_jif = jiffies; 1256 req->in_actlog_jif = jiffies;
1235 } 1257 }
1236
1237 return req; 1258 return req;
1259
1260 queue_for_submitter_thread:
1261 atomic_inc(&device->ap_actlog_cnt);
1262 drbd_queue_write(device, req);
1263 return NULL;
1264}
1265
1266/* Require at least one path to current data.
1267 * We don't want to allow writes on C_STANDALONE D_INCONSISTENT:
1268 * We would not allow reading what was written,
1269 * we would not have bumped the data generation uuids,
1270 * we would cause data divergence for all the wrong reasons.
1271 *
1272 * If we don't see at least one D_UP_TO_DATE, we will fail this request,
1273 * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO,
1274 * and queues for retry later.
1275 */
1276static bool may_do_writes(struct drbd_device *device)
1277{
1278 const union drbd_dev_state s = device->state;
1279 return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE;
1238} 1280}
1239 1281
1240static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) 1282static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
1241{ 1283{
1242 struct drbd_resource *resource = device->resource; 1284 struct drbd_resource *resource = device->resource;
1243 const int rw = bio_rw(req->master_bio); 1285 const int rw = bio_data_dir(req->master_bio);
1244 struct bio_and_error m = { NULL, }; 1286 struct bio_and_error m = { NULL, };
1245 bool no_remote = false; 1287 bool no_remote = false;
1246 bool submit_private_bio = false; 1288 bool submit_private_bio = false;
@@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1270 goto out; 1312 goto out;
1271 } 1313 }
1272 1314
1273 /* We fail READ/READA early, if we can not serve it. 1315 /* We fail READ early, if we can not serve it.
1274 * We must do this before req is registered on any lists. 1316 * We must do this before req is registered on any lists.
1275 * Otherwise, drbd_req_complete() will queue failed READ for retry. */ 1317 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
1276 if (rw != WRITE) { 1318 if (rw != WRITE) {
@@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1291 } 1333 }
1292 1334
1293 if (rw == WRITE) { 1335 if (rw == WRITE) {
1336 if (req->private_bio && !may_do_writes(device)) {
1337 bio_put(req->private_bio);
1338 req->private_bio = NULL;
1339 put_ldev(device);
1340 goto nodata;
1341 }
1294 if (!drbd_process_write_request(req)) 1342 if (!drbd_process_write_request(req))
1295 no_remote = true; 1343 no_remote = true;
1296 } else { 1344 } else {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index bb2ef78165e5..eb49e7f2da91 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -206,6 +206,8 @@ enum drbd_req_state_bits {
206 206
207 /* Set when this is a write, clear for a read */ 207 /* Set when this is a write, clear for a read */
208 __RQ_WRITE, 208 __RQ_WRITE,
209 __RQ_WSAME,
210 __RQ_UNMAP,
209 211
210 /* Should call drbd_al_complete_io() for this request... */ 212 /* Should call drbd_al_complete_io() for this request... */
211 __RQ_IN_ACT_LOG, 213 __RQ_IN_ACT_LOG,
@@ -241,10 +243,11 @@ enum drbd_req_state_bits {
241#define RQ_NET_OK (1UL << __RQ_NET_OK) 243#define RQ_NET_OK (1UL << __RQ_NET_OK)
242#define RQ_NET_SIS (1UL << __RQ_NET_SIS) 244#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
243 245
244/* 0x1f8 */
245#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) 246#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
246 247
247#define RQ_WRITE (1UL << __RQ_WRITE) 248#define RQ_WRITE (1UL << __RQ_WRITE)
249#define RQ_WSAME (1UL << __RQ_WSAME)
250#define RQ_UNMAP (1UL << __RQ_UNMAP)
248#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) 251#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
249#define RQ_POSTPONED (1UL << __RQ_POSTPONED) 252#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
250#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) 253#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 5a7ef7873b67..eea0c4aec978 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
814 } 814 }
815 815
816 if (rv <= 0) 816 if (rv <= 0)
817 /* already found a reason to abort */; 817 goto out; /* already found a reason to abort */
818 else if (ns.role == R_SECONDARY && device->open_cnt) 818 else if (ns.role == R_SECONDARY && device->open_cnt)
819 rv = SS_DEVICE_IN_USE; 819 rv = SS_DEVICE_IN_USE;
820 820
@@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
862 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) 862 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
863 rv = SS_CONNECTED_OUTDATES; 863 rv = SS_CONNECTED_OUTDATES;
864 864
865out:
865 rcu_read_unlock(); 866 rcu_read_unlock();
866 867
867 return rv; 868 return rv;
@@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
906 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) 907 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
907 rv = SS_IN_TRANSIENT_STATE; 908 rv = SS_IN_TRANSIENT_STATE;
908 909
910 /* Do not promote during resync handshake triggered by "force primary".
911 * This is a hack. It should really be rejected by the peer during the
912 * cluster wide state change request. */
913 if (os.role != R_PRIMARY && ns.role == R_PRIMARY
914 && ns.pdsk == D_UP_TO_DATE
915 && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
916 && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
917 rv = SS_IN_TRANSIENT_STATE;
918
909 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 919 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
910 rv = SS_NEED_CONNECTION; 920 rv = SS_NEED_CONNECTION;
911 921
@@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
1628#undef REMEMBER_STATE_CHANGE 1638#undef REMEMBER_STATE_CHANGE
1629} 1639}
1630 1640
1641/* takes old and new peer disk state */
1642static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
1643{
1644 if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
1645 && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
1646 return true;
1647
1648 /* Scenario, starting with normal operation
1649 * Connected Primary/Secondary UpToDate/UpToDate
1650 * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
1651 * ...
1652 * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
1653 */
1654 if (os == D_UNKNOWN
1655 && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
1656 return true;
1657
1658 return false;
1659}
1660
1631/** 1661/**
1632 * after_state_ch() - Perform after state change actions that may sleep 1662 * after_state_ch() - Perform after state change actions that may sleep
1633 * @device: DRBD device. 1663 * @device: DRBD device.
@@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1675 what = RESEND; 1705 what = RESEND;
1676 1706
1677 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && 1707 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1678 conn_lowest_disk(connection) > D_NEGOTIATING) 1708 conn_lowest_disk(connection) == D_UP_TO_DATE)
1679 what = RESTART_FROZEN_DISK_IO; 1709 what = RESTART_FROZEN_DISK_IO;
1680 1710
1681 if (resource->susp_nod && what != NOTHING) { 1711 if (resource->susp_nod && what != NOTHING) {
@@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1699 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1729 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1700 clear_bit(NEW_CUR_UUID, &peer_device->device->flags); 1730 clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
1701 rcu_read_unlock(); 1731 rcu_read_unlock();
1732
1733 /* We should actively create a new uuid, _before_
 1734 * we resume/resend, if the peer is diskless
1735 * (recovery from a multiple error scenario).
1736 * Currently, this happens with a slight delay
1737 * below when checking lost_contact_to_peer_data() ...
1738 */
1702 _tl_restart(connection, RESEND); 1739 _tl_restart(connection, RESEND);
1703 _conn_request_state(connection, 1740 _conn_request_state(connection,
1704 (union drbd_state) { { .susp_fen = 1 } }, 1741 (union drbd_state) { { .susp_fen = 1 } },
@@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1742 BM_LOCKED_TEST_ALLOWED); 1779 BM_LOCKED_TEST_ALLOWED);
1743 1780
1744 /* Lost contact to peer's copy of the data */ 1781 /* Lost contact to peer's copy of the data */
1745 if ((os.pdsk >= D_INCONSISTENT && 1782 if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
1746 os.pdsk != D_UNKNOWN &&
1747 os.pdsk != D_OUTDATED)
1748 && (ns.pdsk < D_INCONSISTENT ||
1749 ns.pdsk == D_UNKNOWN ||
1750 ns.pdsk == D_OUTDATED)) {
1751 if (get_ldev(device)) { 1783 if (get_ldev(device)) {
1752 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1784 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1753 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1785 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
@@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1934 1966
1935 /* This triggers bitmap writeout of potentially still unwritten pages 1967 /* This triggers bitmap writeout of potentially still unwritten pages
1936 * if the resync finished cleanly, or aborted because of peer disk 1968 * if the resync finished cleanly, or aborted because of peer disk
1937 * failure, or because of connection loss. 1969 * failure, or on transition from resync back to AHEAD/BEHIND.
1970 *
1971 * Connection loss is handled in drbd_disconnected() by the receiver.
1972 *
1938 * For resync aborted because of local disk failure, we cannot do 1973 * For resync aborted because of local disk failure, we cannot do
1939 * any bitmap writeout anymore. 1974 * any bitmap writeout anymore.
1975 *
1940 * No harm done if some bits change during this phase. 1976 * No harm done if some bits change during this phase.
1941 */ 1977 */
1942 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { 1978 if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
1979 (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
1943 drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, 1980 drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
1944 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); 1981 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1945 put_ldev(device); 1982 put_ldev(device);
@@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
2160 ns.disk = os.disk; 2197 ns.disk = os.disk;
2161 2198
2162 rv = _drbd_set_state(device, ns, flags, NULL); 2199 rv = _drbd_set_state(device, ns, flags, NULL);
2163 if (rv < SS_SUCCESS) 2200 BUG_ON(rv < SS_SUCCESS);
2164 BUG();
2165
2166 ns.i = device->state.i; 2201 ns.i = device->state.i;
2167 ns_max.role = max_role(ns.role, ns_max.role); 2202 ns_max.role = max_role(ns.role, ns_max.role);
2168 ns_max.peer = max_role(ns.peer, ns_max.peer); 2203 ns_max.peer = max_role(ns.peer, ns_max.peer);
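
The lost_contact_to_peer_data() helper introduced above reads as range checks over enum drbd_disk_state, so its meaning depends entirely on the enum ordering. A standalone, hedged illustration of the first clause; the enum values below are an assumption for readability (the authoritative ordering lives in include/linux/drbd.h), and the second clause (os == D_UNKNOWN resuming straight to Diskless/Failed/Outdated) is handled separately in the helper itself:

    #include <stdbool.h>

    /* Assumed ordering, for illustration only. */
    enum disk_state_sketch {
            SK_DISKLESS, SK_ATTACHING, SK_FAILED, SK_NEGOTIATING,
            SK_INCONSISTENT, SK_OUTDATED, SK_UNKNOWN, SK_CONSISTENT, SK_UP_TO_DATE,
    };

    /* "The peer's data used to be usable and no longer is." */
    static bool lost_usable_peer_data(enum disk_state_sketch os, enum disk_state_sketch ns)
    {
            bool was_usable = os >= SK_INCONSISTENT && os != SK_UNKNOWN && os != SK_OUTDATED;
            bool is_usable  = ns >= SK_INCONSISTENT && ns != SK_UNKNOWN && ns != SK_OUTDATED;
            return was_usable && !is_usable;
    }
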
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index bd989536f888..6c9d5d4a8a75 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device);
140extern bool conn_all_vols_unconf(struct drbd_connection *connection); 140extern bool conn_all_vols_unconf(struct drbd_connection *connection);
141 141
142/** 142/**
143 * drbd_request_state() - Reqest a state change 143 * drbd_request_state() - Request a state change
144 * @device: DRBD device. 144 * @device: DRBD device.
145 * @mask: mask of state bits to change. 145 * @mask: mask of state bits to change.
146 * @val: value of new state bits. 146 * @val: value of new state bits.
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 80b0f63c7075..0eeab14776e9 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -26,7 +26,7 @@
26#include <linux/drbd.h> 26#include <linux/drbd.h>
27#include "drbd_strings.h" 27#include "drbd_strings.h"
28 28
29static const char *drbd_conn_s_names[] = { 29static const char * const drbd_conn_s_names[] = {
30 [C_STANDALONE] = "StandAlone", 30 [C_STANDALONE] = "StandAlone",
31 [C_DISCONNECTING] = "Disconnecting", 31 [C_DISCONNECTING] = "Disconnecting",
32 [C_UNCONNECTED] = "Unconnected", 32 [C_UNCONNECTED] = "Unconnected",
@@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = {
53 [C_BEHIND] = "Behind", 53 [C_BEHIND] = "Behind",
54}; 54};
55 55
56static const char *drbd_role_s_names[] = { 56static const char * const drbd_role_s_names[] = {
57 [R_PRIMARY] = "Primary", 57 [R_PRIMARY] = "Primary",
58 [R_SECONDARY] = "Secondary", 58 [R_SECONDARY] = "Secondary",
59 [R_UNKNOWN] = "Unknown" 59 [R_UNKNOWN] = "Unknown"
60}; 60};
61 61
62static const char *drbd_disk_s_names[] = { 62static const char * const drbd_disk_s_names[] = {
63 [D_DISKLESS] = "Diskless", 63 [D_DISKLESS] = "Diskless",
64 [D_ATTACHING] = "Attaching", 64 [D_ATTACHING] = "Attaching",
65 [D_FAILED] = "Failed", 65 [D_FAILED] = "Failed",
@@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = {
71 [D_UP_TO_DATE] = "UpToDate", 71 [D_UP_TO_DATE] = "UpToDate",
72}; 72};
73 73
74static const char *drbd_state_sw_errors[] = { 74static const char * const drbd_state_sw_errors[] = {
75 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", 75 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
76 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", 76 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
77 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", 77 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
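
The string tables above gain a second const: "const char * const" makes the pointer array itself read-only, not just the strings it points to, so the whole table can live in read-only data and accidental writes are rejected at compile time. A minimal standalone illustration:

    #include <stdio.h>

    static const char *names[]           = { "Primary", "Secondary" };
    static const char * const names_ro[] = { "Primary", "Secondary" };

    int main(void)
    {
            names[0] = "Unknown";          /* legal: only the characters are const   */
            /* names_ro[0] = "Unknown"; */ /* error: the pointers are const as well  */
            printf("%s %s\n", names[0], names_ro[1]);
            return 0;
    }
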
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 51fab978eb61..35dbb3dca47e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio)
173{ 173{
174 struct drbd_peer_request *peer_req = bio->bi_private; 174 struct drbd_peer_request *peer_req = bio->bi_private;
175 struct drbd_device *device = peer_req->peer_device->device; 175 struct drbd_device *device = peer_req->peer_device->device;
176 int is_write = bio_data_dir(bio) == WRITE; 176 bool is_write = bio_data_dir(bio) == WRITE;
177 int is_discard = !!(bio_op(bio) == REQ_OP_DISCARD); 177 bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
178 178
179 if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) 179 if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
180 drbd_warn(device, "%s: error=%d s=%llus\n", 180 drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio)
248 248
249 /* to avoid recursion in __req_mod */ 249 /* to avoid recursion in __req_mod */
250 if (unlikely(bio->bi_error)) { 250 if (unlikely(bio->bi_error)) {
251 if (bio_op(bio) == REQ_OP_DISCARD) 251 switch (bio_op(bio)) {
252 what = (bio->bi_error == -EOPNOTSUPP) 252 case REQ_OP_DISCARD:
253 ? DISCARD_COMPLETED_NOTSUPP 253 if (bio->bi_error == -EOPNOTSUPP)
254 : DISCARD_COMPLETED_WITH_ERROR; 254 what = DISCARD_COMPLETED_NOTSUPP;
255 else 255 else
256 what = (bio_data_dir(bio) == WRITE) 256 what = DISCARD_COMPLETED_WITH_ERROR;
257 ? WRITE_COMPLETED_WITH_ERROR 257 break;
258 : (bio_rw(bio) == READ) 258 case REQ_OP_READ:
259 ? READ_COMPLETED_WITH_ERROR 259 if (bio->bi_rw & REQ_RAHEAD)
260 : READ_AHEAD_COMPLETED_WITH_ERROR; 260 what = READ_AHEAD_COMPLETED_WITH_ERROR;
261 } else 261 else
262 what = READ_COMPLETED_WITH_ERROR;
263 break;
264 default:
265 what = WRITE_COMPLETED_WITH_ERROR;
266 break;
267 }
268 } else {
262 what = COMPLETED_OK; 269 what = COMPLETED_OK;
270 }
263 271
264 bio_put(req->private_bio); 272 bio_put(req->private_bio);
265 req->private_bio = ERR_PTR(bio->bi_error); 273 req->private_bio = ERR_PTR(bio->bi_error);
@@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
320 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); 328 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
321 ahash_request_set_crypt(req, &sg, NULL, sg.length); 329 ahash_request_set_crypt(req, &sg, NULL, sg.length);
322 crypto_ahash_update(req); 330 crypto_ahash_update(req);
331 /* REQ_OP_WRITE_SAME has only one segment,
332 * checksum the payload only once. */
333 if (bio_op(bio) == REQ_OP_WRITE_SAME)
334 break;
323 } 335 }
324 ahash_request_set_crypt(req, NULL, digest, 0); 336 ahash_request_set_crypt(req, NULL, digest, 0);
325 crypto_ahash_final(req); 337 crypto_ahash_final(req);
@@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
387 /* GFP_TRY, because if there is no memory available right now, this may 399 /* GFP_TRY, because if there is no memory available right now, this may
388 * be rescheduled for later. It is "only" background resync, after all. */ 400 * be rescheduled for later. It is "only" background resync, after all. */
389 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, 401 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
390 size, true /* has real payload */, GFP_TRY); 402 size, size, GFP_TRY);
391 if (!peer_req) 403 if (!peer_req)
392 goto defer; 404 goto defer;
393 405
@@ -583,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
583 int number, rollback_i, size; 595 int number, rollback_i, size;
584 int align, requeue = 0; 596 int align, requeue = 0;
585 int i = 0; 597 int i = 0;
598 int discard_granularity = 0;
586 599
587 if (unlikely(cancel)) 600 if (unlikely(cancel))
588 return 0; 601 return 0;
@@ -602,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
602 return 0; 615 return 0;
603 } 616 }
604 617
618 if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
619 rcu_read_lock();
620 discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
621 rcu_read_unlock();
622 }
623
605 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; 624 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
606 number = drbd_rs_number_requests(device); 625 number = drbd_rs_number_requests(device);
607 if (number <= 0) 626 if (number <= 0)
@@ -666,6 +685,9 @@ next_sector:
666 if (sector & ((1<<(align+3))-1)) 685 if (sector & ((1<<(align+3))-1))
667 break; 686 break;
668 687
688 if (discard_granularity && size == discard_granularity)
689 break;
690
669 /* do not cross extent boundaries */ 691 /* do not cross extent boundaries */
670 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) 692 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
671 break; 693 break;
@@ -712,7 +734,8 @@ next_sector:
712 int err; 734 int err;
713 735
714 inc_rs_pending(device); 736 inc_rs_pending(device);
715 err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, 737 err = drbd_send_drequest(peer_device,
738 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
716 sector, size, ID_SYNCER); 739 sector, size, ID_SYNCER);
717 if (err) { 740 if (err) {
718 drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); 741 drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -829,6 +852,7 @@ static void ping_peer(struct drbd_device *device)
829 852
830int drbd_resync_finished(struct drbd_device *device) 853int drbd_resync_finished(struct drbd_device *device)
831{ 854{
855 struct drbd_connection *connection = first_peer_device(device)->connection;
832 unsigned long db, dt, dbdt; 856 unsigned long db, dt, dbdt;
833 unsigned long n_oos; 857 unsigned long n_oos;
834 union drbd_state os, ns; 858 union drbd_state os, ns;
@@ -850,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device)
850 if (dw) { 874 if (dw) {
851 dw->w.cb = w_resync_finished; 875 dw->w.cb = w_resync_finished;
852 dw->device = device; 876 dw->device = device;
853 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 877 drbd_queue_work(&connection->sender_work, &dw->w);
854 &dw->w);
855 return 1; 878 return 1;
856 } 879 }
857 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); 880 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
@@ -964,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device)
964 _drbd_set_state(device, ns, CS_VERBOSE, NULL); 987 _drbd_set_state(device, ns, CS_VERBOSE, NULL);
965out_unlock: 988out_unlock:
966 spin_unlock_irq(&device->resource->req_lock); 989 spin_unlock_irq(&device->resource->req_lock);
990
991 /* If we have been sync source, and have an effective fencing-policy,
992 * once *all* volumes are back in sync, call "unfence". */
993 if (os.conn == C_SYNC_SOURCE) {
994 enum drbd_disk_state disk_state = D_MASK;
995 enum drbd_disk_state pdsk_state = D_MASK;
996 enum drbd_fencing_p fp = FP_DONT_CARE;
997
998 rcu_read_lock();
999 fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1000 if (fp != FP_DONT_CARE) {
1001 struct drbd_peer_device *peer_device;
1002 int vnr;
1003 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1004 struct drbd_device *device = peer_device->device;
1005 disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
1006 pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
1007 }
1008 }
1009 rcu_read_unlock();
1010 if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
1011 conn_khelper(connection, "unfence-peer");
1012 }
1013
967 put_ldev(device); 1014 put_ldev(device);
968out: 1015out:
969 device->rs_total = 0; 1016 device->rs_total = 0;
@@ -1000,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_
1000 1047
1001/** 1048/**
1002 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST 1049 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
1003 * @device: DRBD device.
1004 * @w: work object. 1050 * @w: work object.
1005 * @cancel: The connection will be closed anyways 1051 * @cancel: The connection will be closed anyways
1006 */ 1052 */
@@ -1036,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
1036 return err; 1082 return err;
1037} 1083}
1038 1084
1085static bool all_zero(struct drbd_peer_request *peer_req)
1086{
1087 struct page *page = peer_req->pages;
1088 unsigned int len = peer_req->i.size;
1089
1090 page_chain_for_each(page) {
1091 unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
1092 unsigned int i, words = l / sizeof(long);
1093 unsigned long *d;
1094
1095 d = kmap_atomic(page);
1096 for (i = 0; i < words; i++) {
1097 if (d[i]) {
1098 kunmap_atomic(d);
1099 return false;
1100 }
1101 }
1102 kunmap_atomic(d);
1103 len -= l;
1104 }
1105
1106 return true;
1107}
1108
1039/** 1109/**
1040 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST 1110 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1041 * @w: work object. 1111 * @w: work object.
@@ -1064,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1064 } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 1134 } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1065 if (likely(device->state.pdsk >= D_INCONSISTENT)) { 1135 if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1066 inc_rs_pending(device); 1136 inc_rs_pending(device);
1067 err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); 1137 if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
1138 err = drbd_send_rs_deallocated(peer_device, peer_req);
1139 else
1140 err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1068 } else { 1141 } else {
1069 if (__ratelimit(&drbd_ratelimit_state)) 1142 if (__ratelimit(&drbd_ratelimit_state))
1070 drbd_err(device, "Not sending RSDataReply, " 1143 drbd_err(device, "Not sending RSDataReply, "
@@ -1634,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct
1634 rcu_read_unlock(); 1707 rcu_read_unlock();
1635 return connection->agreed_pro_version >= 89 && /* supported? */ 1708 return connection->agreed_pro_version >= 89 && /* supported? */
1636 connection->csums_tfm && /* configured? */ 1709 connection->csums_tfm && /* configured? */
1637 (csums_after_crash_only == 0 /* use for each resync? */ 1710 (csums_after_crash_only == false /* use for each resync? */
1638 || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ 1711 || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
1639} 1712}
1640 1713
@@ -1769,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1769 device->bm_resync_fo = 0; 1842 device->bm_resync_fo = 0;
1770 device->use_csums = use_checksum_based_resync(connection, device); 1843 device->use_csums = use_checksum_based_resync(connection, device);
1771 } else { 1844 } else {
1772 device->use_csums = 0; 1845 device->use_csums = false;
1773 } 1846 }
1774 1847
1775 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid 1848 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
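
Several of the drbd_worker.c hunks above implement the thin-resync path: make_resync_request() caps a request at rs_discard_granularity and sends P_RS_THIN_REQ for exactly that size, and w_e_end_rsdata_req() answers such a request with a "deallocated" reply instead of shipping data when all_zero() finds nothing but zeroes. The scan itself is cheap: map each page and test it one machine word at a time. A standalone sketch of that scan over a flat buffer (assuming, as for resync blocks, that the length is a multiple of sizeof(long)):

    #include <stdbool.h>
    #include <stddef.h>

    /* True if the buffer contains only zero bytes; len must be a multiple of sizeof(long). */
    static bool buf_all_zero(const void *buf, size_t len)
    {
            const unsigned long *d = buf;
            size_t i, words = len / sizeof(long);

            for (i = 0; i < words; i++)
                    if (d[i])
                            return false;
            return true;
    }
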
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f9bfecd733a8..c557057fe8ae 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4350,8 +4350,7 @@ static int __init do_floppy_init(void)
4350 /* to be cleaned up... */ 4350 /* to be cleaned up... */
4351 disks[drive]->private_data = (void *)(long)drive; 4351 disks[drive]->private_data = (void *)(long)drive;
4352 disks[drive]->flags |= GENHD_FL_REMOVABLE; 4352 disks[drive]->flags |= GENHD_FL_REMOVABLE;
4353 disks[drive]->driverfs_dev = &floppy_device[drive].dev; 4353 device_add_disk(&floppy_device[drive].dev, disks[drive]);
4354 add_disk(disks[drive]);
4355 } 4354 }
4356 4355
4357 return 0; 4356 return 0;
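
floppy is the first of many drivers in this series converted from the two-step "set driverfs_dev, then add_disk()" to the device_add_disk() helper, which takes the parent device explicitly; mtip32xx, ps3disk, ps3vram, rsxx, skd, sunvdc, virtio_blk, xen-blkfront, ide-cd and ide-gd follow the same pattern below. A hedged before/after sketch with placeholder names; the "before" form no longer compiles once the driverfs_dev field is removed:

    /* before: parent linkage and registration were separate steps */
    disk->driverfs_dev = &pdev->dev;
    add_disk(disk);

    /* after: one call, parent passed explicitly */
    device_add_disk(&pdev->dev, disk);
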
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 364d491d4bdd..075377eee0c0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i)
1765 */ 1765 */
1766 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); 1766 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
1767 1767
1768 err = -ENOMEM;
1768 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1769 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1769 if (!disk) 1770 if (!disk)
1770 goto out_free_queue; 1771 goto out_free_queue;
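
The single added line in loop_add() presets err before an allocation that reports failure only by returning NULL, so the shared out_free_queue exit returns -ENOMEM rather than a stale value. The idiom, sketched generically (placeholder context assumed):

    int err;
    struct gendisk *disk;

    err = -ENOMEM;                       /* alloc_disk() has no errno to hand back */
    disk = alloc_disk(1 << part_shift);
    if (!disk)
            goto out_free_queue;         /* the exit label simply returns err */
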
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 145ce2aa2e78..e937fcf71769 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req,
687 unsigned int sect_num, 687 unsigned int sect_num,
688 unsigned int sect_cnt) 688 unsigned int sect_cnt)
689{ 689{
690 switch (rq_data_dir(req)) { 690 if (rq_data_dir(req) == READ) {
691 case READ:
692 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) 691 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
693 != MG_ERR_NONE) { 692 != MG_ERR_NONE) {
694 mg_bad_rw_intr(host); 693 mg_bad_rw_intr(host);
695 return host->error; 694 return host->error;
696 } 695 }
697 break; 696 } else {
698 case WRITE:
699 /* TODO : handler */ 697 /* TODO : handler */
700 outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); 698 outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
701 if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) 699 if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
@@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req,
714 mod_timer(&host->timer, jiffies + 3 * HZ); 712 mod_timer(&host->timer, jiffies + 3 * HZ);
715 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + 713 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
716 MG_REG_COMMAND); 714 MG_REG_COMMAND);
717 break;
718 } 715 }
719 return MG_ERR_NONE; 716 return MG_ERR_NONE;
720} 717}
@@ -1018,7 +1015,7 @@ probe_err_7:
1018probe_err_6: 1015probe_err_6:
1019 blk_cleanup_queue(host->breq); 1016 blk_cleanup_queue(host->breq);
1020probe_err_5: 1017probe_err_5:
1021 unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); 1018 unregister_blkdev(host->major, MG_DISK_NAME);
1022probe_err_4: 1019probe_err_4:
1023 if (!prv_data->use_polling) 1020 if (!prv_data->use_polling)
1024 free_irq(host->irq, host); 1021 free_irq(host->irq, host);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8e3e708cb9ee..2aca98e8e427 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd)
3956 if (rv) 3956 if (rv)
3957 goto disk_index_error; 3957 goto disk_index_error;
3958 3958
3959 dd->disk->driverfs_dev = &dd->pdev->dev;
3960 dd->disk->major = dd->major; 3959 dd->disk->major = dd->major;
3961 dd->disk->first_minor = index * MTIP_MAX_MINORS; 3960 dd->disk->first_minor = index * MTIP_MAX_MINORS;
3962 dd->disk->minors = MTIP_MAX_MINORS; 3961 dd->disk->minors = MTIP_MAX_MINORS;
@@ -4008,7 +4007,7 @@ skip_create_disk:
4008 4007
4009 /* 4008 /*
4010 * if rebuild pending, start the service thread, and delay the block 4009 * if rebuild pending, start the service thread, and delay the block
4011 * queue creation and add_disk() 4010 * queue creation and device_add_disk()
4012 */ 4011 */
4013 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) 4012 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
4014 goto start_service_thread; 4013 goto start_service_thread;
@@ -4042,7 +4041,7 @@ skip_create_disk:
4042 set_capacity(dd->disk, capacity); 4041 set_capacity(dd->disk, capacity);
4043 4042
4044 /* Enable the block device and add it to /dev */ 4043 /* Enable the block device and add it to /dev */
4045 add_disk(dd->disk); 4044 device_add_disk(&dd->pdev->dev, dd->disk);
4046 4045
4047 dd->bdev = bdget_disk(dd->disk, 0); 4046 dd->bdev = bdget_disk(dd->disk, 0);
4048 /* 4047 /*
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cab97593ba54..75a7f88d6717 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
448 struct request *rq; 448 struct request *rq;
449 struct bio *bio = rqd->bio; 449 struct bio *bio = rqd->bio;
450 450
451 rq = blk_mq_alloc_request(q, bio_rw(bio), 0); 451 rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
452 if (IS_ERR(rq)) 452 if (IS_ERR(rq))
453 return -ENOMEM; 453 return -ENOMEM;
454 454
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index acb44529c05e..76f33c84ce3d 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
487 gendisk->fops = &ps3disk_fops; 487 gendisk->fops = &ps3disk_fops;
488 gendisk->queue = queue; 488 gendisk->queue = queue;
489 gendisk->private_data = dev; 489 gendisk->private_data = dev;
490 gendisk->driverfs_dev = &dev->sbd.core;
491 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, 490 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
492 devidx+'a'); 491 devidx+'a');
493 priv->blocking_factor = dev->blk_size >> 9; 492 priv->blocking_factor = dev->blk_size >> 9;
@@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
499 gendisk->disk_name, priv->model, priv->raw_capacity >> 11, 498 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
500 get_capacity(gendisk) >> 11); 499 get_capacity(gendisk) >> 11);
501 500
502 add_disk(gendisk); 501 device_add_disk(&dev->sbd.core, gendisk);
503 return 0; 502 return 0;
504 503
505fail_cleanup_queue: 504fail_cleanup_queue:
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 56847fcda086..456b4fe21559 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
773 gendisk->fops = &ps3vram_fops; 773 gendisk->fops = &ps3vram_fops;
774 gendisk->queue = queue; 774 gendisk->queue = queue;
775 gendisk->private_data = dev; 775 gendisk->private_data = dev;
776 gendisk->driverfs_dev = &dev->core;
777 strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); 776 strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
778 set_capacity(gendisk, priv->size >> 9); 777 set_capacity(gendisk, priv->size >> 9);
779 778
780 dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", 779 dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
781 gendisk->disk_name, get_capacity(gendisk) >> 11); 780 gendisk->disk_name, get_capacity(gendisk) >> 11);
782 781
783 add_disk(gendisk); 782 device_add_disk(&dev->core, gendisk);
784 return 0; 783 return 0;
785 784
786fail_cleanup_queue: 785fail_cleanup_queue:
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index e1b8b7061d2f..f81d70b39d10 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
230 set_capacity(card->gendisk, card->size8 >> 9); 230 set_capacity(card->gendisk, card->size8 >> 9);
231 else 231 else
232 set_capacity(card->gendisk, 0); 232 set_capacity(card->gendisk, 0);
233 add_disk(card->gendisk); 233 device_add_disk(CARD_TO_DEV(card), card->gendisk);
234
235 card->bdev_attached = 1; 234 card->bdev_attached = 1;
236 } 235 }
237 236
@@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
308 307
309 snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), 308 snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
310 "rsxx%d", card->disk_id); 309 "rsxx%d", card->disk_id);
311 card->gendisk->driverfs_dev = &card->dev->dev;
312 card->gendisk->major = card->major; 310 card->gendisk->major = card->major;
313 card->gendisk->first_minor = 0; 311 card->gendisk->first_minor = 0;
314 card->gendisk->fops = &rsxx_fops; 312 card->gendisk->fops = &rsxx_fops;
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 5c07a23e2ada..3822eae102db 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4690 return -EIO; 4690 return -EIO;
4691} 4691}
4692 4692
4693static int skd_bdev_attach(struct skd_device *skdev) 4693static int skd_bdev_attach(struct device *parent, struct skd_device *skdev)
4694{ 4694{
4695 pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); 4695 pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
4696 add_disk(skdev->disk); 4696 device_add_disk(parent, skdev->disk);
4697 return 0; 4697 return 0;
4698} 4698}
4699 4699
@@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
4812 4812
4813 pci_set_drvdata(pdev, skdev); 4813 pci_set_drvdata(pdev, skdev);
4814 4814
4815 skdev->disk->driverfs_dev = &pdev->dev;
4816
4817 for (i = 0; i < SKD_MAX_BARS; i++) { 4815 for (i = 0; i < SKD_MAX_BARS; i++) {
4818 skdev->mem_phys[i] = pci_resource_start(pdev, i); 4816 skdev->mem_phys[i] = pci_resource_start(pdev, i);
4819 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); 4817 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
@@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
4851 (SKD_START_WAIT_SECONDS * HZ)); 4849 (SKD_START_WAIT_SECONDS * HZ));
4852 if (skdev->gendisk_on > 0) { 4850 if (skdev->gendisk_on > 0) {
4853 /* device came on-line after reset */ 4851 /* device came on-line after reset */
4854 skd_bdev_attach(skdev); 4852 skd_bdev_attach(&pdev->dev, skdev);
4855 rc = 0; 4853 rc = 0;
4856 } else { 4854 } else {
4857 /* we timed out, something is wrong with the device, 4855 /* we timed out, something is wrong with the device,
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 4b911ed96ea3..cab157331c4e 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port)
804 g->fops = &vdc_fops; 804 g->fops = &vdc_fops;
805 g->queue = q; 805 g->queue = q;
806 g->private_data = port; 806 g->private_data = port;
807 g->driverfs_dev = &port->vio.vdev->dev;
808 807
809 set_capacity(g, port->vdisk_size); 808 set_capacity(g, port->vdisk_size);
810 809
@@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port)
835 port->vdisk_size, (port->vdisk_size >> (20 - 9)), 834 port->vdisk_size, (port->vdisk_size >> (20 - 9)),
836 port->vio.ver.major, port->vio.ver.minor); 835 port->vio.ver.major, port->vio.ver.minor);
837 836
838 add_disk(g); 837 device_add_disk(&port->vio.vdev->dev, g);
839 838
840 return 0; 839 return 0;
841} 840}
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 4b3ba74e9d22..d0a3e6d4515f 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card)
344 int offset; 344 int offset;
345 struct bio *bio; 345 struct bio *bio;
346 struct bio_vec vec; 346 struct bio_vec vec;
347 int rw;
348 347
349 bio = card->currentbio; 348 bio = card->currentbio;
350 if (!bio && card->bio) { 349 if (!bio && card->bio) {
@@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card)
359 if (!bio) 358 if (!bio)
360 return 0; 359 return 0;
361 360
362 rw = bio_rw(bio);
363 if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) 361 if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
364 return 0; 362 return 0;
365 363
@@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card)
369 vec.bv_page, 367 vec.bv_page,
370 vec.bv_offset, 368 vec.bv_offset,
371 vec.bv_len, 369 vec.bv_len,
372 (rw == READ) ? 370 bio_op(bio) == REQ_OP_READ ?
373 PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); 371 PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
374 372
375 p = &card->mm_pages[card->Ready]; 373 p = &card->mm_pages[card->Ready];
@@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card)
398 DMASCR_CHAIN_EN | 396 DMASCR_CHAIN_EN |
399 DMASCR_SEM_EN | 397 DMASCR_SEM_EN |
400 pci_cmds); 398 pci_cmds);
401 if (rw == WRITE) 399 if (bio_op(bio) == REQ_OP_WRITE)
402 desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); 400 desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
403 desc->sem_control_bits = desc->control_bits; 401 desc->sem_control_bits = desc->control_bits;
404 402
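
With bio_rw() removed, umem no longer caches a local rw value and instead queries the bio where the answer is needed: bio_op() for the operation, or bio_data_dir() when only READ/WRITE matters. A hedged sketch of the direction choice as used for the DMA mapping above (variable names around the mapping are placeholders, add_bio() context assumed):

    /* pick the PCI DMA direction straight from the bio operation */
    int dir = bio_op(bio) == REQ_OP_READ ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE;

    dma_addr_t handle = pci_map_page(card->dev, vec.bv_page,
                                     vec.bv_offset, vec.bv_len, dir);
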
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 18e4069dd24b..1523e05c46fc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
236static int virtblk_get_id(struct gendisk *disk, char *id_str) 236static int virtblk_get_id(struct gendisk *disk, char *id_str)
237{ 237{
238 struct virtio_blk *vblk = disk->private_data; 238 struct virtio_blk *vblk = disk->private_data;
239 struct request_queue *q = vblk->disk->queue;
239 struct request *req; 240 struct request *req;
240 struct bio *bio;
241 int err; 241 int err;
242 242
243 bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, 243 req = blk_get_request(q, READ, GFP_KERNEL);
244 GFP_KERNEL); 244 if (IS_ERR(req))
245 if (IS_ERR(bio))
246 return PTR_ERR(bio);
247
248 req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
249 if (IS_ERR(req)) {
250 bio_put(bio);
251 return PTR_ERR(req); 245 return PTR_ERR(req);
252 }
253
254 req->cmd_type = REQ_TYPE_DRV_PRIV; 246 req->cmd_type = REQ_TYPE_DRV_PRIV;
247
248 err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
249 if (err)
250 goto out;
251
255 err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); 252 err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
253out:
256 blk_put_request(req); 254 blk_put_request(req);
257
258 return err; 255 return err;
259} 256}
260 257
@@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev)
656 vblk->disk->first_minor = index_to_minor(index); 653 vblk->disk->first_minor = index_to_minor(index);
657 vblk->disk->private_data = vblk; 654 vblk->disk->private_data = vblk;
658 vblk->disk->fops = &virtblk_fops; 655 vblk->disk->fops = &virtblk_fops;
659 vblk->disk->driverfs_dev = &vdev->dev;
660 vblk->disk->flags |= GENHD_FL_EXT_DEVT; 656 vblk->disk->flags |= GENHD_FL_EXT_DEVT;
661 vblk->index = index; 657 vblk->index = index;
662 658
@@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev)
733 729
734 virtio_device_ready(vdev); 730 virtio_device_ready(vdev);
735 731
736 add_disk(vblk->disk); 732 device_add_disk(&vdev->dev, vblk->disk);
737 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); 733 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
738 if (err) 734 if (err)
739 goto out_del_disk; 735 goto out_del_disk;
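
virtio_blk's ID query is reworked from the bio_map_kern()/blk_make_request() pair to blk_get_request() plus blk_rq_map_kern(). The resulting pattern for sending a driver-private command over a kernel buffer, condensed into a hedged sketch (queue, disk and buffer are assumed to exist; error handling trimmed):

    struct request *req;
    int err;

    req = blk_get_request(q, READ, GFP_KERNEL);
    if (IS_ERR(req))
            return PTR_ERR(req);
    req->cmd_type = REQ_TYPE_DRV_PRIV;

    err = blk_rq_map_kern(q, req, buf, buf_len, GFP_KERNEL);
    if (!err)
            err = blk_execute_rq(q, disk, req, false);

    blk_put_request(req);
    return err;
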
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 3355f1cdd4e5..2994cfa44c8a 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
480 if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 480 if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
481 vbd->flush_support = true; 481 vbd->flush_support = true;
482 482
483 if (q && blk_queue_secdiscard(q)) 483 if (q && blk_queue_secure_erase(q))
484 vbd->discard_secure = true; 484 vbd->discard_secure = true;
485 485
486 pr_debug("Successful creation of handle=%04x (dom=%u)\n", 486 pr_debug("Successful creation of handle=%04x (dom=%u)\n",
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index da05d3f9bad2..0b6682a33e3b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -548,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
548 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 548 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
549 ring_req->u.discard.id = id; 549 ring_req->u.discard.id = id;
550 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); 550 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
551 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 551 if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
552 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 552 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
553 else 553 else
554 ring_req->u.discard.flag = 0; 554 ring_req->u.discard.flag = 0;
@@ -844,7 +844,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r
844 return 1; 844 return 1;
845 845
846 if (unlikely(req_op(req) == REQ_OP_DISCARD || 846 if (unlikely(req_op(req) == REQ_OP_DISCARD ||
847 req->cmd_flags & REQ_SECURE)) 847 req_op(req) == REQ_OP_SECURE_ERASE))
848 return blkif_queue_discard_req(req, rinfo); 848 return blkif_queue_discard_req(req, rinfo);
849 else 849 else
850 return blkif_queue_rw_req(req, rinfo); 850 return blkif_queue_rw_req(req, rinfo);
@@ -952,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
952 rq->limits.discard_granularity = info->discard_granularity; 952 rq->limits.discard_granularity = info->discard_granularity;
953 rq->limits.discard_alignment = info->discard_alignment; 953 rq->limits.discard_alignment = info->discard_alignment;
954 if (info->feature_secdiscard) 954 if (info->feature_secdiscard)
955 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); 955 queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
956 } 956 }
957 957
958 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 958 /* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -1134,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1134 gd->first_minor = minor; 1134 gd->first_minor = minor;
1135 gd->fops = &xlvbd_block_fops; 1135 gd->fops = &xlvbd_block_fops;
1136 gd->private_data = info; 1136 gd->private_data = info;
1137 gd->driverfs_dev = &(info->xbdev->dev);
1138 set_capacity(gd, capacity); 1137 set_capacity(gd, capacity);
1139 1138
1140 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, 1139 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
@@ -1592,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1592 info->feature_discard = 0; 1591 info->feature_discard = 0;
1593 info->feature_secdiscard = 0; 1592 info->feature_secdiscard = 0;
1594 queue_flag_clear(QUEUE_FLAG_DISCARD, rq); 1593 queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
1595 queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); 1594 queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
1596 } 1595 }
1597 blk_mq_complete_request(req, error); 1596 blk_mq_complete_request(req, error);
1598 break; 1597 break;
@@ -2106,11 +2105,14 @@ static int blkfront_resume(struct xenbus_device *dev)
2106 */ 2105 */
2107 if (req_op(shadow[i].request) == REQ_OP_FLUSH || 2106 if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
2108 req_op(shadow[i].request) == REQ_OP_DISCARD || 2107 req_op(shadow[i].request) == REQ_OP_DISCARD ||
2109 shadow[j].request->cmd_flags & (REQ_FUA | REQ_SECURE)) { 2108 req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
2110 2109 shadow[j].request->cmd_flags & REQ_FUA) {
2111 /* 2110 /*
2112 * Flush operations don't contain bios, so 2111 * Flush operations don't contain bios, so
2113 * we need to requeue the whole request 2112 * we need to requeue the whole request
2113 *
2114 * XXX: but this doesn't make any sense for a
2115 * write with the FUA flag set..
2114 */ 2116 */
2115 list_add(&shadow[j].request->queuelist, &info->requests); 2117 list_add(&shadow[j].request->queuelist, &info->requests);
2116 continue; 2118 continue;
@@ -2445,7 +2447,7 @@ static void blkfront_connect(struct blkfront_info *info)
2445 for (i = 0; i < info->nr_rings; i++) 2447 for (i = 0; i < info->nr_rings; i++)
2446 kick_pending_request_queues(&info->rinfo[i]); 2448 kick_pending_request_queues(&info->rinfo[i]);
2447 2449
2448 add_disk(info->gd); 2450 device_add_disk(&info->xbdev->dev, info->gd);
2449 2451
2450 info->is_ready = 1; 2452 info->is_ready = 1;
2451} 2453}
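
The xen changes track the core's split of secure erase out of discard: the REQ_SECURE flag and QUEUE_FLAG_SECDISCARD give way to REQ_OP_SECURE_ERASE and QUEUE_FLAG_SECERASE, and blkback now probes the backing queue with blk_queue_secure_erase(). A hedged sketch of how a block driver dispatches on the request op after this series (handler names are placeholders):

    if (req_op(req) == REQ_OP_DISCARD ||
        req_op(req) == REQ_OP_SECURE_ERASE)
            return queue_discard_req(req);   /* discard-style path */

    return queue_rw_req(req);                /* normal read/write path */
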
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 1b257ea9776a..5d475b3a0b2e 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2032,7 +2032,7 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
2032 2032
2033 init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); 2033 init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ);
2034 cgc.cmd[0] = GPCMD_READ_SUBCHANNEL; 2034 cgc.cmd[0] = GPCMD_READ_SUBCHANNEL;
2035 cgc.cmd[1] = 2; /* MSF addressing */ 2035 cgc.cmd[1] = subchnl->cdsc_format;/* MSF or LBA addressing */
2036 cgc.cmd[2] = 0x40; /* request subQ data */ 2036 cgc.cmd[2] = 0x40; /* request subQ data */
2037 cgc.cmd[3] = mcn ? 2 : 1; 2037 cgc.cmd[3] = mcn ? 2 : 1;
2038 cgc.cmd[8] = 16; 2038 cgc.cmd[8] = 16;
@@ -2041,17 +2041,27 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
2041 return ret; 2041 return ret;
2042 2042
2043 subchnl->cdsc_audiostatus = cgc.buffer[1]; 2043 subchnl->cdsc_audiostatus = cgc.buffer[1];
2044 subchnl->cdsc_format = CDROM_MSF;
2045 subchnl->cdsc_ctrl = cgc.buffer[5] & 0xf; 2044 subchnl->cdsc_ctrl = cgc.buffer[5] & 0xf;
2046 subchnl->cdsc_trk = cgc.buffer[6]; 2045 subchnl->cdsc_trk = cgc.buffer[6];
2047 subchnl->cdsc_ind = cgc.buffer[7]; 2046 subchnl->cdsc_ind = cgc.buffer[7];
2048 2047
2049 subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13]; 2048 if (subchnl->cdsc_format == CDROM_LBA) {
2050 subchnl->cdsc_reladdr.msf.second = cgc.buffer[14]; 2049 subchnl->cdsc_absaddr.lba = ((cgc.buffer[8] << 24) |
2051 subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15]; 2050 (cgc.buffer[9] << 16) |
2052 subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9]; 2051 (cgc.buffer[10] << 8) |
2053 subchnl->cdsc_absaddr.msf.second = cgc.buffer[10]; 2052 (cgc.buffer[11]));
2054 subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11]; 2053 subchnl->cdsc_reladdr.lba = ((cgc.buffer[12] << 24) |
2054 (cgc.buffer[13] << 16) |
2055 (cgc.buffer[14] << 8) |
2056 (cgc.buffer[15]));
2057 } else {
2058 subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13];
2059 subchnl->cdsc_reladdr.msf.second = cgc.buffer[14];
2060 subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15];
2061 subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9];
2062 subchnl->cdsc_absaddr.msf.second = cgc.buffer[10];
2063 subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11];
2064 }
2055 2065
2056 return 0; 2066 return 0;
2057} 2067}
@@ -3022,7 +3032,7 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
3022 if (!((requested == CDROM_MSF) || 3032 if (!((requested == CDROM_MSF) ||
3023 (requested == CDROM_LBA))) 3033 (requested == CDROM_LBA)))
3024 return -EINVAL; 3034 return -EINVAL;
3025 q.cdsc_format = CDROM_MSF; 3035
3026 ret = cdrom_read_subchannel(cdi, &q, 0); 3036 ret = cdrom_read_subchannel(cdi, &q, 0);
3027 if (ret) 3037 if (ret)
3028 return ret; 3038 return ret;
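
The cdrom fix passes the caller's requested addressing mode (MSF or LBA) through to the READ SUB-CHANNEL command instead of forcing MSF, and decodes the LBA variant from the big-endian 32-bit fields at bytes 8 (absolute) and 12 (relative) of the reply. A standalone sketch of that byte assembly:

    #include <stdint.h>

    /* Assemble a big-endian 32-bit field from four reply bytes. */
    static uint32_t be32_field(const unsigned char *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* e.g. absaddr.lba = be32_field(&cgc.buffer[8]);
     *      reladdr.lba = be32_field(&cgc.buffer[12]); */
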
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index ef907fd5ba98..bf9a2ad296ed 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1770,7 +1770,6 @@ static int ide_cd_probe(ide_drive_t *drive)
1770 drive->driver_data = info; 1770 drive->driver_data = info;
1771 1771
1772 g->minors = 1; 1772 g->minors = 1;
1773 g->driverfs_dev = &drive->gendev;
1774 g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE; 1773 g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
1775 if (ide_cdrom_setup(drive)) { 1774 if (ide_cdrom_setup(drive)) {
1776 put_device(&info->dev); 1775 put_device(&info->dev);
@@ -1780,7 +1779,7 @@ static int ide_cd_probe(ide_drive_t *drive)
1780 ide_cd_read_toc(drive, &sense); 1779 ide_cd_read_toc(drive, &sense);
1781 g->fops = &idecd_ops; 1780 g->fops = &idecd_ops;
1782 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 1781 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
1783 add_disk(g); 1782 device_add_disk(&drive->gendev, g);
1784 return 0; 1783 return 0;
1785 1784
1786out_free_disk: 1785out_free_disk:
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 838996a0039e..e823394ed543 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -412,12 +412,11 @@ static int ide_gd_probe(ide_drive_t *drive)
412 set_capacity(g, ide_gd_capacity(drive)); 412 set_capacity(g, ide_gd_capacity(drive));
413 413
414 g->minors = IDE_DISK_MINORS; 414 g->minors = IDE_DISK_MINORS;
415 g->driverfs_dev = &drive->gendev;
416 g->flags |= GENHD_FL_EXT_DEVT; 415 g->flags |= GENHD_FL_EXT_DEVT;
417 if (drive->dev_flags & IDE_DFLAG_REMOVABLE) 416 if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
418 g->flags = GENHD_FL_REMOVABLE; 417 g->flags = GENHD_FL_REMOVABLE;
419 g->fops = &ide_gd_ops; 418 g->fops = &ide_gd_ops;
420 add_disk(g); 419 device_add_disk(&drive->gendev, g);
421 return 0; 420 return 0;
422 421
423out_free_disk: 422out_free_disk:
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 85a339030e4b..61c68a1f054a 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -27,11 +27,13 @@ config NVM_DEBUG
27 It is required to create/remove targets without IOCTLs. 27 It is required to create/remove targets without IOCTLs.
28 28
29config NVM_GENNVM 29config NVM_GENNVM
30 tristate "Generic NVM manager for Open-Channel SSDs" 30 tristate "General Non-Volatile Memory Manager for Open-Channel SSDs"
31 ---help--- 31 ---help---
32 NVM media manager for Open-Channel SSDs that offload management 32 Non-volatile memory media manager for Open-Channel SSDs that implements
33 functionality to device, while keeping data placement and garbage 33 physical media metadata management and block provisioning API.
34 collection decisions on the host. 34
35 This is the standard media manager for using Open-Channel SSDs, and
36 required for targets to be instantiated.
35 37
36config NVM_RRPC 38config NVM_RRPC
37 tristate "Round-robin Hybrid Open-Channel SSD target" 39 tristate "Round-robin Hybrid Open-Channel SSD target"
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 160c1a6838e1..9ebd2cfbd849 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -18,8 +18,6 @@
18 * 18 *
19 */ 19 */
20 20
21#include <linux/blkdev.h>
22#include <linux/blk-mq.h>
23#include <linux/list.h> 21#include <linux/list.h>
24#include <linux/types.h> 22#include <linux/types.h>
25#include <linux/sem.h> 23#include <linux/sem.h>
@@ -28,46 +26,42 @@
28#include <linux/miscdevice.h> 26#include <linux/miscdevice.h>
29#include <linux/lightnvm.h> 27#include <linux/lightnvm.h>
30#include <linux/sched/sysctl.h> 28#include <linux/sched/sysctl.h>
31#include <uapi/linux/lightnvm.h>
32 29
33static LIST_HEAD(nvm_tgt_types); 30static LIST_HEAD(nvm_tgt_types);
31static DECLARE_RWSEM(nvm_tgtt_lock);
34static LIST_HEAD(nvm_mgrs); 32static LIST_HEAD(nvm_mgrs);
35static LIST_HEAD(nvm_devices); 33static LIST_HEAD(nvm_devices);
36static LIST_HEAD(nvm_targets);
37static DECLARE_RWSEM(nvm_lock); 34static DECLARE_RWSEM(nvm_lock);
38 35
39static struct nvm_target *nvm_find_target(const char *name) 36struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
40{ 37{
41 struct nvm_target *tgt; 38 struct nvm_tgt_type *tmp, *tt = NULL;
42 39
43 list_for_each_entry(tgt, &nvm_targets, list) 40 if (lock)
44 if (!strcmp(name, tgt->disk->disk_name)) 41 down_write(&nvm_tgtt_lock);
45 return tgt;
46 42
47 return NULL; 43 list_for_each_entry(tmp, &nvm_tgt_types, list)
48} 44 if (!strcmp(name, tmp->name)) {
49 45 tt = tmp;
50static struct nvm_tgt_type *nvm_find_target_type(const char *name) 46 break;
51{ 47 }
52 struct nvm_tgt_type *tt;
53
54 list_for_each_entry(tt, &nvm_tgt_types, list)
55 if (!strcmp(name, tt->name))
56 return tt;
57 48
58 return NULL; 49 if (lock)
50 up_write(&nvm_tgtt_lock);
51 return tt;
59} 52}
53EXPORT_SYMBOL(nvm_find_target_type);
60 54
61int nvm_register_tgt_type(struct nvm_tgt_type *tt) 55int nvm_register_tgt_type(struct nvm_tgt_type *tt)
62{ 56{
63 int ret = 0; 57 int ret = 0;
64 58
65 down_write(&nvm_lock); 59 down_write(&nvm_tgtt_lock);
66 if (nvm_find_target_type(tt->name)) 60 if (nvm_find_target_type(tt->name, 0))
67 ret = -EEXIST; 61 ret = -EEXIST;
68 else 62 else
69 list_add(&tt->list, &nvm_tgt_types); 63 list_add(&tt->list, &nvm_tgt_types);
70 up_write(&nvm_lock); 64 up_write(&nvm_tgtt_lock);
71 65
72 return ret; 66 return ret;
73} 67}
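
nvm_find_target_type() is now exported and takes a lock argument so that callers already holding nvm_tgtt_lock can skip the helper's own locking, exactly as nvm_register_tgt_type() does just below by passing 0. A hedged sketch of the two call styles (the target-type name is illustrative):

    struct nvm_tgt_type *tt;

    /* unlocked caller: let the helper take nvm_tgtt_lock itself */
    tt = nvm_find_target_type("rrpc", 1);

    /* caller already under nvm_tgtt_lock: pass 0 to avoid deadlocking on the lock */
    down_write(&nvm_tgtt_lock);
    tt = nvm_find_target_type("rrpc", 0);
    /* ... */
    up_write(&nvm_tgtt_lock);
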
@@ -110,7 +104,7 @@ static struct nvmm_type *nvm_find_mgr_type(const char *name)
110 return NULL; 104 return NULL;
111} 105}
112 106
113struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) 107static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
114{ 108{
115 struct nvmm_type *mt; 109 struct nvmm_type *mt;
116 int ret; 110 int ret;
@@ -182,20 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
182 return NULL; 176 return NULL;
183} 177}
184 178
185struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun,
186 unsigned long flags)
187{
188 return dev->mt->get_blk_unlocked(dev, lun, flags);
189}
190EXPORT_SYMBOL(nvm_get_blk_unlocked);
191
192/* Assumes that all valid pages have already been moved on release to bm */
193void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk)
194{
195 return dev->mt->put_blk_unlocked(dev, blk);
196}
197EXPORT_SYMBOL(nvm_put_blk_unlocked);
198
199struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, 179struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun,
200 unsigned long flags) 180 unsigned long flags)
201{ 181{
@@ -210,6 +190,12 @@ void nvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
210} 190}
211EXPORT_SYMBOL(nvm_put_blk); 191EXPORT_SYMBOL(nvm_put_blk);
212 192
193void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
194{
195 return dev->mt->mark_blk(dev, ppa, type);
196}
197EXPORT_SYMBOL(nvm_mark_blk);
198
213int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) 199int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
214{ 200{
215 return dev->mt->submit_io(dev, rqd); 201 return dev->mt->submit_io(dev, rqd);
@@ -251,9 +237,10 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
251EXPORT_SYMBOL(nvm_generic_to_addr_mode); 237EXPORT_SYMBOL(nvm_generic_to_addr_mode);
252 238
253int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, 239int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
254 struct ppa_addr *ppas, int nr_ppas, int vblk) 240 const struct ppa_addr *ppas, int nr_ppas, int vblk)
255{ 241{
256 int i, plane_cnt, pl_idx; 242 int i, plane_cnt, pl_idx;
243 struct ppa_addr ppa;
257 244
258 if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { 245 if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
259 rqd->nr_ppas = nr_ppas; 246 rqd->nr_ppas = nr_ppas;
@@ -278,8 +265,9 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
278 265
279 for (i = 0; i < nr_ppas; i++) { 266 for (i = 0; i < nr_ppas; i++) {
280 for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { 267 for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
281 ppas[i].g.pl = pl_idx; 268 ppa = ppas[i];
282 rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; 269 ppa.g.pl = pl_idx;
270 rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
283 } 271 }
284 } 272 }
285 } 273 }
@@ -337,7 +325,7 @@ static void nvm_end_io_sync(struct nvm_rq *rqd)
337 complete(waiting); 325 complete(waiting);
338} 326}
339 327
340int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, 328static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
341 int flags, void *buf, int len) 329 int flags, void *buf, int len)
342{ 330{
343 DECLARE_COMPLETION_ONSTACK(wait); 331 DECLARE_COMPLETION_ONSTACK(wait);
@@ -367,7 +355,9 @@ int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
367 /* Prevent hang_check timer from firing at us during very long I/O */ 355 /* Prevent hang_check timer from firing at us during very long I/O */
368 hang_check = sysctl_hung_task_timeout_secs; 356 hang_check = sysctl_hung_task_timeout_secs;
369 if (hang_check) 357 if (hang_check)
370 while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); 358 while (!wait_for_completion_io_timeout(&wait,
359 hang_check * (HZ/2)))
360 ;
371 else 361 else
372 wait_for_completion_io(&wait); 362 wait_for_completion_io(&wait);
373 363
@@ -510,7 +500,8 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
510 /* The lower page table encoding consists of a list of bytes, where each 500 /* The lower page table encoding consists of a list of bytes, where each
511 * has a lower and an upper half. The first half byte maintains the 501 * has a lower and an upper half. The first half byte maintains the
512 * increment value and every value after is an offset added to the 502 * increment value and every value after is an offset added to the
513 * previous incrementation value */ 503 * previous incrementation value
504 */
514 dev->lptbl[0] = mlc->pairs[0] & 0xF; 505 dev->lptbl[0] = mlc->pairs[0] & 0xF;
515 for (i = 1; i < dev->lps_per_blk; i++) { 506 for (i = 1; i < dev->lps_per_blk; i++) {
516 p = mlc->pairs[i >> 1]; 507 p = mlc->pairs[i >> 1];
@@ -596,42 +587,11 @@ err_fmtype:
596 return ret; 587 return ret;
597} 588}
598 589
599static void nvm_remove_target(struct nvm_target *t)
600{
601 struct nvm_tgt_type *tt = t->type;
602 struct gendisk *tdisk = t->disk;
603 struct request_queue *q = tdisk->queue;
604
605 lockdep_assert_held(&nvm_lock);
606
607 del_gendisk(tdisk);
608 blk_cleanup_queue(q);
609
610 if (tt->exit)
611 tt->exit(tdisk->private_data);
612
613 put_disk(tdisk);
614
615 list_del(&t->list);
616 kfree(t);
617}
618
619static void nvm_free_mgr(struct nvm_dev *dev) 590static void nvm_free_mgr(struct nvm_dev *dev)
620{ 591{
621 struct nvm_target *tgt, *tmp;
622
623 if (!dev->mt) 592 if (!dev->mt)
624 return; 593 return;
625 594
626 down_write(&nvm_lock);
627 list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) {
628 if (tgt->dev != dev)
629 continue;
630
631 nvm_remove_target(tgt);
632 }
633 up_write(&nvm_lock);
634
635 dev->mt->unregister_mgr(dev); 595 dev->mt->unregister_mgr(dev);
636 dev->mt = NULL; 596 dev->mt = NULL;
637} 597}
@@ -778,91 +738,6 @@ void nvm_unregister(char *disk_name)
778} 738}
779EXPORT_SYMBOL(nvm_unregister); 739EXPORT_SYMBOL(nvm_unregister);
780 740
781static const struct block_device_operations nvm_fops = {
782 .owner = THIS_MODULE,
783};
784
785static int nvm_create_target(struct nvm_dev *dev,
786 struct nvm_ioctl_create *create)
787{
788 struct nvm_ioctl_create_simple *s = &create->conf.s;
789 struct request_queue *tqueue;
790 struct gendisk *tdisk;
791 struct nvm_tgt_type *tt;
792 struct nvm_target *t;
793 void *targetdata;
794
795 if (!dev->mt) {
796 pr_info("nvm: device has no media manager registered.\n");
797 return -ENODEV;
798 }
799
800 down_write(&nvm_lock);
801 tt = nvm_find_target_type(create->tgttype);
802 if (!tt) {
803 pr_err("nvm: target type %s not found\n", create->tgttype);
804 up_write(&nvm_lock);
805 return -EINVAL;
806 }
807
808 t = nvm_find_target(create->tgtname);
809 if (t) {
810 pr_err("nvm: target name already exists.\n");
811 up_write(&nvm_lock);
812 return -EINVAL;
813 }
814 up_write(&nvm_lock);
815
816 t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
817 if (!t)
818 return -ENOMEM;
819
820 tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
821 if (!tqueue)
822 goto err_t;
823 blk_queue_make_request(tqueue, tt->make_rq);
824
825 tdisk = alloc_disk(0);
826 if (!tdisk)
827 goto err_queue;
828
829 sprintf(tdisk->disk_name, "%s", create->tgtname);
830 tdisk->flags = GENHD_FL_EXT_DEVT;
831 tdisk->major = 0;
832 tdisk->first_minor = 0;
833 tdisk->fops = &nvm_fops;
834 tdisk->queue = tqueue;
835
836 targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end);
837 if (IS_ERR(targetdata))
838 goto err_init;
839
840 tdisk->private_data = targetdata;
841 tqueue->queuedata = targetdata;
842
843 blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
844
845 set_capacity(tdisk, tt->capacity(targetdata));
846 add_disk(tdisk);
847
848 t->type = tt;
849 t->disk = tdisk;
850 t->dev = dev;
851
852 down_write(&nvm_lock);
853 list_add_tail(&t->list, &nvm_targets);
854 up_write(&nvm_lock);
855
856 return 0;
857err_init:
858 put_disk(tdisk);
859err_queue:
860 blk_cleanup_queue(tqueue);
861err_t:
862 kfree(t);
863 return -ENOMEM;
864}
865
866static int __nvm_configure_create(struct nvm_ioctl_create *create) 741static int __nvm_configure_create(struct nvm_ioctl_create *create)
867{ 742{
868 struct nvm_dev *dev; 743 struct nvm_dev *dev;
@@ -871,11 +746,17 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
871 down_write(&nvm_lock); 746 down_write(&nvm_lock);
872 dev = nvm_find_nvm_dev(create->dev); 747 dev = nvm_find_nvm_dev(create->dev);
873 up_write(&nvm_lock); 748 up_write(&nvm_lock);
749
874 if (!dev) { 750 if (!dev) {
875 pr_err("nvm: device not found\n"); 751 pr_err("nvm: device not found\n");
876 return -EINVAL; 752 return -EINVAL;
877 } 753 }
878 754
755 if (!dev->mt) {
756 pr_info("nvm: device has no media manager registered.\n");
757 return -ENODEV;
758 }
759
879 if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { 760 if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
880 pr_err("nvm: config type not valid\n"); 761 pr_err("nvm: config type not valid\n");
881 return -EINVAL; 762 return -EINVAL;
@@ -888,25 +769,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
888 return -EINVAL; 769 return -EINVAL;
889 } 770 }
890 771
891 return nvm_create_target(dev, create); 772 return dev->mt->create_tgt(dev, create);
892}
893
894static int __nvm_configure_remove(struct nvm_ioctl_remove *remove)
895{
896 struct nvm_target *t;
897
898 down_write(&nvm_lock);
899 t = nvm_find_target(remove->tgtname);
900 if (!t) {
901 pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname);
902 up_write(&nvm_lock);
903 return -EINVAL;
904 }
905
906 nvm_remove_target(t);
907 up_write(&nvm_lock);
908
909 return 0;
910} 773}
911 774
912#ifdef CONFIG_NVM_DEBUG 775#ifdef CONFIG_NVM_DEBUG
@@ -941,8 +804,9 @@ static int nvm_configure_show(const char *val)
941static int nvm_configure_remove(const char *val) 804static int nvm_configure_remove(const char *val)
942{ 805{
943 struct nvm_ioctl_remove remove; 806 struct nvm_ioctl_remove remove;
807 struct nvm_dev *dev;
944 char opcode; 808 char opcode;
945 int ret; 809 int ret = 0;
946 810
947 ret = sscanf(val, "%c %256s", &opcode, remove.tgtname); 811 ret = sscanf(val, "%c %256s", &opcode, remove.tgtname);
948 if (ret != 2) { 812 if (ret != 2) {
@@ -952,7 +816,13 @@ static int nvm_configure_remove(const char *val)
952 816
953 remove.flags = 0; 817 remove.flags = 0;
954 818
955 return __nvm_configure_remove(&remove); 819 list_for_each_entry(dev, &nvm_devices, devices) {
820 ret = dev->mt->remove_tgt(dev, &remove);
821 if (!ret)
822 break;
823 }
824
825 return ret;
956} 826}
957 827
958static int nvm_configure_create(const char *val) 828static int nvm_configure_create(const char *val)
@@ -1149,6 +1019,8 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
1149static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) 1019static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
1150{ 1020{
1151 struct nvm_ioctl_remove remove; 1021 struct nvm_ioctl_remove remove;
1022 struct nvm_dev *dev;
1023 int ret = 0;
1152 1024
1153 if (!capable(CAP_SYS_ADMIN)) 1025 if (!capable(CAP_SYS_ADMIN))
1154 return -EPERM; 1026 return -EPERM;
@@ -1163,7 +1035,13 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
1163 return -EINVAL; 1035 return -EINVAL;
1164 } 1036 }
1165 1037
1166 return __nvm_configure_remove(&remove); 1038 list_for_each_entry(dev, &nvm_devices, devices) {
1039 ret = dev->mt->remove_tgt(dev, &remove);
1040 if (!ret)
1041 break;
1042 }
1043
1044 return ret;
1167} 1045}
1168 1046
1169static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) 1047static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info)
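
Both nvm_configure_remove and nvm_ioctl_dev_remove now walk nvm_devices and call dev->mt->remove_tgt until one media manager claims the name. That works because of the return convention documented for gen_remove_tgt further below (0 on success, 1 when the target is not found, negative on error), which is what makes the plain `if (!ret) break;` test sufficient. A small standalone sketch of that dispatch convention (the device count and handler body are made up for the example):

#include <stdio.h>
#include <string.h>

#define NR_DEVS 3

/* Per-device handler: 0 = removed here, 1 = not found on this device,
 * <0 = hard error. Device 1 is pretended to own a target named "rrpc0". */
static int remove_tgt(int dev, const char *name)
{
	if (dev == 1 && !strcmp(name, "rrpc0"))
		return 0;
	return 1;
}

static int remove_from_any(const char *name)
{
	int dev, ret = 0;

	for (dev = 0; dev < NR_DEVS; dev++) {
		ret = remove_tgt(dev, name);
		if (!ret)		/* handled: stop searching */
			break;
	}
	return ret;	/* 0 if some device removed it, 1 if nobody had it */
}

int main(void)
{
	printf("remove rrpc0  -> %d\n", remove_from_any("rrpc0"));
	printf("remove nosuch -> %d\n", remove_from_any("nosuch"));
	return 0;
}
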
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index ec9fb6876e38..b74174c6d021 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -15,22 +15,160 @@
15 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, 15 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
16 * USA. 16 * USA.
17 * 17 *
18 * Implementation of a generic nvm manager for Open-Channel SSDs. 18 * Implementation of a general nvm manager for Open-Channel SSDs.
19 */ 19 */
20 20
21#include "gennvm.h" 21#include "gennvm.h"
22 22
23static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) 23static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name)
24{ 24{
25 struct gen_nvm *gn = dev->mp; 25 struct nvm_target *tgt;
26 struct gennvm_area *area, *prev, *next; 26
27 list_for_each_entry(tgt, &gn->targets, list)
28 if (!strcmp(name, tgt->disk->disk_name))
29 return tgt;
30
31 return NULL;
32}
33
34static const struct block_device_operations gen_fops = {
35 .owner = THIS_MODULE,
36};
37
38static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
39{
40 struct gen_dev *gn = dev->mp;
41 struct nvm_ioctl_create_simple *s = &create->conf.s;
42 struct request_queue *tqueue;
43 struct gendisk *tdisk;
44 struct nvm_tgt_type *tt;
45 struct nvm_target *t;
46 void *targetdata;
47
48 tt = nvm_find_target_type(create->tgttype, 1);
49 if (!tt) {
50 pr_err("nvm: target type %s not found\n", create->tgttype);
51 return -EINVAL;
52 }
53
54 mutex_lock(&gn->lock);
55 t = gen_find_target(gn, create->tgtname);
56 if (t) {
57 pr_err("nvm: target name already exists.\n");
58 mutex_unlock(&gn->lock);
59 return -EINVAL;
60 }
61 mutex_unlock(&gn->lock);
62
63 t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
64 if (!t)
65 return -ENOMEM;
66
67 tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
68 if (!tqueue)
69 goto err_t;
70 blk_queue_make_request(tqueue, tt->make_rq);
71
72 tdisk = alloc_disk(0);
73 if (!tdisk)
74 goto err_queue;
75
76 sprintf(tdisk->disk_name, "%s", create->tgtname);
77 tdisk->flags = GENHD_FL_EXT_DEVT;
78 tdisk->major = 0;
79 tdisk->first_minor = 0;
80 tdisk->fops = &gen_fops;
81 tdisk->queue = tqueue;
82
83 targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end);
84 if (IS_ERR(targetdata))
85 goto err_init;
86
87 tdisk->private_data = targetdata;
88 tqueue->queuedata = targetdata;
89
90 blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
91
92 set_capacity(tdisk, tt->capacity(targetdata));
93 add_disk(tdisk);
94
95 t->type = tt;
96 t->disk = tdisk;
97 t->dev = dev;
98
99 mutex_lock(&gn->lock);
100 list_add_tail(&t->list, &gn->targets);
101 mutex_unlock(&gn->lock);
102
103 return 0;
104err_init:
105 put_disk(tdisk);
106err_queue:
107 blk_cleanup_queue(tqueue);
108err_t:
109 kfree(t);
110 return -ENOMEM;
111}
112
113static void __gen_remove_target(struct nvm_target *t)
114{
115 struct nvm_tgt_type *tt = t->type;
116 struct gendisk *tdisk = t->disk;
117 struct request_queue *q = tdisk->queue;
118
119 del_gendisk(tdisk);
120 blk_cleanup_queue(q);
121
122 if (tt->exit)
123 tt->exit(tdisk->private_data);
124
125 put_disk(tdisk);
126
127 list_del(&t->list);
128 kfree(t);
129}
130
131/**
132 * gen_remove_tgt - Removes a target from the media manager
133 * @dev: device
134 * @remove: ioctl structure with target name to remove.
135 *
136 * Returns:
137 * 0: on success
138 * 1: on not found
139 * <0: on error
140 */
141static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
142{
143 struct gen_dev *gn = dev->mp;
144 struct nvm_target *t;
145
146 if (!gn)
147 return 1;
148
149 mutex_lock(&gn->lock);
150 t = gen_find_target(gn, remove->tgtname);
151 if (!t) {
152 mutex_unlock(&gn->lock);
153 return 1;
154 }
155 __gen_remove_target(t);
156 mutex_unlock(&gn->lock);
157
158 return 0;
159}
160
161static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
162{
163 struct gen_dev *gn = dev->mp;
164 struct gen_area *area, *prev, *next;
27 sector_t begin = 0; 165 sector_t begin = 0;
28 sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9; 166 sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9;
29 167
30 if (len > max_sectors) 168 if (len > max_sectors)
31 return -EINVAL; 169 return -EINVAL;
32 170
33 area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL); 171 area = kmalloc(sizeof(struct gen_area), GFP_KERNEL);
34 if (!area) 172 if (!area)
35 return -ENOMEM; 173 return -ENOMEM;
36 174
@@ -64,10 +202,10 @@ static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
64 return 0; 202 return 0;
65} 203}
66 204
67static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) 205static void gen_put_area(struct nvm_dev *dev, sector_t begin)
68{ 206{
69 struct gen_nvm *gn = dev->mp; 207 struct gen_dev *gn = dev->mp;
70 struct gennvm_area *area; 208 struct gen_area *area;
71 209
72 spin_lock(&dev->lock); 210 spin_lock(&dev->lock);
73 list_for_each_entry(area, &gn->area_list, list) { 211 list_for_each_entry(area, &gn->area_list, list) {
@@ -82,27 +220,27 @@ static void gennvm_put_area(struct nvm_dev *dev, sector_t begin)
82 spin_unlock(&dev->lock); 220 spin_unlock(&dev->lock);
83} 221}
84 222
85static void gennvm_blocks_free(struct nvm_dev *dev) 223static void gen_blocks_free(struct nvm_dev *dev)
86{ 224{
87 struct gen_nvm *gn = dev->mp; 225 struct gen_dev *gn = dev->mp;
88 struct gen_lun *lun; 226 struct gen_lun *lun;
89 int i; 227 int i;
90 228
91 gennvm_for_each_lun(gn, lun, i) { 229 gen_for_each_lun(gn, lun, i) {
92 if (!lun->vlun.blocks) 230 if (!lun->vlun.blocks)
93 break; 231 break;
94 vfree(lun->vlun.blocks); 232 vfree(lun->vlun.blocks);
95 } 233 }
96} 234}
97 235
98static void gennvm_luns_free(struct nvm_dev *dev) 236static void gen_luns_free(struct nvm_dev *dev)
99{ 237{
100 struct gen_nvm *gn = dev->mp; 238 struct gen_dev *gn = dev->mp;
101 239
102 kfree(gn->luns); 240 kfree(gn->luns);
103} 241}
104 242
105static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) 243static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
106{ 244{
107 struct gen_lun *lun; 245 struct gen_lun *lun;
108 int i; 246 int i;
@@ -111,7 +249,7 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
111 if (!gn->luns) 249 if (!gn->luns)
112 return -ENOMEM; 250 return -ENOMEM;
113 251
114 gennvm_for_each_lun(gn, lun, i) { 252 gen_for_each_lun(gn, lun, i) {
115 spin_lock_init(&lun->vlun.lock); 253 spin_lock_init(&lun->vlun.lock);
116 INIT_LIST_HEAD(&lun->free_list); 254 INIT_LIST_HEAD(&lun->free_list);
117 INIT_LIST_HEAD(&lun->used_list); 255 INIT_LIST_HEAD(&lun->used_list);
@@ -122,14 +260,11 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
122 lun->vlun.lun_id = i % dev->luns_per_chnl; 260 lun->vlun.lun_id = i % dev->luns_per_chnl;
123 lun->vlun.chnl_id = i / dev->luns_per_chnl; 261 lun->vlun.chnl_id = i / dev->luns_per_chnl;
124 lun->vlun.nr_free_blocks = dev->blks_per_lun; 262 lun->vlun.nr_free_blocks = dev->blks_per_lun;
125 lun->vlun.nr_open_blocks = 0;
126 lun->vlun.nr_closed_blocks = 0;
127 lun->vlun.nr_bad_blocks = 0;
128 } 263 }
129 return 0; 264 return 0;
130} 265}
131 266
132static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, 267static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
133 u8 *blks, int nr_blks) 268 u8 *blks, int nr_blks)
134{ 269{
135 struct nvm_dev *dev = gn->dev; 270 struct nvm_dev *dev = gn->dev;
@@ -149,17 +284,16 @@ static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa,
149 284
150 blk = &lun->vlun.blocks[i]; 285 blk = &lun->vlun.blocks[i];
151 list_move_tail(&blk->list, &lun->bb_list); 286 list_move_tail(&blk->list, &lun->bb_list);
152 lun->vlun.nr_bad_blocks++;
153 lun->vlun.nr_free_blocks--; 287 lun->vlun.nr_free_blocks--;
154 } 288 }
155 289
156 return 0; 290 return 0;
157} 291}
158 292
159static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) 293static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
160{ 294{
161 struct nvm_dev *dev = private; 295 struct nvm_dev *dev = private;
162 struct gen_nvm *gn = dev->mp; 296 struct gen_dev *gn = dev->mp;
163 u64 elba = slba + nlb; 297 u64 elba = slba + nlb;
164 struct gen_lun *lun; 298 struct gen_lun *lun;
165 struct nvm_block *blk; 299 struct nvm_block *blk;
@@ -167,7 +301,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
167 int lun_id; 301 int lun_id;
168 302
169 if (unlikely(elba > dev->total_secs)) { 303 if (unlikely(elba > dev->total_secs)) {
170 pr_err("gennvm: L2P data from device is out of bounds!\n"); 304 pr_err("gen: L2P data from device is out of bounds!\n");
171 return -EINVAL; 305 return -EINVAL;
172 } 306 }
173 307
@@ -175,7 +309,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
175 u64 pba = le64_to_cpu(entries[i]); 309 u64 pba = le64_to_cpu(entries[i]);
176 310
177 if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { 311 if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
178 pr_err("gennvm: L2P data entry is out of bounds!\n"); 312 pr_err("gen: L2P data entry is out of bounds!\n");
179 return -EINVAL; 313 return -EINVAL;
180 } 314 }
181 315
@@ -200,16 +334,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
200 * block state. The block is assumed to be open. 334 * block state. The block is assumed to be open.
201 */ 335 */
202 list_move_tail(&blk->list, &lun->used_list); 336 list_move_tail(&blk->list, &lun->used_list);
203 blk->state = NVM_BLK_ST_OPEN; 337 blk->state = NVM_BLK_ST_TGT;
204 lun->vlun.nr_free_blocks--; 338 lun->vlun.nr_free_blocks--;
205 lun->vlun.nr_open_blocks++;
206 } 339 }
207 } 340 }
208 341
209 return 0; 342 return 0;
210} 343}
211 344
212static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) 345static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
213{ 346{
214 struct gen_lun *lun; 347 struct gen_lun *lun;
215 struct nvm_block *block; 348 struct nvm_block *block;
@@ -222,7 +355,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
222 if (!blks) 355 if (!blks)
223 return -ENOMEM; 356 return -ENOMEM;
224 357
225 gennvm_for_each_lun(gn, lun, lun_iter) { 358 gen_for_each_lun(gn, lun, lun_iter) {
226 lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) * 359 lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) *
227 dev->blks_per_lun); 360 dev->blks_per_lun);
228 if (!lun->vlun.blocks) { 361 if (!lun->vlun.blocks) {
@@ -256,20 +389,20 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
256 389
257 ret = nvm_get_bb_tbl(dev, ppa, blks); 390 ret = nvm_get_bb_tbl(dev, ppa, blks);
258 if (ret) 391 if (ret)
259 pr_err("gennvm: could not get BB table\n"); 392 pr_err("gen: could not get BB table\n");
260 393
261 ret = gennvm_block_bb(gn, ppa, blks, nr_blks); 394 ret = gen_block_bb(gn, ppa, blks, nr_blks);
262 if (ret) 395 if (ret)
263 pr_err("gennvm: BB table map failed\n"); 396 pr_err("gen: BB table map failed\n");
264 } 397 }
265 } 398 }
266 399
267 if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) { 400 if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) {
268 ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, 401 ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs,
269 gennvm_block_map, dev); 402 gen_block_map, dev);
270 if (ret) { 403 if (ret) {
271 pr_err("gennvm: could not read L2P table.\n"); 404 pr_err("gen: could not read L2P table.\n");
272 pr_warn("gennvm: default block initialization"); 405 pr_warn("gen: default block initialization");
273 } 406 }
274 } 407 }
275 408
@@ -277,67 +410,79 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
277 return 0; 410 return 0;
278} 411}
279 412
280static void gennvm_free(struct nvm_dev *dev) 413static void gen_free(struct nvm_dev *dev)
281{ 414{
282 gennvm_blocks_free(dev); 415 gen_blocks_free(dev);
283 gennvm_luns_free(dev); 416 gen_luns_free(dev);
284 kfree(dev->mp); 417 kfree(dev->mp);
285 dev->mp = NULL; 418 dev->mp = NULL;
286} 419}
287 420
288static int gennvm_register(struct nvm_dev *dev) 421static int gen_register(struct nvm_dev *dev)
289{ 422{
290 struct gen_nvm *gn; 423 struct gen_dev *gn;
291 int ret; 424 int ret;
292 425
293 if (!try_module_get(THIS_MODULE)) 426 if (!try_module_get(THIS_MODULE))
294 return -ENODEV; 427 return -ENODEV;
295 428
296 gn = kzalloc(sizeof(struct gen_nvm), GFP_KERNEL); 429 gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL);
297 if (!gn) 430 if (!gn)
298 return -ENOMEM; 431 return -ENOMEM;
299 432
300 gn->dev = dev; 433 gn->dev = dev;
301 gn->nr_luns = dev->nr_luns; 434 gn->nr_luns = dev->nr_luns;
302 INIT_LIST_HEAD(&gn->area_list); 435 INIT_LIST_HEAD(&gn->area_list);
436 mutex_init(&gn->lock);
437 INIT_LIST_HEAD(&gn->targets);
303 dev->mp = gn; 438 dev->mp = gn;
304 439
305 ret = gennvm_luns_init(dev, gn); 440 ret = gen_luns_init(dev, gn);
306 if (ret) { 441 if (ret) {
307 pr_err("gennvm: could not initialize luns\n"); 442 pr_err("gen: could not initialize luns\n");
308 goto err; 443 goto err;
309 } 444 }
310 445
311 ret = gennvm_blocks_init(dev, gn); 446 ret = gen_blocks_init(dev, gn);
312 if (ret) { 447 if (ret) {
313 pr_err("gennvm: could not initialize blocks\n"); 448 pr_err("gen: could not initialize blocks\n");
314 goto err; 449 goto err;
315 } 450 }
316 451
317 return 1; 452 return 1;
318err: 453err:
319 gennvm_free(dev); 454 gen_free(dev);
320 module_put(THIS_MODULE); 455 module_put(THIS_MODULE);
321 return ret; 456 return ret;
322} 457}
323 458
324static void gennvm_unregister(struct nvm_dev *dev) 459static void gen_unregister(struct nvm_dev *dev)
325{ 460{
326 gennvm_free(dev); 461 struct gen_dev *gn = dev->mp;
462 struct nvm_target *t, *tmp;
463
464 mutex_lock(&gn->lock);
465 list_for_each_entry_safe(t, tmp, &gn->targets, list) {
466 if (t->dev != dev)
467 continue;
468 __gen_remove_target(t);
469 }
470 mutex_unlock(&gn->lock);
471
472 gen_free(dev);
327 module_put(THIS_MODULE); 473 module_put(THIS_MODULE);
328} 474}
329 475
330static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, 476static struct nvm_block *gen_get_blk(struct nvm_dev *dev,
331 struct nvm_lun *vlun, unsigned long flags) 477 struct nvm_lun *vlun, unsigned long flags)
332{ 478{
333 struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); 479 struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
334 struct nvm_block *blk = NULL; 480 struct nvm_block *blk = NULL;
335 int is_gc = flags & NVM_IOTYPE_GC; 481 int is_gc = flags & NVM_IOTYPE_GC;
336 482
337 assert_spin_locked(&vlun->lock); 483 spin_lock(&vlun->lock);
338
339 if (list_empty(&lun->free_list)) { 484 if (list_empty(&lun->free_list)) {
340 pr_err_ratelimited("gennvm: lun %u have no free pages available", 485 pr_err_ratelimited("gen: lun %u have no free pages available",
341 lun->vlun.id); 486 lun->vlun.id);
342 goto out; 487 goto out;
343 } 488 }
@@ -346,88 +491,58 @@ static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev,
346 goto out; 491 goto out;
347 492
348 blk = list_first_entry(&lun->free_list, struct nvm_block, list); 493 blk = list_first_entry(&lun->free_list, struct nvm_block, list);
349 list_move_tail(&blk->list, &lun->used_list);
350 blk->state = NVM_BLK_ST_OPEN;
351 494
495 list_move_tail(&blk->list, &lun->used_list);
496 blk->state = NVM_BLK_ST_TGT;
352 lun->vlun.nr_free_blocks--; 497 lun->vlun.nr_free_blocks--;
353 lun->vlun.nr_open_blocks++;
354
355out: 498out:
356 return blk;
357}
358
359static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
360 struct nvm_lun *vlun, unsigned long flags)
361{
362 struct nvm_block *blk;
363
364 spin_lock(&vlun->lock);
365 blk = gennvm_get_blk_unlocked(dev, vlun, flags);
366 spin_unlock(&vlun->lock); 499 spin_unlock(&vlun->lock);
367 return blk; 500 return blk;
368} 501}
369 502
370static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) 503static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
371{ 504{
372 struct nvm_lun *vlun = blk->lun; 505 struct nvm_lun *vlun = blk->lun;
373 struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); 506 struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
374 507
375 assert_spin_locked(&vlun->lock); 508 spin_lock(&vlun->lock);
376 509 if (blk->state & NVM_BLK_ST_TGT) {
377 if (blk->state & NVM_BLK_ST_OPEN) {
378 list_move_tail(&blk->list, &lun->free_list);
379 lun->vlun.nr_open_blocks--;
380 lun->vlun.nr_free_blocks++;
381 blk->state = NVM_BLK_ST_FREE;
382 } else if (blk->state & NVM_BLK_ST_CLOSED) {
383 list_move_tail(&blk->list, &lun->free_list); 510 list_move_tail(&blk->list, &lun->free_list);
384 lun->vlun.nr_closed_blocks--;
385 lun->vlun.nr_free_blocks++; 511 lun->vlun.nr_free_blocks++;
386 blk->state = NVM_BLK_ST_FREE; 512 blk->state = NVM_BLK_ST_FREE;
387 } else if (blk->state & NVM_BLK_ST_BAD) { 513 } else if (blk->state & NVM_BLK_ST_BAD) {
388 list_move_tail(&blk->list, &lun->bb_list); 514 list_move_tail(&blk->list, &lun->bb_list);
389 lun->vlun.nr_bad_blocks++;
390 blk->state = NVM_BLK_ST_BAD; 515 blk->state = NVM_BLK_ST_BAD;
391 } else { 516 } else {
392 WARN_ON_ONCE(1); 517 WARN_ON_ONCE(1);
393 pr_err("gennvm: erroneous block type (%lu -> %u)\n", 518 pr_err("gen: erroneous block type (%lu -> %u)\n",
394 blk->id, blk->state); 519 blk->id, blk->state);
395 list_move_tail(&blk->list, &lun->bb_list); 520 list_move_tail(&blk->list, &lun->bb_list);
396 lun->vlun.nr_bad_blocks++;
397 blk->state = NVM_BLK_ST_BAD;
398 } 521 }
399}
400
401static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
402{
403 struct nvm_lun *vlun = blk->lun;
404
405 spin_lock(&vlun->lock);
406 gennvm_put_blk_unlocked(dev, blk);
407 spin_unlock(&vlun->lock); 522 spin_unlock(&vlun->lock);
408} 523}
409 524
410static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) 525static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
411{ 526{
412 struct gen_nvm *gn = dev->mp; 527 struct gen_dev *gn = dev->mp;
413 struct gen_lun *lun; 528 struct gen_lun *lun;
414 struct nvm_block *blk; 529 struct nvm_block *blk;
415 530
416 pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", 531 pr_debug("gen: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
417 ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type); 532 ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
418 533
419 if (unlikely(ppa.g.ch > dev->nr_chnls || 534 if (unlikely(ppa.g.ch > dev->nr_chnls ||
420 ppa.g.lun > dev->luns_per_chnl || 535 ppa.g.lun > dev->luns_per_chnl ||
421 ppa.g.blk > dev->blks_per_lun)) { 536 ppa.g.blk > dev->blks_per_lun)) {
422 WARN_ON_ONCE(1); 537 WARN_ON_ONCE(1);
423 pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u", 538 pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
424 ppa.g.ch, dev->nr_chnls, 539 ppa.g.ch, dev->nr_chnls,
425 ppa.g.lun, dev->luns_per_chnl, 540 ppa.g.lun, dev->luns_per_chnl,
426 ppa.g.blk, dev->blks_per_lun); 541 ppa.g.blk, dev->blks_per_lun);
427 return; 542 return;
428 } 543 }
429 544
430 lun = &gn->luns[ppa.g.lun * ppa.g.ch]; 545 lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
431 blk = &lun->vlun.blocks[ppa.g.blk]; 546 blk = &lun->vlun.blocks[ppa.g.blk];
432 547
433 /* will be moved to bb list on put_blk from target */ 548 /* will be moved to bb list on put_blk from target */
@@ -435,9 +550,9 @@ static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
435} 550}
436 551
437/* 552/*
438 * mark block bad in gennvm. It is expected that the target recovers separately 553 * mark block bad in gen. It is expected that the target recovers separately
439 */ 554 */
440static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) 555static void gen_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
441{ 556{
442 int bit = -1; 557 int bit = -1;
443 int max_secs = dev->ops->max_phys_sect; 558 int max_secs = dev->ops->max_phys_sect;
@@ -447,25 +562,25 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
447 562
448 /* look up blocks and mark them as bad */ 563 /* look up blocks and mark them as bad */
449 if (rqd->nr_ppas == 1) { 564 if (rqd->nr_ppas == 1) {
450 gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD); 565 gen_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
451 return; 566 return;
452 } 567 }
453 568
454 while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs) 569 while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs)
455 gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD); 570 gen_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
456} 571}
457 572
458static void gennvm_end_io(struct nvm_rq *rqd) 573static void gen_end_io(struct nvm_rq *rqd)
459{ 574{
460 struct nvm_tgt_instance *ins = rqd->ins; 575 struct nvm_tgt_instance *ins = rqd->ins;
461 576
462 if (rqd->error == NVM_RSP_ERR_FAILWRITE) 577 if (rqd->error == NVM_RSP_ERR_FAILWRITE)
463 gennvm_mark_blk_bad(rqd->dev, rqd); 578 gen_mark_blk_bad(rqd->dev, rqd);
464 579
465 ins->tt->end_io(rqd); 580 ins->tt->end_io(rqd);
466} 581}
467 582
468static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) 583static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
469{ 584{
470 if (!dev->ops->submit_io) 585 if (!dev->ops->submit_io)
471 return -ENODEV; 586 return -ENODEV;
@@ -474,11 +589,11 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
474 nvm_generic_to_addr_mode(dev, rqd); 589 nvm_generic_to_addr_mode(dev, rqd);
475 590
476 rqd->dev = dev; 591 rqd->dev = dev;
477 rqd->end_io = gennvm_end_io; 592 rqd->end_io = gen_end_io;
478 return dev->ops->submit_io(dev, rqd); 593 return dev->ops->submit_io(dev, rqd);
479} 594}
480 595
481static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, 596static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
482 unsigned long flags) 597 unsigned long flags)
483{ 598{
484 struct ppa_addr addr = block_to_ppa(dev, blk); 599 struct ppa_addr addr = block_to_ppa(dev, blk);
@@ -486,19 +601,19 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
486 return nvm_erase_ppa(dev, &addr, 1); 601 return nvm_erase_ppa(dev, &addr, 1);
487} 602}
488 603
489static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid) 604static int gen_reserve_lun(struct nvm_dev *dev, int lunid)
490{ 605{
491 return test_and_set_bit(lunid, dev->lun_map); 606 return test_and_set_bit(lunid, dev->lun_map);
492} 607}
493 608
494static void gennvm_release_lun(struct nvm_dev *dev, int lunid) 609static void gen_release_lun(struct nvm_dev *dev, int lunid)
495{ 610{
496 WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); 611 WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
497} 612}
498 613
499static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) 614static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
500{ 615{
501 struct gen_nvm *gn = dev->mp; 616 struct gen_dev *gn = dev->mp;
502 617
503 if (unlikely(lunid >= dev->nr_luns)) 618 if (unlikely(lunid >= dev->nr_luns))
504 return NULL; 619 return NULL;
@@ -506,66 +621,62 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
506 return &gn->luns[lunid].vlun; 621 return &gn->luns[lunid].vlun;
507} 622}
508 623
509static void gennvm_lun_info_print(struct nvm_dev *dev) 624static void gen_lun_info_print(struct nvm_dev *dev)
510{ 625{
511 struct gen_nvm *gn = dev->mp; 626 struct gen_dev *gn = dev->mp;
512 struct gen_lun *lun; 627 struct gen_lun *lun;
513 unsigned int i; 628 unsigned int i;
514 629
515 630
516 gennvm_for_each_lun(gn, lun, i) { 631 gen_for_each_lun(gn, lun, i) {
517 spin_lock(&lun->vlun.lock); 632 spin_lock(&lun->vlun.lock);
518 633
519 pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", 634 pr_info("%s: lun%8u\t%u\n", dev->name, i,
520 dev->name, i, 635 lun->vlun.nr_free_blocks);
521 lun->vlun.nr_free_blocks,
522 lun->vlun.nr_open_blocks,
523 lun->vlun.nr_closed_blocks,
524 lun->vlun.nr_bad_blocks);
525 636
526 spin_unlock(&lun->vlun.lock); 637 spin_unlock(&lun->vlun.lock);
527 } 638 }
528} 639}
529 640
530static struct nvmm_type gennvm = { 641static struct nvmm_type gen = {
531 .name = "gennvm", 642 .name = "gennvm",
532 .version = {0, 1, 0}, 643 .version = {0, 1, 0},
533 644
534 .register_mgr = gennvm_register, 645 .register_mgr = gen_register,
535 .unregister_mgr = gennvm_unregister, 646 .unregister_mgr = gen_unregister,
536 647
537 .get_blk_unlocked = gennvm_get_blk_unlocked, 648 .create_tgt = gen_create_tgt,
538 .put_blk_unlocked = gennvm_put_blk_unlocked, 649 .remove_tgt = gen_remove_tgt,
539 650
540 .get_blk = gennvm_get_blk, 651 .get_blk = gen_get_blk,
541 .put_blk = gennvm_put_blk, 652 .put_blk = gen_put_blk,
542 653
543 .submit_io = gennvm_submit_io, 654 .submit_io = gen_submit_io,
544 .erase_blk = gennvm_erase_blk, 655 .erase_blk = gen_erase_blk,
545 656
546 .mark_blk = gennvm_mark_blk, 657 .mark_blk = gen_mark_blk,
547 658
548 .get_lun = gennvm_get_lun, 659 .get_lun = gen_get_lun,
549 .reserve_lun = gennvm_reserve_lun, 660 .reserve_lun = gen_reserve_lun,
550 .release_lun = gennvm_release_lun, 661 .release_lun = gen_release_lun,
551 .lun_info_print = gennvm_lun_info_print, 662 .lun_info_print = gen_lun_info_print,
552 663
553 .get_area = gennvm_get_area, 664 .get_area = gen_get_area,
554 .put_area = gennvm_put_area, 665 .put_area = gen_put_area,
555 666
556}; 667};
557 668
558static int __init gennvm_module_init(void) 669static int __init gen_module_init(void)
559{ 670{
560 return nvm_register_mgr(&gennvm); 671 return nvm_register_mgr(&gen);
561} 672}
562 673
563static void gennvm_module_exit(void) 674static void gen_module_exit(void)
564{ 675{
565 nvm_unregister_mgr(&gennvm); 676 nvm_unregister_mgr(&gen);
566} 677}
567 678
568module_init(gennvm_module_init); 679module_init(gen_module_init);
569module_exit(gennvm_module_exit); 680module_exit(gen_module_exit);
570MODULE_LICENSE("GPL v2"); 681MODULE_LICENSE("GPL v2");
571MODULE_DESCRIPTION("Generic media manager for Open-Channel SSDs"); 682MODULE_DESCRIPTION("General media manager for Open-Channel SSDs");
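
One functional fix hides among the renames above: gen_mark_blk used to index the lun array with ppa.g.lun * ppa.g.ch, which maps every address on channel 0 or lun 0 to slot 0 and collides elsewhere. The new (dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun matches the channel-major layout that gen_luns_init builds (lun_id = i % luns_per_chnl, chnl_id = i / luns_per_chnl). A tiny sketch of that mapping and its inverse, with made-up geometry:

#include <stdio.h>

/* Channel-major flat index: all luns of channel 0 first, then channel 1, ... */
static int lun_index(int ch, int lun, int luns_per_chnl)
{
	return (luns_per_chnl * ch) + lun;
}

int main(void)
{
	int luns_per_chnl = 4, i;

	for (i = 0; i < 8; i++)		/* 2 channels x 4 luns */
		printf("i=%d -> ch=%d lun=%d -> back to %d\n",
		       i, i / luns_per_chnl, i % luns_per_chnl,
		       lun_index(i / luns_per_chnl, i % luns_per_chnl,
				 luns_per_chnl));
	return 0;
}
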
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
index 04d7c23cfc61..8ecfa817d21d 100644
--- a/drivers/lightnvm/gennvm.h
+++ b/drivers/lightnvm/gennvm.h
@@ -34,20 +34,24 @@ struct gen_lun {
34 */ 34 */
35}; 35};
36 36
37struct gen_nvm { 37struct gen_dev {
38 struct nvm_dev *dev; 38 struct nvm_dev *dev;
39 39
40 int nr_luns; 40 int nr_luns;
41 struct gen_lun *luns; 41 struct gen_lun *luns;
42 struct list_head area_list; 42 struct list_head area_list;
43
44 struct mutex lock;
45 struct list_head targets;
43}; 46};
44 47
45struct gennvm_area { 48struct gen_area {
46 struct list_head list; 49 struct list_head list;
47 sector_t begin; 50 sector_t begin;
48 sector_t end; /* end is excluded */ 51 sector_t end; /* end is excluded */
49}; 52};
50#define gennvm_for_each_lun(bm, lun, i) \ 53
54#define gen_for_each_lun(bm, lun, i) \
51 for ((i) = 0, lun = &(bm)->luns[0]; \ 55 for ((i) = 0, lun = &(bm)->luns[0]; \
52 (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) 56 (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
53 57
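
The new fields in gen_dev are the structural point of this series: each media manager now owns its target list and serializes lookup, create and remove with its own mutex instead of going through the global nvm_lock in core.c. The sketch below models a gen_find_target-style lookup in plain C with a pthread mutex; note that in gennvm.c the callers take gn->lock around the lookup, while the sketch folds the locking into the helper for brevity (all names here are stand-ins; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct target {
	char name[32];
	struct target *next;
};

struct gen_dev_sketch {
	pthread_mutex_t lock;
	struct target *targets;		/* singly linked, per-manager list */
};

/* Walk the per-manager list under its own lock and return a matching
 * target, or NULL when the name is unknown. */
static struct target *find_target(struct gen_dev_sketch *gn, const char *name)
{
	struct target *t, *found = NULL;

	pthread_mutex_lock(&gn->lock);
	for (t = gn->targets; t; t = t->next) {
		if (!strcmp(t->name, name)) {
			found = t;
			break;
		}
	}
	pthread_mutex_unlock(&gn->lock);
	return found;
}

int main(void)
{
	struct target a = { "rrpc0", NULL };
	struct gen_dev_sketch gn = { PTHREAD_MUTEX_INITIALIZER, &a };

	printf("rrpc0: %s\n", find_target(&gn, "rrpc0") ? "found" : "missing");
	printf("foo:   %s\n", find_target(&gn, "foo") ? "found" : "missing");
	return 0;
}
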
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index de86d72dcdf0..37fcaadbf80c 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -48,7 +48,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
48} 48}
49 49
50static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba, 50static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
51 unsigned len) 51 unsigned int len)
52{ 52{
53 sector_t i; 53 sector_t i;
54 54
@@ -96,10 +96,13 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
96 sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; 96 sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
97 struct nvm_rq *rqd; 97 struct nvm_rq *rqd;
98 98
99 do { 99 while (1) {
100 rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len); 100 rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
101 if (rqd)
102 break;
103
101 schedule(); 104 schedule();
102 } while (!rqd); 105 }
103 106
104 if (IS_ERR(rqd)) { 107 if (IS_ERR(rqd)) {
105 pr_err("rrpc: unable to acquire inflight IO\n"); 108 pr_err("rrpc: unable to acquire inflight IO\n");
@@ -172,39 +175,32 @@ static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr)
172} 175}
173 176
174/* requires lun->lock taken */ 177/* requires lun->lock taken */
175static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) 178static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
179 struct rrpc_block **cur_rblk)
176{ 180{
177 struct rrpc *rrpc = rlun->rrpc; 181 struct rrpc *rrpc = rlun->rrpc;
178 182
179 BUG_ON(!rblk); 183 if (*cur_rblk) {
180 184 spin_lock(&(*cur_rblk)->lock);
181 if (rlun->cur) { 185 WARN_ON(!block_is_full(rrpc, *cur_rblk));
182 spin_lock(&rlun->cur->lock); 186 spin_unlock(&(*cur_rblk)->lock);
183 WARN_ON(!block_is_full(rrpc, rlun->cur));
184 spin_unlock(&rlun->cur->lock);
185 } 187 }
186 rlun->cur = rblk; 188 *cur_rblk = new_rblk;
187} 189}
188 190
189static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, 191static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
190 unsigned long flags) 192 unsigned long flags)
191{ 193{
192 struct nvm_lun *lun = rlun->parent;
193 struct nvm_block *blk; 194 struct nvm_block *blk;
194 struct rrpc_block *rblk; 195 struct rrpc_block *rblk;
195 196
196 spin_lock(&lun->lock); 197 blk = nvm_get_blk(rrpc->dev, rlun->parent, flags);
197 blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags);
198 if (!blk) { 198 if (!blk) {
199 pr_err("nvm: rrpc: cannot get new block from media manager\n"); 199 pr_err("nvm: rrpc: cannot get new block from media manager\n");
200 spin_unlock(&lun->lock);
201 return NULL; 200 return NULL;
202 } 201 }
203 202
204 rblk = rrpc_get_rblk(rlun, blk->id); 203 rblk = rrpc_get_rblk(rlun, blk->id);
205 list_add_tail(&rblk->list, &rlun->open_list);
206 spin_unlock(&lun->lock);
207
208 blk->priv = rblk; 204 blk->priv = rblk;
209 bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk); 205 bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk);
210 rblk->next_page = 0; 206 rblk->next_page = 0;
@@ -216,13 +212,7 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
216 212
217static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) 213static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
218{ 214{
219 struct rrpc_lun *rlun = rblk->rlun; 215 nvm_put_blk(rrpc->dev, rblk->parent);
220 struct nvm_lun *lun = rlun->parent;
221
222 spin_lock(&lun->lock);
223 nvm_put_blk_unlocked(rrpc->dev, rblk->parent);
224 list_del(&rblk->list);
225 spin_unlock(&lun->lock);
226} 216}
227 217
228static void rrpc_put_blks(struct rrpc *rrpc) 218static void rrpc_put_blks(struct rrpc *rrpc)
@@ -508,21 +498,11 @@ static void rrpc_gc_queue(struct work_struct *work)
508 struct rrpc *rrpc = gcb->rrpc; 498 struct rrpc *rrpc = gcb->rrpc;
509 struct rrpc_block *rblk = gcb->rblk; 499 struct rrpc_block *rblk = gcb->rblk;
510 struct rrpc_lun *rlun = rblk->rlun; 500 struct rrpc_lun *rlun = rblk->rlun;
511 struct nvm_lun *lun = rblk->parent->lun;
512 struct nvm_block *blk = rblk->parent;
513 501
514 spin_lock(&rlun->lock); 502 spin_lock(&rlun->lock);
515 list_add_tail(&rblk->prio, &rlun->prio_list); 503 list_add_tail(&rblk->prio, &rlun->prio_list);
516 spin_unlock(&rlun->lock); 504 spin_unlock(&rlun->lock);
517 505
518 spin_lock(&lun->lock);
519 lun->nr_open_blocks--;
520 lun->nr_closed_blocks++;
521 blk->state &= ~NVM_BLK_ST_OPEN;
522 blk->state |= NVM_BLK_ST_CLOSED;
523 list_move_tail(&rblk->list, &rlun->closed_list);
524 spin_unlock(&lun->lock);
525
526 mempool_free(gcb, rrpc->gcb_pool); 506 mempool_free(gcb, rrpc->gcb_pool);
527 pr_debug("nvm: block '%lu' is full, allow GC (sched)\n", 507 pr_debug("nvm: block '%lu' is full, allow GC (sched)\n",
528 rblk->parent->id); 508 rblk->parent->id);
@@ -596,21 +576,20 @@ out:
596 return addr; 576 return addr;
597} 577}
598 578
599/* Simple round-robin Logical to physical address translation. 579/* Map logical address to a physical page. The mapping implements a round robin
600 * 580 * approach and allocates a page from the next lun available.
601 * Retrieve the mapping using the active append point. Then update the ap for
602 * the next write to the disk.
603 * 581 *
604 * Returns rrpc_addr with the physical address and block. Remember to return to 582 * Returns rrpc_addr with the physical address and block. Returns NULL if no
605 * rrpc->addr_cache when request is finished. 583 * blocks in the next rlun are available.
606 */ 584 */
607static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr, 585static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
608 int is_gc) 586 int is_gc)
609{ 587{
610 struct rrpc_lun *rlun; 588 struct rrpc_lun *rlun;
611 struct rrpc_block *rblk; 589 struct rrpc_block *rblk, **cur_rblk;
612 struct nvm_lun *lun; 590 struct nvm_lun *lun;
613 u64 paddr; 591 u64 paddr;
592 int gc_force = 0;
614 593
615 rlun = rrpc_get_lun_rr(rrpc, is_gc); 594 rlun = rrpc_get_lun_rr(rrpc, is_gc);
616 lun = rlun->parent; 595 lun = rlun->parent;
@@ -618,41 +597,65 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
618 if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4) 597 if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4)
619 return NULL; 598 return NULL;
620 599
621 spin_lock(&rlun->lock); 600 /*
601 * page allocation steps:
602 * 1. Try to allocate new page from current rblk
603 * 2a. If succeed, proceed to map it in and return
604 * 2b. If fail, first try to allocate a new block from media manger,
605 * and then retry step 1. Retry until the normal block pool is
606 * exhausted.
607 * 3. If exhausted, and garbage collector is requesting the block,
608 * go to the reserved block and retry step 1.
609 * In the case that this fails as well, or it is not GC
610 * requesting, report not able to retrieve a block and let the
611 * caller handle further processing.
612 */
622 613
614 spin_lock(&rlun->lock);
615 cur_rblk = &rlun->cur;
623 rblk = rlun->cur; 616 rblk = rlun->cur;
624retry: 617retry:
625 paddr = rrpc_alloc_addr(rrpc, rblk); 618 paddr = rrpc_alloc_addr(rrpc, rblk);
626 619
627 if (paddr == ADDR_EMPTY) { 620 if (paddr != ADDR_EMPTY)
628 rblk = rrpc_get_blk(rrpc, rlun, 0); 621 goto done;
629 if (rblk) {
630 rrpc_set_lun_cur(rlun, rblk);
631 goto retry;
632 }
633 622
634 if (is_gc) { 623 if (!list_empty(&rlun->wblk_list)) {
635 /* retry from emergency gc block */ 624new_blk:
636 paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); 625 rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block,
637 if (paddr == ADDR_EMPTY) { 626 prio);
638 rblk = rrpc_get_blk(rrpc, rlun, 1); 627 rrpc_set_lun_cur(rlun, rblk, cur_rblk);
639 if (!rblk) { 628 list_del(&rblk->prio);
640 pr_err("rrpc: no more blocks"); 629 goto retry;
641 goto err; 630 }
642 } 631 spin_unlock(&rlun->lock);
643 632
644 rlun->gc_cur = rblk; 633 rblk = rrpc_get_blk(rrpc, rlun, gc_force);
645 paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); 634 if (rblk) {
646 } 635 spin_lock(&rlun->lock);
647 rblk = rlun->gc_cur; 636 list_add_tail(&rblk->prio, &rlun->wblk_list);
648 } 637 /*
638 * another thread might already have added a new block,
639 * Therefore, make sure that one is used, instead of the
640 * one just added.
641 */
642 goto new_blk;
649 } 643 }
650 644
645 if (unlikely(is_gc) && !gc_force) {
646 /* retry from emergency gc block */
647 cur_rblk = &rlun->gc_cur;
648 rblk = rlun->gc_cur;
649 gc_force = 1;
650 spin_lock(&rlun->lock);
651 goto retry;
652 }
653
654 pr_err("rrpc: failed to allocate new block\n");
655 return NULL;
656done:
651 spin_unlock(&rlun->lock); 657 spin_unlock(&rlun->lock);
652 return rrpc_update_map(rrpc, laddr, rblk, paddr); 658 return rrpc_update_map(rrpc, laddr, rblk, paddr);
653err:
654 spin_unlock(&rlun->lock);
655 return NULL;
656} 659}
657 660
658static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk) 661static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
@@ -850,14 +853,14 @@ static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
850 return NVM_IO_ERR; 853 return NVM_IO_ERR;
851 } 854 }
852 855
853 if (bio_rw(bio) == WRITE) 856 if (bio_op(bio) == REQ_OP_WRITE)
854 return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags, 857 return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
855 npages); 858 npages);
856 859
857 return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages); 860 return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
858 } 861 }
859 862
860 if (bio_rw(bio) == WRITE) 863 if (bio_op(bio) == REQ_OP_WRITE)
861 return rrpc_write_rq(rrpc, bio, rqd, flags); 864 return rrpc_write_rq(rrpc, bio, rqd, flags);
862 865
863 return rrpc_read_rq(rrpc, bio, rqd, flags); 866 return rrpc_read_rq(rrpc, bio, rqd, flags);
@@ -1196,8 +1199,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
1196 1199
1197 rlun->rrpc = rrpc; 1200 rlun->rrpc = rrpc;
1198 INIT_LIST_HEAD(&rlun->prio_list); 1201 INIT_LIST_HEAD(&rlun->prio_list);
1199 INIT_LIST_HEAD(&rlun->open_list); 1202 INIT_LIST_HEAD(&rlun->wblk_list);
1200 INIT_LIST_HEAD(&rlun->closed_list);
1201 1203
1202 INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); 1204 INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
1203 spin_lock_init(&rlun->lock); 1205 spin_lock_init(&rlun->lock);
@@ -1338,14 +1340,13 @@ static int rrpc_luns_configure(struct rrpc *rrpc)
1338 rblk = rrpc_get_blk(rrpc, rlun, 0); 1340 rblk = rrpc_get_blk(rrpc, rlun, 0);
1339 if (!rblk) 1341 if (!rblk)
1340 goto err; 1342 goto err;
1341 1343 rrpc_set_lun_cur(rlun, rblk, &rlun->cur);
1342 rrpc_set_lun_cur(rlun, rblk);
1343 1344
1344 /* Emergency gc block */ 1345 /* Emergency gc block */
1345 rblk = rrpc_get_blk(rrpc, rlun, 1); 1346 rblk = rrpc_get_blk(rrpc, rlun, 1);
1346 if (!rblk) 1347 if (!rblk)
1347 goto err; 1348 goto err;
1348 rlun->gc_cur = rblk; 1349 rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur);
1349 } 1350 }
1350 1351
1351 return 0; 1352 return 0;
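
The rewritten rrpc_map_page follows the numbered steps of its comment: allocate from the current block; when it is full, reuse a block already queued on wblk_list; otherwise drop the lun lock, ask the media manager for a fresh block, queue it and retry; and only a GC caller gets one more pass that may dip into the emergency reserve. The standalone sketch below compresses that control flow; every helper is a stub invented for the illustration, and the locking and wblk_list bookkeeping are omitted:

#include <stdio.h>

#define ADDR_EMPTY	(-1)

static int space_left = 2;	/* pages left in the "current" block */
static int free_blocks = 1;	/* blocks the "media manager" can still hand out */

static int alloc_addr(void)	/* stub: next page in the current block */
{
	if (space_left <= 0)
		return ADDR_EMPTY;
	space_left--;
	return 100 + space_left;	/* arbitrary "physical address" */
}

static int get_new_block(int gc_force)	/* stub: 1 if a block was handed out */
{
	if (free_blocks > 0) {
		free_blocks--;
		return 1;
	}
	return gc_force;	/* the GC reserve never runs dry in this toy */
}

static int map_page(int is_gc)
{
	int gc_force = 0, paddr;

retry:
	paddr = alloc_addr();
	if (paddr != ADDR_EMPTY)
		return paddr;		/* steps 1/2a: the current block had room */

	if (get_new_block(gc_force)) {	/* step 2b: refill the current block, retry */
		space_left = 2;
		goto retry;
	}

	if (is_gc && !gc_force) {	/* step 3: GC may fall back to the reserve */
		gc_force = 1;
		goto retry;
	}

	return ADDR_EMPTY;		/* caller must handle the failure */
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		printf("map %d -> %d\n", i, map_page(0));
	printf("gc map -> %d\n", map_page(1));
	return 0;
}
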
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 87e84b5fc1cc..5e87d52cb983 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -56,7 +56,6 @@ struct rrpc_block {
56 struct nvm_block *parent; 56 struct nvm_block *parent;
57 struct rrpc_lun *rlun; 57 struct rrpc_lun *rlun;
58 struct list_head prio; 58 struct list_head prio;
59 struct list_head list;
60 59
61#define MAX_INVALID_PAGES_STORAGE 8 60#define MAX_INVALID_PAGES_STORAGE 8
62 /* Bitmap for invalid page intries */ 61 /* Bitmap for invalid page intries */
@@ -77,13 +76,7 @@ struct rrpc_lun {
77 struct rrpc_block *blocks; /* Reference to block allocation */ 76 struct rrpc_block *blocks; /* Reference to block allocation */
78 77
79 struct list_head prio_list; /* Blocks that may be GC'ed */ 78 struct list_head prio_list; /* Blocks that may be GC'ed */
80 struct list_head open_list; /* In-use open blocks. These are blocks 79 struct list_head wblk_list; /* Queued blocks to be written to */
81 * that can be both written to and read
82 * from
83 */
84 struct list_head closed_list; /* In-use closed blocks. These are
85 * blocks that can _only_ be read from
86 */
87 80
88 struct work_struct ws_gc; 81 struct work_struct ws_gc;
89 82
@@ -188,7 +181,7 @@ static inline int request_intersects(struct rrpc_inflight_rq *r,
188} 181}
189 182
190static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, 183static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
191 unsigned pages, struct rrpc_inflight_rq *r) 184 unsigned int pages, struct rrpc_inflight_rq *r)
192{ 185{
193 sector_t laddr_end = laddr + pages - 1; 186 sector_t laddr_end = laddr + pages - 1;
194 struct rrpc_inflight_rq *rtmp; 187 struct rrpc_inflight_rq *rtmp;
@@ -213,7 +206,7 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
213} 206}
214 207
215static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, 208static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
216 unsigned pages, 209 unsigned int pages,
217 struct rrpc_inflight_rq *r) 210 struct rrpc_inflight_rq *r)
218{ 211{
219 BUG_ON((laddr + pages) > rrpc->nr_sects); 212 BUG_ON((laddr + pages) > rrpc->nr_sects);
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index 994697ac786e..a75bd28aaca3 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -39,7 +39,8 @@ static inline int scan_ppa_idx(int row, int blkid)
39 return (row * MAX_BLKS_PR_SYSBLK) + blkid; 39 return (row * MAX_BLKS_PR_SYSBLK) + blkid;
40} 40}
41 41
42void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) 42static void nvm_sysblk_to_cpu(struct nvm_sb_info *info,
43 struct nvm_system_block *sb)
43{ 44{
44 info->seqnr = be32_to_cpu(sb->seqnr); 45 info->seqnr = be32_to_cpu(sb->seqnr);
45 info->erase_cnt = be32_to_cpu(sb->erase_cnt); 46 info->erase_cnt = be32_to_cpu(sb->erase_cnt);
@@ -48,7 +49,8 @@ void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb)
48 info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); 49 info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
49} 50}
50 51
51void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) 52static void nvm_cpu_to_sysblk(struct nvm_system_block *sb,
53 struct nvm_sb_info *info)
52{ 54{
53 sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); 55 sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
54 sb->seqnr = cpu_to_be32(info->seqnr); 56 sb->seqnr = cpu_to_be32(info->seqnr);
@@ -86,7 +88,7 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
86 return nr_rows; 88 return nr_rows;
87} 89}
88 90
89void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, 91static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
90 struct ppa_addr *sysblk_ppas) 92 struct ppa_addr *sysblk_ppas)
91{ 93{
92 memset(s, 0, sizeof(struct sysblk_scan)); 94 memset(s, 0, sizeof(struct sysblk_scan));
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 9eaf1d6e8302..864e673aec39 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -112,7 +112,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
112EXPORT_SYMBOL(closure_wait); 112EXPORT_SYMBOL(closure_wait);
113 113
114/** 114/**
115 * closure_sync - sleep until a closure a closure has nothing left to wait on 115 * closure_sync - sleep until a closure has nothing left to wait on
116 * 116 *
117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
118 * the last refcount. 118 * the last refcount.
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 782cc2c8a185..9b2fe2d3e3a9 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -31,7 +31,8 @@
31 * passing it, as you might expect, the function to run when nothing is pending 31 * passing it, as you might expect, the function to run when nothing is pending
32 * and the workqueue to run that function out of. 32 * and the workqueue to run that function out of.
33 * 33 *
34 * continue_at() also, critically, is a macro that returns the calling function. 34 * continue_at() also, critically, requires a 'return' immediately following the
35 * location where this macro is referenced, to return to the calling function.
35 * There's good reason for this. 36 * There's good reason for this.
36 * 37 *
37 * To use safely closures asynchronously, they must always have a refcount while 38 * To use safely closures asynchronously, they must always have a refcount while
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fd885cc2afad..e97b0acf7b8d 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -25,7 +25,6 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
25 struct bio *bio = &b->bio; 25 struct bio *bio = &b->bio;
26 26
27 bio_init(bio); 27 bio_init(bio);
28 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
29 bio->bi_max_vecs = bucket_pages(c); 28 bio->bi_max_vecs = bucket_pages(c);
30 bio->bi_io_vec = bio->bi_inline_vecs; 29 bio->bi_io_vec = bio->bi_inline_vecs;
31 30
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c944daf75dd0..88ef6d14cce3 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -134,7 +134,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
134 case BCACHE_SB_VERSION_CDEV: 134 case BCACHE_SB_VERSION_CDEV:
135 case BCACHE_SB_VERSION_CDEV_WITH_UUID: 135 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
136 sb->nbuckets = le64_to_cpu(s->nbuckets); 136 sb->nbuckets = le64_to_cpu(s->nbuckets);
137 sb->block_size = le16_to_cpu(s->block_size);
138 sb->bucket_size = le16_to_cpu(s->bucket_size); 137 sb->bucket_size = le16_to_cpu(s->bucket_size);
139 138
140 sb->nr_in_set = le16_to_cpu(s->nr_in_set); 139 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
@@ -1520,7 +1519,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1520 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1519 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1521 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1520 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1522 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1521 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1523 !(c->moving_gc_wq = create_workqueue("bcache_gc")) || 1522 !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1523 WQ_MEM_RECLAIM, 0)) ||
1524 bch_journal_alloc(c) || 1524 bch_journal_alloc(c) ||
1525 bch_btree_cache_alloc(c) || 1525 bch_btree_cache_alloc(c) ||
1526 bch_open_buckets_alloc(c) || 1526 bch_open_buckets_alloc(c) ||
@@ -1805,7 +1805,7 @@ void bch_cache_release(struct kobject *kobj)
1805 module_put(THIS_MODULE); 1805 module_put(THIS_MODULE);
1806} 1806}
1807 1807
1808static int cache_alloc(struct cache_sb *sb, struct cache *ca) 1808static int cache_alloc(struct cache *ca)
1809{ 1809{
1810 size_t free; 1810 size_t free;
1811 struct bucket *b; 1811 struct bucket *b;
@@ -1860,7 +1860,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
1860 if (blk_queue_discard(bdev_get_queue(ca->bdev))) 1860 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
1861 ca->discard = CACHE_DISCARD(&ca->sb); 1861 ca->discard = CACHE_DISCARD(&ca->sb);
1862 1862
1863 ret = cache_alloc(sb, ca); 1863 ret = cache_alloc(ca);
1864 if (ret != 0) 1864 if (ret != 0)
1865 goto err; 1865 goto err;
1866 1866
@@ -2099,7 +2099,7 @@ static int __init bcache_init(void)
2099 return bcache_major; 2099 return bcache_major;
2100 } 2100 }
2101 2101
2102 if (!(bcache_wq = create_workqueue("bcache")) || 2102 if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
2103 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2103 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
2104 sysfs_create_files(bcache_kobj, files) || 2104 sysfs_create_files(bcache_kobj, files) ||
2105 bch_request_init() || 2105 bch_request_init() ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9f5f460c0e92..dac55b254a09 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -528,7 +528,7 @@ static void read_callback(unsigned long error, void *context)
528 DMWARN_LIMIT("Read failure on mirror device %s. " 528 DMWARN_LIMIT("Read failure on mirror device %s. "
529 "Trying alternative device.", 529 "Trying alternative device.",
530 m->dev->name); 530 m->dev->name);
531 queue_bio(m->ms, bio, bio_rw(bio)); 531 queue_bio(m->ms, bio, bio_data_dir(bio));
532 return; 532 return;
533 } 533 }
534 534
@@ -1193,7 +1193,7 @@ static void mirror_dtr(struct dm_target *ti)
1193 */ 1193 */
1194static int mirror_map(struct dm_target *ti, struct bio *bio) 1194static int mirror_map(struct dm_target *ti, struct bio *bio)
1195{ 1195{
1196 int r, rw = bio_rw(bio); 1196 int r, rw = bio_data_dir(bio);
1197 struct mirror *m; 1197 struct mirror *m;
1198 struct mirror_set *ms = ti->private; 1198 struct mirror_set *ms = ti->private;
1199 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1199 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
@@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1217 * If region is not in-sync queue the bio. 1217 * If region is not in-sync queue the bio.
1218 */ 1218 */
1219 if (!r || (r == -EWOULDBLOCK)) { 1219 if (!r || (r == -EWOULDBLOCK)) {
1220 if (rw == READA) 1220 if (bio->bi_rw & REQ_RAHEAD)
1221 return -EWOULDBLOCK; 1221 return -EWOULDBLOCK;
1222 1222
1223 queue_bio(ms, bio, rw); 1223 queue_bio(ms, bio, rw);
@@ -1242,7 +1242,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1242 1242
1243static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) 1243static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1244{ 1244{
1245 int rw = bio_rw(bio); 1245 int rw = bio_data_dir(bio);
1246 struct mirror_set *ms = (struct mirror_set *) ti->private; 1246 struct mirror_set *ms = (struct mirror_set *) ti->private;
1247 struct mirror *m = NULL; 1247 struct mirror *m = NULL;
1248 struct dm_bio_details *bd = NULL; 1248 struct dm_bio_details *bd = NULL;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 69ab1ff5f5c9..cc2f14b42ba4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1696,7 +1696,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1696 * to copy an exception */ 1696 * to copy an exception */
1697 down_write(&s->lock); 1697 down_write(&s->lock);
1698 1698
1699 if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_rw(bio) == WRITE)) { 1699 if (!s->valid || (unlikely(s->snapshot_overflowed) &&
1700 bio_data_dir(bio) == WRITE)) {
1700 r = -EIO; 1701 r = -EIO;
1701 goto out_unlock; 1702 goto out_unlock;
1702 } 1703 }
@@ -1713,7 +1714,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1713 * flags so we should only get this if we are 1714 * flags so we should only get this if we are
1714 * writeable. 1715 * writeable.
1715 */ 1716 */
1716 if (bio_rw(bio) == WRITE) { 1717 if (bio_data_dir(bio) == WRITE) {
1717 pe = __lookup_pending_exception(s, chunk); 1718 pe = __lookup_pending_exception(s, chunk);
1718 if (!pe) { 1719 if (!pe) {
1719 up_write(&s->lock); 1720 up_write(&s->lock);
@@ -1819,7 +1820,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1819 e = dm_lookup_exception(&s->complete, chunk); 1820 e = dm_lookup_exception(&s->complete, chunk);
1820 if (e) { 1821 if (e) {
1821 /* Queue writes overlapping with chunks being merged */ 1822 /* Queue writes overlapping with chunks being merged */
1822 if (bio_rw(bio) == WRITE && 1823 if (bio_data_dir(bio) == WRITE &&
1823 chunk >= s->first_merging_chunk && 1824 chunk >= s->first_merging_chunk &&
1824 chunk < (s->first_merging_chunk + 1825 chunk < (s->first_merging_chunk +
1825 s->num_merging_chunks)) { 1826 s->num_merging_chunks)) {
@@ -1831,7 +1832,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1831 1832
1832 remap_exception(s, e, bio, chunk); 1833 remap_exception(s, e, bio, chunk);
1833 1834
1834 if (bio_rw(bio) == WRITE) 1835 if (bio_data_dir(bio) == WRITE)
1835 track_chunk(s, bio, chunk); 1836 track_chunk(s, bio, chunk);
1836 goto out_unlock; 1837 goto out_unlock;
1837 } 1838 }
@@ -1839,7 +1840,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1839redirect_to_origin: 1840redirect_to_origin:
1840 bio->bi_bdev = s->origin->bdev; 1841 bio->bi_bdev = s->origin->bdev;
1841 1842
1842 if (bio_rw(bio) == WRITE) { 1843 if (bio_data_dir(bio) == WRITE) {
1843 up_write(&s->lock); 1844 up_write(&s->lock);
1844 return do_origin(s->origin, bio); 1845 return do_origin(s->origin, bio);
1845 } 1846 }
@@ -2288,7 +2289,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
2288 if (unlikely(bio->bi_rw & REQ_PREFLUSH)) 2289 if (unlikely(bio->bi_rw & REQ_PREFLUSH))
2289 return DM_MAPIO_REMAPPED; 2290 return DM_MAPIO_REMAPPED;
2290 2291
2291 if (bio_rw(bio) != WRITE) 2292 if (bio_data_dir(bio) != WRITE)
2292 return DM_MAPIO_REMAPPED; 2293 return DM_MAPIO_REMAPPED;
2293 2294
2294 available_sectors = o->split_boundary - 2295 available_sectors = o->split_boundary -
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 766bc93006e6..618b8752dcf1 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -35,16 +35,19 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
35 */ 35 */
36static int zero_map(struct dm_target *ti, struct bio *bio) 36static int zero_map(struct dm_target *ti, struct bio *bio)
37{ 37{
38 switch(bio_rw(bio)) { 38 switch (bio_op(bio)) {
39 case READ: 39 case REQ_OP_READ:
40 if (bio->bi_rw & REQ_RAHEAD) {
41 /* readahead of null bytes only wastes buffer cache */
42 return -EIO;
43 }
40 zero_fill_bio(bio); 44 zero_fill_bio(bio);
41 break; 45 break;
42 case READA: 46 case REQ_OP_WRITE:
43 /* readahead of null bytes only wastes buffer cache */
44 return -EIO;
45 case WRITE:
46 /* writes get silently dropped */ 47 /* writes get silently dropped */
47 break; 48 break;
49 default:
50 return -EIO;
48 } 51 }
49 52
50 bio_endio(bio); 53 bio_endio(bio);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index aba7ed9abb3a..812fd5984eea 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1833,7 +1833,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1833 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1833 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1834 dm_put_live_table(md, srcu_idx); 1834 dm_put_live_table(md, srcu_idx);
1835 1835
1836 if (bio_rw(bio) != READA) 1836 if (!(bio->bi_rw & REQ_RAHEAD))
1837 queue_io(md, bio); 1837 queue_io(md, bio);
1838 else 1838 else
1839 bio_io_error(bio); 1839 bio_io_error(bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 10e53cd6a995..4e6da4497553 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1058,7 +1058,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
1058 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1058 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1059 const unsigned long do_flush_fua = (bio->bi_rw & 1059 const unsigned long do_flush_fua = (bio->bi_rw &
1060 (REQ_PREFLUSH | REQ_FUA)); 1060 (REQ_PREFLUSH | REQ_FUA));
1061 const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
1062 struct md_rdev *blocked_rdev; 1061 struct md_rdev *blocked_rdev;
1063 struct blk_plug_cb *cb; 1062 struct blk_plug_cb *cb;
1064 struct raid1_plug_cb *plug = NULL; 1063 struct raid1_plug_cb *plug = NULL;
@@ -1106,7 +1105,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
1106 bitmap = mddev->bitmap; 1105 bitmap = mddev->bitmap;
1107 1106
1108 /* 1107 /*
1109 * make_request() can abort the operation when READA is being 1108 * make_request() can abort the operation when read-ahead is being
1110 * used and no empty request is available. 1109 * used and no empty request is available.
1111 * 1110 *
1112 */ 1111 */
@@ -1376,7 +1375,7 @@ read_again:
1376 conf->mirrors[i].rdev->data_offset); 1375 conf->mirrors[i].rdev->data_offset);
1377 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1376 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1378 mbio->bi_end_io = raid1_end_write_request; 1377 mbio->bi_end_io = raid1_end_write_request;
1379 bio_set_op_attrs(mbio, op, do_flush_fua | do_sync | do_sec); 1378 bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
1380 mbio->bi_private = r1_bio; 1379 mbio->bi_private = r1_bio;
1381 1380
1382 atomic_inc(&r1_bio->remaining); 1381 atomic_inc(&r1_bio->remaining);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 245640b50153..26ae74fd0d01 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1062,7 +1062,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
1062 const int rw = bio_data_dir(bio); 1062 const int rw = bio_data_dir(bio);
1063 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1063 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1064 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1064 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1065 const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
1066 unsigned long flags; 1065 unsigned long flags;
1067 struct md_rdev *blocked_rdev; 1066 struct md_rdev *blocked_rdev;
1068 struct blk_plug_cb *cb; 1067 struct blk_plug_cb *cb;
@@ -1362,7 +1361,7 @@ retry_write:
1362 rdev)); 1361 rdev));
1363 mbio->bi_bdev = rdev->bdev; 1362 mbio->bi_bdev = rdev->bdev;
1364 mbio->bi_end_io = raid10_end_write_request; 1363 mbio->bi_end_io = raid10_end_write_request;
1365 bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); 1364 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1366 mbio->bi_private = r10_bio; 1365 mbio->bi_private = r10_bio;
1367 1366
1368 atomic_inc(&r10_bio->remaining); 1367 atomic_inc(&r10_bio->remaining);
@@ -1404,7 +1403,7 @@ retry_write:
1404 r10_bio, rdev)); 1403 r10_bio, rdev));
1405 mbio->bi_bdev = rdev->bdev; 1404 mbio->bi_bdev = rdev->bdev;
1406 mbio->bi_end_io = raid10_end_write_request; 1405 mbio->bi_end_io = raid10_end_write_request;
1407 bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); 1406 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1408 mbio->bi_private = r10_bio; 1407 mbio->bi_private = r10_bio;
1409 1408
1410 atomic_inc(&r10_bio->remaining); 1409 atomic_inc(&r10_bio->remaining);
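The raid1 and raid10 hunks above drop the REQ_SECURE/do_sec bits from bio_set_op_attrs() because this series turns secure erase into a request operation of its own (REQ_OP_SECURE_ERASE) rather than a modifier flag on a discard, so there is nothing left to OR into the write flags; the operation is already carried by bio_op(). A driver that distinguishes the two now keys off the op, roughly (sketch, illustrative names):

	#include <linux/bio.h>
	#include <linux/blk_types.h>

	static void example_handle_bio(struct bio *bio)
	{
		switch (bio_op(bio)) {
		case REQ_OP_DISCARD:
			/* plain discard */
			break;
		case REQ_OP_SECURE_ERASE:
			/* secure erase is its own operation now */
			break;
		default:
			/* reads and writes handled elsewhere */
			break;
		}
	}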
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7aacf5b55e15..6953d78297b0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5233,7 +5233,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5233 (unsigned long long)logical_sector); 5233 (unsigned long long)logical_sector);
5234 5234
5235 sh = raid5_get_active_stripe(conf, new_sector, previous, 5235 sh = raid5_get_active_stripe(conf, new_sector, previous,
5236 (bi->bi_rw&RWA_MASK), 0); 5236 (bi->bi_rw & REQ_RAHEAD), 0);
5237 if (sh) { 5237 if (sh) {
5238 if (unlikely(previous)) { 5238 if (unlikely(previous)) {
5239 /* expansion might have moved on while waiting for a 5239 /* expansion might have moved on while waiting for a
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 3cd68152ddf8..40bb8ae5853c 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2002,8 +2002,7 @@ static int msb_bd_getgeo(struct block_device *bdev,
2002 2002
2003static int msb_prepare_req(struct request_queue *q, struct request *req) 2003static int msb_prepare_req(struct request_queue *q, struct request *req)
2004{ 2004{
2005 if (req->cmd_type != REQ_TYPE_FS && 2005 if (req->cmd_type != REQ_TYPE_FS) {
2006 req->cmd_type != REQ_TYPE_BLOCK_PC) {
2007 blk_dump_rq_flags(req, "MS unsupported request"); 2006 blk_dump_rq_flags(req, "MS unsupported request");
2008 return BLKPREP_KILL; 2007 return BLKPREP_KILL;
2009 } 2008 }
@@ -2146,7 +2145,6 @@ static int msb_init_disk(struct memstick_dev *card)
2146 msb->disk->fops = &msb_bdops; 2145 msb->disk->fops = &msb_bdops;
2147 msb->disk->private_data = msb; 2146 msb->disk->private_data = msb;
2148 msb->disk->queue = msb->queue; 2147 msb->disk->queue = msb->queue;
2149 msb->disk->driverfs_dev = &card->dev;
2150 msb->disk->flags |= GENHD_FL_EXT_DEVT; 2148 msb->disk->flags |= GENHD_FL_EXT_DEVT;
2151 2149
2152 capacity = msb->pages_in_block * msb->logical_block_count; 2150 capacity = msb->pages_in_block * msb->logical_block_count;
@@ -2163,7 +2161,7 @@ static int msb_init_disk(struct memstick_dev *card)
2163 set_disk_ro(msb->disk, 1); 2161 set_disk_ro(msb->disk, 1);
2164 2162
2165 msb_start(card); 2163 msb_start(card);
2166 add_disk(msb->disk); 2164 device_add_disk(&card->dev, msb->disk);
2167 dbg("Disk added"); 2165 dbg("Disk added");
2168 return 0; 2166 return 0;
2169 2167
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 0fb27d338811..c1472275fe57 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -829,8 +829,7 @@ static void mspro_block_start(struct memstick_dev *card)
829 829
830static int mspro_block_prepare_req(struct request_queue *q, struct request *req) 830static int mspro_block_prepare_req(struct request_queue *q, struct request *req)
831{ 831{
832 if (req->cmd_type != REQ_TYPE_FS && 832 if (req->cmd_type != REQ_TYPE_FS) {
833 req->cmd_type != REQ_TYPE_BLOCK_PC) {
834 blk_dump_rq_flags(req, "MSPro unsupported request"); 833 blk_dump_rq_flags(req, "MSPro unsupported request");
835 return BLKPREP_KILL; 834 return BLKPREP_KILL;
836 } 835 }
@@ -1243,7 +1242,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
1243 msb->usage_count = 1; 1242 msb->usage_count = 1;
1244 msb->disk->private_data = msb; 1243 msb->disk->private_data = msb;
1245 msb->disk->queue = msb->queue; 1244 msb->disk->queue = msb->queue;
1246 msb->disk->driverfs_dev = &card->dev;
1247 1245
1248 sprintf(msb->disk->disk_name, "mspblk%d", disk_id); 1246 sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
1249 1247
@@ -1255,7 +1253,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
1255 set_capacity(msb->disk, capacity); 1253 set_capacity(msb->disk, capacity);
1256 dev_dbg(&card->dev, "capacity set %ld\n", capacity); 1254 dev_dbg(&card->dev, "capacity set %ld\n", capacity);
1257 1255
1258 add_disk(msb->disk); 1256 device_add_disk(&card->dev, msb->disk);
1259 msb->active = 1; 1257 msb->active = 1;
1260 return 0; 1258 return 0;
1261 1259
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 11ee4145983b..10b553765ee7 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -93,6 +93,7 @@ static DEFINE_SPINLOCK(mmc_blk_lock);
93 */ 93 */
94struct mmc_blk_data { 94struct mmc_blk_data {
95 spinlock_t lock; 95 spinlock_t lock;
96 struct device *parent;
96 struct gendisk *disk; 97 struct gendisk *disk;
97 struct mmc_queue queue; 98 struct mmc_queue queue;
98 struct list_head part; 99 struct list_head part;
@@ -2169,10 +2170,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
2169 /* complete ongoing async transfer before issuing discard */ 2170 /* complete ongoing async transfer before issuing discard */
2170 if (card->host->areq) 2171 if (card->host->areq)
2171 mmc_blk_issue_rw_rq(mq, NULL); 2172 mmc_blk_issue_rw_rq(mq, NULL);
2172 if (req->cmd_flags & REQ_SECURE) 2173 ret = mmc_blk_issue_discard_rq(mq, req);
2173 ret = mmc_blk_issue_secdiscard_rq(mq, req); 2174 } else if (req && req_op(req) == REQ_OP_SECURE_ERASE) {
2174 else 2175 /* complete ongoing async transfer before issuing secure erase */
2175 ret = mmc_blk_issue_discard_rq(mq, req); 2176 if (card->host->areq)
2177 mmc_blk_issue_rw_rq(mq, NULL);
2178 ret = mmc_blk_issue_secdiscard_rq(mq, req);
2176 } else if (req && req_op(req) == REQ_OP_FLUSH) { 2179 } else if (req && req_op(req) == REQ_OP_FLUSH) {
2177 /* complete ongoing async transfer before issuing flush */ 2180 /* complete ongoing async transfer before issuing flush */
2178 if (card->host->areq) 2181 if (card->host->areq)
@@ -2270,7 +2273,7 @@ again:
2270 md->disk->fops = &mmc_bdops; 2273 md->disk->fops = &mmc_bdops;
2271 md->disk->private_data = md; 2274 md->disk->private_data = md;
2272 md->disk->queue = md->queue.queue; 2275 md->disk->queue = md->queue.queue;
2273 md->disk->driverfs_dev = parent; 2276 md->parent = parent;
2274 set_disk_ro(md->disk, md->read_only || default_ro); 2277 set_disk_ro(md->disk, md->read_only || default_ro);
2275 md->disk->flags = GENHD_FL_EXT_DEVT; 2278 md->disk->flags = GENHD_FL_EXT_DEVT;
2276 if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) 2279 if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT))
@@ -2458,7 +2461,7 @@ static int mmc_add_disk(struct mmc_blk_data *md)
2458 int ret; 2461 int ret;
2459 struct mmc_card *card = md->queue.card; 2462 struct mmc_card *card = md->queue.card;
2460 2463
2461 add_disk(md->disk); 2464 device_add_disk(md->parent, md->disk);
2462 md->force_ro.show = force_ro_show; 2465 md->force_ro.show = force_ro_show;
2463 md->force_ro.store = force_ro_store; 2466 md->force_ro.store = force_ro_store;
2464 sysfs_attr_init(&md->force_ro.attr); 2467 sysfs_attr_init(&md->force_ro.attr);
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index c2d5f6f35145..bf14642a576a 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -171,7 +171,7 @@ static void mmc_queue_setup_discard(struct request_queue *q,
171 if (card->pref_erase > max_discard) 171 if (card->pref_erase > max_discard)
172 q->limits.discard_granularity = 0; 172 q->limits.discard_granularity = 0;
173 if (mmc_can_secure_erase_trim(card)) 173 if (mmc_can_secure_erase_trim(card))
174 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); 174 queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
175} 175}
176 176
177/** 177/**
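The two mmc hunks above show the request-based side of the same secure-erase conversion: the queue flag becomes QUEUE_FLAG_SECERASE and the driver tests req_op(req) == REQ_OP_SECURE_ERASE instead of req->cmd_flags & REQ_SECURE. Roughly (sketch with illustrative names):

	#include <linux/blkdev.h>

	static void example_setup_queue(struct request_queue *q)
	{
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
		queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
	}

	static void example_issue(struct request *req)
	{
		if (req_op(req) == REQ_OP_SECURE_ERASE) {
			/* secure-erase path */
		} else if (req_op(req) == REQ_OP_DISCARD) {
			/* plain discard path */
		}
		/* reads and writes handled elsewhere */
	}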
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 78b3eb45faf6..8d58acf33021 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -431,12 +431,10 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
431 goto error4; 431 goto error4;
432 INIT_WORK(&new->work, mtd_blktrans_work); 432 INIT_WORK(&new->work, mtd_blktrans_work);
433 433
434 gd->driverfs_dev = &new->mtd->dev;
435
436 if (new->readonly) 434 if (new->readonly)
437 set_disk_ro(gd, 1); 435 set_disk_ro(gd, 1);
438 436
439 add_disk(gd); 437 device_add_disk(&new->mtd->dev, gd);
440 438
441 if (new->disk_attributes) { 439 if (new->disk_attributes) {
442 ret = sysfs_create_group(&disk_to_dev(gd)->kobj, 440 ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 495e06d9f7e7..7e262ef06ede 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -287,14 +287,13 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
287 return -ENOMEM; 287 return -ENOMEM;
288 } 288 }
289 289
290 disk->driverfs_dev = dev;
291 disk->first_minor = 0; 290 disk->first_minor = 0;
292 disk->fops = &nd_blk_fops; 291 disk->fops = &nd_blk_fops;
293 disk->queue = q; 292 disk->queue = q;
294 disk->flags = GENHD_FL_EXT_DEVT; 293 disk->flags = GENHD_FL_EXT_DEVT;
295 nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); 294 nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
296 set_capacity(disk, 0); 295 set_capacity(disk, 0);
297 add_disk(disk); 296 device_add_disk(dev, disk);
298 297
299 if (nsblk_meta_size(nsblk)) { 298 if (nsblk_meta_size(nsblk)) {
300 int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk)); 299 int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 68a7c3c1eed9..9dce03f420eb 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1243,7 +1243,6 @@ static int btt_blk_init(struct btt *btt)
1243 } 1243 }
1244 1244
1245 nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); 1245 nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
1246 btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
1247 btt->btt_disk->first_minor = 0; 1246 btt->btt_disk->first_minor = 0;
1248 btt->btt_disk->fops = &btt_fops; 1247 btt->btt_disk->fops = &btt_fops;
1249 btt->btt_disk->private_data = btt; 1248 btt->btt_disk->private_data = btt;
@@ -1258,7 +1257,7 @@ static int btt_blk_init(struct btt *btt)
1258 btt->btt_queue->queuedata = btt; 1257 btt->btt_queue->queuedata = btt;
1259 1258
1260 set_capacity(btt->btt_disk, 0); 1259 set_capacity(btt->btt_disk, 0);
1261 add_disk(btt->btt_disk); 1260 device_add_disk(&btt->nd_btt->dev, btt->btt_disk);
1262 if (btt_meta_size(btt)) { 1261 if (btt_meta_size(btt)) {
1263 int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); 1262 int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
1264 1263
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index f085f8bceae8..5e4e5c772ea5 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -312,7 +312,7 @@ EXPORT_SYMBOL(__nd_driver_register);
312 312
313int nvdimm_revalidate_disk(struct gendisk *disk) 313int nvdimm_revalidate_disk(struct gendisk *disk)
314{ 314{
315 struct device *dev = disk->driverfs_dev; 315 struct device *dev = disk_to_dev(disk)->parent;
316 struct nd_region *nd_region = to_nd_region(dev->parent); 316 struct nd_region *nd_region = to_nd_region(dev->parent);
317 const char *pol = nd_region->ro ? "only" : "write"; 317 const char *pol = nd_region->ro ? "only" : "write";
318 318
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 53b701b2f73e..36cb39047d5b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -298,14 +298,13 @@ static int pmem_attach_disk(struct device *dev,
298 disk->queue = q; 298 disk->queue = q;
299 disk->flags = GENHD_FL_EXT_DEVT; 299 disk->flags = GENHD_FL_EXT_DEVT;
300 nvdimm_namespace_disk_name(ndns, disk->disk_name); 300 nvdimm_namespace_disk_name(ndns, disk->disk_name);
301 disk->driverfs_dev = dev;
302 set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) 301 set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
303 / 512); 302 / 512);
304 if (devm_init_badblocks(dev, &pmem->bb)) 303 if (devm_init_badblocks(dev, &pmem->bb))
305 return -ENOMEM; 304 return -ENOMEM;
306 nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res); 305 nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
307 disk->bb = &pmem->bb; 306 disk->bb = &pmem->bb;
308 add_disk(disk); 307 device_add_disk(dev, disk);
309 revalidate_disk(disk); 308 revalidate_disk(disk);
310 309
311 return 0; 310 return 0;
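The memstick, mmc, mtd and libnvdimm hunks above all follow the same pattern: instead of setting gendisk->driverfs_dev and then calling add_disk(), the driver hands the parent device to device_add_disk(), and code that used to read driverfs_dev looks at disk_to_dev(disk)->parent instead (as the nvdimm bus.c hunk shows). A before/after sketch with hypothetical names:

	#include <linux/genhd.h>

	static void example_register(struct device *parent, struct gendisk *disk)
	{
		/* old style:
		 *	disk->driverfs_dev = parent;
		 *	add_disk(disk);
		 */
		device_add_disk(parent, disk);
	}

	static struct device *example_parent(struct gendisk *disk)
	{
		/* what nvdimm_revalidate_disk() does after this change */
		return disk_to_dev(disk)->parent;
	}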
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index a39d9431eaec..b7c78a5b1f7a 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1 +1,2 @@
1source "drivers/nvme/host/Kconfig" 1source "drivers/nvme/host/Kconfig"
2source "drivers/nvme/target/Kconfig"
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index 9421e829d2a9..0096a7fd1431 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,2 +1,3 @@
1 1
2obj-y += host/ 2obj-y += host/
3obj-y += target/
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index d296fc3ae06e..db39d53cdfb9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -24,3 +24,22 @@ config BLK_DEV_NVME_SCSI
24 to say N here, unless you run a distro that abuses the SCSI 24 to say N here, unless you run a distro that abuses the SCSI
25 emulation to provide stable device names for mount by id, like 25 emulation to provide stable device names for mount by id, like
26 some OpenSuSE and SLES versions. 26 some OpenSuSE and SLES versions.
27
28config NVME_FABRICS
29 tristate
30
31config NVME_RDMA
32 tristate "NVM Express over Fabrics RDMA host driver"
33 depends on INFINIBAND
34 depends on BLK_DEV_NVME
35 select NVME_FABRICS
36 select SG_POOL
37 help
38 This provides support for the NVMe over Fabrics protocol using
39 the RDMA (Infiniband, RoCE, iWarp) transport. This allows you
40 to use remote block devices exported using the NVMe protocol set.
41
42 To configure a NVMe over Fabrics controller use the nvme-cli tool
43 from https://github.com/linux-nvme/nvme-cli.
44
45 If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 9a3ca892b4a7..47abcec23514 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,8 +1,14 @@
1obj-$(CONFIG_NVME_CORE) += nvme-core.o 1obj-$(CONFIG_NVME_CORE) += nvme-core.o
2obj-$(CONFIG_BLK_DEV_NVME) += nvme.o 2obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
3obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
4obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
3 5
4nvme-core-y := core.o 6nvme-core-y := core.o
5nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o 7nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o
6nvme-core-$(CONFIG_NVM) += lightnvm.o 8nvme-core-$(CONFIG_NVM) += lightnvm.o
7 9
8nvme-y += pci.o 10nvme-y += pci.o
11
12nvme-fabrics-y += fabrics.o
13
14nvme-rdma-y += rdma.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1c5a032d490d..7ff2e820bbf4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -30,6 +30,7 @@
30#include <asm/unaligned.h> 30#include <asm/unaligned.h>
31 31
32#include "nvme.h" 32#include "nvme.h"
33#include "fabrics.h"
33 34
34#define NVME_MINORS (1U << MINORBITS) 35#define NVME_MINORS (1U << MINORBITS)
35 36
@@ -47,8 +48,10 @@ unsigned char shutdown_timeout = 5;
47module_param(shutdown_timeout, byte, 0644); 48module_param(shutdown_timeout, byte, 0644);
48MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 49MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
49 50
50static int nvme_major; 51unsigned int nvme_max_retries = 5;
51module_param(nvme_major, int, 0); 52module_param_named(max_retries, nvme_max_retries, uint, 0644);
53MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
54EXPORT_SYMBOL_GPL(nvme_max_retries);
52 55
53static int nvme_char_major; 56static int nvme_char_major;
54module_param(nvme_char_major, int, 0); 57module_param(nvme_char_major, int, 0);
@@ -58,6 +61,23 @@ static DEFINE_SPINLOCK(dev_list_lock);
58 61
59static struct class *nvme_class; 62static struct class *nvme_class;
60 63
64void nvme_cancel_request(struct request *req, void *data, bool reserved)
65{
66 int status;
67
68 if (!blk_mq_request_started(req))
69 return;
70
71 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
72 "Cancelling I/O %d", req->tag);
73
74 status = NVME_SC_ABORT_REQ;
75 if (blk_queue_dying(req->q))
76 status |= NVME_SC_DNR;
77 blk_mq_complete_request(req, status);
78}
79EXPORT_SYMBOL_GPL(nvme_cancel_request);
80
61bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 81bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
62 enum nvme_ctrl_state new_state) 82 enum nvme_ctrl_state new_state)
63{ 83{
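nvme_cancel_request() above is exported so every transport can share one cancel callback when tearing a controller down; it has the busy_tag_iter_fn signature, so the expected caller walks all started requests on a tag set. A sketch of that usage (illustrative wrapper, mirroring what the PCIe driver does):

	#include <linux/blk-mq.h>

	static void example_cancel_all(struct nvme_ctrl *ctrl,
				       struct blk_mq_tag_set *tagset)
	{
		/* ctrl becomes the 'data' argument nvme_cancel_request expects */
		blk_mq_tagset_busy_iter(tagset, nvme_cancel_request, ctrl);
	}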
@@ -68,7 +88,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
68 switch (new_state) { 88 switch (new_state) {
69 case NVME_CTRL_LIVE: 89 case NVME_CTRL_LIVE:
70 switch (old_state) { 90 switch (old_state) {
91 case NVME_CTRL_NEW:
71 case NVME_CTRL_RESETTING: 92 case NVME_CTRL_RESETTING:
93 case NVME_CTRL_RECONNECTING:
72 changed = true; 94 changed = true;
73 /* FALLTHRU */ 95 /* FALLTHRU */
74 default: 96 default:
@@ -79,6 +101,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
79 switch (old_state) { 101 switch (old_state) {
80 case NVME_CTRL_NEW: 102 case NVME_CTRL_NEW:
81 case NVME_CTRL_LIVE: 103 case NVME_CTRL_LIVE:
104 case NVME_CTRL_RECONNECTING:
105 changed = true;
106 /* FALLTHRU */
107 default:
108 break;
109 }
110 break;
111 case NVME_CTRL_RECONNECTING:
112 switch (old_state) {
113 case NVME_CTRL_LIVE:
82 changed = true; 114 changed = true;
83 /* FALLTHRU */ 115 /* FALLTHRU */
84 default: 116 default:
@@ -89,6 +121,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
89 switch (old_state) { 121 switch (old_state) {
90 case NVME_CTRL_LIVE: 122 case NVME_CTRL_LIVE:
91 case NVME_CTRL_RESETTING: 123 case NVME_CTRL_RESETTING:
124 case NVME_CTRL_RECONNECTING:
92 changed = true; 125 changed = true;
93 /* FALLTHRU */ 126 /* FALLTHRU */
94 default: 127 default:
@@ -174,21 +207,21 @@ void nvme_requeue_req(struct request *req)
174EXPORT_SYMBOL_GPL(nvme_requeue_req); 207EXPORT_SYMBOL_GPL(nvme_requeue_req);
175 208
176struct request *nvme_alloc_request(struct request_queue *q, 209struct request *nvme_alloc_request(struct request_queue *q,
177 struct nvme_command *cmd, unsigned int flags) 210 struct nvme_command *cmd, unsigned int flags, int qid)
178{ 211{
179 bool write = cmd->common.opcode & 1;
180 struct request *req; 212 struct request *req;
181 213
182 req = blk_mq_alloc_request(q, write, flags); 214 if (qid == NVME_QID_ANY) {
215 req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
216 } else {
217 req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
218 qid ? qid - 1 : 0);
219 }
183 if (IS_ERR(req)) 220 if (IS_ERR(req))
184 return req; 221 return req;
185 222
186 req->cmd_type = REQ_TYPE_DRV_PRIV; 223 req->cmd_type = REQ_TYPE_DRV_PRIV;
187 req->cmd_flags |= REQ_FAILFAST_DRIVER; 224 req->cmd_flags |= REQ_FAILFAST_DRIVER;
188 req->__data_len = 0;
189 req->__sector = (sector_t) -1;
190 req->bio = req->biotail = NULL;
191
192 req->cmd = (unsigned char *)cmd; 225 req->cmd = (unsigned char *)cmd;
193 req->cmd_len = sizeof(struct nvme_command); 226 req->cmd_len = sizeof(struct nvme_command);
194 227
@@ -307,12 +340,12 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
307 */ 340 */
308int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 341int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
309 struct nvme_completion *cqe, void *buffer, unsigned bufflen, 342 struct nvme_completion *cqe, void *buffer, unsigned bufflen,
310 unsigned timeout) 343 unsigned timeout, int qid, int at_head, int flags)
311{ 344{
312 struct request *req; 345 struct request *req;
313 int ret; 346 int ret;
314 347
315 req = nvme_alloc_request(q, cmd, 0); 348 req = nvme_alloc_request(q, cmd, flags, qid);
316 if (IS_ERR(req)) 349 if (IS_ERR(req))
317 return PTR_ERR(req); 350 return PTR_ERR(req);
318 351
@@ -325,17 +358,19 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
325 goto out; 358 goto out;
326 } 359 }
327 360
328 blk_execute_rq(req->q, NULL, req, 0); 361 blk_execute_rq(req->q, NULL, req, at_head);
329 ret = req->errors; 362 ret = req->errors;
330 out: 363 out:
331 blk_mq_free_request(req); 364 blk_mq_free_request(req);
332 return ret; 365 return ret;
333} 366}
367EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
334 368
335int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 369int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
336 void *buffer, unsigned bufflen) 370 void *buffer, unsigned bufflen)
337{ 371{
338 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); 372 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
373 NVME_QID_ANY, 0, 0);
339} 374}
340EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 375EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
341 376
@@ -344,7 +379,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
344 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 379 void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
345 u32 *result, unsigned timeout) 380 u32 *result, unsigned timeout)
346{ 381{
347 bool write = cmd->common.opcode & 1; 382 bool write = nvme_is_write(cmd);
348 struct nvme_completion cqe; 383 struct nvme_completion cqe;
349 struct nvme_ns *ns = q->queuedata; 384 struct nvme_ns *ns = q->queuedata;
350 struct gendisk *disk = ns ? ns->disk : NULL; 385 struct gendisk *disk = ns ? ns->disk : NULL;
@@ -353,7 +388,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
353 void *meta = NULL; 388 void *meta = NULL;
354 int ret; 389 int ret;
355 390
356 req = nvme_alloc_request(q, cmd, 0); 391 req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
357 if (IS_ERR(req)) 392 if (IS_ERR(req))
358 return PTR_ERR(req); 393 return PTR_ERR(req);
359 394
@@ -439,6 +474,74 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
439 result, timeout); 474 result, timeout);
440} 475}
441 476
477static void nvme_keep_alive_end_io(struct request *rq, int error)
478{
479 struct nvme_ctrl *ctrl = rq->end_io_data;
480
481 blk_mq_free_request(rq);
482
483 if (error) {
484 dev_err(ctrl->device,
485 "failed nvme_keep_alive_end_io error=%d\n", error);
486 return;
487 }
488
489 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
490}
491
492static int nvme_keep_alive(struct nvme_ctrl *ctrl)
493{
494 struct nvme_command c;
495 struct request *rq;
496
497 memset(&c, 0, sizeof(c));
498 c.common.opcode = nvme_admin_keep_alive;
499
500 rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
501 NVME_QID_ANY);
502 if (IS_ERR(rq))
503 return PTR_ERR(rq);
504
505 rq->timeout = ctrl->kato * HZ;
506 rq->end_io_data = ctrl;
507
508 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
509
510 return 0;
511}
512
513static void nvme_keep_alive_work(struct work_struct *work)
514{
515 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
516 struct nvme_ctrl, ka_work);
517
518 if (nvme_keep_alive(ctrl)) {
519 /* allocation failure, reset the controller */
520 dev_err(ctrl->device, "keep-alive failed\n");
521 ctrl->ops->reset_ctrl(ctrl);
522 return;
523 }
524}
525
526void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
527{
528 if (unlikely(ctrl->kato == 0))
529 return;
530
531 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
532 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
533}
534EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
535
536void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
537{
538 if (unlikely(ctrl->kato == 0))
539 return;
540
541 cancel_delayed_work_sync(&ctrl->ka_work);
542}
543EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
544
442int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 545int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
443{ 546{
444 struct nvme_command c = { }; 547 struct nvme_command c = { };
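The keep-alive machinery above is driven entirely by the core: a transport only reports the negotiated timeout in ctrl->kato and brackets the connection with the two exported helpers, both of which are no-ops when kato is zero. Roughly (sketch, error handling elided):

	static int example_start_ctrl(struct nvme_ctrl *ctrl)
	{
		/* ... connect admin and I/O queues ... */
		nvme_start_keep_alive(ctrl);	/* no-op if ctrl->kato == 0 */
		return 0;
	}

	static void example_teardown_ctrl(struct nvme_ctrl *ctrl)
	{
		nvme_stop_keep_alive(ctrl);	/* cancel_delayed_work_sync() inside */
		/* ... tear down queues ... */
	}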
@@ -500,10 +603,11 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
500 memset(&c, 0, sizeof(c)); 603 memset(&c, 0, sizeof(c));
501 c.features.opcode = nvme_admin_get_features; 604 c.features.opcode = nvme_admin_get_features;
502 c.features.nsid = cpu_to_le32(nsid); 605 c.features.nsid = cpu_to_le32(nsid);
503 c.features.prp1 = cpu_to_le64(dma_addr); 606 c.features.dptr.prp1 = cpu_to_le64(dma_addr);
504 c.features.fid = cpu_to_le32(fid); 607 c.features.fid = cpu_to_le32(fid);
505 608
506 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); 609 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
610 NVME_QID_ANY, 0, 0);
507 if (ret >= 0) 611 if (ret >= 0)
508 *result = le32_to_cpu(cqe.result); 612 *result = le32_to_cpu(cqe.result);
509 return ret; 613 return ret;
@@ -518,11 +622,12 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
518 622
519 memset(&c, 0, sizeof(c)); 623 memset(&c, 0, sizeof(c));
520 c.features.opcode = nvme_admin_set_features; 624 c.features.opcode = nvme_admin_set_features;
521 c.features.prp1 = cpu_to_le64(dma_addr); 625 c.features.dptr.prp1 = cpu_to_le64(dma_addr);
522 c.features.fid = cpu_to_le32(fid); 626 c.features.fid = cpu_to_le32(fid);
523 c.features.dword11 = cpu_to_le32(dword11); 627 c.features.dword11 = cpu_to_le32(dword11);
524 628
525 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); 629 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
630 NVME_QID_ANY, 0, 0);
526 if (ret >= 0) 631 if (ret >= 0)
527 *result = le32_to_cpu(cqe.result); 632 *result = le32_to_cpu(cqe.result);
528 return ret; 633 return ret;
@@ -558,11 +663,22 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
558 663
559 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, 664 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
560 &result); 665 &result);
561 if (status) 666 if (status < 0)
562 return status; 667 return status;
563 668
564 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 669 /*
565 *count = min(*count, nr_io_queues); 670 * Degraded controllers might return an error when setting the queue
671 * count. We still want to be able to bring them online and offer
672 * access to the admin queue, as that might be only way to fix them up.
673 */
674 if (status > 0) {
675 dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
676 *count = 0;
677 } else {
678 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
679 *count = min(*count, nr_io_queues);
680 }
681
566 return 0; 682 return 0;
567} 683}
568EXPORT_SYMBOL_GPL(nvme_set_queue_count); 684EXPORT_SYMBOL_GPL(nvme_set_queue_count);
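With the change above, nvme_set_queue_count() no longer fails hard when a degraded controller rejects the Number of Queues feature: it logs the NVMe status, reports zero I/O queues and returns 0, so the caller can still bring up the admin queue. A caller is then expected to cope with *count == 0, roughly (sketch, mirroring the PCIe driver's handling):

	static int example_setup_io_queues(struct nvme_ctrl *ctrl)
	{
		int nr_io_queues = num_possible_cpus();
		int ret;

		ret = nvme_set_queue_count(ctrl, &nr_io_queues);
		if (ret < 0)
			return ret;		/* real transport error */
		if (nr_io_queues == 0)
			return 0;		/* admin queue only, still usable */

		/* ... allocate and connect nr_io_queues I/O queues ... */
		return 0;
	}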
@@ -726,6 +842,7 @@ static void nvme_init_integrity(struct nvme_ns *ns)
726{ 842{
727 struct blk_integrity integrity; 843 struct blk_integrity integrity;
728 844
845 memset(&integrity, 0, sizeof(integrity));
729 switch (ns->pi_type) { 846 switch (ns->pi_type) {
730 case NVME_NS_DPS_PI_TYPE3: 847 case NVME_NS_DPS_PI_TYPE3:
731 integrity.profile = &t10_pi_type3_crc; 848 integrity.profile = &t10_pi_type3_crc;
@@ -764,7 +881,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
764 881
765 ns->queue->limits.discard_alignment = logical_block_size; 882 ns->queue->limits.discard_alignment = logical_block_size;
766 ns->queue->limits.discard_granularity = logical_block_size; 883 ns->queue->limits.discard_granularity = logical_block_size;
767 blk_queue_max_discard_sectors(ns->queue, 0xffffffff); 884 blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
768 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 885 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
769} 886}
770 887
@@ -991,6 +1108,15 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
991 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1108 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
992 if (ret) 1109 if (ret)
993 return ret; 1110 return ret;
1111
1112 /* Checking for ctrl->tagset is a trick to avoid sleeping on module
1113 * load, since we only need the quirk on reset_controller. Notice
1114 * that the HGST device needs this delay only in the firmware activation
1115 * procedure; unfortunately we have no (easy) way to verify this.
1116 */
1117 if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset)
1118 msleep(NVME_QUIRK_DELAY_AMOUNT);
1119
994 return nvme_wait_ready(ctrl, cap, false); 1120 return nvme_wait_ready(ctrl, cap, false);
995} 1121}
996EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 1122EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
@@ -1088,6 +1214,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1088 struct nvme_id_ctrl *id; 1214 struct nvme_id_ctrl *id;
1089 u64 cap; 1215 u64 cap;
1090 int ret, page_shift; 1216 int ret, page_shift;
1217 u32 max_hw_sectors;
1091 1218
1092 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 1219 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
1093 if (ret) { 1220 if (ret) {
@@ -1120,9 +1247,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1120 memcpy(ctrl->model, id->mn, sizeof(id->mn)); 1247 memcpy(ctrl->model, id->mn, sizeof(id->mn));
1121 memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); 1248 memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
1122 if (id->mdts) 1249 if (id->mdts)
1123 ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); 1250 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
1124 else 1251 else
1125 ctrl->max_hw_sectors = UINT_MAX; 1252 max_hw_sectors = UINT_MAX;
1253 ctrl->max_hw_sectors =
1254 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
1126 1255
1127 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { 1256 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
1128 unsigned int max_hw_sectors; 1257 unsigned int max_hw_sectors;
@@ -1138,9 +1267,33 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1138 } 1267 }
1139 1268
1140 nvme_set_queue_limits(ctrl, ctrl->admin_q); 1269 nvme_set_queue_limits(ctrl, ctrl->admin_q);
1270 ctrl->sgls = le32_to_cpu(id->sgls);
1271 ctrl->kas = le16_to_cpu(id->kas);
1272
1273 if (ctrl->ops->is_fabrics) {
1274 ctrl->icdoff = le16_to_cpu(id->icdoff);
1275 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
1276 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
1277 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
1278
1279 /*
1280 * In fabrics we need to verify the cntlid matches the
1281 * admin connect
1282 */
1283 if (ctrl->cntlid != le16_to_cpu(id->cntlid))
1284 ret = -EINVAL;
1285
1286 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
1287 dev_err(ctrl->dev,
1288 "keep-alive support is mandatory for fabrics\n");
1289 ret = -EINVAL;
1290 }
1291 } else {
1292 ctrl->cntlid = le16_to_cpu(id->cntlid);
1293 }
1141 1294
1142 kfree(id); 1295 kfree(id);
1143 return 0; 1296 return ret;
1144} 1297}
1145EXPORT_SYMBOL_GPL(nvme_init_identify); 1298EXPORT_SYMBOL_GPL(nvme_init_identify);
1146 1299
@@ -1322,7 +1475,7 @@ static struct attribute *nvme_ns_attrs[] = {
1322 NULL, 1475 NULL,
1323}; 1476};
1324 1477
1325static umode_t nvme_attrs_are_visible(struct kobject *kobj, 1478static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
1326 struct attribute *a, int n) 1479 struct attribute *a, int n)
1327{ 1480{
1328 struct device *dev = container_of(kobj, struct device, kobj); 1481 struct device *dev = container_of(kobj, struct device, kobj);
@@ -1341,7 +1494,7 @@ static umode_t nvme_attrs_are_visible(struct kobject *kobj,
1341 1494
1342static const struct attribute_group nvme_ns_attr_group = { 1495static const struct attribute_group nvme_ns_attr_group = {
1343 .attrs = nvme_ns_attrs, 1496 .attrs = nvme_ns_attrs,
1344 .is_visible = nvme_attrs_are_visible, 1497 .is_visible = nvme_ns_attrs_are_visible,
1345}; 1498};
1346 1499
1347#define nvme_show_str_function(field) \ 1500#define nvme_show_str_function(field) \
@@ -1367,6 +1520,49 @@ nvme_show_str_function(serial);
1367nvme_show_str_function(firmware_rev); 1520nvme_show_str_function(firmware_rev);
1368nvme_show_int_function(cntlid); 1521nvme_show_int_function(cntlid);
1369 1522
1523static ssize_t nvme_sysfs_delete(struct device *dev,
1524 struct device_attribute *attr, const char *buf,
1525 size_t count)
1526{
1527 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1528
1529 if (device_remove_file_self(dev, attr))
1530 ctrl->ops->delete_ctrl(ctrl);
1531 return count;
1532}
1533static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
1534
1535static ssize_t nvme_sysfs_show_transport(struct device *dev,
1536 struct device_attribute *attr,
1537 char *buf)
1538{
1539 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1540
1541 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
1542}
1543static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
1544
1545static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
1546 struct device_attribute *attr,
1547 char *buf)
1548{
1549 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1550
1551 return snprintf(buf, PAGE_SIZE, "%s\n",
1552 ctrl->ops->get_subsysnqn(ctrl));
1553}
1554static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
1555
1556static ssize_t nvme_sysfs_show_address(struct device *dev,
1557 struct device_attribute *attr,
1558 char *buf)
1559{
1560 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1561
1562 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
1563}
1564static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
1565
1370static struct attribute *nvme_dev_attrs[] = { 1566static struct attribute *nvme_dev_attrs[] = {
1371 &dev_attr_reset_controller.attr, 1567 &dev_attr_reset_controller.attr,
1372 &dev_attr_rescan_controller.attr, 1568 &dev_attr_rescan_controller.attr,
@@ -1374,11 +1570,38 @@ static struct attribute *nvme_dev_attrs[] = {
1374 &dev_attr_serial.attr, 1570 &dev_attr_serial.attr,
1375 &dev_attr_firmware_rev.attr, 1571 &dev_attr_firmware_rev.attr,
1376 &dev_attr_cntlid.attr, 1572 &dev_attr_cntlid.attr,
1573 &dev_attr_delete_controller.attr,
1574 &dev_attr_transport.attr,
1575 &dev_attr_subsysnqn.attr,
1576 &dev_attr_address.attr,
1377 NULL 1577 NULL
1378}; 1578};
1379 1579
1580#define CHECK_ATTR(ctrl, a, name) \
1581 if ((a) == &dev_attr_##name.attr && \
1582 !(ctrl)->ops->get_##name) \
1583 return 0
1584
1585static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
1586 struct attribute *a, int n)
1587{
1588 struct device *dev = container_of(kobj, struct device, kobj);
1589 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1590
1591 if (a == &dev_attr_delete_controller.attr) {
1592 if (!ctrl->ops->delete_ctrl)
1593 return 0;
1594 }
1595
1596 CHECK_ATTR(ctrl, a, subsysnqn);
1597 CHECK_ATTR(ctrl, a, address);
1598
1599 return a->mode;
1600}
1601
1380static struct attribute_group nvme_dev_attrs_group = { 1602static struct attribute_group nvme_dev_attrs_group = {
1381 .attrs = nvme_dev_attrs, 1603 .attrs = nvme_dev_attrs,
1604 .is_visible = nvme_dev_attrs_are_visible,
1382}; 1605};
1383 1606
1384static const struct attribute_group *nvme_dev_attr_groups[] = { 1607static const struct attribute_group *nvme_dev_attr_groups[] = {
@@ -1446,12 +1669,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1446 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1669 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1447 nvme_set_queue_limits(ctrl, ns->queue); 1670 nvme_set_queue_limits(ctrl, ns->queue);
1448 1671
1449 disk->major = nvme_major;
1450 disk->first_minor = 0;
1451 disk->fops = &nvme_fops; 1672 disk->fops = &nvme_fops;
1452 disk->private_data = ns; 1673 disk->private_data = ns;
1453 disk->queue = ns->queue; 1674 disk->queue = ns->queue;
1454 disk->driverfs_dev = ctrl->device;
1455 disk->flags = GENHD_FL_EXT_DEVT; 1675 disk->flags = GENHD_FL_EXT_DEVT;
1456 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); 1676 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
1457 1677
@@ -1466,7 +1686,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1466 if (ns->type == NVME_NS_LIGHTNVM) 1686 if (ns->type == NVME_NS_LIGHTNVM)
1467 return; 1687 return;
1468 1688
1469 add_disk(ns->disk); 1689 device_add_disk(ctrl->device, ns->disk);
1470 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, 1690 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
1471 &nvme_ns_attr_group)) 1691 &nvme_ns_attr_group))
1472 pr_warn("%s: failed to create sysfs group for identification\n", 1692 pr_warn("%s: failed to create sysfs group for identification\n",
@@ -1517,6 +1737,17 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1517 nvme_alloc_ns(ctrl, nsid); 1737 nvme_alloc_ns(ctrl, nsid);
1518} 1738}
1519 1739
1740static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
1741 unsigned nsid)
1742{
1743 struct nvme_ns *ns, *next;
1744
1745 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1746 if (ns->ns_id > nsid)
1747 nvme_ns_remove(ns);
1748 }
1749}
1750
1520static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) 1751static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1521{ 1752{
1522 struct nvme_ns *ns; 1753 struct nvme_ns *ns;
@@ -1531,7 +1762,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1531 for (i = 0; i < num_lists; i++) { 1762 for (i = 0; i < num_lists; i++) {
1532 ret = nvme_identify_ns_list(ctrl, prev, ns_list); 1763 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
1533 if (ret) 1764 if (ret)
1534 goto out; 1765 goto free;
1535 1766
1536 for (j = 0; j < min(nn, 1024U); j++) { 1767 for (j = 0; j < min(nn, 1024U); j++) {
1537 nsid = le32_to_cpu(ns_list[j]); 1768 nsid = le32_to_cpu(ns_list[j]);
@@ -1551,22 +1782,20 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1551 nn -= j; 1782 nn -= j;
1552 } 1783 }
1553 out: 1784 out:
1785 nvme_remove_invalid_namespaces(ctrl, prev);
1786 free:
1554 kfree(ns_list); 1787 kfree(ns_list);
1555 return ret; 1788 return ret;
1556} 1789}
1557 1790
1558static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) 1791static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
1559{ 1792{
1560 struct nvme_ns *ns, *next;
1561 unsigned i; 1793 unsigned i;
1562 1794
1563 for (i = 1; i <= nn; i++) 1795 for (i = 1; i <= nn; i++)
1564 nvme_validate_ns(ctrl, i); 1796 nvme_validate_ns(ctrl, i);
1565 1797
1566 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 1798 nvme_remove_invalid_namespaces(ctrl, nn);
1567 if (ns->ns_id > nn)
1568 nvme_ns_remove(ns);
1569 }
1570} 1799}
1571 1800
1572static void nvme_scan_work(struct work_struct *work) 1801static void nvme_scan_work(struct work_struct *work)
@@ -1852,16 +2081,10 @@ int __init nvme_core_init(void)
1852{ 2081{
1853 int result; 2082 int result;
1854 2083
1855 result = register_blkdev(nvme_major, "nvme");
1856 if (result < 0)
1857 return result;
1858 else if (result > 0)
1859 nvme_major = result;
1860
1861 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", 2084 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
1862 &nvme_dev_fops); 2085 &nvme_dev_fops);
1863 if (result < 0) 2086 if (result < 0)
1864 goto unregister_blkdev; 2087 return result;
1865 else if (result > 0) 2088 else if (result > 0)
1866 nvme_char_major = result; 2089 nvme_char_major = result;
1867 2090
@@ -1875,8 +2098,6 @@ int __init nvme_core_init(void)
1875 2098
1876 unregister_chrdev: 2099 unregister_chrdev:
1877 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 2100 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1878 unregister_blkdev:
1879 unregister_blkdev(nvme_major, "nvme");
1880 return result; 2101 return result;
1881} 2102}
1882 2103
@@ -1884,7 +2105,6 @@ void nvme_core_exit(void)
1884{ 2105{
1885 class_destroy(nvme_class); 2106 class_destroy(nvme_class);
1886 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 2107 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1887 unregister_blkdev(nvme_major, "nvme");
1888} 2108}
1889 2109
1890MODULE_LICENSE("GPL"); 2110MODULE_LICENSE("GPL");
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
new file mode 100644
index 000000000000..dc996761042f
--- /dev/null
+++ b/drivers/nvme/host/fabrics.c
@@ -0,0 +1,952 @@
1/*
2 * NVMe over Fabrics common host code.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/init.h>
16#include <linux/miscdevice.h>
17#include <linux/module.h>
18#include <linux/mutex.h>
19#include <linux/parser.h>
20#include <linux/seq_file.h>
21#include "nvme.h"
22#include "fabrics.h"
23
24static LIST_HEAD(nvmf_transports);
25static DEFINE_MUTEX(nvmf_transports_mutex);
26
27static LIST_HEAD(nvmf_hosts);
28static DEFINE_MUTEX(nvmf_hosts_mutex);
29
30static struct nvmf_host *nvmf_default_host;
31
32static struct nvmf_host *__nvmf_host_find(const char *hostnqn)
33{
34 struct nvmf_host *host;
35
36 list_for_each_entry(host, &nvmf_hosts, list) {
37 if (!strcmp(host->nqn, hostnqn))
38 return host;
39 }
40
41 return NULL;
42}
43
44static struct nvmf_host *nvmf_host_add(const char *hostnqn)
45{
46 struct nvmf_host *host;
47
48 mutex_lock(&nvmf_hosts_mutex);
49 host = __nvmf_host_find(hostnqn);
50 if (host)
51 goto out_unlock;
52
53 host = kmalloc(sizeof(*host), GFP_KERNEL);
54 if (!host)
55 goto out_unlock;
56
57 kref_init(&host->ref);
58 memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
59 uuid_le_gen(&host->id);
60
61 list_add_tail(&host->list, &nvmf_hosts);
62out_unlock:
63 mutex_unlock(&nvmf_hosts_mutex);
64 return host;
65}
66
67static struct nvmf_host *nvmf_host_default(void)
68{
69 struct nvmf_host *host;
70
71 host = kmalloc(sizeof(*host), GFP_KERNEL);
72 if (!host)
73 return NULL;
74
75 kref_init(&host->ref);
76 uuid_le_gen(&host->id);
77 snprintf(host->nqn, NVMF_NQN_SIZE,
78 "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id);
79
80 mutex_lock(&nvmf_hosts_mutex);
81 list_add_tail(&host->list, &nvmf_hosts);
82 mutex_unlock(&nvmf_hosts_mutex);
83
84 return host;
85}
86
87static void nvmf_host_destroy(struct kref *ref)
88{
89 struct nvmf_host *host = container_of(ref, struct nvmf_host, ref);
90
91 mutex_lock(&nvmf_hosts_mutex);
92 list_del(&host->list);
93 mutex_unlock(&nvmf_hosts_mutex);
94
95 kfree(host);
96}
97
98static void nvmf_host_put(struct nvmf_host *host)
99{
100 if (host)
101 kref_put(&host->ref, nvmf_host_destroy);
102}
103
104/**
105 * nvmf_get_address() - Get address/port
106 * @ctrl: Host NVMe controller instance from which we got the address
107 * @buf: OUTPUT parameter that will contain the address/port
108 * @size: buffer size
109 */
110int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
111{
112 return snprintf(buf, size, "traddr=%s,trsvcid=%s\n",
113 ctrl->opts->traddr, ctrl->opts->trsvcid);
114}
115EXPORT_SYMBOL_GPL(nvmf_get_address);
116
117/**
118 * nvmf_get_subsysnqn() - Get subsystem NQN
119 * @ctrl: Host NVMe controller instance from which we got the NQN
120 */
121const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl)
122{
123 return ctrl->opts->subsysnqn;
124}
125EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn);
126
127/**
128 * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function.
129 * @ctrl: Host NVMe controller instance maintaining the admin
130 * queue used to submit the property read command to
131 * the allocated NVMe controller resource on the target system.
132 * @off: Starting offset value of the targeted property
133 * register (see the fabrics section of the NVMe standard).
134 * @val: OUTPUT parameter that will contain the value of
135 * the property after a successful read.
136 *
137 * Used by the host system to retrieve a 32-bit capsule property value
138 * from an NVMe controller on the target system.
139 *
140 * ("Capsule property" is an "PCIe register concept" applied to the
141 * NVMe fabrics space.)
142 *
143 * Return:
144 * 0: successful read
145 * > 0: NVMe error status code
146 * < 0: Linux errno error code
147 */
148int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
149{
150 struct nvme_command cmd;
151 struct nvme_completion cqe;
152 int ret;
153
154 memset(&cmd, 0, sizeof(cmd));
155 cmd.prop_get.opcode = nvme_fabrics_command;
156 cmd.prop_get.fctype = nvme_fabrics_type_property_get;
157 cmd.prop_get.offset = cpu_to_le32(off);
158
159 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
160 NVME_QID_ANY, 0, 0);
161
162 if (ret >= 0)
163 *val = le64_to_cpu(cqe.result64);
164 if (unlikely(ret != 0))
165 dev_err(ctrl->device,
166 "Property Get error: %d, offset %#x\n",
167 ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
168
169 return ret;
170}
171EXPORT_SYMBOL_GPL(nvmf_reg_read32);
172
173/**
174 * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function.
175 * @ctrl: Host NVMe controller instance maintaining the admin
176 * queue used to submit the property read command to
177 * the allocated controller resource on the target system.
178 * @off: Starting offset value of the targeted property
179 * register (see the fabrics section of the NVMe standard).
180 * @val: OUTPUT parameter that will contain the value of
181 * the property after a successful read.
182 *
183 * Used by the host system to retrieve a 64-bit capsule property value
184 * from an NVMe controller on the target system.
185 *
186 * ("Capsule property" is an "PCIe register concept" applied to the
187 * NVMe fabrics space.)
188 *
189 * Return:
190 * 0: successful read
191 * > 0: NVMe error status code
192 * < 0: Linux errno error code
193 */
194int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
195{
196 struct nvme_command cmd;
197 struct nvme_completion cqe;
198 int ret;
199
200 memset(&cmd, 0, sizeof(cmd));
201 cmd.prop_get.opcode = nvme_fabrics_command;
202 cmd.prop_get.fctype = nvme_fabrics_type_property_get;
203 cmd.prop_get.attrib = 1;
204 cmd.prop_get.offset = cpu_to_le32(off);
205
206 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
207 NVME_QID_ANY, 0, 0);
208
209 if (ret >= 0)
210 *val = le64_to_cpu(cqe.result64);
211 if (unlikely(ret != 0))
212 dev_err(ctrl->device,
213 "Property Get error: %d, offset %#x\n",
214 ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
215 return ret;
216}
217EXPORT_SYMBOL_GPL(nvmf_reg_read64);
218
219/**
220 * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function.
221 * @ctrl: Host NVMe controller instance maintaining the admin
222 * queue used to submit the property read command to
223 * the allocated NVMe controller resource on the target system.
224 * @off: Starting offset value of the targeted property
225 * register (see the fabrics section of the NVMe standard).
226 * @val: Input parameter that contains the value to be
227 * written to the property.
228 *
229 * Used by the NVMe host system to write a 32-bit capsule property value
230 * to an NVMe controller on the target system.
231 *
232 * ("Capsule property" is an "PCIe register concept" applied to the
233 * NVMe fabrics space.)
234 *
235 * Return:
236 * 0: successful write
237 * > 0: NVMe error status code
238 * < 0: Linux errno error code
239 */
240int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
241{
242 struct nvme_command cmd;
243 int ret;
244
245 memset(&cmd, 0, sizeof(cmd));
246 cmd.prop_set.opcode = nvme_fabrics_command;
247 cmd.prop_set.fctype = nvme_fabrics_type_property_set;
248 cmd.prop_set.attrib = 0;
249 cmd.prop_set.offset = cpu_to_le32(off);
250 cmd.prop_set.value = cpu_to_le64(val);
251
252 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
253 NVME_QID_ANY, 0, 0);
254 if (unlikely(ret))
255 dev_err(ctrl->device,
256 "Property Set error: %d, offset %#x\n",
257 ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
258 return ret;
259}
260EXPORT_SYMBOL_GPL(nvmf_reg_write32);
261
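nvmf_reg_read32(), nvmf_reg_read64() and nvmf_reg_write32() above exist so fabrics transports do not reimplement controller register access: they map the PCIe-style registers (CAP, VS, CC, CSTS, ...) onto Property Get/Set capsules and are meant to be plugged straight into the transport's nvme_ctrl_ops. A sketch of how a fabrics transport is expected to wire these in (other ops omitted; reg_read64 is assumed to sit alongside the two accessors used in the core.c hunks above):

	static const struct nvme_ctrl_ops example_fabrics_ctrl_ops = {
		.name		= "example",
		.is_fabrics	= true,
		.reg_read32	= nvmf_reg_read32,
		.reg_read64	= nvmf_reg_read64,
		.reg_write32	= nvmf_reg_write32,
		.get_subsysnqn	= nvmf_get_subsysnqn,
		.get_address	= nvmf_get_address,
	};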
262/**
263 * nvmf_log_connect_error() - Error-parsing-diagnostic print
264 * out function for connect() errors.
265 *
266 * @ctrl: the specific /dev/nvmeX device that had the error.
267 *
268 * @errval: Error code to be decoded in a more human-friendly
269 * printout.
270 *
271 * @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM.
272 *
273 * @cmd: This is the SQE portion of a submission capsule.
274 *
275 * @data: This is the "Data" portion of a submission capsule.
276 */
277static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
278 int errval, int offset, struct nvme_command *cmd,
279 struct nvmf_connect_data *data)
280{
281 int err_sctype = errval & (~NVME_SC_DNR);
282
283 switch (err_sctype) {
284
285 case (NVME_SC_CONNECT_INVALID_PARAM):
286 if (offset >> 16) {
287 char *inv_data = "Connect Invalid Data Parameter";
288
289 switch (offset & 0xffff) {
290 case (offsetof(struct nvmf_connect_data, cntlid)):
291 dev_err(ctrl->device,
292 "%s, cntlid: %d\n",
293 inv_data, data->cntlid);
294 break;
295 case (offsetof(struct nvmf_connect_data, hostnqn)):
296 dev_err(ctrl->device,
297 "%s, hostnqn \"%s\"\n",
298 inv_data, data->hostnqn);
299 break;
300 case (offsetof(struct nvmf_connect_data, subsysnqn)):
301 dev_err(ctrl->device,
302 "%s, subsysnqn \"%s\"\n",
303 inv_data, data->subsysnqn);
304 break;
305 default:
306 dev_err(ctrl->device,
307 "%s, starting byte offset: %d\n",
308 inv_data, offset & 0xffff);
309 break;
310 }
311 } else {
312 char *inv_sqe = "Connect Invalid SQE Parameter";
313
314 switch (offset) {
315 case (offsetof(struct nvmf_connect_command, qid)):
316 dev_err(ctrl->device,
317 "%s, qid %d\n",
318 inv_sqe, cmd->connect.qid);
319 break;
320 default:
321 dev_err(ctrl->device,
322 "%s, starting byte offset: %d\n",
323 inv_sqe, offset);
324 }
325 }
326 break;
327 default:
328 dev_err(ctrl->device,
329 "Connect command failed, error wo/DNR bit: %d\n",
330 err_sctype);
331 break;
332 } /* switch (err_sctype) */
333}
334
335/**
336 * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect"
337 * API function.
338 * @ctrl: Host nvme controller instance used to request
339 * a new NVMe controller allocation on the target
340 * system and establish an NVMe Admin connection to
341 * that controller.
342 *
343 * This function enables an NVMe host device to request a new allocation of
 344 * an NVMe controller resource on a target system as well as establish a
345 * fabrics-protocol connection of the NVMe Admin queue between the
346 * host system device and the allocated NVMe controller on the
 347 * target system via an NVMe Fabrics "Connect" command.
348 *
349 * Return:
350 * 0: success
351 * > 0: NVMe error status code
352 * < 0: Linux errno error code
353 *
354 */
355int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
356{
357 struct nvme_command cmd;
358 struct nvme_completion cqe;
359 struct nvmf_connect_data *data;
360 int ret;
361
362 memset(&cmd, 0, sizeof(cmd));
363 cmd.connect.opcode = nvme_fabrics_command;
364 cmd.connect.fctype = nvme_fabrics_type_connect;
365 cmd.connect.qid = 0;
366 cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
367 /*
 368	 * Set keep-alive timeout in millisecond granularity (seconds * 1000)
369 * and add a grace period for controller kato enforcement
370 */
371 cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
372 cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
373
374 data = kzalloc(sizeof(*data), GFP_KERNEL);
375 if (!data)
376 return -ENOMEM;
377
378 memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
379 data->cntlid = cpu_to_le16(0xffff);
380 strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
381 strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
382
383 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe,
384 data, sizeof(*data), 0, NVME_QID_ANY, 1,
385 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
386 if (ret) {
387 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
388 &cmd, data);
389 goto out_free_data;
390 }
391
392 ctrl->cntlid = le16_to_cpu(cqe.result16);
393
394out_free_data:
395 kfree(data);
396 return ret;
397}
398EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
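/*
 * Editorial sketch, not part of this patch: the usual admin bring-up order
 * in a fabrics transport is Connect first, then enable the controller via
 * the CC/CSTS properties, then start keep-alives (compare the reconnect
 * path of the RDMA driver below).  The wrapper name is hypothetical.
 */
static int __maybe_unused nvmf_example_admin_setup(struct nvme_ctrl *ctrl)
{
	u64 cap;
	int ret;

	ret = nvmf_connect_admin_queue(ctrl);
	if (ret)
		return ret;

	ret = nvmf_reg_read64(ctrl, NVME_REG_CAP, &cap);
	if (ret)
		return ret;

	ret = nvme_enable_ctrl(ctrl, cap);
	if (ret)
		return ret;

	nvme_start_keep_alive(ctrl);
	return 0;
}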
399
400/**
401 * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect"
402 * API function.
403 * @ctrl: Host nvme controller instance used to establish an
404 * NVMe I/O queue connection to the already allocated NVMe
405 * controller on the target system.
406 * @qid: NVMe I/O queue number for the new I/O connection between
407 * host and target (note qid == 0 is illegal as this is
408 * the Admin queue, per NVMe standard).
409 *
410 * This function issues a fabrics-protocol connection
411 * of a NVMe I/O queue (via NVMe Fabrics "Connect" command)
412 * between the host system device and the allocated NVMe controller
413 * on the target system.
414 *
415 * Return:
416 * 0: success
417 * > 0: NVMe error status code
418 * < 0: Linux errno error code
419 */
420int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
421{
422 struct nvme_command cmd;
423 struct nvmf_connect_data *data;
424 struct nvme_completion cqe;
425 int ret;
426
427 memset(&cmd, 0, sizeof(cmd));
428 cmd.connect.opcode = nvme_fabrics_command;
429 cmd.connect.fctype = nvme_fabrics_type_connect;
430 cmd.connect.qid = cpu_to_le16(qid);
431 cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
432
433 data = kzalloc(sizeof(*data), GFP_KERNEL);
434 if (!data)
435 return -ENOMEM;
436
437 memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
438 data->cntlid = cpu_to_le16(ctrl->cntlid);
439 strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
440 strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
441
442 ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe,
443 data, sizeof(*data), 0, qid, 1,
444 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
445 if (ret) {
446 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
447 &cmd, data);
448 }
449 kfree(data);
450 return ret;
451}
452EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
453
454/**
455 * nvmf_register_transport() - NVMe Fabrics Library registration function.
456 * @ops: Transport ops instance to be registered to the
457 * common fabrics library.
458 *
459 * API function that registers the type of specific transport fabric
460 * being implemented to the common NVMe fabrics library. Part of
461 * the overall init sequence of starting up a fabrics driver.
462 */
463void nvmf_register_transport(struct nvmf_transport_ops *ops)
464{
465 mutex_lock(&nvmf_transports_mutex);
466 list_add_tail(&ops->entry, &nvmf_transports);
467 mutex_unlock(&nvmf_transports_mutex);
468}
469EXPORT_SYMBOL_GPL(nvmf_register_transport);
470
471/**
472 * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function.
473 * @ops: Transport ops instance to be unregistered from the
474 * common fabrics library.
475 *
476 * Fabrics API function that unregisters the type of specific transport
477 * fabric being implemented from the common NVMe fabrics library.
478 * Part of the overall exit sequence of unloading the implemented driver.
479 */
480void nvmf_unregister_transport(struct nvmf_transport_ops *ops)
481{
482 mutex_lock(&nvmf_transports_mutex);
483 list_del(&ops->entry);
484 mutex_unlock(&nvmf_transports_mutex);
485}
486EXPORT_SYMBOL_GPL(nvmf_unregister_transport);
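/*
 * Editorial sketch, not part of this patch: the registration pair above is
 * intended to be called from a transport driver's module init/exit.  All
 * "example_*" names are hypothetical and the stub ->create_ctrl() only
 * illustrates the shape of the callback; a real transport (such as
 * nvme-rdma) returns a live controller here.
 */
static struct nvme_ctrl *example_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	return ERR_PTR(-EOPNOTSUPP);	/* illustration only */
}

static struct nvmf_transport_ops example_transport = {
	.name		= "example",
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID,
	.create_ctrl	= example_create_ctrl,
};

static int __init example_transport_init(void)
{
	nvmf_register_transport(&example_transport);
	return 0;
}

static void __exit example_transport_exit(void)
{
	nvmf_unregister_transport(&example_transport);
}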
487
488static struct nvmf_transport_ops *nvmf_lookup_transport(
489 struct nvmf_ctrl_options *opts)
490{
491 struct nvmf_transport_ops *ops;
492
493 lockdep_assert_held(&nvmf_transports_mutex);
494
495 list_for_each_entry(ops, &nvmf_transports, entry) {
496 if (strcmp(ops->name, opts->transport) == 0)
497 return ops;
498 }
499
500 return NULL;
501}
502
503static const match_table_t opt_tokens = {
504 { NVMF_OPT_TRANSPORT, "transport=%s" },
505 { NVMF_OPT_TRADDR, "traddr=%s" },
506 { NVMF_OPT_TRSVCID, "trsvcid=%s" },
507 { NVMF_OPT_NQN, "nqn=%s" },
508 { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" },
509 { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" },
510 { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
511 { NVMF_OPT_KATO, "keep_alive_tmo=%d" },
512 { NVMF_OPT_HOSTNQN, "hostnqn=%s" },
513 { NVMF_OPT_ERR, NULL }
514};
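/*
 * Editorial sketch, not part of this patch: an example of the comma
 * separated key=value string the tokens above describe, as written to
 * /dev/nvme-fabrics.  The address, port and subsystem NQN are made up.
 */
static const char nvmf_example_connect_str[] __maybe_unused =
	"transport=rdma,traddr=10.0.0.2,trsvcid=4420,"
	"nqn=nqn.2016-06.example:sub1,nr_io_queues=4,queue_size=128";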
515
516static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
517 const char *buf)
518{
519 substring_t args[MAX_OPT_ARGS];
520 char *options, *o, *p;
521 int token, ret = 0;
522 size_t nqnlen = 0;
523
524 /* Set defaults */
525 opts->queue_size = NVMF_DEF_QUEUE_SIZE;
526 opts->nr_io_queues = num_online_cpus();
527 opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
528
529 options = o = kstrdup(buf, GFP_KERNEL);
530 if (!options)
531 return -ENOMEM;
532
533 while ((p = strsep(&o, ",\n")) != NULL) {
534 if (!*p)
535 continue;
536
537 token = match_token(p, opt_tokens, args);
538 opts->mask |= token;
539 switch (token) {
540 case NVMF_OPT_TRANSPORT:
541 p = match_strdup(args);
542 if (!p) {
543 ret = -ENOMEM;
544 goto out;
545 }
546 opts->transport = p;
547 break;
548 case NVMF_OPT_NQN:
549 p = match_strdup(args);
550 if (!p) {
551 ret = -ENOMEM;
552 goto out;
553 }
554 opts->subsysnqn = p;
555 nqnlen = strlen(opts->subsysnqn);
556 if (nqnlen >= NVMF_NQN_SIZE) {
557 pr_err("%s needs to be < %d bytes\n",
558 opts->subsysnqn, NVMF_NQN_SIZE);
559 ret = -EINVAL;
560 goto out;
561 }
562 opts->discovery_nqn =
563 !(strcmp(opts->subsysnqn,
564 NVME_DISC_SUBSYS_NAME));
565 if (opts->discovery_nqn)
566 opts->nr_io_queues = 0;
567 break;
568 case NVMF_OPT_TRADDR:
569 p = match_strdup(args);
570 if (!p) {
571 ret = -ENOMEM;
572 goto out;
573 }
574 opts->traddr = p;
575 break;
576 case NVMF_OPT_TRSVCID:
577 p = match_strdup(args);
578 if (!p) {
579 ret = -ENOMEM;
580 goto out;
581 }
582 opts->trsvcid = p;
583 break;
584 case NVMF_OPT_QUEUE_SIZE:
585 if (match_int(args, &token)) {
586 ret = -EINVAL;
587 goto out;
588 }
589 if (token < NVMF_MIN_QUEUE_SIZE ||
590 token > NVMF_MAX_QUEUE_SIZE) {
591 pr_err("Invalid queue_size %d\n", token);
592 ret = -EINVAL;
593 goto out;
594 }
595 opts->queue_size = token;
596 break;
597 case NVMF_OPT_NR_IO_QUEUES:
598 if (match_int(args, &token)) {
599 ret = -EINVAL;
600 goto out;
601 }
602 if (token <= 0) {
603 pr_err("Invalid number of IOQs %d\n", token);
604 ret = -EINVAL;
605 goto out;
606 }
607 opts->nr_io_queues = min_t(unsigned int,
608 num_online_cpus(), token);
609 break;
610 case NVMF_OPT_KATO:
611 if (match_int(args, &token)) {
612 ret = -EINVAL;
613 goto out;
614 }
615
616 if (opts->discovery_nqn) {
617 pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n");
618 ret = -EINVAL;
619 goto out;
620 }
621
622 if (token < 0) {
623 pr_err("Invalid keep_alive_tmo %d\n", token);
624 ret = -EINVAL;
625 goto out;
626 } else if (token == 0) {
627 /* Allowed for debug */
628 pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
629 }
630 opts->kato = token;
631 break;
632 case NVMF_OPT_HOSTNQN:
633 if (opts->host) {
634 pr_err("hostnqn already user-assigned: %s\n",
635 opts->host->nqn);
636 ret = -EADDRINUSE;
637 goto out;
638 }
639 p = match_strdup(args);
640 if (!p) {
641 ret = -ENOMEM;
642 goto out;
643 }
644 nqnlen = strlen(p);
645 if (nqnlen >= NVMF_NQN_SIZE) {
646 pr_err("%s needs to be < %d bytes\n",
647 p, NVMF_NQN_SIZE);
648 ret = -EINVAL;
649 goto out;
650 }
651 opts->host = nvmf_host_add(p);
652 if (!opts->host) {
653 ret = -ENOMEM;
654 goto out;
655 }
656 break;
657 case NVMF_OPT_RECONNECT_DELAY:
658 if (match_int(args, &token)) {
659 ret = -EINVAL;
660 goto out;
661 }
662 if (token <= 0) {
663 pr_err("Invalid reconnect_delay %d\n", token);
664 ret = -EINVAL;
665 goto out;
666 }
667 opts->reconnect_delay = token;
668 break;
669 default:
670 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
671 p);
672 ret = -EINVAL;
673 goto out;
674 }
675 }
676
677 if (!opts->host) {
678 kref_get(&nvmf_default_host->ref);
679 opts->host = nvmf_default_host;
680 }
681
682out:
683 if (!opts->discovery_nqn && !opts->kato)
684 opts->kato = NVME_DEFAULT_KATO;
685 kfree(options);
686 return ret;
687}
688
689static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts,
690 unsigned int required_opts)
691{
692 if ((opts->mask & required_opts) != required_opts) {
693 int i;
694
695 for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) {
696 if ((opt_tokens[i].token & required_opts) &&
697 !(opt_tokens[i].token & opts->mask)) {
698 pr_warn("missing parameter '%s'\n",
699 opt_tokens[i].pattern);
700 }
701 }
702
703 return -EINVAL;
704 }
705
706 return 0;
707}
708
709static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
710 unsigned int allowed_opts)
711{
712 if (opts->mask & ~allowed_opts) {
713 int i;
714
715 for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) {
716 if (opt_tokens[i].token & ~allowed_opts) {
717 pr_warn("invalid parameter '%s'\n",
718 opt_tokens[i].pattern);
719 }
720 }
721
722 return -EINVAL;
723 }
724
725 return 0;
726}
727
728void nvmf_free_options(struct nvmf_ctrl_options *opts)
729{
730 nvmf_host_put(opts->host);
731 kfree(opts->transport);
732 kfree(opts->traddr);
733 kfree(opts->trsvcid);
734 kfree(opts->subsysnqn);
735 kfree(opts);
736}
737EXPORT_SYMBOL_GPL(nvmf_free_options);
738
739#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
740#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
741 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN)
742
743static struct nvme_ctrl *
744nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
745{
746 struct nvmf_ctrl_options *opts;
747 struct nvmf_transport_ops *ops;
748 struct nvme_ctrl *ctrl;
749 int ret;
750
751 opts = kzalloc(sizeof(*opts), GFP_KERNEL);
752 if (!opts)
753 return ERR_PTR(-ENOMEM);
754
755 ret = nvmf_parse_options(opts, buf);
756 if (ret)
757 goto out_free_opts;
758
759 /*
760 * Check the generic options first as we need a valid transport for
761 * the lookup below. Then clear the generic flags so that transport
762 * drivers don't have to care about them.
763 */
764 ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS);
765 if (ret)
766 goto out_free_opts;
767 opts->mask &= ~NVMF_REQUIRED_OPTS;
768
769 mutex_lock(&nvmf_transports_mutex);
770 ops = nvmf_lookup_transport(opts);
771 if (!ops) {
772 pr_info("no handler found for transport %s.\n",
773 opts->transport);
774 ret = -EINVAL;
775 goto out_unlock;
776 }
777
778 ret = nvmf_check_required_opts(opts, ops->required_opts);
779 if (ret)
780 goto out_unlock;
781 ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
782 ops->allowed_opts | ops->required_opts);
783 if (ret)
784 goto out_unlock;
785
786 ctrl = ops->create_ctrl(dev, opts);
787 if (IS_ERR(ctrl)) {
788 ret = PTR_ERR(ctrl);
789 goto out_unlock;
790 }
791
792 mutex_unlock(&nvmf_transports_mutex);
793 return ctrl;
794
795out_unlock:
796 mutex_unlock(&nvmf_transports_mutex);
797out_free_opts:
798 nvmf_host_put(opts->host);
799 kfree(opts);
800 return ERR_PTR(ret);
801}
802
803static struct class *nvmf_class;
804static struct device *nvmf_device;
805static DEFINE_MUTEX(nvmf_dev_mutex);
806
807static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf,
808 size_t count, loff_t *pos)
809{
810 struct seq_file *seq_file = file->private_data;
811 struct nvme_ctrl *ctrl;
812 const char *buf;
813 int ret = 0;
814
815 if (count > PAGE_SIZE)
816 return -ENOMEM;
817
818 buf = memdup_user_nul(ubuf, count);
819 if (IS_ERR(buf))
820 return PTR_ERR(buf);
821
822 mutex_lock(&nvmf_dev_mutex);
823 if (seq_file->private) {
824 ret = -EINVAL;
825 goto out_unlock;
826 }
827
828 ctrl = nvmf_create_ctrl(nvmf_device, buf, count);
829 if (IS_ERR(ctrl)) {
830 ret = PTR_ERR(ctrl);
831 goto out_unlock;
832 }
833
834 seq_file->private = ctrl;
835
836out_unlock:
837 mutex_unlock(&nvmf_dev_mutex);
838 kfree(buf);
839 return ret ? ret : count;
840}
841
842static int nvmf_dev_show(struct seq_file *seq_file, void *private)
843{
844 struct nvme_ctrl *ctrl;
845 int ret = 0;
846
847 mutex_lock(&nvmf_dev_mutex);
848 ctrl = seq_file->private;
849 if (!ctrl) {
850 ret = -EINVAL;
851 goto out_unlock;
852 }
853
854 seq_printf(seq_file, "instance=%d,cntlid=%d\n",
855 ctrl->instance, ctrl->cntlid);
856
857out_unlock:
858 mutex_unlock(&nvmf_dev_mutex);
859 return ret;
860}
861
862static int nvmf_dev_open(struct inode *inode, struct file *file)
863{
864 /*
865 * The miscdevice code initializes file->private_data, but doesn't
866 * make use of it later.
867 */
868 file->private_data = NULL;
869 return single_open(file, nvmf_dev_show, NULL);
870}
871
872static int nvmf_dev_release(struct inode *inode, struct file *file)
873{
874 struct seq_file *seq_file = file->private_data;
875 struct nvme_ctrl *ctrl = seq_file->private;
876
877 if (ctrl)
878 nvme_put_ctrl(ctrl);
879 return single_release(inode, file);
880}
881
882static const struct file_operations nvmf_dev_fops = {
883 .owner = THIS_MODULE,
884 .write = nvmf_dev_write,
885 .read = seq_read,
886 .open = nvmf_dev_open,
887 .release = nvmf_dev_release,
888};
889
890static struct miscdevice nvmf_misc = {
891 .minor = MISC_DYNAMIC_MINOR,
892 .name = "nvme-fabrics",
893 .fops = &nvmf_dev_fops,
894};
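/*
 * Editorial sketch, not part of this patch: minimal user space usage of the
 * misc device registered above.  Writing a connect string creates a
 * controller (nvmf_dev_write()); a subsequent read on the same fd reports
 * "instance=%d,cntlid=%d" (nvmf_dev_show()).  The connect string values are
 * made up.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char arg[] = "transport=rdma,traddr=10.0.0.2,"
 *			"trsvcid=4420,nqn=nqn.2016-06.example:sub1";
 *		char reply[64] = "";
 *		int fd = open("/dev/nvme-fabrics", O_RDWR);
 *
 *		if (fd < 0 || write(fd, arg, strlen(arg)) < 0)
 *			return 1;
 *		if (read(fd, reply, sizeof(reply) - 1) > 0)
 *			printf("created controller: %s", reply);
 *		close(fd);
 *		return 0;
 *	}
 */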
895
896static int __init nvmf_init(void)
897{
898 int ret;
899
900 nvmf_default_host = nvmf_host_default();
901 if (!nvmf_default_host)
902 return -ENOMEM;
903
904 nvmf_class = class_create(THIS_MODULE, "nvme-fabrics");
905 if (IS_ERR(nvmf_class)) {
906 pr_err("couldn't register class nvme-fabrics\n");
907 ret = PTR_ERR(nvmf_class);
908 goto out_free_host;
909 }
910
911 nvmf_device =
912 device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
913 if (IS_ERR(nvmf_device)) {
914 pr_err("couldn't create nvme-fabris device!\n");
915 ret = PTR_ERR(nvmf_device);
916 goto out_destroy_class;
917 }
918
919 ret = misc_register(&nvmf_misc);
920 if (ret) {
921 pr_err("couldn't register misc device: %d\n", ret);
922 goto out_destroy_device;
923 }
924
925 return 0;
926
927out_destroy_device:
928 device_destroy(nvmf_class, MKDEV(0, 0));
929out_destroy_class:
930 class_destroy(nvmf_class);
931out_free_host:
932 nvmf_host_put(nvmf_default_host);
933 return ret;
934}
935
936static void __exit nvmf_exit(void)
937{
938 misc_deregister(&nvmf_misc);
939 device_destroy(nvmf_class, MKDEV(0, 0));
940 class_destroy(nvmf_class);
941 nvmf_host_put(nvmf_default_host);
942
943 BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
944 BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
945 BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
946 BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
947}
948
949MODULE_LICENSE("GPL v2");
950
951module_init(nvmf_init);
952module_exit(nvmf_exit);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
new file mode 100644
index 000000000000..89df52c8be97
--- /dev/null
+++ b/drivers/nvme/host/fabrics.h
@@ -0,0 +1,132 @@
1/*
2 * NVMe over Fabrics common host code.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#ifndef _NVME_FABRICS_H
15#define _NVME_FABRICS_H 1
16
17#include <linux/in.h>
18#include <linux/inet.h>
19
20#define NVMF_MIN_QUEUE_SIZE 16
21#define NVMF_MAX_QUEUE_SIZE 1024
22#define NVMF_DEF_QUEUE_SIZE 128
23#define NVMF_DEF_RECONNECT_DELAY 10
24
25/*
26 * Define a host as seen by the target. We allocate one at boot, but also
 27 * allow overriding it when creating controllers. This is both to provide
28 * persistence of the Host NQN over multiple boots, and to allow using
29 * multiple ones, for example in a container scenario. Because we must not
30 * use different Host NQNs with the same Host ID we generate a Host ID and
31 * use this structure to keep track of the relation between the two.
32 */
33struct nvmf_host {
34 struct kref ref;
35 struct list_head list;
36 char nqn[NVMF_NQN_SIZE];
37 uuid_le id;
38};
39
40/**
 41 * enum nvmf_parsing_opts - used to define the sysfs parsing options.
42 */
43enum {
44 NVMF_OPT_ERR = 0,
45 NVMF_OPT_TRANSPORT = 1 << 0,
46 NVMF_OPT_NQN = 1 << 1,
47 NVMF_OPT_TRADDR = 1 << 2,
48 NVMF_OPT_TRSVCID = 1 << 3,
49 NVMF_OPT_QUEUE_SIZE = 1 << 4,
50 NVMF_OPT_NR_IO_QUEUES = 1 << 5,
51 NVMF_OPT_TL_RETRY_COUNT = 1 << 6,
52 NVMF_OPT_KATO = 1 << 7,
53 NVMF_OPT_HOSTNQN = 1 << 8,
54 NVMF_OPT_RECONNECT_DELAY = 1 << 9,
55};
56
57/**
58 * struct nvmf_ctrl_options - Used to hold the options specified
59 * with the parsing opts enum.
60 * @mask: Used by the fabrics library to parse through sysfs options
 61 *		on adding an NVMe controller.
 62 * @transport:	Holds the fabric transport "technology name" (for lack of a
63 * better description) that will be used by an NVMe controller
64 * being added.
 65 * @subsysnqn:	Holds the fully qualified NQN subsystem name (format defined
66 * in the NVMe specification, "NVMe Qualified Names").
67 * @traddr: network address that will be used by the host to communicate
68 * to the added NVMe controller.
69 * @trsvcid: network port used for host-controller communication.
70 * @queue_size: Number of IO queue elements.
71 * @nr_io_queues: Number of controller IO queues that will be established.
72 * @reconnect_delay: Time between two consecutive reconnect attempts.
73 * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
74 * @kato: Keep-alive timeout.
75 * @host: Virtual NVMe host, contains the NQN and Host ID.
76 */
77struct nvmf_ctrl_options {
78 unsigned mask;
79 char *transport;
80 char *subsysnqn;
81 char *traddr;
82 char *trsvcid;
83 size_t queue_size;
84 unsigned int nr_io_queues;
85 unsigned int reconnect_delay;
86 bool discovery_nqn;
87 unsigned int kato;
88 struct nvmf_host *host;
89};
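/*
 * Editorial sketch, not part of this patch: roughly what nvmf_parse_options()
 * leaves behind for a made-up connect string such as
 * "transport=rdma,traddr=10.0.0.2,trsvcid=4420,nqn=nqn.2016-06.example:sub1".
 * Fields not set here keep the parser defaults (nr_io_queues, kato and the
 * default host); in the real path the strings are kstrdup()ed copies rather
 * than literals.
 */
static const struct nvmf_ctrl_options nvmf_example_opts __maybe_unused = {
	.mask		 = NVMF_OPT_TRANSPORT | NVMF_OPT_TRADDR |
			   NVMF_OPT_TRSVCID | NVMF_OPT_NQN,
	.transport	 = "rdma",
	.traddr		 = "10.0.0.2",
	.trsvcid	 = "4420",
	.subsysnqn	 = "nqn.2016-06.example:sub1",
	.queue_size	 = NVMF_DEF_QUEUE_SIZE,
	.reconnect_delay = NVMF_DEF_RECONNECT_DELAY,
};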
90
91/*
92 * struct nvmf_transport_ops - used to register a specific
93 * fabric implementation of NVMe fabrics.
94 * @entry: Used by the fabrics library to add the new
 95 *			registration entry to its internal linked list.
96 * @name: Name of the NVMe fabric driver implementation.
97 * @required_opts: sysfs command-line options that must be specified
98 * when adding a new NVMe controller.
99 * @allowed_opts: sysfs command-line options that can be specified
100 * when adding a new NVMe controller.
 101 * @create_ctrl():	function pointer that points to the fabric
 102 *			implementation-specific routine that handles
 103 *			starting up that fabric for the purpose of
 104 *			connection to an NVMe controller
 105 *			using that fabric technology.
106 *
107 * Notes:
108 * 1. At minimum, 'required_opts' and 'allowed_opts' should
109 * be set to the same enum parsing options defined earlier.
110 * 2. create_ctrl() must be defined (even if it does nothing)
111 */
112struct nvmf_transport_ops {
113 struct list_head entry;
114 const char *name;
115 int required_opts;
116 int allowed_opts;
117 struct nvme_ctrl *(*create_ctrl)(struct device *dev,
118 struct nvmf_ctrl_options *opts);
119};
120
121int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
122int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
123int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
124int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
125int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
126void nvmf_register_transport(struct nvmf_transport_ops *ops);
127void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
128void nvmf_free_options(struct nvmf_ctrl_options *opts);
129const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
130int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
131
132#endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index a0af0558354c..63f483daf930 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -156,7 +156,7 @@ struct nvme_nvm_completion {
156 156
157#define NVME_NVM_LP_MLC_PAIRS 886 157#define NVME_NVM_LP_MLC_PAIRS 886
158struct nvme_nvm_lp_mlc { 158struct nvme_nvm_lp_mlc {
159 __u16 num_pairs; 159 __le16 num_pairs;
160 __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; 160 __u8 pairs[NVME_NVM_LP_MLC_PAIRS];
161}; 161};
162 162
@@ -500,7 +500,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
500 struct bio *bio = rqd->bio; 500 struct bio *bio = rqd->bio;
501 struct nvme_nvm_command *cmd; 501 struct nvme_nvm_command *cmd;
502 502
503 rq = blk_mq_alloc_request(q, bio_rw(bio), 0); 503 rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
504 if (IS_ERR(rq)) 504 if (IS_ERR(rq))
505 return -ENOMEM; 505 return -ENOMEM;
506 506
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4d196d2d57da..ab18b78102bf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -38,6 +38,11 @@ extern unsigned char admin_timeout;
38extern unsigned char shutdown_timeout; 38extern unsigned char shutdown_timeout;
39#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) 39#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
40 40
41#define NVME_DEFAULT_KATO 5
42#define NVME_KATO_GRACE 10
43
44extern unsigned int nvme_max_retries;
45
41enum { 46enum {
42 NVME_NS_LBA = 0, 47 NVME_NS_LBA = 0,
43 NVME_NS_LIGHTNVM = 1, 48 NVME_NS_LIGHTNVM = 1,
@@ -65,12 +70,26 @@ enum nvme_quirks {
65 * logical blocks. 70 * logical blocks.
66 */ 71 */
67 NVME_QUIRK_DISCARD_ZEROES = (1 << 2), 72 NVME_QUIRK_DISCARD_ZEROES = (1 << 2),
73
74 /*
 75 * The controller needs a delay before it starts checking the device
76 * readiness, which is done by reading the NVME_CSTS_RDY bit.
77 */
78 NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3),
68}; 79};
69 80
81/* The below value is the specific amount of delay needed before checking
82 * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
83 * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
84 * found empirically.
85 */
86#define NVME_QUIRK_DELAY_AMOUNT 2000
87
70enum nvme_ctrl_state { 88enum nvme_ctrl_state {
71 NVME_CTRL_NEW, 89 NVME_CTRL_NEW,
72 NVME_CTRL_LIVE, 90 NVME_CTRL_LIVE,
73 NVME_CTRL_RESETTING, 91 NVME_CTRL_RESETTING,
92 NVME_CTRL_RECONNECTING,
74 NVME_CTRL_DELETING, 93 NVME_CTRL_DELETING,
75 NVME_CTRL_DEAD, 94 NVME_CTRL_DEAD,
76}; 95};
@@ -80,6 +99,7 @@ struct nvme_ctrl {
80 spinlock_t lock; 99 spinlock_t lock;
81 const struct nvme_ctrl_ops *ops; 100 const struct nvme_ctrl_ops *ops;
82 struct request_queue *admin_q; 101 struct request_queue *admin_q;
102 struct request_queue *connect_q;
83 struct device *dev; 103 struct device *dev;
84 struct kref kref; 104 struct kref kref;
85 int instance; 105 int instance;
@@ -107,10 +127,22 @@ struct nvme_ctrl {
107 u8 event_limit; 127 u8 event_limit;
108 u8 vwc; 128 u8 vwc;
109 u32 vs; 129 u32 vs;
130 u32 sgls;
131 u16 kas;
132 unsigned int kato;
110 bool subsystem; 133 bool subsystem;
111 unsigned long quirks; 134 unsigned long quirks;
112 struct work_struct scan_work; 135 struct work_struct scan_work;
113 struct work_struct async_event_work; 136 struct work_struct async_event_work;
137 struct delayed_work ka_work;
138
139 /* Fabrics only */
140 u16 sqsize;
141 u32 ioccsz;
142 u32 iorcsz;
143 u16 icdoff;
144 u16 maxcmd;
145 struct nvmf_ctrl_options *opts;
114}; 146};
115 147
116/* 148/*
@@ -144,7 +176,9 @@ struct nvme_ns {
144}; 176};
145 177
146struct nvme_ctrl_ops { 178struct nvme_ctrl_ops {
179 const char *name;
147 struct module *module; 180 struct module *module;
181 bool is_fabrics;
148 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); 182 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
149 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); 183 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
150 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); 184 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -152,6 +186,9 @@ struct nvme_ctrl_ops {
152 void (*free_ctrl)(struct nvme_ctrl *ctrl); 186 void (*free_ctrl)(struct nvme_ctrl *ctrl);
153 void (*post_scan)(struct nvme_ctrl *ctrl); 187 void (*post_scan)(struct nvme_ctrl *ctrl);
154 void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); 188 void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx);
189 int (*delete_ctrl)(struct nvme_ctrl *ctrl);
190 const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl);
191 int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
155}; 192};
156 193
157static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) 194static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
@@ -204,9 +241,11 @@ static inline int nvme_error_status(u16 status)
204static inline bool nvme_req_needs_retry(struct request *req, u16 status) 241static inline bool nvme_req_needs_retry(struct request *req, u16 status)
205{ 242{
206 return !(status & NVME_SC_DNR || blk_noretry_request(req)) && 243 return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
207 (jiffies - req->start_time) < req->timeout; 244 (jiffies - req->start_time) < req->timeout &&
245 req->retries < nvme_max_retries;
208} 246}
209 247
248void nvme_cancel_request(struct request *req, void *data, bool reserved);
210bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 249bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
211 enum nvme_ctrl_state new_state); 250 enum nvme_ctrl_state new_state);
212int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); 251int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
@@ -230,8 +269,9 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl);
230void nvme_start_queues(struct nvme_ctrl *ctrl); 269void nvme_start_queues(struct nvme_ctrl *ctrl);
231void nvme_kill_queues(struct nvme_ctrl *ctrl); 270void nvme_kill_queues(struct nvme_ctrl *ctrl);
232 271
272#define NVME_QID_ANY -1
233struct request *nvme_alloc_request(struct request_queue *q, 273struct request *nvme_alloc_request(struct request_queue *q,
234 struct nvme_command *cmd, unsigned int flags); 274 struct nvme_command *cmd, unsigned int flags, int qid);
235void nvme_requeue_req(struct request *req); 275void nvme_requeue_req(struct request *req);
236int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 276int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
237 struct nvme_command *cmd); 277 struct nvme_command *cmd);
@@ -239,7 +279,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
239 void *buf, unsigned bufflen); 279 void *buf, unsigned bufflen);
240int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 280int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
241 struct nvme_completion *cqe, void *buffer, unsigned bufflen, 281 struct nvme_completion *cqe, void *buffer, unsigned bufflen,
242 unsigned timeout); 282 unsigned timeout, int qid, int at_head, int flags);
243int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, 283int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
244 void __user *ubuffer, unsigned bufflen, u32 *result, 284 void __user *ubuffer, unsigned bufflen, u32 *result,
245 unsigned timeout); 285 unsigned timeout);
@@ -256,6 +296,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
256int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, 296int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
257 dma_addr_t dma_addr, u32 *result); 297 dma_addr_t dma_addr, u32 *result);
258int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); 298int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
299void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
300void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
259 301
260struct sg_io_hdr; 302struct sg_io_hdr;
261 303
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index befac5b19490..4cb9b156cab7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -310,6 +310,11 @@ static int nvme_init_iod(struct request *rq, unsigned size,
310 iod->npages = -1; 310 iod->npages = -1;
311 iod->nents = 0; 311 iod->nents = 0;
312 iod->length = size; 312 iod->length = size;
313
314 if (!(rq->cmd_flags & REQ_DONTPREP)) {
315 rq->retries = 0;
316 rq->cmd_flags |= REQ_DONTPREP;
317 }
313 return 0; 318 return 0;
314} 319}
315 320
@@ -520,8 +525,8 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
520 goto out_unmap; 525 goto out_unmap;
521 } 526 }
522 527
523 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 528 cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
524 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); 529 cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
525 if (blk_integrity_rq(req)) 530 if (blk_integrity_rq(req))
526 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); 531 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
527 return BLK_MQ_RQ_QUEUE_OK; 532 return BLK_MQ_RQ_QUEUE_OK;
@@ -623,6 +628,7 @@ static void nvme_complete_rq(struct request *req)
623 628
624 if (unlikely(req->errors)) { 629 if (unlikely(req->errors)) {
625 if (nvme_req_needs_retry(req, req->errors)) { 630 if (nvme_req_needs_retry(req, req->errors)) {
631 req->retries++;
626 nvme_requeue_req(req); 632 nvme_requeue_req(req);
627 return; 633 return;
628 } 634 }
@@ -901,7 +907,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
901 req->tag, nvmeq->qid); 907 req->tag, nvmeq->qid);
902 908
903 abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, 909 abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
904 BLK_MQ_REQ_NOWAIT); 910 BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
905 if (IS_ERR(abort_req)) { 911 if (IS_ERR(abort_req)) {
906 atomic_inc(&dev->ctrl.abort_limit); 912 atomic_inc(&dev->ctrl.abort_limit);
907 return BLK_EH_RESET_TIMER; 913 return BLK_EH_RESET_TIMER;
@@ -919,22 +925,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
919 return BLK_EH_RESET_TIMER; 925 return BLK_EH_RESET_TIMER;
920} 926}
921 927
922static void nvme_cancel_io(struct request *req, void *data, bool reserved)
923{
924 int status;
925
926 if (!blk_mq_request_started(req))
927 return;
928
929 dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device,
930 "Cancelling I/O %d", req->tag);
931
932 status = NVME_SC_ABORT_REQ;
933 if (blk_queue_dying(req->q))
934 status |= NVME_SC_DNR;
935 blk_mq_complete_request(req, status);
936}
937
938static void nvme_free_queue(struct nvme_queue *nvmeq) 928static void nvme_free_queue(struct nvme_queue *nvmeq)
939{ 929{
940 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 930 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@@ -1399,16 +1389,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1399 if (result < 0) 1389 if (result < 0)
1400 return result; 1390 return result;
1401 1391
1402 /* 1392 if (nr_io_queues == 0)
1403 * Degraded controllers might return an error when setting the queue
1404 * count. We still want to be able to bring them online and offer
1405 * access to the admin queue, as that might be only way to fix them up.
1406 */
1407 if (result > 0) {
1408 dev_err(dev->ctrl.device,
1409 "Could not set queue count (%d)\n", result);
1410 return 0; 1393 return 0;
1411 }
1412 1394
1413 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 1395 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
1414 result = nvme_cmb_qdepth(dev, nr_io_queues, 1396 result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -1536,7 +1518,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
1536 cmd.delete_queue.opcode = opcode; 1518 cmd.delete_queue.opcode = opcode;
1537 cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); 1519 cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
1538 1520
1539 req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); 1521 req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
1540 if (IS_ERR(req)) 1522 if (IS_ERR(req))
1541 return PTR_ERR(req); 1523 return PTR_ERR(req);
1542 1524
@@ -1727,8 +1709,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
1727 } 1709 }
1728 nvme_pci_disable(dev); 1710 nvme_pci_disable(dev);
1729 1711
1730 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev); 1712 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
1731 blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev); 1713 blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
1732 mutex_unlock(&dev->shutdown_lock); 1714 mutex_unlock(&dev->shutdown_lock);
1733} 1715}
1734 1716
@@ -1902,6 +1884,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
1902} 1884}
1903 1885
1904static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { 1886static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
1887 .name = "pcie",
1905 .module = THIS_MODULE, 1888 .module = THIS_MODULE,
1906 .reg_read32 = nvme_pci_reg_read32, 1889 .reg_read32 = nvme_pci_reg_read32,
1907 .reg_write32 = nvme_pci_reg_write32, 1890 .reg_write32 = nvme_pci_reg_write32,
@@ -1940,7 +1923,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
1940 1923
1941 node = dev_to_node(&pdev->dev); 1924 node = dev_to_node(&pdev->dev);
1942 if (node == NUMA_NO_NODE) 1925 if (node == NUMA_NO_NODE)
1943 set_dev_node(&pdev->dev, 0); 1926 set_dev_node(&pdev->dev, first_memory_node);
1944 1927
1945 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 1928 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
1946 if (!dev) 1929 if (!dev)
@@ -2037,6 +2020,24 @@ static void nvme_remove(struct pci_dev *pdev)
2037 nvme_put_ctrl(&dev->ctrl); 2020 nvme_put_ctrl(&dev->ctrl);
2038} 2021}
2039 2022
2023static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs)
2024{
2025 int ret = 0;
2026
2027 if (numvfs == 0) {
2028 if (pci_vfs_assigned(pdev)) {
2029 dev_warn(&pdev->dev,
2030 "Cannot disable SR-IOV VFs while assigned\n");
2031 return -EPERM;
2032 }
2033 pci_disable_sriov(pdev);
2034 return 0;
2035 }
2036
2037 ret = pci_enable_sriov(pdev, numvfs);
2038 return ret ? ret : numvfs;
2039}
2040
2040#ifdef CONFIG_PM_SLEEP 2041#ifdef CONFIG_PM_SLEEP
2041static int nvme_suspend(struct device *dev) 2042static int nvme_suspend(struct device *dev)
2042{ 2043{
@@ -2122,6 +2123,8 @@ static const struct pci_device_id nvme_id_table[] = {
2122 NVME_QUIRK_DISCARD_ZEROES, }, 2123 NVME_QUIRK_DISCARD_ZEROES, },
2123 { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ 2124 { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
2124 .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, 2125 .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
2126 { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
2127 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
2125 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 2128 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
2126 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, 2129 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
2127 { 0, } 2130 { 0, }
@@ -2137,6 +2140,7 @@ static struct pci_driver nvme_driver = {
2137 .driver = { 2140 .driver = {
2138 .pm = &nvme_dev_pm_ops, 2141 .pm = &nvme_dev_pm_ops,
2139 }, 2142 },
2143 .sriov_configure = nvme_pci_sriov_configure,
2140 .err_handler = &nvme_err_handler, 2144 .err_handler = &nvme_err_handler,
2141}; 2145};
2142 2146
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
new file mode 100644
index 000000000000..3e3ce2b0424e
--- /dev/null
+++ b/drivers/nvme/host/rdma.c
@@ -0,0 +1,2018 @@
1/*
2 * NVMe over Fabrics RDMA host code.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/delay.h>
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/slab.h>
19#include <linux/err.h>
20#include <linux/string.h>
21#include <linux/jiffies.h>
22#include <linux/atomic.h>
23#include <linux/blk-mq.h>
24#include <linux/types.h>
25#include <linux/list.h>
26#include <linux/mutex.h>
27#include <linux/scatterlist.h>
28#include <linux/nvme.h>
29#include <linux/t10-pi.h>
30#include <asm/unaligned.h>
31
32#include <rdma/ib_verbs.h>
33#include <rdma/rdma_cm.h>
34#include <rdma/ib_cm.h>
35#include <linux/nvme-rdma.h>
36
37#include "nvme.h"
38#include "fabrics.h"
39
40
41#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */
42
43#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */
44
45#define NVME_RDMA_MAX_SEGMENTS 256
46
47#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
48
49#define NVME_RDMA_MAX_PAGES_PER_MR 512
50
51#define NVME_RDMA_DEF_RECONNECT_DELAY 20
52
53/*
54 * We handle AEN commands ourselves and don't even let the
55 * block layer know about them.
56 */
57#define NVME_RDMA_NR_AEN_COMMANDS 1
58#define NVME_RDMA_AQ_BLKMQ_DEPTH \
59 (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
60
61struct nvme_rdma_device {
62 struct ib_device *dev;
63 struct ib_pd *pd;
64 struct ib_mr *mr;
65 struct kref ref;
66 struct list_head entry;
67};
68
69struct nvme_rdma_qe {
70 struct ib_cqe cqe;
71 void *data;
72 u64 dma;
73};
74
75struct nvme_rdma_queue;
76struct nvme_rdma_request {
77 struct ib_mr *mr;
78 struct nvme_rdma_qe sqe;
79 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
80 u32 num_sge;
81 int nents;
82 bool inline_data;
83 bool need_inval;
84 struct ib_reg_wr reg_wr;
85 struct ib_cqe reg_cqe;
86 struct nvme_rdma_queue *queue;
87 struct sg_table sg_table;
88 struct scatterlist first_sgl[];
89};
90
91enum nvme_rdma_queue_flags {
92 NVME_RDMA_Q_CONNECTED = (1 << 0),
93};
94
95struct nvme_rdma_queue {
96 struct nvme_rdma_qe *rsp_ring;
97 u8 sig_count;
98 int queue_size;
99 size_t cmnd_capsule_len;
100 struct nvme_rdma_ctrl *ctrl;
101 struct nvme_rdma_device *device;
102 struct ib_cq *ib_cq;
103 struct ib_qp *qp;
104
105 unsigned long flags;
106 struct rdma_cm_id *cm_id;
107 int cm_error;
108 struct completion cm_done;
109};
110
111struct nvme_rdma_ctrl {
112 /* read and written in the hot path */
113 spinlock_t lock;
114
115 /* read only in the hot path */
116 struct nvme_rdma_queue *queues;
117 u32 queue_count;
118
119 /* other member variables */
120 struct blk_mq_tag_set tag_set;
121 struct work_struct delete_work;
122 struct work_struct reset_work;
123 struct work_struct err_work;
124
125 struct nvme_rdma_qe async_event_sqe;
126
127 int reconnect_delay;
128 struct delayed_work reconnect_work;
129
130 struct list_head list;
131
132 struct blk_mq_tag_set admin_tag_set;
133 struct nvme_rdma_device *device;
134
135 u64 cap;
136 u32 max_fr_pages;
137
138 union {
139 struct sockaddr addr;
140 struct sockaddr_in addr_in;
141 };
142
143 struct nvme_ctrl ctrl;
144};
145
146static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
147{
148 return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
149}
150
151static LIST_HEAD(device_list);
152static DEFINE_MUTEX(device_list_mutex);
153
154static LIST_HEAD(nvme_rdma_ctrl_list);
155static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
156
157static struct workqueue_struct *nvme_rdma_wq;
158
159/*
 160 * Disabling this option makes small I/O go faster, but is fundamentally
161 * unsafe. With it turned off we will have to register a global rkey that
162 * allows read and write access to all physical memory.
163 */
164static bool register_always = true;
165module_param(register_always, bool, 0444);
166MODULE_PARM_DESC(register_always,
167 "Use memory registration even for contiguous memory regions");
168
169static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
170 struct rdma_cm_event *event);
171static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
172static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
173
174/* XXX: really should move to a generic header sooner or later.. */
175static inline void put_unaligned_le24(u32 val, u8 *p)
176{
177 *p++ = val;
178 *p++ = val >> 8;
179 *p++ = val >> 16;
180}
181
182static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
183{
184 return queue - queue->ctrl->queues;
185}
186
187static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
188{
189 return queue->cmnd_capsule_len - sizeof(struct nvme_command);
190}
191
192static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
193 size_t capsule_size, enum dma_data_direction dir)
194{
195 ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
196 kfree(qe->data);
197}
198
199static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
200 size_t capsule_size, enum dma_data_direction dir)
201{
202 qe->data = kzalloc(capsule_size, GFP_KERNEL);
203 if (!qe->data)
204 return -ENOMEM;
205
206 qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
207 if (ib_dma_mapping_error(ibdev, qe->dma)) {
208 kfree(qe->data);
209 return -ENOMEM;
210 }
211
212 return 0;
213}
214
215static void nvme_rdma_free_ring(struct ib_device *ibdev,
216 struct nvme_rdma_qe *ring, size_t ib_queue_size,
217 size_t capsule_size, enum dma_data_direction dir)
218{
219 int i;
220
221 for (i = 0; i < ib_queue_size; i++)
222 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
223 kfree(ring);
224}
225
226static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
227 size_t ib_queue_size, size_t capsule_size,
228 enum dma_data_direction dir)
229{
230 struct nvme_rdma_qe *ring;
231 int i;
232
233 ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
234 if (!ring)
235 return NULL;
236
237 for (i = 0; i < ib_queue_size; i++) {
238 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
239 goto out_free_ring;
240 }
241
242 return ring;
243
244out_free_ring:
245 nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
246 return NULL;
247}
248
249static void nvme_rdma_qp_event(struct ib_event *event, void *context)
250{
251 pr_debug("QP event %d\n", event->event);
252}
253
254static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
255{
256 wait_for_completion_interruptible_timeout(&queue->cm_done,
257 msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
258 return queue->cm_error;
259}
260
261static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
262{
263 struct nvme_rdma_device *dev = queue->device;
264 struct ib_qp_init_attr init_attr;
265 int ret;
266
267 memset(&init_attr, 0, sizeof(init_attr));
268 init_attr.event_handler = nvme_rdma_qp_event;
269 /* +1 for drain */
270 init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
271 /* +1 for drain */
272 init_attr.cap.max_recv_wr = queue->queue_size + 1;
273 init_attr.cap.max_recv_sge = 1;
274 init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
275 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
276 init_attr.qp_type = IB_QPT_RC;
277 init_attr.send_cq = queue->ib_cq;
278 init_attr.recv_cq = queue->ib_cq;
279
280 ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
281
282 queue->qp = queue->cm_id->qp;
283 return ret;
284}
285
286static int nvme_rdma_reinit_request(void *data, struct request *rq)
287{
288 struct nvme_rdma_ctrl *ctrl = data;
289 struct nvme_rdma_device *dev = ctrl->device;
290 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
291 int ret = 0;
292
293 if (!req->need_inval)
294 goto out;
295
296 ib_dereg_mr(req->mr);
297
298 req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
299 ctrl->max_fr_pages);
300 if (IS_ERR(req->mr)) {
301 ret = PTR_ERR(req->mr);
302 req->mr = NULL;
303 }
304
305 req->need_inval = false;
306
307out:
308 return ret;
309}
310
311static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
312 struct request *rq, unsigned int queue_idx)
313{
314 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
315 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
316 struct nvme_rdma_device *dev = queue->device;
317
318 if (req->mr)
319 ib_dereg_mr(req->mr);
320
321 nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
322 DMA_TO_DEVICE);
323}
324
325static void nvme_rdma_exit_request(void *data, struct request *rq,
326 unsigned int hctx_idx, unsigned int rq_idx)
327{
328 return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
329}
330
331static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
332 unsigned int hctx_idx, unsigned int rq_idx)
333{
334 return __nvme_rdma_exit_request(data, rq, 0);
335}
336
337static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
338 struct request *rq, unsigned int queue_idx)
339{
340 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
341 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
342 struct nvme_rdma_device *dev = queue->device;
343 struct ib_device *ibdev = dev->dev;
344 int ret;
345
346 BUG_ON(queue_idx >= ctrl->queue_count);
347
348 ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
349 DMA_TO_DEVICE);
350 if (ret)
351 return ret;
352
353 req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
354 ctrl->max_fr_pages);
355 if (IS_ERR(req->mr)) {
356 ret = PTR_ERR(req->mr);
357 goto out_free_qe;
358 }
359
360 req->queue = queue;
361
362 return 0;
363
364out_free_qe:
365 nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
366 DMA_TO_DEVICE);
367 return -ENOMEM;
368}
369
370static int nvme_rdma_init_request(void *data, struct request *rq,
371 unsigned int hctx_idx, unsigned int rq_idx,
372 unsigned int numa_node)
373{
374 return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
375}
376
377static int nvme_rdma_init_admin_request(void *data, struct request *rq,
378 unsigned int hctx_idx, unsigned int rq_idx,
379 unsigned int numa_node)
380{
381 return __nvme_rdma_init_request(data, rq, 0);
382}
383
384static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
385 unsigned int hctx_idx)
386{
387 struct nvme_rdma_ctrl *ctrl = data;
388 struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
389
390 BUG_ON(hctx_idx >= ctrl->queue_count);
391
392 hctx->driver_data = queue;
393 return 0;
394}
395
396static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
397 unsigned int hctx_idx)
398{
399 struct nvme_rdma_ctrl *ctrl = data;
400 struct nvme_rdma_queue *queue = &ctrl->queues[0];
401
402 BUG_ON(hctx_idx != 0);
403
404 hctx->driver_data = queue;
405 return 0;
406}
407
408static void nvme_rdma_free_dev(struct kref *ref)
409{
410 struct nvme_rdma_device *ndev =
411 container_of(ref, struct nvme_rdma_device, ref);
412
413 mutex_lock(&device_list_mutex);
414 list_del(&ndev->entry);
415 mutex_unlock(&device_list_mutex);
416
417 if (!register_always)
418 ib_dereg_mr(ndev->mr);
419 ib_dealloc_pd(ndev->pd);
420
421 kfree(ndev);
422}
423
424static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
425{
426 kref_put(&dev->ref, nvme_rdma_free_dev);
427}
428
429static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
430{
431 return kref_get_unless_zero(&dev->ref);
432}
433
434static struct nvme_rdma_device *
435nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
436{
437 struct nvme_rdma_device *ndev;
438
439 mutex_lock(&device_list_mutex);
440 list_for_each_entry(ndev, &device_list, entry) {
441 if (ndev->dev->node_guid == cm_id->device->node_guid &&
442 nvme_rdma_dev_get(ndev))
443 goto out_unlock;
444 }
445
446 ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
447 if (!ndev)
448 goto out_err;
449
450 ndev->dev = cm_id->device;
451 kref_init(&ndev->ref);
452
453 ndev->pd = ib_alloc_pd(ndev->dev);
454 if (IS_ERR(ndev->pd))
455 goto out_free_dev;
456
457 if (!register_always) {
458 ndev->mr = ib_get_dma_mr(ndev->pd,
459 IB_ACCESS_LOCAL_WRITE |
460 IB_ACCESS_REMOTE_READ |
461 IB_ACCESS_REMOTE_WRITE);
462 if (IS_ERR(ndev->mr))
463 goto out_free_pd;
464 }
465
466 if (!(ndev->dev->attrs.device_cap_flags &
467 IB_DEVICE_MEM_MGT_EXTENSIONS)) {
468 dev_err(&ndev->dev->dev,
469 "Memory registrations not supported.\n");
470 goto out_free_mr;
471 }
472
473 list_add(&ndev->entry, &device_list);
474out_unlock:
475 mutex_unlock(&device_list_mutex);
476 return ndev;
477
478out_free_mr:
479 if (!register_always)
480 ib_dereg_mr(ndev->mr);
481out_free_pd:
482 ib_dealloc_pd(ndev->pd);
483out_free_dev:
484 kfree(ndev);
485out_err:
486 mutex_unlock(&device_list_mutex);
487 return NULL;
488}
489
490static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
491{
492 struct nvme_rdma_device *dev = queue->device;
493 struct ib_device *ibdev = dev->dev;
494
495 rdma_destroy_qp(queue->cm_id);
496 ib_free_cq(queue->ib_cq);
497
498 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
499 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
500
501 nvme_rdma_dev_put(dev);
502}
503
504static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
505 struct nvme_rdma_device *dev)
506{
507 struct ib_device *ibdev = dev->dev;
508 const int send_wr_factor = 3; /* MR, SEND, INV */
509 const int cq_factor = send_wr_factor + 1; /* + RECV */
510 int comp_vector, idx = nvme_rdma_queue_idx(queue);
511
512 int ret;
513
514 queue->device = dev;
515
516 /*
517 * The admin queue is barely used once the controller is live, so don't
518 * bother to spread it out.
519 */
520 if (idx == 0)
521 comp_vector = 0;
522 else
523 comp_vector = idx % ibdev->num_comp_vectors;
524
525
526 /* +1 for ib_stop_cq */
527 queue->ib_cq = ib_alloc_cq(dev->dev, queue,
528 cq_factor * queue->queue_size + 1, comp_vector,
529 IB_POLL_SOFTIRQ);
530 if (IS_ERR(queue->ib_cq)) {
531 ret = PTR_ERR(queue->ib_cq);
532 goto out;
533 }
534
535 ret = nvme_rdma_create_qp(queue, send_wr_factor);
536 if (ret)
537 goto out_destroy_ib_cq;
538
539 queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
540 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
541 if (!queue->rsp_ring) {
542 ret = -ENOMEM;
543 goto out_destroy_qp;
544 }
545
546 return 0;
547
548out_destroy_qp:
549 ib_destroy_qp(queue->qp);
550out_destroy_ib_cq:
551 ib_free_cq(queue->ib_cq);
552out:
553 return ret;
554}
555
556static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
557 int idx, size_t queue_size)
558{
559 struct nvme_rdma_queue *queue;
560 int ret;
561
562 queue = &ctrl->queues[idx];
563 queue->ctrl = ctrl;
564 init_completion(&queue->cm_done);
565
566 if (idx > 0)
567 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
568 else
569 queue->cmnd_capsule_len = sizeof(struct nvme_command);
570
571 queue->queue_size = queue_size;
572
573 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
574 RDMA_PS_TCP, IB_QPT_RC);
575 if (IS_ERR(queue->cm_id)) {
576 dev_info(ctrl->ctrl.device,
577 "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
578 return PTR_ERR(queue->cm_id);
579 }
580
581 queue->cm_error = -ETIMEDOUT;
582 ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
583 NVME_RDMA_CONNECT_TIMEOUT_MS);
584 if (ret) {
585 dev_info(ctrl->ctrl.device,
586 "rdma_resolve_addr failed (%d).\n", ret);
587 goto out_destroy_cm_id;
588 }
589
590 ret = nvme_rdma_wait_for_cm(queue);
591 if (ret) {
592 dev_info(ctrl->ctrl.device,
593 "rdma_resolve_addr wait failed (%d).\n", ret);
594 goto out_destroy_cm_id;
595 }
596
597 set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
598
599 return 0;
600
601out_destroy_cm_id:
602 rdma_destroy_id(queue->cm_id);
603 return ret;
604}
605
606static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
607{
608 rdma_disconnect(queue->cm_id);
609 ib_drain_qp(queue->qp);
610}
611
612static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
613{
614 nvme_rdma_destroy_queue_ib(queue);
615 rdma_destroy_id(queue->cm_id);
616}
617
618static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
619{
620 if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
621 return;
622 nvme_rdma_stop_queue(queue);
623 nvme_rdma_free_queue(queue);
624}
625
626static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
627{
628 int i;
629
630 for (i = 1; i < ctrl->queue_count; i++)
631 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
632}
633
634static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
635{
636 int i, ret = 0;
637
638 for (i = 1; i < ctrl->queue_count; i++) {
639 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
640 if (ret)
641 break;
642 }
643
644 return ret;
645}
646
647static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
648{
649 int i, ret;
650
651 for (i = 1; i < ctrl->queue_count; i++) {
652 ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize);
653 if (ret) {
654 dev_info(ctrl->ctrl.device,
655 "failed to initialize i/o queue: %d\n", ret);
656 goto out_free_queues;
657 }
658 }
659
660 return 0;
661
662out_free_queues:
663 for (; i >= 1; i--)
664 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
665
666 return ret;
667}
668
669static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
670{
671 nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
672 sizeof(struct nvme_command), DMA_TO_DEVICE);
673 nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
674 blk_cleanup_queue(ctrl->ctrl.admin_q);
675 blk_mq_free_tag_set(&ctrl->admin_tag_set);
676 nvme_rdma_dev_put(ctrl->device);
677}
678
679static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
680{
681 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
682
683 if (list_empty(&ctrl->list))
684 goto free_ctrl;
685
686 mutex_lock(&nvme_rdma_ctrl_mutex);
687 list_del(&ctrl->list);
688 mutex_unlock(&nvme_rdma_ctrl_mutex);
689
690 if (ctrl->ctrl.tagset) {
691 blk_cleanup_queue(ctrl->ctrl.connect_q);
692 blk_mq_free_tag_set(&ctrl->tag_set);
693 nvme_rdma_dev_put(ctrl->device);
694 }
695 kfree(ctrl->queues);
696 nvmf_free_options(nctrl->opts);
697free_ctrl:
698 kfree(ctrl);
699}
700
701static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
702{
703 struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
704 struct nvme_rdma_ctrl, reconnect_work);
705 bool changed;
706 int ret;
707
708 if (ctrl->queue_count > 1) {
709 nvme_rdma_free_io_queues(ctrl);
710
711 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
712 if (ret)
713 goto requeue;
714 }
715
716 nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
717
718 ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
719 if (ret)
720 goto requeue;
721
722 ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
723 if (ret)
724 goto requeue;
725
726 blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
727
728 ret = nvmf_connect_admin_queue(&ctrl->ctrl);
729 if (ret)
730 goto stop_admin_q;
731
732 ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
733 if (ret)
734 goto stop_admin_q;
735
736 nvme_start_keep_alive(&ctrl->ctrl);
737
738 if (ctrl->queue_count > 1) {
739 ret = nvme_rdma_init_io_queues(ctrl);
740 if (ret)
741 goto stop_admin_q;
742
743 ret = nvme_rdma_connect_io_queues(ctrl);
744 if (ret)
745 goto stop_admin_q;
746 }
747
748 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
749 WARN_ON_ONCE(!changed);
750
751 if (ctrl->queue_count > 1)
752 nvme_start_queues(&ctrl->ctrl);
753
754 dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
755
756 return;
757
758stop_admin_q:
759 blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
760requeue:
761 /* Make sure we are not resetting/deleting */
762 if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
763 dev_info(ctrl->ctrl.device,
764 "Failed reconnect attempt, requeueing...\n");
765 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
766 ctrl->reconnect_delay * HZ);
767 }
768}
769
770static void nvme_rdma_error_recovery_work(struct work_struct *work)
771{
772 struct nvme_rdma_ctrl *ctrl = container_of(work,
773 struct nvme_rdma_ctrl, err_work);
774
775 nvme_stop_keep_alive(&ctrl->ctrl);
776 if (ctrl->queue_count > 1)
777 nvme_stop_queues(&ctrl->ctrl);
778 blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
779
780	/* We must fast-fail or requeue all of our inflight requests */
781 if (ctrl->queue_count > 1)
782 blk_mq_tagset_busy_iter(&ctrl->tag_set,
783 nvme_cancel_request, &ctrl->ctrl);
784 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
785 nvme_cancel_request, &ctrl->ctrl);
786
787 dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
788 ctrl->reconnect_delay);
789
790 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
791 ctrl->reconnect_delay * HZ);
792}
793
794static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
795{
796 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
797 return;
798
799 queue_work(nvme_rdma_wq, &ctrl->err_work);
800}
801
802static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
803 const char *op)
804{
805 struct nvme_rdma_queue *queue = cq->cq_context;
806 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
807
808 if (ctrl->ctrl.state == NVME_CTRL_LIVE)
809 dev_info(ctrl->ctrl.device,
810 "%s for CQE 0x%p failed with status %s (%d)\n",
811 op, wc->wr_cqe,
812 ib_wc_status_msg(wc->status), wc->status);
813 nvme_rdma_error_recovery(ctrl);
814}
815
816static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
817{
818 if (unlikely(wc->status != IB_WC_SUCCESS))
819 nvme_rdma_wr_error(cq, wc, "MEMREG");
820}
821
822static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
823{
824 if (unlikely(wc->status != IB_WC_SUCCESS))
825 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
826}
827
828static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
829 struct nvme_rdma_request *req)
830{
831 struct ib_send_wr *bad_wr;
832 struct ib_send_wr wr = {
833 .opcode = IB_WR_LOCAL_INV,
834 .next = NULL,
835 .num_sge = 0,
836 .send_flags = 0,
837 .ex.invalidate_rkey = req->mr->rkey,
838 };
839
840 req->reg_cqe.done = nvme_rdma_inv_rkey_done;
841 wr.wr_cqe = &req->reg_cqe;
842
843 return ib_post_send(queue->qp, &wr, &bad_wr);
844}
845
846static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
847 struct request *rq)
848{
849 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
850 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
851 struct nvme_rdma_device *dev = queue->device;
852 struct ib_device *ibdev = dev->dev;
853 int res;
854
855 if (!blk_rq_bytes(rq))
856 return;
857
858 if (req->need_inval) {
859 res = nvme_rdma_inv_rkey(queue, req);
860 if (res < 0) {
861 dev_err(ctrl->ctrl.device,
862 "Queueing INV WR for rkey %#x failed (%d)\n",
863 req->mr->rkey, res);
864 nvme_rdma_error_recovery(queue->ctrl);
865 }
866 }
867
868 ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
869 req->nents, rq_data_dir(rq) ==
870 WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
871
872 nvme_cleanup_cmd(rq);
873 sg_free_table_chained(&req->sg_table, true);
874}
875
876static int nvme_rdma_set_sg_null(struct nvme_command *c)
877{
878 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
879
880 sg->addr = 0;
881 put_unaligned_le24(0, sg->length);
882 put_unaligned_le32(0, sg->key);
883 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
884 return 0;
885}
886
887static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
888 struct nvme_rdma_request *req, struct nvme_command *c)
889{
890 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
891
892 req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
893 req->sge[1].length = sg_dma_len(req->sg_table.sgl);
894 req->sge[1].lkey = queue->device->pd->local_dma_lkey;
895
896 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
897 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
898 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
899
900 req->inline_data = true;
901 req->num_sge++;
902 return 0;
903}
904
905static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
906 struct nvme_rdma_request *req, struct nvme_command *c)
907{
908 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
909
910 sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
911 put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
912 put_unaligned_le32(queue->device->mr->rkey, sg->key);
913 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
914 return 0;
915}
916
917static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
918 struct nvme_rdma_request *req, struct nvme_command *c,
919 int count)
920{
921 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
922 int nr;
923
924 nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
925 if (nr < count) {
926 if (nr < 0)
927 return nr;
928 return -EINVAL;
929 }
930
931 ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
932
933 req->reg_cqe.done = nvme_rdma_memreg_done;
934 memset(&req->reg_wr, 0, sizeof(req->reg_wr));
935 req->reg_wr.wr.opcode = IB_WR_REG_MR;
936 req->reg_wr.wr.wr_cqe = &req->reg_cqe;
937 req->reg_wr.wr.num_sge = 0;
938 req->reg_wr.mr = req->mr;
939 req->reg_wr.key = req->mr->rkey;
940 req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
941 IB_ACCESS_REMOTE_READ |
942 IB_ACCESS_REMOTE_WRITE;
943
944 req->need_inval = true;
945
946 sg->addr = cpu_to_le64(req->mr->iova);
947 put_unaligned_le24(req->mr->length, sg->length);
948 put_unaligned_le32(req->mr->rkey, sg->key);
949 sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
950 NVME_SGL_FMT_INVALIDATE;
951
952 return 0;
953}
954
955static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
956 struct request *rq, unsigned int map_len,
957 struct nvme_command *c)
958{
959 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
960 struct nvme_rdma_device *dev = queue->device;
961 struct ib_device *ibdev = dev->dev;
962 int nents, count;
963 int ret;
964
965 req->num_sge = 1;
966 req->inline_data = false;
967 req->need_inval = false;
968
969 c->common.flags |= NVME_CMD_SGL_METABUF;
970
971 if (!blk_rq_bytes(rq))
972 return nvme_rdma_set_sg_null(c);
973
974 req->sg_table.sgl = req->first_sgl;
975 ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
976 req->sg_table.sgl);
977 if (ret)
978 return -ENOMEM;
979
980 nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
981 BUG_ON(nents > rq->nr_phys_segments);
982 req->nents = nents;
983
984 count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
985 rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
986 if (unlikely(count <= 0)) {
987 sg_free_table_chained(&req->sg_table, true);
988 return -EIO;
989 }
990
991 if (count == 1) {
992 if (rq_data_dir(rq) == WRITE &&
993 map_len <= nvme_rdma_inline_data_size(queue) &&
994 nvme_rdma_queue_idx(queue))
995 return nvme_rdma_map_sg_inline(queue, req, c);
996
997 if (!register_always)
998 return nvme_rdma_map_sg_single(queue, req, c);
999 }
1000
1001 return nvme_rdma_map_sg_fr(queue, req, c, count);
1002}
1003
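/*
 * Editor's aside (not part of the patch): the strategy selection in
 * nvme_rdma_map_data() above reduces to a small decision tree. The
 * stand-alone, user-space sketch below illustrates it under assumed
 * names -- pick_strategy(), map_strategy and the parameters are
 * hypothetical stand-ins for nvme_rdma_inline_data_size() and the
 * register_always module parameter.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum map_strategy { MAP_SG_INLINE, MAP_SG_SINGLE, MAP_SG_FR };

static enum map_strategy pick_strategy(int dma_segments, bool is_write,
		size_t map_len, size_t inline_size, bool is_io_queue,
		bool register_always)
{
	/* A single DMA segment on a write that fits the inline budget and
	 * targets an I/O queue is sent inline with the command capsule. */
	if (dma_segments == 1) {
		if (is_write && map_len <= inline_size && is_io_queue)
			return MAP_SG_INLINE;
		/* A single segment may use one unregistered keyed SGL
		 * when registration is not forced. */
		if (!register_always)
			return MAP_SG_SINGLE;
	}
	/* Everything else goes through a fast-registration MR. */
	return MAP_SG_FR;
}

int main(void)
{
	/* one 512-byte write segment, 4k inline budget, I/O queue */
	printf("%d\n", pick_strategy(1, true, 512, 4096, true, true));
	return 0;
}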
1004static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1005{
1006 if (unlikely(wc->status != IB_WC_SUCCESS))
1007 nvme_rdma_wr_error(cq, wc, "SEND");
1008}
1009
1010static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1011 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1012 struct ib_send_wr *first, bool flush)
1013{
1014 struct ib_send_wr wr, *bad_wr;
1015 int ret;
1016
1017 sge->addr = qe->dma;
1018 sge->length = sizeof(struct nvme_command),
1019 sge->lkey = queue->device->pd->local_dma_lkey;
1020
1021 qe->cqe.done = nvme_rdma_send_done;
1022
1023 wr.next = NULL;
1024 wr.wr_cqe = &qe->cqe;
1025 wr.sg_list = sge;
1026 wr.num_sge = num_sge;
1027 wr.opcode = IB_WR_SEND;
1028 wr.send_flags = 0;
1029
1030 /*
1031	 * Unsignalled send completions are another giant disaster in the
1032 * IB Verbs spec: If we don't regularly post signalled sends
1033 * the send queue will fill up and only a QP reset will rescue us.
1034	 * Would have been way too obvious to handle this in hardware or
1035	 * at least in the RDMA stack.
1036 *
1037	 * This messy and racy code snippet is copied and pasted from the iSER
1038 * initiator, and the magic '32' comes from there as well.
1039 *
1040 * Always signal the flushes. The magic request used for the flush
1041 * sequencer is not allocated in our driver's tagset and it's
1042 * triggered to be freed by blk_cleanup_queue(). So we need to
1043 * always mark it as signaled to ensure that the "wr_cqe", which is
1044	 * embedded in the request's payload, is not freed when __ib_process_cq()
1045 * calls wr_cqe->done().
1046 */
1047 if ((++queue->sig_count % 32) == 0 || flush)
1048 wr.send_flags |= IB_SEND_SIGNALED;
1049
1050 if (first)
1051 first->next = &wr;
1052 else
1053 first = &wr;
1054
1055 ret = ib_post_send(queue->qp, first, &bad_wr);
1056 if (ret) {
1057 dev_err(queue->ctrl->ctrl.device,
1058 "%s failed with error code %d\n", __func__, ret);
1059 }
1060 return ret;
1061}
1062
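/*
 * Editor's aside (not part of the patch): a minimal user-space sketch of
 * the completion-signalling policy described in the comment inside
 * nvme_rdma_post_send() -- every 32nd send, plus every flush, is posted
 * signalled so the send queue gets drained regularly. The names below
 * (sketch_queue, send_is_signalled) are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define SIG_INTERVAL 32	/* the "magic 32" borrowed from the iSER initiator */

struct sketch_queue {
	unsigned int sig_count;
};

static bool send_is_signalled(struct sketch_queue *queue, bool flush)
{
	/* mirror of: (++queue->sig_count % 32) == 0 || flush */
	return (++queue->sig_count % SIG_INTERVAL) == 0 || flush;
}

int main(void)
{
	struct sketch_queue queue = { 0 };
	int i, signalled = 0;

	for (i = 0; i < 100; i++)
		if (send_is_signalled(&queue, false))
			signalled++;
	printf("signalled %d of 100 sends\n", signalled);	/* prints 3 */
	return 0;
}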
1063static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1064 struct nvme_rdma_qe *qe)
1065{
1066 struct ib_recv_wr wr, *bad_wr;
1067 struct ib_sge list;
1068 int ret;
1069
1070 list.addr = qe->dma;
1071 list.length = sizeof(struct nvme_completion);
1072 list.lkey = queue->device->pd->local_dma_lkey;
1073
1074 qe->cqe.done = nvme_rdma_recv_done;
1075
1076 wr.next = NULL;
1077 wr.wr_cqe = &qe->cqe;
1078 wr.sg_list = &list;
1079 wr.num_sge = 1;
1080
1081 ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1082 if (ret) {
1083 dev_err(queue->ctrl->ctrl.device,
1084 "%s failed with error code %d\n", __func__, ret);
1085 }
1086 return ret;
1087}
1088
1089static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1090{
1091 u32 queue_idx = nvme_rdma_queue_idx(queue);
1092
1093 if (queue_idx == 0)
1094 return queue->ctrl->admin_tag_set.tags[queue_idx];
1095 return queue->ctrl->tag_set.tags[queue_idx - 1];
1096}
1097
1098static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1099{
1100 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1101 struct nvme_rdma_queue *queue = &ctrl->queues[0];
1102 struct ib_device *dev = queue->device->dev;
1103 struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1104 struct nvme_command *cmd = sqe->data;
1105 struct ib_sge sge;
1106 int ret;
1107
1108 if (WARN_ON_ONCE(aer_idx != 0))
1109 return;
1110
1111 ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1112
1113 memset(cmd, 0, sizeof(*cmd));
1114 cmd->common.opcode = nvme_admin_async_event;
1115 cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1116 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1117 nvme_rdma_set_sg_null(cmd);
1118
1119 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1120 DMA_TO_DEVICE);
1121
1122 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1123 WARN_ON_ONCE(ret);
1124}
1125
1126static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1127 struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1128{
1129 u16 status = le16_to_cpu(cqe->status);
1130 struct request *rq;
1131 struct nvme_rdma_request *req;
1132 int ret = 0;
1133
1134 status >>= 1;
1135
1136 rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1137 if (!rq) {
1138 dev_err(queue->ctrl->ctrl.device,
1139 "tag 0x%x on QP %#x not found\n",
1140 cqe->command_id, queue->qp->qp_num);
1141 nvme_rdma_error_recovery(queue->ctrl);
1142 return ret;
1143 }
1144 req = blk_mq_rq_to_pdu(rq);
1145
1146 if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
1147 memcpy(rq->special, cqe, sizeof(*cqe));
1148
1149 if (rq->tag == tag)
1150 ret = 1;
1151
1152 if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1153 wc->ex.invalidate_rkey == req->mr->rkey)
1154 req->need_inval = false;
1155
1156 blk_mq_complete_request(rq, status);
1157
1158 return ret;
1159}
1160
1161static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1162{
1163 struct nvme_rdma_qe *qe =
1164 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1165 struct nvme_rdma_queue *queue = cq->cq_context;
1166 struct ib_device *ibdev = queue->device->dev;
1167 struct nvme_completion *cqe = qe->data;
1168 const size_t len = sizeof(struct nvme_completion);
1169 int ret = 0;
1170
1171 if (unlikely(wc->status != IB_WC_SUCCESS)) {
1172 nvme_rdma_wr_error(cq, wc, "RECV");
1173 return 0;
1174 }
1175
1176 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1177 /*
1178 * AEN requests are special as they don't time out and can
1179 * survive any kind of queue freeze and often don't respond to
1180 * aborts. We don't even bother to allocate a struct request
1181 * for them but rather special case them here.
1182 */
1183 if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1184 cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1185 nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
1186 else
1187 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1188 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1189
1190 nvme_rdma_post_recv(queue, qe);
1191 return ret;
1192}
1193
1194static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1195{
1196 __nvme_rdma_recv_done(cq, wc, -1);
1197}
1198
1199static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1200{
1201 int ret, i;
1202
1203 for (i = 0; i < queue->queue_size; i++) {
1204 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1205 if (ret)
1206 goto out_destroy_queue_ib;
1207 }
1208
1209 return 0;
1210
1211out_destroy_queue_ib:
1212 nvme_rdma_destroy_queue_ib(queue);
1213 return ret;
1214}
1215
1216static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1217 struct rdma_cm_event *ev)
1218{
1219 if (ev->param.conn.private_data_len) {
1220 struct nvme_rdma_cm_rej *rej =
1221 (struct nvme_rdma_cm_rej *)ev->param.conn.private_data;
1222
1223 dev_err(queue->ctrl->ctrl.device,
1224 "Connect rejected, status %d.", le16_to_cpu(rej->sts));
1225 /* XXX: Think of something clever to do here... */
1226 } else {
1227 dev_err(queue->ctrl->ctrl.device,
1228 "Connect rejected, no private data.\n");
1229 }
1230
1231 return -ECONNRESET;
1232}
1233
1234static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1235{
1236 struct nvme_rdma_device *dev;
1237 int ret;
1238
1239 dev = nvme_rdma_find_get_device(queue->cm_id);
1240 if (!dev) {
1241 dev_err(queue->cm_id->device->dma_device,
1242 "no client data found!\n");
1243 return -ECONNREFUSED;
1244 }
1245
1246 ret = nvme_rdma_create_queue_ib(queue, dev);
1247 if (ret) {
1248 nvme_rdma_dev_put(dev);
1249 goto out;
1250 }
1251
1252 ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1253 if (ret) {
1254 dev_err(queue->ctrl->ctrl.device,
1255 "rdma_resolve_route failed (%d).\n",
1256 queue->cm_error);
1257 goto out_destroy_queue;
1258 }
1259
1260 return 0;
1261
1262out_destroy_queue:
1263 nvme_rdma_destroy_queue_ib(queue);
1264out:
1265 return ret;
1266}
1267
1268static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1269{
1270 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1271 struct rdma_conn_param param = { };
1272 struct nvme_rdma_cm_req priv;
1273 int ret;
1274
1275 param.qp_num = queue->qp->qp_num;
1276 param.flow_control = 1;
1277
1278 param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1279 /* maximum retry count */
1280 param.retry_count = 7;
1281 param.rnr_retry_count = 7;
1282 param.private_data = &priv;
1283 param.private_data_len = sizeof(priv);
1284
1285 priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1286 priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1287 priv.hrqsize = cpu_to_le16(queue->queue_size);
1288 priv.hsqsize = cpu_to_le16(queue->queue_size);
1289
1290 ret = rdma_connect(queue->cm_id, &param);
1291 if (ret) {
1292 dev_err(ctrl->ctrl.device,
1293 "rdma_connect failed (%d).\n", ret);
1294 goto out_destroy_queue_ib;
1295 }
1296
1297 return 0;
1298
1299out_destroy_queue_ib:
1300 nvme_rdma_destroy_queue_ib(queue);
1301 return ret;
1302}
1303
1304/**
1305 * nvme_rdma_device_unplug() - Handle RDMA device unplug
1306 * @queue: Queue that owns the cm_id that caught the event
1307 *
1308 * DEVICE_REMOVAL event notifies us that the RDMA device is about
1309 * to unplug so we should take care of destroying our RDMA resources.
1310 * This event will be generated for each allocated cm_id.
1311 *
1312 * In our case, the RDMA resources are managed per controller and not
1313 * only per queue. So the way we handle this is we trigger an implicit
1314 * controller deletion upon the first DEVICE_REMOVAL event we see, and
1315 * hold the event inflight until the controller deletion is completed.
1316 *
1317 * One exception that we need to handle is the destruction of the cm_id
1318 * that caught the event. Since we hold the callout until the controller
1319 * deletion is completed, we'll deadlock if the controller deletion
1320 * calls rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
1321 * of destroying this queue beforehand and destroy the queue resources
1322 * once the controller deletion has completed, with the exception of the
1323 * cm_id, which is destroyed implicitly by returning a non-zero rc to the callout.
1324 */
1325static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
1326{
1327 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1328 int ret, ctrl_deleted = 0;
1329
1330 /* First disable the queue so ctrl delete won't free it */
1331 if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
1332 goto out;
1333
1334 /* delete the controller */
1335 ret = __nvme_rdma_del_ctrl(ctrl);
1336 if (!ret) {
1337 dev_warn(ctrl->ctrl.device,
1338 "Got rdma device removal event, deleting ctrl\n");
1339 flush_work(&ctrl->delete_work);
1340
1341 /* Return non-zero so the cm_id will destroy implicitly */
1342 ctrl_deleted = 1;
1343
1344 /* Free this queue ourselves */
1345 rdma_disconnect(queue->cm_id);
1346 ib_drain_qp(queue->qp);
1347 nvme_rdma_destroy_queue_ib(queue);
1348 }
1349
1350out:
1351 return ctrl_deleted;
1352}
1353
1354static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1355 struct rdma_cm_event *ev)
1356{
1357 struct nvme_rdma_queue *queue = cm_id->context;
1358 int cm_error = 0;
1359
1360 dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1361 rdma_event_msg(ev->event), ev->event,
1362 ev->status, cm_id);
1363
1364 switch (ev->event) {
1365 case RDMA_CM_EVENT_ADDR_RESOLVED:
1366 cm_error = nvme_rdma_addr_resolved(queue);
1367 break;
1368 case RDMA_CM_EVENT_ROUTE_RESOLVED:
1369 cm_error = nvme_rdma_route_resolved(queue);
1370 break;
1371 case RDMA_CM_EVENT_ESTABLISHED:
1372 queue->cm_error = nvme_rdma_conn_established(queue);
1373 /* complete cm_done regardless of success/failure */
1374 complete(&queue->cm_done);
1375 return 0;
1376 case RDMA_CM_EVENT_REJECTED:
1377 cm_error = nvme_rdma_conn_rejected(queue, ev);
1378 break;
1379 case RDMA_CM_EVENT_ADDR_ERROR:
1380 case RDMA_CM_EVENT_ROUTE_ERROR:
1381 case RDMA_CM_EVENT_CONNECT_ERROR:
1382 case RDMA_CM_EVENT_UNREACHABLE:
1383 dev_dbg(queue->ctrl->ctrl.device,
1384 "CM error event %d\n", ev->event);
1385 cm_error = -ECONNRESET;
1386 break;
1387 case RDMA_CM_EVENT_DISCONNECTED:
1388 case RDMA_CM_EVENT_ADDR_CHANGE:
1389 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1390 dev_dbg(queue->ctrl->ctrl.device,
1391 "disconnect received - connection closed\n");
1392 nvme_rdma_error_recovery(queue->ctrl);
1393 break;
1394 case RDMA_CM_EVENT_DEVICE_REMOVAL:
1395		/* returning 1 means implicit CM ID destroy */
1396 return nvme_rdma_device_unplug(queue);
1397 default:
1398 dev_err(queue->ctrl->ctrl.device,
1399 "Unexpected RDMA CM event (%d)\n", ev->event);
1400 nvme_rdma_error_recovery(queue->ctrl);
1401 break;
1402 }
1403
1404 if (cm_error) {
1405 queue->cm_error = cm_error;
1406 complete(&queue->cm_done);
1407 }
1408
1409 return 0;
1410}
1411
1412static enum blk_eh_timer_return
1413nvme_rdma_timeout(struct request *rq, bool reserved)
1414{
1415 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1416
1417 /* queue error recovery */
1418 nvme_rdma_error_recovery(req->queue->ctrl);
1419
1420 /* fail with DNR on cmd timeout */
1421 rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1422
1423 return BLK_EH_HANDLED;
1424}
1425
1426static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1427 const struct blk_mq_queue_data *bd)
1428{
1429 struct nvme_ns *ns = hctx->queue->queuedata;
1430 struct nvme_rdma_queue *queue = hctx->driver_data;
1431 struct request *rq = bd->rq;
1432 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1433 struct nvme_rdma_qe *sqe = &req->sqe;
1434 struct nvme_command *c = sqe->data;
1435 bool flush = false;
1436 struct ib_device *dev;
1437 unsigned int map_len;
1438 int ret;
1439
1440 WARN_ON_ONCE(rq->tag < 0);
1441
1442 dev = queue->device->dev;
1443 ib_dma_sync_single_for_cpu(dev, sqe->dma,
1444 sizeof(struct nvme_command), DMA_TO_DEVICE);
1445
1446 ret = nvme_setup_cmd(ns, rq, c);
1447 if (ret)
1448 return ret;
1449
1450 c->common.command_id = rq->tag;
1451 blk_mq_start_request(rq);
1452
1453 map_len = nvme_map_len(rq);
1454 ret = nvme_rdma_map_data(queue, rq, map_len, c);
1455 if (ret < 0) {
1456 dev_err(queue->ctrl->ctrl.device,
1457 "Failed to map data (%d)\n", ret);
1458 nvme_cleanup_cmd(rq);
1459 goto err;
1460 }
1461
1462 ib_dma_sync_single_for_device(dev, sqe->dma,
1463 sizeof(struct nvme_command), DMA_TO_DEVICE);
1464
1465 if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
1466 flush = true;
1467 ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1468 req->need_inval ? &req->reg_wr.wr : NULL, flush);
1469 if (ret) {
1470 nvme_rdma_unmap_data(queue, rq);
1471 goto err;
1472 }
1473
1474 return BLK_MQ_RQ_QUEUE_OK;
1475err:
1476 return (ret == -ENOMEM || ret == -EAGAIN) ?
1477 BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1478}
1479
1480static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1481{
1482 struct nvme_rdma_queue *queue = hctx->driver_data;
1483 struct ib_cq *cq = queue->ib_cq;
1484 struct ib_wc wc;
1485 int found = 0;
1486
1487 ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1488 while (ib_poll_cq(cq, 1, &wc) > 0) {
1489 struct ib_cqe *cqe = wc.wr_cqe;
1490
1491 if (cqe) {
1492 if (cqe->done == nvme_rdma_recv_done)
1493 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1494 else
1495 cqe->done(cq, &wc);
1496 }
1497 }
1498
1499 return found;
1500}
1501
1502static void nvme_rdma_complete_rq(struct request *rq)
1503{
1504 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1505 struct nvme_rdma_queue *queue = req->queue;
1506 int error = 0;
1507
1508 nvme_rdma_unmap_data(queue, rq);
1509
1510 if (unlikely(rq->errors)) {
1511 if (nvme_req_needs_retry(rq, rq->errors)) {
1512 nvme_requeue_req(rq);
1513 return;
1514 }
1515
1516 if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
1517 error = rq->errors;
1518 else
1519 error = nvme_error_status(rq->errors);
1520 }
1521
1522 blk_mq_end_request(rq, error);
1523}
1524
1525static struct blk_mq_ops nvme_rdma_mq_ops = {
1526 .queue_rq = nvme_rdma_queue_rq,
1527 .complete = nvme_rdma_complete_rq,
1528 .map_queue = blk_mq_map_queue,
1529 .init_request = nvme_rdma_init_request,
1530 .exit_request = nvme_rdma_exit_request,
1531 .reinit_request = nvme_rdma_reinit_request,
1532 .init_hctx = nvme_rdma_init_hctx,
1533 .poll = nvme_rdma_poll,
1534 .timeout = nvme_rdma_timeout,
1535};
1536
1537static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1538 .queue_rq = nvme_rdma_queue_rq,
1539 .complete = nvme_rdma_complete_rq,
1540 .map_queue = blk_mq_map_queue,
1541 .init_request = nvme_rdma_init_admin_request,
1542 .exit_request = nvme_rdma_exit_admin_request,
1543 .reinit_request = nvme_rdma_reinit_request,
1544 .init_hctx = nvme_rdma_init_admin_hctx,
1545 .timeout = nvme_rdma_timeout,
1546};
1547
1548static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
1549{
1550 int error;
1551
1552 error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
1553 if (error)
1554 return error;
1555
1556 ctrl->device = ctrl->queues[0].device;
1557
1558 /*
1559 * We need a reference on the device as long as the tag_set is alive,
1560 * as the MRs in the request structures need a valid ib_device.
1561 */
1562 error = -EINVAL;
1563 if (!nvme_rdma_dev_get(ctrl->device))
1564 goto out_free_queue;
1565
1566 ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
1567 ctrl->device->dev->attrs.max_fast_reg_page_list_len);
1568
1569 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
1570 ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
1571 ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
1572 ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
1573 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
1574 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1575 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1576 ctrl->admin_tag_set.driver_data = ctrl;
1577 ctrl->admin_tag_set.nr_hw_queues = 1;
1578 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
1579
1580 error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
1581 if (error)
1582 goto out_put_dev;
1583
1584 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
1585 if (IS_ERR(ctrl->ctrl.admin_q)) {
1586 error = PTR_ERR(ctrl->ctrl.admin_q);
1587 goto out_free_tagset;
1588 }
1589
1590 error = nvmf_connect_admin_queue(&ctrl->ctrl);
1591 if (error)
1592 goto out_cleanup_queue;
1593
1594 error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
1595 if (error) {
1596 dev_err(ctrl->ctrl.device,
1597 "prop_get NVME_REG_CAP failed\n");
1598 goto out_cleanup_queue;
1599 }
1600
1601 ctrl->ctrl.sqsize =
1602 min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
1603
1604 error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
1605 if (error)
1606 goto out_cleanup_queue;
1607
1608 ctrl->ctrl.max_hw_sectors =
1609 (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
1610
1611 error = nvme_init_identify(&ctrl->ctrl);
1612 if (error)
1613 goto out_cleanup_queue;
1614
1615 error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
1616 &ctrl->async_event_sqe, sizeof(struct nvme_command),
1617 DMA_TO_DEVICE);
1618 if (error)
1619 goto out_cleanup_queue;
1620
1621 nvme_start_keep_alive(&ctrl->ctrl);
1622
1623 return 0;
1624
1625out_cleanup_queue:
1626 blk_cleanup_queue(ctrl->ctrl.admin_q);
1627out_free_tagset:
1628 /* disconnect and drain the queue before freeing the tagset */
1629 nvme_rdma_stop_queue(&ctrl->queues[0]);
1630 blk_mq_free_tag_set(&ctrl->admin_tag_set);
1631out_put_dev:
1632 nvme_rdma_dev_put(ctrl->device);
1633out_free_queue:
1634 nvme_rdma_free_queue(&ctrl->queues[0]);
1635 return error;
1636}
1637
1638static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
1639{
1640 nvme_stop_keep_alive(&ctrl->ctrl);
1641 cancel_work_sync(&ctrl->err_work);
1642 cancel_delayed_work_sync(&ctrl->reconnect_work);
1643
1644 if (ctrl->queue_count > 1) {
1645 nvme_stop_queues(&ctrl->ctrl);
1646 blk_mq_tagset_busy_iter(&ctrl->tag_set,
1647 nvme_cancel_request, &ctrl->ctrl);
1648 nvme_rdma_free_io_queues(ctrl);
1649 }
1650
1651 if (ctrl->ctrl.state == NVME_CTRL_LIVE)
1652 nvme_shutdown_ctrl(&ctrl->ctrl);
1653
1654 blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
1655 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
1656 nvme_cancel_request, &ctrl->ctrl);
1657 nvme_rdma_destroy_admin_queue(ctrl);
1658}
1659
1660static void nvme_rdma_del_ctrl_work(struct work_struct *work)
1661{
1662 struct nvme_rdma_ctrl *ctrl = container_of(work,
1663 struct nvme_rdma_ctrl, delete_work);
1664
1665 nvme_remove_namespaces(&ctrl->ctrl);
1666 nvme_rdma_shutdown_ctrl(ctrl);
1667 nvme_uninit_ctrl(&ctrl->ctrl);
1668 nvme_put_ctrl(&ctrl->ctrl);
1669}
1670
1671static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
1672{
1673 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
1674 return -EBUSY;
1675
1676 if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
1677 return -EBUSY;
1678
1679 return 0;
1680}
1681
1682static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
1683{
1684 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1685 int ret;
1686
1687 ret = __nvme_rdma_del_ctrl(ctrl);
1688 if (ret)
1689 return ret;
1690
1691 flush_work(&ctrl->delete_work);
1692
1693 return 0;
1694}
1695
1696static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
1697{
1698 struct nvme_rdma_ctrl *ctrl = container_of(work,
1699 struct nvme_rdma_ctrl, delete_work);
1700
1701 nvme_remove_namespaces(&ctrl->ctrl);
1702 nvme_uninit_ctrl(&ctrl->ctrl);
1703 nvme_put_ctrl(&ctrl->ctrl);
1704}
1705
1706static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1707{
1708 struct nvme_rdma_ctrl *ctrl = container_of(work,
1709 struct nvme_rdma_ctrl, reset_work);
1710 int ret;
1711 bool changed;
1712
1713 nvme_rdma_shutdown_ctrl(ctrl);
1714
1715 ret = nvme_rdma_configure_admin_queue(ctrl);
1716 if (ret) {
1717 /* ctrl is already shutdown, just remove the ctrl */
1718 INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
1719 goto del_dead_ctrl;
1720 }
1721
1722 if (ctrl->queue_count > 1) {
1723 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
1724 if (ret)
1725 goto del_dead_ctrl;
1726
1727 ret = nvme_rdma_init_io_queues(ctrl);
1728 if (ret)
1729 goto del_dead_ctrl;
1730
1731 ret = nvme_rdma_connect_io_queues(ctrl);
1732 if (ret)
1733 goto del_dead_ctrl;
1734 }
1735
1736 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1737 WARN_ON_ONCE(!changed);
1738
1739 if (ctrl->queue_count > 1) {
1740 nvme_start_queues(&ctrl->ctrl);
1741 nvme_queue_scan(&ctrl->ctrl);
1742 }
1743
1744 return;
1745
1746del_dead_ctrl:
1747 /* Deleting this dead controller... */
1748 dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
1749 WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
1750}
1751
1752static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
1753{
1754 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1755
1756 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1757 return -EBUSY;
1758
1759 if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
1760 return -EBUSY;
1761
1762 flush_work(&ctrl->reset_work);
1763
1764 return 0;
1765}
1766
1767static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
1768 .name = "rdma",
1769 .module = THIS_MODULE,
1770 .is_fabrics = true,
1771 .reg_read32 = nvmf_reg_read32,
1772 .reg_read64 = nvmf_reg_read64,
1773 .reg_write32 = nvmf_reg_write32,
1774 .reset_ctrl = nvme_rdma_reset_ctrl,
1775 .free_ctrl = nvme_rdma_free_ctrl,
1776 .submit_async_event = nvme_rdma_submit_async_event,
1777 .delete_ctrl = nvme_rdma_del_ctrl,
1778 .get_subsysnqn = nvmf_get_subsysnqn,
1779 .get_address = nvmf_get_address,
1780};
1781
1782static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
1783{
1784 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
1785 int ret;
1786
1787 ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
1788 if (ret)
1789 return ret;
1790
1791 ctrl->queue_count = opts->nr_io_queues + 1;
1792 if (ctrl->queue_count < 2)
1793 return 0;
1794
1795 dev_info(ctrl->ctrl.device,
1796 "creating %d I/O queues.\n", opts->nr_io_queues);
1797
1798 ret = nvme_rdma_init_io_queues(ctrl);
1799 if (ret)
1800 return ret;
1801
1802 /*
1803 * We need a reference on the device as long as the tag_set is alive,
1804 * as the MRs in the request structures need a valid ib_device.
1805 */
1806 ret = -EINVAL;
1807 if (!nvme_rdma_dev_get(ctrl->device))
1808 goto out_free_io_queues;
1809
1810 memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
1811 ctrl->tag_set.ops = &nvme_rdma_mq_ops;
1812 ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize;
1813 ctrl->tag_set.reserved_tags = 1; /* fabric connect */
1814 ctrl->tag_set.numa_node = NUMA_NO_NODE;
1815 ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1816 ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1817 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1818 ctrl->tag_set.driver_data = ctrl;
1819 ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
1820 ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
1821
1822 ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
1823 if (ret)
1824 goto out_put_dev;
1825 ctrl->ctrl.tagset = &ctrl->tag_set;
1826
1827 ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
1828 if (IS_ERR(ctrl->ctrl.connect_q)) {
1829 ret = PTR_ERR(ctrl->ctrl.connect_q);
1830 goto out_free_tag_set;
1831 }
1832
1833 ret = nvme_rdma_connect_io_queues(ctrl);
1834 if (ret)
1835 goto out_cleanup_connect_q;
1836
1837 return 0;
1838
1839out_cleanup_connect_q:
1840 blk_cleanup_queue(ctrl->ctrl.connect_q);
1841out_free_tag_set:
1842 blk_mq_free_tag_set(&ctrl->tag_set);
1843out_put_dev:
1844 nvme_rdma_dev_put(ctrl->device);
1845out_free_io_queues:
1846 nvme_rdma_free_io_queues(ctrl);
1847 return ret;
1848}
1849
1850static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
1851{
1852 u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
1853 size_t buflen = strlen(p);
1854
1855 /* XXX: handle IPv6 addresses */
1856
1857 if (buflen > INET_ADDRSTRLEN)
1858 return -EINVAL;
1859 if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
1860 return -EINVAL;
1861 in_addr->sin_family = AF_INET;
1862 return 0;
1863}
1864
1865static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1866 struct nvmf_ctrl_options *opts)
1867{
1868 struct nvme_rdma_ctrl *ctrl;
1869 int ret;
1870 bool changed;
1871
1872 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1873 if (!ctrl)
1874 return ERR_PTR(-ENOMEM);
1875 ctrl->ctrl.opts = opts;
1876 INIT_LIST_HEAD(&ctrl->list);
1877
1878 ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
1879 if (ret) {
1880 pr_err("malformed IP address passed: %s\n", opts->traddr);
1881 goto out_free_ctrl;
1882 }
1883
1884 if (opts->mask & NVMF_OPT_TRSVCID) {
1885 u16 port;
1886
1887 ret = kstrtou16(opts->trsvcid, 0, &port);
1888 if (ret)
1889 goto out_free_ctrl;
1890
1891 ctrl->addr_in.sin_port = cpu_to_be16(port);
1892 } else {
1893 ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
1894 }
1895
1896 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
1897 0 /* no quirks, we're perfect! */);
1898 if (ret)
1899 goto out_free_ctrl;
1900
1901 ctrl->reconnect_delay = opts->reconnect_delay;
1902 INIT_DELAYED_WORK(&ctrl->reconnect_work,
1903 nvme_rdma_reconnect_ctrl_work);
1904 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
1905 INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
1906 INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
1907 spin_lock_init(&ctrl->lock);
1908
1909 ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
1910 ctrl->ctrl.sqsize = opts->queue_size;
1911 ctrl->ctrl.kato = opts->kato;
1912
1913 ret = -ENOMEM;
1914 ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
1915 GFP_KERNEL);
1916 if (!ctrl->queues)
1917 goto out_uninit_ctrl;
1918
1919 ret = nvme_rdma_configure_admin_queue(ctrl);
1920 if (ret)
1921 goto out_kfree_queues;
1922
1923 /* sanity check icdoff */
1924 if (ctrl->ctrl.icdoff) {
1925 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
1926 goto out_remove_admin_queue;
1927 }
1928
1929 /* sanity check keyed sgls */
1930 if (!(ctrl->ctrl.sgls & (1 << 20))) {
1931		dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported\n");
1932 goto out_remove_admin_queue;
1933 }
1934
1935 if (opts->queue_size > ctrl->ctrl.maxcmd) {
1936 /* warn if maxcmd is lower than queue_size */
1937 dev_warn(ctrl->ctrl.device,
1938 "queue_size %zu > ctrl maxcmd %u, clamping down\n",
1939 opts->queue_size, ctrl->ctrl.maxcmd);
1940 opts->queue_size = ctrl->ctrl.maxcmd;
1941 }
1942
1943 if (opts->nr_io_queues) {
1944 ret = nvme_rdma_create_io_queues(ctrl);
1945 if (ret)
1946 goto out_remove_admin_queue;
1947 }
1948
1949 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1950 WARN_ON_ONCE(!changed);
1951
1952 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
1953 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
1954
1955 kref_get(&ctrl->ctrl.kref);
1956
1957 mutex_lock(&nvme_rdma_ctrl_mutex);
1958 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
1959 mutex_unlock(&nvme_rdma_ctrl_mutex);
1960
1961 if (opts->nr_io_queues) {
1962 nvme_queue_scan(&ctrl->ctrl);
1963 nvme_queue_async_events(&ctrl->ctrl);
1964 }
1965
1966 return &ctrl->ctrl;
1967
1968out_remove_admin_queue:
1969 nvme_stop_keep_alive(&ctrl->ctrl);
1970 nvme_rdma_destroy_admin_queue(ctrl);
1971out_kfree_queues:
1972 kfree(ctrl->queues);
1973out_uninit_ctrl:
1974 nvme_uninit_ctrl(&ctrl->ctrl);
1975 nvme_put_ctrl(&ctrl->ctrl);
1976 if (ret > 0)
1977 ret = -EIO;
1978 return ERR_PTR(ret);
1979out_free_ctrl:
1980 kfree(ctrl);
1981 return ERR_PTR(ret);
1982}
1983
1984static struct nvmf_transport_ops nvme_rdma_transport = {
1985 .name = "rdma",
1986 .required_opts = NVMF_OPT_TRADDR,
1987 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
1988 .create_ctrl = nvme_rdma_create_ctrl,
1989};
1990
1991static int __init nvme_rdma_init_module(void)
1992{
1993 nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
1994 if (!nvme_rdma_wq)
1995 return -ENOMEM;
1996
1997 nvmf_register_transport(&nvme_rdma_transport);
1998 return 0;
1999}
2000
2001static void __exit nvme_rdma_cleanup_module(void)
2002{
2003 struct nvme_rdma_ctrl *ctrl;
2004
2005 nvmf_unregister_transport(&nvme_rdma_transport);
2006
2007 mutex_lock(&nvme_rdma_ctrl_mutex);
2008 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2009 __nvme_rdma_del_ctrl(ctrl);
2010 mutex_unlock(&nvme_rdma_ctrl_mutex);
2011
2012 destroy_workqueue(nvme_rdma_wq);
2013}
2014
2015module_init(nvme_rdma_init_module);
2016module_exit(nvme_rdma_cleanup_module);
2017
2018MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
new file mode 100644
index 000000000000..a5c31cbeb481
--- /dev/null
+++ b/drivers/nvme/target/Kconfig
@@ -0,0 +1,36 @@
1
2config NVME_TARGET
3 tristate "NVMe Target support"
4 depends on BLOCK
5 depends on CONFIGFS_FS
6 help
7	  This enables target side support for the NVMe protocol, that is,
8 it allows the Linux kernel to implement NVMe subsystems and
9 controllers and export Linux block devices as NVMe namespaces.
10 You need to select at least one of the transports below to make this
11 functionality useful.
12
13 To configure the NVMe target you probably want to use the nvmetcli
14 tool from http://git.infradead.org/users/hch/nvmetcli.git.
15
16config NVME_TARGET_LOOP
17 tristate "NVMe loopback device support"
18 depends on BLK_DEV_NVME
19 depends on NVME_TARGET
20 select NVME_FABRICS
21 select SG_POOL
22 help
23 This enables the NVMe loopback device support, which can be useful
24 to test NVMe host and target side features.
25
26 If unsure, say N.
27
28config NVME_TARGET_RDMA
29 tristate "NVMe over Fabrics RDMA target support"
30 depends on INFINIBAND
31 depends on NVME_TARGET
32 help
33 This enables the NVMe RDMA target support, which allows exporting NVMe
34 devices over RDMA.
35
36 If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
new file mode 100644
index 000000000000..b7a06232c9da
--- /dev/null
+++ b/drivers/nvme/target/Makefile
@@ -0,0 +1,9 @@
1
2obj-$(CONFIG_NVME_TARGET) += nvmet.o
3obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o
4obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
5
6nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
7 discovery.o
8nvme-loop-y += loop.o
9nvmet-rdma-y += rdma.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
new file mode 100644
index 000000000000..2fac17a5ad53
--- /dev/null
+++ b/drivers/nvme/target/admin-cmd.c
@@ -0,0 +1,465 @@
1/*
2 * NVMe admin command implementation.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/module.h>
16#include <linux/random.h>
17#include <generated/utsrelease.h>
18#include "nvmet.h"
19
20u32 nvmet_get_log_page_len(struct nvme_command *cmd)
21{
22 u32 len = le16_to_cpu(cmd->get_log_page.numdu);
23
24 len <<= 16;
25 len += le16_to_cpu(cmd->get_log_page.numdl);
26 /* NUMD is a 0's based value */
27 len += 1;
28 len *= sizeof(u32);
29
30 return len;
31}
32
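/*
 * Editor's aside (not part of the patch): a stand-alone check of the
 * NUMD-to-byte-length conversion performed by nvmet_get_log_page_len()
 * above. NUMDU/NUMDL form a 0's based dword count, so the transfer
 * length in bytes is (NUMD + 1) * 4. log_page_len() is a hypothetical
 * user-space re-statement, not a kernel helper.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t log_page_len(uint16_t numdu, uint16_t numdl)
{
	uint32_t numd = ((uint32_t)numdu << 16) | numdl;

	return (numd + 1) * sizeof(uint32_t);
}

int main(void)
{
	/* numdu = 0, numdl = 1023 -> 1024 dwords -> 4096 bytes */
	printf("%u\n", (unsigned int)log_page_len(0, 1023));
	return 0;
}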
33static void nvmet_execute_get_log_page(struct nvmet_req *req)
34{
35 size_t data_len = nvmet_get_log_page_len(req->cmd);
36 void *buf;
37 u16 status = 0;
38
39 buf = kzalloc(data_len, GFP_KERNEL);
40 if (!buf) {
41 status = NVME_SC_INTERNAL;
42 goto out;
43 }
44
45 switch (req->cmd->get_log_page.lid) {
46 case 0x01:
47 /*
48 * We currently never set the More bit in the status field,
49 * so all error log entries are invalid and can be zeroed out.
50		 * This is called a minimum viable implementation (TM) of this
51 * mandatory log page.
52 */
53 break;
54 case 0x02:
55 /*
56 * XXX: fill out actual smart log
57 *
58 * We might have a hard time coming up with useful values for
59 * many of the fields, and even when we have useful data
60 * available (e.g. units or commands read/written) those aren't
61 * persistent over power loss.
62 */
63 break;
64 case 0x03:
65 /*
66 * We only support a single firmware slot which always is
67 * active, so we can zero out the whole firmware slot log and
68 * still claim to fully implement this mandatory log page.
69 */
70 break;
71 default:
72 BUG();
73 }
74
75 status = nvmet_copy_to_sgl(req, 0, buf, data_len);
76
77 kfree(buf);
78out:
79 nvmet_req_complete(req, status);
80}
81
82static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
83{
84 struct nvmet_ctrl *ctrl = req->sq->ctrl;
85 struct nvme_id_ctrl *id;
86 u64 serial;
87 u16 status = 0;
88
89 id = kzalloc(sizeof(*id), GFP_KERNEL);
90 if (!id) {
91 status = NVME_SC_INTERNAL;
92 goto out;
93 }
94
95	/* XXX: figure out how to assign real vendor IDs. */
96 id->vid = 0;
97 id->ssvid = 0;
98
99 /* generate a random serial number as our controllers are ephemeral: */
100 get_random_bytes(&serial, sizeof(serial));
101 memset(id->sn, ' ', sizeof(id->sn));
102 snprintf(id->sn, sizeof(id->sn), "%llx", serial);
103
104 memset(id->mn, ' ', sizeof(id->mn));
105 strncpy((char *)id->mn, "Linux", sizeof(id->mn));
106
107 memset(id->fr, ' ', sizeof(id->fr));
108 strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr));
109
110 id->rab = 6;
111
112 /*
113	 * XXX: figure out how we can assign an IEEE OUI, but until then
114 * the safest is to leave it as zeroes.
115 */
116
117	/* we support multiple ports and multiple hosts: */
118 id->mic = (1 << 0) | (1 << 1);
119
120 /* no limit on data transfer sizes for now */
121 id->mdts = 0;
122 id->cntlid = cpu_to_le16(ctrl->cntlid);
123 id->ver = cpu_to_le32(ctrl->subsys->ver);
124
125 /* XXX: figure out what to do about RTD3R/RTD3 */
126 id->oaes = cpu_to_le32(1 << 8);
127 id->ctratt = cpu_to_le32(1 << 0);
128
129 id->oacs = 0;
130
131 /*
132 * We don't really have a practical limit on the number of abort
133	 * commands. But we don't do anything useful for abort either, so
134 * no point in allowing more abort commands than the spec requires.
135 */
136 id->acl = 3;
137
138 id->aerl = NVMET_ASYNC_EVENTS - 1;
139
140 /* first slot is read-only, only one slot supported */
141 id->frmw = (1 << 0) | (1 << 1);
142 id->lpa = (1 << 0) | (1 << 2);
143 id->elpe = NVMET_ERROR_LOG_SLOTS - 1;
144 id->npss = 0;
145
146 /* We support keep-alive timeout in granularity of seconds */
147 id->kas = cpu_to_le16(NVMET_KAS);
148
149 id->sqes = (0x6 << 4) | 0x6;
150 id->cqes = (0x4 << 4) | 0x4;
151
152	/* no enforcement soft-limit for maxcmd - pick an arbitrary high value */
153 id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
154
155 id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
156 id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM);
157
158 /* XXX: don't report vwc if the underlying device is write through */
159 id->vwc = NVME_CTRL_VWC_PRESENT;
160
161 /*
162	 * We can't support atomic writes bigger than an LBA without support
163 * from the backend device.
164 */
165 id->awun = 0;
166 id->awupf = 0;
167
168 id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
169 if (ctrl->ops->has_keyed_sgls)
170 id->sgls |= cpu_to_le32(1 << 2);
171 if (ctrl->ops->sqe_inline_size)
172 id->sgls |= cpu_to_le32(1 << 20);
173
174 strcpy(id->subnqn, ctrl->subsys->subsysnqn);
175
176 /* Max command capsule size is sqe + single page of in-capsule data */
177 id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
178 ctrl->ops->sqe_inline_size) / 16);
179 /* Max response capsule size is cqe */
180 id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
181
182 id->msdbd = ctrl->ops->msdbd;
183
184 /*
185 * Meh, we don't really support any power state. Fake up the same
186 * values that qemu does.
187 */
188 id->psd[0].max_power = cpu_to_le16(0x9c4);
189 id->psd[0].entry_lat = cpu_to_le32(0x10);
190 id->psd[0].exit_lat = cpu_to_le32(0x4);
191
192 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
193
194 kfree(id);
195out:
196 nvmet_req_complete(req, status);
197}
198
199static void nvmet_execute_identify_ns(struct nvmet_req *req)
200{
201 struct nvmet_ns *ns;
202 struct nvme_id_ns *id;
203 u16 status = 0;
204
205 ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
206 if (!ns) {
207 status = NVME_SC_INVALID_NS | NVME_SC_DNR;
208 goto out;
209 }
210
211 id = kzalloc(sizeof(*id), GFP_KERNEL);
212 if (!id) {
213 status = NVME_SC_INTERNAL;
214 goto out_put_ns;
215 }
216
217 /*
218	 * nuse = ncap = nsze isn't always true, but we have no way to find
219 * that out from the underlying device.
220 */
221 id->ncap = id->nuse = id->nsze =
222 cpu_to_le64(ns->size >> ns->blksize_shift);
223
224 /*
225 * We just provide a single LBA format that matches what the
226 * underlying device reports.
227 */
228 id->nlbaf = 0;
229 id->flbas = 0;
230
231 /*
232 * Our namespace might always be shared. Not just with other
233 * controllers, but also with any other user of the block device.
234 */
235 id->nmic = (1 << 0);
236
237 memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
238
239 id->lbaf[0].ds = ns->blksize_shift;
240
241 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
242
243 kfree(id);
244out_put_ns:
245 nvmet_put_namespace(ns);
246out:
247 nvmet_req_complete(req, status);
248}
249
250static void nvmet_execute_identify_nslist(struct nvmet_req *req)
251{
252 static const int buf_size = 4096;
253 struct nvmet_ctrl *ctrl = req->sq->ctrl;
254 struct nvmet_ns *ns;
255 u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
256 __le32 *list;
257 u16 status = 0;
258 int i = 0;
259
260 list = kzalloc(buf_size, GFP_KERNEL);
261 if (!list) {
262 status = NVME_SC_INTERNAL;
263 goto out;
264 }
265
266 rcu_read_lock();
267 list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
268 if (ns->nsid <= min_nsid)
269 continue;
270 list[i++] = cpu_to_le32(ns->nsid);
271 if (i == buf_size / sizeof(__le32))
272 break;
273 }
274 rcu_read_unlock();
275
276 status = nvmet_copy_to_sgl(req, 0, list, buf_size);
277
278 kfree(list);
279out:
280 nvmet_req_complete(req, status);
281}
282
283/*
284 * A "minimum viable" abort implementation: the command is mandatory in the
285 * spec, but we are not required to do any useful work. We couldn't really
286 * do a useful abort, so don't bother even with waiting for the command
287 * do a useful abort, so don't even bother waiting for the command
288 * to be executed; return immediately, reporting that the command to
289 * abort wasn't found.
290static void nvmet_execute_abort(struct nvmet_req *req)
291{
292 nvmet_set_result(req, 1);
293 nvmet_req_complete(req, 0);
294}
295
296static void nvmet_execute_set_features(struct nvmet_req *req)
297{
298 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
299 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
300 u64 val;
301 u32 val32;
302 u16 status = 0;
303
304 switch (cdw10 & 0xf) {
305 case NVME_FEAT_NUM_QUEUES:
306 nvmet_set_result(req,
307 (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
308 break;
309 case NVME_FEAT_KATO:
310 val = le64_to_cpu(req->cmd->prop_set.value);
311 val32 = val & 0xffff;
312 req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
313 nvmet_set_result(req, req->sq->ctrl->kato);
314 break;
315 default:
316 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
317 break;
318 }
319
320 nvmet_req_complete(req, status);
321}
322
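/*
 * Editor's aside (not part of the patch): the Number of Queues feature
 * result built in nvmet_execute_set_features() above (and mirrored in
 * nvmet_execute_get_features() below) packs two 0's based counts into a
 * single dword: submission queues in bits 15:0, completion queues in
 * bits 31:16. encode_num_queues() is a hypothetical illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t encode_num_queues(uint16_t max_qid)
{
	uint32_t nr = max_qid - 1;	/* 0's based I/O queue count */

	return nr | (nr << 16);
}

int main(void)
{
	/* max_qid = 8: eight I/O SQ/CQ pairs, reported 0's based as 7 */
	printf("0x%08x\n", (unsigned int)encode_num_queues(8));	/* 0x00070007 */
	return 0;
}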
323static void nvmet_execute_get_features(struct nvmet_req *req)
324{
325 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
326 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
327 u16 status = 0;
328
329 switch (cdw10 & 0xf) {
330 /*
331 * These features are mandatory in the spec, but we don't
332 * have a useful way to implement them. We'll eventually
333 * need to come up with some fake values for these.
334 */
335#if 0
336 case NVME_FEAT_ARBITRATION:
337 break;
338 case NVME_FEAT_POWER_MGMT:
339 break;
340 case NVME_FEAT_TEMP_THRESH:
341 break;
342 case NVME_FEAT_ERR_RECOVERY:
343 break;
344 case NVME_FEAT_IRQ_COALESCE:
345 break;
346 case NVME_FEAT_IRQ_CONFIG:
347 break;
348 case NVME_FEAT_WRITE_ATOMIC:
349 break;
350 case NVME_FEAT_ASYNC_EVENT:
351 break;
352#endif
353 case NVME_FEAT_VOLATILE_WC:
354 nvmet_set_result(req, 1);
355 break;
356 case NVME_FEAT_NUM_QUEUES:
357 nvmet_set_result(req,
358 (subsys->max_qid-1) | ((subsys->max_qid-1) << 16));
359 break;
360 case NVME_FEAT_KATO:
361 nvmet_set_result(req, req->sq->ctrl->kato * 1000);
362 break;
363 default:
364 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
365 break;
366 }
367
368 nvmet_req_complete(req, status);
369}
370
371static void nvmet_execute_async_event(struct nvmet_req *req)
372{
373 struct nvmet_ctrl *ctrl = req->sq->ctrl;
374
375 mutex_lock(&ctrl->lock);
376 if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) {
377 mutex_unlock(&ctrl->lock);
378 nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR);
379 return;
380 }
381 ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req;
382 mutex_unlock(&ctrl->lock);
383
384 schedule_work(&ctrl->async_event_work);
385}
386
387static void nvmet_execute_keep_alive(struct nvmet_req *req)
388{
389 struct nvmet_ctrl *ctrl = req->sq->ctrl;
390
391 pr_debug("ctrl %d update keep-alive timer for %d secs\n",
392 ctrl->cntlid, ctrl->kato);
393
394 mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
395 nvmet_req_complete(req, 0);
396}
397
398int nvmet_parse_admin_cmd(struct nvmet_req *req)
399{
400 struct nvme_command *cmd = req->cmd;
401
402 req->ns = NULL;
403
404 if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
405 pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
406 cmd->common.opcode);
407 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
408 }
409 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
410 pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
411 cmd->common.opcode);
412 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
413 }
414
415 switch (cmd->common.opcode) {
416 case nvme_admin_get_log_page:
417 req->data_len = nvmet_get_log_page_len(cmd);
418
419 switch (cmd->get_log_page.lid) {
420 case 0x01:
421 case 0x02:
422 case 0x03:
423 req->execute = nvmet_execute_get_log_page;
424 return 0;
425 }
426 break;
427 case nvme_admin_identify:
428 req->data_len = 4096;
429 switch (le32_to_cpu(cmd->identify.cns)) {
430 case 0x00:
431 req->execute = nvmet_execute_identify_ns;
432 return 0;
433 case 0x01:
434 req->execute = nvmet_execute_identify_ctrl;
435 return 0;
436 case 0x02:
437 req->execute = nvmet_execute_identify_nslist;
438 return 0;
439 }
440 break;
441 case nvme_admin_abort_cmd:
442 req->execute = nvmet_execute_abort;
443 req->data_len = 0;
444 return 0;
445 case nvme_admin_set_features:
446 req->execute = nvmet_execute_set_features;
447 req->data_len = 0;
448 return 0;
449 case nvme_admin_get_features:
450 req->execute = nvmet_execute_get_features;
451 req->data_len = 0;
452 return 0;
453 case nvme_admin_async_event:
454 req->execute = nvmet_execute_async_event;
455 req->data_len = 0;
456 return 0;
457 case nvme_admin_keep_alive:
458 req->execute = nvmet_execute_keep_alive;
459 req->data_len = 0;
460 return 0;
461 }
462
463 pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
464 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
465}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
new file mode 100644
index 000000000000..af5e2dc4a3d5
--- /dev/null
+++ b/drivers/nvme/target/configfs.c
@@ -0,0 +1,917 @@
1/*
2 * Configfs interface for the NVMe target.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/stat.h>
19#include <linux/ctype.h>
20
21#include "nvmet.h"
22
23static struct config_item_type nvmet_host_type;
24static struct config_item_type nvmet_subsys_type;
25
26/*
27 * nvmet_port Generic ConfigFS definitions.
28 * Used in any place in the ConfigFS tree that refers to an address.
29 */
30static ssize_t nvmet_addr_adrfam_show(struct config_item *item,
31 char *page)
32{
33 switch (to_nvmet_port(item)->disc_addr.adrfam) {
34 case NVMF_ADDR_FAMILY_IP4:
35 return sprintf(page, "ipv4\n");
36 case NVMF_ADDR_FAMILY_IP6:
37 return sprintf(page, "ipv6\n");
38 case NVMF_ADDR_FAMILY_IB:
39 return sprintf(page, "ib\n");
40 default:
41 return sprintf(page, "\n");
42 }
43}
44
45static ssize_t nvmet_addr_adrfam_store(struct config_item *item,
46 const char *page, size_t count)
47{
48 struct nvmet_port *port = to_nvmet_port(item);
49
50 if (port->enabled) {
51 pr_err("Cannot modify address while enabled\n");
52 pr_err("Disable the address before modifying\n");
53 return -EACCES;
54 }
55
56 if (sysfs_streq(page, "ipv4")) {
57 port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4;
58 } else if (sysfs_streq(page, "ipv6")) {
59 port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6;
60 } else if (sysfs_streq(page, "ib")) {
61 port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB;
62 } else {
63 pr_err("Invalid value '%s' for adrfam\n", page);
64 return -EINVAL;
65 }
66
67 return count;
68}
69
70CONFIGFS_ATTR(nvmet_, addr_adrfam);
71
72static ssize_t nvmet_addr_portid_show(struct config_item *item,
73 char *page)
74{
75 struct nvmet_port *port = to_nvmet_port(item);
76
77 return snprintf(page, PAGE_SIZE, "%d\n",
78 le16_to_cpu(port->disc_addr.portid));
79}
80
81static ssize_t nvmet_addr_portid_store(struct config_item *item,
82 const char *page, size_t count)
83{
84 struct nvmet_port *port = to_nvmet_port(item);
85 u16 portid = 0;
86
87 if (kstrtou16(page, 0, &portid)) {
88 pr_err("Invalid value '%s' for portid\n", page);
89 return -EINVAL;
90 }
91
92 if (port->enabled) {
93 pr_err("Cannot modify address while enabled\n");
94 pr_err("Disable the address before modifying\n");
95 return -EACCES;
96 }
97 port->disc_addr.portid = cpu_to_le16(portid);
98 return count;
99}
100
101CONFIGFS_ATTR(nvmet_, addr_portid);
102
103static ssize_t nvmet_addr_traddr_show(struct config_item *item,
104 char *page)
105{
106 struct nvmet_port *port = to_nvmet_port(item);
107
108 return snprintf(page, PAGE_SIZE, "%s\n",
109 port->disc_addr.traddr);
110}
111
112static ssize_t nvmet_addr_traddr_store(struct config_item *item,
113 const char *page, size_t count)
114{
115 struct nvmet_port *port = to_nvmet_port(item);
116
117 if (count > NVMF_TRADDR_SIZE) {
118 pr_err("Invalid value '%s' for traddr\n", page);
119 return -EINVAL;
120 }
121
122 if (port->enabled) {
123 pr_err("Cannot modify address while enabled\n");
124 pr_err("Disable the address before modifying\n");
125 return -EACCES;
126 }
127 return snprintf(port->disc_addr.traddr,
128 sizeof(port->disc_addr.traddr), "%s", page);
129}
130
131CONFIGFS_ATTR(nvmet_, addr_traddr);
132
133static ssize_t nvmet_addr_treq_show(struct config_item *item,
134 char *page)
135{
136 switch (to_nvmet_port(item)->disc_addr.treq) {
137 case NVMF_TREQ_NOT_SPECIFIED:
138 return sprintf(page, "not specified\n");
139 case NVMF_TREQ_REQUIRED:
140 return sprintf(page, "required\n");
141 case NVMF_TREQ_NOT_REQUIRED:
142 return sprintf(page, "not required\n");
143 default:
144 return sprintf(page, "\n");
145 }
146}
147
148static ssize_t nvmet_addr_treq_store(struct config_item *item,
149 const char *page, size_t count)
150{
151 struct nvmet_port *port = to_nvmet_port(item);
152
153 if (port->enabled) {
154 pr_err("Cannot modify address while enabled\n");
155 pr_err("Disable the address before modifying\n");
156 return -EACCES;
157 }
158
159 if (sysfs_streq(page, "not specified")) {
160 port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED;
161 } else if (sysfs_streq(page, "required")) {
162 port->disc_addr.treq = NVMF_TREQ_REQUIRED;
163 } else if (sysfs_streq(page, "not required")) {
164 port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED;
165 } else {
166 pr_err("Invalid value '%s' for treq\n", page);
167 return -EINVAL;
168 }
169
170 return count;
171}
172
173CONFIGFS_ATTR(nvmet_, addr_treq);
174
175static ssize_t nvmet_addr_trsvcid_show(struct config_item *item,
176 char *page)
177{
178 struct nvmet_port *port = to_nvmet_port(item);
179
180 return snprintf(page, PAGE_SIZE, "%s\n",
181 port->disc_addr.trsvcid);
182}
183
184static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
185 const char *page, size_t count)
186{
187 struct nvmet_port *port = to_nvmet_port(item);
188
189 if (count > NVMF_TRSVCID_SIZE) {
190 pr_err("Invalid value '%s' for trsvcid\n", page);
191 return -EINVAL;
192 }
193 if (port->enabled) {
194 pr_err("Cannot modify address while enabled\n");
195 pr_err("Disable the address before modifying\n");
196 return -EACCES;
197 }
198 return snprintf(port->disc_addr.trsvcid,
199 sizeof(port->disc_addr.trsvcid), "%s", page);
200}
201
202CONFIGFS_ATTR(nvmet_, addr_trsvcid);
203
204static ssize_t nvmet_addr_trtype_show(struct config_item *item,
205 char *page)
206{
207 switch (to_nvmet_port(item)->disc_addr.trtype) {
208 case NVMF_TRTYPE_RDMA:
209 return sprintf(page, "rdma\n");
210 case NVMF_TRTYPE_LOOP:
211 return sprintf(page, "loop\n");
212 default:
213 return sprintf(page, "\n");
214 }
215}
216
217static void nvmet_port_init_tsas_rdma(struct nvmet_port *port)
218{
219 port->disc_addr.trtype = NVMF_TRTYPE_RDMA;
220 memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE);
221 port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED;
222 port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED;
223 port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM;
224}
225
226static void nvmet_port_init_tsas_loop(struct nvmet_port *port)
227{
228 port->disc_addr.trtype = NVMF_TRTYPE_LOOP;
229 memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
230}
231
232static ssize_t nvmet_addr_trtype_store(struct config_item *item,
233 const char *page, size_t count)
234{
235 struct nvmet_port *port = to_nvmet_port(item);
236
237 if (port->enabled) {
238 pr_err("Cannot modify address while enabled\n");
239 pr_err("Disable the address before modifying\n");
240 return -EACCES;
241 }
242
243 if (sysfs_streq(page, "rdma")) {
244 nvmet_port_init_tsas_rdma(port);
245 } else if (sysfs_streq(page, "loop")) {
246 nvmet_port_init_tsas_loop(port);
247 } else {
248 pr_err("Invalid value '%s' for trtype\n", page);
249 return -EINVAL;
250 }
251
252 return count;
253}
254
255CONFIGFS_ATTR(nvmet_, addr_trtype);
256
257/*
258 * Namespace structures & file operation functions below
259 */
260static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page)
261{
262 return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path);
263}
264
265static ssize_t nvmet_ns_device_path_store(struct config_item *item,
266 const char *page, size_t count)
267{
268 struct nvmet_ns *ns = to_nvmet_ns(item);
269 struct nvmet_subsys *subsys = ns->subsys;
270 int ret;
271
272 mutex_lock(&subsys->lock);
273 ret = -EBUSY;
274 if (nvmet_ns_enabled(ns))
275 goto out_unlock;
276
277 kfree(ns->device_path);
278
279 ret = -ENOMEM;
280 ns->device_path = kstrdup(page, GFP_KERNEL);
281 if (!ns->device_path)
282 goto out_unlock;
283
284 mutex_unlock(&subsys->lock);
285 return count;
286
287out_unlock:
288 mutex_unlock(&subsys->lock);
289 return ret;
290}
291
292CONFIGFS_ATTR(nvmet_ns_, device_path);
293
294static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page)
295{
296 return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid);
297}
298
299static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
300 const char *page, size_t count)
301{
302 struct nvmet_ns *ns = to_nvmet_ns(item);
303 struct nvmet_subsys *subsys = ns->subsys;
304 u8 nguid[16];
305 const char *p = page;
306 int i;
307 int ret = 0;
308
309 mutex_lock(&subsys->lock);
310 if (nvmet_ns_enabled(ns)) {
311 ret = -EBUSY;
312 goto out_unlock;
313 }
314
315 for (i = 0; i < 16; i++) {
316 if (p + 2 > page + count) {
317 ret = -EINVAL;
318 goto out_unlock;
319 }
320 if (!isxdigit(p[0]) || !isxdigit(p[1])) {
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]);
326 p += 2;
327
328 if (*p == '-' || *p == ':')
329 p++;
330 }
331
332 memcpy(&ns->nguid, nguid, sizeof(nguid));
333out_unlock:
334 mutex_unlock(&subsys->lock);
335 return ret ? ret : count;
336}
337
338CONFIGFS_ATTR(nvmet_ns_, device_nguid);
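
The nguid store handler above accepts 32 hex digits, two per byte, with an optional '-' or ':' separator allowed after each byte, so both "00112233445566778899aabbccddeeff" and the UUID-style "00112233-4455-6677-8899-aabbccddeeff" parse to the same 16 bytes. A minimal userspace sketch of the same loop, with a local hex_val() standing in for the kernel's hex_to_bin() (the names here are illustrative only, not part of the patch):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* userspace stand-in for the kernel's hex_to_bin() */
    static int hex_val(char c)
    {
            return isdigit(c) ? c - '0' : tolower(c) - 'a' + 10;
    }

    /* parse 16 hex byte pairs, skipping one '-' or ':' after each pair */
    static int parse_nguid(const char *p, size_t count, unsigned char nguid[16])
    {
            const char *end = p + count;
            int i;

            for (i = 0; i < 16; i++) {
                    if (p + 2 > end || !isxdigit(p[0]) || !isxdigit(p[1]))
                            return -1;
                    nguid[i] = (hex_val(p[0]) << 4) | hex_val(p[1]);
                    p += 2;
                    if (p < end && (*p == '-' || *p == ':'))
                            p++;
            }
            return 0;
    }

    int main(void)
    {
            unsigned char nguid[16];
            const char *s = "00112233-4455-6677-8899-aabbccddeeff";

            if (!parse_nguid(s, strlen(s), nguid))
                    printf("first byte 0x%02x, last byte 0x%02x\n",
                           nguid[0], nguid[15]);
            return 0;
    }
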
339
340static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
341{
342 return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item)));
343}
344
345static ssize_t nvmet_ns_enable_store(struct config_item *item,
346 const char *page, size_t count)
347{
348 struct nvmet_ns *ns = to_nvmet_ns(item);
349 bool enable;
350 int ret = 0;
351
352 if (strtobool(page, &enable))
353 return -EINVAL;
354
355 if (enable)
356 ret = nvmet_ns_enable(ns);
357 else
358 nvmet_ns_disable(ns);
359
360 return ret ? ret : count;
361}
362
363CONFIGFS_ATTR(nvmet_ns_, enable);
364
365static struct configfs_attribute *nvmet_ns_attrs[] = {
366 &nvmet_ns_attr_device_path,
367 &nvmet_ns_attr_device_nguid,
368 &nvmet_ns_attr_enable,
369 NULL,
370};
371
372static void nvmet_ns_release(struct config_item *item)
373{
374 struct nvmet_ns *ns = to_nvmet_ns(item);
375
376 nvmet_ns_free(ns);
377}
378
379static struct configfs_item_operations nvmet_ns_item_ops = {
380 .release = nvmet_ns_release,
381};
382
383static struct config_item_type nvmet_ns_type = {
384 .ct_item_ops = &nvmet_ns_item_ops,
385 .ct_attrs = nvmet_ns_attrs,
386 .ct_owner = THIS_MODULE,
387};
388
389static struct config_group *nvmet_ns_make(struct config_group *group,
390 const char *name)
391{
392 struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item);
393 struct nvmet_ns *ns;
394 int ret;
395 u32 nsid;
396
397 ret = kstrtou32(name, 0, &nsid);
398 if (ret)
399 goto out;
400
401 ret = -EINVAL;
402 if (nsid == 0 || nsid == 0xffffffff)
403 goto out;
404
405 ret = -ENOMEM;
406 ns = nvmet_ns_alloc(subsys, nsid);
407 if (!ns)
408 goto out;
409 config_group_init_type_name(&ns->group, name, &nvmet_ns_type);
410
411 pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn);
412
413 return &ns->group;
414out:
415 return ERR_PTR(ret);
416}
417
418static struct configfs_group_operations nvmet_namespaces_group_ops = {
419 .make_group = nvmet_ns_make,
420};
421
422static struct config_item_type nvmet_namespaces_type = {
423 .ct_group_ops = &nvmet_namespaces_group_ops,
424 .ct_owner = THIS_MODULE,
425};
426
427static int nvmet_port_subsys_allow_link(struct config_item *parent,
428 struct config_item *target)
429{
430 struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
431 struct nvmet_subsys *subsys;
432 struct nvmet_subsys_link *link, *p;
433 int ret;
434
435 if (target->ci_type != &nvmet_subsys_type) {
436		pr_err("can only link subsystems into the subsystems directory!\n");
437 return -EINVAL;
438 }
439 subsys = to_subsys(target);
440 link = kmalloc(sizeof(*link), GFP_KERNEL);
441 if (!link)
442 return -ENOMEM;
443 link->subsys = subsys;
444
445 down_write(&nvmet_config_sem);
446 ret = -EEXIST;
447 list_for_each_entry(p, &port->subsystems, entry) {
448 if (p->subsys == subsys)
449 goto out_free_link;
450 }
451
452 if (list_empty(&port->subsystems)) {
453 ret = nvmet_enable_port(port);
454 if (ret)
455 goto out_free_link;
456 }
457
458 list_add_tail(&link->entry, &port->subsystems);
459 nvmet_genctr++;
460 up_write(&nvmet_config_sem);
461 return 0;
462
463out_free_link:
464 up_write(&nvmet_config_sem);
465 kfree(link);
466 return ret;
467}
468
469static int nvmet_port_subsys_drop_link(struct config_item *parent,
470 struct config_item *target)
471{
472 struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
473 struct nvmet_subsys *subsys = to_subsys(target);
474 struct nvmet_subsys_link *p;
475
476 down_write(&nvmet_config_sem);
477 list_for_each_entry(p, &port->subsystems, entry) {
478 if (p->subsys == subsys)
479 goto found;
480 }
481 up_write(&nvmet_config_sem);
482 return -EINVAL;
483
484found:
485 list_del(&p->entry);
486 nvmet_genctr++;
487 if (list_empty(&port->subsystems))
488 nvmet_disable_port(port);
489 up_write(&nvmet_config_sem);
490 kfree(p);
491 return 0;
492}
493
494static struct configfs_item_operations nvmet_port_subsys_item_ops = {
495 .allow_link = nvmet_port_subsys_allow_link,
496 .drop_link = nvmet_port_subsys_drop_link,
497};
498
499static struct config_item_type nvmet_port_subsys_type = {
500 .ct_item_ops = &nvmet_port_subsys_item_ops,
501 .ct_owner = THIS_MODULE,
502};
503
504static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
505 struct config_item *target)
506{
507 struct nvmet_subsys *subsys = to_subsys(parent->ci_parent);
508 struct nvmet_host *host;
509 struct nvmet_host_link *link, *p;
510 int ret;
511
512 if (target->ci_type != &nvmet_host_type) {
513 pr_err("can only link hosts into the allowed_hosts directory!\n");
514 return -EINVAL;
515 }
516
517 host = to_host(target);
518 link = kmalloc(sizeof(*link), GFP_KERNEL);
519 if (!link)
520 return -ENOMEM;
521 link->host = host;
522
523 down_write(&nvmet_config_sem);
524 ret = -EINVAL;
525 if (subsys->allow_any_host) {
526 pr_err("can't add hosts when allow_any_host is set!\n");
527 goto out_free_link;
528 }
529
530 ret = -EEXIST;
531 list_for_each_entry(p, &subsys->hosts, entry) {
532 if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host)))
533 goto out_free_link;
534 }
535 list_add_tail(&link->entry, &subsys->hosts);
536 nvmet_genctr++;
537 up_write(&nvmet_config_sem);
538 return 0;
539out_free_link:
540 up_write(&nvmet_config_sem);
541 kfree(link);
542 return ret;
543}
544
545static int nvmet_allowed_hosts_drop_link(struct config_item *parent,
546 struct config_item *target)
547{
548 struct nvmet_subsys *subsys = to_subsys(parent->ci_parent);
549 struct nvmet_host *host = to_host(target);
550 struct nvmet_host_link *p;
551
552 down_write(&nvmet_config_sem);
553 list_for_each_entry(p, &subsys->hosts, entry) {
554 if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host)))
555 goto found;
556 }
557 up_write(&nvmet_config_sem);
558 return -EINVAL;
559
560found:
561 list_del(&p->entry);
562 nvmet_genctr++;
563 up_write(&nvmet_config_sem);
564 kfree(p);
565 return 0;
566}
567
568static struct configfs_item_operations nvmet_allowed_hosts_item_ops = {
569 .allow_link = nvmet_allowed_hosts_allow_link,
570 .drop_link = nvmet_allowed_hosts_drop_link,
571};
572
573static struct config_item_type nvmet_allowed_hosts_type = {
574 .ct_item_ops = &nvmet_allowed_hosts_item_ops,
575 .ct_owner = THIS_MODULE,
576};
577
578static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item,
579 char *page)
580{
581 return snprintf(page, PAGE_SIZE, "%d\n",
582 to_subsys(item)->allow_any_host);
583}
584
585static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
586 const char *page, size_t count)
587{
588 struct nvmet_subsys *subsys = to_subsys(item);
589 bool allow_any_host;
590 int ret = 0;
591
592 if (strtobool(page, &allow_any_host))
593 return -EINVAL;
594
595 down_write(&nvmet_config_sem);
596 if (allow_any_host && !list_empty(&subsys->hosts)) {
597 pr_err("Can't set allow_any_host when explicit hosts are set!\n");
598 ret = -EINVAL;
599 goto out_unlock;
600 }
601
602 subsys->allow_any_host = allow_any_host;
603out_unlock:
604 up_write(&nvmet_config_sem);
605 return ret ? ret : count;
606}
607
608CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host);
609
610static struct configfs_attribute *nvmet_subsys_attrs[] = {
611 &nvmet_subsys_attr_attr_allow_any_host,
612 NULL,
613};
614
615/*
616 * Subsystem structures & folder operation functions below
617 */
618static void nvmet_subsys_release(struct config_item *item)
619{
620 struct nvmet_subsys *subsys = to_subsys(item);
621
622 nvmet_subsys_put(subsys);
623}
624
625static struct configfs_item_operations nvmet_subsys_item_ops = {
626 .release = nvmet_subsys_release,
627};
628
629static struct config_item_type nvmet_subsys_type = {
630 .ct_item_ops = &nvmet_subsys_item_ops,
631 .ct_attrs = nvmet_subsys_attrs,
632 .ct_owner = THIS_MODULE,
633};
634
635static struct config_group *nvmet_subsys_make(struct config_group *group,
636 const char *name)
637{
638 struct nvmet_subsys *subsys;
639
640 if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) {
641 pr_err("can't create discovery subsystem through configfs\n");
642 return ERR_PTR(-EINVAL);
643 }
644
645 subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME);
646 if (!subsys)
647 return ERR_PTR(-ENOMEM);
648
649 config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type);
650
651 config_group_init_type_name(&subsys->namespaces_group,
652 "namespaces", &nvmet_namespaces_type);
653 configfs_add_default_group(&subsys->namespaces_group, &subsys->group);
654
655 config_group_init_type_name(&subsys->allowed_hosts_group,
656 "allowed_hosts", &nvmet_allowed_hosts_type);
657 configfs_add_default_group(&subsys->allowed_hosts_group,
658 &subsys->group);
659
660 return &subsys->group;
661}
662
663static struct configfs_group_operations nvmet_subsystems_group_ops = {
664 .make_group = nvmet_subsys_make,
665};
666
667static struct config_item_type nvmet_subsystems_type = {
668 .ct_group_ops = &nvmet_subsystems_group_ops,
669 .ct_owner = THIS_MODULE,
670};
671
672static ssize_t nvmet_referral_enable_show(struct config_item *item,
673 char *page)
674{
675 return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled);
676}
677
678static ssize_t nvmet_referral_enable_store(struct config_item *item,
679 const char *page, size_t count)
680{
681 struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
682 struct nvmet_port *port = to_nvmet_port(item);
683 bool enable;
684
685 if (strtobool(page, &enable))
686 goto inval;
687
688 if (enable)
689 nvmet_referral_enable(parent, port);
690 else
691 nvmet_referral_disable(port);
692
693 return count;
694inval:
695 pr_err("Invalid value '%s' for enable\n", page);
696 return -EINVAL;
697}
698
699CONFIGFS_ATTR(nvmet_referral_, enable);
700
701/*
702 * Discovery Service subsystem definitions
703 */
704static struct configfs_attribute *nvmet_referral_attrs[] = {
705 &nvmet_attr_addr_adrfam,
706 &nvmet_attr_addr_portid,
707 &nvmet_attr_addr_treq,
708 &nvmet_attr_addr_traddr,
709 &nvmet_attr_addr_trsvcid,
710 &nvmet_attr_addr_trtype,
711 &nvmet_referral_attr_enable,
712 NULL,
713};
714
715static void nvmet_referral_release(struct config_item *item)
716{
717 struct nvmet_port *port = to_nvmet_port(item);
718
719 nvmet_referral_disable(port);
720 kfree(port);
721}
722
723static struct configfs_item_operations nvmet_referral_item_ops = {
724 .release = nvmet_referral_release,
725};
726
727static struct config_item_type nvmet_referral_type = {
728 .ct_owner = THIS_MODULE,
729 .ct_attrs = nvmet_referral_attrs,
730 .ct_item_ops = &nvmet_referral_item_ops,
731};
732
733static struct config_group *nvmet_referral_make(
734 struct config_group *group, const char *name)
735{
736 struct nvmet_port *port;
737
738 port = kzalloc(sizeof(*port), GFP_KERNEL);
739 if (!port)
740 return ERR_PTR(-ENOMEM);
741
742 INIT_LIST_HEAD(&port->entry);
743 config_group_init_type_name(&port->group, name, &nvmet_referral_type);
744
745 return &port->group;
746}
747
748static struct configfs_group_operations nvmet_referral_group_ops = {
749 .make_group = nvmet_referral_make,
750};
751
752static struct config_item_type nvmet_referrals_type = {
753 .ct_owner = THIS_MODULE,
754 .ct_group_ops = &nvmet_referral_group_ops,
755};
756
757/*
758 * Ports definitions.
759 */
760static void nvmet_port_release(struct config_item *item)
761{
762 struct nvmet_port *port = to_nvmet_port(item);
763
764 kfree(port);
765}
766
767static struct configfs_attribute *nvmet_port_attrs[] = {
768 &nvmet_attr_addr_adrfam,
769 &nvmet_attr_addr_treq,
770 &nvmet_attr_addr_traddr,
771 &nvmet_attr_addr_trsvcid,
772 &nvmet_attr_addr_trtype,
773 NULL,
774};
775
776static struct configfs_item_operations nvmet_port_item_ops = {
777 .release = nvmet_port_release,
778};
779
780static struct config_item_type nvmet_port_type = {
781 .ct_attrs = nvmet_port_attrs,
782 .ct_item_ops = &nvmet_port_item_ops,
783 .ct_owner = THIS_MODULE,
784};
785
786static struct config_group *nvmet_ports_make(struct config_group *group,
787 const char *name)
788{
789 struct nvmet_port *port;
790 u16 portid;
791
792 if (kstrtou16(name, 0, &portid))
793 return ERR_PTR(-EINVAL);
794
795 port = kzalloc(sizeof(*port), GFP_KERNEL);
796 if (!port)
797 return ERR_PTR(-ENOMEM);
798
799 INIT_LIST_HEAD(&port->entry);
800 INIT_LIST_HEAD(&port->subsystems);
801 INIT_LIST_HEAD(&port->referrals);
802
803 port->disc_addr.portid = cpu_to_le16(portid);
804 config_group_init_type_name(&port->group, name, &nvmet_port_type);
805
806 config_group_init_type_name(&port->subsys_group,
807 "subsystems", &nvmet_port_subsys_type);
808 configfs_add_default_group(&port->subsys_group, &port->group);
809
810 config_group_init_type_name(&port->referrals_group,
811 "referrals", &nvmet_referrals_type);
812 configfs_add_default_group(&port->referrals_group, &port->group);
813
814 return &port->group;
815}
816
817static struct configfs_group_operations nvmet_ports_group_ops = {
818 .make_group = nvmet_ports_make,
819};
820
821static struct config_item_type nvmet_ports_type = {
822 .ct_group_ops = &nvmet_ports_group_ops,
823 .ct_owner = THIS_MODULE,
824};
825
826static struct config_group nvmet_subsystems_group;
827static struct config_group nvmet_ports_group;
828
829static void nvmet_host_release(struct config_item *item)
830{
831 struct nvmet_host *host = to_host(item);
832
833 kfree(host);
834}
835
836static struct configfs_item_operations nvmet_host_item_ops = {
837 .release = nvmet_host_release,
838};
839
840static struct config_item_type nvmet_host_type = {
841 .ct_item_ops = &nvmet_host_item_ops,
842 .ct_owner = THIS_MODULE,
843};
844
845static struct config_group *nvmet_hosts_make_group(struct config_group *group,
846 const char *name)
847{
848 struct nvmet_host *host;
849
850 host = kzalloc(sizeof(*host), GFP_KERNEL);
851 if (!host)
852 return ERR_PTR(-ENOMEM);
853
854 config_group_init_type_name(&host->group, name, &nvmet_host_type);
855
856 return &host->group;
857}
858
859static struct configfs_group_operations nvmet_hosts_group_ops = {
860 .make_group = nvmet_hosts_make_group,
861};
862
863static struct config_item_type nvmet_hosts_type = {
864 .ct_group_ops = &nvmet_hosts_group_ops,
865 .ct_owner = THIS_MODULE,
866};
867
868static struct config_group nvmet_hosts_group;
869
870static struct config_item_type nvmet_root_type = {
871 .ct_owner = THIS_MODULE,
872};
873
874static struct configfs_subsystem nvmet_configfs_subsystem = {
875 .su_group = {
876 .cg_item = {
877 .ci_namebuf = "nvmet",
878 .ci_type = &nvmet_root_type,
879 },
880 },
881};
882
883int __init nvmet_init_configfs(void)
884{
885 int ret;
886
887 config_group_init(&nvmet_configfs_subsystem.su_group);
888 mutex_init(&nvmet_configfs_subsystem.su_mutex);
889
890 config_group_init_type_name(&nvmet_subsystems_group,
891 "subsystems", &nvmet_subsystems_type);
892 configfs_add_default_group(&nvmet_subsystems_group,
893 &nvmet_configfs_subsystem.su_group);
894
895 config_group_init_type_name(&nvmet_ports_group,
896 "ports", &nvmet_ports_type);
897 configfs_add_default_group(&nvmet_ports_group,
898 &nvmet_configfs_subsystem.su_group);
899
900 config_group_init_type_name(&nvmet_hosts_group,
901 "hosts", &nvmet_hosts_type);
902 configfs_add_default_group(&nvmet_hosts_group,
903 &nvmet_configfs_subsystem.su_group);
904
905 ret = configfs_register_subsystem(&nvmet_configfs_subsystem);
906 if (ret) {
907 pr_err("configfs_register_subsystem: %d\n", ret);
908 return ret;
909 }
910
911 return 0;
912}
913
914void __exit nvmet_exit_configfs(void)
915{
916 configfs_unregister_subsystem(&nvmet_configfs_subsystem);
917}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
new file mode 100644
index 000000000000..8a891ca53367
--- /dev/null
+++ b/drivers/nvme/target/core.c
@@ -0,0 +1,964 @@
1/*
2 * Common code for the NVMe target.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/module.h>
16#include "nvmet.h"
17
18static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
19
20/*
21 * This read/write semaphore is used to synchronize access to configuration
22 * information on a target system that will result in discovery log page
23 * information change for at least one host.
24 * The full list of resources protected by this semaphore is:
25 *
26 * - subsystems list
27 * - per-subsystem allowed hosts list
28 * - allow_any_host subsystem attribute
29 * - nvmet_genctr
30 * - the nvmet_transports array
31 *
32 * When updating any of these lists/structures, the write lock should be
33 * obtained, while readers (populating the discovery log page or checking a
34 * host-subsystem link) take the read lock to allow concurrent reads.
35 */
36DECLARE_RWSEM(nvmet_config_sem);
37
38static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
39 const char *subsysnqn);
40
41u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
42 size_t len)
43{
44 if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
45 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
46 return 0;
47}
48
49u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
50{
51 if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
52 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
53 return 0;
54}
55
56static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
57{
58 return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
59}
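
For reference, the helper above packs the whole Asynchronous Event completion result into one 32-bit dword: event type in bits 7:0, event information in bits 15:8 and the associated log page identifier in bits 23:16. A self-contained check of that layout (the field values below are arbitrary examples chosen only for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t event_type = 0x02, event_info = 0x01, log_page = 0x70;
            uint32_t result = event_type | (event_info << 8) | (log_page << 16);

            /* prints "AEN result dword: 0x700102" */
            printf("AEN result dword: 0x%06x\n", result);
            return 0;
    }
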
60
61static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
62{
63 struct nvmet_req *req;
64
65 while (1) {
66 mutex_lock(&ctrl->lock);
67 if (!ctrl->nr_async_event_cmds) {
68 mutex_unlock(&ctrl->lock);
69 return;
70 }
71
72 req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
73 mutex_unlock(&ctrl->lock);
74 nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
75 }
76}
77
78static void nvmet_async_event_work(struct work_struct *work)
79{
80 struct nvmet_ctrl *ctrl =
81 container_of(work, struct nvmet_ctrl, async_event_work);
82 struct nvmet_async_event *aen;
83 struct nvmet_req *req;
84
85 while (1) {
86 mutex_lock(&ctrl->lock);
87 aen = list_first_entry_or_null(&ctrl->async_events,
88 struct nvmet_async_event, entry);
89 if (!aen || !ctrl->nr_async_event_cmds) {
90 mutex_unlock(&ctrl->lock);
91 return;
92 }
93
94 req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
95 nvmet_set_result(req, nvmet_async_event_result(aen));
96
97 list_del(&aen->entry);
98 kfree(aen);
99
100 mutex_unlock(&ctrl->lock);
101 nvmet_req_complete(req, 0);
102 }
103}
104
105static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
106 u8 event_info, u8 log_page)
107{
108 struct nvmet_async_event *aen;
109
110 aen = kmalloc(sizeof(*aen), GFP_KERNEL);
111 if (!aen)
112 return;
113
114 aen->event_type = event_type;
115 aen->event_info = event_info;
116 aen->log_page = log_page;
117
118 mutex_lock(&ctrl->lock);
119 list_add_tail(&aen->entry, &ctrl->async_events);
120 mutex_unlock(&ctrl->lock);
121
122 schedule_work(&ctrl->async_event_work);
123}
124
125int nvmet_register_transport(struct nvmet_fabrics_ops *ops)
126{
127 int ret = 0;
128
129 down_write(&nvmet_config_sem);
130 if (nvmet_transports[ops->type])
131 ret = -EINVAL;
132 else
133 nvmet_transports[ops->type] = ops;
134 up_write(&nvmet_config_sem);
135
136 return ret;
137}
138EXPORT_SYMBOL_GPL(nvmet_register_transport);
139
140void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops)
141{
142 down_write(&nvmet_config_sem);
143 nvmet_transports[ops->type] = NULL;
144 up_write(&nvmet_config_sem);
145}
146EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
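
nvmet_register_transport() and nvmet_unregister_transport() are the entry points a fabrics transport driver uses to plug into this core. Below is a rough sketch of such a driver; it only fills in the nvmet_fabrics_ops members exercised elsewhere in this patch (owner, type, add_port, remove_port, queue_response, delete_ctrl), the handler bodies are placeholders, and any remaining fields of the ops structure are left out, so treat it as an outline rather than a working transport:

    #include <linux/module.h>

    #include "nvmet.h"

    static int demo_add_port(struct nvmet_port *port)
    {
            /* start listening for connections on port->disc_addr here */
            return 0;
    }

    static void demo_remove_port(struct nvmet_port *port)
    {
            /* tear down whatever demo_add_port() set up */
    }

    static void demo_queue_response(struct nvmet_req *req)
    {
            /* transmit req->rsp back to the host */
    }

    static void demo_delete_ctrl(struct nvmet_ctrl *ctrl)
    {
            /* disconnect all queues that belong to ctrl */
    }

    static struct nvmet_fabrics_ops demo_ops = {
            .owner          = THIS_MODULE,
            .type           = NVMF_TRTYPE_LOOP,     /* placeholder transport type */
            .add_port       = demo_add_port,
            .remove_port    = demo_remove_port,
            .queue_response = demo_queue_response,
            .delete_ctrl    = demo_delete_ctrl,
    };

    static int __init demo_init(void)
    {
            return nvmet_register_transport(&demo_ops);
    }

    static void __exit demo_exit(void)
    {
            nvmet_unregister_transport(&demo_ops);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL v2");
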
147
148int nvmet_enable_port(struct nvmet_port *port)
149{
150 struct nvmet_fabrics_ops *ops;
151 int ret;
152
153 lockdep_assert_held(&nvmet_config_sem);
154
155 ops = nvmet_transports[port->disc_addr.trtype];
156 if (!ops) {
157 up_write(&nvmet_config_sem);
158 request_module("nvmet-transport-%d", port->disc_addr.trtype);
159 down_write(&nvmet_config_sem);
160 ops = nvmet_transports[port->disc_addr.trtype];
161 if (!ops) {
162 pr_err("transport type %d not supported\n",
163 port->disc_addr.trtype);
164 return -EINVAL;
165 }
166 }
167
168 if (!try_module_get(ops->owner))
169 return -EINVAL;
170
171 ret = ops->add_port(port);
172 if (ret) {
173 module_put(ops->owner);
174 return ret;
175 }
176
177 port->enabled = true;
178 return 0;
179}
180
181void nvmet_disable_port(struct nvmet_port *port)
182{
183 struct nvmet_fabrics_ops *ops;
184
185 lockdep_assert_held(&nvmet_config_sem);
186
187 port->enabled = false;
188
189 ops = nvmet_transports[port->disc_addr.trtype];
190 ops->remove_port(port);
191 module_put(ops->owner);
192}
193
194static void nvmet_keep_alive_timer(struct work_struct *work)
195{
196 struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
197 struct nvmet_ctrl, ka_work);
198
199 pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
200 ctrl->cntlid, ctrl->kato);
201
202 ctrl->ops->delete_ctrl(ctrl);
203}
204
205static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
206{
207 pr_debug("ctrl %d start keep-alive timer for %d secs\n",
208 ctrl->cntlid, ctrl->kato);
209
210 INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
211 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
212}
213
214static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
215{
216 pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
217
218 cancel_delayed_work_sync(&ctrl->ka_work);
219}
220
221static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
222 __le32 nsid)
223{
224 struct nvmet_ns *ns;
225
226 list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
227 if (ns->nsid == le32_to_cpu(nsid))
228 return ns;
229 }
230
231 return NULL;
232}
233
234struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
235{
236 struct nvmet_ns *ns;
237
238 rcu_read_lock();
239 ns = __nvmet_find_namespace(ctrl, nsid);
240 if (ns)
241 percpu_ref_get(&ns->ref);
242 rcu_read_unlock();
243
244 return ns;
245}
246
247static void nvmet_destroy_namespace(struct percpu_ref *ref)
248{
249 struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
250
251 complete(&ns->disable_done);
252}
253
254void nvmet_put_namespace(struct nvmet_ns *ns)
255{
256 percpu_ref_put(&ns->ref);
257}
258
259int nvmet_ns_enable(struct nvmet_ns *ns)
260{
261 struct nvmet_subsys *subsys = ns->subsys;
262 struct nvmet_ctrl *ctrl;
263 int ret = 0;
264
265 mutex_lock(&subsys->lock);
266 if (!list_empty(&ns->dev_link))
267 goto out_unlock;
268
269 ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
270 NULL);
271 if (IS_ERR(ns->bdev)) {
272 pr_err("nvmet: failed to open block device %s: (%ld)\n",
273 ns->device_path, PTR_ERR(ns->bdev));
274 ret = PTR_ERR(ns->bdev);
275 ns->bdev = NULL;
276 goto out_unlock;
277 }
278
279 ns->size = i_size_read(ns->bdev->bd_inode);
280 ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
281
282 ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
283 0, GFP_KERNEL);
284 if (ret)
285 goto out_blkdev_put;
286
287 if (ns->nsid > subsys->max_nsid)
288 subsys->max_nsid = ns->nsid;
289
290 /*
291 * The namespaces list needs to be sorted to simplify the implementation
292	 * of the Identify Namespace List subcommand.
293 */
294 if (list_empty(&subsys->namespaces)) {
295 list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
296 } else {
297 struct nvmet_ns *old;
298
299 list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
300 BUG_ON(ns->nsid == old->nsid);
301 if (ns->nsid < old->nsid)
302 break;
303 }
304
305 list_add_tail_rcu(&ns->dev_link, &old->dev_link);
306 }
307
308 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
309 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
310
311 ret = 0;
312out_unlock:
313 mutex_unlock(&subsys->lock);
314 return ret;
315out_blkdev_put:
316 blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
317 ns->bdev = NULL;
318 goto out_unlock;
319}
320
321void nvmet_ns_disable(struct nvmet_ns *ns)
322{
323 struct nvmet_subsys *subsys = ns->subsys;
324 struct nvmet_ctrl *ctrl;
325
326 mutex_lock(&subsys->lock);
327 if (list_empty(&ns->dev_link)) {
328 mutex_unlock(&subsys->lock);
329 return;
330 }
331 list_del_init(&ns->dev_link);
332 mutex_unlock(&subsys->lock);
333
334 /*
335 * Now that we removed the namespaces from the lookup list, we
336 * can kill the per_cpu ref and wait for any remaining references
337	 * to be dropped, as well as an RCU grace period for anyone only
338	 * using the namespace under rcu_read_lock(). Note that we can't
339 * use call_rcu here as we need to ensure the namespaces have
340 * been fully destroyed before unloading the module.
341 */
342 percpu_ref_kill(&ns->ref);
343 synchronize_rcu();
344 wait_for_completion(&ns->disable_done);
345 percpu_ref_exit(&ns->ref);
346
347 mutex_lock(&subsys->lock);
348 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
349 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
350
351 if (ns->bdev)
352 blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
353 mutex_unlock(&subsys->lock);
354}
355
356void nvmet_ns_free(struct nvmet_ns *ns)
357{
358 nvmet_ns_disable(ns);
359
360 kfree(ns->device_path);
361 kfree(ns);
362}
363
364struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
365{
366 struct nvmet_ns *ns;
367
368 ns = kzalloc(sizeof(*ns), GFP_KERNEL);
369 if (!ns)
370 return NULL;
371
372 INIT_LIST_HEAD(&ns->dev_link);
373 init_completion(&ns->disable_done);
374
375 ns->nsid = nsid;
376 ns->subsys = subsys;
377
378 return ns;
379}
380
381static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
382{
383 if (status)
384 nvmet_set_status(req, status);
385
386 /* XXX: need to fill in something useful for sq_head */
387 req->rsp->sq_head = 0;
388 if (likely(req->sq)) /* may happen during early failure */
389 req->rsp->sq_id = cpu_to_le16(req->sq->qid);
390 req->rsp->command_id = req->cmd->common.command_id;
391
392 if (req->ns)
393 nvmet_put_namespace(req->ns);
394 req->ops->queue_response(req);
395}
396
397void nvmet_req_complete(struct nvmet_req *req, u16 status)
398{
399 __nvmet_req_complete(req, status);
400 percpu_ref_put(&req->sq->ref);
401}
402EXPORT_SYMBOL_GPL(nvmet_req_complete);
403
404void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
405 u16 qid, u16 size)
406{
407 cq->qid = qid;
408 cq->size = size;
409
410 ctrl->cqs[qid] = cq;
411}
412
413void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
414 u16 qid, u16 size)
415{
416 sq->qid = qid;
417 sq->size = size;
418
419 ctrl->sqs[qid] = sq;
420}
421
422void nvmet_sq_destroy(struct nvmet_sq *sq)
423{
424 /*
425 * If this is the admin queue, complete all AERs so that our
426 * queue doesn't have outstanding requests on it.
427 */
428 if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq)
429 nvmet_async_events_free(sq->ctrl);
430 percpu_ref_kill(&sq->ref);
431 wait_for_completion(&sq->free_done);
432 percpu_ref_exit(&sq->ref);
433
434 if (sq->ctrl) {
435 nvmet_ctrl_put(sq->ctrl);
436 sq->ctrl = NULL; /* allows reusing the queue later */
437 }
438}
439EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
440
441static void nvmet_sq_free(struct percpu_ref *ref)
442{
443 struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
444
445 complete(&sq->free_done);
446}
447
448int nvmet_sq_init(struct nvmet_sq *sq)
449{
450 int ret;
451
452 ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
453 if (ret) {
454 pr_err("percpu_ref init failed!\n");
455 return ret;
456 }
457 init_completion(&sq->free_done);
458
459 return 0;
460}
461EXPORT_SYMBOL_GPL(nvmet_sq_init);
462
463bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
464 struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops)
465{
466 u8 flags = req->cmd->common.flags;
467 u16 status;
468
469 req->cq = cq;
470 req->sq = sq;
471 req->ops = ops;
472 req->sg = NULL;
473 req->sg_cnt = 0;
474 req->rsp->status = 0;
475
476 /* no support for fused commands yet */
477 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
478 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
479 goto fail;
480 }
481
482 /* either variant of SGLs is fine, as we don't support metadata */
483 if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF &&
484 (flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) {
485 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
486 goto fail;
487 }
488
489 if (unlikely(!req->sq->ctrl))
490 /* will return an error for any Non-connect command: */
491 status = nvmet_parse_connect_cmd(req);
492 else if (likely(req->sq->qid != 0))
493 status = nvmet_parse_io_cmd(req);
494 else if (req->cmd->common.opcode == nvme_fabrics_command)
495 status = nvmet_parse_fabrics_cmd(req);
496 else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
497 status = nvmet_parse_discovery_cmd(req);
498 else
499 status = nvmet_parse_admin_cmd(req);
500
501 if (status)
502 goto fail;
503
504 if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
505 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
506 goto fail;
507 }
508
509 return true;
510
511fail:
512 __nvmet_req_complete(req, status);
513 return false;
514}
515EXPORT_SYMBOL_GPL(nvmet_req_init);
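
Seen from a transport driver, the request lifecycle around nvmet_req_init() is: point req->cmd and req->rsp at the received capsule buffers, then call nvmet_req_init(); if it returns true, map req->data_len bytes of data into req->sg/req->sg_cnt and invoke req->execute(), whose handler eventually calls nvmet_req_complete() and thus ops->queue_response(). A hedged sketch of that flow (demo_ops and demo_map_data() are hypothetical transport-side names):

    /* per-command receive path of a hypothetical transport */
    static void demo_handle_cmd(struct nvmet_req *req, struct nvmet_cq *cq,
                    struct nvmet_sq *sq)
    {
            /* req->cmd and req->rsp must already point at the capsule buffers */
            if (!nvmet_req_init(req, cq, sq, &demo_ops))
                    return; /* failure was already completed via queue_response */

            if (req->data_len)
                    demo_map_data(req);     /* fill req->sg and req->sg_cnt */

            req->execute(req);      /* completes through nvmet_req_complete() */
    }
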
516
517static inline bool nvmet_cc_en(u32 cc)
518{
519 return cc & 0x1;
520}
521
522static inline u8 nvmet_cc_css(u32 cc)
523{
524 return (cc >> 4) & 0x7;
525}
526
527static inline u8 nvmet_cc_mps(u32 cc)
528{
529 return (cc >> 7) & 0xf;
530}
531
532static inline u8 nvmet_cc_ams(u32 cc)
533{
534 return (cc >> 11) & 0x7;
535}
536
537static inline u8 nvmet_cc_shn(u32 cc)
538{
539 return (cc >> 14) & 0x3;
540}
541
542static inline u8 nvmet_cc_iosqes(u32 cc)
543{
544 return (cc >> 16) & 0xf;
545}
546
547static inline u8 nvmet_cc_iocqes(u32 cc)
548{
549 return (cc >> 20) & 0xf;
550}
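
These accessors simply slice up the standard controller configuration (CC) register layout: EN in bit 0, CSS in bits 6:4, MPS in bits 10:7, AMS in bits 13:11, SHN in bits 15:14, IOSQES in bits 19:16 and IOCQES in bits 23:20. As a quick standalone check, the only CC value nvmet_start_ctrl() below accepts is one with IOSQES=6 and IOCQES=4 (64-byte SQEs, 16-byte CQEs) and the other fields zero:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* EN=1, IOSQES=6, IOCQES=4, everything else 0 */
            uint32_t cc = 1 | (6u << 16) | (4u << 20);

            printf("en=%u css=%u mps=%u ams=%u shn=%u iosqes=%u iocqes=%u\n",
                   cc & 0x1, (cc >> 4) & 0x7, (cc >> 7) & 0xf,
                   (cc >> 11) & 0x7, (cc >> 14) & 0x3,
                   (cc >> 16) & 0xf, (cc >> 20) & 0xf);
            return 0;
    }
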
551
552static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
553{
554 lockdep_assert_held(&ctrl->lock);
555
556 if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
557 nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
558 nvmet_cc_mps(ctrl->cc) != 0 ||
559 nvmet_cc_ams(ctrl->cc) != 0 ||
560 nvmet_cc_css(ctrl->cc) != 0) {
561 ctrl->csts = NVME_CSTS_CFS;
562 return;
563 }
564
565 ctrl->csts = NVME_CSTS_RDY;
566}
567
568static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
569{
570 lockdep_assert_held(&ctrl->lock);
571
572 /* XXX: tear down queues? */
573 ctrl->csts &= ~NVME_CSTS_RDY;
574 ctrl->cc = 0;
575}
576
577void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
578{
579 u32 old;
580
581 mutex_lock(&ctrl->lock);
582 old = ctrl->cc;
583 ctrl->cc = new;
584
585 if (nvmet_cc_en(new) && !nvmet_cc_en(old))
586 nvmet_start_ctrl(ctrl);
587 if (!nvmet_cc_en(new) && nvmet_cc_en(old))
588 nvmet_clear_ctrl(ctrl);
589 if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
590 nvmet_clear_ctrl(ctrl);
591 ctrl->csts |= NVME_CSTS_SHST_CMPLT;
592 }
593 if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
594 ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
595 mutex_unlock(&ctrl->lock);
596}
597
598static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
599{
600 /* command sets supported: NVMe command set: */
601 ctrl->cap = (1ULL << 37);
602 /* CC.EN timeout in 500msec units: */
603 ctrl->cap |= (15ULL << 24);
604 /* maximum queue entries supported: */
605 ctrl->cap |= NVMET_QUEUE_SIZE - 1;
606}
607
608u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
609 struct nvmet_req *req, struct nvmet_ctrl **ret)
610{
611 struct nvmet_subsys *subsys;
612 struct nvmet_ctrl *ctrl;
613 u16 status = 0;
614
615 subsys = nvmet_find_get_subsys(req->port, subsysnqn);
616 if (!subsys) {
617 pr_warn("connect request for invalid subsystem %s!\n",
618 subsysnqn);
619 req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
620 return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
621 }
622
623 mutex_lock(&subsys->lock);
624 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
625 if (ctrl->cntlid == cntlid) {
626 if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
627 pr_warn("hostnqn mismatch.\n");
628 continue;
629 }
630 if (!kref_get_unless_zero(&ctrl->ref))
631 continue;
632
633 *ret = ctrl;
634 goto out;
635 }
636 }
637
638 pr_warn("could not find controller %d for subsys %s / host %s\n",
639 cntlid, subsysnqn, hostnqn);
640 req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
641 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
642
643out:
644 mutex_unlock(&subsys->lock);
645 nvmet_subsys_put(subsys);
646 return status;
647}
648
649static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
650 const char *hostnqn)
651{
652 struct nvmet_host_link *p;
653
654 if (subsys->allow_any_host)
655 return true;
656
657 list_for_each_entry(p, &subsys->hosts, entry) {
658 if (!strcmp(nvmet_host_name(p->host), hostnqn))
659 return true;
660 }
661
662 return false;
663}
664
665static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
666 const char *hostnqn)
667{
668 struct nvmet_subsys_link *s;
669
670 list_for_each_entry(s, &req->port->subsystems, entry) {
671 if (__nvmet_host_allowed(s->subsys, hostnqn))
672 return true;
673 }
674
675 return false;
676}
677
678bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
679 const char *hostnqn)
680{
681 lockdep_assert_held(&nvmet_config_sem);
682
683 if (subsys->type == NVME_NQN_DISC)
684 return nvmet_host_discovery_allowed(req, hostnqn);
685 else
686 return __nvmet_host_allowed(subsys, hostnqn);
687}
688
689u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
690 struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
691{
692 struct nvmet_subsys *subsys;
693 struct nvmet_ctrl *ctrl;
694 int ret;
695 u16 status;
696
697 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
698 subsys = nvmet_find_get_subsys(req->port, subsysnqn);
699 if (!subsys) {
700 pr_warn("connect request for invalid subsystem %s!\n",
701 subsysnqn);
702 req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
703 goto out;
704 }
705
706 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
707 down_read(&nvmet_config_sem);
708 if (!nvmet_host_allowed(req, subsys, hostnqn)) {
709 pr_info("connect by host %s for subsystem %s not allowed\n",
710 hostnqn, subsysnqn);
711 req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn);
712 up_read(&nvmet_config_sem);
713 goto out_put_subsystem;
714 }
715 up_read(&nvmet_config_sem);
716
717 status = NVME_SC_INTERNAL;
718 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
719 if (!ctrl)
720 goto out_put_subsystem;
721 mutex_init(&ctrl->lock);
722
723 nvmet_init_cap(ctrl);
724
725 INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
726 INIT_LIST_HEAD(&ctrl->async_events);
727
728 memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
729 memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
730
731 kref_init(&ctrl->ref);
732 ctrl->subsys = subsys;
733
734 ctrl->cqs = kcalloc(subsys->max_qid + 1,
735 sizeof(struct nvmet_cq *),
736 GFP_KERNEL);
737 if (!ctrl->cqs)
738 goto out_free_ctrl;
739
740 ctrl->sqs = kcalloc(subsys->max_qid + 1,
741 sizeof(struct nvmet_sq *),
742 GFP_KERNEL);
743 if (!ctrl->sqs)
744 goto out_free_cqs;
745
746 ret = ida_simple_get(&subsys->cntlid_ida,
747 NVME_CNTLID_MIN, NVME_CNTLID_MAX,
748 GFP_KERNEL);
749 if (ret < 0) {
750 status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
751 goto out_free_sqs;
752 }
753 ctrl->cntlid = ret;
754
755 ctrl->ops = req->ops;
756 if (ctrl->subsys->type == NVME_NQN_DISC) {
757 /* Don't accept keep-alive timeout for discovery controllers */
758 if (kato) {
759 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
760 goto out_free_sqs;
761 }
762
763 /*
764 * Discovery controllers use some arbitrary high value in order
765		 * to clean up stale discovery sessions.
766 *
767 * From the latest base diff RC:
768 * "The Keep Alive command is not supported by
769 * Discovery controllers. A transport may specify a
770 * fixed Discovery controller activity timeout value
771 * (e.g., 2 minutes). If no commands are received
772 * by a Discovery controller within that time
773 * period, the controller may perform the
774 * actions for Keep Alive Timer expiration".
775 */
776 ctrl->kato = NVMET_DISC_KATO;
777 } else {
778 /* keep-alive timeout in seconds */
779 ctrl->kato = DIV_ROUND_UP(kato, 1000);
780 }
781 nvmet_start_keep_alive_timer(ctrl);
782
783 mutex_lock(&subsys->lock);
784 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
785 mutex_unlock(&subsys->lock);
786
787 *ctrlp = ctrl;
788 return 0;
789
790out_free_sqs:
791 kfree(ctrl->sqs);
792out_free_cqs:
793 kfree(ctrl->cqs);
794out_free_ctrl:
795 kfree(ctrl);
796out_put_subsystem:
797 nvmet_subsys_put(subsys);
798out:
799 return status;
800}
801
802static void nvmet_ctrl_free(struct kref *ref)
803{
804 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
805 struct nvmet_subsys *subsys = ctrl->subsys;
806
807 nvmet_stop_keep_alive_timer(ctrl);
808
809 mutex_lock(&subsys->lock);
810 list_del(&ctrl->subsys_entry);
811 mutex_unlock(&subsys->lock);
812
813 ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid);
814 nvmet_subsys_put(subsys);
815
816 kfree(ctrl->sqs);
817 kfree(ctrl->cqs);
818 kfree(ctrl);
819}
820
821void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
822{
823 kref_put(&ctrl->ref, nvmet_ctrl_free);
824}
825
826static void nvmet_fatal_error_handler(struct work_struct *work)
827{
828 struct nvmet_ctrl *ctrl =
829 container_of(work, struct nvmet_ctrl, fatal_err_work);
830
831 pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
832 ctrl->ops->delete_ctrl(ctrl);
833}
834
835void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
836{
837 ctrl->csts |= NVME_CSTS_CFS;
838 INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
839 schedule_work(&ctrl->fatal_err_work);
840}
841EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
842
843static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
844 const char *subsysnqn)
845{
846 struct nvmet_subsys_link *p;
847
848 if (!port)
849 return NULL;
850
851 if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn,
852 NVMF_NQN_SIZE)) {
853 if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
854 return NULL;
855 return nvmet_disc_subsys;
856 }
857
858 down_read(&nvmet_config_sem);
859 list_for_each_entry(p, &port->subsystems, entry) {
860 if (!strncmp(p->subsys->subsysnqn, subsysnqn,
861 NVMF_NQN_SIZE)) {
862 if (!kref_get_unless_zero(&p->subsys->ref))
863 break;
864 up_read(&nvmet_config_sem);
865 return p->subsys;
866 }
867 }
868 up_read(&nvmet_config_sem);
869 return NULL;
870}
871
872struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
873 enum nvme_subsys_type type)
874{
875 struct nvmet_subsys *subsys;
876
877 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
878 if (!subsys)
879 return NULL;
880
881 subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */
882
883 switch (type) {
884 case NVME_NQN_NVME:
885 subsys->max_qid = NVMET_NR_QUEUES;
886 break;
887 case NVME_NQN_DISC:
888 subsys->max_qid = 0;
889 break;
890 default:
891 pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
892 kfree(subsys);
893 return NULL;
894 }
895 subsys->type = type;
896 subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
897 GFP_KERNEL);
898 if (!subsys->subsysnqn) {
899 kfree(subsys);
900 return NULL;
901 }
902
903 kref_init(&subsys->ref);
904
905 mutex_init(&subsys->lock);
906 INIT_LIST_HEAD(&subsys->namespaces);
907 INIT_LIST_HEAD(&subsys->ctrls);
908
909 ida_init(&subsys->cntlid_ida);
910
911 INIT_LIST_HEAD(&subsys->hosts);
912
913 return subsys;
914}
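
The version set above uses the NVMe VS register encoding: major in bits 31:16, minor in bits 15:8 and the tertiary number in bits 7:0, so (1 << 16) | (2 << 8) | 1 reads back as 1.2.1, which the identify handlers later report to the host via id->ver. A tiny standalone decode:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t ver = (1 << 16) | (2 << 8) | 1;    /* as in nvmet_subsys_alloc() */

            /* prints "NVMe 1.2.1 (0x010201)" */
            printf("NVMe %u.%u.%u (0x%06x)\n",
                   (ver >> 16) & 0xffff, (ver >> 8) & 0xff, ver & 0xff, ver);
            return 0;
    }
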
915
916static void nvmet_subsys_free(struct kref *ref)
917{
918 struct nvmet_subsys *subsys =
919 container_of(ref, struct nvmet_subsys, ref);
920
921 WARN_ON_ONCE(!list_empty(&subsys->namespaces));
922
923 ida_destroy(&subsys->cntlid_ida);
924 kfree(subsys->subsysnqn);
925 kfree(subsys);
926}
927
928void nvmet_subsys_put(struct nvmet_subsys *subsys)
929{
930 kref_put(&subsys->ref, nvmet_subsys_free);
931}
932
933static int __init nvmet_init(void)
934{
935 int error;
936
937 error = nvmet_init_discovery();
938 if (error)
939 goto out;
940
941 error = nvmet_init_configfs();
942 if (error)
943 goto out_exit_discovery;
944 return 0;
945
946out_exit_discovery:
947 nvmet_exit_discovery();
948out:
949 return error;
950}
951
952static void __exit nvmet_exit(void)
953{
954 nvmet_exit_configfs();
955 nvmet_exit_discovery();
956
957 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
958 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
959}
960
961module_init(nvmet_init);
962module_exit(nvmet_exit);
963
964MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
new file mode 100644
index 000000000000..6f65646e89cf
--- /dev/null
+++ b/drivers/nvme/target/discovery.c
@@ -0,0 +1,221 @@
1/*
2 * Discovery service for the NVMe over Fabrics target.
3 * Copyright (C) 2016 Intel Corporation. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/slab.h>
16#include <generated/utsrelease.h>
17#include "nvmet.h"
18
19struct nvmet_subsys *nvmet_disc_subsys;
20
21u64 nvmet_genctr;
22
23void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
24{
25 down_write(&nvmet_config_sem);
26 if (list_empty(&port->entry)) {
27 list_add_tail(&port->entry, &parent->referrals);
28 port->enabled = true;
29 nvmet_genctr++;
30 }
31 up_write(&nvmet_config_sem);
32}
33
34void nvmet_referral_disable(struct nvmet_port *port)
35{
36 down_write(&nvmet_config_sem);
37 if (!list_empty(&port->entry)) {
38 port->enabled = false;
39 list_del_init(&port->entry);
40 nvmet_genctr++;
41 }
42 up_write(&nvmet_config_sem);
43}
44
45static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
46 struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec)
47{
48 struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec];
49
50 e->trtype = port->disc_addr.trtype;
51 e->adrfam = port->disc_addr.adrfam;
52 e->treq = port->disc_addr.treq;
53 e->portid = port->disc_addr.portid;
54 /* we support only dynamic controllers */
55 e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC);
56 e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH);
57 e->nqntype = type;
58 memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
59 memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
60 memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
61 memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
62}
63
64static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
65{
66 const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry);
67 struct nvmet_ctrl *ctrl = req->sq->ctrl;
68 struct nvmf_disc_rsp_page_hdr *hdr;
69 size_t data_len = nvmet_get_log_page_len(req->cmd);
70 size_t alloc_len = max(data_len, sizeof(*hdr));
71 int residual_len = data_len - sizeof(*hdr);
72 struct nvmet_subsys_link *p;
73 struct nvmet_port *r;
74 u32 numrec = 0;
75 u16 status = 0;
76
77 /*
78	 * Make sure we allocate at least a buffer the size of the response header.
79	 * If the host-provided data length is less than the header size, only
80	 * the number of bytes the host asked for will be sent back.
81 */
82 hdr = kzalloc(alloc_len, GFP_KERNEL);
83 if (!hdr) {
84 status = NVME_SC_INTERNAL;
85 goto out;
86 }
87
88 down_read(&nvmet_config_sem);
89 list_for_each_entry(p, &req->port->subsystems, entry) {
90 if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn))
91 continue;
92 if (residual_len >= entry_size) {
93 nvmet_format_discovery_entry(hdr, req->port,
94 p->subsys->subsysnqn,
95 NVME_NQN_NVME, numrec);
96 residual_len -= entry_size;
97 }
98 numrec++;
99 }
100
101 list_for_each_entry(r, &req->port->referrals, entry) {
102 if (residual_len >= entry_size) {
103 nvmet_format_discovery_entry(hdr, r,
104 NVME_DISC_SUBSYS_NAME,
105 NVME_NQN_DISC, numrec);
106 residual_len -= entry_size;
107 }
108 numrec++;
109 }
110
111 hdr->genctr = cpu_to_le64(nvmet_genctr);
112 hdr->numrec = cpu_to_le64(numrec);
113 hdr->recfmt = cpu_to_le16(0);
114
115 up_read(&nvmet_config_sem);
116
117 status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
118 kfree(hdr);
119out:
120 nvmet_req_complete(req, status);
121}
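
The sizing logic above leans on the fixed wire format checked by the BUILD_BUG_ON()s in core.c's nvmet_exit(): a 1024-byte discovery log header followed by 1024-byte entries. residual_len therefore determines how many complete entries can be copied into the buffer the host asked for, while numrec still counts every eligible record, so a host that supplied room for the header but not for all entries can read numrec and retry with a larger buffer. A small standalone illustration of that arithmetic (the data_len values are arbitrary examples):

    #include <stdio.h>

    #define DISC_HDR_SIZE   1024    /* sizeof(struct nvmf_disc_rsp_page_hdr) */
    #define DISC_ENTRY_SIZE 1024    /* sizeof(struct nvmf_disc_rsp_page_entry) */

    static long entries_that_fit(long data_len)
    {
            long residual_len = data_len - DISC_HDR_SIZE;

            return residual_len > 0 ? residual_len / DISC_ENTRY_SIZE : 0;
    }

    int main(void)
    {
            long lens[] = { 16, 1024, 4096 };
            int i;

            for (i = 0; i < 3; i++)
                    printf("data_len=%ld -> %ld full entries copied\n",
                           lens[i], entries_that_fit(lens[i]));
            return 0;
    }
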
122
123static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
124{
125 struct nvmet_ctrl *ctrl = req->sq->ctrl;
126 struct nvme_id_ctrl *id;
127 u16 status = 0;
128
129 id = kzalloc(sizeof(*id), GFP_KERNEL);
130 if (!id) {
131 status = NVME_SC_INTERNAL;
132 goto out;
133 }
134
135 memset(id->fr, ' ', sizeof(id->fr));
136 strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr));
137
138 /* no limit on data transfer sizes for now */
139 id->mdts = 0;
140 id->cntlid = cpu_to_le16(ctrl->cntlid);
141 id->ver = cpu_to_le32(ctrl->subsys->ver);
142 id->lpa = (1 << 2);
143
144	/* no enforced soft limit for maxcmd - pick an arbitrarily high value */
145 id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
146
147 id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
148 if (ctrl->ops->has_keyed_sgls)
149 id->sgls |= cpu_to_le32(1 << 2);
150 if (ctrl->ops->sqe_inline_size)
151 id->sgls |= cpu_to_le32(1 << 20);
152
153 strcpy(id->subnqn, ctrl->subsys->subsysnqn);
154
155 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
156
157 kfree(id);
158out:
159 nvmet_req_complete(req, status);
160}
161
162int nvmet_parse_discovery_cmd(struct nvmet_req *req)
163{
164 struct nvme_command *cmd = req->cmd;
165
166 req->ns = NULL;
167
168 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
169 pr_err("nvmet: got cmd %d while not ready\n",
170 cmd->common.opcode);
171 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
172 }
173
174 switch (cmd->common.opcode) {
175 case nvme_admin_get_log_page:
176 req->data_len = nvmet_get_log_page_len(cmd);
177
178 switch (cmd->get_log_page.lid) {
179 case NVME_LOG_DISC:
180 req->execute = nvmet_execute_get_disc_log_page;
181 return 0;
182 default:
183 pr_err("nvmet: unsupported get_log_page lid %d\n",
184 cmd->get_log_page.lid);
185 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
186 }
187 case nvme_admin_identify:
188 req->data_len = 4096;
189 switch (le32_to_cpu(cmd->identify.cns)) {
190 case 0x01:
191 req->execute =
192 nvmet_execute_identify_disc_ctrl;
193 return 0;
194 default:
195 pr_err("nvmet: unsupported identify cns %d\n",
196 le32_to_cpu(cmd->identify.cns));
197 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
198 }
199 default:
200 pr_err("nvmet: unsupported cmd %d\n",
201 cmd->common.opcode);
202 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
203 }
204
205 pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
206 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
207}
208
209int __init nvmet_init_discovery(void)
210{
211 nvmet_disc_subsys =
212 nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
213 if (!nvmet_disc_subsys)
214 return -ENOMEM;
215 return 0;
216}
217
218void nvmet_exit_discovery(void)
219{
220 nvmet_subsys_put(nvmet_disc_subsys);
221}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
new file mode 100644
index 000000000000..9a97ae67e656
--- /dev/null
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -0,0 +1,240 @@
1/*
2 * NVMe Fabrics command implementation.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/blkdev.h>
16#include "nvmet.h"
17
18static void nvmet_execute_prop_set(struct nvmet_req *req)
19{
20 u16 status = 0;
21
22 if (!(req->cmd->prop_set.attrib & 1)) {
23 u64 val = le64_to_cpu(req->cmd->prop_set.value);
24
25 switch (le32_to_cpu(req->cmd->prop_set.offset)) {
26 case NVME_REG_CC:
27 nvmet_update_cc(req->sq->ctrl, val);
28 break;
29 default:
30 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
31 break;
32 }
33 } else {
34 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
35 }
36
37 nvmet_req_complete(req, status);
38}
39
40static void nvmet_execute_prop_get(struct nvmet_req *req)
41{
42 struct nvmet_ctrl *ctrl = req->sq->ctrl;
43 u16 status = 0;
44 u64 val = 0;
45
46 if (req->cmd->prop_get.attrib & 1) {
47 switch (le32_to_cpu(req->cmd->prop_get.offset)) {
48 case NVME_REG_CAP:
49 val = ctrl->cap;
50 break;
51 default:
52 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
53 break;
54 }
55 } else {
56 switch (le32_to_cpu(req->cmd->prop_get.offset)) {
57 case NVME_REG_VS:
58 val = ctrl->subsys->ver;
59 break;
60 case NVME_REG_CC:
61 val = ctrl->cc;
62 break;
63 case NVME_REG_CSTS:
64 val = ctrl->csts;
65 break;
66 default:
67 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
68 break;
69 }
70 }
71
72 req->rsp->result64 = cpu_to_le64(val);
73 nvmet_req_complete(req, status);
74}
75
76int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
77{
78 struct nvme_command *cmd = req->cmd;
79
80 req->ns = NULL;
81
82 switch (cmd->fabrics.fctype) {
83 case nvme_fabrics_type_property_set:
84 req->data_len = 0;
85 req->execute = nvmet_execute_prop_set;
86 break;
87 case nvme_fabrics_type_property_get:
88 req->data_len = 0;
89 req->execute = nvmet_execute_prop_get;
90 break;
91 default:
92 pr_err("received unknown capsule type 0x%x\n",
93 cmd->fabrics.fctype);
94 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
95 }
96
97 return 0;
98}
99
100static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
101{
102 struct nvmf_connect_command *c = &req->cmd->connect;
103 u16 qid = le16_to_cpu(c->qid);
104 u16 sqsize = le16_to_cpu(c->sqsize);
105 struct nvmet_ctrl *old;
106
107 old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
108 if (old) {
109 pr_warn("queue already connected!\n");
110 return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
111 }
112
113 nvmet_cq_setup(ctrl, req->cq, qid, sqsize);
114 nvmet_sq_setup(ctrl, req->sq, qid, sqsize);
115 return 0;
116}
117
118static void nvmet_execute_admin_connect(struct nvmet_req *req)
119{
120 struct nvmf_connect_command *c = &req->cmd->connect;
121 struct nvmf_connect_data *d;
122 struct nvmet_ctrl *ctrl = NULL;
123 u16 status = 0;
124
125 d = kmap(sg_page(req->sg)) + req->sg->offset;
126
127 /* zero out initial completion result, assign values as needed */
128 req->rsp->result = 0;
129
130 if (c->recfmt != 0) {
131 pr_warn("invalid connect version (%d).\n",
132 le16_to_cpu(c->recfmt));
133 status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
134 goto out;
135 }
136
137 if (unlikely(d->cntlid != cpu_to_le16(0xffff))) {
138 pr_warn("connect attempt for invalid controller ID %#x\n",
139 d->cntlid);
140 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
141 req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
142 goto out;
143 }
144
145 status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
146 le32_to_cpu(c->kato), &ctrl);
147 if (status)
148 goto out;
149
150 status = nvmet_install_queue(ctrl, req);
151 if (status) {
152 nvmet_ctrl_put(ctrl);
153 goto out;
154 }
155
156 pr_info("creating controller %d for NQN %s.\n",
157 ctrl->cntlid, ctrl->hostnqn);
158 req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
159
160out:
161 kunmap(sg_page(req->sg));
162 nvmet_req_complete(req, status);
163}
164
165static void nvmet_execute_io_connect(struct nvmet_req *req)
166{
167 struct nvmf_connect_command *c = &req->cmd->connect;
168 struct nvmf_connect_data *d;
169 struct nvmet_ctrl *ctrl = NULL;
170 u16 qid = le16_to_cpu(c->qid);
171 u16 status = 0;
172
173 d = kmap(sg_page(req->sg)) + req->sg->offset;
174
175 /* zero out initial completion result, assign values as needed */
176 req->rsp->result = 0;
177
178 if (c->recfmt != 0) {
179 pr_warn("invalid connect version (%d).\n",
180 le16_to_cpu(c->recfmt));
181 status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
182 goto out;
183 }
184
185 status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
186 le16_to_cpu(d->cntlid),
187 req, &ctrl);
188 if (status)
189 goto out;
190
191 if (unlikely(qid > ctrl->subsys->max_qid)) {
192 pr_warn("invalid queue id (%d)\n", qid);
193 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
194 req->rsp->result = IPO_IATTR_CONNECT_SQE(qid);
195 goto out_ctrl_put;
196 }
197
198 status = nvmet_install_queue(ctrl, req);
199 if (status) {
200		/* pass back the cntlid for which installing the queue failed */
201 req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
202 goto out_ctrl_put;
203 }
204
205 pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
206
207out:
208 kunmap(sg_page(req->sg));
209 nvmet_req_complete(req, status);
210 return;
211
212out_ctrl_put:
213 nvmet_ctrl_put(ctrl);
214 goto out;
215}
216
217int nvmet_parse_connect_cmd(struct nvmet_req *req)
218{
219 struct nvme_command *cmd = req->cmd;
220
221 req->ns = NULL;
222
223 if (req->cmd->common.opcode != nvme_fabrics_command) {
224 pr_err("invalid command 0x%x on unconnected queue.\n",
225 cmd->fabrics.opcode);
226 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
227 }
228 if (cmd->fabrics.fctype != nvme_fabrics_type_connect) {
229 pr_err("invalid capsule type 0x%x on unconnected queue.\n",
230 cmd->fabrics.fctype);
231 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
232 }
233
234 req->data_len = sizeof(struct nvmf_connect_data);
235 if (cmd->connect.qid == 0)
236 req->execute = nvmet_execute_admin_connect;
237 else
238 req->execute = nvmet_execute_io_connect;
239 return 0;
240}
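
The Connect handling above turns on two validations: the record format (recfmt) must be zero, and an admin Connect must carry the reserved controller ID 0xffff so the target allocates one; on a bad parameter the completion result encodes where in the Connect data the offence sits. The following stand-alone sketch mirrors that validation logic with a simplified, illustrative struct layout and status values (they are assumptions, not the kernel definitions):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the on-the-wire Connect data (assumption,
 * not the kernel's struct nvmf_connect_data). */
struct connect_data {
	uint8_t  hostid[16];
	uint16_t cntlid;		/* 0xffff asks for dynamic allocation */
	char     subsysnqn[256];
	char     hostnqn[256];
};

#define SC_CONNECT_INVALID_PARAM 0x182	/* illustrative status values */
#define SC_CONNECT_FORMAT        0x180
#define SC_DNR                   0x4000

/* Mirrors the IPO_IATTR_CONNECT_DATA() idea: bit 16 flags "offset is in
 * the data portion", the low 16 bits carry the offending field offset. */
#define CONNECT_DATA_ERR(field) \
	((1u << 16) | (uint32_t)offsetof(struct connect_data, field))

static int validate_admin_connect(uint16_t recfmt,
				  const struct connect_data *d,
				  uint16_t *status, uint32_t *result)
{
	if (recfmt != 0) {
		*status = SC_CONNECT_FORMAT | SC_DNR;
		return -1;
	}
	if (d->cntlid != 0xffff) {
		*status = SC_CONNECT_INVALID_PARAM | SC_DNR;
		*result = CONNECT_DATA_ERR(cntlid);	/* point at cntlid */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct connect_data d = { .cntlid = 0x0001 };
	uint16_t status = 0;
	uint32_t result = 0;

	if (validate_admin_connect(0, &d, &status, &result))
		printf("rejected: status 0x%x, result 0x%x (offset %u)\n",
		       (unsigned)status, (unsigned)result, result & 0xffffu);
	return 0;
}
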
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
new file mode 100644
index 000000000000..2cd069b691ae
--- /dev/null
+++ b/drivers/nvme/target/io-cmd.c
@@ -0,0 +1,215 @@
1/*
2 * NVMe I/O command implementation.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/blkdev.h>
16#include <linux/module.h>
17#include "nvmet.h"
18
19static void nvmet_bio_done(struct bio *bio)
20{
21 struct nvmet_req *req = bio->bi_private;
22
23 nvmet_req_complete(req,
24 bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
25
26 if (bio != &req->inline_bio)
27 bio_put(bio);
28}
29
30static inline u32 nvmet_rw_len(struct nvmet_req *req)
31{
32 return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
33 req->ns->blksize_shift;
34}
35
36static void nvmet_inline_bio_init(struct nvmet_req *req)
37{
38 struct bio *bio = &req->inline_bio;
39
40 bio_init(bio);
41 bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC;
42 bio->bi_io_vec = req->inline_bvec;
43}
44
45static void nvmet_execute_rw(struct nvmet_req *req)
46{
47 int sg_cnt = req->sg_cnt;
48 struct scatterlist *sg;
49 struct bio *bio;
50 sector_t sector;
51 blk_qc_t cookie;
52 int op, op_flags = 0, i;
53
54 if (!req->sg_cnt) {
55 nvmet_req_complete(req, 0);
56 return;
57 }
58
59 if (req->cmd->rw.opcode == nvme_cmd_write) {
60 op = REQ_OP_WRITE;
61 if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
62 op_flags |= REQ_FUA;
63 } else {
64 op = REQ_OP_READ;
65 }
66
67 sector = le64_to_cpu(req->cmd->rw.slba);
68 sector <<= (req->ns->blksize_shift - 9);
69
70 nvmet_inline_bio_init(req);
71 bio = &req->inline_bio;
72 bio->bi_bdev = req->ns->bdev;
73 bio->bi_iter.bi_sector = sector;
74 bio->bi_private = req;
75 bio->bi_end_io = nvmet_bio_done;
76 bio_set_op_attrs(bio, op, op_flags);
77
78 for_each_sg(req->sg, sg, req->sg_cnt, i) {
79 while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
80 != sg->length) {
81 struct bio *prev = bio;
82
83 bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
84 bio->bi_bdev = req->ns->bdev;
85 bio->bi_iter.bi_sector = sector;
86 bio_set_op_attrs(bio, op, op_flags);
87
88 bio_chain(bio, prev);
89 cookie = submit_bio(prev);
90 }
91
92 sector += sg->length >> 9;
93 sg_cnt--;
94 }
95
96 cookie = submit_bio(bio);
97
98 blk_poll(bdev_get_queue(req->ns->bdev), cookie);
99}
100
101static void nvmet_execute_flush(struct nvmet_req *req)
102{
103 struct bio *bio;
104
105 nvmet_inline_bio_init(req);
106 bio = &req->inline_bio;
107
108 bio->bi_bdev = req->ns->bdev;
109 bio->bi_private = req;
110 bio->bi_end_io = nvmet_bio_done;
111 bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
112
113 submit_bio(bio);
114}
115
116static u16 nvmet_discard_range(struct nvmet_ns *ns,
117 struct nvme_dsm_range *range, struct bio **bio)
118{
119 if (__blkdev_issue_discard(ns->bdev,
120 le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
121 le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
122 GFP_KERNEL, 0, bio))
123 return NVME_SC_INTERNAL | NVME_SC_DNR;
124 return 0;
125}
126
127static void nvmet_execute_discard(struct nvmet_req *req)
128{
129 struct nvme_dsm_range range;
130 struct bio *bio = NULL;
131 int i;
132 u16 status;
133
134 for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
135 status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
136 sizeof(range));
137 if (status)
138 break;
139
140 status = nvmet_discard_range(req->ns, &range, &bio);
141 if (status)
142 break;
143 }
144
145 if (bio) {
146 bio->bi_private = req;
147 bio->bi_end_io = nvmet_bio_done;
148 if (status) {
149 bio->bi_error = -EIO;
150 bio_endio(bio);
151 } else {
152 submit_bio(bio);
153 }
154 } else {
155 nvmet_req_complete(req, status);
156 }
157}
158
159static void nvmet_execute_dsm(struct nvmet_req *req)
160{
161 switch (le32_to_cpu(req->cmd->dsm.attributes)) {
162 case NVME_DSMGMT_AD:
163 nvmet_execute_discard(req);
164 return;
165 case NVME_DSMGMT_IDR:
166 case NVME_DSMGMT_IDW:
167 default:
168 /* Not supported yet */
169 nvmet_req_complete(req, 0);
170 return;
171 }
172}
173
174int nvmet_parse_io_cmd(struct nvmet_req *req)
175{
176 struct nvme_command *cmd = req->cmd;
177
178 if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
179 pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
180 cmd->common.opcode);
181 req->ns = NULL;
182 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
183 }
184
185 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
186 pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
187 cmd->common.opcode);
188 req->ns = NULL;
189 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
190 }
191
192 req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
193 if (!req->ns)
194 return NVME_SC_INVALID_NS | NVME_SC_DNR;
195
196 switch (cmd->common.opcode) {
197 case nvme_cmd_read:
198 case nvme_cmd_write:
199 req->execute = nvmet_execute_rw;
200 req->data_len = nvmet_rw_len(req);
201 return 0;
202 case nvme_cmd_flush:
203 req->execute = nvmet_execute_flush;
204 req->data_len = 0;
205 return 0;
206 case nvme_cmd_dsm:
207 req->execute = nvmet_execute_dsm;
208 req->data_len = le32_to_cpu(cmd->dsm.nr) *
209 sizeof(struct nvme_dsm_range);
210 return 0;
211 default:
212 pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
213 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
214 }
215}
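
A note on the size arithmetic in nvmet_rw_len() and nvmet_execute_rw() above: the NVMe NLB field is zero-based, so one is added before scaling by the namespace block size, and the starting LBA is rescaled to the block layer's fixed 512-byte sectors. A small self-contained sketch of the same conversions (the helper names here are illustrative, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Byte length of a read/write: NLB is zero-based, so add one,
 * then scale by the namespace block size (1 << blksize_shift). */
static uint32_t rw_len_bytes(uint16_t nlb, uint8_t blksize_shift)
{
	return ((uint32_t)nlb + 1) << blksize_shift;
}

/* Starting 512-byte sector: rescale the namespace LBA to the block
 * layer's fixed 512-byte sector unit. */
static uint64_t start_sector(uint64_t slba, uint8_t blksize_shift)
{
	return slba << (blksize_shift - 9);
}

int main(void)
{
	/* 4KiB namespace (blksize_shift = 12), 8 blocks starting at LBA 10 */
	printf("%u bytes from sector %llu\n",
	       rw_len_bytes(7, 12),
	       (unsigned long long)start_sector(10, 12));
	/* -> 32768 bytes from sector 80 */
	return 0;
}
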
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
new file mode 100644
index 000000000000..94e782987cc9
--- /dev/null
+++ b/drivers/nvme/target/loop.c
@@ -0,0 +1,754 @@
1/*
2 * NVMe over Fabrics loopback device.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/scatterlist.h>
16#include <linux/delay.h>
17#include <linux/blk-mq.h>
18#include <linux/nvme.h>
19#include <linux/module.h>
20#include <linux/parser.h>
21#include <linux/t10-pi.h>
22#include "nvmet.h"
23#include "../host/nvme.h"
24#include "../host/fabrics.h"
25
26#define NVME_LOOP_AQ_DEPTH 256
27
28#define NVME_LOOP_MAX_SEGMENTS 256
29
30/*
31 * We handle AEN commands ourselves and don't even let the
32 * block layer know about them.
33 */
34#define NVME_LOOP_NR_AEN_COMMANDS 1
35#define NVME_LOOP_AQ_BLKMQ_DEPTH \
36 (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS)
37
38struct nvme_loop_iod {
39 struct nvme_command cmd;
40 struct nvme_completion rsp;
41 struct nvmet_req req;
42 struct nvme_loop_queue *queue;
43 struct work_struct work;
44 struct sg_table sg_table;
45 struct scatterlist first_sgl[];
46};
47
48struct nvme_loop_ctrl {
49 spinlock_t lock;
50 struct nvme_loop_queue *queues;
51 u32 queue_count;
52
53 struct blk_mq_tag_set admin_tag_set;
54
55 struct list_head list;
56 u64 cap;
57 struct blk_mq_tag_set tag_set;
58 struct nvme_loop_iod async_event_iod;
59 struct nvme_ctrl ctrl;
60
61 struct nvmet_ctrl *target_ctrl;
62 struct work_struct delete_work;
63 struct work_struct reset_work;
64};
65
66static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
67{
68 return container_of(ctrl, struct nvme_loop_ctrl, ctrl);
69}
70
71struct nvme_loop_queue {
72 struct nvmet_cq nvme_cq;
73 struct nvmet_sq nvme_sq;
74 struct nvme_loop_ctrl *ctrl;
75};
76
77static struct nvmet_port *nvmet_loop_port;
78
79static LIST_HEAD(nvme_loop_ctrl_list);
80static DEFINE_MUTEX(nvme_loop_ctrl_mutex);
81
82static void nvme_loop_queue_response(struct nvmet_req *nvme_req);
83static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl);
84
85static struct nvmet_fabrics_ops nvme_loop_ops;
86
87static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
88{
89 return queue - queue->ctrl->queues;
90}
91
92static void nvme_loop_complete_rq(struct request *req)
93{
94 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
95 int error = 0;
96
97 nvme_cleanup_cmd(req);
98 sg_free_table_chained(&iod->sg_table, true);
99
100 if (unlikely(req->errors)) {
101 if (nvme_req_needs_retry(req, req->errors)) {
102 nvme_requeue_req(req);
103 return;
104 }
105
106 if (req->cmd_type == REQ_TYPE_DRV_PRIV)
107 error = req->errors;
108 else
109 error = nvme_error_status(req->errors);
110 }
111
112 blk_mq_end_request(req, error);
113}
114
115static void nvme_loop_queue_response(struct nvmet_req *nvme_req)
116{
117 struct nvme_loop_iod *iod =
118 container_of(nvme_req, struct nvme_loop_iod, req);
119 struct nvme_completion *cqe = &iod->rsp;
120
121 /*
122 * AEN requests are special as they don't time out and can
123 * survive any kind of queue freeze and often don't respond to
124 * aborts. We don't even bother to allocate a struct request
125 * for them but rather special case them here.
126 */
127 if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
128 cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
129 nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe);
130 } else {
131 struct request *req = blk_mq_rq_from_pdu(iod);
132
133 if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
134 memcpy(req->special, cqe, sizeof(*cqe));
135 blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1);
136 }
137}
138
139static void nvme_loop_execute_work(struct work_struct *work)
140{
141 struct nvme_loop_iod *iod =
142 container_of(work, struct nvme_loop_iod, work);
143
144 iod->req.execute(&iod->req);
145}
146
147static enum blk_eh_timer_return
148nvme_loop_timeout(struct request *rq, bool reserved)
149{
150 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq);
151
152 /* queue error recovery */
153 schedule_work(&iod->queue->ctrl->reset_work);
154
155 /* fail with DNR on admin cmd timeout */
156 rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
157
158 return BLK_EH_HANDLED;
159}
160
161static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
162 const struct blk_mq_queue_data *bd)
163{
164 struct nvme_ns *ns = hctx->queue->queuedata;
165 struct nvme_loop_queue *queue = hctx->driver_data;
166 struct request *req = bd->rq;
167 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
168 int ret;
169
170 ret = nvme_setup_cmd(ns, req, &iod->cmd);
171 if (ret)
172 return ret;
173
174 iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
175 iod->req.port = nvmet_loop_port;
176 if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
177 &queue->nvme_sq, &nvme_loop_ops)) {
178 nvme_cleanup_cmd(req);
179 blk_mq_start_request(req);
180 nvme_loop_queue_response(&iod->req);
181 return 0;
182 }
183
184 if (blk_rq_bytes(req)) {
185 iod->sg_table.sgl = iod->first_sgl;
186 ret = sg_alloc_table_chained(&iod->sg_table,
187 req->nr_phys_segments, iod->sg_table.sgl);
188 if (ret)
189 return BLK_MQ_RQ_QUEUE_BUSY;
190
191 iod->req.sg = iod->sg_table.sgl;
192 iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
193 BUG_ON(iod->req.sg_cnt > req->nr_phys_segments);
194 }
195
196 iod->cmd.common.command_id = req->tag;
197 blk_mq_start_request(req);
198
199 schedule_work(&iod->work);
200 return 0;
201}
202
203static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
204{
205 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg);
206 struct nvme_loop_queue *queue = &ctrl->queues[0];
207 struct nvme_loop_iod *iod = &ctrl->async_event_iod;
208
209 memset(&iod->cmd, 0, sizeof(iod->cmd));
210 iod->cmd.common.opcode = nvme_admin_async_event;
211 iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH;
212 iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
213
214 if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
215 &nvme_loop_ops)) {
216 dev_err(ctrl->ctrl.device, "failed async event work\n");
217 return;
218 }
219
220 schedule_work(&iod->work);
221}
222
223static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl,
224 struct nvme_loop_iod *iod, unsigned int queue_idx)
225{
226 BUG_ON(queue_idx >= ctrl->queue_count);
227
228 iod->req.cmd = &iod->cmd;
229 iod->req.rsp = &iod->rsp;
230 iod->queue = &ctrl->queues[queue_idx];
231 INIT_WORK(&iod->work, nvme_loop_execute_work);
232 return 0;
233}
234
235static int nvme_loop_init_request(void *data, struct request *req,
236 unsigned int hctx_idx, unsigned int rq_idx,
237 unsigned int numa_node)
238{
239 return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1);
240}
241
242static int nvme_loop_init_admin_request(void *data, struct request *req,
243 unsigned int hctx_idx, unsigned int rq_idx,
244 unsigned int numa_node)
245{
246 return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0);
247}
248
249static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
250 unsigned int hctx_idx)
251{
252 struct nvme_loop_ctrl *ctrl = data;
253 struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1];
254
255 BUG_ON(hctx_idx >= ctrl->queue_count);
256
257 hctx->driver_data = queue;
258 return 0;
259}
260
261static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
262 unsigned int hctx_idx)
263{
264 struct nvme_loop_ctrl *ctrl = data;
265 struct nvme_loop_queue *queue = &ctrl->queues[0];
266
267 BUG_ON(hctx_idx != 0);
268
269 hctx->driver_data = queue;
270 return 0;
271}
272
273static struct blk_mq_ops nvme_loop_mq_ops = {
274 .queue_rq = nvme_loop_queue_rq,
275 .complete = nvme_loop_complete_rq,
276 .map_queue = blk_mq_map_queue,
277 .init_request = nvme_loop_init_request,
278 .init_hctx = nvme_loop_init_hctx,
279 .timeout = nvme_loop_timeout,
280};
281
282static struct blk_mq_ops nvme_loop_admin_mq_ops = {
283 .queue_rq = nvme_loop_queue_rq,
284 .complete = nvme_loop_complete_rq,
285 .map_queue = blk_mq_map_queue,
286 .init_request = nvme_loop_init_admin_request,
287 .init_hctx = nvme_loop_init_admin_hctx,
288 .timeout = nvme_loop_timeout,
289};
290
291static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
292{
293 blk_cleanup_queue(ctrl->ctrl.admin_q);
294 blk_mq_free_tag_set(&ctrl->admin_tag_set);
295 nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
296}
297
298static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
299{
300 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
301
302 if (list_empty(&ctrl->list))
303 goto free_ctrl;
304
305 mutex_lock(&nvme_loop_ctrl_mutex);
306 list_del(&ctrl->list);
307 mutex_unlock(&nvme_loop_ctrl_mutex);
308
309 if (nctrl->tagset) {
310 blk_cleanup_queue(ctrl->ctrl.connect_q);
311 blk_mq_free_tag_set(&ctrl->tag_set);
312 }
313 kfree(ctrl->queues);
314 nvmf_free_options(nctrl->opts);
315free_ctrl:
316 kfree(ctrl);
317}
318
319static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
320{
321 int error;
322
323 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
324 ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
325 ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH;
326 ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
327 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
328 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
329 SG_CHUNK_SIZE * sizeof(struct scatterlist);
330 ctrl->admin_tag_set.driver_data = ctrl;
331 ctrl->admin_tag_set.nr_hw_queues = 1;
332 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
333
334 ctrl->queues[0].ctrl = ctrl;
335 error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
336 if (error)
337 return error;
338 ctrl->queue_count = 1;
339
340 error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
341 if (error)
342 goto out_free_sq;
343
344 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
345 if (IS_ERR(ctrl->ctrl.admin_q)) {
346 error = PTR_ERR(ctrl->ctrl.admin_q);
347 goto out_free_tagset;
348 }
349
350 error = nvmf_connect_admin_queue(&ctrl->ctrl);
351 if (error)
352 goto out_cleanup_queue;
353
354 error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
355 if (error) {
356 dev_err(ctrl->ctrl.device,
357 "prop_get NVME_REG_CAP failed\n");
358 goto out_cleanup_queue;
359 }
360
361 ctrl->ctrl.sqsize =
362 min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
363
364 error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
365 if (error)
366 goto out_cleanup_queue;
367
368 ctrl->ctrl.max_hw_sectors =
369 (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
370
371 error = nvme_init_identify(&ctrl->ctrl);
372 if (error)
373 goto out_cleanup_queue;
374
375 nvme_start_keep_alive(&ctrl->ctrl);
376
377 return 0;
378
379out_cleanup_queue:
380 blk_cleanup_queue(ctrl->ctrl.admin_q);
381out_free_tagset:
382 blk_mq_free_tag_set(&ctrl->admin_tag_set);
383out_free_sq:
384 nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
385 return error;
386}
387
388static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
389{
390 int i;
391
392 nvme_stop_keep_alive(&ctrl->ctrl);
393
394 if (ctrl->queue_count > 1) {
395 nvme_stop_queues(&ctrl->ctrl);
396 blk_mq_tagset_busy_iter(&ctrl->tag_set,
397 nvme_cancel_request, &ctrl->ctrl);
398
399 for (i = 1; i < ctrl->queue_count; i++)
400 nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
401 }
402
403 if (ctrl->ctrl.state == NVME_CTRL_LIVE)
404 nvme_shutdown_ctrl(&ctrl->ctrl);
405
406 blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
407 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
408 nvme_cancel_request, &ctrl->ctrl);
409 nvme_loop_destroy_admin_queue(ctrl);
410}
411
412static void nvme_loop_del_ctrl_work(struct work_struct *work)
413{
414 struct nvme_loop_ctrl *ctrl = container_of(work,
415 struct nvme_loop_ctrl, delete_work);
416
417 nvme_remove_namespaces(&ctrl->ctrl);
418 nvme_loop_shutdown_ctrl(ctrl);
419 nvme_uninit_ctrl(&ctrl->ctrl);
420 nvme_put_ctrl(&ctrl->ctrl);
421}
422
423static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl)
424{
425 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
426 return -EBUSY;
427
428 if (!schedule_work(&ctrl->delete_work))
429 return -EBUSY;
430
431 return 0;
432}
433
434static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl)
435{
436 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
437 int ret;
438
439 ret = __nvme_loop_del_ctrl(ctrl);
440 if (ret)
441 return ret;
442
443 flush_work(&ctrl->delete_work);
444
445 return 0;
446}
447
448static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
449{
450 struct nvme_loop_ctrl *ctrl;
451
452 mutex_lock(&nvme_loop_ctrl_mutex);
453 list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) {
454 if (ctrl->ctrl.cntlid == nctrl->cntlid)
455 __nvme_loop_del_ctrl(ctrl);
456 }
457 mutex_unlock(&nvme_loop_ctrl_mutex);
458}
459
460static void nvme_loop_reset_ctrl_work(struct work_struct *work)
461{
462 struct nvme_loop_ctrl *ctrl = container_of(work,
463 struct nvme_loop_ctrl, reset_work);
464 bool changed;
465 int i, ret;
466
467 nvme_loop_shutdown_ctrl(ctrl);
468
469 ret = nvme_loop_configure_admin_queue(ctrl);
470 if (ret)
471 goto out_disable;
472
473 for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) {
474 ctrl->queues[i].ctrl = ctrl;
475 ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
476 if (ret)
477 goto out_free_queues;
478
479 ctrl->queue_count++;
480 }
481
482 for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) {
483 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
484 if (ret)
485 goto out_free_queues;
486 }
487
488 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
489 WARN_ON_ONCE(!changed);
490
491 nvme_queue_scan(&ctrl->ctrl);
492 nvme_queue_async_events(&ctrl->ctrl);
493
494 nvme_start_queues(&ctrl->ctrl);
495
496 return;
497
498out_free_queues:
499 for (i = 1; i < ctrl->queue_count; i++)
500 nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
501 nvme_loop_destroy_admin_queue(ctrl);
502out_disable:
503 dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
504 nvme_remove_namespaces(&ctrl->ctrl);
505 nvme_uninit_ctrl(&ctrl->ctrl);
506 nvme_put_ctrl(&ctrl->ctrl);
507}
508
509static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl)
510{
511 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
512
513 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
514 return -EBUSY;
515
516 if (!schedule_work(&ctrl->reset_work))
517 return -EBUSY;
518
519 flush_work(&ctrl->reset_work);
520
521 return 0;
522}
523
524static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
525 .name = "loop",
526 .module = THIS_MODULE,
527 .is_fabrics = true,
528 .reg_read32 = nvmf_reg_read32,
529 .reg_read64 = nvmf_reg_read64,
530 .reg_write32 = nvmf_reg_write32,
531 .reset_ctrl = nvme_loop_reset_ctrl,
532 .free_ctrl = nvme_loop_free_ctrl,
533 .submit_async_event = nvme_loop_submit_async_event,
534 .delete_ctrl = nvme_loop_del_ctrl,
535 .get_subsysnqn = nvmf_get_subsysnqn,
536};
537
538static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
539{
540 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
541 int ret, i;
542
543 ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
544 if (ret || !opts->nr_io_queues)
545 return ret;
546
547 dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
548 opts->nr_io_queues);
549
550 for (i = 1; i <= opts->nr_io_queues; i++) {
551 ctrl->queues[i].ctrl = ctrl;
552 ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
553 if (ret)
554 goto out_destroy_queues;
555
556 ctrl->queue_count++;
557 }
558
559 memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
560 ctrl->tag_set.ops = &nvme_loop_mq_ops;
561 ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize;
562 ctrl->tag_set.reserved_tags = 1; /* fabric connect */
563 ctrl->tag_set.numa_node = NUMA_NO_NODE;
564 ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
565 ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
566 SG_CHUNK_SIZE * sizeof(struct scatterlist);
567 ctrl->tag_set.driver_data = ctrl;
568 ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
569 ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
570 ctrl->ctrl.tagset = &ctrl->tag_set;
571
572 ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
573 if (ret)
574 goto out_destroy_queues;
575
576 ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
577 if (IS_ERR(ctrl->ctrl.connect_q)) {
578 ret = PTR_ERR(ctrl->ctrl.connect_q);
579 goto out_free_tagset;
580 }
581
582 for (i = 1; i <= opts->nr_io_queues; i++) {
583 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
584 if (ret)
585 goto out_cleanup_connect_q;
586 }
587
588 return 0;
589
590out_cleanup_connect_q:
591 blk_cleanup_queue(ctrl->ctrl.connect_q);
592out_free_tagset:
593 blk_mq_free_tag_set(&ctrl->tag_set);
594out_destroy_queues:
595 for (i = 1; i < ctrl->queue_count; i++)
596 nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
597 return ret;
598}
599
600static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
601 struct nvmf_ctrl_options *opts)
602{
603 struct nvme_loop_ctrl *ctrl;
604 bool changed;
605 int ret;
606
607 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
608 if (!ctrl)
609 return ERR_PTR(-ENOMEM);
610 ctrl->ctrl.opts = opts;
611 INIT_LIST_HEAD(&ctrl->list);
612
613 INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work);
614 INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work);
615
616 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
617 0 /* no quirks, we're perfect! */);
618 if (ret)
619 goto out_put_ctrl;
620
621 spin_lock_init(&ctrl->lock);
622
623 ret = -ENOMEM;
624
625 ctrl->ctrl.sqsize = opts->queue_size;
626 ctrl->ctrl.kato = opts->kato;
627
628 ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
629 GFP_KERNEL);
630 if (!ctrl->queues)
631 goto out_uninit_ctrl;
632
633 ret = nvme_loop_configure_admin_queue(ctrl);
634 if (ret)
635 goto out_free_queues;
636
637 if (opts->queue_size > ctrl->ctrl.maxcmd) {
638 /* warn if maxcmd is lower than queue_size */
639 dev_warn(ctrl->ctrl.device,
640 "queue_size %zu > ctrl maxcmd %u, clamping down\n",
641 opts->queue_size, ctrl->ctrl.maxcmd);
642 opts->queue_size = ctrl->ctrl.maxcmd;
643 }
644
645 if (opts->nr_io_queues) {
646 ret = nvme_loop_create_io_queues(ctrl);
647 if (ret)
648 goto out_remove_admin_queue;
649 }
650
651 nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0);
652
653 dev_info(ctrl->ctrl.device,
654 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
655
656 kref_get(&ctrl->ctrl.kref);
657
658 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
659 WARN_ON_ONCE(!changed);
660
661 mutex_lock(&nvme_loop_ctrl_mutex);
662 list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
663 mutex_unlock(&nvme_loop_ctrl_mutex);
664
665 if (opts->nr_io_queues) {
666 nvme_queue_scan(&ctrl->ctrl);
667 nvme_queue_async_events(&ctrl->ctrl);
668 }
669
670 return &ctrl->ctrl;
671
672out_remove_admin_queue:
673 nvme_loop_destroy_admin_queue(ctrl);
674out_free_queues:
675 kfree(ctrl->queues);
676out_uninit_ctrl:
677 nvme_uninit_ctrl(&ctrl->ctrl);
678out_put_ctrl:
679 nvme_put_ctrl(&ctrl->ctrl);
680 if (ret > 0)
681 ret = -EIO;
682 return ERR_PTR(ret);
683}
684
685static int nvme_loop_add_port(struct nvmet_port *port)
686{
687 /*
688	 * XXX: disallow adding more than one port so
689	 * there are no connection rejections when a
690	 * subsystem is assigned to a port for which
691	 * loop doesn't have a pointer.
692 * This scenario would be possible if we allowed
693 * more than one port to be added and a subsystem
694 * was assigned to a port other than nvmet_loop_port.
695 */
696
697 if (nvmet_loop_port)
698 return -EPERM;
699
700 nvmet_loop_port = port;
701 return 0;
702}
703
704static void nvme_loop_remove_port(struct nvmet_port *port)
705{
706 if (port == nvmet_loop_port)
707 nvmet_loop_port = NULL;
708}
709
710static struct nvmet_fabrics_ops nvme_loop_ops = {
711 .owner = THIS_MODULE,
712 .type = NVMF_TRTYPE_LOOP,
713 .add_port = nvme_loop_add_port,
714 .remove_port = nvme_loop_remove_port,
715 .queue_response = nvme_loop_queue_response,
716 .delete_ctrl = nvme_loop_delete_ctrl,
717};
718
719static struct nvmf_transport_ops nvme_loop_transport = {
720 .name = "loop",
721 .create_ctrl = nvme_loop_create_ctrl,
722};
723
724static int __init nvme_loop_init_module(void)
725{
726 int ret;
727
728 ret = nvmet_register_transport(&nvme_loop_ops);
729 if (ret)
730 return ret;
731 nvmf_register_transport(&nvme_loop_transport);
732 return 0;
733}
734
735static void __exit nvme_loop_cleanup_module(void)
736{
737 struct nvme_loop_ctrl *ctrl, *next;
738
739 nvmf_unregister_transport(&nvme_loop_transport);
740 nvmet_unregister_transport(&nvme_loop_ops);
741
742 mutex_lock(&nvme_loop_ctrl_mutex);
743 list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list)
744 __nvme_loop_del_ctrl(ctrl);
745 mutex_unlock(&nvme_loop_ctrl_mutex);
746
747 flush_scheduled_work();
748}
749
750module_init(nvme_loop_init_module);
751module_exit(nvme_loop_cleanup_module);
752
753MODULE_LICENSE("GPL v2");
754MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */
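
to_loop_ctrl() above is the usual container_of pattern: the loop driver's private controller embeds the generic controller structure, and the outer pointer is recovered from the address of the embedded member. A minimal stand-alone illustration of that pattern (generic names, not the kernel structures):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct generic_ctrl {			/* stands in for struct nvme_ctrl */
	int id;
};

struct loop_ctrl {			/* stands in for struct nvme_loop_ctrl */
	int queue_count;
	struct generic_ctrl ctrl;	/* embedded generic part */
};

static struct loop_ctrl *to_loop(struct generic_ctrl *c)
{
	return container_of(c, struct loop_ctrl, ctrl);
}

int main(void)
{
	struct loop_ctrl lc = { .queue_count = 4, .ctrl = { .id = 7 } };
	struct generic_ctrl *c = &lc.ctrl;	/* only the generic part gets passed around */

	printf("queues: %d (ctrl id %d)\n", to_loop(c)->queue_count, c->id);
	return 0;
}
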
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
new file mode 100644
index 000000000000..57dd6d834c28
--- /dev/null
+++ b/drivers/nvme/target/nvmet.h
@@ -0,0 +1,331 @@
1/*
2 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef _NVMET_H
15#define _NVMET_H
16
17#include <linux/dma-mapping.h>
18#include <linux/types.h>
19#include <linux/device.h>
20#include <linux/kref.h>
21#include <linux/percpu-refcount.h>
22#include <linux/list.h>
23#include <linux/mutex.h>
24#include <linux/nvme.h>
25#include <linux/configfs.h>
26#include <linux/rcupdate.h>
27#include <linux/blkdev.h>
28
29#define NVMET_ASYNC_EVENTS 4
30#define NVMET_ERROR_LOG_SLOTS 128
31
32/* Helper macros used when the NVMe error is NVME_SC_CONNECT_INVALID_PARAM.
33 * The 16-bit shift sets the IATTR bit to 1, which means the offending
34 * offset is in the data portion of the Connect command.
35 */
36#define IPO_IATTR_CONNECT_DATA(x) \
37 (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x))))
38#define IPO_IATTR_CONNECT_SQE(x) \
39 (cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
40
41struct nvmet_ns {
42 struct list_head dev_link;
43 struct percpu_ref ref;
44 struct block_device *bdev;
45 u32 nsid;
46 u32 blksize_shift;
47 loff_t size;
48 u8 nguid[16];
49
50 struct nvmet_subsys *subsys;
51 const char *device_path;
52
53 struct config_group device_group;
54 struct config_group group;
55
56 struct completion disable_done;
57};
58
59static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
60{
61 return container_of(to_config_group(item), struct nvmet_ns, group);
62}
63
64static inline bool nvmet_ns_enabled(struct nvmet_ns *ns)
65{
66 return !list_empty_careful(&ns->dev_link);
67}
68
69struct nvmet_cq {
70 u16 qid;
71 u16 size;
72};
73
74struct nvmet_sq {
75 struct nvmet_ctrl *ctrl;
76 struct percpu_ref ref;
77 u16 qid;
78 u16 size;
79 struct completion free_done;
80};
81
82/**
83 * struct nvmet_port - Common structure to keep port
84 * information for the target.
85 * @entry: List head for holding a list of these elements.
86 * @disc_addr: Address information is stored in a format defined
87 * for a discovery log page entry.
88 * @group: ConfigFS group for this element's folder.
89 * @priv: Private data for the transport.
90 */
91struct nvmet_port {
92 struct list_head entry;
93 struct nvmf_disc_rsp_page_entry disc_addr;
94 struct config_group group;
95 struct config_group subsys_group;
96 struct list_head subsystems;
97 struct config_group referrals_group;
98 struct list_head referrals;
99 void *priv;
100 bool enabled;
101};
102
103static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
104{
105 return container_of(to_config_group(item), struct nvmet_port,
106 group);
107}
108
109struct nvmet_ctrl {
110 struct nvmet_subsys *subsys;
111 struct nvmet_cq **cqs;
112 struct nvmet_sq **sqs;
113
114 struct mutex lock;
115 u64 cap;
116 u32 cc;
117 u32 csts;
118
119 u16 cntlid;
120 u32 kato;
121
122 struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS];
123 unsigned int nr_async_event_cmds;
124 struct list_head async_events;
125 struct work_struct async_event_work;
126
127 struct list_head subsys_entry;
128 struct kref ref;
129 struct delayed_work ka_work;
130 struct work_struct fatal_err_work;
131
132 struct nvmet_fabrics_ops *ops;
133
134 char subsysnqn[NVMF_NQN_FIELD_LEN];
135 char hostnqn[NVMF_NQN_FIELD_LEN];
136};
137
138struct nvmet_subsys {
139 enum nvme_subsys_type type;
140
141 struct mutex lock;
142 struct kref ref;
143
144 struct list_head namespaces;
145 unsigned int max_nsid;
146
147 struct list_head ctrls;
148 struct ida cntlid_ida;
149
150 struct list_head hosts;
151 bool allow_any_host;
152
153 u16 max_qid;
154
155 u64 ver;
156 char *subsysnqn;
157
158 struct config_group group;
159
160 struct config_group namespaces_group;
161 struct config_group allowed_hosts_group;
162};
163
164static inline struct nvmet_subsys *to_subsys(struct config_item *item)
165{
166 return container_of(to_config_group(item), struct nvmet_subsys, group);
167}
168
169static inline struct nvmet_subsys *namespaces_to_subsys(
170 struct config_item *item)
171{
172 return container_of(to_config_group(item), struct nvmet_subsys,
173 namespaces_group);
174}
175
176struct nvmet_host {
177 struct config_group group;
178};
179
180static inline struct nvmet_host *to_host(struct config_item *item)
181{
182 return container_of(to_config_group(item), struct nvmet_host, group);
183}
184
185static inline char *nvmet_host_name(struct nvmet_host *host)
186{
187 return config_item_name(&host->group.cg_item);
188}
189
190struct nvmet_host_link {
191 struct list_head entry;
192 struct nvmet_host *host;
193};
194
195struct nvmet_subsys_link {
196 struct list_head entry;
197 struct nvmet_subsys *subsys;
198};
199
200struct nvmet_req;
201struct nvmet_fabrics_ops {
202 struct module *owner;
203 unsigned int type;
204 unsigned int sqe_inline_size;
205 unsigned int msdbd;
206 bool has_keyed_sgls : 1;
207 void (*queue_response)(struct nvmet_req *req);
208 int (*add_port)(struct nvmet_port *port);
209 void (*remove_port)(struct nvmet_port *port);
210 void (*delete_ctrl)(struct nvmet_ctrl *ctrl);
211};
212
213#define NVMET_MAX_INLINE_BIOVEC 8
214
215struct nvmet_req {
216 struct nvme_command *cmd;
217 struct nvme_completion *rsp;
218 struct nvmet_sq *sq;
219 struct nvmet_cq *cq;
220 struct nvmet_ns *ns;
221 struct scatterlist *sg;
222 struct bio inline_bio;
223 struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
224 int sg_cnt;
225 size_t data_len;
226
227 struct nvmet_port *port;
228
229 void (*execute)(struct nvmet_req *req);
230 struct nvmet_fabrics_ops *ops;
231};
232
233static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
234{
235 req->rsp->status = cpu_to_le16(status << 1);
236}
237
238static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
239{
240 req->rsp->result = cpu_to_le32(result);
241}
242
243/*
244 * NVMe command writes actually are DMA reads for us on the target side.
245 */
246static inline enum dma_data_direction
247nvmet_data_dir(struct nvmet_req *req)
248{
249 return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
250}
251
252struct nvmet_async_event {
253 struct list_head entry;
254 u8 event_type;
255 u8 event_info;
256 u8 log_page;
257};
258
259int nvmet_parse_connect_cmd(struct nvmet_req *req);
260int nvmet_parse_io_cmd(struct nvmet_req *req);
261int nvmet_parse_admin_cmd(struct nvmet_req *req);
262int nvmet_parse_discovery_cmd(struct nvmet_req *req);
263int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
264
265bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
266 struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
267void nvmet_req_complete(struct nvmet_req *req, u16 status);
268
269void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
270 u16 size);
271void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
272 u16 size);
273void nvmet_sq_destroy(struct nvmet_sq *sq);
274int nvmet_sq_init(struct nvmet_sq *sq);
275
276void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
277
278void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
279u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
280 struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp);
281u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
282 struct nvmet_req *req, struct nvmet_ctrl **ret);
283void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
284
285struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
286 enum nvme_subsys_type type);
287void nvmet_subsys_put(struct nvmet_subsys *subsys);
288
289struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid);
290void nvmet_put_namespace(struct nvmet_ns *ns);
291int nvmet_ns_enable(struct nvmet_ns *ns);
292void nvmet_ns_disable(struct nvmet_ns *ns);
293struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid);
294void nvmet_ns_free(struct nvmet_ns *ns);
295
296int nvmet_register_transport(struct nvmet_fabrics_ops *ops);
297void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops);
298
299int nvmet_enable_port(struct nvmet_port *port);
300void nvmet_disable_port(struct nvmet_port *port);
301
302void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port);
303void nvmet_referral_disable(struct nvmet_port *port);
304
305u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
306 size_t len);
307u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
308 size_t len);
309
310u32 nvmet_get_log_page_len(struct nvme_command *cmd);
311
312#define NVMET_QUEUE_SIZE 1024
313#define NVMET_NR_QUEUES 64
314#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
315#define NVMET_KAS 10
316#define NVMET_DISC_KATO 120
317
318int __init nvmet_init_configfs(void);
319void __exit nvmet_exit_configfs(void);
320
321int __init nvmet_init_discovery(void);
322void nvmet_exit_discovery(void);
323
324extern struct nvmet_subsys *nvmet_disc_subsys;
325extern u64 nvmet_genctr;
326extern struct rw_semaphore nvmet_config_sem;
327
328bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
329 const char *hostnqn);
330
331#endif /* _NVMET_H */
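
One detail worth calling out in nvmet_set_status() above: the status code is shifted left by one because bit 0 of the 16-bit completion status word is the phase tag, and the loop driver's completion path shifts it back down (le16_to_cpu(cqe->status) >> 1). A minimal sketch of that packing; the DNR bit position matches the spec (bit 14 of the unshifted code), the rest is illustrative:

#include <stdint.h>
#include <stdio.h>

#define SC_DNR	(1u << 14)	/* Do Not Retry bit within the status code */

/* Pack a status code into a completion status word: bit 0 is the
 * phase tag, so the code itself lives in bits 15:1. */
static uint16_t pack_status(uint16_t sc, unsigned int phase)
{
	return (uint16_t)((sc << 1) | (phase & 1));
}

static uint16_t unpack_status(uint16_t cqe_status)
{
	return cqe_status >> 1;	/* drop the phase tag again */
}

int main(void)
{
	uint16_t sc = 0x0002 | SC_DNR;		/* some error, not retryable */
	uint16_t wire = pack_status(sc, 1);

	printf("wire 0x%04x -> code 0x%04x, DNR %s\n", (unsigned)wire,
	       (unsigned)unpack_status(wire),
	       (unpack_status(wire) & SC_DNR) ? "set" : "clear");
	return 0;
}
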
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
new file mode 100644
index 000000000000..e06d504bdf0c
--- /dev/null
+++ b/drivers/nvme/target/rdma.c
@@ -0,0 +1,1448 @@
1/*
2 * NVMe over Fabrics RDMA target.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#include <linux/atomic.h>
16#include <linux/ctype.h>
17#include <linux/delay.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/nvme.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/wait.h>
25#include <linux/inet.h>
26#include <asm/unaligned.h>
27
28#include <rdma/ib_verbs.h>
29#include <rdma/rdma_cm.h>
30#include <rdma/rw.h>
31
32#include <linux/nvme-rdma.h>
33#include "nvmet.h"
34
35/*
36 * We allow up to a page of inline data to go with the SQE
37 */
38#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE
39
40struct nvmet_rdma_cmd {
41 struct ib_sge sge[2];
42 struct ib_cqe cqe;
43 struct ib_recv_wr wr;
44 struct scatterlist inline_sg;
45 struct page *inline_page;
46 struct nvme_command *nvme_cmd;
47 struct nvmet_rdma_queue *queue;
48};
49
50enum {
51 NVMET_RDMA_REQ_INLINE_DATA = (1 << 0),
52 NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1),
53};
54
55struct nvmet_rdma_rsp {
56 struct ib_sge send_sge;
57 struct ib_cqe send_cqe;
58 struct ib_send_wr send_wr;
59
60 struct nvmet_rdma_cmd *cmd;
61 struct nvmet_rdma_queue *queue;
62
63 struct ib_cqe read_cqe;
64 struct rdma_rw_ctx rw;
65
66 struct nvmet_req req;
67
68 u8 n_rdma;
69 u32 flags;
70 u32 invalidate_rkey;
71
72 struct list_head wait_list;
73 struct list_head free_list;
74};
75
76enum nvmet_rdma_queue_state {
77 NVMET_RDMA_Q_CONNECTING,
78 NVMET_RDMA_Q_LIVE,
79 NVMET_RDMA_Q_DISCONNECTING,
80};
81
82struct nvmet_rdma_queue {
83 struct rdma_cm_id *cm_id;
84 struct nvmet_port *port;
85 struct ib_cq *cq;
86 atomic_t sq_wr_avail;
87 struct nvmet_rdma_device *dev;
88 spinlock_t state_lock;
89 enum nvmet_rdma_queue_state state;
90 struct nvmet_cq nvme_cq;
91 struct nvmet_sq nvme_sq;
92
93 struct nvmet_rdma_rsp *rsps;
94 struct list_head free_rsps;
95 spinlock_t rsps_lock;
96 struct nvmet_rdma_cmd *cmds;
97
98 struct work_struct release_work;
99 struct list_head rsp_wait_list;
100 struct list_head rsp_wr_wait_list;
101 spinlock_t rsp_wr_wait_lock;
102
103 int idx;
104 int host_qid;
105 int recv_queue_size;
106 int send_queue_size;
107
108 struct list_head queue_list;
109};
110
111struct nvmet_rdma_device {
112 struct ib_device *device;
113 struct ib_pd *pd;
114 struct ib_srq *srq;
115 struct nvmet_rdma_cmd *srq_cmds;
116 size_t srq_size;
117 struct kref ref;
118 struct list_head entry;
119};
120
121static bool nvmet_rdma_use_srq;
122module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
123MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
124
125static DEFINE_IDA(nvmet_rdma_queue_ida);
126static LIST_HEAD(nvmet_rdma_queue_list);
127static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
128
129static LIST_HEAD(device_list);
130static DEFINE_MUTEX(device_list_mutex);
131
132static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
133static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
134static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
135static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
136static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
137static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
138
139static struct nvmet_fabrics_ops nvmet_rdma_ops;
140
141/* XXX: really should move to a generic header sooner or later. */
142static inline u32 get_unaligned_le24(const u8 *p)
143{
144 return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
145}
146
147static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
148{
149 return nvme_is_write(rsp->req.cmd) &&
150 rsp->req.data_len &&
151 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
152}
153
154static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
155{
156 return !nvme_is_write(rsp->req.cmd) &&
157 rsp->req.data_len &&
158 !rsp->req.rsp->status &&
159 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
160}
161
162static inline struct nvmet_rdma_rsp *
163nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
164{
165 struct nvmet_rdma_rsp *rsp;
166 unsigned long flags;
167
168 spin_lock_irqsave(&queue->rsps_lock, flags);
169 rsp = list_first_entry(&queue->free_rsps,
170 struct nvmet_rdma_rsp, free_list);
171 list_del(&rsp->free_list);
172 spin_unlock_irqrestore(&queue->rsps_lock, flags);
173
174 return rsp;
175}
176
177static inline void
178nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
179{
180 unsigned long flags;
181
182 spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
183 list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
184 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
185}
186
187static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
188{
189 struct scatterlist *sg;
190 int count;
191
192 if (!sgl || !nents)
193 return;
194
195 for_each_sg(sgl, sg, nents, count)
196 __free_page(sg_page(sg));
197 kfree(sgl);
198}
199
200static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
201 u32 length)
202{
203 struct scatterlist *sg;
204 struct page *page;
205 unsigned int nent;
206 int i = 0;
207
208 nent = DIV_ROUND_UP(length, PAGE_SIZE);
209 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
210 if (!sg)
211 goto out;
212
213 sg_init_table(sg, nent);
214
215 while (length) {
216 u32 page_len = min_t(u32, length, PAGE_SIZE);
217
218 page = alloc_page(GFP_KERNEL);
219 if (!page)
220 goto out_free_pages;
221
222 sg_set_page(&sg[i], page, page_len, 0);
223 length -= page_len;
224 i++;
225 }
226 *sgl = sg;
227 *nents = nent;
228 return 0;
229
230out_free_pages:
231 while (i > 0) {
232 i--;
233 __free_page(sg_page(&sg[i]));
234 }
235 kfree(sg);
236out:
237 return NVME_SC_INTERNAL;
238}
239
240static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
241 struct nvmet_rdma_cmd *c, bool admin)
242{
243 /* NVMe command / RDMA RECV */
244 c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
245 if (!c->nvme_cmd)
246 goto out;
247
248 c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
249 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
250 if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
251 goto out_free_cmd;
252
253 c->sge[0].length = sizeof(*c->nvme_cmd);
254 c->sge[0].lkey = ndev->pd->local_dma_lkey;
255
256 if (!admin) {
257 c->inline_page = alloc_pages(GFP_KERNEL,
258 get_order(NVMET_RDMA_INLINE_DATA_SIZE));
259 if (!c->inline_page)
260 goto out_unmap_cmd;
261 c->sge[1].addr = ib_dma_map_page(ndev->device,
262 c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
263 DMA_FROM_DEVICE);
264 if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
265 goto out_free_inline_page;
266 c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
267 c->sge[1].lkey = ndev->pd->local_dma_lkey;
268 }
269
270 c->cqe.done = nvmet_rdma_recv_done;
271
272 c->wr.wr_cqe = &c->cqe;
273 c->wr.sg_list = c->sge;
274 c->wr.num_sge = admin ? 1 : 2;
275
276 return 0;
277
278out_free_inline_page:
279 if (!admin) {
280 __free_pages(c->inline_page,
281 get_order(NVMET_RDMA_INLINE_DATA_SIZE));
282 }
283out_unmap_cmd:
284 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
285 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
286out_free_cmd:
287 kfree(c->nvme_cmd);
288
289out:
290 return -ENOMEM;
291}
292
293static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
294 struct nvmet_rdma_cmd *c, bool admin)
295{
296 if (!admin) {
297 ib_dma_unmap_page(ndev->device, c->sge[1].addr,
298 NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
299 __free_pages(c->inline_page,
300 get_order(NVMET_RDMA_INLINE_DATA_SIZE));
301 }
302 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
303 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
304 kfree(c->nvme_cmd);
305}
306
307static struct nvmet_rdma_cmd *
308nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
309 int nr_cmds, bool admin)
310{
311 struct nvmet_rdma_cmd *cmds;
312 int ret = -EINVAL, i;
313
314 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
315 if (!cmds)
316 goto out;
317
318 for (i = 0; i < nr_cmds; i++) {
319 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
320 if (ret)
321 goto out_free;
322 }
323
324 return cmds;
325
326out_free:
327 while (--i >= 0)
328 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
329 kfree(cmds);
330out:
331 return ERR_PTR(ret);
332}
333
334static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
335 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
336{
337 int i;
338
339 for (i = 0; i < nr_cmds; i++)
340 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
341 kfree(cmds);
342}
343
344static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
345 struct nvmet_rdma_rsp *r)
346{
347 /* NVMe CQE / RDMA SEND */
348 r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
349 if (!r->req.rsp)
350 goto out;
351
352 r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
353 sizeof(*r->req.rsp), DMA_TO_DEVICE);
354 if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
355 goto out_free_rsp;
356
357 r->send_sge.length = sizeof(*r->req.rsp);
358 r->send_sge.lkey = ndev->pd->local_dma_lkey;
359
360 r->send_cqe.done = nvmet_rdma_send_done;
361
362 r->send_wr.wr_cqe = &r->send_cqe;
363 r->send_wr.sg_list = &r->send_sge;
364 r->send_wr.num_sge = 1;
365 r->send_wr.send_flags = IB_SEND_SIGNALED;
366
367 /* Data In / RDMA READ */
368 r->read_cqe.done = nvmet_rdma_read_data_done;
369 return 0;
370
371out_free_rsp:
372 kfree(r->req.rsp);
373out:
374 return -ENOMEM;
375}
376
377static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
378 struct nvmet_rdma_rsp *r)
379{
380 ib_dma_unmap_single(ndev->device, r->send_sge.addr,
381 sizeof(*r->req.rsp), DMA_TO_DEVICE);
382 kfree(r->req.rsp);
383}
384
385static int
386nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
387{
388 struct nvmet_rdma_device *ndev = queue->dev;
389 int nr_rsps = queue->recv_queue_size * 2;
390 int ret = -EINVAL, i;
391
392 queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
393 GFP_KERNEL);
394 if (!queue->rsps)
395 goto out;
396
397 for (i = 0; i < nr_rsps; i++) {
398 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
399
400 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
401 if (ret)
402 goto out_free;
403
404 list_add_tail(&rsp->free_list, &queue->free_rsps);
405 }
406
407 return 0;
408
409out_free:
410 while (--i >= 0) {
411 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
412
413 list_del(&rsp->free_list);
414 nvmet_rdma_free_rsp(ndev, rsp);
415 }
416 kfree(queue->rsps);
417out:
418 return ret;
419}
420
421static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
422{
423 struct nvmet_rdma_device *ndev = queue->dev;
424 int i, nr_rsps = queue->recv_queue_size * 2;
425
426 for (i = 0; i < nr_rsps; i++) {
427 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
428
429 list_del(&rsp->free_list);
430 nvmet_rdma_free_rsp(ndev, rsp);
431 }
432 kfree(queue->rsps);
433}
434
435static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
436 struct nvmet_rdma_cmd *cmd)
437{
438 struct ib_recv_wr *bad_wr;
439
440 if (ndev->srq)
441 return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
442 return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
443}
444
445static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
446{
447 spin_lock(&queue->rsp_wr_wait_lock);
448 while (!list_empty(&queue->rsp_wr_wait_list)) {
449 struct nvmet_rdma_rsp *rsp;
450 bool ret;
451
452 rsp = list_entry(queue->rsp_wr_wait_list.next,
453 struct nvmet_rdma_rsp, wait_list);
454 list_del(&rsp->wait_list);
455
456 spin_unlock(&queue->rsp_wr_wait_lock);
457 ret = nvmet_rdma_execute_command(rsp);
458 spin_lock(&queue->rsp_wr_wait_lock);
459
460 if (!ret) {
461 list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
462 break;
463 }
464 }
465 spin_unlock(&queue->rsp_wr_wait_lock);
466}
467
468
469static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
470{
471 struct nvmet_rdma_queue *queue = rsp->queue;
472
473 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
474
475 if (rsp->n_rdma) {
476 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
477 queue->cm_id->port_num, rsp->req.sg,
478 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
479 }
480
481 if (rsp->req.sg != &rsp->cmd->inline_sg)
482 nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
483
484 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
485 nvmet_rdma_process_wr_wait_list(queue);
486
487 nvmet_rdma_put_rsp(rsp);
488}
489
490static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
491{
492 if (queue->nvme_sq.ctrl) {
493 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
494 } else {
495 /*
496		 * We haven't set up the controller yet (e.g. an admin
497		 * connect error), so just disconnect and clean up
498		 * the queue.
499 */
500 nvmet_rdma_queue_disconnect(queue);
501 }
502}
503
504static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
505{
506 struct nvmet_rdma_rsp *rsp =
507 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
508
509 nvmet_rdma_release_rsp(rsp);
510
511 if (unlikely(wc->status != IB_WC_SUCCESS &&
512 wc->status != IB_WC_WR_FLUSH_ERR)) {
513 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
514 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
515 nvmet_rdma_error_comp(rsp->queue);
516 }
517}
518
519static void nvmet_rdma_queue_response(struct nvmet_req *req)
520{
521 struct nvmet_rdma_rsp *rsp =
522 container_of(req, struct nvmet_rdma_rsp, req);
523 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
524 struct ib_send_wr *first_wr, *bad_wr;
525
526 if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
527 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
528 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
529 } else {
530 rsp->send_wr.opcode = IB_WR_SEND;
531 }
532
533 if (nvmet_rdma_need_data_out(rsp))
534 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
535 cm_id->port_num, NULL, &rsp->send_wr);
536 else
537 first_wr = &rsp->send_wr;
538
539 nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
540 if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
541 pr_err("sending cmd response failed\n");
542 nvmet_rdma_release_rsp(rsp);
543 }
544}
545
546static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
547{
548 struct nvmet_rdma_rsp *rsp =
549 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
550 struct nvmet_rdma_queue *queue = cq->cq_context;
551
552 WARN_ON(rsp->n_rdma <= 0);
553 atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
554 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
555 queue->cm_id->port_num, rsp->req.sg,
556 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
557 rsp->n_rdma = 0;
558
559 if (unlikely(wc->status != IB_WC_SUCCESS)) {
560 nvmet_rdma_release_rsp(rsp);
561 if (wc->status != IB_WC_WR_FLUSH_ERR) {
562 pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
563 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
564 nvmet_rdma_error_comp(queue);
565 }
566 return;
567 }
568
569 rsp->req.execute(&rsp->req);
570}
571
572static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
573 u64 off)
574{
575 sg_init_table(&rsp->cmd->inline_sg, 1);
576 sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
577 rsp->req.sg = &rsp->cmd->inline_sg;
578 rsp->req.sg_cnt = 1;
579}
580
581static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
582{
583 struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
584 u64 off = le64_to_cpu(sgl->addr);
585 u32 len = le32_to_cpu(sgl->length);
586
587 if (!nvme_is_write(rsp->req.cmd))
588 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
589
590 if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
591 pr_err("invalid inline data offset!\n");
592 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
593 }
594
595 /* no data command? */
596 if (!len)
597 return 0;
598
599 nvmet_rdma_use_inline_sg(rsp, len, off);
600 rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
601 return 0;
602}
603
604static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
605 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
606{
607 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
608 u64 addr = le64_to_cpu(sgl->addr);
609 u32 len = get_unaligned_le24(sgl->length);
610 u32 key = get_unaligned_le32(sgl->key);
611 int ret;
612 u16 status;
613
614 /* no data command? */
615 if (!len)
616 return 0;
617
618 /* use the already allocated data buffer if possible */
619 if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) {
620 nvmet_rdma_use_inline_sg(rsp, len, 0);
621 } else {
622 status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
623 len);
624 if (status)
625 return status;
626 }
627
628 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
629 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
630 nvmet_data_dir(&rsp->req));
631 if (ret < 0)
632 return NVME_SC_INTERNAL;
633 rsp->n_rdma += ret;
634
635 if (invalidate) {
636 rsp->invalidate_rkey = key;
637 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
638 }
639
640 return 0;
641}
642
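The keyed SGL descriptor decoded above carries a 64-bit remote address, a 24-bit length and a 32-bit key, all little-endian. Below is a minimal user-space sketch of that decode; the byte layout is assumed from the accessors used here (le64 addr, le24 length, le32 key), and all names are local to the example rather than kernel definitions.

#include <stdint.h>
#include <stdio.h>

static uint32_t ex_le24(const uint8_t *p)
{
	return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16);
}

static uint32_t ex_le32(const uint8_t *p)
{
	return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) |
	       ((uint32_t)p[3] << 24);
}

int main(void)
{
	/* addr = 0x1000, length = 0x020000 (128 KiB), key = 0xdeadbeef */
	const uint8_t desc[16] = {
		0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* addr, le64   */
		0x00, 0x00, 0x02,					/* length, le24 */
		0xef, 0xbe, 0xad, 0xde,					/* key, le32    */
		0x40,							/* type byte    */
	};

	printf("length 0x%06x key 0x%08x type 0x%02x\n",
	       (unsigned)ex_le24(desc + 8), (unsigned)ex_le32(desc + 11),
	       (unsigned)desc[15]);
	return 0;
}
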
643static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
644{
645 struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
646
647 switch (sgl->type >> 4) {
648 case NVME_SGL_FMT_DATA_DESC:
649 switch (sgl->type & 0xf) {
650 case NVME_SGL_FMT_OFFSET:
651 return nvmet_rdma_map_sgl_inline(rsp);
652 default:
653 pr_err("invalid SGL subtype: %#x\n", sgl->type);
654 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
655 }
656 case NVME_KEY_SGL_FMT_DATA_DESC:
657 switch (sgl->type & 0xf) {
658 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
659 return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
660 case NVME_SGL_FMT_ADDRESS:
661 return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
662 default:
663 pr_err("invalid SGL subtype: %#x\n", sgl->type);
664 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
665 }
666 default:
667 pr_err("invalid SGL type: %#x\n", sgl->type);
668 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
669 }
670}
671
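nvmet_rdma_map_sgl() above splits the SGL identifier byte into a format nibble (type >> 4) and a subtype nibble (type & 0xf) to choose between inline and keyed mappings. A standalone sketch of that split follows; the constant values are placeholders for illustration, not the kernel's NVME_SGL_* definitions.

#include <stdint.h>
#include <stdio.h>

/* Placeholder format/subtype values for illustration only. */
#define EX_FMT_DATA	0x0	/* plain data block descriptor      */
#define EX_FMT_KEYED	0x4	/* keyed data block descriptor      */
#define EX_SUB_OFFSET	0x1	/* subtype: offset into the capsule */

static const char *ex_describe_sgl(uint8_t type)
{
	uint8_t fmt = type >> 4;	/* descriptor format  */
	uint8_t sub = type & 0xf;	/* descriptor subtype */

	if (fmt == EX_FMT_DATA && sub == EX_SUB_OFFSET)
		return "inline data in the capsule";
	if (fmt == EX_FMT_KEYED)
		return "keyed SGL, mapped for RDMA READ/WRITE";
	return "unsupported";
}

int main(void)
{
	const uint8_t samples[] = { 0x01, 0x40, 0x4f };
	size_t i;

	for (i = 0; i < sizeof(samples); i++)
		printf("type 0x%02x -> %s\n", (unsigned)samples[i],
		       ex_describe_sgl(samples[i]));
	return 0;
}
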
672static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
673{
674 struct nvmet_rdma_queue *queue = rsp->queue;
675
676 if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
677 &queue->sq_wr_avail) < 0)) {
678 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
679 1 + rsp->n_rdma, queue->idx,
680 queue->nvme_sq.ctrl->cntlid);
681 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
682 return false;
683 }
684
685 if (nvmet_rdma_need_data_in(rsp)) {
686 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
687 queue->cm_id->port_num, &rsp->read_cqe, NULL))
688 nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
689 } else {
690 rsp->req.execute(&rsp->req);
691 }
692
693 return true;
694}
695
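nvmet_rdma_execute_command() above reserves 1 + n_rdma send-queue work requests with atomic_sub_return() and rolls the reservation back if the count would go negative, parking the command instead. A minimal user-space model of that credit check, using C11 atomics and hypothetical names, is shown below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int ex_sq_wr_avail;

/* Try to reserve one SEND WR plus n_rdma RDMA WRs; back out on failure. */
static bool ex_reserve_wrs(int n_rdma)
{
	int needed = 1 + n_rdma;

	if (atomic_fetch_sub(&ex_sq_wr_avail, needed) - needed < 0) {
		atomic_fetch_add(&ex_sq_wr_avail, needed);	/* roll back */
		return false;	/* caller parks the command for later */
	}
	return true;
}

int main(void)
{
	atomic_store(&ex_sq_wr_avail, 4);
	printf("reserve 1+2: %d\n", ex_reserve_wrs(2));	/* fits, 1 WR left   */
	printf("reserve 1+2: %d\n", ex_reserve_wrs(2));	/* does not fit -> 0 */
	return 0;
}
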
696static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
697 struct nvmet_rdma_rsp *cmd)
698{
699 u16 status;
700
701 cmd->queue = queue;
702 cmd->n_rdma = 0;
703 cmd->req.port = queue->port;
704
705 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
706 &queue->nvme_sq, &nvmet_rdma_ops))
707 return;
708
709 status = nvmet_rdma_map_sgl(cmd);
710 if (status)
711 goto out_err;
712
713 if (unlikely(!nvmet_rdma_execute_command(cmd))) {
714 spin_lock(&queue->rsp_wr_wait_lock);
715 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
716 spin_unlock(&queue->rsp_wr_wait_lock);
717 }
718
719 return;
720
721out_err:
722 nvmet_req_complete(&cmd->req, status);
723}
724
725static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
726{
727 struct nvmet_rdma_cmd *cmd =
728 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
729 struct nvmet_rdma_queue *queue = cq->cq_context;
730 struct nvmet_rdma_rsp *rsp;
731
732 if (unlikely(wc->status != IB_WC_SUCCESS)) {
733 if (wc->status != IB_WC_WR_FLUSH_ERR) {
734 pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
735 wc->wr_cqe, ib_wc_status_msg(wc->status),
736 wc->status);
737 nvmet_rdma_error_comp(queue);
738 }
739 return;
740 }
741
742 if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
743 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
744 nvmet_rdma_error_comp(queue);
745 return;
746 }
747
748 cmd->queue = queue;
749 rsp = nvmet_rdma_get_rsp(queue);
750 rsp->cmd = cmd;
751 rsp->flags = 0;
752 rsp->req.cmd = cmd->nvme_cmd;
753
754 if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
755 unsigned long flags;
756
757 spin_lock_irqsave(&queue->state_lock, flags);
758 if (queue->state == NVMET_RDMA_Q_CONNECTING)
759 list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
760 else
761 nvmet_rdma_put_rsp(rsp);
762 spin_unlock_irqrestore(&queue->state_lock, flags);
763 return;
764 }
765
766 nvmet_rdma_handle_command(queue, rsp);
767}
768
769static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
770{
771 if (!ndev->srq)
772 return;
773
774 nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
775 ib_destroy_srq(ndev->srq);
776}
777
778static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
779{
780 struct ib_srq_init_attr srq_attr = { NULL, };
781 struct ib_srq *srq;
782 size_t srq_size;
783 int ret, i;
784
785 srq_size = 4095; /* XXX: tune */
786
787 srq_attr.attr.max_wr = srq_size;
788 srq_attr.attr.max_sge = 2;
789 srq_attr.attr.srq_limit = 0;
790 srq_attr.srq_type = IB_SRQT_BASIC;
791 srq = ib_create_srq(ndev->pd, &srq_attr);
792 if (IS_ERR(srq)) {
793 /*
794 * If SRQs aren't supported we just go ahead and use normal
795 * non-shared receive queues.
796 */
797 pr_info("SRQ requested but not supported.\n");
798 return 0;
799 }
800
801 ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
802 if (IS_ERR(ndev->srq_cmds)) {
803 ret = PTR_ERR(ndev->srq_cmds);
804 goto out_destroy_srq;
805 }
806
807 ndev->srq = srq;
808 ndev->srq_size = srq_size;
809
810 for (i = 0; i < srq_size; i++)
811 nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
812
813 return 0;
814
815out_destroy_srq:
816 ib_destroy_srq(srq);
817 return ret;
818}
819
820static void nvmet_rdma_free_dev(struct kref *ref)
821{
822 struct nvmet_rdma_device *ndev =
823 container_of(ref, struct nvmet_rdma_device, ref);
824
825 mutex_lock(&device_list_mutex);
826 list_del(&ndev->entry);
827 mutex_unlock(&device_list_mutex);
828
829 nvmet_rdma_destroy_srq(ndev);
830 ib_dealloc_pd(ndev->pd);
831
832 kfree(ndev);
833}
834
835static struct nvmet_rdma_device *
836nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
837{
838 struct nvmet_rdma_device *ndev;
839 int ret;
840
841 mutex_lock(&device_list_mutex);
842 list_for_each_entry(ndev, &device_list, entry) {
843 if (ndev->device->node_guid == cm_id->device->node_guid &&
844 kref_get_unless_zero(&ndev->ref))
845 goto out_unlock;
846 }
847
848 ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
849 if (!ndev)
850 goto out_err;
851
852 ndev->device = cm_id->device;
853 kref_init(&ndev->ref);
854
855 ndev->pd = ib_alloc_pd(ndev->device);
856 if (IS_ERR(ndev->pd))
857 goto out_free_dev;
858
859 if (nvmet_rdma_use_srq) {
860 ret = nvmet_rdma_init_srq(ndev);
861 if (ret)
862 goto out_free_pd;
863 }
864
865 list_add(&ndev->entry, &device_list);
866out_unlock:
867 mutex_unlock(&device_list_mutex);
868 pr_debug("added %s.\n", ndev->device->name);
869 return ndev;
870
871out_free_pd:
872 ib_dealloc_pd(ndev->pd);
873out_free_dev:
874 kfree(ndev);
875out_err:
876 mutex_unlock(&device_list_mutex);
877 return NULL;
878}
879
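nvmet_rdma_find_get_device() above caches one device structure per HCA, keyed by node GUID: look up under the list mutex, take a reference only if kref_get_unless_zero() succeeds, otherwise allocate and insert. The sketch below models the same pattern in user space with a plain integer refcount; names and the GUID string are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ex_dev {
	char guid[16];
	int refs;			/* stands in for the kref */
	struct ex_dev *next;
};

static pthread_mutex_t ex_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ex_dev *ex_devices;

static struct ex_dev *ex_find_get(const char *guid)
{
	struct ex_dev *dev;

	pthread_mutex_lock(&ex_list_lock);
	for (dev = ex_devices; dev; dev = dev->next) {
		if (!strcmp(dev->guid, guid) && dev->refs > 0) {
			dev->refs++;	/* models kref_get_unless_zero() */
			goto out;
		}
	}

	dev = calloc(1, sizeof(*dev));
	if (dev) {
		snprintf(dev->guid, sizeof(dev->guid), "%s", guid);
		dev->refs = 1;
		dev->next = ex_devices;
		ex_devices = dev;
	}
out:
	pthread_mutex_unlock(&ex_list_lock);
	return dev;
}

int main(void)
{
	struct ex_dev *a = ex_find_get("mlx5_0");
	struct ex_dev *b = ex_find_get("mlx5_0");

	printf("same device: %d, refs: %d\n", a == b, a ? a->refs : 0);
	return 0;
}
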
880static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
881{
882 struct ib_qp_init_attr qp_attr;
883 struct nvmet_rdma_device *ndev = queue->dev;
884 int comp_vector, nr_cqe, ret, i;
885
886 /*
887 * Spread the io queues across completion vectors,
888 * but still keep all admin queues on vector 0.
889 */
890 comp_vector = !queue->host_qid ? 0 :
891 queue->idx % ndev->device->num_comp_vectors;
892
893 /*
894 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
895 */
896 nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
897
898 queue->cq = ib_alloc_cq(ndev->device, queue,
899 nr_cqe + 1, comp_vector,
900 IB_POLL_WORKQUEUE);
901 if (IS_ERR(queue->cq)) {
902 ret = PTR_ERR(queue->cq);
903 pr_err("failed to create CQ cqe= %d ret= %d\n",
904 nr_cqe + 1, ret);
905 goto out;
906 }
907
908 memset(&qp_attr, 0, sizeof(qp_attr));
909 qp_attr.qp_context = queue;
910 qp_attr.event_handler = nvmet_rdma_qp_event;
911 qp_attr.send_cq = queue->cq;
912 qp_attr.recv_cq = queue->cq;
913 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
914 qp_attr.qp_type = IB_QPT_RC;
915 /* +1 for drain */
916 qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
917 qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
918 qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
919 ndev->device->attrs.max_sge);
920
921 if (ndev->srq) {
922 qp_attr.srq = ndev->srq;
923 } else {
924 /* +1 for drain */
925 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
926 qp_attr.cap.max_recv_sge = 2;
927 }
928
929 ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
930 if (ret) {
931 pr_err("failed to create_qp ret= %d\n", ret);
932 goto err_destroy_cq;
933 }
934
935 atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
936
937 pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
938 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
939 qp_attr.cap.max_send_wr, queue->cm_id);
940
941 if (!ndev->srq) {
942 for (i = 0; i < queue->recv_queue_size; i++) {
943 queue->cmds[i].queue = queue;
944 nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
945 }
946 }
947
948out:
949 return ret;
950
951err_destroy_cq:
952 ib_free_cq(queue->cq);
953 goto out;
954}
955
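The queue setup above sizes the CQ as recv_queue_size + 2 * send_queue_size (one entry per RECV, plus the RDMA READ/WRITE and SEND completions of each command) plus one extra entry, and pins admin queues to completion vector 0 while spreading I/O queues round-robin. A small sketch of that arithmetic, with hypothetical helper names, follows.

#include <stdio.h>

/* One CQE per RECV, plus RDMA READ/WRITE and SEND completions per command,
 * plus one extra entry, as allocated by ib_alloc_cq() above. */
static int ex_queue_nr_cqe(int recv_queue_size, int send_queue_size)
{
	return recv_queue_size + 2 * send_queue_size + 1;
}

/* Admin queues stay on vector 0; I/O queues are spread round-robin. */
static int ex_queue_comp_vector(int host_qid, int queue_idx, int num_vectors)
{
	return host_qid ? queue_idx % num_vectors : 0;
}

int main(void)
{
	printf("admin queue: %d CQEs on vector %d\n",
	       ex_queue_nr_cqe(32, 32), ex_queue_comp_vector(0, 0, 8));
	printf("I/O queue 5: %d CQEs on vector %d\n",
	       ex_queue_nr_cqe(128, 128), ex_queue_comp_vector(5, 5, 8));
	return 0;
}
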
956static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
957{
958 rdma_destroy_qp(queue->cm_id);
959 ib_free_cq(queue->cq);
960}
961
962static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
963{
964 pr_info("freeing queue %d\n", queue->idx);
965
966 nvmet_sq_destroy(&queue->nvme_sq);
967
968 nvmet_rdma_destroy_queue_ib(queue);
969 if (!queue->dev->srq) {
970 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
971 queue->recv_queue_size,
972 !queue->host_qid);
973 }
974 nvmet_rdma_free_rsps(queue);
975 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
976 kfree(queue);
977}
978
979static void nvmet_rdma_release_queue_work(struct work_struct *w)
980{
981 struct nvmet_rdma_queue *queue =
982 container_of(w, struct nvmet_rdma_queue, release_work);
983 struct rdma_cm_id *cm_id = queue->cm_id;
984 struct nvmet_rdma_device *dev = queue->dev;
985
986 nvmet_rdma_free_queue(queue);
987 rdma_destroy_id(cm_id);
988 kref_put(&dev->ref, nvmet_rdma_free_dev);
989}
990
991static int
992nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
993 struct nvmet_rdma_queue *queue)
994{
995 struct nvme_rdma_cm_req *req;
996
997 req = (struct nvme_rdma_cm_req *)conn->private_data;
998 if (!req || conn->private_data_len == 0)
999 return NVME_RDMA_CM_INVALID_LEN;
1000
1001 if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1002 return NVME_RDMA_CM_INVALID_RECFMT;
1003
1004 queue->host_qid = le16_to_cpu(req->qid);
1005
1006 /*
1007 * req->hsqsize corresponds to our recv queue size
1008 * req->hrqsize corresponds to our send queue size
1009 */
1010 queue->recv_queue_size = le16_to_cpu(req->hsqsize);
1011 queue->send_queue_size = le16_to_cpu(req->hrqsize);
1012
1013 if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH)
1014 return NVME_RDMA_CM_INVALID_HSQSIZE;
1015
1016 /* XXX: Should we enforce some kind of max for IO queues? */
1017
1018 return 0;
1019}
1020
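The connect private data parsed above maps the host's hsqsize onto the target's receive queue size and hrqsize onto its send queue size, and rejects an admin queue (qid 0) deeper than the admin-queue limit. Below is a host-order sketch of that mapping; the struct, the helper names and the EX_AQ_DEPTH limit are assumptions for illustration, not the wire-format definitions.

#include <stdint.h>
#include <stdio.h>

/* Host-order stand-in for the connect private data; widths as parsed above. */
struct ex_cm_req {
	uint16_t recfmt;
	uint16_t qid;
	uint16_t hsqsize;	/* host SQ size -> target recv queue size */
	uint16_t hrqsize;	/* host RQ size -> target send queue size */
};

#define EX_AQ_DEPTH	32	/* placeholder admin-queue depth limit */

static int ex_parse_connect(const struct ex_cm_req *req,
			    int *recv_qsize, int *send_qsize)
{
	*recv_qsize = req->hsqsize;
	*send_qsize = req->hrqsize;

	/* The admin queue (qid 0) may not exceed the admin-queue depth. */
	if (req->qid == 0 && *recv_qsize > EX_AQ_DEPTH)
		return -1;
	return 0;
}

int main(void)
{
	const struct ex_cm_req req = { .recfmt = 0, .qid = 0,
				       .hsqsize = 32, .hrqsize = 32 };
	int rq, sq, ret;

	ret = ex_parse_connect(&req, &rq, &sq);
	printf("parse: %d (recv %d, send %d)\n", ret, rq, sq);
	return 0;
}
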
1021static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1022 enum nvme_rdma_cm_status status)
1023{
1024 struct nvme_rdma_cm_rej rej;
1025
1026 rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1027 rej.sts = cpu_to_le16(status);
1028
1029 return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1030}
1031
1032static struct nvmet_rdma_queue *
1033nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1034 struct rdma_cm_id *cm_id,
1035 struct rdma_cm_event *event)
1036{
1037 struct nvmet_rdma_queue *queue;
1038 int ret;
1039
1040 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1041 if (!queue) {
1042 ret = NVME_RDMA_CM_NO_RSC;
1043 goto out_reject;
1044 }
1045
1046 ret = nvmet_sq_init(&queue->nvme_sq);
1047 if (ret)
1048 goto out_free_queue;
1049
1050 ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1051 if (ret)
1052 goto out_destroy_sq;
1053
1054 /*
1055 * Schedules the actual release because calling rdma_destroy_id from
1056 * inside a CM callback would trigger a deadlock. (great API design..)
1057 */
1058 INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1059 queue->dev = ndev;
1060 queue->cm_id = cm_id;
1061
1062 spin_lock_init(&queue->state_lock);
1063 queue->state = NVMET_RDMA_Q_CONNECTING;
1064 INIT_LIST_HEAD(&queue->rsp_wait_list);
1065 INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1066 spin_lock_init(&queue->rsp_wr_wait_lock);
1067 INIT_LIST_HEAD(&queue->free_rsps);
1068 spin_lock_init(&queue->rsps_lock);
1069
1070 queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1071 if (queue->idx < 0) {
1072 ret = NVME_RDMA_CM_NO_RSC;
1073 goto out_free_queue;
1074 }
1075
1076 ret = nvmet_rdma_alloc_rsps(queue);
1077 if (ret) {
1078 ret = NVME_RDMA_CM_NO_RSC;
1079 goto out_ida_remove;
1080 }
1081
1082 if (!ndev->srq) {
1083 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1084 queue->recv_queue_size,
1085 !queue->host_qid);
1086 if (IS_ERR(queue->cmds)) {
1087 ret = NVME_RDMA_CM_NO_RSC;
1088 goto out_free_responses;
1089 }
1090 }
1091
1092 ret = nvmet_rdma_create_queue_ib(queue);
1093 if (ret) {
1094 pr_err("%s: creating RDMA queue failed (%d).\n",
1095 __func__, ret);
1096 ret = NVME_RDMA_CM_NO_RSC;
1097 goto out_free_cmds;
1098 }
1099
1100 return queue;
1101
1102out_free_cmds:
1103 if (!ndev->srq) {
1104 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1105 queue->recv_queue_size,
1106 !queue->host_qid);
1107 }
1108out_free_responses:
1109 nvmet_rdma_free_rsps(queue);
1110out_ida_remove:
1111 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1112out_destroy_sq:
1113 nvmet_sq_destroy(&queue->nvme_sq);
1114out_free_queue:
1115 kfree(queue);
1116out_reject:
1117 nvmet_rdma_cm_reject(cm_id, ret);
1118 return NULL;
1119}
1120
1121static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1122{
1123 struct nvmet_rdma_queue *queue = priv;
1124
1125 switch (event->event) {
1126 case IB_EVENT_COMM_EST:
1127 rdma_notify(queue->cm_id, event->event);
1128 break;
1129 default:
1130 pr_err("received unrecognized IB QP event %d\n", event->event);
1131 break;
1132 }
1133}
1134
1135static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1136 struct nvmet_rdma_queue *queue,
1137 struct rdma_conn_param *p)
1138{
1139 struct rdma_conn_param param = { };
1140 struct nvme_rdma_cm_rep priv = { };
1141 int ret = -ENOMEM;
1142
1143 param.rnr_retry_count = 7;
1144 param.flow_control = 1;
1145 param.initiator_depth = min_t(u8, p->initiator_depth,
1146 queue->dev->device->attrs.max_qp_init_rd_atom);
1147 param.private_data = &priv;
1148 param.private_data_len = sizeof(priv);
1149 priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1150 priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1151
1152 ret = rdma_accept(cm_id, &param);
1153 if (ret)
1154 pr_err("rdma_accept failed (error code = %d)\n", ret);
1155
1156 return ret;
1157}
1158
1159static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
1160 struct rdma_cm_event *event)
1161{
1162 struct nvmet_rdma_device *ndev;
1163 struct nvmet_rdma_queue *queue;
1164 int ret = -EINVAL;
1165
1166 ndev = nvmet_rdma_find_get_device(cm_id);
1167 if (!ndev) {
1168 pr_err("no client data!\n");
1169 nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
1170 return -ECONNREFUSED;
1171 }
1172
1173 queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
1174 if (!queue) {
1175 ret = -ENOMEM;
1176 goto put_device;
1177 }
1178 queue->port = cm_id->context;
1179
1180 ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
1181 if (ret)
1182 goto release_queue;
1183
1184 mutex_lock(&nvmet_rdma_queue_mutex);
1185 list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
1186 mutex_unlock(&nvmet_rdma_queue_mutex);
1187
1188 return 0;
1189
1190release_queue:
1191 nvmet_rdma_free_queue(queue);
1192put_device:
1193 kref_put(&ndev->ref, nvmet_rdma_free_dev);
1194
1195 return ret;
1196}
1197
1198static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1199{
1200 unsigned long flags;
1201
1202 spin_lock_irqsave(&queue->state_lock, flags);
1203 if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1204 pr_warn("trying to establish an already connected queue\n");
1205 goto out_unlock;
1206 }
1207 queue->state = NVMET_RDMA_Q_LIVE;
1208
1209 while (!list_empty(&queue->rsp_wait_list)) {
1210 struct nvmet_rdma_rsp *cmd;
1211
1212 cmd = list_first_entry(&queue->rsp_wait_list,
1213 struct nvmet_rdma_rsp, wait_list);
1214 list_del(&cmd->wait_list);
1215
1216 spin_unlock_irqrestore(&queue->state_lock, flags);
1217 nvmet_rdma_handle_command(queue, cmd);
1218 spin_lock_irqsave(&queue->state_lock, flags);
1219 }
1220
1221out_unlock:
1222 spin_unlock_irqrestore(&queue->state_lock, flags);
1223}
1224
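nvmet_rdma_recv_done() parks commands that arrive while the queue is still connecting, and nvmet_rdma_queue_established() above replays them, dropping the state lock around each handler. A simplified user-space model of that park/replay pattern follows (LIFO order for brevity, whereas the driver preserves arrival order); all names are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct ex_work {
	int id;
	struct ex_work *next;
};

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ex_work *ex_wait_list;

/* Park a work item that arrived before the queue went live. */
static void ex_park(struct ex_work *w)
{
	pthread_mutex_lock(&ex_lock);
	w->next = ex_wait_list;
	ex_wait_list = w;
	pthread_mutex_unlock(&ex_lock);
}

/* Replay parked items, dropping the lock around each handler. */
static void ex_replay(void (*handle)(struct ex_work *))
{
	pthread_mutex_lock(&ex_lock);
	while (ex_wait_list) {
		struct ex_work *w = ex_wait_list;

		ex_wait_list = w->next;
		pthread_mutex_unlock(&ex_lock);
		handle(w);			/* runs without the lock held */
		pthread_mutex_lock(&ex_lock);
	}
	pthread_mutex_unlock(&ex_lock);
}

static void ex_handle(struct ex_work *w)
{
	printf("handling parked work %d\n", w->id);
}

int main(void)
{
	struct ex_work a = { .id = 1 }, b = { .id = 2 };

	ex_park(&a);
	ex_park(&b);
	ex_replay(ex_handle);
	return 0;
}
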
1225static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1226{
1227 bool disconnect = false;
1228 unsigned long flags;
1229
1230 pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1231
1232 spin_lock_irqsave(&queue->state_lock, flags);
1233 switch (queue->state) {
1234 case NVMET_RDMA_Q_CONNECTING:
1235 case NVMET_RDMA_Q_LIVE:
1236 disconnect = true;
1237 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1238 break;
1239 case NVMET_RDMA_Q_DISCONNECTING:
1240 break;
1241 }
1242 spin_unlock_irqrestore(&queue->state_lock, flags);
1243
1244 if (disconnect) {
1245 rdma_disconnect(queue->cm_id);
1246 ib_drain_qp(queue->cm_id->qp);
1247 schedule_work(&queue->release_work);
1248 }
1249}
1250
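__nvmet_rdma_queue_disconnect() above is an idempotent state transition: CONNECTING and LIVE move to DISCONNECTING exactly once and trigger the teardown, while a queue already in DISCONNECTING is left alone. A minimal sketch of that transition, with local names:

#include <stdbool.h>
#include <stdio.h>

enum ex_queue_state { EX_CONNECTING, EX_LIVE, EX_DISCONNECTING };

/* Returns true when the caller should actually tear the connection down. */
static bool ex_disconnect(enum ex_queue_state *state)
{
	switch (*state) {
	case EX_CONNECTING:
	case EX_LIVE:
		*state = EX_DISCONNECTING;
		return true;
	case EX_DISCONNECTING:
		break;			/* already on its way down */
	}
	return false;
}

int main(void)
{
	enum ex_queue_state s = EX_LIVE;

	printf("first call:  %d\n", ex_disconnect(&s));
	printf("second call: %d\n", ex_disconnect(&s));
	return 0;
}
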
1251static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1252{
1253 bool disconnect = false;
1254
1255 mutex_lock(&nvmet_rdma_queue_mutex);
1256 if (!list_empty(&queue->queue_list)) {
1257 list_del_init(&queue->queue_list);
1258 disconnect = true;
1259 }
1260 mutex_unlock(&nvmet_rdma_queue_mutex);
1261
1262 if (disconnect)
1263 __nvmet_rdma_queue_disconnect(queue);
1264}
1265
1266static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1267 struct nvmet_rdma_queue *queue)
1268{
1269 WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1270
1271 pr_err("failed to connect queue\n");
1272 schedule_work(&queue->release_work);
1273}
1274
1275static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1276 struct rdma_cm_event *event)
1277{
1278 struct nvmet_rdma_queue *queue = NULL;
1279 int ret = 0;
1280
1281 if (cm_id->qp)
1282 queue = cm_id->qp->qp_context;
1283
1284 pr_debug("%s (%d): status %d id %p\n",
1285 rdma_event_msg(event->event), event->event,
1286 event->status, cm_id);
1287
1288 switch (event->event) {
1289 case RDMA_CM_EVENT_CONNECT_REQUEST:
1290 ret = nvmet_rdma_queue_connect(cm_id, event);
1291 break;
1292 case RDMA_CM_EVENT_ESTABLISHED:
1293 nvmet_rdma_queue_established(queue);
1294 break;
1295 case RDMA_CM_EVENT_ADDR_CHANGE:
1296 case RDMA_CM_EVENT_DISCONNECTED:
1297 case RDMA_CM_EVENT_DEVICE_REMOVAL:
1298 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1299 /*
1300 * We can get the device removal callback even for a
1301 * CM ID that we aren't actually using. In that case
1302 * the context pointer is NULL, so we shouldn't try
1303 * to disconnect a nonexistent queue. But we also
1304 * need to return 1 so that the core will destroy
1305 * its own ID. What a great API design..
1306 */
1307 if (queue)
1308 nvmet_rdma_queue_disconnect(queue);
1309 else
1310 ret = 1;
1311 break;
1312 case RDMA_CM_EVENT_REJECTED:
1313 case RDMA_CM_EVENT_UNREACHABLE:
1314 case RDMA_CM_EVENT_CONNECT_ERROR:
1315 nvmet_rdma_queue_connect_fail(cm_id, queue);
1316 break;
1317 default:
1318 pr_err("received unrecognized RDMA CM event %d\n",
1319 event->event);
1320 break;
1321 }
1322
1323 return ret;
1324}
1325
1326static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1327{
1328 struct nvmet_rdma_queue *queue;
1329
1330restart:
1331 mutex_lock(&nvmet_rdma_queue_mutex);
1332 list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1333 if (queue->nvme_sq.ctrl == ctrl) {
1334 list_del_init(&queue->queue_list);
1335 mutex_unlock(&nvmet_rdma_queue_mutex);
1336
1337 __nvmet_rdma_queue_disconnect(queue);
1338 goto restart;
1339 }
1340 }
1341 mutex_unlock(&nvmet_rdma_queue_mutex);
1342}
1343
1344static int nvmet_rdma_add_port(struct nvmet_port *port)
1345{
1346 struct rdma_cm_id *cm_id;
1347 struct sockaddr_in addr_in;
1348 u16 port_in;
1349 int ret;
1350
1351 switch (port->disc_addr.adrfam) {
1352 case NVMF_ADDR_FAMILY_IP4:
1353 break;
1354 default:
1355 pr_err("address family %d not supported\n",
1356 port->disc_addr.adrfam);
1357 return -EINVAL;
1358 }
1359
1360 ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in);
1361 if (ret)
1362 return ret;
1363
1364 addr_in.sin_family = AF_INET;
1365 addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr);
1366 addr_in.sin_port = htons(port_in);
1367
1368 cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1369 RDMA_PS_TCP, IB_QPT_RC);
1370 if (IS_ERR(cm_id)) {
1371 pr_err("CM ID creation failed\n");
1372 return PTR_ERR(cm_id);
1373 }
1374
1375 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in);
1376 if (ret) {
1377 pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret);
1378 goto out_destroy_id;
1379 }
1380
1381 ret = rdma_listen(cm_id, 128);
1382 if (ret) {
1383 pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret);
1384 goto out_destroy_id;
1385 }
1386
1387 pr_info("enabling port %d (%pISpc)\n",
1388 le16_to_cpu(port->disc_addr.portid), &addr_in);
1389 port->priv = cm_id;
1390 return 0;
1391
1392out_destroy_id:
1393 rdma_destroy_id(cm_id);
1394 return ret;
1395}
1396
1397static void nvmet_rdma_remove_port(struct nvmet_port *port)
1398{
1399 struct rdma_cm_id *cm_id = port->priv;
1400
1401 rdma_destroy_id(cm_id);
1402}
1403
1404static struct nvmet_fabrics_ops nvmet_rdma_ops = {
1405 .owner = THIS_MODULE,
1406 .type = NVMF_TRTYPE_RDMA,
1407 .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE,
1408 .msdbd = 1,
1409 .has_keyed_sgls = 1,
1410 .add_port = nvmet_rdma_add_port,
1411 .remove_port = nvmet_rdma_remove_port,
1412 .queue_response = nvmet_rdma_queue_response,
1413 .delete_ctrl = nvmet_rdma_delete_ctrl,
1414};
1415
1416static int __init nvmet_rdma_init(void)
1417{
1418 return nvmet_register_transport(&nvmet_rdma_ops);
1419}
1420
1421static void __exit nvmet_rdma_exit(void)
1422{
1423 struct nvmet_rdma_queue *queue;
1424
1425 nvmet_unregister_transport(&nvmet_rdma_ops);
1426
1427 flush_scheduled_work();
1428
1429 mutex_lock(&nvmet_rdma_queue_mutex);
1430 while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
1431 struct nvmet_rdma_queue, queue_list))) {
1432 list_del_init(&queue->queue_list);
1433
1434 mutex_unlock(&nvmet_rdma_queue_mutex);
1435 __nvmet_rdma_queue_disconnect(queue);
1436 mutex_lock(&nvmet_rdma_queue_mutex);
1437 }
1438 mutex_unlock(&nvmet_rdma_queue_mutex);
1439
1440 flush_scheduled_work();
1441 ida_destroy(&nvmet_rdma_queue_ida);
1442}
1443
1444module_init(nvmet_rdma_init);
1445module_exit(nvmet_rdma_exit);
1446
1447MODULE_LICENSE("GPL v2");
1448MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 31d544a87ba9..e2fa759bf2ad 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -45,7 +45,6 @@ int dasd_gendisk_alloc(struct dasd_block *block)
45 gdp->major = DASD_MAJOR; 45 gdp->major = DASD_MAJOR;
46 gdp->first_minor = base->devindex << DASD_PARTN_BITS; 46 gdp->first_minor = base->devindex << DASD_PARTN_BITS;
47 gdp->fops = &dasd_device_operations; 47 gdp->fops = &dasd_device_operations;
48 gdp->driverfs_dev = &base->cdev->dev;
49 48
50 /* 49 /*
51 * Set device name. 50 * Set device name.
@@ -76,7 +75,7 @@ int dasd_gendisk_alloc(struct dasd_block *block)
76 gdp->queue = block->request_queue; 75 gdp->queue = block->request_queue;
77 block->gdp = gdp; 76 block->gdp = gdp;
78 set_capacity(block->gdp, 0); 77 set_capacity(block->gdp, 0);
79 add_disk(block->gdp); 78 device_add_disk(&base->cdev->dev, block->gdp);
80 return 0; 79 return 0;
81} 80}
82 81
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 093e9e18e7e7..fac1b51ea0de 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -615,7 +615,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
615 dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL); 615 dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL);
616 dev_info->gd->queue = dev_info->dcssblk_queue; 616 dev_info->gd->queue = dev_info->dcssblk_queue;
617 dev_info->gd->private_data = dev_info; 617 dev_info->gd->private_data = dev_info;
618 dev_info->gd->driverfs_dev = &dev_info->dev;
619 blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); 618 blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
620 blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); 619 blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
621 queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); 620 queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue);
@@ -656,7 +655,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
656 goto put_dev; 655 goto put_dev;
657 656
658 get_device(&dev_info->dev); 657 get_device(&dev_info->dev);
659 add_disk(dev_info->gd); 658 device_add_disk(&dev_info->dev, dev_info->gd);
660 659
661 switch (dev_info->segment_type) { 660 switch (dev_info->segment_type) {
662 case SEG_TYPE_SR: 661 case SEG_TYPE_SR:
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index e6f54d3b8969..9f16ea6964ec 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -512,7 +512,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
512 goto out_queue; 512 goto out_queue;
513 513
514 rq->queuedata = scmdev; 514 rq->queuedata = scmdev;
515 bdev->gendisk->driverfs_dev = &scmdev->dev;
516 bdev->gendisk->private_data = scmdev; 515 bdev->gendisk->private_data = scmdev;
517 bdev->gendisk->fops = &scm_blk_devops; 516 bdev->gendisk->fops = &scm_blk_devops;
518 bdev->gendisk->queue = rq; 517 bdev->gendisk->queue = rq;
@@ -531,7 +530,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
531 530
532 /* 512 byte sectors */ 531 /* 512 byte sectors */
533 set_capacity(bdev->gendisk, scmdev->size >> 9); 532 set_capacity(bdev->gendisk, scmdev->size >> 9);
534 add_disk(bdev->gendisk); 533 device_add_disk(&scmdev->dev, bdev->gendisk);
535 return 0; 534 return 0;
536 535
537out_queue: 536out_queue:
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index daa4dc17f172..2f2a9910e30e 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1558,18 +1558,25 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
1558static struct request *_make_request(struct request_queue *q, bool has_write, 1558static struct request *_make_request(struct request_queue *q, bool has_write,
1559 struct _osd_io_info *oii, gfp_t flags) 1559 struct _osd_io_info *oii, gfp_t flags)
1560{ 1560{
1561 if (oii->bio) 1561 struct request *req;
1562 return blk_make_request(q, oii->bio, flags); 1562 struct bio *bio = oii->bio;
1563 else { 1563 int ret;
1564 struct request *req;
1565
1566 req = blk_get_request(q, has_write ? WRITE : READ, flags);
1567 if (IS_ERR(req))
1568 return req;
1569 1564
1570 blk_rq_set_block_pc(req); 1565 req = blk_get_request(q, has_write ? WRITE : READ, flags);
1566 if (IS_ERR(req))
1571 return req; 1567 return req;
1568 blk_rq_set_block_pc(req);
1569
1570 for_each_bio(bio) {
1571 struct bio *bounce_bio = bio;
1572
1573 blk_queue_bounce(req->q, &bounce_bio);
1574 ret = blk_rq_append_bio(req, bounce_bio);
1575 if (ret)
1576 return ERR_PTR(ret);
1572 } 1577 }
1578
1579 return req;
1573} 1580}
1574 1581
1575static int _init_blk_request(struct osd_request *or, 1582static int _init_blk_request(struct osd_request *or,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0609d6802d93..fa85d19b81a6 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2994,7 +2994,6 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
2994 2994
2995 sd_revalidate_disk(gd); 2995 sd_revalidate_disk(gd);
2996 2996
2997 gd->driverfs_dev = &sdp->sdev_gendev;
2998 gd->flags = GENHD_FL_EXT_DEVT; 2997 gd->flags = GENHD_FL_EXT_DEVT;
2999 if (sdp->removable) { 2998 if (sdp->removable) {
3000 gd->flags |= GENHD_FL_REMOVABLE; 2999 gd->flags |= GENHD_FL_REMOVABLE;
@@ -3002,7 +3001,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
3002 } 3001 }
3003 3002
3004 blk_pm_runtime_init(sdp->request_queue, dev); 3003 blk_pm_runtime_init(sdp->request_queue, dev);
3005 add_disk(gd); 3004 device_add_disk(dev, gd);
3006 if (sdkp->capacity) 3005 if (sdkp->capacity)
3007 sd_dif_config_host(sdkp); 3006 sd_dif_config_host(sdkp);
3008 3007
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 64c867405ad4..ed179348de80 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -713,7 +713,6 @@ static int sr_probe(struct device *dev)
713 get_capabilities(cd); 713 get_capabilities(cd);
714 sr_vendor_init(cd); 714 sr_vendor_init(cd);
715 715
716 disk->driverfs_dev = &sdev->sdev_gendev;
717 set_capacity(disk, cd->capacity); 716 set_capacity(disk, cd->capacity);
718 disk->private_data = &cd->driver; 717 disk->private_data = &cd->driver;
719 disk->queue = sdev->request_queue; 718 disk->queue = sdev->request_queue;
@@ -730,7 +729,7 @@ static int sr_probe(struct device *dev)
730 729
731 dev_set_drvdata(dev, cd); 730 dev_set_drvdata(dev, cd);
732 disk->flags |= GENHD_FL_REMOVABLE; 731 disk->flags |= GENHD_FL_REMOVABLE;
733 add_disk(disk); 732 device_add_disk(&sdev->sdev_gendev, disk);
734 733
735 sdev_printk(KERN_DEBUG, sdev, 734 sdev_printk(KERN_DEBUG, sdev,
736 "Attached scsi CD-ROM %s\n", cd->cdi.name); 735 "Attached scsi CD-ROM %s\n", cd->cdi.name);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 81564c87f24b..9125d9358dea 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -876,19 +876,19 @@ static inline struct bio *pscsi_get_bio(int nr_vecs)
876 876
877static sense_reason_t 877static sense_reason_t
878pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, 878pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
879 enum dma_data_direction data_direction, struct bio **hbio) 879 struct request *req)
880{ 880{
881 struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev); 881 struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev);
882 struct bio *bio = NULL, *tbio = NULL; 882 struct bio *bio = NULL;
883 struct page *page; 883 struct page *page;
884 struct scatterlist *sg; 884 struct scatterlist *sg;
885 u32 data_len = cmd->data_length, i, len, bytes, off; 885 u32 data_len = cmd->data_length, i, len, bytes, off;
886 int nr_pages = (cmd->data_length + sgl[0].offset + 886 int nr_pages = (cmd->data_length + sgl[0].offset +
887 PAGE_SIZE - 1) >> PAGE_SHIFT; 887 PAGE_SIZE - 1) >> PAGE_SHIFT;
888 int nr_vecs = 0, rc; 888 int nr_vecs = 0, rc;
889 int rw = (data_direction == DMA_TO_DEVICE); 889 int rw = (cmd->data_direction == DMA_TO_DEVICE);
890 890
891 *hbio = NULL; 891 BUG_ON(!cmd->data_length);
892 892
893 pr_debug("PSCSI: nr_pages: %d\n", nr_pages); 893 pr_debug("PSCSI: nr_pages: %d\n", nr_pages);
894 894
@@ -927,16 +927,6 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
927 pr_debug("PSCSI: Allocated bio: %p," 927 pr_debug("PSCSI: Allocated bio: %p,"
928 " dir: %s nr_vecs: %d\n", bio, 928 " dir: %s nr_vecs: %d\n", bio,
929 (rw) ? "rw" : "r", nr_vecs); 929 (rw) ? "rw" : "r", nr_vecs);
930 /*
931 * Set *hbio pointer to handle the case:
932 * nr_pages > BIO_MAX_PAGES, where additional
933 * bios need to be added to complete a given
934 * command.
935 */
936 if (!*hbio)
937 *hbio = tbio = bio;
938 else
939 tbio = tbio->bi_next = bio;
940 } 930 }
941 931
942 pr_debug("PSCSI: Calling bio_add_pc_page() i: %d" 932 pr_debug("PSCSI: Calling bio_add_pc_page() i: %d"
@@ -955,11 +945,16 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
955 pr_debug("PSCSI: Reached bio->bi_vcnt max:" 945 pr_debug("PSCSI: Reached bio->bi_vcnt max:"
956 " %d i: %d bio: %p, allocating another" 946 " %d i: %d bio: %p, allocating another"
957 " bio\n", bio->bi_vcnt, i, bio); 947 " bio\n", bio->bi_vcnt, i, bio);
948
949 rc = blk_rq_append_bio(req, bio);
950 if (rc) {
951 pr_err("pSCSI: failed to append bio\n");
952 goto fail;
953 }
954
958 /* 955 /*
959 * Clear the pointer so that another bio will 956 * Clear the pointer so that another bio will
960 * be allocated with pscsi_get_bio() above, the 957 * be allocated with pscsi_get_bio() above.
961 * current bio has already been set *tbio and
962 * bio->bi_next.
963 */ 958 */
964 bio = NULL; 959 bio = NULL;
965 } 960 }
@@ -968,13 +963,16 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
968 } 963 }
969 } 964 }
970 965
966 if (bio) {
967 rc = blk_rq_append_bio(req, bio);
968 if (rc) {
969 pr_err("pSCSI: failed to append bio\n");
970 goto fail;
971 }
972 }
973
971 return 0; 974 return 0;
972fail: 975fail:
973 while (*hbio) {
974 bio = *hbio;
975 *hbio = (*hbio)->bi_next;
976 bio_endio(bio);
977 }
978 return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; 976 return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
979} 977}
980 978
@@ -992,11 +990,9 @@ pscsi_execute_cmd(struct se_cmd *cmd)
992{ 990{
993 struct scatterlist *sgl = cmd->t_data_sg; 991 struct scatterlist *sgl = cmd->t_data_sg;
994 u32 sgl_nents = cmd->t_data_nents; 992 u32 sgl_nents = cmd->t_data_nents;
995 enum dma_data_direction data_direction = cmd->data_direction;
996 struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev); 993 struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev);
997 struct pscsi_plugin_task *pt; 994 struct pscsi_plugin_task *pt;
998 struct request *req; 995 struct request *req;
999 struct bio *hbio;
1000 sense_reason_t ret; 996 sense_reason_t ret;
1001 997
1002 /* 998 /*
@@ -1012,31 +1008,21 @@ pscsi_execute_cmd(struct se_cmd *cmd)
1012 memcpy(pt->pscsi_cdb, cmd->t_task_cdb, 1008 memcpy(pt->pscsi_cdb, cmd->t_task_cdb,
1013 scsi_command_size(cmd->t_task_cdb)); 1009 scsi_command_size(cmd->t_task_cdb));
1014 1010
1015 if (!sgl) { 1011 req = blk_get_request(pdv->pdv_sd->request_queue,
1016 req = blk_get_request(pdv->pdv_sd->request_queue, 1012 (cmd->data_direction == DMA_TO_DEVICE),
1017 (data_direction == DMA_TO_DEVICE), 1013 GFP_KERNEL);
1018 GFP_KERNEL); 1014 if (IS_ERR(req)) {
1019 if (IS_ERR(req)) { 1015 pr_err("PSCSI: blk_get_request() failed\n");
1020 pr_err("PSCSI: blk_get_request() failed\n"); 1016 ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
1021 ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; 1017 goto fail;
1022 goto fail; 1018 }
1023 }
1024 1019
1025 blk_rq_set_block_pc(req); 1020 blk_rq_set_block_pc(req);
1026 } else {
1027 BUG_ON(!cmd->data_length);
1028 1021
1029 ret = pscsi_map_sg(cmd, sgl, sgl_nents, data_direction, &hbio); 1022 if (sgl) {
1023 ret = pscsi_map_sg(cmd, sgl, sgl_nents, req);
1030 if (ret) 1024 if (ret)
1031 goto fail; 1025 goto fail_put_request;
1032
1033 req = blk_make_request(pdv->pdv_sd->request_queue, hbio,
1034 GFP_KERNEL);
1035 if (IS_ERR(req)) {
1036 pr_err("pSCSI: blk_make_request() failed\n");
1037 ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
1038 goto fail_free_bio;
1039 }
1040 } 1026 }
1041 1027
1042 req->end_io = pscsi_req_done; 1028 req->end_io = pscsi_req_done;
@@ -1057,13 +1043,8 @@ pscsi_execute_cmd(struct se_cmd *cmd)
1057 1043
1058 return 0; 1044 return 0;
1059 1045
1060fail_free_bio: 1046fail_put_request:
1061 while (hbio) { 1047 blk_put_request(req);
1062 struct bio *bio = hbio;
1063 hbio = hbio->bi_next;
1064 bio_endio(bio);
1065 }
1066 ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
1067fail: 1048fail:
1068 kfree(pt); 1049 kfree(pt);
1069 return ret; 1050 return ret;
diff --git a/fs/buffer.c b/fs/buffer.c
index e156a36463a1..b9fa1be75e69 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -153,7 +153,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
153 if (uptodate) { 153 if (uptodate) {
154 set_buffer_uptodate(bh); 154 set_buffer_uptodate(bh);
155 } else { 155 } else {
156 /* This happens, due to failed READA attempts. */ 156 /* This happens, due to failed read-ahead attempts. */
157 clear_buffer_uptodate(bh); 157 clear_buffer_uptodate(bh);
158 } 158 }
159 unlock_buffer(bh); 159 unlock_buffer(bh);
@@ -1395,7 +1395,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1395{ 1395{
1396 struct buffer_head *bh = __getblk(bdev, block, size); 1396 struct buffer_head *bh = __getblk(bdev, block, size);
1397 if (likely(bh)) { 1397 if (likely(bh)) {
1398 ll_rw_block(REQ_OP_READ, READA, 1, &bh); 1398 ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1399 brelse(bh); 1399 brelse(bh);
1400 } 1400 }
1401} 1401}
@@ -3053,14 +3053,14 @@ EXPORT_SYMBOL(submit_bh);
3053/** 3053/**
3054 * ll_rw_block: low-level access to block devices (DEPRECATED) 3054 * ll_rw_block: low-level access to block devices (DEPRECATED)
3055 * @op: whether to %READ or %WRITE 3055 * @op: whether to %READ or %WRITE
3056 * @op_flags: rq_flag_bits or %READA (readahead) 3056 * @op_flags: rq_flag_bits
3057 * @nr: number of &struct buffer_heads in the array 3057 * @nr: number of &struct buffer_heads in the array
3058 * @bhs: array of pointers to &struct buffer_head 3058 * @bhs: array of pointers to &struct buffer_head
3059 * 3059 *
3060 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 3060 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3061 * requests an I/O operation on them, either a %READ or a %WRITE. The third 3061 * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3062 * %READA option is described in the documentation for generic_make_request() 3062 * @op_flags contains flags modifying the detailed I/O behavior, most notably
3063 * which ll_rw_block() calls. 3063 * %REQ_RAHEAD.
3064 * 3064 *
3065 * This function drops any buffer that it cannot get a lock on (with the 3065 * This function drops any buffer that it cannot get a lock on (with the
3066 * BH_Lock state bit), any buffer that appears to be clean when doing a write 3066 * BH_Lock state bit), any buffer that appears to be clean when doing a write
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b6d600e91f39..124b4a3017b5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -159,7 +159,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
159 .sbi = sbi, 159 .sbi = sbi,
160 .type = META, 160 .type = META,
161 .op = REQ_OP_READ, 161 .op = REQ_OP_READ,
162 .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, 162 .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
163 .encrypted_page = NULL, 163 .encrypted_page = NULL,
164 }; 164 };
165 struct blk_plug plug; 165 struct blk_plug plug;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 3649d86bb431..f06ed73adf99 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -733,7 +733,8 @@ next_step:
733 733
734 start_bidx = start_bidx_of_node(nofs, inode); 734 start_bidx = start_bidx_of_node(nofs, inode);
735 data_page = get_read_data_page(inode, 735 data_page = get_read_data_page(inode,
736 start_bidx + ofs_in_node, READA, true); 736 start_bidx + ofs_in_node, REQ_RAHEAD,
737 true);
737 if (IS_ERR(data_page)) { 738 if (IS_ERR(data_page)) {
738 iput(inode); 739 iput(inode);
739 continue; 740 continue;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e53403987f6d..d1867698e601 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1119,7 +1119,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1119 if (!apage) 1119 if (!apage)
1120 return; 1120 return;
1121 1121
1122 err = read_node_page(apage, READA); 1122 err = read_node_page(apage, REQ_RAHEAD);
1123 f2fs_put_page(apage, err ? 1 : 0); 1123 f2fs_put_page(apage, err ? 1 : 0);
1124} 1124}
1125 1125
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fd6389cf0f14..6e2bec1cd289 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -285,7 +285,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
285 if (trylock_buffer(rabh)) { 285 if (trylock_buffer(rabh)) {
286 if (!buffer_uptodate(rabh)) { 286 if (!buffer_uptodate(rabh)) {
287 rabh->b_end_io = end_buffer_read_sync; 287 rabh->b_end_io = end_buffer_read_sync;
288 submit_bh(REQ_OP_READ, READA | REQ_META, rabh); 288 submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
289 rabh);
289 continue; 290 continue;
290 } 291 }
291 unlock_buffer(rabh); 292 unlock_buffer(rabh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4d68530d6636..fcb59b23f1e3 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1513,7 +1513,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
1513 continue; 1513 continue;
1514 } 1514 }
1515 bh->b_end_io = end_buffer_read_sync; 1515 bh->b_end_io = end_buffer_read_sync;
1516 submit_bh(REQ_OP_READ, READA | REQ_META, bh); 1516 submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh);
1517 continue; 1517 continue;
1518 } 1518 }
1519 brelse(bh); 1519 brelse(bh);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 052c1132e5b6..950b8be68e41 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -459,7 +459,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
459 bh = gfs2_getbuf(gl, dblock, CREATE); 459 bh = gfs2_getbuf(gl, dblock, CREATE);
460 460
461 if (!buffer_uptodate(bh) && !buffer_locked(bh)) 461 if (!buffer_uptodate(bh) && !buffer_locked(bh))
462 ll_rw_block(REQ_OP_READ, READA | REQ_META, 1, &bh); 462 ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
463 brelse(bh); 463 brelse(bh);
464 dblock++; 464 dblock++;
465 extlen--; 465 extlen--;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 64b29b592d86..4032d1e87c8f 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -551,7 +551,7 @@ static int search_by_key_reada(struct super_block *s,
551 if (!buffer_uptodate(bh[j])) { 551 if (!buffer_uptodate(bh[j])) {
552 if (depth == -1) 552 if (depth == -1)
553 depth = reiserfs_write_unlock_nested(s); 553 depth = reiserfs_write_unlock_nested(s);
554 ll_rw_block(REQ_OP_READ, READA, 1, bh + j); 554 ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j);
555 } 555 }
556 brelse(bh[j]); 556 brelse(bh[j]);
557 } 557 }
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 80c8a21daed9..aaec13c95253 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -113,7 +113,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
113 brelse(tmp); 113 brelse(tmp);
114 } 114 }
115 if (num) { 115 if (num) {
116 ll_rw_block(REQ_OP_READ, READA, num, bha); 116 ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
117 for (i = 0; i < num; i++) 117 for (i = 0; i < num; i++)
118 brelse(bha[i]); 118 brelse(bha[i]);
119 } 119 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 71f3e0b5b8ab..988d5352bdb8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -87,7 +87,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
87 brelse(tmp); 87 brelse(tmp);
88 } 88 }
89 if (num) { 89 if (num) {
90 ll_rw_block(REQ_OP_READ, READA, num, bha); 90 ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
91 for (i = 0; i < num; i++) 91 for (i = 0; i < num; i++)
92 brelse(bha[i]); 92 brelse(bha[i]);
93 } 93 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b7e1a00810f2..583c10810e32 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -663,8 +663,6 @@ static inline void bio_inc_remaining(struct bio *bio)
663 * and the bvec_slabs[]. 663 * and the bvec_slabs[].
664 */ 664 */
665#define BIO_POOL_SIZE 2 665#define BIO_POOL_SIZE 2
666#define BIOVEC_NR_POOLS 6
667#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
668 666
669struct bio_set { 667struct bio_set {
670 struct kmem_cache *bio_slab; 668 struct kmem_cache *bio_slab;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2498fdf3a503..e43bbffb5b7a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int,
96 unsigned int, unsigned int); 96 unsigned int, unsigned int);
97typedef void (exit_request_fn)(void *, struct request *, unsigned int, 97typedef void (exit_request_fn)(void *, struct request *, unsigned int,
98 unsigned int); 98 unsigned int);
99typedef int (reinit_request_fn)(void *, struct request *);
99 100
100typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, 101typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
101 bool); 102 bool);
@@ -145,6 +146,7 @@ struct blk_mq_ops {
145 */ 146 */
146 init_request_fn *init_request; 147 init_request_fn *init_request;
147 exit_request_fn *exit_request; 148 exit_request_fn *exit_request;
149 reinit_request_fn *reinit_request;
148}; 150};
149 151
150enum { 152enum {
@@ -196,6 +198,8 @@ enum {
196 198
197struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 199struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
198 unsigned int flags); 200 unsigned int flags);
201struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op,
202 unsigned int flags, unsigned int hctx_idx);
199struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); 203struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
200struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); 204struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
201 205
@@ -243,6 +247,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
243void blk_mq_freeze_queue(struct request_queue *q); 247void blk_mq_freeze_queue(struct request_queue *q);
244void blk_mq_unfreeze_queue(struct request_queue *q); 248void blk_mq_unfreeze_queue(struct request_queue *q);
245void blk_mq_freeze_queue_start(struct request_queue *q); 249void blk_mq_freeze_queue_start(struct request_queue *q);
250int blk_mq_reinit_tagset(struct blk_mq_tag_set *set);
246 251
247void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); 252void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
248 253
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b588e968dc01..f254eb264924 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -26,11 +26,11 @@ typedef void (bio_destructor_t) (struct bio *);
26struct bio { 26struct bio {
27 struct bio *bi_next; /* request queue link */ 27 struct bio *bi_next; /* request queue link */
28 struct block_device *bi_bdev; 28 struct block_device *bi_bdev;
29 unsigned int bi_flags; /* status, command, etc */
30 int bi_error; 29 int bi_error;
31 unsigned int bi_rw; /* bottom bits req flags, 30 unsigned int bi_rw; /* bottom bits req flags,
32 * top bits REQ_OP 31 * top bits REQ_OP
33 */ 32 */
33 unsigned short bi_flags; /* status, command, etc */
34 unsigned short bi_ioprio; 34 unsigned short bi_ioprio;
35 35
36 struct bvec_iter bi_iter; 36 struct bvec_iter bi_iter;
@@ -114,19 +114,25 @@ struct bio {
114 114
115/* 115/*
116 * Flags starting here get preserved by bio_reset() - this includes 116 * Flags starting here get preserved by bio_reset() - this includes
117 * BIO_POOL_IDX() 117 * BVEC_POOL_IDX()
118 */ 118 */
119#define BIO_RESET_BITS 13 119#define BIO_RESET_BITS 10
120#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */
121 120
122/* 121/*
123 * top 4 bits of bio flags indicate the pool this bio came from 122 * We support 6 different bvec pools, the last one is magic in that it
123 * is backed by a mempool.
124 */ 124 */
125#define BIO_POOL_BITS (4) 125#define BVEC_POOL_NR 6
126#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) 126#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
127#define BIO_POOL_OFFSET (32 - BIO_POOL_BITS) 127
128#define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) 128/*
129#define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) 129 * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
130 * 1 to the actual index so that 0 indicates that there are no bvecs to be
131 * freed.
132 */
133#define BVEC_POOL_BITS (4)
134#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
135#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
130 136
131#endif /* CONFIG_BLOCK */ 137#endif /* CONFIG_BLOCK */
132 138
@@ -143,7 +149,6 @@ enum rq_flag_bits {
143 __REQ_SYNC, /* request is sync (sync write or read) */ 149 __REQ_SYNC, /* request is sync (sync write or read) */
144 __REQ_META, /* metadata io request */ 150 __REQ_META, /* metadata io request */
145 __REQ_PRIO, /* boost priority in cfq */ 151 __REQ_PRIO, /* boost priority in cfq */
146 __REQ_SECURE, /* secure discard (used with REQ_OP_DISCARD) */
147 152
148 __REQ_NOIDLE, /* don't anticipate more IO after this one */ 153 __REQ_NOIDLE, /* don't anticipate more IO after this one */
149 __REQ_INTEGRITY, /* I/O includes block integrity payload */ 154 __REQ_INTEGRITY, /* I/O includes block integrity payload */
@@ -192,7 +197,7 @@ enum rq_flag_bits {
192 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) 197 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
193#define REQ_COMMON_MASK \ 198#define REQ_COMMON_MASK \
194 (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ 199 (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
195 REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) 200 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
196#define REQ_CLONE_MASK REQ_COMMON_MASK 201#define REQ_CLONE_MASK REQ_COMMON_MASK
197 202
198/* This mask is used for both bio and request merge checking */ 203/* This mask is used for both bio and request merge checking */
@@ -219,7 +224,6 @@ enum rq_flag_bits {
219#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) 224#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
220#define REQ_IO_STAT (1ULL << __REQ_IO_STAT) 225#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
221#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) 226#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
222#define REQ_SECURE (1ULL << __REQ_SECURE)
223#define REQ_PM (1ULL << __REQ_PM) 227#define REQ_PM (1ULL << __REQ_PM)
224#define REQ_HASHED (1ULL << __REQ_HASHED) 228#define REQ_HASHED (1ULL << __REQ_HASHED)
225#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) 229#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
@@ -228,6 +232,7 @@ enum req_op {
228 REQ_OP_READ, 232 REQ_OP_READ,
229 REQ_OP_WRITE, 233 REQ_OP_WRITE,
230 REQ_OP_DISCARD, /* request to discard sectors */ 234 REQ_OP_DISCARD, /* request to discard sectors */
235 REQ_OP_SECURE_ERASE, /* request to securely erase sectors */
231 REQ_OP_WRITE_SAME, /* write same block many times */ 236 REQ_OP_WRITE_SAME, /* write same block many times */
232 REQ_OP_FLUSH, /* request for cache flush */ 237 REQ_OP_FLUSH, /* request for cache flush */
233}; 238};
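
The blk_types.h change above shrinks bi_flags to 16 bits and stores the bvec pool index, offset by one, in its top four bits, so a field value of zero means there are no bvecs to free. A standalone sketch of that packing, using local placeholder names rather than the kernel's BVEC_POOL_* macros:

#include <stdint.h>
#include <stdio.h>

#define EX_POOL_BITS	4
#define EX_POOL_OFFSET	(16 - EX_POOL_BITS)

/* Store (pool index + 1) in the top four bits; 0 means "no bvec pool". */
static uint16_t ex_set_pool(uint16_t flags, unsigned int idx)
{
	flags &= (1u << EX_POOL_OFFSET) - 1;	/* clear the old pool field */
	return (uint16_t)(flags | ((idx + 1) << EX_POOL_OFFSET));
}

static unsigned int ex_get_pool(uint16_t flags)
{
	return flags >> EX_POOL_OFFSET;
}

int main(void)
{
	uint16_t flags = 0x01ff;		/* unrelated flag bits */

	flags = ex_set_pool(flags, 3);
	printf("pool field: %u (0 would mean no bvecs to free)\n",
	       ex_get_pool(flags));
	return 0;
}
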
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 48f05d768a53..c96db9c22d10 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -496,7 +496,7 @@ struct request_queue {
496#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ 496#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */
497#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ 497#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */
498#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ 498#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */
499#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 499#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */
500#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 500#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
501#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 501#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
502#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 502#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
@@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
593#define blk_queue_stackable(q) \ 593#define blk_queue_stackable(q) \
594 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) 594 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
595#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) 595#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
596#define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ 596#define blk_queue_secure_erase(q) \
597 test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) 597 (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
598#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) 598#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
599 599
600#define blk_noretry_request(rq) \ 600#define blk_noretry_request(rq) \
@@ -676,21 +676,6 @@ static inline bool rq_mergeable(struct request *rq)
676 return true; 676 return true;
677} 677}
678 678
679static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1,
680 unsigned int flags2, unsigned int op2)
681{
682 if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD))
683 return false;
684
685 if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE))
686 return false;
687
688 if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME))
689 return false;
690
691 return true;
692}
693
694static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) 679static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
695{ 680{
696 if (bio_data(a) == bio_data(b)) 681 if (bio_data(a) == bio_data(b))
@@ -804,8 +789,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
804extern void blk_put_request(struct request *); 789extern void blk_put_request(struct request *);
805extern void __blk_put_request(struct request_queue *, struct request *); 790extern void __blk_put_request(struct request_queue *, struct request *);
806extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 791extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
807extern struct request *blk_make_request(struct request_queue *, struct bio *,
808 gfp_t);
809extern void blk_rq_set_block_pc(struct request *); 792extern void blk_rq_set_block_pc(struct request *);
810extern void blk_requeue_request(struct request_queue *, struct request *); 793extern void blk_requeue_request(struct request_queue *, struct request *);
811extern void blk_add_request_payload(struct request *rq, struct page *page, 794extern void blk_add_request_payload(struct request *rq, struct page *page,
@@ -818,6 +801,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
818extern void blk_rq_unprep_clone(struct request *rq); 801extern void blk_rq_unprep_clone(struct request *rq);
819extern int blk_insert_cloned_request(struct request_queue *q, 802extern int blk_insert_cloned_request(struct request_queue *q,
820 struct request *rq); 803 struct request *rq);
804extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
821extern void blk_delay_queue(struct request_queue *, unsigned long); 805extern void blk_delay_queue(struct request_queue *, unsigned long);
822extern void blk_queue_split(struct request_queue *, struct bio **, 806extern void blk_queue_split(struct request_queue *, struct bio **,
823 struct bio_set *); 807 struct bio_set *);
@@ -1154,13 +1138,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
1154 return bqt->tag_index[tag]; 1138 return bqt->tag_index[tag];
1155} 1139}
1156 1140
1157#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */ 1141
1142#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */
1143#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */
1158 1144
1159extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); 1145extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
1160extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, 1146extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
1161 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 1147 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1162extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 1148extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
1163 sector_t nr_sects, gfp_t gfp_mask, int op_flags, 1149 sector_t nr_sects, gfp_t gfp_mask, int flags,
1164 struct bio **biop); 1150 struct bio **biop);
1165extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, 1151extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
1166 sector_t nr_sects, gfp_t gfp_mask, struct page *page); 1152 sector_t nr_sects, gfp_t gfp_mask, struct page *page);
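As a quick illustration of the reworked flags above, here is a minimal sketch of how a caller might request a secure discard through blkdev_issue_discard(); the helper name and the lack of error handling are illustrative, not part of this series.

#include <linux/blkdev.h>

/* Illustrative only: discard a range and require the device to perform a
 * secure erase, using the BLKDEV_DISCARD_SECURE flag declared above. */
static int example_secure_trim(struct block_device *bdev,
			       sector_t sector, sector_t nr_sects)
{
	return blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL,
				    BLKDEV_DISCARD_SECURE);
}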
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index d6b3c9943a2c..002611c85318 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -51,7 +51,7 @@
51#endif 51#endif
52 52
53extern const char *drbd_buildtag(void); 53extern const char *drbd_buildtag(void);
54#define REL_VERSION "8.4.6" 54#define REL_VERSION "8.4.7"
55#define API_VERSION 1 55#define API_VERSION 1
56#define PRO_VERSION_MIN 86 56#define PRO_VERSION_MIN 86
57#define PRO_VERSION_MAX 101 57#define PRO_VERSION_MAX 101
@@ -370,6 +370,14 @@ enum drbd_notification_type {
370 NOTIFY_FLAGS = NOTIFY_CONTINUES, 370 NOTIFY_FLAGS = NOTIFY_CONTINUES,
371}; 371};
372 372
373enum drbd_peer_state {
374 P_INCONSISTENT = 3,
375 P_OUTDATED = 4,
376 P_DOWN = 5,
377 P_PRIMARY = 6,
378 P_FENCING = 7,
379};
380
373#define UUID_JUST_CREATED ((__u64)4) 381#define UUID_JUST_CREATED ((__u64)4)
374 382
375enum write_ordering_e { 383enum write_ordering_e {
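The new drbd_peer_state codes are plain constants exposed to userspace via netlink; a purely illustrative sketch of decoding them follows (the helper below is not part of DRBD):

#include <linux/drbd.h>

/* Illustrative only: map the new peer-state codes to readable names. */
static const char *example_peer_state_name(enum drbd_peer_state s)
{
	switch (s) {
	case P_INCONSISTENT:	return "Inconsistent";
	case P_OUTDATED:	return "Outdated";
	case P_DOWN:		return "Down";
	case P_PRIMARY:		return "Primary";
	case P_FENCING:		return "Fencing";
	}
	return "Unknown";
}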
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 2d0e5ad5de9d..c934d3a96b5e 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -123,15 +123,16 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
123 __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) 123 __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
124 __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) 124 __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
125 __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) 125 __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
126 __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
127 __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
128 __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF)
126 129
127 __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) 130 __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
128 __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) 131 __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
129 __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) 132 __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
130 __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) 133 __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
131 __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
132 __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
133 /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */
134 __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) 134 __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
135 __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED)
135) 136)
136 137
137GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, 138GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 8ac8c5d9a3ad..ddac68422a96 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -126,8 +126,7 @@
126#define DRBD_RESYNC_RATE_DEF 250 126#define DRBD_RESYNC_RATE_DEF 250
127#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ 127#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
128 128
129 /* less than 7 would hit performance unnecessarily. */ 129#define DRBD_AL_EXTENTS_MIN 67
130#define DRBD_AL_EXTENTS_MIN 7
131 /* we use u16 as "slot number", (u16)~0 is "FREE". 130 /* we use u16 as "slot number", (u16)~0 is "FREE".
132 * If you use >= 292 kB on-disk ring buffer, 131 * If you use >= 292 kB on-disk ring buffer,
133 * this is the maximum you can use: */ 132 * this is the maximum you can use: */
@@ -210,6 +209,12 @@
210#define DRBD_MD_FLUSHES_DEF 1 209#define DRBD_MD_FLUSHES_DEF 1
211#define DRBD_TCP_CORK_DEF 1 210#define DRBD_TCP_CORK_DEF 1
212#define DRBD_AL_UPDATES_DEF 1 211#define DRBD_AL_UPDATES_DEF 1
212/* We used to ignore the discard_zeroes_data setting.
213 * To not change established (and expected) behaviour,
214 * by default assume that, for discard_zeroes_data=0,
215 * we can make that an effective discard_zeroes_data=1,
216 * if we only explicitly zero-out unaligned partial chunks. */
217#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1
213 218
214#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 219#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0
215#define DRBD_ALWAYS_ASBP_DEF 0 220#define DRBD_ALWAYS_ASBP_DEF 0
@@ -230,4 +235,10 @@
230#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX 235#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
231#define DRBD_SOCKET_CHECK_TIMEO_DEF 0 236#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
232#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1' 237#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
238
239#define DRBD_RS_DISCARD_GRANULARITY_MIN 0
240#define DRBD_RS_DISCARD_GRANULARITY_MAX (1<<20) /* 1MiByte */
241#define DRBD_RS_DISCARD_GRANULARITY_DEF 0 /* disabled by default */
242#define DRBD_RS_DISCARD_GRANULARITY_SCALE '1' /* bytes */
243
233#endif 244#endif
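The new DRBD_RS_DISCARD_GRANULARITY_* limits follow the usual DRBD MIN/MAX/DEF/SCALE pattern; below is a minimal sketch of the kind of range check an option parser might apply (an assumed helper, not the actual drbd netlink code):

#include <linux/kernel.h>
#include <linux/drbd_limits.h>

/* Illustrative only: clamp a requested resync discard granularity (bytes)
 * into the range declared above; 0 keeps the feature disabled (the default). */
static unsigned int example_clamp_rs_discard_granularity(unsigned int val)
{
	if (!val)
		return DRBD_RS_DISCARD_GRANULARITY_DEF;
	return clamp_t(unsigned int, val,
		       DRBD_RS_DISCARD_GRANULARITY_MIN,
		       DRBD_RS_DISCARD_GRANULARITY_MAX);
}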
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 183024525d40..dc488662ce0b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -178,9 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
178 * READ_SYNC A synchronous read. Device is not plugged, caller can 178 * READ_SYNC A synchronous read. Device is not plugged, caller can
179 * immediately wait on this read without caring about 179 * immediately wait on this read without caring about
180 * unplugging. 180 * unplugging.
181 * READA Used for read-ahead operations. Lower priority, and the
182 * block layer could (in theory) choose to ignore this
183 * request if it runs into resource problems.
184 * WRITE A normal async write. Device will be plugged. 181 * WRITE A normal async write. Device will be plugged.
185 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down 182 * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
186 * the hint that someone will be waiting on this IO 183 * the hint that someone will be waiting on this IO
@@ -195,11 +192,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
195 * 192 *
196 */ 193 */
197#define RW_MASK REQ_OP_WRITE 194#define RW_MASK REQ_OP_WRITE
198#define RWA_MASK REQ_RAHEAD
199 195
200#define READ REQ_OP_READ 196#define READ REQ_OP_READ
201#define WRITE RW_MASK 197#define WRITE REQ_OP_WRITE
202#define READA RWA_MASK
203 198
204#define READ_SYNC REQ_SYNC 199#define READ_SYNC REQ_SYNC
205#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) 200#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE)
@@ -2471,17 +2466,6 @@ static inline bool op_is_write(unsigned int op)
2471} 2466}
2472 2467
2473/* 2468/*
2474 * return READ, READA, or WRITE
2475 */
2476static inline int bio_rw(struct bio *bio)
2477{
2478 if (op_is_write(bio_op(bio)))
2479 return WRITE;
2480
2481 return bio->bi_rw & RWA_MASK;
2482}
2483
2484/*
2485 * return data direction, READ or WRITE 2469 * return data direction, READ or WRITE
2486 */ 2470 */
2487static inline int bio_data_dir(struct bio *bio) 2471static inline int bio_data_dir(struct bio *bio)
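With READA and bio_rw() gone, callers key off the op and the REQ_RAHEAD hint instead; a minimal sketch of the conversion pattern, assuming a bio whose bi_rw field still carries the request flags at this point in the series:

#include <linux/fs.h>
#include <linux/blk_types.h>

/* Illustrative only: the old 'bio_rw(bio) == READA' test becomes an explicit
 * check of the read-ahead hint, while the direction comes from bio_data_dir(). */
static bool example_is_readahead(struct bio *bio)
{
	return bio_data_dir(bio) == READ && (bio->bi_rw & REQ_RAHEAD);
}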
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 359a8e4bd44d..1dbf52f9c24b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -205,7 +205,6 @@ struct gendisk {
205 void *private_data; 205 void *private_data;
206 206
207 int flags; 207 int flags;
208 struct device *driverfs_dev; // FIXME: remove
209 struct kobject *slave_dir; 208 struct kobject *slave_dir;
210 209
211 struct timer_rand_state *random; 210 struct timer_rand_state *random;
@@ -414,7 +413,12 @@ static inline void free_part_info(struct hd_struct *part)
414extern void part_round_stats(int cpu, struct hd_struct *part); 413extern void part_round_stats(int cpu, struct hd_struct *part);
415 414
416/* block/genhd.c */ 415/* block/genhd.c */
417extern void add_disk(struct gendisk *disk); 416extern void device_add_disk(struct device *parent, struct gendisk *disk);
417static inline void add_disk(struct gendisk *disk)
418{
419 device_add_disk(NULL, disk);
420}
421
418extern void del_gendisk(struct gendisk *gp); 422extern void del_gendisk(struct gendisk *gp);
419extern struct gendisk *get_gendisk(dev_t dev, int *partno); 423extern struct gendisk *get_gendisk(dev_t dev, int *partno);
420extern struct block_device *bdget_disk(struct gendisk *disk, int partno); 424extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
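For drivers, the conversion away from ->driverfs_dev is mechanical; a hedged sketch of the new registration call (names illustrative):

#include <linux/genhd.h>

/* Illustrative only: instead of setting disk->driverfs_dev (field removed
 * above) and calling add_disk(), a driver passes the parent device directly. */
static void example_register_disk(struct device *parent, struct gendisk *disk)
{
	device_add_disk(parent, disk);
}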
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ef2c7d2e76c4..ba78b8306674 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -1,7 +1,9 @@
1#ifndef NVM_H 1#ifndef NVM_H
2#define NVM_H 2#define NVM_H
3 3
4#include <linux/blkdev.h>
4#include <linux/types.h> 5#include <linux/types.h>
6#include <uapi/linux/lightnvm.h>
5 7
6enum { 8enum {
7 NVM_IO_OK = 0, 9 NVM_IO_OK = 0,
@@ -269,24 +271,15 @@ struct nvm_lun {
269 int lun_id; 271 int lun_id;
270 int chnl_id; 272 int chnl_id;
271 273
272 /* It is up to the target to mark blocks as closed. If the target does
273 * not do it, all blocks are marked as open, and nr_open_blocks
274 * represents the number of blocks in use
275 */
276 unsigned int nr_open_blocks; /* Number of used, writable blocks */
277 unsigned int nr_closed_blocks; /* Number of used, read-only blocks */
278 unsigned int nr_free_blocks; /* Number of unused blocks */
279 unsigned int nr_bad_blocks; /* Number of bad blocks */
280
281 spinlock_t lock; 274 spinlock_t lock;
282 275
276 unsigned int nr_free_blocks; /* Number of unused blocks */
283 struct nvm_block *blocks; 277 struct nvm_block *blocks;
284}; 278};
285 279
286enum { 280enum {
287 NVM_BLK_ST_FREE = 0x1, /* Free block */ 281 NVM_BLK_ST_FREE = 0x1, /* Free block */
288 NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ 282 NVM_BLK_ST_TGT = 0x2, /* Block in use by target */
289 NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */
290 NVM_BLK_ST_BAD = 0x8, /* Bad block */ 283 NVM_BLK_ST_BAD = 0x8, /* Bad block */
291}; 284};
292 285
@@ -385,6 +378,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
385{ 378{
386 struct ppa_addr l; 379 struct ppa_addr l;
387 380
381 l.ppa = 0;
388 /* 382 /*
389 * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. 383 * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc.
390 */ 384 */
@@ -455,6 +449,8 @@ struct nvm_tgt_type {
455 struct list_head list; 449 struct list_head list;
456}; 450};
457 451
452extern struct nvm_tgt_type *nvm_find_target_type(const char *, int);
453
458extern int nvm_register_tgt_type(struct nvm_tgt_type *); 454extern int nvm_register_tgt_type(struct nvm_tgt_type *);
459extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); 455extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
460 456
@@ -463,6 +459,9 @@ extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
463 459
464typedef int (nvmm_register_fn)(struct nvm_dev *); 460typedef int (nvmm_register_fn)(struct nvm_dev *);
465typedef void (nvmm_unregister_fn)(struct nvm_dev *); 461typedef void (nvmm_unregister_fn)(struct nvm_dev *);
462
463typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
464typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
466typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, 465typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *,
467 struct nvm_lun *, unsigned long); 466 struct nvm_lun *, unsigned long);
468typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); 467typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *);
@@ -488,9 +487,10 @@ struct nvmm_type {
488 nvmm_register_fn *register_mgr; 487 nvmm_register_fn *register_mgr;
489 nvmm_unregister_fn *unregister_mgr; 488 nvmm_unregister_fn *unregister_mgr;
490 489
490 nvmm_create_tgt_fn *create_tgt;
491 nvmm_remove_tgt_fn *remove_tgt;
492
491 /* Block administration callbacks */ 493 /* Block administration callbacks */
492 nvmm_get_blk_fn *get_blk_unlocked;
493 nvmm_put_blk_fn *put_blk_unlocked;
494 nvmm_get_blk_fn *get_blk; 494 nvmm_get_blk_fn *get_blk;
495 nvmm_put_blk_fn *put_blk; 495 nvmm_put_blk_fn *put_blk;
496 nvmm_open_blk_fn *open_blk; 496 nvmm_open_blk_fn *open_blk;
@@ -520,10 +520,6 @@ struct nvmm_type {
520extern int nvm_register_mgr(struct nvmm_type *); 520extern int nvm_register_mgr(struct nvmm_type *);
521extern void nvm_unregister_mgr(struct nvmm_type *); 521extern void nvm_unregister_mgr(struct nvmm_type *);
522 522
523extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *,
524 struct nvm_lun *, unsigned long);
525extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *);
526
527extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, 523extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
528 unsigned long); 524 unsigned long);
529extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); 525extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
@@ -532,11 +528,13 @@ extern int nvm_register(struct request_queue *, char *,
532 struct nvm_dev_ops *); 528 struct nvm_dev_ops *);
533extern void nvm_unregister(char *); 529extern void nvm_unregister(char *);
534 530
531void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
532
535extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); 533extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
536extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); 534extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
537extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); 535extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
538extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, 536extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
539 struct ppa_addr *, int, int); 537 const struct ppa_addr *, int, int);
540extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); 538extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
541extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); 539extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int);
542extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); 540extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
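Target creation now goes through the media manager's create_tgt/remove_tgt hooks; a rough sketch of a create_tgt callback that resolves the requested target type via the newly exported nvm_find_target_type(). The second argument is assumed here to be a locking flag, and the ->tgttype field comes from the lightnvm uapi; neither is shown in this hunk.

#include <linux/errno.h>
#include <linux/lightnvm.h>

/* Illustrative only: look up the target type named in the ioctl request. */
static int example_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *c)
{
	struct nvm_tgt_type *tt = nvm_find_target_type(c->tgttype, 1);

	if (!tt)
		return -EINVAL;
	/* ... allocate the target instance and attach it to dev ... */
	return 0;
}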
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
new file mode 100644
index 000000000000..bf240a3cbf99
--- /dev/null
+++ b/include/linux/nvme-rdma.h
@@ -0,0 +1,71 @@
1/*
2 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef _LINUX_NVME_RDMA_H
15#define _LINUX_NVME_RDMA_H
16
17enum nvme_rdma_cm_fmt {
18 NVME_RDMA_CM_FMT_1_0 = 0x0,
19};
20
21enum nvme_rdma_cm_status {
22 NVME_RDMA_CM_INVALID_LEN = 0x01,
23 NVME_RDMA_CM_INVALID_RECFMT = 0x02,
24 NVME_RDMA_CM_INVALID_QID = 0x03,
25 NVME_RDMA_CM_INVALID_HSQSIZE = 0x04,
26 NVME_RDMA_CM_INVALID_HRQSIZE = 0x05,
27 NVME_RDMA_CM_NO_RSC = 0x06,
28 NVME_RDMA_CM_INVALID_IRD = 0x07,
29 NVME_RDMA_CM_INVALID_ORD = 0x08,
30};
31
32/**
33 * struct nvme_rdma_cm_req - rdma connect request
34 *
35 * @recfmt: format of the RDMA Private Data
36 * @qid: queue Identifier for the Admin or I/O Queue
37 * @hrqsize: host receive queue size to be created
38 * @hsqsize: host send queue size to be created
39 */
40struct nvme_rdma_cm_req {
41 __le16 recfmt;
42 __le16 qid;
43 __le16 hrqsize;
44 __le16 hsqsize;
45 u8 rsvd[24];
46};
47
48/**
49 * struct nvme_rdma_cm_rep - rdma connect reply
50 *
51 * @recfmt: format of the RDMA Private Data
52 * @crqsize: controller receive queue size
53 */
54struct nvme_rdma_cm_rep {
55 __le16 recfmt;
56 __le16 crqsize;
57 u8 rsvd[28];
58};
59
60/**
61 * struct nvme_rdma_cm_rej - rdma connect reject
62 *
63 * @recfmt: format of the RDMA Private Data
64 * @sts: error status for the associated connect request
65 */
66struct nvme_rdma_cm_rej {
67 __le16 recfmt;
68 __le16 sts;
69};
70
71#endif /* _LINUX_NVME_RDMA_H */
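The CM private-data structures are fixed-size little-endian blobs; below is a minimal sketch of filling a connect request for an I/O queue (the queue-sizing policy is illustrative, not the transport driver's actual logic):

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nvme-rdma.h>

/* Illustrative only: populate the RDMA CM private data for queue 'qid'. */
static void example_fill_cm_req(struct nvme_rdma_cm_req *req,
				u16 qid, u16 queue_size)
{
	memset(req, 0, sizeof(*req));
	req->recfmt  = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	req->qid     = cpu_to_le16(qid);
	req->hrqsize = cpu_to_le16(queue_size);
	req->hsqsize = cpu_to_le16(queue_size);
}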
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7d51b2904cb7..d8b37bab2887 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -16,6 +16,78 @@
16#define _LINUX_NVME_H 16#define _LINUX_NVME_H
17 17
18#include <linux/types.h> 18#include <linux/types.h>
19#include <linux/uuid.h>
20
21/* NQN names in command fields are specified with one fixed size */
22#define NVMF_NQN_FIELD_LEN 256
23
24/* However, the maximum length of a qualified name itself is another, smaller size */
25#define NVMF_NQN_SIZE 223
26
27#define NVMF_TRSVCID_SIZE 32
28#define NVMF_TRADDR_SIZE 256
29#define NVMF_TSAS_SIZE 256
30
31#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
32
33#define NVME_RDMA_IP_PORT 4420
34
35enum nvme_subsys_type {
36 NVME_NQN_DISC = 1, /* Discovery type target subsystem */
37 NVME_NQN_NVME = 2, /* NVME type target subsystem */
38};
39
40/* Address Family codes for Discovery Log Page entry ADRFAM field */
41enum {
42 NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */
43 NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */
44 NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */
45 NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */
46 NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */
47};
48
49/* Transport Type codes for Discovery Log Page entry TRTYPE field */
50enum {
51 NVMF_TRTYPE_RDMA = 1, /* RDMA */
52 NVMF_TRTYPE_FC = 2, /* Fibre Channel */
53 NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */
54 NVMF_TRTYPE_MAX,
55};
56
57/* Transport Requirements codes for Discovery Log Page entry TREQ field */
58enum {
59 NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */
60 NVMF_TREQ_REQUIRED = 1, /* Required */
61 NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */
62};
63
64/* RDMA QP Service Type codes for Discovery Log Page entry TSAS
65 * RDMA_QPTYPE field
66 */
67enum {
68 NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */
69 NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */
70};
71
72/* RDMA Provider Type codes for Discovery Log Page entry TSAS
73 * RDMA_PRTYPE field
74 */
75enum {
76 NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */
77 NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */
78 NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */
79 NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */
80 NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */
81};
82
83/* RDMA Connection Management Service Type codes for Discovery Log Page
84 * entry TSAS RDMA_CMS field
85 */
86enum {
87 NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets-based endpoint addressing */
88};
89
90#define NVMF_AQ_DEPTH 32
19 91
20enum { 92enum {
21 NVME_REG_CAP = 0x0000, /* Controller Capabilities */ 93 NVME_REG_CAP = 0x0000, /* Controller Capabilities */
@@ -50,6 +122,13 @@ enum {
50#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) 122#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2)
51#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) 123#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1)
52 124
125/*
126 * Submission and Completion Queue Entry Sizes for the NVM command set.
127 * (In bytes and specified as a power of two (2^n)).
128 */
129#define NVME_NVM_IOSQES 6
130#define NVME_NVM_IOCQES 4
131
53enum { 132enum {
54 NVME_CC_ENABLE = 1 << 0, 133 NVME_CC_ENABLE = 1 << 0,
55 NVME_CC_CSS_NVM = 0 << 4, 134 NVME_CC_CSS_NVM = 0 << 4,
@@ -61,8 +140,8 @@ enum {
61 NVME_CC_SHN_NORMAL = 1 << 14, 140 NVME_CC_SHN_NORMAL = 1 << 14,
62 NVME_CC_SHN_ABRUPT = 2 << 14, 141 NVME_CC_SHN_ABRUPT = 2 << 14,
63 NVME_CC_SHN_MASK = 3 << 14, 142 NVME_CC_SHN_MASK = 3 << 14,
64 NVME_CC_IOSQES = 6 << 16, 143 NVME_CC_IOSQES = NVME_NVM_IOSQES << 16,
65 NVME_CC_IOCQES = 4 << 20, 144 NVME_CC_IOCQES = NVME_NVM_IOCQES << 20,
66 NVME_CSTS_RDY = 1 << 0, 145 NVME_CSTS_RDY = 1 << 0,
67 NVME_CSTS_CFS = 1 << 1, 146 NVME_CSTS_CFS = 1 << 1,
68 NVME_CSTS_NSSRO = 1 << 4, 147 NVME_CSTS_NSSRO = 1 << 4,
@@ -107,7 +186,11 @@ struct nvme_id_ctrl {
107 __u8 mdts; 186 __u8 mdts;
108 __le16 cntlid; 187 __le16 cntlid;
109 __le32 ver; 188 __le32 ver;
110 __u8 rsvd84[172]; 189 __le32 rtd3r;
190 __le32 rtd3e;
191 __le32 oaes;
192 __le32 ctratt;
193 __u8 rsvd100[156];
111 __le16 oacs; 194 __le16 oacs;
112 __u8 acl; 195 __u8 acl;
113 __u8 aerl; 196 __u8 aerl;
@@ -119,10 +202,12 @@ struct nvme_id_ctrl {
119 __u8 apsta; 202 __u8 apsta;
120 __le16 wctemp; 203 __le16 wctemp;
121 __le16 cctemp; 204 __le16 cctemp;
122 __u8 rsvd270[242]; 205 __u8 rsvd270[50];
206 __le16 kas;
207 __u8 rsvd322[190];
123 __u8 sqes; 208 __u8 sqes;
124 __u8 cqes; 209 __u8 cqes;
125 __u8 rsvd514[2]; 210 __le16 maxcmd;
126 __le32 nn; 211 __le32 nn;
127 __le16 oncs; 212 __le16 oncs;
128 __le16 fuses; 213 __le16 fuses;
@@ -135,7 +220,15 @@ struct nvme_id_ctrl {
135 __le16 acwu; 220 __le16 acwu;
136 __u8 rsvd534[2]; 221 __u8 rsvd534[2];
137 __le32 sgls; 222 __le32 sgls;
138 __u8 rsvd540[1508]; 223 __u8 rsvd540[228];
224 char subnqn[256];
225 __u8 rsvd1024[768];
226 __le32 ioccsz;
227 __le32 iorcsz;
228 __le16 icdoff;
229 __u8 ctrattr;
230 __u8 msdbd;
231 __u8 rsvd1804[244];
139 struct nvme_id_power_state psd[32]; 232 struct nvme_id_power_state psd[32];
140 __u8 vs[1024]; 233 __u8 vs[1024];
141}; 234};
@@ -274,6 +367,12 @@ struct nvme_reservation_status {
274 } regctl_ds[]; 367 } regctl_ds[];
275}; 368};
276 369
370enum nvme_async_event_type {
371 NVME_AER_TYPE_ERROR = 0,
372 NVME_AER_TYPE_SMART = 1,
373 NVME_AER_TYPE_NOTICE = 2,
374};
375
277/* I/O commands */ 376/* I/O commands */
278 377
279enum nvme_opcode { 378enum nvme_opcode {
@@ -290,6 +389,84 @@ enum nvme_opcode {
290 nvme_cmd_resv_release = 0x15, 389 nvme_cmd_resv_release = 0x15,
291}; 390};
292 391
392/*
393 * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier
394 *
395 * @NVME_SGL_FMT_ADDRESS: absolute address of the data block
396 * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block
397 * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation
398 * request subtype
399 */
400enum {
401 NVME_SGL_FMT_ADDRESS = 0x00,
402 NVME_SGL_FMT_OFFSET = 0x01,
403 NVME_SGL_FMT_INVALIDATE = 0x0f,
404};
405
406/*
407 * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier
408 *
409 * For struct nvme_sgl_desc:
410 * @NVME_SGL_FMT_DATA_DESC: data block descriptor
411 * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor
412 * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor
413 *
414 * For struct nvme_keyed_sgl_desc:
415 * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor
416 */
417enum {
418 NVME_SGL_FMT_DATA_DESC = 0x00,
419 NVME_SGL_FMT_SEG_DESC = 0x02,
420 NVME_SGL_FMT_LAST_SEG_DESC = 0x03,
421 NVME_KEY_SGL_FMT_DATA_DESC = 0x04,
422};
423
424struct nvme_sgl_desc {
425 __le64 addr;
426 __le32 length;
427 __u8 rsvd[3];
428 __u8 type;
429};
430
431struct nvme_keyed_sgl_desc {
432 __le64 addr;
433 __u8 length[3];
434 __u8 key[4];
435 __u8 type;
436};
437
438union nvme_data_ptr {
439 struct {
440 __le64 prp1;
441 __le64 prp2;
442 };
443 struct nvme_sgl_desc sgl;
444 struct nvme_keyed_sgl_desc ksgl;
445};
446
447/*
448 * Lowest two bits of our flags field (FUSE field in the spec):
449 *
450 * @NVME_CMD_FUSE_FIRST: Fused Operation, first command
451 * @NVME_CMD_FUSE_SECOND: Fused Operation, second command
452 *
453 * Highest two bits in our flags field (PSDT field in the spec):
454 *
455 * @NVME_CMD_SGL_METABUF: Use SGLs for this transfer;
456 * if used, MPTR contains the address of a single physical buffer (byte aligned).
457 * @NVME_CMD_SGL_METASEG: Use SGLs for this transfer;
458 * if used, MPTR contains the address of an SGL segment containing
459 * exactly 1 SGL descriptor (qword aligned).
460 */
461enum {
462 NVME_CMD_FUSE_FIRST = (1 << 0),
463 NVME_CMD_FUSE_SECOND = (1 << 1),
464
465 NVME_CMD_SGL_METABUF = (1 << 6),
466 NVME_CMD_SGL_METASEG = (1 << 7),
467 NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG,
468};
469
293struct nvme_common_command { 470struct nvme_common_command {
294 __u8 opcode; 471 __u8 opcode;
295 __u8 flags; 472 __u8 flags;
@@ -297,8 +474,7 @@ struct nvme_common_command {
297 __le32 nsid; 474 __le32 nsid;
298 __le32 cdw2[2]; 475 __le32 cdw2[2];
299 __le64 metadata; 476 __le64 metadata;
300 __le64 prp1; 477 union nvme_data_ptr dptr;
301 __le64 prp2;
302 __le32 cdw10[6]; 478 __le32 cdw10[6];
303}; 479};
304 480
@@ -309,8 +485,7 @@ struct nvme_rw_command {
309 __le32 nsid; 485 __le32 nsid;
310 __u64 rsvd2; 486 __u64 rsvd2;
311 __le64 metadata; 487 __le64 metadata;
312 __le64 prp1; 488 union nvme_data_ptr dptr;
313 __le64 prp2;
314 __le64 slba; 489 __le64 slba;
315 __le16 length; 490 __le16 length;
316 __le16 control; 491 __le16 control;
@@ -350,8 +525,7 @@ struct nvme_dsm_cmd {
350 __u16 command_id; 525 __u16 command_id;
351 __le32 nsid; 526 __le32 nsid;
352 __u64 rsvd2[2]; 527 __u64 rsvd2[2];
353 __le64 prp1; 528 union nvme_data_ptr dptr;
354 __le64 prp2;
355 __le32 nr; 529 __le32 nr;
356 __le32 attributes; 530 __le32 attributes;
357 __u32 rsvd12[4]; 531 __u32 rsvd12[4];
@@ -384,6 +558,7 @@ enum nvme_admin_opcode {
384 nvme_admin_async_event = 0x0c, 558 nvme_admin_async_event = 0x0c,
385 nvme_admin_activate_fw = 0x10, 559 nvme_admin_activate_fw = 0x10,
386 nvme_admin_download_fw = 0x11, 560 nvme_admin_download_fw = 0x11,
561 nvme_admin_keep_alive = 0x18,
387 nvme_admin_format_nvm = 0x80, 562 nvme_admin_format_nvm = 0x80,
388 nvme_admin_security_send = 0x81, 563 nvme_admin_security_send = 0x81,
389 nvme_admin_security_recv = 0x82, 564 nvme_admin_security_recv = 0x82,
@@ -408,6 +583,7 @@ enum {
408 NVME_FEAT_WRITE_ATOMIC = 0x0a, 583 NVME_FEAT_WRITE_ATOMIC = 0x0a,
409 NVME_FEAT_ASYNC_EVENT = 0x0b, 584 NVME_FEAT_ASYNC_EVENT = 0x0b,
410 NVME_FEAT_AUTO_PST = 0x0c, 585 NVME_FEAT_AUTO_PST = 0x0c,
586 NVME_FEAT_KATO = 0x0f,
411 NVME_FEAT_SW_PROGRESS = 0x80, 587 NVME_FEAT_SW_PROGRESS = 0x80,
412 NVME_FEAT_HOST_ID = 0x81, 588 NVME_FEAT_HOST_ID = 0x81,
413 NVME_FEAT_RESV_MASK = 0x82, 589 NVME_FEAT_RESV_MASK = 0x82,
@@ -415,6 +591,7 @@ enum {
415 NVME_LOG_ERROR = 0x01, 591 NVME_LOG_ERROR = 0x01,
416 NVME_LOG_SMART = 0x02, 592 NVME_LOG_SMART = 0x02,
417 NVME_LOG_FW_SLOT = 0x03, 593 NVME_LOG_FW_SLOT = 0x03,
594 NVME_LOG_DISC = 0x70,
418 NVME_LOG_RESERVATION = 0x80, 595 NVME_LOG_RESERVATION = 0x80,
419 NVME_FWACT_REPL = (0 << 3), 596 NVME_FWACT_REPL = (0 << 3),
420 NVME_FWACT_REPL_ACTV = (1 << 3), 597 NVME_FWACT_REPL_ACTV = (1 << 3),
@@ -427,8 +604,7 @@ struct nvme_identify {
427 __u16 command_id; 604 __u16 command_id;
428 __le32 nsid; 605 __le32 nsid;
429 __u64 rsvd2[2]; 606 __u64 rsvd2[2];
430 __le64 prp1; 607 union nvme_data_ptr dptr;
431 __le64 prp2;
432 __le32 cns; 608 __le32 cns;
433 __u32 rsvd11[5]; 609 __u32 rsvd11[5];
434}; 610};
@@ -439,8 +615,7 @@ struct nvme_features {
439 __u16 command_id; 615 __u16 command_id;
440 __le32 nsid; 616 __le32 nsid;
441 __u64 rsvd2[2]; 617 __u64 rsvd2[2];
442 __le64 prp1; 618 union nvme_data_ptr dptr;
443 __le64 prp2;
444 __le32 fid; 619 __le32 fid;
445 __le32 dword11; 620 __le32 dword11;
446 __u32 rsvd12[4]; 621 __u32 rsvd12[4];
@@ -499,8 +674,7 @@ struct nvme_download_firmware {
499 __u8 flags; 674 __u8 flags;
500 __u16 command_id; 675 __u16 command_id;
501 __u32 rsvd1[5]; 676 __u32 rsvd1[5];
502 __le64 prp1; 677 union nvme_data_ptr dptr;
503 __le64 prp2;
504 __le32 numd; 678 __le32 numd;
505 __le32 offset; 679 __le32 offset;
506 __u32 rsvd12[4]; 680 __u32 rsvd12[4];
@@ -516,6 +690,143 @@ struct nvme_format_cmd {
516 __u32 rsvd11[5]; 690 __u32 rsvd11[5];
517}; 691};
518 692
693struct nvme_get_log_page_command {
694 __u8 opcode;
695 __u8 flags;
696 __u16 command_id;
697 __le32 nsid;
698 __u64 rsvd2[2];
699 union nvme_data_ptr dptr;
700 __u8 lid;
701 __u8 rsvd10;
702 __le16 numdl;
703 __le16 numdu;
704 __u16 rsvd11;
705 __le32 lpol;
706 __le32 lpou;
707 __u32 rsvd14[2];
708};
709
710/*
711 * Fabrics subcommands.
712 */
713enum nvmf_fabrics_opcode {
714 nvme_fabrics_command = 0x7f,
715};
716
717enum nvmf_capsule_command {
718 nvme_fabrics_type_property_set = 0x00,
719 nvme_fabrics_type_connect = 0x01,
720 nvme_fabrics_type_property_get = 0x04,
721};
722
723struct nvmf_common_command {
724 __u8 opcode;
725 __u8 resv1;
726 __u16 command_id;
727 __u8 fctype;
728 __u8 resv2[35];
729 __u8 ts[24];
730};
731
732/*
733 * The legal cntlid range a NVMe Target will provide.
734 * Note that cntlid of value 0 is considered illegal in the fabrics world.
735 * Devices based on earlier specs did not have the subsystem concept;
736 * therefore, those devices had their cntlid value set to 0 as a result.
737 */
738#define NVME_CNTLID_MIN 1
739#define NVME_CNTLID_MAX 0xffef
740#define NVME_CNTLID_DYNAMIC 0xffff
741
742#define MAX_DISC_LOGS 255
743
744/* Discovery log page entry */
745struct nvmf_disc_rsp_page_entry {
746 __u8 trtype;
747 __u8 adrfam;
748 __u8 nqntype;
749 __u8 treq;
750 __le16 portid;
751 __le16 cntlid;
752 __le16 asqsz;
753 __u8 resv8[22];
754 char trsvcid[NVMF_TRSVCID_SIZE];
755 __u8 resv64[192];
756 char subnqn[NVMF_NQN_FIELD_LEN];
757 char traddr[NVMF_TRADDR_SIZE];
758 union tsas {
759 char common[NVMF_TSAS_SIZE];
760 struct rdma {
761 __u8 qptype;
762 __u8 prtype;
763 __u8 cms;
764 __u8 resv3[5];
765 __u16 pkey;
766 __u8 resv10[246];
767 } rdma;
768 } tsas;
769};
770
771/* Discovery log page header */
772struct nvmf_disc_rsp_page_hdr {
773 __le64 genctr;
774 __le64 numrec;
775 __le16 recfmt;
776 __u8 resv14[1006];
777 struct nvmf_disc_rsp_page_entry entries[0];
778};
779
780struct nvmf_connect_command {
781 __u8 opcode;
782 __u8 resv1;
783 __u16 command_id;
784 __u8 fctype;
785 __u8 resv2[19];
786 union nvme_data_ptr dptr;
787 __le16 recfmt;
788 __le16 qid;
789 __le16 sqsize;
790 __u8 cattr;
791 __u8 resv3;
792 __le32 kato;
793 __u8 resv4[12];
794};
795
796struct nvmf_connect_data {
797 uuid_le hostid;
798 __le16 cntlid;
799 char resv4[238];
800 char subsysnqn[NVMF_NQN_FIELD_LEN];
801 char hostnqn[NVMF_NQN_FIELD_LEN];
802 char resv5[256];
803};
804
805struct nvmf_property_set_command {
806 __u8 opcode;
807 __u8 resv1;
808 __u16 command_id;
809 __u8 fctype;
810 __u8 resv2[35];
811 __u8 attrib;
812 __u8 resv3[3];
813 __le32 offset;
814 __le64 value;
815 __u8 resv4[8];
816};
817
818struct nvmf_property_get_command {
819 __u8 opcode;
820 __u8 resv1;
821 __u16 command_id;
822 __u8 fctype;
823 __u8 resv2[35];
824 __u8 attrib;
825 __u8 resv3[3];
826 __le32 offset;
827 __u8 resv4[16];
828};
829
519struct nvme_command { 830struct nvme_command {
520 union { 831 union {
521 struct nvme_common_command common; 832 struct nvme_common_command common;
@@ -529,10 +840,30 @@ struct nvme_command {
529 struct nvme_format_cmd format; 840 struct nvme_format_cmd format;
530 struct nvme_dsm_cmd dsm; 841 struct nvme_dsm_cmd dsm;
531 struct nvme_abort_cmd abort; 842 struct nvme_abort_cmd abort;
843 struct nvme_get_log_page_command get_log_page;
844 struct nvmf_common_command fabrics;
845 struct nvmf_connect_command connect;
846 struct nvmf_property_set_command prop_set;
847 struct nvmf_property_get_command prop_get;
532 }; 848 };
533}; 849};
534 850
851static inline bool nvme_is_write(struct nvme_command *cmd)
852{
853 /*
854 * What a mess...
855 *
856 * Why can't we simply have a Fabrics In and Fabrics out command?
857 */
858 if (unlikely(cmd->common.opcode == nvme_fabrics_command))
859 return cmd->fabrics.opcode & 1;
860 return cmd->common.opcode & 1;
861}
862
535enum { 863enum {
864 /*
865 * Generic Command Status:
866 */
536 NVME_SC_SUCCESS = 0x0, 867 NVME_SC_SUCCESS = 0x0,
537 NVME_SC_INVALID_OPCODE = 0x1, 868 NVME_SC_INVALID_OPCODE = 0x1,
538 NVME_SC_INVALID_FIELD = 0x2, 869 NVME_SC_INVALID_FIELD = 0x2,
@@ -551,10 +882,18 @@ enum {
551 NVME_SC_SGL_INVALID_DATA = 0xf, 882 NVME_SC_SGL_INVALID_DATA = 0xf,
552 NVME_SC_SGL_INVALID_METADATA = 0x10, 883 NVME_SC_SGL_INVALID_METADATA = 0x10,
553 NVME_SC_SGL_INVALID_TYPE = 0x11, 884 NVME_SC_SGL_INVALID_TYPE = 0x11,
885
886 NVME_SC_SGL_INVALID_OFFSET = 0x16,
887 NVME_SC_SGL_INVALID_SUBTYPE = 0x17,
888
554 NVME_SC_LBA_RANGE = 0x80, 889 NVME_SC_LBA_RANGE = 0x80,
555 NVME_SC_CAP_EXCEEDED = 0x81, 890 NVME_SC_CAP_EXCEEDED = 0x81,
556 NVME_SC_NS_NOT_READY = 0x82, 891 NVME_SC_NS_NOT_READY = 0x82,
557 NVME_SC_RESERVATION_CONFLICT = 0x83, 892 NVME_SC_RESERVATION_CONFLICT = 0x83,
893
894 /*
895 * Command Specific Status:
896 */
558 NVME_SC_CQ_INVALID = 0x100, 897 NVME_SC_CQ_INVALID = 0x100,
559 NVME_SC_QID_INVALID = 0x101, 898 NVME_SC_QID_INVALID = 0x101,
560 NVME_SC_QUEUE_SIZE = 0x102, 899 NVME_SC_QUEUE_SIZE = 0x102,
@@ -572,9 +911,29 @@ enum {
572 NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, 911 NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e,
573 NVME_SC_FEATURE_NOT_PER_NS = 0x10f, 912 NVME_SC_FEATURE_NOT_PER_NS = 0x10f,
574 NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, 913 NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110,
914
915 /*
916 * I/O Command Set Specific - NVM commands:
917 */
575 NVME_SC_BAD_ATTRIBUTES = 0x180, 918 NVME_SC_BAD_ATTRIBUTES = 0x180,
576 NVME_SC_INVALID_PI = 0x181, 919 NVME_SC_INVALID_PI = 0x181,
577 NVME_SC_READ_ONLY = 0x182, 920 NVME_SC_READ_ONLY = 0x182,
921
922 /*
923 * I/O Command Set Specific - Fabrics commands:
924 */
925 NVME_SC_CONNECT_FORMAT = 0x180,
926 NVME_SC_CONNECT_CTRL_BUSY = 0x181,
927 NVME_SC_CONNECT_INVALID_PARAM = 0x182,
928 NVME_SC_CONNECT_RESTART_DISC = 0x183,
929 NVME_SC_CONNECT_INVALID_HOST = 0x184,
930
931 NVME_SC_DISCOVERY_RESTART = 0x190,
932 NVME_SC_AUTH_REQUIRED = 0x191,
933
934 /*
935 * Media and Data Integrity Errors:
936 */
578 NVME_SC_WRITE_FAULT = 0x280, 937 NVME_SC_WRITE_FAULT = 0x280,
579 NVME_SC_READ_ERROR = 0x281, 938 NVME_SC_READ_ERROR = 0x281,
580 NVME_SC_GUARD_CHECK = 0x282, 939 NVME_SC_GUARD_CHECK = 0x282,
@@ -582,12 +941,19 @@ enum {
582 NVME_SC_REFTAG_CHECK = 0x284, 941 NVME_SC_REFTAG_CHECK = 0x284,
583 NVME_SC_COMPARE_FAILED = 0x285, 942 NVME_SC_COMPARE_FAILED = 0x285,
584 NVME_SC_ACCESS_DENIED = 0x286, 943 NVME_SC_ACCESS_DENIED = 0x286,
944
585 NVME_SC_DNR = 0x4000, 945 NVME_SC_DNR = 0x4000,
586}; 946};
587 947
588struct nvme_completion { 948struct nvme_completion {
589 __le32 result; /* Used by admin commands to return data */ 949 /*
590 __u32 rsvd; 950 * Used by Admin and Fabrics commands to return data:
951 */
952 union {
953 __le16 result16;
954 __le32 result;
955 __le64 result64;
956 };
591 __le16 sq_head; /* how much of this queue may be reclaimed */ 957 __le16 sq_head; /* how much of this queue may be reclaimed */
592 __le16 sq_id; /* submission queue that generated this entry */ 958 __le16 sq_id; /* submission queue that generated this entry */
593 __u16 command_id; /* of the command which completed */ 959 __u16 command_id; /* of the command which completed */
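As an example of the new command layouts, here is a hedged sketch of building a Get Log Page command for the discovery log defined above; the NUMD split is the usual NVMe 0's-based dword count, and the helper name is illustrative:

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nvme.h>

/* Illustrative only: request 'size' bytes of the discovery log page. */
static void example_init_disc_log_cmd(struct nvme_command *cmd, size_t size)
{
	u32 numd = (size / 4) - 1;	/* 0's-based dword count */

	memset(cmd, 0, sizeof(*cmd));
	cmd->get_log_page.opcode = nvme_admin_get_log_page;
	cmd->get_log_page.lid    = NVME_LOG_DISC;
	cmd->get_log_page.numdl  = cpu_to_le16(numd & 0xffff);
	cmd->get_log_page.numdu  = cpu_to_le16(numd >> 16);
}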
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 878963a1f058..ff95fd02116f 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
55 { IPU, "IN-PLACE" }, \ 55 { IPU, "IN-PLACE" }, \
56 { OPU, "OUT-OF-PLACE" }) 56 { OPU, "OUT-OF-PLACE" })
57 57
58#define F2FS_BIO_FLAG_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) 58#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA))
59#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) 59#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO))
60 60
61#define show_bio_type(op, op_flags) show_bio_op(op), \ 61#define show_bio_type(op, op_flags) show_bio_op(op), \
@@ -68,7 +68,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
68 68
69#define show_bio_op_flags(flags) \ 69#define show_bio_op_flags(flags) \
70 __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ 70 __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \
71 { READA, "READAHEAD" }, \ 71 { REQ_RAHEAD, "READAHEAD" }, \
72 { READ_SYNC, "READ_SYNC" }, \ 72 { READ_SYNC, "READ_SYNC" }, \
73 { WRITE_SYNC, "WRITE_SYNC" }, \ 73 { WRITE_SYNC, "WRITE_SYNC" }, \
74 { WRITE_FLUSH, "WRITE_FLUSH" }, \ 74 { WRITE_FLUSH, "WRITE_FLUSH" }, \
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bedb84d168d1..fb345cd11883 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1792,6 +1792,10 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
1792 case REQ_OP_DISCARD: 1792 case REQ_OP_DISCARD:
1793 rwbs[i++] = 'D'; 1793 rwbs[i++] = 'D';
1794 break; 1794 break;
1795 case REQ_OP_SECURE_ERASE:
1796 rwbs[i++] = 'D';
1797 rwbs[i++] = 'E';
1798 break;
1795 case REQ_OP_FLUSH: 1799 case REQ_OP_FLUSH:
1796 rwbs[i++] = 'F'; 1800 rwbs[i++] = 'F';
1797 break; 1801 break;
@@ -1810,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
1810 rwbs[i++] = 'S'; 1814 rwbs[i++] = 'S';
1811 if (rw & REQ_META) 1815 if (rw & REQ_META)
1812 rwbs[i++] = 'M'; 1816 rwbs[i++] = 'M';
1813 if (rw & REQ_SECURE)
1814 rwbs[i++] = 'E';
1815 1817
1816 rwbs[i] = '\0'; 1818 rwbs[i] = '\0';
1817} 1819}
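With the new case above, a secure erase is rendered in the trace RWBS field as "DE"; a minimal usage sketch against the signature shown in the hunk header:

#include <linux/blktrace_api.h>

/* Illustrative only: decode a 4 KiB secure erase into the RWBS string. */
static void example_show_rwbs(void)
{
	char rwbs[8];

	blk_fill_rwbs(rwbs, REQ_OP_SECURE_ERASE, 0, 4096);
	/* rwbs now contains "DE" */
}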