author     Linus Torvalds <torvalds@linux-foundation.org>  2017-11-14 18:32:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-11-14 18:32:19 -0500
commit     e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
tree       b97a90170c45211bcc437761653aa8016c34afcd
parent     abc36be236358162202e86ad88616ff95a755101 (diff)
parent     a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the hole in blk/scsi-mq queue
     quiescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and the ability to move bio chains around.

   - NVMe
       - Support for native multipath for NVMe (Christoph).
       - Userspace notifications for AENs (Keith).
       - Command side-effects support (Keith).
       - SGL support (Chaitanya Kulkarni).
       - FC fixes and improvements (James Smart).
       - Lots of fixes and tweaks (Various).

   - bcache
       - New maintainer (Michael Lyle).
       - Writeback control improvements (Michael).
       - Various fixes (Coly, Elena, Eric, Liang, et al).

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph).

   - Writeback series that fixes the much discussed hundreds of
     millions of sync-all units. This goes all the way, as discussed
     previously (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tuple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and on
     mq (where it's called mq-deadline) (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enables us to treat flush requests on
     blk-mq like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core
     and driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...
-rw-r--r--  Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5
-rw-r--r--  Documentation/block/biodoc.txt | 11
-rw-r--r--  Documentation/block/null_blk.txt | 19
-rw-r--r--  MAINTAINERS | 5
-rw-r--r--  arch/xtensa/platforms/iss/simdisk.c | 4
-rw-r--r--  block/bfq-iosched.c | 225
-rw-r--r--  block/bio-integrity.c | 7
-rw-r--r--  block/bio.c | 40
-rw-r--r--  block/blk-cgroup.c | 9
-rw-r--r--  block/blk-core.c | 274
-rw-r--r--  block/blk-flush.c | 37
-rw-r--r--  block/blk-lib.c | 108
-rw-r--r--  block/blk-mq-debugfs.c | 3
-rw-r--r--  block/blk-mq-sched.c | 203
-rw-r--r--  block/blk-mq-tag.c | 11
-rw-r--r--  block/blk-mq-tag.h | 7
-rw-r--r--  block/blk-mq.c | 422
-rw-r--r--  block/blk-mq.h | 60
-rw-r--r--  block/blk-settings.c | 2
-rw-r--r--  block/blk-stat.c | 45
-rw-r--r--  block/blk-throttle.c | 12
-rw-r--r--  block/blk-timeout.c | 5
-rw-r--r--  block/blk-wbt.c | 2
-rw-r--r--  block/blk.h | 46
-rw-r--r--  block/bsg.c | 18
-rw-r--r--  block/elevator.c | 67
-rw-r--r--  block/genhd.c | 70
-rw-r--r--  block/ioctl.c | 19
-rw-r--r--  block/kyber-iosched.c | 12
-rw-r--r--  block/mq-deadline.c | 1
-rw-r--r--  block/scsi_ioctl.c | 8
-rw-r--r--  drivers/block/Kconfig | 5
-rw-r--r--  drivers/block/brd.c | 1
-rw-r--r--  drivers/block/cryptoloop.c | 2
-rw-r--r--  drivers/block/loop.c | 13
-rw-r--r--  drivers/block/loop.h | 1
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 7
-rw-r--r--  drivers/block/nbd.c | 26
-rw-r--r--  drivers/block/null_blk.c | 10
-rw-r--r--  drivers/block/paride/Kconfig | 1
-rw-r--r--  drivers/block/skd_main.c | 3
-rw-r--r--  drivers/cdrom/Makefile | 15
-rw-r--r--  drivers/ide/Kconfig | 2
-rw-r--r--  drivers/ide/ide-atapi.c | 6
-rw-r--r--  drivers/ide/ide-pm.c | 4
-rw-r--r--  drivers/lightnvm/Kconfig | 3
-rw-r--r--  drivers/lightnvm/core.c | 176
-rw-r--r--  drivers/lightnvm/pblk-cache.c | 24
-rw-r--r--  drivers/lightnvm/pblk-core.c | 512
-rw-r--r--  drivers/lightnvm/pblk-gc.c | 289
-rw-r--r--  drivers/lightnvm/pblk-init.c | 197
-rw-r--r--  drivers/lightnvm/pblk-map.c | 28
-rw-r--r--  drivers/lightnvm/pblk-rb.c | 30
-rw-r--r--  drivers/lightnvm/pblk-read.c | 274
-rw-r--r--  drivers/lightnvm/pblk-recovery.c | 129
-rw-r--r--  drivers/lightnvm/pblk-rl.c | 43
-rw-r--r--  drivers/lightnvm/pblk-sysfs.c | 2
-rw-r--r--  drivers/lightnvm/pblk-write.c | 229
-rw-r--r--  drivers/lightnvm/pblk.h | 132
-rw-r--r--  drivers/md/bcache/alloc.c | 15
-rw-r--r--  drivers/md/bcache/bcache.h | 19
-rw-r--r--  drivers/md/bcache/btree.c | 17
-rw-r--r--  drivers/md/bcache/btree.h | 2
-rw-r--r--  drivers/md/bcache/closure.h | 6
-rw-r--r--  drivers/md/bcache/request.c | 36
-rw-r--r--  drivers/md/bcache/super.c | 52
-rw-r--r--  drivers/md/bcache/sysfs.c | 28
-rw-r--r--  drivers/md/bcache/util.c | 10
-rw-r--r--  drivers/md/bcache/util.h | 4
-rw-r--r--  drivers/md/bcache/writeback.c | 117
-rw-r--r--  drivers/md/bcache/writeback.h | 6
-rw-r--r--  drivers/md/bitmap.c | 2
-rw-r--r--  drivers/md/dm-rq.c | 2
-rw-r--r--  drivers/md/dm-table.c | 15
-rw-r--r--  drivers/md/dm.c | 11
-rw-r--r--  drivers/nvme/Kconfig | 4
-rw-r--r--  drivers/nvme/host/Kconfig | 9
-rw-r--r--  drivers/nvme/host/Makefile | 1
-rw-r--r--  drivers/nvme/host/core.c | 1301
-rw-r--r--  drivers/nvme/host/fabrics.c | 16
-rw-r--r--  drivers/nvme/host/fabrics.h | 14
-rw-r--r--  drivers/nvme/host/fc.c | 793
-rw-r--r--  drivers/nvme/host/lightnvm.c | 86
-rw-r--r--  drivers/nvme/host/multipath.c | 291
-rw-r--r--  drivers/nvme/host/nvme.h | 169
-rw-r--r--  drivers/nvme/host/pci.c | 243
-rw-r--r--  drivers/nvme/host/rdma.c | 246
-rw-r--r--  drivers/nvme/target/admin-cmd.c | 21
-rw-r--r--  drivers/nvme/target/core.c | 23
-rw-r--r--  drivers/nvme/target/fc.c | 48
-rw-r--r--  drivers/nvme/target/io-cmd.c | 20
-rw-r--r--  drivers/nvme/target/loop.c | 66
-rw-r--r--  drivers/nvme/target/nvmet.h | 6
-rw-r--r--  drivers/nvme/target/rdma.c | 16
-rw-r--r--  drivers/scsi/Kconfig | 3
-rw-r--r--  drivers/scsi/lpfc/lpfc_attr.c | 5
-rw-r--r--  drivers/scsi/scsi_lib.c | 99
-rw-r--r--  drivers/scsi/sg.c | 2
-rw-r--r--  fs/block_dev.c | 20
-rw-r--r--  fs/buffer.c | 70
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/fs-writeback.c | 153
-rw-r--r--  fs/iomap.c | 2
-rw-r--r--  fs/ntfs/aops.c | 2
-rw-r--r--  fs/ntfs/mft.c | 2
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  include/linux/backing-dev-defs.h | 24
-rw-r--r--  include/linux/backing-dev.h | 4
-rw-r--r--  include/linux/bio.h | 25
-rw-r--r--  include/linux/blk-cgroup.h | 25
-rw-r--r--  include/linux/blk-mq.h | 40
-rw-r--r--  include/linux/blk_types.h | 16
-rw-r--r--  include/linux/blkdev.h | 36
-rw-r--r--  include/linux/buffer_head.h | 2
-rw-r--r--  include/linux/elevator.h | 1
-rw-r--r--  include/linux/genhd.h | 4
-rw-r--r--  include/linux/kthread.h | 11
-rw-r--r--  include/linux/lightnvm.h | 11
-rw-r--r--  include/linux/nvme-fc-driver.h | 15
-rw-r--r--  include/linux/nvme.h | 30
-rw-r--r--  include/linux/sbitmap.h | 64
-rw-r--r--  include/linux/writeback.h | 30
-rw-r--r--  include/scsi/scsi_device.h | 1
-rw-r--r--  include/trace/events/writeback.h | 1
-rw-r--r--  kernel/kthread.c | 66
-rw-r--r--  kernel/sysctl.c | 5
-rw-r--r--  kernel/trace/blktrace.c | 90
-rw-r--r--  mm/backing-dev.c | 20
-rw-r--r--  mm/page-writeback.c | 36
-rw-r--r--  mm/page_io.c | 2
-rw-r--r--  mm/vmscan.c | 2
131 files changed, 5470 insertions, 3089 deletions
diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
deleted file mode 100644
index b0b0eeb20fe3..000000000000
--- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
+++ /dev/null
@@ -1,5 +0,0 @@
1What: /proc/sys/vm/nr_pdflush_threads
2Date: June 2012
3Contact: Wanpeng Li <liwp@linux.vnet.ibm.com>
4Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush
5 exported in /proc/sys/vm/ should be removed.
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 9490f2845f06..86927029a52d 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in
216which case a virtual mapping of the page is required. For SCSI it is also 216which case a virtual mapping of the page is required. For SCSI it is also
217done in some scenarios where the low level driver cannot be trusted to 217done in some scenarios where the low level driver cannot be trusted to
218handle a single sg entry correctly. The driver is expected to perform the 218handle a single sg entry correctly. The driver is expected to perform the
219kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq 219kmaps as needed on such occasions as appropriate. A driver could also use
220routines as appropriate. A driver could also use the blk_queue_bounce() 220the blk_queue_bounce() routine on its own to bounce highmem i/o to low
221routine on its own to bounce highmem i/o to low memory for specific requests 221memory for specific requests if so desired.
222if so desired.
223 222
224iii. The i/o scheduler algorithm itself can be replaced/set as appropriate 223iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
225 224
@@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For
1137PIO drivers (or drivers that need to revert to PIO transfer once in a 1136PIO drivers (or drivers that need to revert to PIO transfer once in a
1138while (IDE for example)), where the CPU is doing the actual data 1137while (IDE for example)), where the CPU is doing the actual data
1139transfer a virtual mapping is needed. If the driver supports highmem I/O, 1138transfer a virtual mapping is needed. If the driver supports highmem I/O,
1140(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to 1139(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
1141temporarily map a bio into the virtual address space. 1140a bio into the virtual address space.
1142 1141
1143 1142
11448. Prior/Related/Impacted patches 11438. Prior/Related/Impacted patches
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 3140dbd860d8..733927a7b501 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -38,7 +38,7 @@ gb=[Size in GB]: Default: 250GB
38bs=[Block size (in bytes)]: Default: 512 bytes 38bs=[Block size (in bytes)]: Default: 512 bytes
39 The block size reported to the system. 39 The block size reported to the system.
40 40
41nr_devices=[Number of devices]: Default: 2 41nr_devices=[Number of devices]: Default: 1
42 Number of block devices instantiated. They are instantiated as /dev/nullb0, 42 Number of block devices instantiated. They are instantiated as /dev/nullb0,
43 etc. 43 etc.
44 44
@@ -52,13 +52,13 @@ irqmode=[0-2]: Default: 1-Soft-irq
52 2: Timer: Waits a specific period (completion_nsec) for each IO before 52 2: Timer: Waits a specific period (completion_nsec) for each IO before
53 completion. 53 completion.
54 54
55completion_nsec=[ns]: Default: 10.000ns 55completion_nsec=[ns]: Default: 10,000ns
56 Combined with irqmode=2 (timer). The time each completion event must wait. 56 Combined with irqmode=2 (timer). The time each completion event must wait.
57 57
58submit_queues=[0..nr_cpus]: 58submit_queues=[1..nr_cpus]:
59 The number of submission queues attached to the device driver. If unset, it 59 The number of submission queues attached to the device driver. If unset, it
60 defaults to 1 on single-queue and bio-based instances. For multi-queue, 60 defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
61 it is ignored when use_per_node_hctx module parameter is 1. 61 parameter is 1.
62 62
63hw_queue_depth=[0..qdepth]: Default: 64 63hw_queue_depth=[0..qdepth]: Default: 64
64 The hardware queue depth of the device. 64 The hardware queue depth of the device.
@@ -73,3 +73,12 @@ use_per_node_hctx=[0/1]: Default: 0
73 73
74use_lightnvm=[0/1]: Default: 0 74use_lightnvm=[0/1]: Default: 0
75 Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled. 75 Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
76
77no_sched=[0/1]: Default: 0
78 0: nullb* use default blk-mq io scheduler.
79 1: nullb* doesn't use io scheduler.
80
81shared_tags=[0/1]: Default: 0
82 0: Tag set is not shared.
83 1: Tag set shared between devices for blk-mq. Only makes sense with
84 nr_devices > 1, otherwise there's no tag set to share.
diff --git a/MAINTAINERS b/MAINTAINERS
index e372994747b7..ba3d8c197d92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2562,10 +2562,12 @@ S: Maintained
2562F: drivers/net/hamradio/baycom* 2562F: drivers/net/hamradio/baycom*
2563 2563
2564BCACHE (BLOCK LAYER CACHE) 2564BCACHE (BLOCK LAYER CACHE)
2565M: Michael Lyle <mlyle@lyle.org>
2565M: Kent Overstreet <kent.overstreet@gmail.com> 2566M: Kent Overstreet <kent.overstreet@gmail.com>
2566L: linux-bcache@vger.kernel.org 2567L: linux-bcache@vger.kernel.org
2567W: http://bcache.evilpiepirate.org 2568W: http://bcache.evilpiepirate.org
2568S: Orphan 2569C: irc://irc.oftc.net/bcache
2570S: Maintained
2569F: drivers/md/bcache/ 2571F: drivers/md/bcache/
2570 2572
2571BDISP ST MEDIA DRIVER 2573BDISP ST MEDIA DRIVER
@@ -12085,7 +12087,6 @@ F: drivers/mmc/host/sdhci-omap.c
12085SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER 12087SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
12086M: Scott Bauer <scott.bauer@intel.com> 12088M: Scott Bauer <scott.bauer@intel.com>
12087M: Jonathan Derrick <jonathan.derrick@intel.com> 12089M: Jonathan Derrick <jonathan.derrick@intel.com>
12088M: Rafael Antognolli <rafael.antognolli@intel.com>
12089L: linux-block@vger.kernel.org 12090L: linux-block@vger.kernel.org
12090S: Supported 12091S: Supported
12091F: block/sed* 12092F: block/sed*
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index c45b90bb9339..1b6418407467 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio)
110 sector_t sector = bio->bi_iter.bi_sector; 110 sector_t sector = bio->bi_iter.bi_sector;
111 111
112 bio_for_each_segment(bvec, bio, iter) { 112 bio_for_each_segment(bvec, bio, iter) {
113 char *buffer = __bio_kmap_atomic(bio, iter); 113 char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset;
114 unsigned len = bvec.bv_len >> SECTOR_SHIFT; 114 unsigned len = bvec.bv_len >> SECTOR_SHIFT;
115 115
116 simdisk_transfer(dev, sector, len, buffer, 116 simdisk_transfer(dev, sector, len, buffer,
117 bio_data_dir(bio) == WRITE); 117 bio_data_dir(bio) == WRITE);
118 sector += len; 118 sector += len;
119 __bio_kunmap_atomic(buffer); 119 kunmap_atomic(buffer);
120 } 120 }
121 121
122 bio_endio(bio); 122 bio_endio(bio);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index a4783da90ba8..889a8549d97f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -108,6 +108,7 @@
108#include "blk-mq-tag.h" 108#include "blk-mq-tag.h"
109#include "blk-mq-sched.h" 109#include "blk-mq-sched.h"
110#include "bfq-iosched.h" 110#include "bfq-iosched.h"
111#include "blk-wbt.h"
111 112
112#define BFQ_BFQQ_FNS(name) \ 113#define BFQ_BFQQ_FNS(name) \
113void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ 114void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -724,6 +725,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
724 } 725 }
725} 726}
726 727
728static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
729{
730 u64 dur;
731
732 if (bfqd->bfq_wr_max_time > 0)
733 return bfqd->bfq_wr_max_time;
734
735 dur = bfqd->RT_prod;
736 do_div(dur, bfqd->peak_rate);
737
738 /*
739 * Limit duration between 3 and 13 seconds. Tests show that
740 * higher values than 13 seconds often yield the opposite of
741 * the desired result, i.e., worsen responsiveness by letting
742 * non-interactive and non-soft-real-time applications
743 * preserve weight raising for a too long time interval.
744 *
745 * On the other end, lower values than 3 seconds make it
746 * difficult for most interactive tasks to complete their jobs
747 * before weight-raising finishes.
748 */
749 if (dur > msecs_to_jiffies(13000))
750 dur = msecs_to_jiffies(13000);
751 else if (dur < msecs_to_jiffies(3000))
752 dur = msecs_to_jiffies(3000);
753
754 return dur;
755}
756
757/* switch back from soft real-time to interactive weight raising */
758static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
759 struct bfq_data *bfqd)
760{
761 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
762 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
763 bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
764}
765
727static void 766static void
728bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, 767bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
729 struct bfq_io_cq *bic, bool bfq_already_existing) 768 struct bfq_io_cq *bic, bool bfq_already_existing)
@@ -750,10 +789,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
750 if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || 789 if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
751 time_is_before_jiffies(bfqq->last_wr_start_finish + 790 time_is_before_jiffies(bfqq->last_wr_start_finish +
752 bfqq->wr_cur_max_time))) { 791 bfqq->wr_cur_max_time))) {
753 bfq_log_bfqq(bfqq->bfqd, bfqq, 792 if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
754 "resume state: switching off wr"); 793 !bfq_bfqq_in_large_burst(bfqq) &&
755 794 time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
756 bfqq->wr_coeff = 1; 795 bfq_wr_duration(bfqd))) {
796 switch_back_to_interactive_wr(bfqq, bfqd);
797 } else {
798 bfqq->wr_coeff = 1;
799 bfq_log_bfqq(bfqq->bfqd, bfqq,
800 "resume state: switching off wr");
801 }
757 } 802 }
758 803
759 /* make sure weight will be updated, however we got here */ 804 /* make sure weight will be updated, however we got here */
@@ -1173,33 +1218,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1173 return wr_or_deserves_wr; 1218 return wr_or_deserves_wr;
1174} 1219}
1175 1220
1176static unsigned int bfq_wr_duration(struct bfq_data *bfqd) 1221/*
1222 * Return the farthest future time instant according to jiffies
1223 * macros.
1224 */
1225static unsigned long bfq_greatest_from_now(void)
1177{ 1226{
1178 u64 dur; 1227 return jiffies + MAX_JIFFY_OFFSET;
1179 1228}
1180 if (bfqd->bfq_wr_max_time > 0)
1181 return bfqd->bfq_wr_max_time;
1182
1183 dur = bfqd->RT_prod;
1184 do_div(dur, bfqd->peak_rate);
1185
1186 /*
1187 * Limit duration between 3 and 13 seconds. Tests show that
1188 * higher values than 13 seconds often yield the opposite of
1189 * the desired result, i.e., worsen responsiveness by letting
1190 * non-interactive and non-soft-real-time applications
1191 * preserve weight raising for a too long time interval.
1192 *
1193 * On the other end, lower values than 3 seconds make it
1194 * difficult for most interactive tasks to complete their jobs
1195 * before weight-raising finishes.
1196 */
1197 if (dur > msecs_to_jiffies(13000))
1198 dur = msecs_to_jiffies(13000);
1199 else if (dur < msecs_to_jiffies(3000))
1200 dur = msecs_to_jiffies(3000);
1201 1229
1202 return dur; 1230/*
1231 * Return the farthest past time instant according to jiffies
1232 * macros.
1233 */
1234static unsigned long bfq_smallest_from_now(void)
1235{
1236 return jiffies - MAX_JIFFY_OFFSET;
1203} 1237}
1204 1238
1205static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, 1239static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
@@ -1216,7 +1250,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
1216 bfqq->wr_coeff = bfqd->bfq_wr_coeff; 1250 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1217 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); 1251 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1218 } else { 1252 } else {
1219 bfqq->wr_start_at_switch_to_srt = jiffies; 1253 /*
1254 * No interactive weight raising in progress
1255 * here: assign minus infinity to
1256 * wr_start_at_switch_to_srt, to make sure
1257 * that, at the end of the soft-real-time
1258 * weight raising periods that is starting
1259 * now, no interactive weight-raising period
1260 * may be wrongly considered as still in
1261 * progress (and thus actually started by
1262 * mistake).
1263 */
1264 bfqq->wr_start_at_switch_to_srt =
1265 bfq_smallest_from_now();
1220 bfqq->wr_coeff = bfqd->bfq_wr_coeff * 1266 bfqq->wr_coeff = bfqd->bfq_wr_coeff *
1221 BFQ_SOFTRT_WEIGHT_FACTOR; 1267 BFQ_SOFTRT_WEIGHT_FACTOR;
1222 bfqq->wr_cur_max_time = 1268 bfqq->wr_cur_max_time =
@@ -2016,10 +2062,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
2016 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); 2062 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
2017 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); 2063 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
2018 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); 2064 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
2019 bic->saved_wr_coeff = bfqq->wr_coeff; 2065 if (unlikely(bfq_bfqq_just_created(bfqq) &&
2020 bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; 2066 !bfq_bfqq_in_large_burst(bfqq))) {
2021 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; 2067 /*
2022 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; 2068 * bfqq being merged right after being created: bfqq
2069 * would have deserved interactive weight raising, but
2070 * did not make it to be set in a weight-raised state,
2071 * because of this early merge. Store directly the
2072 * weight-raising state that would have been assigned
2073 * to bfqq, so that to avoid that bfqq unjustly fails
2074 * to enjoy weight raising if split soon.
2075 */
2076 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2077 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
2078 bic->saved_last_wr_start_finish = jiffies;
2079 } else {
2080 bic->saved_wr_coeff = bfqq->wr_coeff;
2081 bic->saved_wr_start_at_switch_to_srt =
2082 bfqq->wr_start_at_switch_to_srt;
2083 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
2084 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
2085 }
2023} 2086}
2024 2087
2025static void 2088static void
@@ -2897,24 +2960,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2897 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); 2960 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
2898} 2961}
2899 2962
2900/*
2901 * Return the farthest future time instant according to jiffies
2902 * macros.
2903 */
2904static unsigned long bfq_greatest_from_now(void)
2905{
2906 return jiffies + MAX_JIFFY_OFFSET;
2907}
2908
2909/*
2910 * Return the farthest past time instant according to jiffies
2911 * macros.
2912 */
2913static unsigned long bfq_smallest_from_now(void)
2914{
2915 return jiffies - MAX_JIFFY_OFFSET;
2916}
2917
2918/** 2963/**
2919 * bfq_bfqq_expire - expire a queue. 2964 * bfq_bfqq_expire - expire a queue.
2920 * @bfqd: device owning the queue. 2965 * @bfqd: device owning the queue.
@@ -3489,11 +3534,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3489 bfq_wr_duration(bfqd))) 3534 bfq_wr_duration(bfqd)))
3490 bfq_bfqq_end_wr(bfqq); 3535 bfq_bfqq_end_wr(bfqq);
3491 else { 3536 else {
3492 /* switch back to interactive wr */ 3537 switch_back_to_interactive_wr(bfqq, bfqd);
3493 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
3494 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
3495 bfqq->last_wr_start_finish =
3496 bfqq->wr_start_at_switch_to_srt;
3497 bfqq->entity.prio_changed = 1; 3538 bfqq->entity.prio_changed = 1;
3498 } 3539 }
3499 } 3540 }
@@ -3685,16 +3726,37 @@ void bfq_put_queue(struct bfq_queue *bfqq)
3685 if (bfqq->ref) 3726 if (bfqq->ref)
3686 return; 3727 return;
3687 3728
3688 if (bfq_bfqq_sync(bfqq)) 3729 if (!hlist_unhashed(&bfqq->burst_list_node)) {
3730 hlist_del_init(&bfqq->burst_list_node);
3689 /* 3731 /*
3690 * The fact that this queue is being destroyed does not 3732 * Decrement also burst size after the removal, if the
3691 * invalidate the fact that this queue may have been 3733 * process associated with bfqq is exiting, and thus
3692 * activated during the current burst. As a consequence, 3734 * does not contribute to the burst any longer. This
3693 * although the queue does not exist anymore, and hence 3735 * decrement helps filter out false positives of large
3694 * needs to be removed from the burst list if there, 3736 * bursts, when some short-lived process (often due to
3695 * the burst size has not to be decremented. 3737 * the execution of commands by some service) happens
3738 * to start and exit while a complex application is
3739 * starting, and thus spawning several processes that
3740 * do I/O (and that *must not* be treated as a large
3741 * burst, see comments on bfq_handle_burst).
3742 *
3743 * In particular, the decrement is performed only if:
3744 * 1) bfqq is not a merged queue, because, if it is,
3745 * then this free of bfqq is not triggered by the exit
3746 * of the process bfqq is associated with, but exactly
3747 * by the fact that bfqq has just been merged.
3748 * 2) burst_size is greater than 0, to handle
3749 * unbalanced decrements. Unbalanced decrements may
3750 * happen in te following case: bfqq is inserted into
3751 * the current burst list--without incrementing
3752 * bust_size--because of a split, but the current
3753 * burst list is not the burst list bfqq belonged to
3754 * (see comments on the case of a split in
3755 * bfq_set_request).
3696 */ 3756 */
3697 hlist_del_init(&bfqq->burst_list_node); 3757 if (bfqq->bic && bfqq->bfqd->burst_size > 0)
3758 bfqq->bfqd->burst_size--;
3759 }
3698 3760
3699 kmem_cache_free(bfq_pool, bfqq); 3761 kmem_cache_free(bfq_pool, bfqq);
3700#ifdef CONFIG_BFQ_GROUP_IOSCHED 3762#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -4127,7 +4189,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4127 new_bfqq->allocated++; 4189 new_bfqq->allocated++;
4128 bfqq->allocated--; 4190 bfqq->allocated--;
4129 new_bfqq->ref++; 4191 new_bfqq->ref++;
4130 bfq_clear_bfqq_just_created(bfqq);
4131 /* 4192 /*
4132 * If the bic associated with the process 4193 * If the bic associated with the process
4133 * issuing this request still points to bfqq 4194 * issuing this request still points to bfqq
@@ -4139,6 +4200,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4139 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) 4200 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
4140 bfq_merge_bfqqs(bfqd, RQ_BIC(rq), 4201 bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
4141 bfqq, new_bfqq); 4202 bfqq, new_bfqq);
4203
4204 bfq_clear_bfqq_just_created(bfqq);
4142 /* 4205 /*
4143 * rq is about to be enqueued into new_bfqq, 4206 * rq is about to be enqueued into new_bfqq,
4144 * release rq reference on bfqq 4207 * release rq reference on bfqq
@@ -4424,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
4424 else { 4487 else {
4425 bfq_clear_bfqq_in_large_burst(bfqq); 4488 bfq_clear_bfqq_in_large_burst(bfqq);
4426 if (bic->was_in_burst_list) 4489 if (bic->was_in_burst_list)
4490 /*
4491 * If bfqq was in the current
4492 * burst list before being
4493 * merged, then we have to add
4494 * it back. And we do not need
4495 * to increase burst_size, as
4496 * we did not decrement
4497 * burst_size when we removed
4498 * bfqq from the burst list as
4499 * a consequence of a merge
4500 * (see comments in
4501 * bfq_put_queue). In this
4502 * respect, it would be rather
4503 * costly to know whether the
4504 * current burst list is still
4505 * the same burst list from
4506 * which bfqq was removed on
4507 * the merge. To avoid this
4508 * cost, if bfqq was in a
4509 * burst list, then we add
4510 * bfqq to the current burst
4511 * list without any further
4512 * check. This can cause
4513 * inappropriate insertions,
4514 * but rarely enough to not
4515 * harm the detection of large
4516 * bursts significantly.
4517 */
4427 hlist_add_head(&bfqq->burst_list_node, 4518 hlist_add_head(&bfqq->burst_list_node,
4428 &bfqd->burst_list); 4519 &bfqd->burst_list);
4429 } 4520 }
@@ -4775,7 +4866,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4775 bfq_init_root_group(bfqd->root_group, bfqd); 4866 bfq_init_root_group(bfqd->root_group, bfqd);
4776 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); 4867 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
4777 4868
4778 4869 wbt_disable_default(q);
4779 return 0; 4870 return 0;
4780 4871
4781out_free: 4872out_free:
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5df32907ff3b..23b42e8aa03e 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
485 485
486void bioset_integrity_free(struct bio_set *bs) 486void bioset_integrity_free(struct bio_set *bs)
487{ 487{
488 if (bs->bio_integrity_pool) 488 mempool_destroy(bs->bio_integrity_pool);
489 mempool_destroy(bs->bio_integrity_pool); 489 mempool_destroy(bs->bvec_integrity_pool);
490
491 if (bs->bvec_integrity_pool)
492 mempool_destroy(bs->bvec_integrity_pool);
493} 490}
494EXPORT_SYMBOL(bioset_integrity_free); 491EXPORT_SYMBOL(bioset_integrity_free);
495 492
diff --git a/block/bio.c b/block/bio.c
index cc60213e56d8..b94a802f8ba3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
400 400
401/** 401/**
402 * bio_alloc_bioset - allocate a bio for I/O 402 * bio_alloc_bioset - allocate a bio for I/O
403 * @gfp_mask: the GFP_ mask given to the slab allocator 403 * @gfp_mask: the GFP_* mask given to the slab allocator
404 * @nr_iovecs: number of iovecs to pre-allocate 404 * @nr_iovecs: number of iovecs to pre-allocate
405 * @bs: the bio_set to allocate from. 405 * @bs: the bio_set to allocate from.
406 * 406 *
@@ -1931,11 +1931,8 @@ void bioset_free(struct bio_set *bs)
1931 if (bs->rescue_workqueue) 1931 if (bs->rescue_workqueue)
1932 destroy_workqueue(bs->rescue_workqueue); 1932 destroy_workqueue(bs->rescue_workqueue);
1933 1933
1934 if (bs->bio_pool) 1934 mempool_destroy(bs->bio_pool);
1935 mempool_destroy(bs->bio_pool); 1935 mempool_destroy(bs->bvec_pool);
1936
1937 if (bs->bvec_pool)
1938 mempool_destroy(bs->bvec_pool);
1939 1936
1940 bioset_integrity_free(bs); 1937 bioset_integrity_free(bs);
1941 bio_put_slab(bs); 1938 bio_put_slab(bs);
@@ -2036,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
2036EXPORT_SYMBOL_GPL(bio_associate_blkcg); 2033EXPORT_SYMBOL_GPL(bio_associate_blkcg);
2037 2034
2038/** 2035/**
2039 * bio_associate_current - associate a bio with %current
2040 * @bio: target bio
2041 *
2042 * Associate @bio with %current if it hasn't been associated yet. Block
2043 * layer will treat @bio as if it were issued by %current no matter which
2044 * task actually issues it.
2045 *
2046 * This function takes an extra reference of @task's io_context and blkcg
2047 * which will be put when @bio is released. The caller must own @bio,
2048 * ensure %current->io_context exists, and is responsible for synchronizing
2049 * calls to this function.
2050 */
2051int bio_associate_current(struct bio *bio)
2052{
2053 struct io_context *ioc;
2054
2055 if (bio->bi_css)
2056 return -EBUSY;
2057
2058 ioc = current->io_context;
2059 if (!ioc)
2060 return -ENOENT;
2061
2062 get_io_context_active(ioc);
2063 bio->bi_ioc = ioc;
2064 bio->bi_css = task_get_css(current, io_cgrp_id);
2065 return 0;
2066}
2067EXPORT_SYMBOL_GPL(bio_associate_current);
2068
2069/**
2070 * bio_disassociate_task - undo bio_associate_current() 2036 * bio_disassociate_task - undo bio_associate_current()
2071 * @bio: target bio 2037 * @bio: target bio
2072 */ 2038 */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56baee936..4117524ca45b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
1419 if (i >= BLKCG_MAX_POLS) 1419 if (i >= BLKCG_MAX_POLS)
1420 goto err_unlock; 1420 goto err_unlock;
1421 1421
1422 /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
1423 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1424 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1425 goto err_unlock;
1426
1422 /* register @pol */ 1427 /* register @pol */
1423 pol->plid = i; 1428 pol->plid = i;
1424 blkcg_policy[pol->plid] = pol; 1429 blkcg_policy[pol->plid] = pol;
@@ -1452,7 +1457,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
1452 return 0; 1457 return 0;
1453 1458
1454err_free_cpds: 1459err_free_cpds:
1455 if (pol->cpd_alloc_fn) { 1460 if (pol->cpd_free_fn) {
1456 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1461 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1457 if (blkcg->cpd[pol->plid]) { 1462 if (blkcg->cpd[pol->plid]) {
1458 pol->cpd_free_fn(blkcg->cpd[pol->plid]); 1463 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1497,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
1492 /* remove cpds and unregister */ 1497 /* remove cpds and unregister */
1493 mutex_lock(&blkcg_pol_mutex); 1498 mutex_lock(&blkcg_pol_mutex);
1494 1499
1495 if (pol->cpd_alloc_fn) { 1500 if (pol->cpd_free_fn) {
1496 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1501 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1497 if (blkcg->cpd[pol->plid]) { 1502 if (blkcg->cpd[pol->plid]) {
1498 pol->cpd_free_fn(blkcg->cpd[pol->plid]); 1503 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
diff --git a/block/blk-core.c b/block/blk-core.c
index 048be4aa6024..7c54c195e79e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -333,11 +333,13 @@ EXPORT_SYMBOL(blk_stop_queue);
333void blk_sync_queue(struct request_queue *q) 333void blk_sync_queue(struct request_queue *q)
334{ 334{
335 del_timer_sync(&q->timeout); 335 del_timer_sync(&q->timeout);
336 cancel_work_sync(&q->timeout_work);
336 337
337 if (q->mq_ops) { 338 if (q->mq_ops) {
338 struct blk_mq_hw_ctx *hctx; 339 struct blk_mq_hw_ctx *hctx;
339 int i; 340 int i;
340 341
342 cancel_delayed_work_sync(&q->requeue_work);
341 queue_for_each_hw_ctx(q, hctx, i) 343 queue_for_each_hw_ctx(q, hctx, i)
342 cancel_delayed_work_sync(&hctx->run_work); 344 cancel_delayed_work_sync(&hctx->run_work);
343 } else { 345 } else {
@@ -347,6 +349,37 @@ void blk_sync_queue(struct request_queue *q)
347EXPORT_SYMBOL(blk_sync_queue); 349EXPORT_SYMBOL(blk_sync_queue);
348 350
349/** 351/**
352 * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
353 * @q: request queue pointer
354 *
355 * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
356 * set and 1 if the flag was already set.
357 */
358int blk_set_preempt_only(struct request_queue *q)
359{
360 unsigned long flags;
361 int res;
362
363 spin_lock_irqsave(q->queue_lock, flags);
364 res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
365 spin_unlock_irqrestore(q->queue_lock, flags);
366
367 return res;
368}
369EXPORT_SYMBOL_GPL(blk_set_preempt_only);
370
371void blk_clear_preempt_only(struct request_queue *q)
372{
373 unsigned long flags;
374
375 spin_lock_irqsave(q->queue_lock, flags);
376 queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
377 wake_up_all(&q->mq_freeze_wq);
378 spin_unlock_irqrestore(q->queue_lock, flags);
379}
380EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
381
382/**
350 * __blk_run_queue_uncond - run a queue whether or not it has been stopped 383 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
351 * @q: The queue to run 384 * @q: The queue to run
352 * 385 *
@@ -610,6 +643,9 @@ void blk_set_queue_dying(struct request_queue *q)
610 } 643 }
611 spin_unlock_irq(q->queue_lock); 644 spin_unlock_irq(q->queue_lock);
612 } 645 }
646
647 /* Make blk_queue_enter() reexamine the DYING flag. */
648 wake_up_all(&q->mq_freeze_wq);
613} 649}
614EXPORT_SYMBOL_GPL(blk_set_queue_dying); 650EXPORT_SYMBOL_GPL(blk_set_queue_dying);
615 651
@@ -718,7 +754,7 @@ static void free_request_size(void *element, void *data)
718int blk_init_rl(struct request_list *rl, struct request_queue *q, 754int blk_init_rl(struct request_list *rl, struct request_queue *q,
719 gfp_t gfp_mask) 755 gfp_t gfp_mask)
720{ 756{
721 if (unlikely(rl->rq_pool)) 757 if (unlikely(rl->rq_pool) || q->mq_ops)
722 return 0; 758 return 0;
723 759
724 rl->q = q; 760 rl->q = q;
@@ -760,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
760} 796}
761EXPORT_SYMBOL(blk_alloc_queue); 797EXPORT_SYMBOL(blk_alloc_queue);
762 798
763int blk_queue_enter(struct request_queue *q, bool nowait) 799/**
800 * blk_queue_enter() - try to increase q->q_usage_counter
801 * @q: request queue pointer
802 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
803 */
804int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
764{ 805{
806 const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
807
765 while (true) { 808 while (true) {
809 bool success = false;
766 int ret; 810 int ret;
767 811
768 if (percpu_ref_tryget_live(&q->q_usage_counter)) 812 rcu_read_lock_sched();
813 if (percpu_ref_tryget_live(&q->q_usage_counter)) {
814 /*
815 * The code that sets the PREEMPT_ONLY flag is
816 * responsible for ensuring that that flag is globally
817 * visible before the queue is unfrozen.
818 */
819 if (preempt || !blk_queue_preempt_only(q)) {
820 success = true;
821 } else {
822 percpu_ref_put(&q->q_usage_counter);
823 }
824 }
825 rcu_read_unlock_sched();
826
827 if (success)
769 return 0; 828 return 0;
770 829
771 if (nowait) 830 if (flags & BLK_MQ_REQ_NOWAIT)
772 return -EBUSY; 831 return -EBUSY;
773 832
774 /* 833 /*
@@ -781,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
781 smp_rmb(); 840 smp_rmb();
782 841
783 ret = wait_event_interruptible(q->mq_freeze_wq, 842 ret = wait_event_interruptible(q->mq_freeze_wq,
784 !atomic_read(&q->mq_freeze_depth) || 843 (atomic_read(&q->mq_freeze_depth) == 0 &&
844 (preempt || !blk_queue_preempt_only(q))) ||
785 blk_queue_dying(q)); 845 blk_queue_dying(q));
786 if (blk_queue_dying(q)) 846 if (blk_queue_dying(q))
787 return -ENODEV; 847 return -ENODEV;
@@ -844,6 +904,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
844 setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, 904 setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
845 laptop_mode_timer_fn, (unsigned long) q); 905 laptop_mode_timer_fn, (unsigned long) q);
846 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 906 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
907 INIT_WORK(&q->timeout_work, NULL);
847 INIT_LIST_HEAD(&q->queue_head); 908 INIT_LIST_HEAD(&q->queue_head);
848 INIT_LIST_HEAD(&q->timeout_list); 909 INIT_LIST_HEAD(&q->timeout_list);
849 INIT_LIST_HEAD(&q->icq_list); 910 INIT_LIST_HEAD(&q->icq_list);
@@ -1154,7 +1215,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1154 * @rl: request list to allocate from 1215 * @rl: request list to allocate from
1155 * @op: operation and flags 1216 * @op: operation and flags
1156 * @bio: bio to allocate request for (can be %NULL) 1217 * @bio: bio to allocate request for (can be %NULL)
1157 * @gfp_mask: allocation mask 1218 * @flags: BLQ_MQ_REQ_* flags
1158 * 1219 *
1159 * Get a free request from @q. This function may fail under memory 1220 * Get a free request from @q. This function may fail under memory
1160 * pressure or if @q is dead. 1221 * pressure or if @q is dead.
@@ -1164,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1164 * Returns request pointer on success, with @q->queue_lock *not held*. 1225 * Returns request pointer on success, with @q->queue_lock *not held*.
1165 */ 1226 */
1166static struct request *__get_request(struct request_list *rl, unsigned int op, 1227static struct request *__get_request(struct request_list *rl, unsigned int op,
1167 struct bio *bio, gfp_t gfp_mask) 1228 struct bio *bio, blk_mq_req_flags_t flags)
1168{ 1229{
1169 struct request_queue *q = rl->q; 1230 struct request_queue *q = rl->q;
1170 struct request *rq; 1231 struct request *rq;
@@ -1173,6 +1234,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
1173 struct io_cq *icq = NULL; 1234 struct io_cq *icq = NULL;
1174 const bool is_sync = op_is_sync(op); 1235 const bool is_sync = op_is_sync(op);
1175 int may_queue; 1236 int may_queue;
1237 gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
1238 __GFP_DIRECT_RECLAIM;
1176 req_flags_t rq_flags = RQF_ALLOCED; 1239 req_flags_t rq_flags = RQF_ALLOCED;
1177 1240
1178 lockdep_assert_held(q->queue_lock); 1241 lockdep_assert_held(q->queue_lock);
@@ -1255,6 +1318,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
1255 blk_rq_set_rl(rq, rl); 1318 blk_rq_set_rl(rq, rl);
1256 rq->cmd_flags = op; 1319 rq->cmd_flags = op;
1257 rq->rq_flags = rq_flags; 1320 rq->rq_flags = rq_flags;
1321 if (flags & BLK_MQ_REQ_PREEMPT)
1322 rq->rq_flags |= RQF_PREEMPT;
1258 1323
1259 /* init elvpriv */ 1324 /* init elvpriv */
1260 if (rq_flags & RQF_ELVPRIV) { 1325 if (rq_flags & RQF_ELVPRIV) {
@@ -1333,7 +1398,7 @@ rq_starved:
1333 * @q: request_queue to allocate request from 1398 * @q: request_queue to allocate request from
1334 * @op: operation and flags 1399 * @op: operation and flags
1335 * @bio: bio to allocate request for (can be %NULL) 1400 * @bio: bio to allocate request for (can be %NULL)
1336 * @gfp_mask: allocation mask 1401 * @flags: BLK_MQ_REQ_* flags.
1337 * 1402 *
1338 * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, 1403 * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
1339 * this function keeps retrying under memory pressure and fails iff @q is dead. 1404 * this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1343,7 +1408,7 @@ rq_starved:
1343 * Returns request pointer on success, with @q->queue_lock *not held*. 1408 * Returns request pointer on success, with @q->queue_lock *not held*.
1344 */ 1409 */
1345static struct request *get_request(struct request_queue *q, unsigned int op, 1410static struct request *get_request(struct request_queue *q, unsigned int op,
1346 struct bio *bio, gfp_t gfp_mask) 1411 struct bio *bio, blk_mq_req_flags_t flags)
1347{ 1412{
1348 const bool is_sync = op_is_sync(op); 1413 const bool is_sync = op_is_sync(op);
1349 DEFINE_WAIT(wait); 1414 DEFINE_WAIT(wait);
@@ -1355,7 +1420,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
1355 1420
1356 rl = blk_get_rl(q, bio); /* transferred to @rq on success */ 1421 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1357retry: 1422retry:
1358 rq = __get_request(rl, op, bio, gfp_mask); 1423 rq = __get_request(rl, op, bio, flags);
1359 if (!IS_ERR(rq)) 1424 if (!IS_ERR(rq))
1360 return rq; 1425 return rq;
1361 1426
@@ -1364,7 +1429,7 @@ retry:
1364 return ERR_PTR(-EAGAIN); 1429 return ERR_PTR(-EAGAIN);
1365 } 1430 }
1366 1431
1367 if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { 1432 if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1368 blk_put_rl(rl); 1433 blk_put_rl(rl);
1369 return rq; 1434 return rq;
1370 } 1435 }
@@ -1391,20 +1456,28 @@ retry:
1391 goto retry; 1456 goto retry;
1392} 1457}
1393 1458
1459/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1394static struct request *blk_old_get_request(struct request_queue *q, 1460static struct request *blk_old_get_request(struct request_queue *q,
1395 unsigned int op, gfp_t gfp_mask) 1461 unsigned int op, blk_mq_req_flags_t flags)
1396{ 1462{
1397 struct request *rq; 1463 struct request *rq;
1464 gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
1465 __GFP_DIRECT_RECLAIM;
1466 int ret = 0;
1398 1467
1399 WARN_ON_ONCE(q->mq_ops); 1468 WARN_ON_ONCE(q->mq_ops);
1400 1469
1401 /* create ioc upfront */ 1470 /* create ioc upfront */
1402 create_io_context(gfp_mask, q->node); 1471 create_io_context(gfp_mask, q->node);
1403 1472
1473 ret = blk_queue_enter(q, flags);
1474 if (ret)
1475 return ERR_PTR(ret);
1404 spin_lock_irq(q->queue_lock); 1476 spin_lock_irq(q->queue_lock);
1405 rq = get_request(q, op, NULL, gfp_mask); 1477 rq = get_request(q, op, NULL, flags);
1406 if (IS_ERR(rq)) { 1478 if (IS_ERR(rq)) {
1407 spin_unlock_irq(q->queue_lock); 1479 spin_unlock_irq(q->queue_lock);
1480 blk_queue_exit(q);
1408 return rq; 1481 return rq;
1409 } 1482 }
1410 1483
@@ -1415,25 +1488,40 @@ static struct request *blk_old_get_request(struct request_queue *q,
1415 return rq; 1488 return rq;
1416} 1489}
1417 1490
1418struct request *blk_get_request(struct request_queue *q, unsigned int op, 1491/**
1419 gfp_t gfp_mask) 1492 * blk_get_request_flags - allocate a request
1493 * @q: request queue to allocate a request for
1494 * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
1495 * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
1496 */
1497struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
1498 blk_mq_req_flags_t flags)
1420{ 1499{
1421 struct request *req; 1500 struct request *req;
1422 1501
1502 WARN_ON_ONCE(op & REQ_NOWAIT);
1503 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
1504
1423 if (q->mq_ops) { 1505 if (q->mq_ops) {
1424 req = blk_mq_alloc_request(q, op, 1506 req = blk_mq_alloc_request(q, op, flags);
1425 (gfp_mask & __GFP_DIRECT_RECLAIM) ?
1426 0 : BLK_MQ_REQ_NOWAIT);
1427 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) 1507 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
1428 q->mq_ops->initialize_rq_fn(req); 1508 q->mq_ops->initialize_rq_fn(req);
1429 } else { 1509 } else {
1430 req = blk_old_get_request(q, op, gfp_mask); 1510 req = blk_old_get_request(q, op, flags);
1431 if (!IS_ERR(req) && q->initialize_rq_fn) 1511 if (!IS_ERR(req) && q->initialize_rq_fn)
1432 q->initialize_rq_fn(req); 1512 q->initialize_rq_fn(req);
1433 } 1513 }
1434 1514
1435 return req; 1515 return req;
1436} 1516}
1517EXPORT_SYMBOL(blk_get_request_flags);
1518
1519struct request *blk_get_request(struct request_queue *q, unsigned int op,
1520 gfp_t gfp_mask)
1521{
1522 return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
1523 0 : BLK_MQ_REQ_NOWAIT);
1524}
1437EXPORT_SYMBOL(blk_get_request); 1525EXPORT_SYMBOL(blk_get_request);
1438 1526
1439/** 1527/**
@@ -1576,6 +1664,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1576 blk_free_request(rl, req); 1664 blk_free_request(rl, req);
1577 freed_request(rl, sync, rq_flags); 1665 freed_request(rl, sync, rq_flags);
1578 blk_put_rl(rl); 1666 blk_put_rl(rl);
1667 blk_queue_exit(q);
1579 } 1668 }
1580} 1669}
1581EXPORT_SYMBOL_GPL(__blk_put_request); 1670EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1857,8 +1946,10 @@ get_rq:
1857 * Grab a free request. This is might sleep but can not fail. 1946 * Grab a free request. This is might sleep but can not fail.
1858 * Returns with the queue unlocked. 1947 * Returns with the queue unlocked.
1859 */ 1948 */
1860 req = get_request(q, bio->bi_opf, bio, GFP_NOIO); 1949 blk_queue_enter_live(q);
1950 req = get_request(q, bio->bi_opf, bio, 0);
1861 if (IS_ERR(req)) { 1951 if (IS_ERR(req)) {
1952 blk_queue_exit(q);
1862 __wbt_done(q->rq_wb, wb_acct); 1953 __wbt_done(q->rq_wb, wb_acct);
1863 if (PTR_ERR(req) == -ENOMEM) 1954 if (PTR_ERR(req) == -ENOMEM)
1864 bio->bi_status = BLK_STS_RESOURCE; 1955 bio->bi_status = BLK_STS_RESOURCE;
@@ -2200,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
2200 current->bio_list = bio_list_on_stack; 2291 current->bio_list = bio_list_on_stack;
2201 do { 2292 do {
2202 struct request_queue *q = bio->bi_disk->queue; 2293 struct request_queue *q = bio->bi_disk->queue;
2294 blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
2295 BLK_MQ_REQ_NOWAIT : 0;
2203 2296
2204 if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { 2297 if (likely(blk_queue_enter(q, flags) == 0)) {
2205 struct bio_list lower, same; 2298 struct bio_list lower, same;
2206 2299
2207 /* Create a fresh bio_list for all subordinate requests */ 2300 /* Create a fresh bio_list for all subordinate requests */
@@ -2242,6 +2335,40 @@ out:
2242EXPORT_SYMBOL(generic_make_request); 2335EXPORT_SYMBOL(generic_make_request);
2243 2336
2244/** 2337/**
2338 * direct_make_request - hand a buffer directly to its device driver for I/O
2339 * @bio: The bio describing the location in memory and on the device.
2340 *
2341 * This function behaves like generic_make_request(), but does not protect
2342 * against recursion. Must only be used if the called driver is known
2343 * to not call generic_make_request (or direct_make_request) again from
2344 * its make_request function. (Calling direct_make_request again from
2345 * a workqueue is perfectly fine as that doesn't recurse).
2346 */
2347blk_qc_t direct_make_request(struct bio *bio)
2348{
2349 struct request_queue *q = bio->bi_disk->queue;
2350 bool nowait = bio->bi_opf & REQ_NOWAIT;
2351 blk_qc_t ret;
2352
2353 if (!generic_make_request_checks(bio))
2354 return BLK_QC_T_NONE;
2355
2356 if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
2357 if (nowait && !blk_queue_dying(q))
2358 bio->bi_status = BLK_STS_AGAIN;
2359 else
2360 bio->bi_status = BLK_STS_IOERR;
2361 bio_endio(bio);
2362 return BLK_QC_T_NONE;
2363 }
2364
2365 ret = q->make_request_fn(q, bio);
2366 blk_queue_exit(q);
2367 return ret;
2368}
2369EXPORT_SYMBOL_GPL(direct_make_request);
2370
2371/**
2245 * submit_bio - submit a bio to the block device layer for I/O 2372 * submit_bio - submit a bio to the block device layer for I/O
2246 * @bio: The &struct bio which describes the I/O 2373 * @bio: The &struct bio which describes the I/O
2247 * 2374 *
@@ -2285,6 +2412,17 @@ blk_qc_t submit_bio(struct bio *bio)
2285} 2412}
2286EXPORT_SYMBOL(submit_bio); 2413EXPORT_SYMBOL(submit_bio);
2287 2414
2415bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2416{
2417 if (!q->poll_fn || !blk_qc_t_valid(cookie))
2418 return false;
2419
2420 if (current->plug)
2421 blk_flush_plug_list(current->plug, false);
2422 return q->poll_fn(q, cookie);
2423}
2424EXPORT_SYMBOL_GPL(blk_poll);
2425
2288/** 2426/**
2289 * blk_cloned_rq_check_limits - Helper function to check a cloned request 2427 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2290 * for new the queue limits 2428 * for new the queue limits
@@ -2350,7 +2488,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
2350 * bypass a potential scheduler on the bottom device for 2488 * bypass a potential scheduler on the bottom device for
2351 * insert. 2489 * insert.
2352 */ 2490 */
2353 blk_mq_request_bypass_insert(rq); 2491 blk_mq_request_bypass_insert(rq, true);
2354 return BLK_STS_OK; 2492 return BLK_STS_OK;
2355 } 2493 }
2356 2494
@@ -2464,20 +2602,22 @@ void blk_account_io_done(struct request *req)
2464 * Don't process normal requests when queue is suspended 2602 * Don't process normal requests when queue is suspended
2465 * or in the process of suspending/resuming 2603 * or in the process of suspending/resuming
2466 */ 2604 */
2467static struct request *blk_pm_peek_request(struct request_queue *q, 2605static bool blk_pm_allow_request(struct request *rq)
2468 struct request *rq)
2469{ 2606{
2470 if (q->dev && (q->rpm_status == RPM_SUSPENDED || 2607 switch (rq->q->rpm_status) {
2471 (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) 2608 case RPM_RESUMING:
2472 return NULL; 2609 case RPM_SUSPENDING:
2473 else 2610 return rq->rq_flags & RQF_PM;
2474 return rq; 2611 case RPM_SUSPENDED:
2612 return false;
2613 }
2614
2615 return true;
2475} 2616}
2476#else 2617#else
2477static inline struct request *blk_pm_peek_request(struct request_queue *q, 2618static bool blk_pm_allow_request(struct request *rq)
2478 struct request *rq)
2479{ 2619{
2480 return rq; 2620 return true;
2481} 2621}
2482#endif 2622#endif
2483 2623
@@ -2517,6 +2657,48 @@ void blk_account_io_start(struct request *rq, bool new_io)
2517 part_stat_unlock(); 2657 part_stat_unlock();
2518} 2658}
2519 2659
2660static struct request *elv_next_request(struct request_queue *q)
2661{
2662 struct request *rq;
2663 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
2664
2665 WARN_ON_ONCE(q->mq_ops);
2666
2667 while (1) {
2668 list_for_each_entry(rq, &q->queue_head, queuelist) {
2669 if (blk_pm_allow_request(rq))
2670 return rq;
2671
2672 if (rq->rq_flags & RQF_SOFTBARRIER)
2673 break;
2674 }
2675
2676 /*
2677 * Flush request is running and flush request isn't queueable
2678 * in the drive, we can hold the queue till flush request is
2679 * finished. Even we don't do this, driver can't dispatch next
2680 * requests and will requeue them. And this can improve
2681 * throughput too. For example, we have request flush1, write1,
2682 * flush 2. flush1 is dispatched, then queue is hold, write1
2683 * isn't inserted to queue. After flush1 is finished, flush2
2684 * will be dispatched. Since disk cache is already clean,
2685 * flush2 will be finished very soon, so looks like flush2 is
2686 * folded to flush1.
2687 * Since the queue is hold, a flag is set to indicate the queue
2688 * should be restarted later. Please see flush_end_io() for
2689 * details.
2690 */
2691 if (fq->flush_pending_idx != fq->flush_running_idx &&
2692 !queue_flush_queueable(q)) {
2693 fq->flush_queue_delayed = 1;
2694 return NULL;
2695 }
2696 if (unlikely(blk_queue_bypass(q)) ||
2697 !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2698 return NULL;
2699 }
2700}
2701
2520/** 2702/**
2521 * blk_peek_request - peek at the top of a request queue 2703 * blk_peek_request - peek at the top of a request queue
2522 * @q: request queue to peek at 2704 * @q: request queue to peek at
@@ -2538,12 +2720,7 @@ struct request *blk_peek_request(struct request_queue *q)
2538 lockdep_assert_held(q->queue_lock); 2720 lockdep_assert_held(q->queue_lock);
2539 WARN_ON_ONCE(q->mq_ops); 2721 WARN_ON_ONCE(q->mq_ops);
2540 2722
2541 while ((rq = __elv_next_request(q)) != NULL) { 2723 while ((rq = elv_next_request(q)) != NULL) {
2542
2543 rq = blk_pm_peek_request(q, rq);
2544 if (!rq)
2545 break;
2546
2547 if (!(rq->rq_flags & RQF_STARTED)) { 2724 if (!(rq->rq_flags & RQF_STARTED)) {
2548 /* 2725 /*
2549 * This is the first time the device driver 2726 * This is the first time the device driver
@@ -2695,6 +2872,27 @@ struct request *blk_fetch_request(struct request_queue *q)
2695} 2872}
2696EXPORT_SYMBOL(blk_fetch_request); 2873EXPORT_SYMBOL(blk_fetch_request);
2697 2874
2875/*
2876 * Steal bios from a request and add them to a bio list.
2877 * The request must not have been partially completed before.
2878 */
2879void blk_steal_bios(struct bio_list *list, struct request *rq)
2880{
2881 if (rq->bio) {
2882 if (list->tail)
2883 list->tail->bi_next = rq->bio;
2884 else
2885 list->head = rq->bio;
2886 list->tail = rq->biotail;
2887
2888 rq->bio = NULL;
2889 rq->biotail = NULL;
2890 }
2891
2892 rq->__data_len = 0;
2893}
2894EXPORT_SYMBOL_GPL(blk_steal_bios);
2895
2698/** 2896/**
2699 * blk_update_request - Special helper function for request stacking drivers 2897 * blk_update_request - Special helper function for request stacking drivers
2700 * @req: the request being processed 2898 * @req: the request being processed
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 4938bec8cfef..f17170675917 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
231 /* release the tag's ownership to the req cloned from */ 231 /* release the tag's ownership to the req cloned from */
232 spin_lock_irqsave(&fq->mq_flush_lock, flags); 232 spin_lock_irqsave(&fq->mq_flush_lock, flags);
233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); 233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
234 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); 234 if (!q->elevator) {
235 flush_rq->tag = -1; 235 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
236 flush_rq->tag = -1;
237 } else {
238 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
239 flush_rq->internal_tag = -1;
240 }
236 } 241 }
237 242
238 running = &fq->flush_queue[fq->flush_running_idx]; 243 running = &fq->flush_queue[fq->flush_running_idx];
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
318 blk_rq_init(q, flush_rq); 323 blk_rq_init(q, flush_rq);
319 324
320 /* 325 /*
321 * Borrow tag from the first request since they can't 326 * In case of none scheduler, borrow tag from the first request
322 * be in flight at the same time. And acquire the tag's 327 * since they can't be in flight at the same time. And acquire
323 * ownership for flush req. 328 * the tag's ownership for flush req.
329 *
 330 * In case of an IO scheduler, the flush rq needs to borrow a scheduler tag
 331 * just so the driver-tag get/put paths can be used on it.
324 */ 332 */
325 if (q->mq_ops) { 333 if (q->mq_ops) {
326 struct blk_mq_hw_ctx *hctx; 334 struct blk_mq_hw_ctx *hctx;
327 335
328 flush_rq->mq_ctx = first_rq->mq_ctx; 336 flush_rq->mq_ctx = first_rq->mq_ctx;
329 flush_rq->tag = first_rq->tag;
330 fq->orig_rq = first_rq;
331 337
332 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); 338 if (!q->elevator) {
333 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); 339 fq->orig_rq = first_rq;
340 flush_rq->tag = first_rq->tag;
341 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
342 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
343 } else {
344 flush_rq->internal_tag = first_rq->internal_tag;
345 }
334 } 346 }
335 347
336 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; 348 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
394 406
395 hctx = blk_mq_map_queue(q, ctx->cpu); 407 hctx = blk_mq_map_queue(q, ctx->cpu);
396 408
409 if (q->elevator) {
410 WARN_ON(rq->tag < 0);
411 blk_mq_put_driver_tag_hctx(hctx, rq);
412 }
413
397 /* 414 /*
398 * After populating an empty queue, kick it to avoid stall. Read 415 * After populating an empty queue, kick it to avoid stall. Read
399 * the comment in flush_end_io(). 416 * the comment in flush_end_io().
@@ -463,7 +480,7 @@ void blk_insert_flush(struct request *rq)
463 if ((policy & REQ_FSEQ_DATA) && 480 if ((policy & REQ_FSEQ_DATA) &&
464 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 481 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
465 if (q->mq_ops) 482 if (q->mq_ops)
466 blk_mq_sched_insert_request(rq, false, true, false, false); 483 blk_mq_request_bypass_insert(rq, false);
467 else 484 else
468 list_add_tail(&rq->queuelist, &q->queue_head); 485 list_add_tail(&rq->queuelist, &q->queue_head);
469 return; 486 return;
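
As an aside on the blk-flush.c changes above: the new code keys the flush request's identity off whether an elevator is attached. With no scheduler it borrows the originating request's driver tag; with a scheduler it only carries the internal (scheduler) tag and picks up a driver tag later. The toy sketch below, using made-up toy_flush_rq/setup_flush_rq names that are not kernel APIs, shows just that decision in isolation.

#include <stdbool.h>
#include <stdio.h>

struct toy_flush_rq { int tag; int internal_tag; };

static void setup_flush_rq(struct toy_flush_rq *flush_rq, bool has_elevator,
			   int first_rq_tag, int first_rq_internal_tag)
{
	if (!has_elevator) {
		/* none scheduler: reuse the originating request's driver tag */
		flush_rq->tag = first_rq_tag;
		flush_rq->internal_tag = -1;
	} else {
		/* IO scheduler: carry only the scheduler (internal) tag */
		flush_rq->tag = -1;
		flush_rq->internal_tag = first_rq_internal_tag;
	}
}

int main(void)
{
	struct toy_flush_rq rq;

	setup_flush_rq(&rq, false, 7, 3);
	printf("none:  tag=%d internal_tag=%d\n", rq.tag, rq.internal_tag);
	setup_flush_rq(&rq, true, 7, 3);
	printf("sched: tag=%d internal_tag=%d\n", rq.tag, rq.internal_tag);
	return 0;
}

The matching branch in flush_end_io() then releases whichever tag was actually used for the flush request.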
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 63fb971d6574..2bc544ce3d2e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -275,6 +275,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
275 return min(pages, (sector_t)BIO_MAX_PAGES); 275 return min(pages, (sector_t)BIO_MAX_PAGES);
276} 276}
277 277
278static int __blkdev_issue_zero_pages(struct block_device *bdev,
279 sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
280 struct bio **biop)
281{
282 struct request_queue *q = bdev_get_queue(bdev);
283 struct bio *bio = *biop;
284 int bi_size = 0;
285 unsigned int sz;
286
287 if (!q)
288 return -ENXIO;
289
290 while (nr_sects != 0) {
291 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
292 gfp_mask);
293 bio->bi_iter.bi_sector = sector;
294 bio_set_dev(bio, bdev);
295 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
296
297 while (nr_sects != 0) {
298 sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
299 bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
300 nr_sects -= bi_size >> 9;
301 sector += bi_size >> 9;
302 if (bi_size < sz)
303 break;
304 }
305 cond_resched();
306 }
307
308 *biop = bio;
309 return 0;
310}
311
278/** 312/**
279 * __blkdev_issue_zeroout - generate a number of zero-filled write bios 313
280 * @bdev: blockdev to issue 314 * @bdev: blockdev to issue
@@ -288,12 +322,6 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
288 * Zero-fill a block range, either using hardware offload or by explicitly 322 * Zero-fill a block range, either using hardware offload or by explicitly
289 * writing zeroes to the device. 323 * writing zeroes to the device.
290 * 324 *
291 * Note that this function may fail with -EOPNOTSUPP if the driver signals
292 * zeroing offload support, but the device fails to process the command (for
293 * some devices there is no non-destructive way to verify whether this
294 * operation is actually supported). In this case the caller should call
295 * retry the call to blkdev_issue_zeroout() and the fallback path will be used.
296 *
297 * If a device is using logical block provisioning, the underlying space will 325 * If a device is using logical block provisioning, the underlying space will
298 * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. 326 * not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
299 * 327 *
@@ -305,9 +333,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
305 unsigned flags) 333 unsigned flags)
306{ 334{
307 int ret; 335 int ret;
308 int bi_size = 0;
309 struct bio *bio = *biop;
310 unsigned int sz;
311 sector_t bs_mask; 336 sector_t bs_mask;
312 337
313 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; 338 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
@@ -317,30 +342,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
317 ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, 342 ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
318 biop, flags); 343 biop, flags);
319 if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) 344 if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
320 goto out; 345 return ret;
321
322 ret = 0;
323 while (nr_sects != 0) {
324 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
325 gfp_mask);
326 bio->bi_iter.bi_sector = sector;
327 bio_set_dev(bio, bdev);
328 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
329
330 while (nr_sects != 0) {
331 sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
332 bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
333 nr_sects -= bi_size >> 9;
334 sector += bi_size >> 9;
335 if (bi_size < sz)
336 break;
337 }
338 cond_resched();
339 }
340 346
341 *biop = bio; 347 return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
342out: 348 biop);
343 return ret;
344} 349}
345EXPORT_SYMBOL(__blkdev_issue_zeroout); 350EXPORT_SYMBOL(__blkdev_issue_zeroout);
346 351
@@ -360,18 +365,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
360int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 365int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
361 sector_t nr_sects, gfp_t gfp_mask, unsigned flags) 366 sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
362{ 367{
363 int ret; 368 int ret = 0;
364 struct bio *bio = NULL; 369 sector_t bs_mask;
370 struct bio *bio;
365 struct blk_plug plug; 371 struct blk_plug plug;
372 bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
366 373
374 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
375 if ((sector | nr_sects) & bs_mask)
376 return -EINVAL;
377
378retry:
379 bio = NULL;
367 blk_start_plug(&plug); 380 blk_start_plug(&plug);
368 ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 381 if (try_write_zeroes) {
369 &bio, flags); 382 ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
383 gfp_mask, &bio, flags);
384 } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
385 ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
386 gfp_mask, &bio);
387 } else {
388 /* No zeroing offload support */
389 ret = -EOPNOTSUPP;
390 }
370 if (ret == 0 && bio) { 391 if (ret == 0 && bio) {
371 ret = submit_bio_wait(bio); 392 ret = submit_bio_wait(bio);
372 bio_put(bio); 393 bio_put(bio);
373 } 394 }
374 blk_finish_plug(&plug); 395 blk_finish_plug(&plug);
396 if (ret && try_write_zeroes) {
397 if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
398 try_write_zeroes = false;
399 goto retry;
400 }
401 if (!bdev_write_zeroes_sectors(bdev)) {
402 /*
403 * Zeroing offload support was indicated, but the
404 * device reported ILLEGAL REQUEST (for some devices
405 * there is no non-destructive way to verify whether
406 * WRITE ZEROES is actually supported).
407 */
408 ret = -EOPNOTSUPP;
409 }
410 }
375 411
376 return ret; 412 return ret;
377} 413}
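
To make the reworked blkdev_issue_zeroout() control flow above easier to follow, here is a minimal user-space sketch of the same retry-with-fallback shape: try the WRITE ZEROES offload first, and fall back to explicitly writing zero pages only when the offload fails and the caller has not forbidden the fallback. issue_write_zeroes() and issue_zero_pages() are stand-in stubs, not the real kernel helpers, and the real function of course also handles bio chaining, plugging and alignment checks.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in stubs: pretend the offload is advertised but rejected at runtime. */
static int issue_write_zeroes(void) { return -EOPNOTSUPP; }
static int issue_zero_pages(void)   { return 0; }

static int issue_zeroout(bool offload_supported, bool nofallback)
{
	bool try_write_zeroes = offload_supported;
	int ret;

retry:
	if (try_write_zeroes)
		ret = issue_write_zeroes();
	else if (!nofallback)
		ret = issue_zero_pages();
	else
		ret = -EOPNOTSUPP;	/* no offload and fallback forbidden */

	if (ret && try_write_zeroes) {
		if (!nofallback) {
			/* offload failed at runtime: retry with the page fallback */
			try_write_zeroes = false;
			goto retry;
		}
		/* offload advertised but rejected, and no fallback allowed */
		ret = -EOPNOTSUPP;
	}
	return ret;
}

int main(void)
{
	printf("with fallback: %d\n", issue_zeroout(true, false));	/* 0 */
	printf("no fallback:   %d\n", issue_zeroout(true, true));	/* -EOPNOTSUPP */
	return 0;
}

As the removed docbook note above suggests, the restructuring moves the fallback retry into the synchronous blkdev_issue_zeroout() wrapper instead of asking callers of __blkdev_issue_zeroout() to handle a late -EOPNOTSUPP themselves.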
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index de294d775acf..b56a4f35720d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = {
54 QUEUE_FLAG_NAME(NOMERGES), 54 QUEUE_FLAG_NAME(NOMERGES),
55 QUEUE_FLAG_NAME(SAME_COMP), 55 QUEUE_FLAG_NAME(SAME_COMP),
56 QUEUE_FLAG_NAME(FAIL_IO), 56 QUEUE_FLAG_NAME(FAIL_IO),
57 QUEUE_FLAG_NAME(STACKABLE),
58 QUEUE_FLAG_NAME(NONROT), 57 QUEUE_FLAG_NAME(NONROT),
59 QUEUE_FLAG_NAME(IO_STAT), 58 QUEUE_FLAG_NAME(IO_STAT),
60 QUEUE_FLAG_NAME(DISCARD), 59 QUEUE_FLAG_NAME(DISCARD),
@@ -75,6 +74,7 @@ static const char *const blk_queue_flag_name[] = {
75 QUEUE_FLAG_NAME(REGISTERED), 74 QUEUE_FLAG_NAME(REGISTERED),
76 QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), 75 QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
77 QUEUE_FLAG_NAME(QUIESCED), 76 QUEUE_FLAG_NAME(QUIESCED),
77 QUEUE_FLAG_NAME(PREEMPT_ONLY),
78}; 78};
79#undef QUEUE_FLAG_NAME 79#undef QUEUE_FLAG_NAME
80 80
@@ -180,7 +180,6 @@ static const char *const hctx_state_name[] = {
180 HCTX_STATE_NAME(STOPPED), 180 HCTX_STATE_NAME(STOPPED),
181 HCTX_STATE_NAME(TAG_ACTIVE), 181 HCTX_STATE_NAME(TAG_ACTIVE),
182 HCTX_STATE_NAME(SCHED_RESTART), 182 HCTX_STATE_NAME(SCHED_RESTART),
183 HCTX_STATE_NAME(TAG_WAITING),
184 HCTX_STATE_NAME(START_ON_RUN), 183 HCTX_STATE_NAME(START_ON_RUN),
185}; 184};
186#undef HCTX_STATE_NAME 185#undef HCTX_STATE_NAME
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..c117bd8fd1f6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -81,20 +81,103 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
81 } else 81 } else
82 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 82 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
83 83
84 if (blk_mq_hctx_has_pending(hctx)) { 84 return blk_mq_run_hw_queue(hctx, true);
85 blk_mq_run_hw_queue(hctx, true); 85}
86 return true;
87 }
88 86
89 return false; 87/*
88 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
89 * its queue by itself in its completion handler, so we don't need to
90 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
91 */
92static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
93{
94 struct request_queue *q = hctx->queue;
95 struct elevator_queue *e = q->elevator;
96 LIST_HEAD(rq_list);
97
98 do {
99 struct request *rq;
100
101 if (e->type->ops.mq.has_work &&
102 !e->type->ops.mq.has_work(hctx))
103 break;
104
105 if (!blk_mq_get_dispatch_budget(hctx))
106 break;
107
108 rq = e->type->ops.mq.dispatch_request(hctx);
109 if (!rq) {
110 blk_mq_put_dispatch_budget(hctx);
111 break;
112 }
113
114 /*
115 * Now this rq owns the budget which has to be released
116 * if this rq won't be queued to driver via .queue_rq()
117 * in blk_mq_dispatch_rq_list().
118 */
119 list_add(&rq->queuelist, &rq_list);
120 } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
90} 121}
91 122
123static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
124 struct blk_mq_ctx *ctx)
125{
126 unsigned idx = ctx->index_hw;
127
128 if (++idx == hctx->nr_ctx)
129 idx = 0;
130
131 return hctx->ctxs[idx];
132}
133
134/*
135 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
136 * its queue by itself in its completion handler, so we don't need to
 137 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
138 */
139static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
140{
141 struct request_queue *q = hctx->queue;
142 LIST_HEAD(rq_list);
143 struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
144
145 do {
146 struct request *rq;
147
148 if (!sbitmap_any_bit_set(&hctx->ctx_map))
149 break;
150
151 if (!blk_mq_get_dispatch_budget(hctx))
152 break;
153
154 rq = blk_mq_dequeue_from_ctx(hctx, ctx);
155 if (!rq) {
156 blk_mq_put_dispatch_budget(hctx);
157 break;
158 }
159
160 /*
161 * Now this rq owns the budget which has to be released
162 * if this rq won't be queued to driver via .queue_rq()
163 * in blk_mq_dispatch_rq_list().
164 */
165 list_add(&rq->queuelist, &rq_list);
166
167 /* round robin for fair dispatch */
168 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
169
170 } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
171
172 WRITE_ONCE(hctx->dispatch_from, ctx);
173}
174
 175/* return true if the hw queue needs to be run again */
92void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) 176void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
93{ 177{
94 struct request_queue *q = hctx->queue; 178 struct request_queue *q = hctx->queue;
95 struct elevator_queue *e = q->elevator; 179 struct elevator_queue *e = q->elevator;
96 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; 180 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
97 bool did_work = false;
98 LIST_HEAD(rq_list); 181 LIST_HEAD(rq_list);
99 182
100 /* RCU or SRCU read lock is needed before checking quiesced flag */ 183 /* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,29 +205,34 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
122 * scheduler, we can no longer merge or sort them. So it's best to 205 * scheduler, we can no longer merge or sort them. So it's best to
123 * leave them there for as long as we can. Mark the hw queue as 206 * leave them there for as long as we can. Mark the hw queue as
124 * needing a restart in that case. 207 * needing a restart in that case.
208 *
209 * We want to dispatch from the scheduler if there was nothing
210 * on the dispatch list or we were able to dispatch from the
211 * dispatch list.
125 */ 212 */
126 if (!list_empty(&rq_list)) { 213 if (!list_empty(&rq_list)) {
127 blk_mq_sched_mark_restart_hctx(hctx); 214 blk_mq_sched_mark_restart_hctx(hctx);
128 did_work = blk_mq_dispatch_rq_list(q, &rq_list); 215 if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
129 } else if (!has_sched_dispatch) { 216 if (has_sched_dispatch)
217 blk_mq_do_dispatch_sched(hctx);
218 else
219 blk_mq_do_dispatch_ctx(hctx);
220 }
221 } else if (has_sched_dispatch) {
222 blk_mq_do_dispatch_sched(hctx);
223 } else if (q->mq_ops->get_budget) {
224 /*
 225 * If we need to get a budget before queueing a request, we
 226 * dequeue requests one by one from the sw queue, to avoid
 227 * messing up I/O merging when dispatch runs out of resources.
 228 *
 229 * TODO: get more budget and dequeue more requests at a
 230 * time.
231 */
232 blk_mq_do_dispatch_ctx(hctx);
233 } else {
130 blk_mq_flush_busy_ctxs(hctx, &rq_list); 234 blk_mq_flush_busy_ctxs(hctx, &rq_list);
131 blk_mq_dispatch_rq_list(q, &rq_list); 235 blk_mq_dispatch_rq_list(q, &rq_list, false);
132 }
133
134 /*
135 * We want to dispatch from the scheduler if we had no work left
136 * on the dispatch list, OR if we did have work but weren't able
137 * to make progress.
138 */
139 if (!did_work && has_sched_dispatch) {
140 do {
141 struct request *rq;
142
143 rq = e->type->ops.mq.dispatch_request(hctx);
144 if (!rq)
145 break;
146 list_add(&rq->queuelist, &rq_list);
147 } while (blk_mq_dispatch_rq_list(q, &rq_list));
148 } 236 }
149} 237}
150 238
@@ -260,21 +348,21 @@ void blk_mq_sched_request_inserted(struct request *rq)
260EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); 348EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
261 349
262static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, 350static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
351 bool has_sched,
263 struct request *rq) 352 struct request *rq)
264{ 353{
265 if (rq->tag == -1) { 354 /* dispatch flush rq directly */
266 rq->rq_flags |= RQF_SORTED; 355 if (rq->rq_flags & RQF_FLUSH_SEQ) {
267 return false; 356 spin_lock(&hctx->lock);
357 list_add(&rq->queuelist, &hctx->dispatch);
358 spin_unlock(&hctx->lock);
359 return true;
268 } 360 }
269 361
270 /* 362 if (has_sched)
271 * If we already have a real request tag, send directly to 363 rq->rq_flags |= RQF_SORTED;
272 * the dispatch list. 364
273 */ 365 return false;
274 spin_lock(&hctx->lock);
275 list_add(&rq->queuelist, &hctx->dispatch);
276 spin_unlock(&hctx->lock);
277 return true;
278} 366}
279 367
280/** 368/**
@@ -339,21 +427,6 @@ done:
339 } 427 }
340} 428}
341 429
342/*
343 * Add flush/fua to the queue. If we fail getting a driver tag, then
344 * punt to the requeue list. Requeue will re-invoke us from a context
345 * that's safe to block from.
346 */
347static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
348 struct request *rq, bool can_block)
349{
350 if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
351 blk_insert_flush(rq);
352 blk_mq_run_hw_queue(hctx, true);
353 } else
354 blk_mq_add_to_requeue_list(rq, false, true);
355}
356
357void blk_mq_sched_insert_request(struct request *rq, bool at_head, 430void blk_mq_sched_insert_request(struct request *rq, bool at_head,
358 bool run_queue, bool async, bool can_block) 431 bool run_queue, bool async, bool can_block)
359{ 432{
@@ -362,12 +435,15 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
362 struct blk_mq_ctx *ctx = rq->mq_ctx; 435 struct blk_mq_ctx *ctx = rq->mq_ctx;
363 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 436 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
364 437
365 if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { 438 /* flush rq in flush machinery need to be dispatched directly */
366 blk_mq_sched_insert_flush(hctx, rq, can_block); 439 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
367 return; 440 blk_insert_flush(rq);
441 goto run;
368 } 442 }
369 443
370 if (e && blk_mq_sched_bypass_insert(hctx, rq)) 444 WARN_ON(e && (rq->tag != -1));
445
446 if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
371 goto run; 447 goto run;
372 448
373 if (e && e->type->ops.mq.insert_requests) { 449 if (e && e->type->ops.mq.insert_requests) {
@@ -393,23 +469,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
393 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 469 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
394 struct elevator_queue *e = hctx->queue->elevator; 470 struct elevator_queue *e = hctx->queue->elevator;
395 471
396 if (e) {
397 struct request *rq, *next;
398
399 /*
400 * We bypass requests that already have a driver tag assigned,
401 * which should only be flushes. Flushes are only ever inserted
402 * as single requests, so we shouldn't ever hit the
403 * WARN_ON_ONCE() below (but let's handle it just in case).
404 */
405 list_for_each_entry_safe(rq, next, list, queuelist) {
406 if (WARN_ON_ONCE(rq->tag != -1)) {
407 list_del_init(&rq->queuelist);
408 blk_mq_sched_bypass_insert(hctx, rq);
409 }
410 }
411 }
412
413 if (e && e->type->ops.mq.insert_requests) 472 if (e && e->type->ops.mq.insert_requests)
414 e->type->ops.mq.insert_requests(hctx, list, false); 473 e->type->ops.mq.insert_requests(hctx, list, false);
415 else 474 else
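
A compact illustration of the dispatch-budget pattern introduced in blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx() above: a budget is taken before a request is pulled from the scheduler or sw queue, and handed back if nothing was actually dequeued. The get_budget()/put_budget()/dispatch_one() helpers below are toy stand-ins with made-up semantics, not the blk-mq API.

#include <stdbool.h>
#include <stdio.h>

static int budget  = 2;		/* e.g. free device queue slots */
static int pending = 3;		/* requests waiting in the scheduler */

static bool get_budget(void)   { return budget  > 0 ? (budget--,  true) : false; }
static void put_budget(void)   { budget++; }
static bool dispatch_one(void) { return pending > 0 ? (pending--, true) : false; }

static void do_dispatch_sched(void)
{
	for (;;) {
		if (!get_budget())
			break;		/* driver is out of resources */
		if (!dispatch_one()) {
			put_budget();	/* nothing dequeued: hand the budget back */
			break;
		}
		/* the dispatched request now owns the budget */
	}
}

int main(void)
{
	do_dispatch_sched();
	printf("budget=%d pending=%d\n", budget, pending);	/* budget=0 pending=1 */
	return 0;
}

In the real code the dequeued request owns the budget from that point on, and blk_mq_dispatch_rq_list() releases it if ->queue_rq() never gets to see the request.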
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6714507aa6c7..c81b40ecd3f1 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
298} 298}
299EXPORT_SYMBOL(blk_mq_tagset_busy_iter); 299EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
300 300
301int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, 301int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data,
302 int (reinit_request)(void *, struct request *)) 302 int (fn)(void *, struct request *))
303{ 303{
304 int i, j, ret = 0; 304 int i, j, ret = 0;
305 305
306 if (WARN_ON_ONCE(!reinit_request)) 306 if (WARN_ON_ONCE(!fn))
307 goto out; 307 goto out;
308 308
309 for (i = 0; i < set->nr_hw_queues; i++) { 309 for (i = 0; i < set->nr_hw_queues; i++) {
@@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
316 if (!tags->static_rqs[j]) 316 if (!tags->static_rqs[j])
317 continue; 317 continue;
318 318
319 ret = reinit_request(set->driver_data, 319 ret = fn(data, tags->static_rqs[j]);
320 tags->static_rqs[j]);
321 if (ret) 320 if (ret)
322 goto out; 321 goto out;
323 } 322 }
@@ -326,7 +325,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
326out: 325out:
327 return ret; 326 return ret;
328} 327}
329EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); 328EXPORT_SYMBOL_GPL(blk_mq_tagset_iter);
330 329
331void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, 330void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
332 void *priv) 331 void *priv)
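
The rename from blk_mq_reinit_tagset() to blk_mq_tagset_iter() above turns a reinit-specific walk into a generic "apply fn to every statically allocated request" iterator that stops at the first non-zero return. Its overall shape, reduced to toy types (toy_tagset/toy_request are illustrative, not kernel structures), looks roughly like this:

#include <stdio.h>

struct toy_request { int tag; };

struct toy_tagset {
	struct toy_request *rqs;
	int nr_rqs;
};

/* Apply fn to every request; stop early on the first non-zero return. */
static int tagset_iter(struct toy_tagset *set, void *data,
		       int (*fn)(void *data, struct toy_request *rq))
{
	for (int i = 0; i < set->nr_rqs; i++) {
		int ret = fn(data, &set->rqs[i]);

		if (ret)
			return ret;
	}
	return 0;
}

static int print_rq(void *data, struct toy_request *rq)
{
	printf("%s request with tag %d\n", (const char *)data, rq->tag);
	return 0;
}

int main(void)
{
	struct toy_request rqs[] = { { 0 }, { 1 }, { 2 } };
	struct toy_tagset set = { rqs, 3 };

	return tagset_iter(&set, "visiting", print_rq);
}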
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index c190165d92ea..61deab0b5a5a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -45,13 +45,8 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
45} 45}
46 46
47enum { 47enum {
48 BLK_MQ_TAG_CACHE_MIN = 1,
49 BLK_MQ_TAG_CACHE_MAX = 64,
50};
51
52enum {
53 BLK_MQ_TAG_FAIL = -1U, 48 BLK_MQ_TAG_FAIL = -1U,
54 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 49 BLK_MQ_TAG_MIN = 1,
55 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 50 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
56}; 51};
57 52
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a18609755e..b600463791ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -37,6 +37,7 @@
37#include "blk-wbt.h" 37#include "blk-wbt.h"
38#include "blk-mq-sched.h" 38#include "blk-mq-sched.h"
39 39
40static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
40static void blk_mq_poll_stats_start(struct request_queue *q); 41static void blk_mq_poll_stats_start(struct request_queue *q);
41static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 42static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
42 43
@@ -60,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
60/* 61/*
61 * Check if any of the ctx's have pending work in this hardware queue 62 * Check if any of the ctx's have pending work in this hardware queue
62 */ 63 */
63bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 64static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
64{ 65{
65 return sbitmap_any_bit_set(&hctx->ctx_map) || 66 return !list_empty_careful(&hctx->dispatch) ||
66 !list_empty_careful(&hctx->dispatch) || 67 sbitmap_any_bit_set(&hctx->ctx_map) ||
67 blk_mq_sched_has_work(hctx); 68 blk_mq_sched_has_work(hctx);
68} 69}
69 70
@@ -125,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q)
125 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 126 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
126 if (freeze_depth == 1) { 127 if (freeze_depth == 1) {
127 percpu_ref_kill(&q->q_usage_counter); 128 percpu_ref_kill(&q->q_usage_counter);
128 blk_mq_run_hw_queues(q, false); 129 if (q->mq_ops)
130 blk_mq_run_hw_queues(q, false);
129 } 131 }
130} 132}
131EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 133EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -255,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
255 queue_for_each_hw_ctx(q, hctx, i) 257 queue_for_each_hw_ctx(q, hctx, i)
256 if (blk_mq_hw_queue_mapped(hctx)) 258 if (blk_mq_hw_queue_mapped(hctx))
257 blk_mq_tag_wakeup_all(hctx->tags, true); 259 blk_mq_tag_wakeup_all(hctx->tags, true);
258
259 /*
260 * If we are called because the queue has now been marked as
261 * dying, we need to ensure that processes currently waiting on
262 * the queue are notified as well.
263 */
264 wake_up_all(&q->mq_freeze_wq);
265} 260}
266 261
267bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 262bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -296,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
296 rq->q = data->q; 291 rq->q = data->q;
297 rq->mq_ctx = data->ctx; 292 rq->mq_ctx = data->ctx;
298 rq->cmd_flags = op; 293 rq->cmd_flags = op;
294 if (data->flags & BLK_MQ_REQ_PREEMPT)
295 rq->rq_flags |= RQF_PREEMPT;
299 if (blk_queue_io_stat(data->q)) 296 if (blk_queue_io_stat(data->q))
300 rq->rq_flags |= RQF_IO_STAT; 297 rq->rq_flags |= RQF_IO_STAT;
301 /* do not touch atomic flags, it needs atomic ops against the timer */ 298 /* do not touch atomic flags, it needs atomic ops against the timer */
@@ -336,12 +333,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
336 struct elevator_queue *e = q->elevator; 333 struct elevator_queue *e = q->elevator;
337 struct request *rq; 334 struct request *rq;
338 unsigned int tag; 335 unsigned int tag;
339 struct blk_mq_ctx *local_ctx = NULL; 336 bool put_ctx_on_error = false;
340 337
341 blk_queue_enter_live(q); 338 blk_queue_enter_live(q);
342 data->q = q; 339 data->q = q;
343 if (likely(!data->ctx)) 340 if (likely(!data->ctx)) {
344 data->ctx = local_ctx = blk_mq_get_ctx(q); 341 data->ctx = blk_mq_get_ctx(q);
342 put_ctx_on_error = true;
343 }
345 if (likely(!data->hctx)) 344 if (likely(!data->hctx))
346 data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 345 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
347 if (op & REQ_NOWAIT) 346 if (op & REQ_NOWAIT)
@@ -360,8 +359,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
360 359
361 tag = blk_mq_get_tag(data); 360 tag = blk_mq_get_tag(data);
362 if (tag == BLK_MQ_TAG_FAIL) { 361 if (tag == BLK_MQ_TAG_FAIL) {
363 if (local_ctx) { 362 if (put_ctx_on_error) {
364 blk_mq_put_ctx(local_ctx); 363 blk_mq_put_ctx(data->ctx);
365 data->ctx = NULL; 364 data->ctx = NULL;
366 } 365 }
367 blk_queue_exit(q); 366 blk_queue_exit(q);
@@ -384,13 +383,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
384} 383}
385 384
386struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 385struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
387 unsigned int flags) 386 blk_mq_req_flags_t flags)
388{ 387{
389 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 388 struct blk_mq_alloc_data alloc_data = { .flags = flags };
390 struct request *rq; 389 struct request *rq;
391 int ret; 390 int ret;
392 391
393 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); 392 ret = blk_queue_enter(q, flags);
394 if (ret) 393 if (ret)
395 return ERR_PTR(ret); 394 return ERR_PTR(ret);
396 395
@@ -410,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
410EXPORT_SYMBOL(blk_mq_alloc_request); 409EXPORT_SYMBOL(blk_mq_alloc_request);
411 410
412struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 411struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
413 unsigned int op, unsigned int flags, unsigned int hctx_idx) 412 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
414{ 413{
415 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 414 struct blk_mq_alloc_data alloc_data = { .flags = flags };
416 struct request *rq; 415 struct request *rq;
@@ -429,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
429 if (hctx_idx >= q->nr_hw_queues) 428 if (hctx_idx >= q->nr_hw_queues)
430 return ERR_PTR(-EIO); 429 return ERR_PTR(-EIO);
431 430
432 ret = blk_queue_enter(q, true); 431 ret = blk_queue_enter(q, flags);
433 if (ret) 432 if (ret)
434 return ERR_PTR(ret); 433 return ERR_PTR(ret);
435 434
@@ -476,8 +475,14 @@ void blk_mq_free_request(struct request *rq)
476 if (rq->rq_flags & RQF_MQ_INFLIGHT) 475 if (rq->rq_flags & RQF_MQ_INFLIGHT)
477 atomic_dec(&hctx->nr_active); 476 atomic_dec(&hctx->nr_active);
478 477
478 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
479 laptop_io_completion(q->backing_dev_info);
480
479 wbt_done(q->rq_wb, &rq->issue_stat); 481 wbt_done(q->rq_wb, &rq->issue_stat);
480 482
483 if (blk_rq_rl(rq))
484 blk_put_rl(blk_rq_rl(rq));
485
481 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 486 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
482 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 487 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
483 if (rq->tag != -1) 488 if (rq->tag != -1)
@@ -593,22 +598,32 @@ void blk_mq_start_request(struct request *rq)
593 598
594 blk_add_timer(rq); 599 blk_add_timer(rq);
595 600
596 /* 601 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
597 * Ensure that ->deadline is visible before set the started
598 * flag and clear the completed flag.
599 */
600 smp_mb__before_atomic();
601 602
602 /* 603 /*
603 * Mark us as started and clear complete. Complete might have been 604 * Mark us as started and clear complete. Complete might have been
604 * set if requeue raced with timeout, which then marked it as 605 * set if requeue raced with timeout, which then marked it as
605 * complete. So be sure to clear complete again when we start 606 * complete. So be sure to clear complete again when we start
606 * the request, otherwise we'll ignore the completion event. 607 * the request, otherwise we'll ignore the completion event.
608 *
609 * Ensure that ->deadline is visible before we set STARTED, such that
610 * blk_mq_check_expired() is guaranteed to observe our ->deadline when
611 * it observes STARTED.
607 */ 612 */
608 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 613 smp_wmb();
609 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 614 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
610 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 615 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
616 /*
617 * Coherence order guarantees these consecutive stores to a
618 * single variable propagate in the specified order. Thus the
619 * clear_bit() is ordered _after_ the set bit. See
620 * blk_mq_check_expired().
621 *
622 * (the bits must be part of the same byte for this to be
623 * true).
624 */
611 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 625 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
626 }
612 627
613 if (q->dma_drain_size && blk_rq_bytes(rq)) { 628 if (q->dma_drain_size && blk_rq_bytes(rq)) {
614 /* 629 /*
@@ -634,6 +649,8 @@ static void __blk_mq_requeue_request(struct request *rq)
634{ 649{
635 struct request_queue *q = rq->q; 650 struct request_queue *q = rq->q;
636 651
652 blk_mq_put_driver_tag(rq);
653
637 trace_block_rq_requeue(q, rq); 654 trace_block_rq_requeue(q, rq);
638 wbt_requeue(q->rq_wb, &rq->issue_stat); 655 wbt_requeue(q->rq_wb, &rq->issue_stat);
639 blk_mq_sched_requeue_request(rq); 656 blk_mq_sched_requeue_request(rq);
@@ -690,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
690 707
691 /* 708 /*
692 * We abuse this flag that is otherwise used by the I/O scheduler to 709 * We abuse this flag that is otherwise used by the I/O scheduler to
693 * request head insertation from the workqueue. 710 * request head insertion from the workqueue.
694 */ 711 */
695 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 712 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
696 713
@@ -778,11 +795,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
778 struct request *rq, void *priv, bool reserved) 795 struct request *rq, void *priv, bool reserved)
779{ 796{
780 struct blk_mq_timeout_data *data = priv; 797 struct blk_mq_timeout_data *data = priv;
798 unsigned long deadline;
781 799
782 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 800 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
783 return; 801 return;
784 802
785 /* 803 /*
804 * Ensures that if we see STARTED we must also see our
805 * up-to-date deadline, see blk_mq_start_request().
806 */
807 smp_rmb();
808
809 deadline = READ_ONCE(rq->deadline);
810
811 /*
786 * The rq being checked may have been freed and reallocated 812 * The rq being checked may have been freed and reallocated
787 * out already here, we avoid this race by checking rq->deadline 813 * out already here, we avoid this race by checking rq->deadline
788 * and REQ_ATOM_COMPLETE flag together: 814 * and REQ_ATOM_COMPLETE flag together:
@@ -795,11 +821,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
795 * and clearing the flag in blk_mq_start_request(), so 821 * and clearing the flag in blk_mq_start_request(), so
796 * this rq won't be timed out too. 822 * this rq won't be timed out too.
797 */ 823 */
798 if (time_after_eq(jiffies, rq->deadline)) { 824 if (time_after_eq(jiffies, deadline)) {
799 if (!blk_mark_rq_complete(rq)) 825 if (!blk_mark_rq_complete(rq)) {
826 /*
827 * Again coherence order ensures that consecutive reads
828 * from the same variable must be in that order. This
829 * ensures that if we see COMPLETE clear, we must then
830 * see STARTED set and we'll ignore this timeout.
831 *
832 * (There's also the MB implied by the test_and_clear())
833 */
800 blk_mq_rq_timed_out(rq, reserved); 834 blk_mq_rq_timed_out(rq, reserved);
801 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 835 }
802 data->next = rq->deadline; 836 } else if (!data->next_set || time_after(data->next, deadline)) {
837 data->next = deadline;
803 data->next_set = 1; 838 data->next_set = 1;
804 } 839 }
805} 840}
@@ -880,6 +915,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
880} 915}
881EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 916EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
882 917
918struct dispatch_rq_data {
919 struct blk_mq_hw_ctx *hctx;
920 struct request *rq;
921};
922
923static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
924 void *data)
925{
926 struct dispatch_rq_data *dispatch_data = data;
927 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
928 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
929
930 spin_lock(&ctx->lock);
931 if (unlikely(!list_empty(&ctx->rq_list))) {
932 dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
933 list_del_init(&dispatch_data->rq->queuelist);
934 if (list_empty(&ctx->rq_list))
935 sbitmap_clear_bit(sb, bitnr);
936 }
937 spin_unlock(&ctx->lock);
938
939 return !dispatch_data->rq;
940}
941
942struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
943 struct blk_mq_ctx *start)
944{
945 unsigned off = start ? start->index_hw : 0;
946 struct dispatch_rq_data data = {
947 .hctx = hctx,
948 .rq = NULL,
949 };
950
951 __sbitmap_for_each_set(&hctx->ctx_map, off,
952 dispatch_rq_from_ctx, &data);
953
954 return data.rq;
955}
956
883static inline unsigned int queued_to_index(unsigned int queued) 957static inline unsigned int queued_to_index(unsigned int queued)
884{ 958{
885 if (!queued) 959 if (!queued)
@@ -920,109 +994,95 @@ done:
920 return rq->tag != -1; 994 return rq->tag != -1;
921} 995}
922 996
923static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 997static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
924 struct request *rq) 998 int flags, void *key)
925{
926 blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
927 rq->tag = -1;
928
929 if (rq->rq_flags & RQF_MQ_INFLIGHT) {
930 rq->rq_flags &= ~RQF_MQ_INFLIGHT;
931 atomic_dec(&hctx->nr_active);
932 }
933}
934
935static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
936 struct request *rq)
937{
938 if (rq->tag == -1 || rq->internal_tag == -1)
939 return;
940
941 __blk_mq_put_driver_tag(hctx, rq);
942}
943
944static void blk_mq_put_driver_tag(struct request *rq)
945{ 999{
946 struct blk_mq_hw_ctx *hctx; 1000 struct blk_mq_hw_ctx *hctx;
947 1001
948 if (rq->tag == -1 || rq->internal_tag == -1) 1002 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
949 return;
950 1003
951 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 1004 list_del_init(&wait->entry);
952 __blk_mq_put_driver_tag(hctx, rq); 1005 blk_mq_run_hw_queue(hctx, true);
1006 return 1;
953} 1007}
954 1008
955/* 1009/*
956 * If we fail getting a driver tag because all the driver tags are already 1010 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 957 * assigned and on the dispatch list, BUT the first entry does not have a 1011 * the tag wakeups. For non-shared tags, we can simply mark us as needing a
 958 * tag, then we could deadlock. For that case, move entries with assigned 1012 * restart. For both cases, take care to check the condition again after
959 * driver tags to the front, leaving the set of tagged requests in the 1013 * marking us as waiting.
960 * same order, and the untagged set in the same order.
961 */ 1014 */
962static bool reorder_tags_to_front(struct list_head *list) 1015static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
963{ 1016 struct request *rq)
964 struct request *rq, *tmp, *first = NULL;
965
966 list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
967 if (rq == first)
968 break;
969 if (rq->tag != -1) {
970 list_move(&rq->queuelist, list);
971 if (!first)
972 first = rq;
973 }
974 }
975
976 return first != NULL;
977}
978
979static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
980 void *key)
981{ 1017{
982 struct blk_mq_hw_ctx *hctx; 1018 struct blk_mq_hw_ctx *this_hctx = *hctx;
1019 bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
1020 struct sbq_wait_state *ws;
1021 wait_queue_entry_t *wait;
1022 bool ret;
983 1023
984 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1024 if (!shared_tags) {
1025 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
1026 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
1027 } else {
1028 wait = &this_hctx->dispatch_wait;
1029 if (!list_empty_careful(&wait->entry))
1030 return false;
985 1031
986 list_del(&wait->entry); 1032 spin_lock(&this_hctx->lock);
987 clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); 1033 if (!list_empty(&wait->entry)) {
988 blk_mq_run_hw_queue(hctx, true); 1034 spin_unlock(&this_hctx->lock);
989 return 1; 1035 return false;
990} 1036 }
991 1037
992static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) 1038 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
993{ 1039 add_wait_queue(&ws->wait, wait);
994 struct sbq_wait_state *ws; 1040 }
995 1041
996 /* 1042 /*
997 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. 1043 * It's possible that a tag was freed in the window between the
998 * The thread which wins the race to grab this bit adds the hardware 1044 * allocation failure and adding the hardware queue to the wait
999 * queue to the wait queue. 1045 * queue.
1000 */ 1046 */
1001 if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || 1047 ret = blk_mq_get_driver_tag(rq, hctx, false);
1002 test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
1003 return false;
1004 1048
1005 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 1049 if (!shared_tags) {
1006 ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); 1050 /*
1051 * Don't clear RESTART here, someone else could have set it.
1052 * At most this will cost an extra queue run.
1053 */
1054 return ret;
1055 } else {
1056 if (!ret) {
1057 spin_unlock(&this_hctx->lock);
1058 return false;
1059 }
1007 1060
1008 /* 1061 /*
1009 * As soon as this returns, it's no longer safe to fiddle with 1062 * We got a tag, remove ourselves from the wait queue to ensure
1010 * hctx->dispatch_wait, since a completion can wake up the wait queue 1063 * someone else gets the wakeup.
1011 * and unlock the bit. 1064 */
1012 */ 1065 spin_lock_irq(&ws->wait.lock);
1013 add_wait_queue(&ws->wait, &hctx->dispatch_wait); 1066 list_del_init(&wait->entry);
1014 return true; 1067 spin_unlock_irq(&ws->wait.lock);
1068 spin_unlock(&this_hctx->lock);
1069 return true;
1070 }
1015} 1071}
1016 1072
1017bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) 1073bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1074 bool got_budget)
1018{ 1075{
1019 struct blk_mq_hw_ctx *hctx; 1076 struct blk_mq_hw_ctx *hctx;
1020 struct request *rq; 1077 struct request *rq, *nxt;
1078 bool no_tag = false;
1021 int errors, queued; 1079 int errors, queued;
1022 1080
1023 if (list_empty(list)) 1081 if (list_empty(list))
1024 return false; 1082 return false;
1025 1083
1084 WARN_ON(!list_is_singular(list) && got_budget);
1085
1026 /* 1086 /*
1027 * Now process all the entries, sending them to the driver. 1087 * Now process all the entries, sending them to the driver.
1028 */ 1088 */
@@ -1033,23 +1093,29 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
1033 1093
1034 rq = list_first_entry(list, struct request, queuelist); 1094 rq = list_first_entry(list, struct request, queuelist);
1035 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { 1095 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1036 if (!queued && reorder_tags_to_front(list))
1037 continue;
1038
1039 /* 1096 /*
1040 * The initial allocation attempt failed, so we need to 1097 * The initial allocation attempt failed, so we need to
1041 * rerun the hardware queue when a tag is freed. 1098 * rerun the hardware queue when a tag is freed. The
1099 * waitqueue takes care of that. If the queue is run
1100 * before we add this entry back on the dispatch list,
1101 * we'll re-run it below.
1042 */ 1102 */
1043 if (!blk_mq_dispatch_wait_add(hctx)) 1103 if (!blk_mq_mark_tag_wait(&hctx, rq)) {
1104 if (got_budget)
1105 blk_mq_put_dispatch_budget(hctx);
1106 /*
1107 * For non-shared tags, the RESTART check
1108 * will suffice.
1109 */
1110 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1111 no_tag = true;
1044 break; 1112 break;
1113 }
1114 }
1045 1115
1046 /* 1116 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
1047 * It's possible that a tag was freed in the window 1117 blk_mq_put_driver_tag(rq);
1048 * between the allocation failure and adding the 1118 break;
1049 * hardware queue to the wait queue.
1050 */
1051 if (!blk_mq_get_driver_tag(rq, &hctx, false))
1052 break;
1053 } 1119 }
1054 1120
1055 list_del_init(&rq->queuelist); 1121 list_del_init(&rq->queuelist);
@@ -1063,15 +1129,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
1063 if (list_empty(list)) 1129 if (list_empty(list))
1064 bd.last = true; 1130 bd.last = true;
1065 else { 1131 else {
1066 struct request *nxt;
1067
1068 nxt = list_first_entry(list, struct request, queuelist); 1132 nxt = list_first_entry(list, struct request, queuelist);
1069 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); 1133 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1070 } 1134 }
1071 1135
1072 ret = q->mq_ops->queue_rq(hctx, &bd); 1136 ret = q->mq_ops->queue_rq(hctx, &bd);
1073 if (ret == BLK_STS_RESOURCE) { 1137 if (ret == BLK_STS_RESOURCE) {
1074 blk_mq_put_driver_tag_hctx(hctx, rq); 1138 /*
1139 * If an I/O scheduler has been configured and we got a
1140 * driver tag for the next request already, free it
1141 * again.
1142 */
1143 if (!list_empty(list)) {
1144 nxt = list_first_entry(list, struct request, queuelist);
1145 blk_mq_put_driver_tag(nxt);
1146 }
1075 list_add(&rq->queuelist, list); 1147 list_add(&rq->queuelist, list);
1076 __blk_mq_requeue_request(rq); 1148 __blk_mq_requeue_request(rq);
1077 break; 1149 break;
@@ -1093,13 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
1093 * that is where we will continue on next queue run. 1165 * that is where we will continue on next queue run.
1094 */ 1166 */
1095 if (!list_empty(list)) { 1167 if (!list_empty(list)) {
1096 /*
1097 * If an I/O scheduler has been configured and we got a driver
1098 * tag for the next request already, free it again.
1099 */
1100 rq = list_first_entry(list, struct request, queuelist);
1101 blk_mq_put_driver_tag(rq);
1102
1103 spin_lock(&hctx->lock); 1168 spin_lock(&hctx->lock);
1104 list_splice_init(list, &hctx->dispatch); 1169 list_splice_init(list, &hctx->dispatch);
1105 spin_unlock(&hctx->lock); 1170 spin_unlock(&hctx->lock);
@@ -1109,10 +1174,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
1109 * it is no longer set that means that it was cleared by another 1174 * it is no longer set that means that it was cleared by another
1110 * thread and hence that a queue rerun is needed. 1175 * thread and hence that a queue rerun is needed.
1111 * 1176 *
1112 * If TAG_WAITING is set that means that an I/O scheduler has 1177 * If 'no_tag' is set, that means that we failed getting
1113 * been configured and another thread is waiting for a driver 1178 * a driver tag with an I/O scheduler attached. If our dispatch
1114 * tag. To guarantee fairness, do not rerun this hardware queue 1179 * waitqueue is no longer active, ensure that we run the queue
1115 * but let the other thread grab the driver tag. 1180 * AFTER adding our entries back to the list.
1116 * 1181 *
1117 * If no I/O scheduler has been configured it is possible that 1182 * If no I/O scheduler has been configured it is possible that
1118 * the hardware queue got stopped and restarted before requests 1183 * the hardware queue got stopped and restarted before requests
@@ -1124,8 +1189,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
1124 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1189 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1125 * and dm-rq. 1190 * and dm-rq.
1126 */ 1191 */
1127 if (!blk_mq_sched_needs_restart(hctx) && 1192 if (!blk_mq_sched_needs_restart(hctx) ||
1128 !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) 1193 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1129 blk_mq_run_hw_queue(hctx, true); 1194 blk_mq_run_hw_queue(hctx, true);
1130 } 1195 }
1131 1196
@@ -1218,9 +1283,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1218} 1283}
1219EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1284EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1220 1285
1221void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1286bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1222{ 1287{
1223 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1288 if (blk_mq_hctx_has_pending(hctx)) {
1289 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1290 return true;
1291 }
1292
1293 return false;
1224} 1294}
1225EXPORT_SYMBOL(blk_mq_run_hw_queue); 1295EXPORT_SYMBOL(blk_mq_run_hw_queue);
1226 1296
@@ -1230,8 +1300,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1230 int i; 1300 int i;
1231 1301
1232 queue_for_each_hw_ctx(q, hctx, i) { 1302 queue_for_each_hw_ctx(q, hctx, i) {
1233 if (!blk_mq_hctx_has_pending(hctx) || 1303 if (blk_mq_hctx_stopped(hctx))
1234 blk_mq_hctx_stopped(hctx))
1235 continue; 1304 continue;
1236 1305
1237 blk_mq_run_hw_queue(hctx, async); 1306 blk_mq_run_hw_queue(hctx, async);
@@ -1405,7 +1474,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1405 * Should only be used carefully, when the caller knows we want to 1474 * Should only be used carefully, when the caller knows we want to
1406 * bypass a potential IO scheduler on the target device. 1475 * bypass a potential IO scheduler on the target device.
1407 */ 1476 */
1408void blk_mq_request_bypass_insert(struct request *rq) 1477void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1409{ 1478{
1410 struct blk_mq_ctx *ctx = rq->mq_ctx; 1479 struct blk_mq_ctx *ctx = rq->mq_ctx;
1411 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); 1480 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
@@ -1414,7 +1483,8 @@ void blk_mq_request_bypass_insert(struct request *rq)
1414 list_add_tail(&rq->queuelist, &hctx->dispatch); 1483 list_add_tail(&rq->queuelist, &hctx->dispatch);
1415 spin_unlock(&hctx->lock); 1484 spin_unlock(&hctx->lock);
1416 1485
1417 blk_mq_run_hw_queue(hctx, false); 1486 if (run_queue)
1487 blk_mq_run_hw_queue(hctx, false);
1418} 1488}
1419 1489
1420void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1490void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
@@ -1501,13 +1571,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1501{ 1571{
1502 blk_init_request_from_bio(rq, bio); 1572 blk_init_request_from_bio(rq, bio);
1503 1573
1504 blk_account_io_start(rq, true); 1574 blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1505}
1506 1575
1507static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1576 blk_account_io_start(rq, true);
1508{
1509 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1510 !blk_queue_nomerges(hctx->queue);
1511} 1577}
1512 1578
1513static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, 1579static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
@@ -1552,6 +1618,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1552 if (!blk_mq_get_driver_tag(rq, NULL, false)) 1618 if (!blk_mq_get_driver_tag(rq, NULL, false))
1553 goto insert; 1619 goto insert;
1554 1620
1621 if (!blk_mq_get_dispatch_budget(hctx)) {
1622 blk_mq_put_driver_tag(rq);
1623 goto insert;
1624 }
1625
1555 new_cookie = request_to_qc_t(hctx, rq); 1626 new_cookie = request_to_qc_t(hctx, rq);
1556 1627
1557 /* 1628 /*
@@ -1641,13 +1712,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1641 if (unlikely(is_flush_fua)) { 1712 if (unlikely(is_flush_fua)) {
1642 blk_mq_put_ctx(data.ctx); 1713 blk_mq_put_ctx(data.ctx);
1643 blk_mq_bio_to_request(rq, bio); 1714 blk_mq_bio_to_request(rq, bio);
1644 if (q->elevator) { 1715
1645 blk_mq_sched_insert_request(rq, false, true, true, 1716 /* bypass scheduler for flush rq */
1646 true); 1717 blk_insert_flush(rq);
1647 } else { 1718 blk_mq_run_hw_queue(data.hctx, true);
1648 blk_insert_flush(rq);
1649 blk_mq_run_hw_queue(data.hctx, true);
1650 }
1651 } else if (plug && q->nr_hw_queues == 1) { 1719 } else if (plug && q->nr_hw_queues == 1) {
1652 struct request *last = NULL; 1720 struct request *last = NULL;
1653 1721
@@ -1990,6 +2058,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
1990 2058
1991 hctx->nr_ctx = 0; 2059 hctx->nr_ctx = 0;
1992 2060
2061 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2062 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2063
1993 if (set->ops->init_hctx && 2064 if (set->ops->init_hctx &&
1994 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2065 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1995 goto free_bitmap; 2066 goto free_bitmap;
@@ -2229,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2229 2300
2230 mutex_lock(&set->tag_list_lock); 2301 mutex_lock(&set->tag_list_lock);
2231 2302
2232 /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ 2303 /*
2233 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { 2304 * Check to see if we're transitioning to shared (from 1 to 2 queues).
2305 */
2306 if (!list_empty(&set->tag_list) &&
2307 !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2234 set->flags |= BLK_MQ_F_TAG_SHARED; 2308 set->flags |= BLK_MQ_F_TAG_SHARED;
2235 /* update existing queue */ 2309 /* update existing queue */
2236 blk_mq_update_tag_set_depth(set, true); 2310 blk_mq_update_tag_set_depth(set, true);
@@ -2404,6 +2478,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2404 spin_lock_init(&q->requeue_lock); 2478 spin_lock_init(&q->requeue_lock);
2405 2479
2406 blk_queue_make_request(q, blk_mq_make_request); 2480 blk_queue_make_request(q, blk_mq_make_request);
2481 if (q->mq_ops->poll)
2482 q->poll_fn = blk_mq_poll;
2407 2483
2408 /* 2484 /*
2409 * Do this after blk_queue_make_request() overrides it... 2485 * Do this after blk_queue_make_request() overrides it...
@@ -2460,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q)
2460 2536
2461 /* 2537 /*
2462 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 2538 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2463 * we should change hctx numa_node according to new topology (this 2539 * we should change hctx numa_node according to the new topology (this
2464 * involves free and re-allocate memory, worthy doing?) 2540 * involves freeing and re-allocating memory, worth doing?)
2465 */ 2541 */
2466
2467 blk_mq_map_swqueue(q); 2542 blk_mq_map_swqueue(q);
2468 2543
2469 blk_mq_sysfs_register(q); 2544 blk_mq_sysfs_register(q);
@@ -2552,6 +2627,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2552 if (!set->ops->queue_rq) 2627 if (!set->ops->queue_rq)
2553 return -EINVAL; 2628 return -EINVAL;
2554 2629
2630 if (!set->ops->get_budget ^ !set->ops->put_budget)
2631 return -EINVAL;
2632
2555 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2633 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2556 pr_info("blk-mq: reduced tag depth to %u\n", 2634 pr_info("blk-mq: reduced tag depth to %u\n",
2557 BLK_MQ_MAX_DEPTH); 2635 BLK_MQ_MAX_DEPTH);
@@ -2642,8 +2720,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2642 * queue depth. This is similar to what the old code would do. 2720 * queue depth. This is similar to what the old code would do.
2643 */ 2721 */
2644 if (!hctx->sched_tags) { 2722 if (!hctx->sched_tags) {
2645 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, 2723 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
2646 min(nr, set->queue_depth),
2647 false); 2724 false);
2648 } else { 2725 } else {
2649 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 2726 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
@@ -2863,20 +2940,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
2863 return false; 2940 return false;
2864} 2941}
2865 2942
2866bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) 2943static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2867{ 2944{
2868 struct blk_mq_hw_ctx *hctx; 2945 struct blk_mq_hw_ctx *hctx;
2869 struct blk_plug *plug;
2870 struct request *rq; 2946 struct request *rq;
2871 2947
2872 if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || 2948 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
2873 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
2874 return false; 2949 return false;
2875 2950
2876 plug = current->plug;
2877 if (plug)
2878 blk_flush_plug_list(plug, false);
2879
2880 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 2951 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
2881 if (!blk_qc_t_is_internal(cookie)) 2952 if (!blk_qc_t_is_internal(cookie))
2882 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 2953 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
@@ -2894,10 +2965,15 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2894 2965
2895 return __blk_mq_poll(hctx, rq); 2966 return __blk_mq_poll(hctx, rq);
2896} 2967}
2897EXPORT_SYMBOL_GPL(blk_mq_poll);
2898 2968
2899static int __init blk_mq_init(void) 2969static int __init blk_mq_init(void)
2900{ 2970{
2971 /*
2972 * See comment in block/blk.h rq_atomic_flags enum
2973 */
2974 BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
2975 (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
2976
2901 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 2977 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2902 blk_mq_hctx_notify_dead); 2978 blk_mq_hctx_notify_dead);
2903 return 0; 2979 return 0;
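
The BUILD_BUG_ON() added to blk_mq_init() above backs up the coherence-order comments in blk_mq_start_request() and blk_mq_check_expired(): the STARTED and COMPLETE atomic flags must sit in the same byte so that consecutive stores to them propagate in program order. A user-space approximation of that compile-time check, using C11 _Static_assert with placeholder bit numbers (the TOY_ATOM_* values are invented, not the real REQ_ATOM_* definitions):

#include <stdio.h>

#define BITS_PER_BYTE		8
/* Placeholder bit numbers standing in for REQ_ATOM_STARTED/REQ_ATOM_COMPLETE. */
#define TOY_ATOM_STARTED	1
#define TOY_ATOM_COMPLETE	2

/* Refuse to build if the two flags ever end up in different bytes. */
_Static_assert((TOY_ATOM_STARTED / BITS_PER_BYTE) ==
	       (TOY_ATOM_COMPLETE / BITS_PER_BYTE),
	       "STARTED and COMPLETE must share a byte");

int main(void)
{
	printf("both flags live in byte %d of the flags word\n",
	       TOY_ATOM_STARTED / BITS_PER_BYTE);
	return 0;
}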
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 4933af9d61f7..6c7c3ff5bf62 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -3,6 +3,7 @@
3#define INT_BLK_MQ_H 3#define INT_BLK_MQ_H
4 4
5#include "blk-stat.h" 5#include "blk-stat.h"
6#include "blk-mq-tag.h"
6 7
7struct blk_mq_tag_set; 8struct blk_mq_tag_set;
8 9
@@ -26,16 +27,16 @@ struct blk_mq_ctx {
26 struct kobject kobj; 27 struct kobject kobj;
27} ____cacheline_aligned_in_smp; 28} ____cacheline_aligned_in_smp;
28 29
29void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
30void blk_mq_freeze_queue(struct request_queue *q); 30void blk_mq_freeze_queue(struct request_queue *q);
31void blk_mq_free_queue(struct request_queue *q); 31void blk_mq_free_queue(struct request_queue *q);
32int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 32int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
33void blk_mq_wake_waiters(struct request_queue *q); 33void blk_mq_wake_waiters(struct request_queue *q);
34bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); 34bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
35void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); 35void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
36bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
37bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 36bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
38 bool wait); 37 bool wait);
38struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
39 struct blk_mq_ctx *start);
39 40
40/* 41/*
41 * Internal helpers for allocating/freeing the request map 42 * Internal helpers for allocating/freeing the request map
@@ -55,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
55 */ 56 */
56void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 57void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
57 bool at_head); 58 bool at_head);
58void blk_mq_request_bypass_insert(struct request *rq); 59void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
59void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 60void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
60 struct list_head *list); 61 struct list_head *list);
61 62
@@ -109,7 +110,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
109struct blk_mq_alloc_data { 110struct blk_mq_alloc_data {
110 /* input parameter */ 111 /* input parameter */
111 struct request_queue *q; 112 struct request_queue *q;
112 unsigned int flags; 113 blk_mq_req_flags_t flags;
113 unsigned int shallow_depth; 114 unsigned int shallow_depth;
114 115
115 /* input & output parameter */ 116 /* input & output parameter */
@@ -138,4 +139,53 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
138void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 139void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
139 unsigned int inflight[2]); 140 unsigned int inflight[2]);
140 141
142static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
143{
144 struct request_queue *q = hctx->queue;
145
146 if (q->mq_ops->put_budget)
147 q->mq_ops->put_budget(hctx);
148}
149
150static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
151{
152 struct request_queue *q = hctx->queue;
153
154 if (q->mq_ops->get_budget)
155 return q->mq_ops->get_budget(hctx);
156 return true;
157}
158
159static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
160 struct request *rq)
161{
162 blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
163 rq->tag = -1;
164
165 if (rq->rq_flags & RQF_MQ_INFLIGHT) {
166 rq->rq_flags &= ~RQF_MQ_INFLIGHT;
167 atomic_dec(&hctx->nr_active);
168 }
169}
170
171static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
172 struct request *rq)
173{
174 if (rq->tag == -1 || rq->internal_tag == -1)
175 return;
176
177 __blk_mq_put_driver_tag(hctx, rq);
178}
179
180static inline void blk_mq_put_driver_tag(struct request *rq)
181{
182 struct blk_mq_hw_ctx *hctx;
183
184 if (rq->tag == -1 || rq->internal_tag == -1)
185 return;
186
187 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
188 __blk_mq_put_driver_tag(hctx, rq);
189}
190
141#endif 191#endif
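
The blk_mq_get_dispatch_budget()/blk_mq_put_dispatch_budget() helpers added above treat the driver callbacks as optional: when a driver supplies no ->get_budget, dispatch always proceeds. A small stand-alone sketch of that pattern; the demo_* names are invented for illustration (the real callbacks take a struct blk_mq_hw_ctx):

#include <stdbool.h>
#include <stdio.h>

struct demo_ops {
	bool (*get_budget)(void *hctx);
	void (*put_budget)(void *hctx);
};

struct demo_queue {
	const struct demo_ops *ops;
};

/* Mirror of the helper shape above: missing hooks mean "always granted". */
static bool demo_get_dispatch_budget(struct demo_queue *q, void *hctx)
{
	if (q->ops->get_budget)
		return q->ops->get_budget(hctx);
	return true;
}

static void demo_put_dispatch_budget(struct demo_queue *q, void *hctx)
{
	if (q->ops->put_budget)
		q->ops->put_budget(hctx);
}

static bool scsi_like_get_budget(void *hctx)
{
	(void)hctx;
	/* e.g. reserve a per-device queue slot; fail when saturated */
	return true;
}

static void scsi_like_put_budget(void *hctx)
{
	(void)hctx;
	/* release the reserved slot */
}

int main(void)
{
	const struct demo_ops ops = { scsi_like_get_budget, scsi_like_put_budget };
	struct demo_queue q = { &ops };
	void *hctx = NULL;

	if (demo_get_dispatch_budget(&q, hctx)) {
		printf("budget granted, dispatching request\n");
		/* if dispatch could not hand the request to the driver,
		 * the budget would be returned here */
		demo_put_dispatch_budget(&q, hctx);
	}
	return 0;
}

The put helper exists alongside the get because dispatch paths that fail to pass a request to the driver are expected to hand the budget back.
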
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8559e9563c52..48ebe6be07b7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
157 * Caveat: 157 * Caveat:
158 * The driver that does this *must* be able to deal appropriately 158 * The driver that does this *must* be able to deal appropriately
159 * with buffers in "highmemory". This can be accomplished by either calling 159 * with buffers in "highmemory". This can be accomplished by either calling
160 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 160 * kmap_atomic() to get a temporary kernel mapping, or by calling
161 * blk_queue_bounce() to create a buffer in normal memory. 161 * blk_queue_bounce() to create a buffer in normal memory.
162 **/ 162 **/
163void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) 163void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c52356d90fe3..3a2f3c96f367 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -11,8 +11,6 @@
11#include "blk-mq.h" 11#include "blk-mq.h"
12#include "blk.h" 12#include "blk.h"
13 13
14#define BLK_RQ_STAT_BATCH 64
15
16struct blk_queue_stats { 14struct blk_queue_stats {
17 struct list_head callbacks; 15 struct list_head callbacks;
18 spinlock_t lock; 16 spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
23{ 21{
24 stat->min = -1ULL; 22 stat->min = -1ULL;
25 stat->max = stat->nr_samples = stat->mean = 0; 23 stat->max = stat->nr_samples = stat->mean = 0;
26 stat->batch = stat->nr_batch = 0; 24 stat->batch = 0;
27}
28
29static void blk_stat_flush_batch(struct blk_rq_stat *stat)
30{
31 const s32 nr_batch = READ_ONCE(stat->nr_batch);
32 const s32 nr_samples = READ_ONCE(stat->nr_samples);
33
34 if (!nr_batch)
35 return;
36 if (!nr_samples)
37 stat->mean = div64_s64(stat->batch, nr_batch);
38 else {
39 stat->mean = div64_s64((stat->mean * nr_samples) +
40 stat->batch,
41 nr_batch + nr_samples);
42 }
43
44 stat->nr_samples += nr_batch;
45 stat->nr_batch = stat->batch = 0;
46} 25}
47 26
27/* src is a per-cpu stat, mean isn't initialized */
48static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) 28static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
49{ 29{
50 blk_stat_flush_batch(src);
51
52 if (!src->nr_samples) 30 if (!src->nr_samples)
53 return; 31 return;
54 32
55 dst->min = min(dst->min, src->min); 33 dst->min = min(dst->min, src->min);
56 dst->max = max(dst->max, src->max); 34 dst->max = max(dst->max, src->max);
57 35
58 if (!dst->nr_samples) 36 dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
59 dst->mean = src->mean; 37 dst->nr_samples + src->nr_samples);
60 else { 38
61 dst->mean = div64_s64((src->mean * src->nr_samples) +
62 (dst->mean * dst->nr_samples),
63 dst->nr_samples + src->nr_samples);
64 }
65 dst->nr_samples += src->nr_samples; 39 dst->nr_samples += src->nr_samples;
66} 40}
67 41
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
69{ 43{
70 stat->min = min(stat->min, value); 44 stat->min = min(stat->min, value);
71 stat->max = max(stat->max, value); 45 stat->max = max(stat->max, value);
72
73 if (stat->batch + value < stat->batch ||
74 stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
75 blk_stat_flush_batch(stat);
76
77 stat->batch += value; 46 stat->batch += value;
78 stat->nr_batch++; 47 stat->nr_samples++;
79} 48}
80 49
81void blk_stat_add(struct request *rq) 50void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
84 struct blk_stat_callback *cb; 53 struct blk_stat_callback *cb;
85 struct blk_rq_stat *stat; 54 struct blk_rq_stat *stat;
86 int bucket; 55 int bucket;
87 s64 now, value; 56 u64 now, value;
88 57
89 now = __blk_stat_time(ktime_to_ns(ktime_get())); 58 now = __blk_stat_time(ktime_to_ns(ktime_get()));
90 if (now < blk_stat_time(&rq->issue_stat)) 59 if (now < blk_stat_time(&rq->issue_stat))
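
With the BLK_RQ_STAT_BATCH batching removed above, __blk_stat_add() only accumulates a running sum in ->batch, and blk_stat_sum() folds each per-cpu sum into the aggregate with a single weighted-mean division. A user-space sketch of that arithmetic, using a simplified demo_stat in place of struct blk_rq_stat:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct blk_rq_stat: the per-CPU side keeps only a
 * running sum ("batch") and a sample count; the aggregate side keeps a mean. */
struct demo_stat {
	uint64_t batch;       /* sum of samples (per-CPU side) */
	uint64_t mean;        /* weighted mean (aggregate side) */
	uint32_t nr_samples;
};

/* Fold a per-CPU sum into the aggregate, mirroring the weighted-mean
 * arithmetic in the patched blk_stat_sum(). */
static void demo_stat_sum(struct demo_stat *dst, const struct demo_stat *src)
{
	if (!src->nr_samples)
		return;
	dst->mean = (src->batch + dst->mean * (uint64_t)dst->nr_samples) /
		    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

int main(void)
{
	struct demo_stat total = { 0 };
	struct demo_stat cpu0 = { .batch = 300, .nr_samples = 3 }; /* 100,100,100 */
	struct demo_stat cpu1 = { .batch = 50,  .nr_samples = 1 }; /* 50 */

	demo_stat_sum(&total, &cpu0);
	demo_stat_sum(&total, &cpu1);
	printf("mean=%llu over %u samples\n",
	       (unsigned long long)total.mean, (unsigned)total.nr_samples);
	return 0;
}

For this input the integer mean comes out to 87 over four samples, the same result the old batch-flush path produced with considerably more bookkeeping.
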
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8631763866c6..96ad32623427 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2113,8 +2113,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
2113static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) 2113static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2114{ 2114{
2115#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2115#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2116 if (bio->bi_css) 2116 if (bio->bi_css) {
2117 if (bio->bi_cg_private)
2118 blkg_put(tg_to_blkg(bio->bi_cg_private));
2117 bio->bi_cg_private = tg; 2119 bio->bi_cg_private = tg;
2120 blkg_get(tg_to_blkg(tg));
2121 }
2118 blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); 2122 blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
2119#endif 2123#endif
2120} 2124}
@@ -2284,8 +2288,10 @@ void blk_throtl_bio_endio(struct bio *bio)
2284 2288
2285 start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; 2289 start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
2286 finish_time = __blk_stat_time(finish_time_ns) >> 10; 2290 finish_time = __blk_stat_time(finish_time_ns) >> 10;
2287 if (!start_time || finish_time <= start_time) 2291 if (!start_time || finish_time <= start_time) {
2292 blkg_put(tg_to_blkg(tg));
2288 return; 2293 return;
2294 }
2289 2295
2290 lat = finish_time - start_time; 2296 lat = finish_time - start_time;
2291 /* this is only for bio based driver */ 2297 /* this is only for bio based driver */
@@ -2315,6 +2321,8 @@ void blk_throtl_bio_endio(struct bio *bio)
2315 tg->bio_cnt /= 2; 2321 tg->bio_cnt /= 2;
2316 tg->bad_bio_cnt /= 2; 2322 tg->bad_bio_cnt /= 2;
2317 } 2323 }
2324
2325 blkg_put(tg_to_blkg(tg));
2318} 2326}
2319#endif 2327#endif
2320 2328
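
The blk-throttle hunks above take a blkg reference when a bio is associated with a throttle group and drop it on every exit path of blk_throtl_bio_endio(), the early return included. A toy refcount sketch of the balance being established; demo_group and its helpers are invented for illustration:

#include <stdio.h>

/* Toy reference counter standing in for blkg_get()/blkg_put(); the point is
 * that every return from the end-io path must drop the reference taken when
 * the bio was associated with the group. */
struct demo_group { int refs; };

static void grp_get(struct demo_group *g) { g->refs++; }
static void grp_put(struct demo_group *g) { g->refs--; }

static void demo_assoc_bio(struct demo_group *g)
{
	grp_get(g);                  /* taken at association time */
}

static void demo_bio_endio(struct demo_group *g, long start, long finish)
{
	if (!start || finish <= start) {
		grp_put(g);          /* early return still balances the get */
		return;
	}
	/* ... latency accounting would happen here ... */
	grp_put(g);
}

int main(void)
{
	struct demo_group g = { .refs = 1 };

	demo_assoc_bio(&g);
	demo_bio_endio(&g, 0, 10);   /* exercises the early-return path */
	printf("refs=%d (expect 1)\n", g.refs);
	return 0;
}
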
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 17ec83bb0900..764ecf9aeb30 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work)
134 struct request *rq, *tmp; 134 struct request *rq, *tmp;
135 int next_set = 0; 135 int next_set = 0;
136 136
137 if (blk_queue_enter(q, true))
138 return;
139 spin_lock_irqsave(q->queue_lock, flags); 137 spin_lock_irqsave(q->queue_lock, flags);
140 138
141 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) 139 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work)
145 mod_timer(&q->timeout, round_jiffies_up(next)); 143 mod_timer(&q->timeout, round_jiffies_up(next));
146 144
147 spin_unlock_irqrestore(q->queue_lock, flags); 145 spin_unlock_irqrestore(q->queue_lock, flags);
148 blk_queue_exit(q);
149} 146}
150 147
151/** 148/**
@@ -211,7 +208,7 @@ void blk_add_timer(struct request *req)
211 if (!req->timeout) 208 if (!req->timeout)
212 req->timeout = q->rq_timeout; 209 req->timeout = q->rq_timeout;
213 210
214 req->deadline = jiffies + req->timeout; 211 WRITE_ONCE(req->deadline, jiffies + req->timeout);
215 212
216 /* 213 /*
217 * Only the non-mq case needs to add the request to a protected list. 214 * Only the non-mq case needs to add the request to a protected list.
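
blk_add_timer() above now publishes req->deadline with WRITE_ONCE(), so a lockless reader can pair it with READ_ONCE() without risking a torn or compiler-reordered access. A hedged sketch of the idiom using plain volatile accesses; the real kernel macros do more than this (for example, handling types wider than a word), so the DEMO_* versions are only an approximation:

#include <stdio.h>

/* Approximation of the kernel's one-shot store/load idiom. */
#define DEMO_WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define DEMO_READ_ONCE(x)	(*(volatile __typeof__(x) *)&(x))

static unsigned long deadline;

int main(void)
{
	/* writer side, as in blk_add_timer() */
	DEMO_WRITE_ONCE(deadline, 12345UL);

	/* reader side, as a lockless timeout check would do */
	unsigned long d = DEMO_READ_ONCE(deadline);

	printf("deadline=%lu\n", d);
	return 0;
}
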
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index d822530e6aea..b252da0e4c11 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
654} 654}
655 655
656/* 656/*
657 * Disable wbt, if enabled by default. Only called from CFQ. 657 * Disable wbt, if enabled by default.
658 */ 658 */
659void wbt_disable_default(struct request_queue *q) 659void wbt_disable_default(struct request_queue *q)
660{ 660{
diff --git a/block/blk.h b/block/blk.h
index 85be8b232b37..3f1446937aec 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -123,8 +123,15 @@ void blk_account_io_done(struct request *req);
123 * Internal atomic flags for request handling 123 * Internal atomic flags for request handling
124 */ 124 */
125enum rq_atomic_flags { 125enum rq_atomic_flags {
126 /*
127 * Keep these two bits first - not because we depend on the
128 * value of them, but we do depend on them being in the same
129 * byte of storage to ensure ordering on writes. Keeping them
130 * first will achieve that nicely.
131 */
126 REQ_ATOM_COMPLETE = 0, 132 REQ_ATOM_COMPLETE = 0,
127 REQ_ATOM_STARTED, 133 REQ_ATOM_STARTED,
134
128 REQ_ATOM_POLL_SLEPT, 135 REQ_ATOM_POLL_SLEPT,
129}; 136};
130 137
@@ -149,45 +156,6 @@ static inline void blk_clear_rq_complete(struct request *rq)
149 156
150void blk_insert_flush(struct request *rq); 157void blk_insert_flush(struct request *rq);
151 158
152static inline struct request *__elv_next_request(struct request_queue *q)
153{
154 struct request *rq;
155 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
156
157 WARN_ON_ONCE(q->mq_ops);
158
159 while (1) {
160 if (!list_empty(&q->queue_head)) {
161 rq = list_entry_rq(q->queue_head.next);
162 return rq;
163 }
164
165 /*
166 * Flush request is running and flush request isn't queueable
167 * in the drive, we can hold the queue till flush request is
168 * finished. Even we don't do this, driver can't dispatch next
169 * requests and will requeue them. And this can improve
170 * throughput too. For example, we have request flush1, write1,
171 * flush 2. flush1 is dispatched, then queue is hold, write1
172 * isn't inserted to queue. After flush1 is finished, flush2
173 * will be dispatched. Since disk cache is already clean,
174 * flush2 will be finished very soon, so looks like flush2 is
175 * folded to flush1.
176 * Since the queue is hold, a flag is set to indicate the queue
177 * should be restarted later. Please see flush_end_io() for
178 * details.
179 */
180 if (fq->flush_pending_idx != fq->flush_running_idx &&
181 !queue_flush_queueable(q)) {
182 fq->flush_queue_delayed = 1;
183 return NULL;
184 }
185 if (unlikely(blk_queue_bypass(q)) ||
186 !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
187 return NULL;
188 }
189}
190
191static inline void elv_activate_rq(struct request_queue *q, struct request *rq) 159static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
192{ 160{
193 struct elevator_queue *e = q->elevator; 161 struct elevator_queue *e = q->elevator;
diff --git a/block/bsg.c b/block/bsg.c
index ee1335c68de7..452f94f1c5d4 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
137 137
138static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, 138static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
139 struct sg_io_v4 *hdr, struct bsg_device *bd, 139 struct sg_io_v4 *hdr, struct bsg_device *bd,
140 fmode_t has_write_perm) 140 fmode_t mode)
141{ 141{
142 struct scsi_request *req = scsi_req(rq); 142 struct scsi_request *req = scsi_req(rq);
143 143
@@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
152 return -EFAULT; 152 return -EFAULT;
153 153
154 if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { 154 if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
155 if (blk_verify_command(req->cmd, has_write_perm)) 155 if (blk_verify_command(req->cmd, mode))
156 return -EPERM; 156 return -EPERM;
157 } else if (!capable(CAP_SYS_RAWIO)) 157 } else if (!capable(CAP_SYS_RAWIO))
158 return -EPERM; 158 return -EPERM;
@@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
206 * map sg_io_v4 to a request. 206 * map sg_io_v4 to a request.
207 */ 207 */
208static struct request * 208static struct request *
209bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) 209bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
210{ 210{
211 struct request_queue *q = bd->queue; 211 struct request_queue *q = bd->queue;
212 struct request *rq, *next_rq = NULL; 212 struct request *rq, *next_rq = NULL;
@@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
237 if (IS_ERR(rq)) 237 if (IS_ERR(rq))
238 return rq; 238 return rq;
239 239
240 ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); 240 ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode);
241 if (ret) 241 if (ret)
242 goto out; 242 goto out;
243 243
@@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
587} 587}
588 588
589static int __bsg_write(struct bsg_device *bd, const char __user *buf, 589static int __bsg_write(struct bsg_device *bd, const char __user *buf,
590 size_t count, ssize_t *bytes_written, 590 size_t count, ssize_t *bytes_written, fmode_t mode)
591 fmode_t has_write_perm)
592{ 591{
593 struct bsg_command *bc; 592 struct bsg_command *bc;
594 struct request *rq; 593 struct request *rq;
@@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf,
619 /* 618 /*
620 * get a request, fill in the blanks, and add to request queue 619 * get a request, fill in the blanks, and add to request queue
621 */ 620 */
622 rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); 621 rq = bsg_map_hdr(bd, &bc->hdr, mode);
623 if (IS_ERR(rq)) { 622 if (IS_ERR(rq)) {
624 ret = PTR_ERR(rq); 623 ret = PTR_ERR(rq);
625 rq = NULL; 624 rq = NULL;
@@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
655 bsg_set_block(bd, file); 654 bsg_set_block(bd, file);
656 655
657 bytes_written = 0; 656 bytes_written = 0;
658 ret = __bsg_write(bd, buf, count, &bytes_written, 657 ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
659 file->f_mode & FMODE_WRITE);
660 658
661 *ppos = bytes_written; 659 *ppos = bytes_written;
662 660
@@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
915 if (copy_from_user(&hdr, uarg, sizeof(hdr))) 913 if (copy_from_user(&hdr, uarg, sizeof(hdr)))
916 return -EFAULT; 914 return -EFAULT;
917 915
918 rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); 916 rq = bsg_map_hdr(bd, &hdr, file->f_mode);
919 if (IS_ERR(rq)) 917 if (IS_ERR(rq))
920 return PTR_ERR(rq); 918 return PTR_ERR(rq);
921 919
diff --git a/block/elevator.c b/block/elevator.c
index 153926a90901..7bda083d5968 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,12 +83,25 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
83} 83}
84EXPORT_SYMBOL(elv_bio_merge_ok); 84EXPORT_SYMBOL(elv_bio_merge_ok);
85 85
86static struct elevator_type *elevator_find(const char *name) 86static bool elevator_match(const struct elevator_type *e, const char *name)
87{
88 if (!strcmp(e->elevator_name, name))
89 return true;
90 if (e->elevator_alias && !strcmp(e->elevator_alias, name))
91 return true;
92
93 return false;
94}
95
96/*
 97 * Return scheduler with name 'name' and with matching 'mq' capability
98 */
99static struct elevator_type *elevator_find(const char *name, bool mq)
87{ 100{
88 struct elevator_type *e; 101 struct elevator_type *e;
89 102
90 list_for_each_entry(e, &elv_list, list) { 103 list_for_each_entry(e, &elv_list, list) {
91 if (!strcmp(e->elevator_name, name)) 104 if (elevator_match(e, name) && (mq == e->uses_mq))
92 return e; 105 return e;
93 } 106 }
94 107
@@ -100,25 +113,25 @@ static void elevator_put(struct elevator_type *e)
100 module_put(e->elevator_owner); 113 module_put(e->elevator_owner);
101} 114}
102 115
103static struct elevator_type *elevator_get(const char *name, bool try_loading) 116static struct elevator_type *elevator_get(struct request_queue *q,
117 const char *name, bool try_loading)
104{ 118{
105 struct elevator_type *e; 119 struct elevator_type *e;
106 120
107 spin_lock(&elv_list_lock); 121 spin_lock(&elv_list_lock);
108 122
109 e = elevator_find(name); 123 e = elevator_find(name, q->mq_ops != NULL);
110 if (!e && try_loading) { 124 if (!e && try_loading) {
111 spin_unlock(&elv_list_lock); 125 spin_unlock(&elv_list_lock);
112 request_module("%s-iosched", name); 126 request_module("%s-iosched", name);
113 spin_lock(&elv_list_lock); 127 spin_lock(&elv_list_lock);
114 e = elevator_find(name); 128 e = elevator_find(name, q->mq_ops != NULL);
115 } 129 }
116 130
117 if (e && !try_module_get(e->elevator_owner)) 131 if (e && !try_module_get(e->elevator_owner))
118 e = NULL; 132 e = NULL;
119 133
120 spin_unlock(&elv_list_lock); 134 spin_unlock(&elv_list_lock);
121
122 return e; 135 return e;
123} 136}
124 137
@@ -144,8 +157,12 @@ void __init load_default_elevator_module(void)
144 if (!chosen_elevator[0]) 157 if (!chosen_elevator[0])
145 return; 158 return;
146 159
160 /*
161 * Boot parameter is deprecated, we haven't supported that for MQ.
162 * Only look for non-mq schedulers from here.
163 */
147 spin_lock(&elv_list_lock); 164 spin_lock(&elv_list_lock);
148 e = elevator_find(chosen_elevator); 165 e = elevator_find(chosen_elevator, false);
149 spin_unlock(&elv_list_lock); 166 spin_unlock(&elv_list_lock);
150 167
151 if (!e) 168 if (!e)
@@ -202,7 +219,7 @@ int elevator_init(struct request_queue *q, char *name)
202 q->boundary_rq = NULL; 219 q->boundary_rq = NULL;
203 220
204 if (name) { 221 if (name) {
205 e = elevator_get(name, true); 222 e = elevator_get(q, name, true);
206 if (!e) 223 if (!e)
207 return -EINVAL; 224 return -EINVAL;
208 } 225 }
@@ -214,7 +231,7 @@ int elevator_init(struct request_queue *q, char *name)
214 * allowed from async. 231 * allowed from async.
215 */ 232 */
216 if (!e && !q->mq_ops && *chosen_elevator) { 233 if (!e && !q->mq_ops && *chosen_elevator) {
217 e = elevator_get(chosen_elevator, false); 234 e = elevator_get(q, chosen_elevator, false);
218 if (!e) 235 if (!e)
219 printk(KERN_ERR "I/O scheduler %s not found\n", 236 printk(KERN_ERR "I/O scheduler %s not found\n",
220 chosen_elevator); 237 chosen_elevator);
@@ -229,17 +246,17 @@ int elevator_init(struct request_queue *q, char *name)
229 */ 246 */
230 if (q->mq_ops) { 247 if (q->mq_ops) {
231 if (q->nr_hw_queues == 1) 248 if (q->nr_hw_queues == 1)
232 e = elevator_get("mq-deadline", false); 249 e = elevator_get(q, "mq-deadline", false);
233 if (!e) 250 if (!e)
234 return 0; 251 return 0;
235 } else 252 } else
236 e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); 253 e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
237 254
238 if (!e) { 255 if (!e) {
239 printk(KERN_ERR 256 printk(KERN_ERR
240 "Default I/O scheduler not found. " \ 257 "Default I/O scheduler not found. " \
241 "Using noop.\n"); 258 "Using noop.\n");
242 e = elevator_get("noop", false); 259 e = elevator_get(q, "noop", false);
243 } 260 }
244 } 261 }
245 262
@@ -905,7 +922,7 @@ int elv_register(struct elevator_type *e)
905 922
906 /* register, don't allow duplicate names */ 923 /* register, don't allow duplicate names */
907 spin_lock(&elv_list_lock); 924 spin_lock(&elv_list_lock);
908 if (elevator_find(e->elevator_name)) { 925 if (elevator_find(e->elevator_name, e->uses_mq)) {
909 spin_unlock(&elv_list_lock); 926 spin_unlock(&elv_list_lock);
910 if (e->icq_cache) 927 if (e->icq_cache)
911 kmem_cache_destroy(e->icq_cache); 928 kmem_cache_destroy(e->icq_cache);
@@ -915,9 +932,9 @@ int elv_register(struct elevator_type *e)
915 spin_unlock(&elv_list_lock); 932 spin_unlock(&elv_list_lock);
916 933
917 /* print pretty message */ 934 /* print pretty message */
918 if (!strcmp(e->elevator_name, chosen_elevator) || 935 if (elevator_match(e, chosen_elevator) ||
919 (!*chosen_elevator && 936 (!*chosen_elevator &&
920 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 937 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
921 def = " (default)"; 938 def = " (default)";
922 939
923 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 940 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
@@ -1066,25 +1083,15 @@ static int __elevator_change(struct request_queue *q, const char *name)
1066 return elevator_switch(q, NULL); 1083 return elevator_switch(q, NULL);
1067 1084
1068 strlcpy(elevator_name, name, sizeof(elevator_name)); 1085 strlcpy(elevator_name, name, sizeof(elevator_name));
1069 e = elevator_get(strstrip(elevator_name), true); 1086 e = elevator_get(q, strstrip(elevator_name), true);
1070 if (!e) 1087 if (!e)
1071 return -EINVAL; 1088 return -EINVAL;
1072 1089
1073 if (q->elevator && 1090 if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
1074 !strcmp(elevator_name, q->elevator->type->elevator_name)) {
1075 elevator_put(e); 1091 elevator_put(e);
1076 return 0; 1092 return 0;
1077 } 1093 }
1078 1094
1079 if (!e->uses_mq && q->mq_ops) {
1080 elevator_put(e);
1081 return -EINVAL;
1082 }
1083 if (e->uses_mq && !q->mq_ops) {
1084 elevator_put(e);
1085 return -EINVAL;
1086 }
1087
1088 return elevator_switch(q, e); 1095 return elevator_switch(q, e);
1089} 1096}
1090 1097
@@ -1116,9 +1123,10 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1116 struct elevator_queue *e = q->elevator; 1123 struct elevator_queue *e = q->elevator;
1117 struct elevator_type *elv = NULL; 1124 struct elevator_type *elv = NULL;
1118 struct elevator_type *__e; 1125 struct elevator_type *__e;
1126 bool uses_mq = q->mq_ops != NULL;
1119 int len = 0; 1127 int len = 0;
1120 1128
1121 if (!blk_queue_stackable(q)) 1129 if (!queue_is_rq_based(q))
1122 return sprintf(name, "none\n"); 1130 return sprintf(name, "none\n");
1123 1131
1124 if (!q->elevator) 1132 if (!q->elevator)
@@ -1128,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1128 1136
1129 spin_lock(&elv_list_lock); 1137 spin_lock(&elv_list_lock);
1130 list_for_each_entry(__e, &elv_list, list) { 1138 list_for_each_entry(__e, &elv_list, list) {
1131 if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { 1139 if (elv && elevator_match(elv, __e->elevator_name) &&
1140 (__e->uses_mq == uses_mq)) {
1132 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1141 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1133 continue; 1142 continue;
1134 } 1143 }
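
The elevator changes above add an elevator_alias field and make lookups match on a {name, uses_mq} pair, which is how "deadline" can name the legacy scheduler on !mq queues and mq-deadline on blk-mq queues. A stand-alone sketch of that matching logic with an invented sched_type table:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical scheduler descriptor mirroring the elevator_name /
 * elevator_alias pair introduced above. */
struct sched_type {
	const char *name;
	const char *alias;   /* may be NULL */
	bool uses_mq;
};

static bool sched_match(const struct sched_type *e, const char *name)
{
	if (!strcmp(e->name, name))
		return true;
	if (e->alias && !strcmp(e->alias, name))
		return true;
	return false;
}

/* Look up by {name, mq}: the same string resolves to different schedulers
 * depending on whether the queue uses blk-mq. */
static const struct sched_type *sched_find(const struct sched_type *tbl,
					   size_t n, const char *name, bool mq)
{
	for (size_t i = 0; i < n; i++)
		if (sched_match(&tbl[i], name) && tbl[i].uses_mq == mq)
			return &tbl[i];
	return NULL;
}

int main(void)
{
	const struct sched_type table[] = {
		{ "deadline",    NULL,       false },
		{ "mq-deadline", "deadline", true  },
	};
	const struct sched_type *e = sched_find(table, 2, "deadline", true);

	printf("resolved to %s\n", e ? e->name : "(none)");
	return 0;
}

With mq set to true the lookup resolves "deadline" to mq-deadline via its alias; with mq set to false it would return the legacy entry instead.
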
diff --git a/block/genhd.c b/block/genhd.c
index 630c0da6cfcf..c2223f12a805 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -588,6 +588,11 @@ static void register_disk(struct device *parent, struct gendisk *disk)
588 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); 588 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
589 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 589 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
590 590
591 if (disk->flags & GENHD_FL_HIDDEN) {
592 dev_set_uevent_suppress(ddev, 0);
593 return;
594 }
595
591 /* No minors to use for partitions */ 596 /* No minors to use for partitions */
592 if (!disk_part_scan_enabled(disk)) 597 if (!disk_part_scan_enabled(disk))
593 goto exit; 598 goto exit;
@@ -616,6 +621,11 @@ exit:
616 while ((part = disk_part_iter_next(&piter))) 621 while ((part = disk_part_iter_next(&piter)))
617 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); 622 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
618 disk_part_iter_exit(&piter); 623 disk_part_iter_exit(&piter);
624
625 err = sysfs_create_link(&ddev->kobj,
626 &disk->queue->backing_dev_info->dev->kobj,
627 "bdi");
628 WARN_ON(err);
619} 629}
620 630
621/** 631/**
@@ -630,7 +640,6 @@ exit:
630 */ 640 */
631void device_add_disk(struct device *parent, struct gendisk *disk) 641void device_add_disk(struct device *parent, struct gendisk *disk)
632{ 642{
633 struct backing_dev_info *bdi;
634 dev_t devt; 643 dev_t devt;
635 int retval; 644 int retval;
636 645
@@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
639 * parameters make sense. 648 * parameters make sense.
640 */ 649 */
641 WARN_ON(disk->minors && !(disk->major || disk->first_minor)); 650 WARN_ON(disk->minors && !(disk->major || disk->first_minor));
642 WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); 651 WARN_ON(!disk->minors &&
652 !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
643 653
644 disk->flags |= GENHD_FL_UP; 654 disk->flags |= GENHD_FL_UP;
645 655
@@ -648,22 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
648 WARN_ON(1); 658 WARN_ON(1);
649 return; 659 return;
650 } 660 }
651 disk_to_dev(disk)->devt = devt;
652
653 /* ->major and ->first_minor aren't supposed to be
654 * dereferenced from here on, but set them just in case.
655 */
656 disk->major = MAJOR(devt); 661 disk->major = MAJOR(devt);
657 disk->first_minor = MINOR(devt); 662 disk->first_minor = MINOR(devt);
658 663
659 disk_alloc_events(disk); 664 disk_alloc_events(disk);
660 665
661 /* Register BDI before referencing it from bdev */ 666 if (disk->flags & GENHD_FL_HIDDEN) {
662 bdi = disk->queue->backing_dev_info; 667 /*
663 bdi_register_owner(bdi, disk_to_dev(disk)); 668 * Don't let hidden disks show up in /proc/partitions,
664 669 * and don't bother scanning for partitions either.
665 blk_register_region(disk_devt(disk), disk->minors, NULL, 670 */
666 exact_match, exact_lock, disk); 671 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
672 disk->flags |= GENHD_FL_NO_PART_SCAN;
673 } else {
674 /* Register BDI before referencing it from bdev */
675 disk_to_dev(disk)->devt = devt;
676 bdi_register_owner(disk->queue->backing_dev_info,
677 disk_to_dev(disk));
678 blk_register_region(disk_devt(disk), disk->minors, NULL,
679 exact_match, exact_lock, disk);
680 }
667 register_disk(parent, disk); 681 register_disk(parent, disk);
668 blk_register_queue(disk); 682 blk_register_queue(disk);
669 683
@@ -673,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
673 */ 687 */
674 WARN_ON_ONCE(!blk_get_queue(disk->queue)); 688 WARN_ON_ONCE(!blk_get_queue(disk->queue));
675 689
676 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
677 "bdi");
678 WARN_ON(retval);
679
680 disk_add_events(disk); 690 disk_add_events(disk);
681 blk_integrity_add(disk); 691 blk_integrity_add(disk);
682} 692}
@@ -705,7 +715,8 @@ void del_gendisk(struct gendisk *disk)
705 set_capacity(disk, 0); 715 set_capacity(disk, 0);
706 disk->flags &= ~GENHD_FL_UP; 716 disk->flags &= ~GENHD_FL_UP;
707 717
708 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 718 if (!(disk->flags & GENHD_FL_HIDDEN))
719 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
709 if (disk->queue) { 720 if (disk->queue) {
710 /* 721 /*
711 * Unregister bdi before releasing device numbers (as they can 722 * Unregister bdi before releasing device numbers (as they can
@@ -716,13 +727,15 @@ void del_gendisk(struct gendisk *disk)
716 } else { 727 } else {
717 WARN_ON(1); 728 WARN_ON(1);
718 } 729 }
719 blk_unregister_region(disk_devt(disk), disk->minors);
720 730
721 part_stat_set_all(&disk->part0, 0); 731 if (!(disk->flags & GENHD_FL_HIDDEN))
722 disk->part0.stamp = 0; 732 blk_unregister_region(disk_devt(disk), disk->minors);
723 733
724 kobject_put(disk->part0.holder_dir); 734 kobject_put(disk->part0.holder_dir);
725 kobject_put(disk->slave_dir); 735 kobject_put(disk->slave_dir);
736
737 part_stat_set_all(&disk->part0, 0);
738 disk->part0.stamp = 0;
726 if (!sysfs_deprecated) 739 if (!sysfs_deprecated)
727 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 740 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
728 pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); 741 pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -785,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
785 spin_unlock_bh(&ext_devt_lock); 798 spin_unlock_bh(&ext_devt_lock);
786 } 799 }
787 800
801 if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
802 put_disk(disk);
803 disk = NULL;
804 }
788 return disk; 805 return disk;
789} 806}
790EXPORT_SYMBOL(get_gendisk); 807EXPORT_SYMBOL(get_gendisk);
@@ -1028,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev,
1028 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); 1045 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
1029} 1046}
1030 1047
1048static ssize_t disk_hidden_show(struct device *dev,
1049 struct device_attribute *attr, char *buf)
1050{
1051 struct gendisk *disk = dev_to_disk(dev);
1052
1053 return sprintf(buf, "%d\n",
1054 (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
1055}
1056
1031static ssize_t disk_ro_show(struct device *dev, 1057static ssize_t disk_ro_show(struct device *dev,
1032 struct device_attribute *attr, char *buf) 1058 struct device_attribute *attr, char *buf)
1033{ 1059{
@@ -1065,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
1065static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 1091static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
1066static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); 1092static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
1067static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 1093static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
1094static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
1068static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 1095static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
1069static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 1096static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
1070static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 1097static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
@@ -1089,6 +1116,7 @@ static struct attribute *disk_attrs[] = {
1089 &dev_attr_range.attr, 1116 &dev_attr_range.attr,
1090 &dev_attr_ext_range.attr, 1117 &dev_attr_ext_range.attr,
1091 &dev_attr_removable.attr, 1118 &dev_attr_removable.attr,
1119 &dev_attr_hidden.attr,
1092 &dev_attr_ro.attr, 1120 &dev_attr_ro.attr,
1093 &dev_attr_size.attr, 1121 &dev_attr_size.attr,
1094 &dev_attr_alignment_offset.attr, 1122 &dev_attr_alignment_offset.attr,
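
Alongside the GENHD_FL_HIDDEN handling, the genhd hunks above export a read-only "hidden" attribute per disk. A small user-space reader for it, assuming the usual /sys/block/<disk>/ layout:

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], buf[8];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <disk>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/sys/block/%s/hidden", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s hidden=%s", argv[1], buf);
	fclose(f);
	return 0;
}
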
diff --git a/block/ioctl.c b/block/ioctl.c
index 0de02ee67eed..1668506d8ed8 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
202{ 202{
203 uint64_t range[2]; 203 uint64_t range[2];
204 uint64_t start, len; 204 uint64_t start, len;
205 struct request_queue *q = bdev_get_queue(bdev);
206 struct address_space *mapping = bdev->bd_inode->i_mapping;
207
205 208
206 if (!(mode & FMODE_WRITE)) 209 if (!(mode & FMODE_WRITE))
207 return -EBADF; 210 return -EBADF;
208 211
212 if (!blk_queue_discard(q))
213 return -EOPNOTSUPP;
214
209 if (copy_from_user(range, (void __user *)arg, sizeof(range))) 215 if (copy_from_user(range, (void __user *)arg, sizeof(range)))
210 return -EFAULT; 216 return -EFAULT;
211 217
@@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
216 return -EINVAL; 222 return -EINVAL;
217 if (len & 511) 223 if (len & 511)
218 return -EINVAL; 224 return -EINVAL;
219 start >>= 9;
220 len >>= 9;
221 225
222 if (start + len > (i_size_read(bdev->bd_inode) >> 9)) 226 if (start + len > i_size_read(bdev->bd_inode))
223 return -EINVAL; 227 return -EINVAL;
224 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 228 truncate_inode_pages_range(mapping, start, start + len);
229 return blkdev_issue_discard(bdev, start >> 9, len >> 9,
230 GFP_KERNEL, flags);
225} 231}
226 232
227static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, 233static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
@@ -437,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode,
437{ 443{
438 int ret, n; 444 int ret, n;
439 445
446 if (!capable(CAP_SYS_ADMIN))
447 return -EACCES;
448
440 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 449 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
441 if (!is_unrecognized_ioctl(ret)) 450 if (!is_unrecognized_ioctl(ret))
442 return ret; 451 return ret;
443 if (!capable(CAP_SYS_ADMIN))
444 return -EACCES;
445 if (get_user(n, (int __user *)arg)) 452 if (get_user(n, (int __user *)arg))
446 return -EFAULT; 453 return -EFAULT;
447 set_device_ro(bdev, n); 454 set_device_ro(bdev, n);
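
The blk_ioctl_discard() changes above reject the ioctl early when the queue lacks discard support, validate the byte range against i_size before shifting to sectors, and invalidate the page cache for the range before issuing the discard. From user space the call looks the same as before; a minimal caller for reference (start and length are byte offsets and must be 512-byte aligned):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t range[2];
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <dev> <start_bytes> <len_bytes>\n",
			argv[0]);
		return 1;
	}
	range[0] = strtoull(argv[2], NULL, 0);
	range[1] = strtoull(argv[3], NULL, 0);

	fd = open(argv[1], O_WRONLY);	/* FMODE_WRITE is required */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDISCARD, range) < 0)
		perror("BLKDISCARD");	/* EOPNOTSUPP if discard unsupported */
	close(fd);
	return 0;
}
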
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index f58cab82105b..b4df317c2916 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
541 541
542 /* 542 /*
543 * Try again in case a token was freed before we got on the wait 543 * Try again in case a token was freed before we got on the wait
544 * queue. 544 * queue. The waker may have already removed the entry from the
545 * wait queue, but list_del_init() is okay with that.
545 */ 546 */
546 nr = __sbitmap_queue_get(domain_tokens); 547 nr = __sbitmap_queue_get(domain_tokens);
548 if (nr >= 0) {
549 unsigned long flags;
550
551 spin_lock_irqsave(&ws->wait.lock, flags);
552 list_del_init(&wait->entry);
553 spin_unlock_irqrestore(&ws->wait.lock, flags);
554 }
547 } 555 }
548 return nr; 556 return nr;
549} 557}
@@ -641,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
641 if (!list_empty_careful(&khd->rqs[i])) 649 if (!list_empty_careful(&khd->rqs[i]))
642 return true; 650 return true;
643 } 651 }
644 return false; 652 return sbitmap_any_bit_set(&hctx->ctx_map);
645} 653}
646 654
647#define KYBER_LAT_SHOW_STORE(op) \ 655#define KYBER_LAT_SHOW_STORE(op) \
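
The kyber fix above re-tries the token grab after queueing on the wait queue and, on success, removes its own wait entry; as the added comment notes, a concurrent waker may already have removed it, and list_del_init() tolerates that. A tiny sketch of why the double removal is safe, using a minimal circular list in place of the kernel's list helpers:

#include <stdio.h>

/* Minimal circular list mirroring list_del_init() semantics: deleting an
 * entry re-points it at itself, so a second delete (the race described in
 * the comment above) is harmless. */
struct node { struct node *prev, *next; };

static void list_init(struct node *n) { n->prev = n->next = n; }

static void list_add(struct node *head, struct node *n)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

int main(void)
{
	struct node head, waiter;

	list_init(&head);
	list_init(&waiter);

	list_add(&head, &waiter);	/* queued on the wait queue */
	list_del_init(&waiter);		/* waker removes the entry */
	list_del_init(&waiter);		/* we remove it again: still safe */
	printf("head empty: %s\n", head.next == &head ? "yes" : "no");
	return 0;
}
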
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a1cad4331edd..0179e484ec98 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = {
657#endif 657#endif
658 .elevator_attrs = deadline_attrs, 658 .elevator_attrs = deadline_attrs,
659 .elevator_name = "mq-deadline", 659 .elevator_name = "mq-deadline",
660 .elevator_alias = "deadline",
660 .elevator_owner = THIS_MODULE, 661 .elevator_owner = THIS_MODULE,
661}; 662};
662MODULE_ALIAS("mq-deadline-iosched"); 663MODULE_ALIAS("mq-deadline-iosched");
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 7440de44dd85..edcfff974527 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
207 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); 207 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
208} 208}
209 209
210int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) 210int blk_verify_command(unsigned char *cmd, fmode_t mode)
211{ 211{
212 struct blk_cmd_filter *filter = &blk_default_cmd_filter; 212 struct blk_cmd_filter *filter = &blk_default_cmd_filter;
213 213
@@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
220 return 0; 220 return 0;
221 221
222 /* Write-safe commands require a writable open */ 222 /* Write-safe commands require a writable open */
223 if (test_bit(cmd[0], filter->write_ok) && has_write_perm) 223 if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE))
224 return 0; 224 return 0;
225 225
226 return -EPERM; 226 return -EPERM;
@@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
234 234
235 if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) 235 if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len))
236 return -EFAULT; 236 return -EFAULT;
237 if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) 237 if (blk_verify_command(req->cmd, mode))
238 return -EPERM; 238 return -EPERM;
239 239
240 /* 240 /*
@@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
469 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) 469 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
470 goto error; 470 goto error;
471 471
472 err = blk_verify_command(req->cmd, mode & FMODE_WRITE); 472 err = blk_verify_command(req->cmd, mode);
473 if (err) 473 if (err)
474 goto error; 474 goto error;
475 475
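
blk_verify_command() above now receives the full fmode_t of the open file and checks FMODE_WRITE itself, rather than every caller pre-computing a has_write_perm boolean. A trivial sketch of the check with made-up DEMO_FMODE_* bit values:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel's fmode_t bits; the values are illustrative. */
#define DEMO_FMODE_READ		0x1u
#define DEMO_FMODE_WRITE	0x2u

/* Mirrors the patched check: a write-safe command is permitted only when
 * the open mode itself carries the write bit. */
static bool write_safe_allowed(unsigned int mode)
{
	return (mode & DEMO_FMODE_WRITE) != 0;
}

int main(void)
{
	printf("read-only open:  %s\n",
	       write_safe_allowed(DEMO_FMODE_READ) ? "allowed" : "denied");
	printf("read-write open: %s\n",
	       write_safe_allowed(DEMO_FMODE_READ | DEMO_FMODE_WRITE) ?
	       "allowed" : "denied");
	return 0;
}
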
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 7b2df7a54d87..923b417eaf4c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -68,9 +68,13 @@ config AMIGA_Z2RAM
68 To compile this driver as a module, choose M here: the 68 To compile this driver as a module, choose M here: the
69 module will be called z2ram. 69 module will be called z2ram.
70 70
71config CDROM
72 tristate
73
71config GDROM 74config GDROM
72 tristate "SEGA Dreamcast GD-ROM drive" 75 tristate "SEGA Dreamcast GD-ROM drive"
73 depends on SH_DREAMCAST 76 depends on SH_DREAMCAST
77 select CDROM
74 select BLK_SCSI_REQUEST # only for the generic cdrom code 78 select BLK_SCSI_REQUEST # only for the generic cdrom code
75 help 79 help
76 A standard SEGA Dreamcast comes with a modified CD ROM drive called a 80 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
@@ -348,6 +352,7 @@ config BLK_DEV_RAM_DAX
348config CDROM_PKTCDVD 352config CDROM_PKTCDVD
349 tristate "Packet writing on CD/DVD media (DEPRECATED)" 353 tristate "Packet writing on CD/DVD media (DEPRECATED)"
350 depends on !UML 354 depends on !UML
355 select CDROM
351 select BLK_SCSI_REQUEST 356 select BLK_SCSI_REQUEST
352 help 357 help
353 Note: This driver is deprecated and will be removed from the 358 Note: This driver is deprecated and will be removed from the
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2d7178f7754e..c1cf87718c2e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -60,7 +60,6 @@ struct brd_device {
60/* 60/*
61 * Look up and return a brd's page for a given sector. 61 * Look up and return a brd's page for a given sector.
62 */ 62 */
63static DEFINE_MUTEX(brd_mutex);
64static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) 63static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
65{ 64{
66 pgoff_t idx; 65 pgoff_t idx;
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 74e03aa537ad..7033a4beda66 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c
@@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
43 int cipher_len; 43 int cipher_len;
44 int mode_len; 44 int mode_len;
45 char cms[LO_NAME_SIZE]; /* cipher-mode string */ 45 char cms[LO_NAME_SIZE]; /* cipher-mode string */
46 char *cipher;
47 char *mode; 46 char *mode;
48 char *cmsp = cms; /* c-m string pointer */ 47 char *cmsp = cms; /* c-m string pointer */
49 struct crypto_skcipher *tfm; 48 struct crypto_skcipher *tfm;
@@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
56 strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); 55 strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE);
57 cms[LO_NAME_SIZE - 1] = 0; 56 cms[LO_NAME_SIZE - 1] = 0;
58 57
59 cipher = cmsp;
60 cipher_len = strcspn(cmsp, "-"); 58 cipher_len = strcspn(cmsp, "-");
61 59
62 mode = cmsp + cipher_len; 60 mode = cmsp + cipher_len;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 85de67334695..bc8e61506968 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
476{ 476{
477 struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); 477 struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
478 478
479 if (cmd->css)
480 css_put(cmd->css);
479 cmd->ret = ret; 481 cmd->ret = ret;
480 lo_rw_aio_do_completion(cmd); 482 lo_rw_aio_do_completion(cmd);
481} 483}
@@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
535 cmd->iocb.ki_filp = file; 537 cmd->iocb.ki_filp = file;
536 cmd->iocb.ki_complete = lo_rw_aio_complete; 538 cmd->iocb.ki_complete = lo_rw_aio_complete;
537 cmd->iocb.ki_flags = IOCB_DIRECT; 539 cmd->iocb.ki_flags = IOCB_DIRECT;
540 if (cmd->css)
541 kthread_associate_blkcg(cmd->css);
538 542
539 if (rw == WRITE) 543 if (rw == WRITE)
540 ret = call_write_iter(file, &cmd->iocb, &iter); 544 ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
542 ret = call_read_iter(file, &cmd->iocb, &iter); 546 ret = call_read_iter(file, &cmd->iocb, &iter);
543 547
544 lo_rw_aio_do_completion(cmd); 548 lo_rw_aio_do_completion(cmd);
549 kthread_associate_blkcg(NULL);
545 550
546 if (ret != -EIOCBQUEUED) 551 if (ret != -EIOCBQUEUED)
547 cmd->iocb.ki_complete(&cmd->iocb, ret, 0); 552 cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
1686 break; 1691 break;
1687 } 1692 }
1688 1693
1694 /* always use the first bio's css */
1695#ifdef CONFIG_BLK_CGROUP
1696 if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
1697 cmd->css = cmd->rq->bio->bi_css;
1698 css_get(cmd->css);
1699 } else
1700#endif
1701 cmd->css = NULL;
1689 kthread_queue_work(&lo->worker, &cmd->work); 1702 kthread_queue_work(&lo->worker, &cmd->work);
1690 1703
1691 return BLK_STS_OK; 1704 return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 1f3956702993..0f45416e4fcf 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -72,6 +72,7 @@ struct loop_cmd {
72 long ret; 72 long ret;
73 struct kiocb iocb; 73 struct kiocb iocb;
74 struct bio_vec *bvec; 74 struct bio_vec *bvec;
75 struct cgroup_subsys_state *css;
75}; 76};
76 77
77/* Support for loadable transfer modules */ 78/* Support for loadable transfer modules */
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 4a3cfc7940de..b8af7352a18f 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
887static bool mtip_pause_ncq(struct mtip_port *port, 887static bool mtip_pause_ncq(struct mtip_port *port,
888 struct host_to_dev_fis *fis) 888 struct host_to_dev_fis *fis)
889{ 889{
890 struct host_to_dev_fis *reply;
891 unsigned long task_file_data; 890 unsigned long task_file_data;
892 891
893 reply = port->rxfis + RX_FIS_D2H_REG;
894 task_file_data = readl(port->mmio+PORT_TFDATA); 892 task_file_data = readl(port->mmio+PORT_TFDATA);
895
896 if ((task_file_data & 1)) 893 if ((task_file_data & 1))
897 return false; 894 return false;
898 895
@@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1020 .opts = opts 1017 .opts = opts
1021 }; 1018 };
1022 int rv = 0; 1019 int rv = 0;
1023 unsigned long start;
1024 1020
1025 /* Make sure the buffer is 8 byte aligned. This is asic specific. */ 1021 /* Make sure the buffer is 8 byte aligned. This is asic specific. */
1026 if (buffer & 0x00000007) { 1022 if (buffer & 0x00000007) {
@@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1057 /* Copy the command to the command table */ 1053 /* Copy the command to the command table */
1058 memcpy(int_cmd->command, fis, fis_len*4); 1054 memcpy(int_cmd->command, fis, fis_len*4);
1059 1055
1060 start = jiffies;
1061 rq->timeout = timeout; 1056 rq->timeout = timeout;
1062 1057
1063 /* insert request and run queue */ 1058 /* insert request and run queue */
@@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd)
3015{ 3010{
3016 int i; 3011 int i;
3017 int rv; 3012 int rv;
3018 unsigned int num_command_slots;
3019 unsigned long timeout, timetaken; 3013 unsigned long timeout, timetaken;
3020 3014
3021 dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; 3015 dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
@@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd)
3025 rv = -EIO; 3019 rv = -EIO;
3026 goto out1; 3020 goto out1;
3027 } 3021 }
3028 num_command_slots = dd->slot_groups * 32;
3029 3022
3030 hba_setup(dd); 3023 hba_setup(dd);
3031 3024
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9adfb5445f8d..5f2a4240a204 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -288,15 +288,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
288 cmd->status = BLK_STS_TIMEOUT; 288 cmd->status = BLK_STS_TIMEOUT;
289 return BLK_EH_HANDLED; 289 return BLK_EH_HANDLED;
290 } 290 }
291
292 /* If we are waiting on our dead timer then we could get timeout
293 * callbacks for our request. For this we just want to reset the timer
294 * and let the queue side take care of everything.
295 */
296 if (!completion_done(&cmd->send_complete)) {
297 nbd_config_put(nbd);
298 return BLK_EH_RESET_TIMER;
299 }
300 config = nbd->config; 291 config = nbd->config;
301 292
302 if (config->num_connections > 1) { 293 if (config->num_connections > 1) {
@@ -723,9 +714,9 @@ static int wait_for_reconnect(struct nbd_device *nbd)
723 return 0; 714 return 0;
724 if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) 715 if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
725 return 0; 716 return 0;
726 wait_event_interruptible_timeout(config->conn_wait, 717 wait_event_timeout(config->conn_wait,
727 atomic_read(&config->live_connections), 718 atomic_read(&config->live_connections),
728 config->dead_conn_timeout); 719 config->dead_conn_timeout);
729 return atomic_read(&config->live_connections); 720 return atomic_read(&config->live_connections);
730} 721}
731 722
@@ -740,6 +731,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
740 if (!refcount_inc_not_zero(&nbd->config_refs)) { 731 if (!refcount_inc_not_zero(&nbd->config_refs)) {
741 dev_err_ratelimited(disk_to_dev(nbd->disk), 732 dev_err_ratelimited(disk_to_dev(nbd->disk),
742 "Socks array is empty\n"); 733 "Socks array is empty\n");
734 blk_mq_start_request(req);
743 return -EINVAL; 735 return -EINVAL;
744 } 736 }
745 config = nbd->config; 737 config = nbd->config;
@@ -748,6 +740,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
748 dev_err_ratelimited(disk_to_dev(nbd->disk), 740 dev_err_ratelimited(disk_to_dev(nbd->disk),
749 "Attempted send on invalid socket\n"); 741 "Attempted send on invalid socket\n");
750 nbd_config_put(nbd); 742 nbd_config_put(nbd);
743 blk_mq_start_request(req);
751 return -EINVAL; 744 return -EINVAL;
752 } 745 }
753 cmd->status = BLK_STS_OK; 746 cmd->status = BLK_STS_OK;
@@ -771,6 +764,7 @@ again:
771 */ 764 */
772 sock_shutdown(nbd); 765 sock_shutdown(nbd);
773 nbd_config_put(nbd); 766 nbd_config_put(nbd);
767 blk_mq_start_request(req);
774 return -EIO; 768 return -EIO;
775 } 769 }
776 goto again; 770 goto again;
@@ -781,6 +775,7 @@ again:
781 * here so that it gets put _after_ the request that is already on the 775 * here so that it gets put _after_ the request that is already on the
782 * dispatch list. 776 * dispatch list.
783 */ 777 */
778 blk_mq_start_request(req);
784 if (unlikely(nsock->pending && nsock->pending != req)) { 779 if (unlikely(nsock->pending && nsock->pending != req)) {
785 blk_mq_requeue_request(req, true); 780 blk_mq_requeue_request(req, true);
786 ret = 0; 781 ret = 0;
@@ -793,10 +788,10 @@ again:
793 ret = nbd_send_cmd(nbd, cmd, index); 788 ret = nbd_send_cmd(nbd, cmd, index);
794 if (ret == -EAGAIN) { 789 if (ret == -EAGAIN) {
795 dev_err_ratelimited(disk_to_dev(nbd->disk), 790 dev_err_ratelimited(disk_to_dev(nbd->disk),
796 "Request send failed trying another connection\n"); 791 "Request send failed, requeueing\n");
797 nbd_mark_nsock_dead(nbd, nsock, 1); 792 nbd_mark_nsock_dead(nbd, nsock, 1);
798 mutex_unlock(&nsock->tx_lock); 793 blk_mq_requeue_request(req, true);
799 goto again; 794 ret = 0;
800 } 795 }
801out: 796out:
802 mutex_unlock(&nsock->tx_lock); 797 mutex_unlock(&nsock->tx_lock);
@@ -820,7 +815,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
820 * done sending everything over the wire. 815 * done sending everything over the wire.
821 */ 816 */
822 init_completion(&cmd->send_complete); 817 init_completion(&cmd->send_complete);
823 blk_mq_start_request(bd->rq);
824 818
825 /* We can be called directly from the user space process, which means we 819 /* We can be called directly from the user space process, which means we
826 * could possibly have signals pending so our sendmsg will fail. In 820 * could possibly have signals pending so our sendmsg will fail. In
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cda69dbefe3b..c61960deb74a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -154,6 +154,10 @@ enum {
154 NULL_Q_MQ = 2, 154 NULL_Q_MQ = 2,
155}; 155};
156 156
157static int g_no_sched;
158module_param_named(no_sched, g_no_sched, int, S_IRUGO);
159MODULE_PARM_DESC(no_sched, "No io scheduler");
160
157static int g_submit_queues = 1; 161static int g_submit_queues = 1;
158module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); 162module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
159MODULE_PARM_DESC(submit_queues, "Number of submission queues"); 163MODULE_PARM_DESC(submit_queues, "Number of submission queues");
@@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
1754 set->numa_node = nullb ? nullb->dev->home_node : g_home_node; 1758 set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
1755 set->cmd_size = sizeof(struct nullb_cmd); 1759 set->cmd_size = sizeof(struct nullb_cmd);
1756 set->flags = BLK_MQ_F_SHOULD_MERGE; 1760 set->flags = BLK_MQ_F_SHOULD_MERGE;
1761 if (g_no_sched)
1762 set->flags |= BLK_MQ_F_NO_SCHED;
1757 set->driver_data = NULL; 1763 set->driver_data = NULL;
1758 1764
1759 if ((nullb && nullb->dev->blocking) || g_blocking) 1765 if ((nullb && nullb->dev->blocking) || g_blocking)
@@ -1985,8 +1991,10 @@ static int __init null_init(void)
1985 1991
1986 for (i = 0; i < nr_devices; i++) { 1992 for (i = 0; i < nr_devices; i++) {
1987 dev = null_alloc_dev(); 1993 dev = null_alloc_dev();
1988 if (!dev) 1994 if (!dev) {
1995 ret = -ENOMEM;
1989 goto err_dev; 1996 goto err_dev;
1997 }
1990 ret = null_add_dev(dev); 1998 ret = null_add_dev(dev);
1991 if (ret) { 1999 if (ret) {
1992 null_free_dev(dev); 2000 null_free_dev(dev);
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
index b226835a909a..f8bd6ef3605a 100644
--- a/drivers/block/paride/Kconfig
+++ b/drivers/block/paride/Kconfig
@@ -26,6 +26,7 @@ config PARIDE_PD
26config PARIDE_PCD 26config PARIDE_PCD
27 tristate "Parallel port ATAPI CD-ROMs" 27 tristate "Parallel port ATAPI CD-ROMs"
28 depends on PARIDE 28 depends on PARIDE
29 select CDROM
29 select BLK_SCSI_REQUEST # only for the generic cdrom code 30 select BLK_SCSI_REQUEST # only for the generic cdrom code
30 ---help--- 31 ---help---
31 This option enables the high-level driver for ATAPI CD-ROM devices 32 This option enables the high-level driver for ATAPI CD-ROM devices
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 64d0fc17c174..2819f23e8bf2 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev)
1967 break; 1967 break;
1968 1968
1969 case FIT_MTD_CMD_LOG_HOST_ID: 1969 case FIT_MTD_CMD_LOG_HOST_ID:
1970 skdev->connect_time_stamp = get_seconds(); 1970 /* hardware interface overflows in y2106 */
1971 skdev->connect_time_stamp = (u32)ktime_get_real_seconds();
1971 data = skdev->connect_time_stamp & 0xFFFF; 1972 data = skdev->connect_time_stamp & 0xFFFF;
1972 mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); 1973 mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
1973 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); 1974 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile
index a95566ff47d3..0f3664b45f48 100644
--- a/drivers/cdrom/Makefile
+++ b/drivers/cdrom/Makefile
@@ -1,14 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2# Makefile for the kernel cdrom device drivers. 2obj-$(CONFIG_CDROM) += cdrom.o
3# 3obj-$(CONFIG_GDROM) += gdrom.o
4# 30 Jan 1998, Michael Elizabeth Chastain, <mailto:mec@shout.net>
5# Rewritten to use lists instead of if-statements.
6
7# Each configuration option enables a list of files.
8
9obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o
10obj-$(CONFIG_BLK_DEV_SR) += cdrom.o
11obj-$(CONFIG_PARIDE_PCD) += cdrom.o
12obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o
13
14obj-$(CONFIG_GDROM) += gdrom.o cdrom.o
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index c99a25c075bc..cf1fb3fb5d26 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -117,7 +117,9 @@ config BLK_DEV_DELKIN
117 117
118config BLK_DEV_IDECD 118config BLK_DEV_IDECD
119 tristate "Include IDE/ATAPI CDROM support" 119 tristate "Include IDE/ATAPI CDROM support"
120 depends on BLK_DEV
120 select IDE_ATAPI 121 select IDE_ATAPI
122 select CDROM
121 ---help--- 123 ---help---
122 If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is 124 If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is
123 a newer protocol used by IDE CD-ROM and TAPE drives, similar to the 125 a newer protocol used by IDE CD-ROM and TAPE drives, similar to the
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 14d1e7d9a1d6..0e6bc631a1ca 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive)
282 struct request *rq = drive->hwif->rq; 282 struct request *rq = drive->hwif->rq;
283 unsigned long wait = 0; 283 unsigned long wait = 0;
284 284
285 debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]); 285 debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]);
286 286
287 /* 287 /*
288 * Some commands are *slow* and normally take a long time to complete. 288 * Some commands are *slow* and normally take a long time to complete.
@@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
463 return ide_do_reset(drive); 463 return ide_do_reset(drive);
464 } 464 }
465 465
466 debug_log("[cmd %x]: check condition\n", rq->cmd[0]); 466 debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]);
467 467
468 /* Retry operation */ 468 /* Retry operation */
469 ide_retry_pc(drive); 469 ide_retry_pc(drive);
@@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
531 ide_pad_transfer(drive, write, bcount); 531 ide_pad_transfer(drive, write, bcount);
532 532
533 debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n", 533 debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
534 rq->cmd[0], done, bcount, scsi_req(rq)->resid_len); 534 scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len);
535 535
536 /* And set the interrupt handler again */ 536 /* And set the interrupt handler again */
537 ide_set_handler(drive, ide_pc_intr, timeout); 537 ide_set_handler(drive, ide_pc_intr, timeout);
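These debug_log() fixes track the move of the command bytes into the scsi_request payload; with DEBUG defined, the old rq->cmd[0] no longer compiles. A minimal sketch of the accessor pattern, assuming a 4.15-era tree (the helper name is illustrative):

	#include <linux/blkdev.h>
	#include <linux/printk.h>
	#include <scsi/scsi_request.h>

	static void example_log_cmd(struct request *rq)
	{
		struct scsi_request *sreq = scsi_req(rq);

		/* cmd[] and resid_len now live in the scsi_request PDU */
		pr_debug("cmd[0]=0x%x resid=%u\n", sreq->cmd[0], sreq->resid_len);
	}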
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index dccdca9eda38..ad8a125defdd 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -90,9 +90,9 @@ int generic_ide_resume(struct device *dev)
90 } 90 }
91 91
92 memset(&rqpm, 0, sizeof(rqpm)); 92 memset(&rqpm, 0, sizeof(rqpm));
93 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); 93 rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
94 BLK_MQ_REQ_PREEMPT);
94 ide_req(rq)->type = ATA_PRIV_PM_RESUME; 95 ide_req(rq)->type = ATA_PRIV_PM_RESUME;
95 rq->rq_flags |= RQF_PREEMPT;
96 rq->special = &rqpm; 96 rq->special = &rqpm;
97 rqpm.pm_step = IDE_PM_START_RESUME; 97 rqpm.pm_step = IDE_PM_START_RESUME;
98 rqpm.pm_state = PM_EVENT_ON; 98 rqpm.pm_state = PM_EVENT_ON;
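The resume path now asks for preempt semantics at allocation time instead of setting RQF_PREEMPT afterwards, which is what the SCSI/blk-mq quiesce work in this series relies on. A hedged sketch of the call, assuming the 4.15 block API; the IS_ERR() handling is the usual convention, not copied from ide-pm.c:

	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>
	#include <linux/err.h>

	static struct request *example_alloc_pm_rq(struct request_queue *q)
	{
		struct request *rq;

		rq = blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
		if (IS_ERR(rq))
			return NULL;	/* allocation failed or queue is dying */
		return rq;
	}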
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index ead61a93cb4e..2a953efec4e1 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -4,7 +4,8 @@
4 4
5menuconfig NVM 5menuconfig NVM
6 bool "Open-Channel SSD target support" 6 bool "Open-Channel SSD target support"
7 depends on BLOCK && HAS_DMA 7 depends on BLOCK && HAS_DMA && PCI
8 select BLK_DEV_NVME
8 help 9 help
9 Say Y here to get to enable Open-channel SSDs. 10 Say Y here to get to enable Open-channel SSDs.
10 11
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index ddae430b6eae..83249b43dd06 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -22,6 +22,7 @@
22#include <linux/types.h> 22#include <linux/types.h>
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/bitmap.h> 24#include <linux/bitmap.h>
25#include <linux/module.h>
25#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
26#include <linux/miscdevice.h> 27#include <linux/miscdevice.h>
27#include <linux/lightnvm.h> 28#include <linux/lightnvm.h>
@@ -138,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
138 int prev_nr_luns; 139 int prev_nr_luns;
139 int i, j; 140 int i, j;
140 141
141 nr_chnls = nr_luns / dev->geo.luns_per_chnl;
142 nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; 142 nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
143 143
144 dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); 144 dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
@@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = {
226 .owner = THIS_MODULE, 226 .owner = THIS_MODULE,
227}; 227};
228 228
229static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
230{
231 struct nvm_tgt_type *tmp, *tt = NULL;
232
233 if (lock)
234 down_write(&nvm_tgtt_lock);
235
236 list_for_each_entry(tmp, &nvm_tgt_types, list)
237 if (!strcmp(name, tmp->name)) {
238 tt = tmp;
239 break;
240 }
241
242 if (lock)
243 up_write(&nvm_tgtt_lock);
244 return tt;
245}
246
229static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) 247static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
230{ 248{
231 struct nvm_ioctl_create_simple *s = &create->conf.s; 249 struct nvm_ioctl_create_simple *s = &create->conf.s;
@@ -316,6 +334,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
316 list_add_tail(&t->list, &dev->targets); 334 list_add_tail(&t->list, &dev->targets);
317 mutex_unlock(&dev->mlock); 335 mutex_unlock(&dev->mlock);
318 336
337 __module_get(tt->owner);
338
319 return 0; 339 return 0;
320err_sysfs: 340err_sysfs:
321 if (tt->exit) 341 if (tt->exit)
@@ -351,6 +371,7 @@ static void __nvm_remove_target(struct nvm_target *t)
351 371
352 nvm_remove_tgt_dev(t->dev, 1); 372 nvm_remove_tgt_dev(t->dev, 1);
353 put_disk(tdisk); 373 put_disk(tdisk);
374 module_put(t->type->owner);
354 375
355 list_del(&t->list); 376 list_del(&t->list);
356 kfree(t); 377 kfree(t);
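The __module_get()/module_put() pair added here pins the target type's owning module for the lifetime of each target instance. A minimal sketch of the pairing, with illustrative names:

	#include <linux/module.h>

	struct example_tgt_type {
		struct module *owner;
	};

	static void example_tgt_created(struct example_tgt_type *tt)
	{
		__module_get(tt->owner);	/* instance now holds a reference */
	}

	static void example_tgt_removed(struct example_tgt_type *tt)
	{
		module_put(tt->owner);		/* dropped when the instance dies */
	}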
@@ -532,25 +553,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
532} 553}
533EXPORT_SYMBOL(nvm_part_to_tgt); 554EXPORT_SYMBOL(nvm_part_to_tgt);
534 555
535struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
536{
537 struct nvm_tgt_type *tmp, *tt = NULL;
538
539 if (lock)
540 down_write(&nvm_tgtt_lock);
541
542 list_for_each_entry(tmp, &nvm_tgt_types, list)
543 if (!strcmp(name, tmp->name)) {
544 tt = tmp;
545 break;
546 }
547
548 if (lock)
549 up_write(&nvm_tgtt_lock);
550 return tt;
551}
552EXPORT_SYMBOL(nvm_find_target_type);
553
554int nvm_register_tgt_type(struct nvm_tgt_type *tt) 556int nvm_register_tgt_type(struct nvm_tgt_type *tt)
555{ 557{
556 int ret = 0; 558 int ret = 0;
@@ -571,9 +573,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt)
571 if (!tt) 573 if (!tt)
572 return; 574 return;
573 575
574 down_write(&nvm_lock); 576 down_write(&nvm_tgtt_lock);
575 list_del(&tt->list); 577 list_del(&tt->list);
576 up_write(&nvm_lock); 578 up_write(&nvm_tgtt_lock);
577} 579}
578EXPORT_SYMBOL(nvm_unregister_tgt_type); 580EXPORT_SYMBOL(nvm_unregister_tgt_type);
579 581
@@ -602,6 +604,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
602 return NULL; 604 return NULL;
603} 605}
604 606
607static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
608 const struct ppa_addr *ppas, int nr_ppas)
609{
610 struct nvm_dev *dev = tgt_dev->parent;
611 struct nvm_geo *geo = &tgt_dev->geo;
612 int i, plane_cnt, pl_idx;
613 struct ppa_addr ppa;
614
615 if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
616 rqd->nr_ppas = nr_ppas;
617 rqd->ppa_addr = ppas[0];
618
619 return 0;
620 }
621
622 rqd->nr_ppas = nr_ppas;
623 rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
624 if (!rqd->ppa_list) {
625 pr_err("nvm: failed to allocate dma memory\n");
626 return -ENOMEM;
627 }
628
629 plane_cnt = geo->plane_mode;
630 rqd->nr_ppas *= plane_cnt;
631
632 for (i = 0; i < nr_ppas; i++) {
633 for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
634 ppa = ppas[i];
635 ppa.g.pl = pl_idx;
636 rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
637 }
638 }
639
640 return 0;
641}
642
643static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev,
644 struct nvm_rq *rqd)
645{
646 if (!rqd->ppa_list)
647 return;
648
649 nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
650}
651
652
605int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, 653int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
606 int nr_ppas, int type) 654 int nr_ppas, int type)
607{ 655{
@@ -616,7 +664,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
616 664
617 memset(&rqd, 0, sizeof(struct nvm_rq)); 665 memset(&rqd, 0, sizeof(struct nvm_rq));
618 666
619 nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); 667 nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
620 nvm_rq_tgt_to_dev(tgt_dev, &rqd); 668 nvm_rq_tgt_to_dev(tgt_dev, &rqd);
621 669
622 ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); 670 ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
@@ -658,12 +706,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
658} 706}
659EXPORT_SYMBOL(nvm_submit_io); 707EXPORT_SYMBOL(nvm_submit_io);
660 708
661static void nvm_end_io_sync(struct nvm_rq *rqd) 709int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
662{ 710{
663 struct completion *waiting = rqd->private; 711 struct nvm_dev *dev = tgt_dev->parent;
712 int ret;
664 713
665 complete(waiting); 714 if (!dev->ops->submit_io_sync)
715 return -ENODEV;
716
717 nvm_rq_tgt_to_dev(tgt_dev, rqd);
718
719 rqd->dev = tgt_dev;
720
721 /* In case of error, fail with right address format */
722 ret = dev->ops->submit_io_sync(dev, rqd);
723 nvm_rq_dev_to_tgt(tgt_dev, rqd);
724
725 return ret;
666} 726}
727EXPORT_SYMBOL(nvm_submit_io_sync);
667 728
668int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, 729int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
669 int nr_ppas) 730 int nr_ppas)
@@ -671,25 +732,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
671 struct nvm_geo *geo = &tgt_dev->geo; 732 struct nvm_geo *geo = &tgt_dev->geo;
672 struct nvm_rq rqd; 733 struct nvm_rq rqd;
673 int ret; 734 int ret;
674 DECLARE_COMPLETION_ONSTACK(wait);
675 735
676 memset(&rqd, 0, sizeof(struct nvm_rq)); 736 memset(&rqd, 0, sizeof(struct nvm_rq));
677 737
678 rqd.opcode = NVM_OP_ERASE; 738 rqd.opcode = NVM_OP_ERASE;
679 rqd.end_io = nvm_end_io_sync;
680 rqd.private = &wait;
681 rqd.flags = geo->plane_mode >> 1; 739 rqd.flags = geo->plane_mode >> 1;
682 740
683 ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); 741 ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
684 if (ret) 742 if (ret)
685 return ret; 743 return ret;
686 744
687 ret = nvm_submit_io(tgt_dev, &rqd); 745 ret = nvm_submit_io_sync(tgt_dev, &rqd);
688 if (ret) { 746 if (ret) {
689 pr_err("rrpr: erase I/O submission failed: %d\n", ret); 747 pr_err("rrpr: erase I/O submission failed: %d\n", ret);
690 goto free_ppa_list; 748 goto free_ppa_list;
691 } 749 }
692 wait_for_completion_io(&wait);
693 750
694free_ppa_list: 751free_ppa_list:
695 nvm_free_rqd_ppalist(tgt_dev, &rqd); 752 nvm_free_rqd_ppalist(tgt_dev, &rqd);
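nvm_erase_sync() used to fake synchronous behaviour by pointing end_io at a small completion helper and sleeping on it; with ->submit_io_sync() provided by the driver, that boilerplate disappears. A hedged sketch of the removed pattern, assuming kernel context and illustrative types:

	#include <linux/completion.h>

	struct example_rq {
		void (*end_io)(struct example_rq *rqd);
		void *private;
	};

	static void example_end_io_sync(struct example_rq *rqd)
	{
		complete(rqd->private);
	}

	static void example_submit_and_wait(struct example_rq *rqd,
					    int (*submit)(struct example_rq *))
	{
		DECLARE_COMPLETION_ONSTACK(wait);

		rqd->end_io = example_end_io_sync;
		rqd->private = &wait;
		if (!submit(rqd))		/* 0 means the request was queued */
			wait_for_completion_io(&wait);
	}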
@@ -775,57 +832,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
775} 832}
776EXPORT_SYMBOL(nvm_put_area); 833EXPORT_SYMBOL(nvm_put_area);
777 834
778int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
779 const struct ppa_addr *ppas, int nr_ppas, int vblk)
780{
781 struct nvm_dev *dev = tgt_dev->parent;
782 struct nvm_geo *geo = &tgt_dev->geo;
783 int i, plane_cnt, pl_idx;
784 struct ppa_addr ppa;
785
786 if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
787 rqd->nr_ppas = nr_ppas;
788 rqd->ppa_addr = ppas[0];
789
790 return 0;
791 }
792
793 rqd->nr_ppas = nr_ppas;
794 rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
795 if (!rqd->ppa_list) {
796 pr_err("nvm: failed to allocate dma memory\n");
797 return -ENOMEM;
798 }
799
800 if (!vblk) {
801 for (i = 0; i < nr_ppas; i++)
802 rqd->ppa_list[i] = ppas[i];
803 } else {
804 plane_cnt = geo->plane_mode;
805 rqd->nr_ppas *= plane_cnt;
806
807 for (i = 0; i < nr_ppas; i++) {
808 for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
809 ppa = ppas[i];
810 ppa.g.pl = pl_idx;
811 rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
812 }
813 }
814 }
815
816 return 0;
817}
818EXPORT_SYMBOL(nvm_set_rqd_ppalist);
819
820void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
821{
822 if (!rqd->ppa_list)
823 return;
824
825 nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
826}
827EXPORT_SYMBOL(nvm_free_rqd_ppalist);
828
829void nvm_end_io(struct nvm_rq *rqd) 835void nvm_end_io(struct nvm_rq *rqd)
830{ 836{
831 struct nvm_tgt_dev *tgt_dev = rqd->dev; 837 struct nvm_tgt_dev *tgt_dev = rqd->dev;
@@ -1177,7 +1183,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
1177 info->version[1] = NVM_VERSION_MINOR; 1183 info->version[1] = NVM_VERSION_MINOR;
1178 info->version[2] = NVM_VERSION_PATCH; 1184 info->version[2] = NVM_VERSION_PATCH;
1179 1185
1180 down_write(&nvm_lock); 1186 down_write(&nvm_tgtt_lock);
1181 list_for_each_entry(tt, &nvm_tgt_types, list) { 1187 list_for_each_entry(tt, &nvm_tgt_types, list) {
1182 struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; 1188 struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter];
1183 1189
@@ -1190,7 +1196,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
1190 } 1196 }
1191 1197
1192 info->tgtsize = tgt_iter; 1198 info->tgtsize = tgt_iter;
1193 up_write(&nvm_lock); 1199 up_write(&nvm_tgtt_lock);
1194 1200
1195 if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { 1201 if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) {
1196 kfree(info); 1202 kfree(info);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 024a8fc93069..0d227ef7d1b9 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -43,8 +43,10 @@ retry:
43 if (unlikely(!bio_has_data(bio))) 43 if (unlikely(!bio_has_data(bio)))
44 goto out; 44 goto out;
45 45
46 w_ctx.flags = flags;
47 pblk_ppa_set_empty(&w_ctx.ppa); 46 pblk_ppa_set_empty(&w_ctx.ppa);
47 w_ctx.flags = flags;
48 if (bio->bi_opf & REQ_PREFLUSH)
49 w_ctx.flags |= PBLK_FLUSH_ENTRY;
48 50
49 for (i = 0; i < nr_entries; i++) { 51 for (i = 0; i < nr_entries; i++) {
50 void *data = bio_data(bio); 52 void *data = bio_data(bio);
@@ -73,12 +75,11 @@ out:
73 * On GC the incoming lbas are not necessarily sequential. Also, some of the 75 * On GC the incoming lbas are not necessarily sequential. Also, some of the
74 * lbas might not be valid entries, which are marked as empty by the GC thread 76 * lbas might not be valid entries, which are marked as empty by the GC thread
75 */ 77 */
76int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, 78int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
77 unsigned int nr_entries, unsigned int nr_rec_entries,
78 struct pblk_line *gc_line, unsigned long flags)
79{ 79{
80 struct pblk_w_ctx w_ctx; 80 struct pblk_w_ctx w_ctx;
81 unsigned int bpos, pos; 81 unsigned int bpos, pos;
82 void *data = gc_rq->data;
82 int i, valid_entries; 83 int i, valid_entries;
83 84
84 /* Update the write buffer head (mem) with the entries that we can 85 /* Update the write buffer head (mem) with the entries that we can
@@ -86,28 +87,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
86 * rollback from here on. 87 * rollback from here on.
87 */ 88 */
88retry: 89retry:
89 if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { 90 if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) {
90 io_schedule(); 91 io_schedule();
91 goto retry; 92 goto retry;
92 } 93 }
93 94
94 w_ctx.flags = flags; 95 w_ctx.flags = PBLK_IOTYPE_GC;
95 pblk_ppa_set_empty(&w_ctx.ppa); 96 pblk_ppa_set_empty(&w_ctx.ppa);
96 97
97 for (i = 0, valid_entries = 0; i < nr_entries; i++) { 98 for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) {
98 if (lba_list[i] == ADDR_EMPTY) 99 if (gc_rq->lba_list[i] == ADDR_EMPTY)
99 continue; 100 continue;
100 101
101 w_ctx.lba = lba_list[i]; 102 w_ctx.lba = gc_rq->lba_list[i];
102 103
103 pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); 104 pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
104 pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); 105 pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line,
106 gc_rq->paddr_list[i], pos);
105 107
106 data += PBLK_EXPOSED_PAGE_SIZE; 108 data += PBLK_EXPOSED_PAGE_SIZE;
107 valid_entries++; 109 valid_entries++;
108 } 110 }
109 111
110 WARN_ONCE(nr_rec_entries != valid_entries, 112 WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
111 "pblk: inconsistent GC write\n"); 113 "pblk: inconsistent GC write\n");
112 114
113#ifdef CONFIG_NVM_DEBUG 115#ifdef CONFIG_NVM_DEBUG
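pblk_write_gc_to_cache() now takes the whole GC request instead of six scalar arguments, which is what lets the per-sector paddr_list ride along for the consistency check. A minimal sketch of the interface shape, with hypothetical field and function names where the hunk does not show them:

	#include <linux/types.h>

	struct example_gc_rq {
		void *data;
		u64 *lba_list;
		u64 *paddr_list;
		int nr_secs;
		int secs_to_gc;
	};

	/* old: every new piece of per-request state widened this prototype */
	int old_write_gc(void *data, u64 *lba_list, int nr_secs, int secs_to_gc);

	/* new: one handle carries the request, so call sites stay stable */
	int new_write_gc(struct example_gc_rq *gc_rq);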
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 81501644fb15..ce90213a42fa 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -18,6 +18,31 @@
18 18
19#include "pblk.h" 19#include "pblk.h"
20 20
21static void pblk_line_mark_bb(struct work_struct *work)
22{
23 struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
24 ws);
25 struct pblk *pblk = line_ws->pblk;
26 struct nvm_tgt_dev *dev = pblk->dev;
27 struct ppa_addr *ppa = line_ws->priv;
28 int ret;
29
30 ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
31 if (ret) {
32 struct pblk_line *line;
33 int pos;
34
35 line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
36 pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
37
38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
39 line->id, pos);
40 }
41
42 kfree(ppa);
43 mempool_free(line_ws, pblk->gen_ws_pool);
44}
45
21static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, 46static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
22 struct ppa_addr *ppa) 47 struct ppa_addr *ppa)
23{ 48{
@@ -33,7 +58,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
33 pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", 58 pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
34 line->id, pos); 59 line->id, pos);
35 60
36 pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); 61 pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb,
62 GFP_ATOMIC, pblk->bb_wq);
37} 63}
38 64
39static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) 65static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
@@ -63,7 +89,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
63 struct pblk *pblk = rqd->private; 89 struct pblk *pblk = rqd->private;
64 90
65 __pblk_end_io_erase(pblk, rqd); 91 __pblk_end_io_erase(pblk, rqd);
66 mempool_free(rqd, pblk->g_rq_pool); 92 mempool_free(rqd, pblk->e_rq_pool);
67} 93}
68 94
69void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, 95void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
@@ -77,11 +103,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
77 * that newer updates are not overwritten. 103 * that newer updates are not overwritten.
78 */ 104 */
79 spin_lock(&line->lock); 105 spin_lock(&line->lock);
80 if (line->state == PBLK_LINESTATE_GC || 106 WARN_ON(line->state == PBLK_LINESTATE_FREE);
81 line->state == PBLK_LINESTATE_FREE) {
82 spin_unlock(&line->lock);
83 return;
84 }
85 107
86 if (test_and_set_bit(paddr, line->invalid_bitmap)) { 108 if (test_and_set_bit(paddr, line->invalid_bitmap)) {
87 WARN_ONCE(1, "pblk: double invalidate\n"); 109 WARN_ONCE(1, "pblk: double invalidate\n");
@@ -98,8 +120,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
98 spin_lock(&l_mg->gc_lock); 120 spin_lock(&l_mg->gc_lock);
99 spin_lock(&line->lock); 121 spin_lock(&line->lock);
100 /* Prevent moving a line that has just been chosen for GC */ 122 /* Prevent moving a line that has just been chosen for GC */
101 if (line->state == PBLK_LINESTATE_GC || 123 if (line->state == PBLK_LINESTATE_GC) {
102 line->state == PBLK_LINESTATE_FREE) {
103 spin_unlock(&line->lock); 124 spin_unlock(&line->lock);
104 spin_unlock(&l_mg->gc_lock); 125 spin_unlock(&l_mg->gc_lock);
105 return; 126 return;
@@ -150,17 +171,25 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
150 spin_unlock(&pblk->trans_lock); 171 spin_unlock(&pblk->trans_lock);
151} 172}
152 173
153struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) 174/* Caller must guarantee that the request is a valid type */
175struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
154{ 176{
155 mempool_t *pool; 177 mempool_t *pool;
156 struct nvm_rq *rqd; 178 struct nvm_rq *rqd;
157 int rq_size; 179 int rq_size;
158 180
159 if (rw == WRITE) { 181 switch (type) {
182 case PBLK_WRITE:
183 case PBLK_WRITE_INT:
160 pool = pblk->w_rq_pool; 184 pool = pblk->w_rq_pool;
161 rq_size = pblk_w_rq_size; 185 rq_size = pblk_w_rq_size;
162 } else { 186 break;
163 pool = pblk->g_rq_pool; 187 case PBLK_READ:
188 pool = pblk->r_rq_pool;
189 rq_size = pblk_g_rq_size;
190 break;
191 default:
192 pool = pblk->e_rq_pool;
164 rq_size = pblk_g_rq_size; 193 rq_size = pblk_g_rq_size;
165 } 194 }
166 195
@@ -170,15 +199,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
170 return rqd; 199 return rqd;
171} 200}
172 201
173void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) 202/* Typically used on completion path. Cannot guarantee request consistency */
203void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
174{ 204{
205 struct nvm_tgt_dev *dev = pblk->dev;
175 mempool_t *pool; 206 mempool_t *pool;
176 207
177 if (rw == WRITE) 208 switch (type) {
209 case PBLK_WRITE:
210 kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
211 case PBLK_WRITE_INT:
178 pool = pblk->w_rq_pool; 212 pool = pblk->w_rq_pool;
179 else 213 break;
180 pool = pblk->g_rq_pool; 214 case PBLK_READ:
215 pool = pblk->r_rq_pool;
216 break;
217 case PBLK_ERASE:
218 pool = pblk->e_rq_pool;
219 break;
220 default:
221 pr_err("pblk: trying to free unknown rqd type\n");
222 return;
223 }
181 224
225 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
182 mempool_free(rqd, pool); 226 mempool_free(rqd, pool);
183} 227}
184 228
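Request allocation and free are now keyed on an explicit request type rather than the old READ/WRITE flag, so each class of request gets its own mempool and writes can release their lun_bitmap on the way out. A hedged sketch of the dispatch, assuming kernel context; only the pool names mirror the patch:

	#include <linux/mempool.h>

	enum { EX_WRITE, EX_WRITE_INT, EX_READ, EX_ERASE };

	struct example_pools {
		mempool_t *w_rq_pool;
		mempool_t *r_rq_pool;
		mempool_t *e_rq_pool;
	};

	static mempool_t *example_pool_for(struct example_pools *p, int type)
	{
		switch (type) {
		case EX_WRITE:
		case EX_WRITE_INT:		/* internal writes share the pool */
			return p->w_rq_pool;
		case EX_READ:
			return p->r_rq_pool;
		default:
			return p->e_rq_pool;	/* erases (and unknowns) */
		}
	}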
@@ -190,10 +234,9 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
190 234
191 WARN_ON(off + nr_pages != bio->bi_vcnt); 235 WARN_ON(off + nr_pages != bio->bi_vcnt);
192 236
193 bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
194 for (i = off; i < nr_pages + off; i++) { 237 for (i = off; i < nr_pages + off; i++) {
195 bv = bio->bi_io_vec[i]; 238 bv = bio->bi_io_vec[i];
196 mempool_free(bv.bv_page, pblk->page_pool); 239 mempool_free(bv.bv_page, pblk->page_bio_pool);
197 } 240 }
198} 241}
199 242
@@ -205,14 +248,12 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
205 int i, ret; 248 int i, ret;
206 249
207 for (i = 0; i < nr_pages; i++) { 250 for (i = 0; i < nr_pages; i++) {
208 page = mempool_alloc(pblk->page_pool, flags); 251 page = mempool_alloc(pblk->page_bio_pool, flags);
209 if (!page)
210 goto err;
211 252
212 ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); 253 ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
213 if (ret != PBLK_EXPOSED_PAGE_SIZE) { 254 if (ret != PBLK_EXPOSED_PAGE_SIZE) {
214 pr_err("pblk: could not add page to bio\n"); 255 pr_err("pblk: could not add page to bio\n");
215 mempool_free(page, pblk->page_pool); 256 mempool_free(page, pblk->page_bio_pool);
216 goto err; 257 goto err;
217 } 258 }
218 } 259 }
@@ -245,13 +286,6 @@ void pblk_write_should_kick(struct pblk *pblk)
245 pblk_write_kick(pblk); 286 pblk_write_kick(pblk);
246} 287}
247 288
248void pblk_end_bio_sync(struct bio *bio)
249{
250 struct completion *waiting = bio->bi_private;
251
252 complete(waiting);
253}
254
255void pblk_end_io_sync(struct nvm_rq *rqd) 289void pblk_end_io_sync(struct nvm_rq *rqd)
256{ 290{
257 struct completion *waiting = rqd->private; 291 struct completion *waiting = rqd->private;
@@ -259,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd)
259 complete(waiting); 293 complete(waiting);
260} 294}
261 295
262void pblk_wait_for_meta(struct pblk *pblk) 296static void pblk_wait_for_meta(struct pblk *pblk)
263{ 297{
264 do { 298 do {
265 if (!atomic_read(&pblk->inflight_io)) 299 if (!atomic_read(&pblk->inflight_io))
@@ -336,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
336 pblk_invalidate_range(pblk, slba, nr_secs); 370 pblk_invalidate_range(pblk, slba, nr_secs);
337} 371}
338 372
339struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
340{
341 struct ppa_addr ppa;
342
343 spin_lock(&pblk->trans_lock);
344 ppa = pblk_trans_map_get(pblk, lba);
345 spin_unlock(&pblk->trans_lock);
346
347 return ppa;
348}
349
350void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) 373void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
351{ 374{
352 atomic_long_inc(&pblk->write_failed); 375 atomic_long_inc(&pblk->write_failed);
@@ -389,39 +412,38 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
389 struct nvm_tgt_dev *dev = pblk->dev; 412 struct nvm_tgt_dev *dev = pblk->dev;
390 413
391#ifdef CONFIG_NVM_DEBUG 414#ifdef CONFIG_NVM_DEBUG
392 struct ppa_addr *ppa_list; 415 int ret;
393 416
394 ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; 417 ret = pblk_check_io(pblk, rqd);
395 if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { 418 if (ret)
396 WARN_ON(1); 419 return ret;
397 return -EINVAL; 420#endif
398 }
399 421
400 if (rqd->opcode == NVM_OP_PWRITE) { 422 atomic_inc(&pblk->inflight_io);
401 struct pblk_line *line;
402 struct ppa_addr ppa;
403 int i;
404 423
405 for (i = 0; i < rqd->nr_ppas; i++) { 424 return nvm_submit_io(dev, rqd);
406 ppa = ppa_list[i]; 425}
407 line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
408 426
409 spin_lock(&line->lock); 427int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
410 if (line->state != PBLK_LINESTATE_OPEN) { 428{
411 pr_err("pblk: bad ppa: line:%d,state:%d\n", 429 struct nvm_tgt_dev *dev = pblk->dev;
412 line->id, line->state); 430
413 WARN_ON(1); 431#ifdef CONFIG_NVM_DEBUG
414 spin_unlock(&line->lock); 432 int ret;
415 return -EINVAL; 433
416 } 434 ret = pblk_check_io(pblk, rqd);
417 spin_unlock(&line->lock); 435 if (ret)
418 } 436 return ret;
419 }
420#endif 437#endif
421 438
422 atomic_inc(&pblk->inflight_io); 439 atomic_inc(&pblk->inflight_io);
423 440
424 return nvm_submit_io(dev, rqd); 441 return nvm_submit_io_sync(dev, rqd);
442}
443
444static void pblk_bio_map_addr_endio(struct bio *bio)
445{
446 bio_put(bio);
425} 447}
426 448
427struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, 449struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
@@ -460,6 +482,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
460 482
461 kaddr += PAGE_SIZE; 483 kaddr += PAGE_SIZE;
462 } 484 }
485
486 bio->bi_end_io = pblk_bio_map_addr_endio;
463out: 487out:
464 return bio; 488 return bio;
465} 489}
@@ -486,12 +510,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
486 u64 addr; 510 u64 addr;
487 int i; 511 int i;
488 512
513 spin_lock(&line->lock);
489 addr = find_next_zero_bit(line->map_bitmap, 514 addr = find_next_zero_bit(line->map_bitmap,
490 pblk->lm.sec_per_line, line->cur_sec); 515 pblk->lm.sec_per_line, line->cur_sec);
491 line->cur_sec = addr - nr_secs; 516 line->cur_sec = addr - nr_secs;
492 517
493 for (i = 0; i < nr_secs; i++, line->cur_sec--) 518 for (i = 0; i < nr_secs; i++, line->cur_sec--)
494 WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); 519 WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap));
520 spin_unlock(&line->lock);
495} 521}
496 522
497u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) 523u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
@@ -565,12 +591,11 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
565 int cmd_op, bio_op; 591 int cmd_op, bio_op;
566 int i, j; 592 int i, j;
567 int ret; 593 int ret;
568 DECLARE_COMPLETION_ONSTACK(wait);
569 594
570 if (dir == WRITE) { 595 if (dir == PBLK_WRITE) {
571 bio_op = REQ_OP_WRITE; 596 bio_op = REQ_OP_WRITE;
572 cmd_op = NVM_OP_PWRITE; 597 cmd_op = NVM_OP_PWRITE;
573 } else if (dir == READ) { 598 } else if (dir == PBLK_READ) {
574 bio_op = REQ_OP_READ; 599 bio_op = REQ_OP_READ;
575 cmd_op = NVM_OP_PREAD; 600 cmd_op = NVM_OP_PREAD;
576 } else 601 } else
@@ -607,13 +632,11 @@ next_rq:
607 rqd.dma_ppa_list = dma_ppa_list; 632 rqd.dma_ppa_list = dma_ppa_list;
608 rqd.opcode = cmd_op; 633 rqd.opcode = cmd_op;
609 rqd.nr_ppas = rq_ppas; 634 rqd.nr_ppas = rq_ppas;
610 rqd.end_io = pblk_end_io_sync;
611 rqd.private = &wait;
612 635
613 if (dir == WRITE) { 636 if (dir == PBLK_WRITE) {
614 struct pblk_sec_meta *meta_list = rqd.meta_list; 637 struct pblk_sec_meta *meta_list = rqd.meta_list;
615 638
616 rqd.flags = pblk_set_progr_mode(pblk, WRITE); 639 rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
617 for (i = 0; i < rqd.nr_ppas; ) { 640 for (i = 0; i < rqd.nr_ppas; ) {
618 spin_lock(&line->lock); 641 spin_lock(&line->lock);
619 paddr = __pblk_alloc_page(pblk, line, min); 642 paddr = __pblk_alloc_page(pblk, line, min);
@@ -662,25 +685,17 @@ next_rq:
662 } 685 }
663 } 686 }
664 687
665 ret = pblk_submit_io(pblk, &rqd); 688 ret = pblk_submit_io_sync(pblk, &rqd);
666 if (ret) { 689 if (ret) {
667 pr_err("pblk: emeta I/O submission failed: %d\n", ret); 690 pr_err("pblk: emeta I/O submission failed: %d\n", ret);
668 bio_put(bio); 691 bio_put(bio);
669 goto free_rqd_dma; 692 goto free_rqd_dma;
670 } 693 }
671 694
672 if (!wait_for_completion_io_timeout(&wait,
673 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
674 pr_err("pblk: emeta I/O timed out\n");
675 }
676 atomic_dec(&pblk->inflight_io); 695 atomic_dec(&pblk->inflight_io);
677 reinit_completion(&wait);
678
679 if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META))
680 bio_put(bio);
681 696
682 if (rqd.error) { 697 if (rqd.error) {
683 if (dir == WRITE) 698 if (dir == PBLK_WRITE)
684 pblk_log_write_err(pblk, &rqd); 699 pblk_log_write_err(pblk, &rqd);
685 else 700 else
686 pblk_log_read_err(pblk, &rqd); 701 pblk_log_read_err(pblk, &rqd);
@@ -721,14 +736,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
721 int i, ret; 736 int i, ret;
722 int cmd_op, bio_op; 737 int cmd_op, bio_op;
723 int flags; 738 int flags;
724 DECLARE_COMPLETION_ONSTACK(wait);
725 739
726 if (dir == WRITE) { 740 if (dir == PBLK_WRITE) {
727 bio_op = REQ_OP_WRITE; 741 bio_op = REQ_OP_WRITE;
728 cmd_op = NVM_OP_PWRITE; 742 cmd_op = NVM_OP_PWRITE;
729 flags = pblk_set_progr_mode(pblk, WRITE); 743 flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
730 lba_list = emeta_to_lbas(pblk, line->emeta->buf); 744 lba_list = emeta_to_lbas(pblk, line->emeta->buf);
731 } else if (dir == READ) { 745 } else if (dir == PBLK_READ) {
732 bio_op = REQ_OP_READ; 746 bio_op = REQ_OP_READ;
733 cmd_op = NVM_OP_PREAD; 747 cmd_op = NVM_OP_PREAD;
734 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 748 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -758,15 +772,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
758 rqd.opcode = cmd_op; 772 rqd.opcode = cmd_op;
759 rqd.flags = flags; 773 rqd.flags = flags;
760 rqd.nr_ppas = lm->smeta_sec; 774 rqd.nr_ppas = lm->smeta_sec;
761 rqd.end_io = pblk_end_io_sync;
762 rqd.private = &wait;
763 775
764 for (i = 0; i < lm->smeta_sec; i++, paddr++) { 776 for (i = 0; i < lm->smeta_sec; i++, paddr++) {
765 struct pblk_sec_meta *meta_list = rqd.meta_list; 777 struct pblk_sec_meta *meta_list = rqd.meta_list;
766 778
767 rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); 779 rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
768 780
769 if (dir == WRITE) { 781 if (dir == PBLK_WRITE) {
770 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); 782 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
771 783
772 meta_list[i].lba = lba_list[paddr] = addr_empty; 784 meta_list[i].lba = lba_list[paddr] = addr_empty;
@@ -778,21 +790,17 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
778 * the write thread is the only one sending write and erase commands, 790 * the write thread is the only one sending write and erase commands,
779 * there is no need to take the LUN semaphore. 791 * there is no need to take the LUN semaphore.
780 */ 792 */
781 ret = pblk_submit_io(pblk, &rqd); 793 ret = pblk_submit_io_sync(pblk, &rqd);
782 if (ret) { 794 if (ret) {
783 pr_err("pblk: smeta I/O submission failed: %d\n", ret); 795 pr_err("pblk: smeta I/O submission failed: %d\n", ret);
784 bio_put(bio); 796 bio_put(bio);
785 goto free_ppa_list; 797 goto free_ppa_list;
786 } 798 }
787 799
788 if (!wait_for_completion_io_timeout(&wait,
789 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
790 pr_err("pblk: smeta I/O timed out\n");
791 }
792 atomic_dec(&pblk->inflight_io); 800 atomic_dec(&pblk->inflight_io);
793 801
794 if (rqd.error) { 802 if (rqd.error) {
795 if (dir == WRITE) 803 if (dir == PBLK_WRITE)
796 pblk_log_write_err(pblk, &rqd); 804 pblk_log_write_err(pblk, &rqd);
797 else 805 else
798 pblk_log_read_err(pblk, &rqd); 806 pblk_log_read_err(pblk, &rqd);
@@ -808,14 +816,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
808{ 816{
809 u64 bpaddr = pblk_line_smeta_start(pblk, line); 817 u64 bpaddr = pblk_line_smeta_start(pblk, line);
810 818
811 return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); 819 return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
812} 820}
813 821
814int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, 822int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
815 void *emeta_buf) 823 void *emeta_buf)
816{ 824{
817 return pblk_line_submit_emeta_io(pblk, line, emeta_buf, 825 return pblk_line_submit_emeta_io(pblk, line, emeta_buf,
818 line->emeta_ssec, READ); 826 line->emeta_ssec, PBLK_READ);
819} 827}
820 828
821static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, 829static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
@@ -824,7 +832,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
824 rqd->opcode = NVM_OP_ERASE; 832 rqd->opcode = NVM_OP_ERASE;
825 rqd->ppa_addr = ppa; 833 rqd->ppa_addr = ppa;
826 rqd->nr_ppas = 1; 834 rqd->nr_ppas = 1;
827 rqd->flags = pblk_set_progr_mode(pblk, ERASE); 835 rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE);
828 rqd->bio = NULL; 836 rqd->bio = NULL;
829} 837}
830 838
@@ -832,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
832{ 840{
833 struct nvm_rq rqd; 841 struct nvm_rq rqd;
834 int ret = 0; 842 int ret = 0;
835 DECLARE_COMPLETION_ONSTACK(wait);
836 843
837 memset(&rqd, 0, sizeof(struct nvm_rq)); 844 memset(&rqd, 0, sizeof(struct nvm_rq));
838 845
839 pblk_setup_e_rq(pblk, &rqd, ppa); 846 pblk_setup_e_rq(pblk, &rqd, ppa);
840 847
841 rqd.end_io = pblk_end_io_sync;
842 rqd.private = &wait;
843
844 /* The write thread schedules erases so that it minimizes disturbances 848 /* The write thread schedules erases so that it minimizes disturbances
845 * with writes. Thus, there is no need to take the LUN semaphore. 849 * with writes. Thus, there is no need to take the LUN semaphore.
846 */ 850 */
847 ret = pblk_submit_io(pblk, &rqd); 851 ret = pblk_submit_io_sync(pblk, &rqd);
848 if (ret) { 852 if (ret) {
849 struct nvm_tgt_dev *dev = pblk->dev; 853 struct nvm_tgt_dev *dev = pblk->dev;
850 struct nvm_geo *geo = &dev->geo; 854 struct nvm_geo *geo = &dev->geo;
@@ -857,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
857 goto out; 861 goto out;
858 } 862 }
859 863
860 if (!wait_for_completion_io_timeout(&wait,
861 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
862 pr_err("pblk: sync erase timed out\n");
863 }
864
865out: 864out:
866 rqd.private = pblk; 865 rqd.private = pblk;
867 __pblk_end_io_erase(pblk, &rqd); 866 __pblk_end_io_erase(pblk, &rqd);
@@ -976,7 +975,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
976 memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); 975 memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
977 smeta_buf->header.id = cpu_to_le32(line->id); 976 smeta_buf->header.id = cpu_to_le32(line->id);
978 smeta_buf->header.type = cpu_to_le16(line->type); 977 smeta_buf->header.type = cpu_to_le16(line->type);
979 smeta_buf->header.version = cpu_to_le16(1); 978 smeta_buf->header.version = SMETA_VERSION;
980 979
981 /* Start metadata */ 980 /* Start metadata */
982 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); 981 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
@@ -1046,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1046 line->smeta_ssec = off; 1045 line->smeta_ssec = off;
1047 line->cur_sec = off + lm->smeta_sec; 1046 line->cur_sec = off + lm->smeta_sec;
1048 1047
1049 if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { 1048 if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
1050 pr_debug("pblk: line smeta I/O failed. Retry\n"); 1049 pr_debug("pblk: line smeta I/O failed. Retry\n");
1051 return 1; 1050 return 1;
1052 } 1051 }
@@ -1056,7 +1055,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1056 /* Mark emeta metadata sectors as bad sectors. We need to consider bad 1055 /* Mark emeta metadata sectors as bad sectors. We need to consider bad
1057 * blocks to make sure that there are enough sectors to store emeta 1056 * blocks to make sure that there are enough sectors to store emeta
1058 */ 1057 */
1059 bit = lm->sec_per_line;
1060 off = lm->sec_per_line - lm->emeta_sec[0]; 1058 off = lm->sec_per_line - lm->emeta_sec[0];
1061 bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); 1059 bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]);
1062 while (nr_bb) { 1060 while (nr_bb) {
@@ -1093,25 +1091,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
1093 struct pblk_line_meta *lm = &pblk->lm; 1091 struct pblk_line_meta *lm = &pblk->lm;
1094 int blk_in_line = atomic_read(&line->blk_in_line); 1092 int blk_in_line = atomic_read(&line->blk_in_line);
1095 1093
1096 line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); 1094 line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
1097 if (!line->map_bitmap) 1095 if (!line->map_bitmap)
1098 return -ENOMEM; 1096 return -ENOMEM;
1099 memset(line->map_bitmap, 0, lm->sec_bitmap_len);
1100 1097
1101 /* invalid_bitmap is special since it is used when line is closed. No 1098 /* will be initialized using bb info from map_bitmap */
1102 * need to zeroized; it will be initialized using bb info form 1099 line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC);
1103 * map_bitmap
1104 */
1105 line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
1106 if (!line->invalid_bitmap) { 1100 if (!line->invalid_bitmap) {
1107 mempool_free(line->map_bitmap, pblk->line_meta_pool); 1101 kfree(line->map_bitmap);
1108 return -ENOMEM; 1102 return -ENOMEM;
1109 } 1103 }
1110 1104
1111 spin_lock(&line->lock); 1105 spin_lock(&line->lock);
1112 if (line->state != PBLK_LINESTATE_FREE) { 1106 if (line->state != PBLK_LINESTATE_FREE) {
1113 mempool_free(line->invalid_bitmap, pblk->line_meta_pool); 1107 kfree(line->map_bitmap);
1114 mempool_free(line->map_bitmap, pblk->line_meta_pool); 1108 kfree(line->invalid_bitmap);
1115 spin_unlock(&line->lock); 1109 spin_unlock(&line->lock);
1116 WARN(1, "pblk: corrupted line %d, state %d\n", 1110 WARN(1, "pblk: corrupted line %d, state %d\n",
1117 line->id, line->state); 1111 line->id, line->state);
@@ -1163,7 +1157,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
1163 1157
1164void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) 1158void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
1165{ 1159{
1166 mempool_free(line->map_bitmap, pblk->line_meta_pool); 1160 kfree(line->map_bitmap);
1167 line->map_bitmap = NULL; 1161 line->map_bitmap = NULL;
1168 line->smeta = NULL; 1162 line->smeta = NULL;
1169 line->emeta = NULL; 1163 line->emeta = NULL;
@@ -1328,6 +1322,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line)
1328 pblk->state = PBLK_STATE_STOPPING; 1322 pblk->state = PBLK_STATE_STOPPING;
1329} 1323}
1330 1324
1325static void pblk_line_close_meta_sync(struct pblk *pblk)
1326{
1327 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1328 struct pblk_line_meta *lm = &pblk->lm;
1329 struct pblk_line *line, *tline;
1330 LIST_HEAD(list);
1331
1332 spin_lock(&l_mg->close_lock);
1333 if (list_empty(&l_mg->emeta_list)) {
1334 spin_unlock(&l_mg->close_lock);
1335 return;
1336 }
1337
1338 list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev);
1339 spin_unlock(&l_mg->close_lock);
1340
1341 list_for_each_entry_safe(line, tline, &list, list) {
1342 struct pblk_emeta *emeta = line->emeta;
1343
1344 while (emeta->mem < lm->emeta_len[0]) {
1345 int ret;
1346
1347 ret = pblk_submit_meta_io(pblk, line);
1348 if (ret) {
1349 pr_err("pblk: sync meta line %d failed (%d)\n",
1350 line->id, ret);
1351 return;
1352 }
1353 }
1354 }
1355
1356 pblk_wait_for_meta(pblk);
1357 flush_workqueue(pblk->close_wq);
1358}
1359
1331void pblk_pipeline_stop(struct pblk *pblk) 1360void pblk_pipeline_stop(struct pblk *pblk)
1332{ 1361{
1333 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1362 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1361,17 +1390,17 @@ void pblk_pipeline_stop(struct pblk *pblk)
1361 spin_unlock(&l_mg->free_lock); 1390 spin_unlock(&l_mg->free_lock);
1362} 1391}
1363 1392
1364void pblk_line_replace_data(struct pblk *pblk) 1393struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
1365{ 1394{
1366 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1395 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1367 struct pblk_line *cur, *new; 1396 struct pblk_line *cur, *new = NULL;
1368 unsigned int left_seblks; 1397 unsigned int left_seblks;
1369 int is_next = 0; 1398 int is_next = 0;
1370 1399
1371 cur = l_mg->data_line; 1400 cur = l_mg->data_line;
1372 new = l_mg->data_next; 1401 new = l_mg->data_next;
1373 if (!new) 1402 if (!new)
1374 return; 1403 goto out;
1375 l_mg->data_line = new; 1404 l_mg->data_line = new;
1376 1405
1377 spin_lock(&l_mg->free_lock); 1406 spin_lock(&l_mg->free_lock);
@@ -1379,7 +1408,7 @@ void pblk_line_replace_data(struct pblk *pblk)
1379 l_mg->data_line = NULL; 1408 l_mg->data_line = NULL;
1380 l_mg->data_next = NULL; 1409 l_mg->data_next = NULL;
1381 spin_unlock(&l_mg->free_lock); 1410 spin_unlock(&l_mg->free_lock);
1382 return; 1411 goto out;
1383 } 1412 }
1384 1413
1385 pblk_line_setup_metadata(new, l_mg, &pblk->lm); 1414 pblk_line_setup_metadata(new, l_mg, &pblk->lm);
@@ -1391,7 +1420,7 @@ retry_erase:
1391 /* If line is not fully erased, erase it */ 1420 /* If line is not fully erased, erase it */
1392 if (atomic_read(&new->left_eblks)) { 1421 if (atomic_read(&new->left_eblks)) {
1393 if (pblk_line_erase(pblk, new)) 1422 if (pblk_line_erase(pblk, new))
1394 return; 1423 goto out;
1395 } else { 1424 } else {
1396 io_schedule(); 1425 io_schedule();
1397 } 1426 }
@@ -1402,7 +1431,7 @@ retry_setup:
1402 if (!pblk_line_init_metadata(pblk, new, cur)) { 1431 if (!pblk_line_init_metadata(pblk, new, cur)) {
1403 new = pblk_line_retry(pblk, new); 1432 new = pblk_line_retry(pblk, new);
1404 if (!new) 1433 if (!new)
1405 return; 1434 goto out;
1406 1435
1407 goto retry_setup; 1436 goto retry_setup;
1408 } 1437 }
@@ -1410,7 +1439,7 @@ retry_setup:
1410 if (!pblk_line_init_bb(pblk, new, 1)) { 1439 if (!pblk_line_init_bb(pblk, new, 1)) {
1411 new = pblk_line_retry(pblk, new); 1440 new = pblk_line_retry(pblk, new);
1412 if (!new) 1441 if (!new)
1413 return; 1442 goto out;
1414 1443
1415 goto retry_setup; 1444 goto retry_setup;
1416 } 1445 }
@@ -1434,14 +1463,15 @@ retry_setup:
1434 1463
1435 if (is_next) 1464 if (is_next)
1436 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); 1465 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
1466
1467out:
1468 return new;
1437} 1469}
1438 1470
1439void pblk_line_free(struct pblk *pblk, struct pblk_line *line) 1471void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
1440{ 1472{
1441 if (line->map_bitmap) 1473 kfree(line->map_bitmap);
1442 mempool_free(line->map_bitmap, pblk->line_meta_pool); 1474 kfree(line->invalid_bitmap);
1443 if (line->invalid_bitmap)
1444 mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
1445 1475
1446 *line->vsc = cpu_to_le32(EMPTY_ENTRY); 1476 *line->vsc = cpu_to_le32(EMPTY_ENTRY);
1447 1477
@@ -1451,11 +1481,10 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
1451 line->emeta = NULL; 1481 line->emeta = NULL;
1452} 1482}
1453 1483
1454void pblk_line_put(struct kref *ref) 1484static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
1455{ 1485{
1456 struct pblk_line *line = container_of(ref, struct pblk_line, ref);
1457 struct pblk *pblk = line->pblk;
1458 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1486 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1487 struct pblk_gc *gc = &pblk->gc;
1459 1488
1460 spin_lock(&line->lock); 1489 spin_lock(&line->lock);
1461 WARN_ON(line->state != PBLK_LINESTATE_GC); 1490 WARN_ON(line->state != PBLK_LINESTATE_GC);
@@ -1464,6 +1493,8 @@ void pblk_line_put(struct kref *ref)
1464 pblk_line_free(pblk, line); 1493 pblk_line_free(pblk, line);
1465 spin_unlock(&line->lock); 1494 spin_unlock(&line->lock);
1466 1495
1496 atomic_dec(&gc->pipeline_gc);
1497
1467 spin_lock(&l_mg->free_lock); 1498 spin_lock(&l_mg->free_lock);
1468 list_add_tail(&line->list, &l_mg->free_list); 1499 list_add_tail(&line->list, &l_mg->free_list);
1469 l_mg->nr_free_lines++; 1500 l_mg->nr_free_lines++;
@@ -1472,13 +1503,49 @@ void pblk_line_put(struct kref *ref)
1472 pblk_rl_free_lines_inc(&pblk->rl, line); 1503 pblk_rl_free_lines_inc(&pblk->rl, line);
1473} 1504}
1474 1505
1506static void pblk_line_put_ws(struct work_struct *work)
1507{
1508 struct pblk_line_ws *line_put_ws = container_of(work,
1509 struct pblk_line_ws, ws);
1510 struct pblk *pblk = line_put_ws->pblk;
1511 struct pblk_line *line = line_put_ws->line;
1512
1513 __pblk_line_put(pblk, line);
1514 mempool_free(line_put_ws, pblk->gen_ws_pool);
1515}
1516
1517void pblk_line_put(struct kref *ref)
1518{
1519 struct pblk_line *line = container_of(ref, struct pblk_line, ref);
1520 struct pblk *pblk = line->pblk;
1521
1522 __pblk_line_put(pblk, line);
1523}
1524
1525void pblk_line_put_wq(struct kref *ref)
1526{
1527 struct pblk_line *line = container_of(ref, struct pblk_line, ref);
1528 struct pblk *pblk = line->pblk;
1529 struct pblk_line_ws *line_put_ws;
1530
1531 line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
1532 if (!line_put_ws)
1533 return;
1534
1535 line_put_ws->pblk = pblk;
1536 line_put_ws->line = line;
1537 line_put_ws->priv = NULL;
1538
1539 INIT_WORK(&line_put_ws->ws, pblk_line_put_ws);
1540 queue_work(pblk->r_end_wq, &line_put_ws->ws);
1541}
1542
1475int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) 1543int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
1476{ 1544{
1477 struct nvm_rq *rqd; 1545 struct nvm_rq *rqd;
1478 int err; 1546 int err;
1479 1547
1480 rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); 1548 rqd = pblk_alloc_rqd(pblk, PBLK_ERASE);
1481 memset(rqd, 0, pblk_g_rq_size);
1482 1549
1483 pblk_setup_e_rq(pblk, rqd, ppa); 1550 pblk_setup_e_rq(pblk, rqd, ppa);
1484 1551
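pblk_line_put_wq() exists so that the last reference drop from a read completion, which runs in interrupt context, does not free the line inline; the release is bounced to a workqueue instead. A minimal sketch of that shape, assuming kernel context and illustrative names:

	#include <linux/kernel.h>
	#include <linux/kref.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct example_obj {
		struct kref ref;
		struct work_struct ws;
	};

	static void example_free_ws(struct work_struct *work)
	{
		kfree(container_of(work, struct example_obj, ws));
	}

	static void example_release_wq(struct kref *ref)
	{
		struct example_obj *obj = container_of(ref, struct example_obj, ref);

		INIT_WORK(&obj->ws, example_free_ws);
		schedule_work(&obj->ws);	/* defer teardown out of IRQ context */
	}

	/* callers that may run in IRQ context use the deferred release */
	static void example_put_wq(struct example_obj *obj)
	{
		kref_put(&obj->ref, example_release_wq);
	}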
@@ -1517,41 +1584,6 @@ int pblk_line_is_full(struct pblk_line *line)
1517 return (line->left_msecs == 0); 1584 return (line->left_msecs == 0);
1518} 1585}
1519 1586
1520void pblk_line_close_meta_sync(struct pblk *pblk)
1521{
1522 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1523 struct pblk_line_meta *lm = &pblk->lm;
1524 struct pblk_line *line, *tline;
1525 LIST_HEAD(list);
1526
1527 spin_lock(&l_mg->close_lock);
1528 if (list_empty(&l_mg->emeta_list)) {
1529 spin_unlock(&l_mg->close_lock);
1530 return;
1531 }
1532
1533 list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev);
1534 spin_unlock(&l_mg->close_lock);
1535
1536 list_for_each_entry_safe(line, tline, &list, list) {
1537 struct pblk_emeta *emeta = line->emeta;
1538
1539 while (emeta->mem < lm->emeta_len[0]) {
1540 int ret;
1541
1542 ret = pblk_submit_meta_io(pblk, line);
1543 if (ret) {
1544 pr_err("pblk: sync meta line %d failed (%d)\n",
1545 line->id, ret);
1546 return;
1547 }
1548 }
1549 }
1550
1551 pblk_wait_for_meta(pblk);
1552 flush_workqueue(pblk->close_wq);
1553}
1554
1555static void pblk_line_should_sync_meta(struct pblk *pblk) 1587static void pblk_line_should_sync_meta(struct pblk *pblk)
1556{ 1588{
1557 if (pblk_rl_is_limit(&pblk->rl)) 1589 if (pblk_rl_is_limit(&pblk->rl))
@@ -1582,15 +1614,13 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
1582 1614
1583 list_add_tail(&line->list, move_list); 1615 list_add_tail(&line->list, move_list);
1584 1616
1585 mempool_free(line->map_bitmap, pblk->line_meta_pool); 1617 kfree(line->map_bitmap);
1586 line->map_bitmap = NULL; 1618 line->map_bitmap = NULL;
1587 line->smeta = NULL; 1619 line->smeta = NULL;
1588 line->emeta = NULL; 1620 line->emeta = NULL;
1589 1621
1590 spin_unlock(&line->lock); 1622 spin_unlock(&line->lock);
1591 spin_unlock(&l_mg->gc_lock); 1623 spin_unlock(&l_mg->gc_lock);
1592
1593 pblk_gc_should_kick(pblk);
1594} 1624}
1595 1625
1596void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) 1626void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
@@ -1624,43 +1654,16 @@ void pblk_line_close_ws(struct work_struct *work)
1624 struct pblk_line *line = line_ws->line; 1654 struct pblk_line *line = line_ws->line;
1625 1655
1626 pblk_line_close(pblk, line); 1656 pblk_line_close(pblk, line);
1627 mempool_free(line_ws, pblk->line_ws_pool); 1657 mempool_free(line_ws, pblk->gen_ws_pool);
1628}
1629
1630void pblk_line_mark_bb(struct work_struct *work)
1631{
1632 struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
1633 ws);
1634 struct pblk *pblk = line_ws->pblk;
1635 struct nvm_tgt_dev *dev = pblk->dev;
1636 struct ppa_addr *ppa = line_ws->priv;
1637 int ret;
1638
1639 ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
1640 if (ret) {
1641 struct pblk_line *line;
1642 int pos;
1643
1644 line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
1645 pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
1646
1647 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
1648 line->id, pos);
1649 }
1650
1651 kfree(ppa);
1652 mempool_free(line_ws, pblk->line_ws_pool);
1653} 1658}
1654 1659
1655void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, 1660void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
1656 void (*work)(struct work_struct *), 1661 void (*work)(struct work_struct *), gfp_t gfp_mask,
1657 struct workqueue_struct *wq) 1662 struct workqueue_struct *wq)
1658{ 1663{
1659 struct pblk_line_ws *line_ws; 1664 struct pblk_line_ws *line_ws;
1660 1665
1661 line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); 1666 line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask);
1662 if (!line_ws)
1663 return;
1664 1667
1665 line_ws->pblk = pblk; 1668 line_ws->pblk = pblk;
1666 line_ws->line = line; 1669 line_ws->line = line;
@@ -1689,16 +1692,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
1689#endif 1692#endif
1690 1693
1691 ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); 1694 ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
1692 if (ret) { 1695 if (ret == -ETIME || ret == -EINTR)
1693 switch (ret) { 1696 pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret);
1694 case -ETIME:
1695 pr_err("pblk: lun semaphore timed out\n");
1696 break;
1697 case -EINTR:
1698 pr_err("pblk: lun semaphore timed out\n");
1699 break;
1700 }
1701 }
1702} 1697}
1703 1698
1704void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) 1699void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
@@ -1758,13 +1753,11 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
1758 rlun = &pblk->luns[bit]; 1753 rlun = &pblk->luns[bit];
1759 up(&rlun->wr_sem); 1754 up(&rlun->wr_sem);
1760 } 1755 }
1761
1762 kfree(lun_bitmap);
1763} 1756}
1764 1757
1765void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) 1758void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1766{ 1759{
1767 struct ppa_addr l2p_ppa; 1760 struct ppa_addr ppa_l2p;
1768 1761
1769 /* logic error: lba out-of-bounds. Ignore update */ 1762 /* logic error: lba out-of-bounds. Ignore update */
1770 if (!(lba < pblk->rl.nr_secs)) { 1763 if (!(lba < pblk->rl.nr_secs)) {
@@ -1773,10 +1766,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1773 } 1766 }
1774 1767
1775 spin_lock(&pblk->trans_lock); 1768 spin_lock(&pblk->trans_lock);
1776 l2p_ppa = pblk_trans_map_get(pblk, lba); 1769 ppa_l2p = pblk_trans_map_get(pblk, lba);
1777 1770
1778 if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa)) 1771 if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p))
1779 pblk_map_invalidate(pblk, l2p_ppa); 1772 pblk_map_invalidate(pblk, ppa_l2p);
1780 1773
1781 pblk_trans_map_set(pblk, lba, ppa); 1774 pblk_trans_map_set(pblk, lba, ppa);
1782 spin_unlock(&pblk->trans_lock); 1775 spin_unlock(&pblk->trans_lock);
@@ -1784,6 +1777,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1784 1777
1785void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) 1778void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1786{ 1779{
1780
1787#ifdef CONFIG_NVM_DEBUG 1781#ifdef CONFIG_NVM_DEBUG
1788 /* Callers must ensure that the ppa points to a cache address */ 1782 /* Callers must ensure that the ppa points to a cache address */
1789 BUG_ON(!pblk_addr_in_cache(ppa)); 1783 BUG_ON(!pblk_addr_in_cache(ppa));
@@ -1793,16 +1787,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1793 pblk_update_map(pblk, lba, ppa); 1787 pblk_update_map(pblk, lba, ppa);
1794} 1788}
1795 1789
1796int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, 1790int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
1797 struct pblk_line *gc_line) 1791 struct pblk_line *gc_line, u64 paddr_gc)
1798{ 1792{
1799 struct ppa_addr l2p_ppa; 1793 struct ppa_addr ppa_l2p, ppa_gc;
1800 int ret = 1; 1794 int ret = 1;
1801 1795
1802#ifdef CONFIG_NVM_DEBUG 1796#ifdef CONFIG_NVM_DEBUG
1803 /* Callers must ensure that the ppa points to a cache address */ 1797 /* Callers must ensure that the ppa points to a cache address */
1804 BUG_ON(!pblk_addr_in_cache(ppa)); 1798 BUG_ON(!pblk_addr_in_cache(ppa_new));
1805 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); 1799 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
1806#endif 1800#endif
1807 1801
1808 /* logic error: lba out-of-bounds. Ignore update */ 1802 /* logic error: lba out-of-bounds. Ignore update */
@@ -1812,36 +1806,41 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
1812 } 1806 }
1813 1807
1814 spin_lock(&pblk->trans_lock); 1808 spin_lock(&pblk->trans_lock);
1815 l2p_ppa = pblk_trans_map_get(pblk, lba); 1809 ppa_l2p = pblk_trans_map_get(pblk, lba);
1810 ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id);
1811
1812 if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) {
1813 spin_lock(&gc_line->lock);
1814 WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap),
1815 "pblk: corrupted GC update");
1816 spin_unlock(&gc_line->lock);
1816 1817
1817 /* Prevent updated entries to be overwritten by GC */
1818 if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) ||
1819 pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) {
1820 ret = 0; 1818 ret = 0;
1821 goto out; 1819 goto out;
1822 } 1820 }
1823 1821
1824 pblk_trans_map_set(pblk, lba, ppa); 1822 pblk_trans_map_set(pblk, lba, ppa_new);
1825out: 1823out:
1826 spin_unlock(&pblk->trans_lock); 1824 spin_unlock(&pblk->trans_lock);
1827 return ret; 1825 return ret;
1828} 1826}
1829 1827
1830void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, 1828void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
1831 struct ppa_addr entry_line) 1829 struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache)
1832{ 1830{
1833 struct ppa_addr l2p_line; 1831 struct ppa_addr ppa_l2p;
1834 1832
1835#ifdef CONFIG_NVM_DEBUG 1833#ifdef CONFIG_NVM_DEBUG
1836 /* Callers must ensure that the ppa points to a device address */ 1834 /* Callers must ensure that the ppa points to a device address */
1837 BUG_ON(pblk_addr_in_cache(ppa)); 1835 BUG_ON(pblk_addr_in_cache(ppa_mapped));
1838#endif 1836#endif
1839 /* Invalidate and discard padded entries */ 1837 /* Invalidate and discard padded entries */
1840 if (lba == ADDR_EMPTY) { 1838 if (lba == ADDR_EMPTY) {
1841#ifdef CONFIG_NVM_DEBUG 1839#ifdef CONFIG_NVM_DEBUG
1842 atomic_long_inc(&pblk->padded_wb); 1840 atomic_long_inc(&pblk->padded_wb);
1843#endif 1841#endif
1844 pblk_map_invalidate(pblk, ppa); 1842 if (!pblk_ppa_empty(ppa_mapped))
1843 pblk_map_invalidate(pblk, ppa_mapped);
1845 return; 1844 return;
1846 } 1845 }
1847 1846
@@ -1852,22 +1851,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
1852 } 1851 }
1853 1852
1854 spin_lock(&pblk->trans_lock); 1853 spin_lock(&pblk->trans_lock);
1855 l2p_line = pblk_trans_map_get(pblk, lba); 1854 ppa_l2p = pblk_trans_map_get(pblk, lba);
1856 1855
1857 /* Do not update L2P if the cacheline has been updated. In this case, 1856 /* Do not update L2P if the cacheline has been updated. In this case,
1858 * the mapped ppa must be invalidated 1857 * the mapped ppa must be invalidated
1859 */ 1858 */
1860 if (l2p_line.ppa != entry_line.ppa) { 1859 if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) {
1861 if (!pblk_ppa_empty(ppa)) 1860 if (!pblk_ppa_empty(ppa_mapped))
1862 pblk_map_invalidate(pblk, ppa); 1861 pblk_map_invalidate(pblk, ppa_mapped);
1863 goto out; 1862 goto out;
1864 } 1863 }
1865 1864
1866#ifdef CONFIG_NVM_DEBUG 1865#ifdef CONFIG_NVM_DEBUG
1867 WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); 1866 WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
1868#endif 1867#endif
1869 1868
1870 pblk_trans_map_set(pblk, lba, ppa); 1869 pblk_trans_map_set(pblk, lba, ppa_mapped);
1871out: 1870out:
1872 spin_unlock(&pblk->trans_lock); 1871 spin_unlock(&pblk->trans_lock);
1873} 1872}
@@ -1878,23 +1877,32 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
1878 int i; 1877 int i;
1879 1878
1880 spin_lock(&pblk->trans_lock); 1879 spin_lock(&pblk->trans_lock);
1881 for (i = 0; i < nr_secs; i++) 1880 for (i = 0; i < nr_secs; i++) {
1882 ppas[i] = pblk_trans_map_get(pblk, blba + i); 1881 struct ppa_addr ppa;
1882
1883 ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i);
1884
1885 /* If the L2P entry maps to a line, the reference is valid */
1886 if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
1887 int line_id = pblk_dev_ppa_to_line(ppa);
1888 struct pblk_line *line = &pblk->lines[line_id];
1889
1890 kref_get(&line->ref);
1891 }
1892 }
1883 spin_unlock(&pblk->trans_lock); 1893 spin_unlock(&pblk->trans_lock);
1884} 1894}
1885 1895
1886void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, 1896void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
1887 u64 *lba_list, int nr_secs) 1897 u64 *lba_list, int nr_secs)
1888{ 1898{
1889 sector_t lba; 1899 u64 lba;
1890 int i; 1900 int i;
1891 1901
1892 spin_lock(&pblk->trans_lock); 1902 spin_lock(&pblk->trans_lock);
1893 for (i = 0; i < nr_secs; i++) { 1903 for (i = 0; i < nr_secs; i++) {
1894 lba = lba_list[i]; 1904 lba = lba_list[i];
1895 if (lba == ADDR_EMPTY) { 1905 if (lba != ADDR_EMPTY) {
1896 ppas[i].ppa = ADDR_EMPTY;
1897 } else {
1898 /* logic error: lba out-of-bounds. Ignore update */ 1906 /* logic error: lba out-of-bounds. Ignore update */
1899 if (!(lba < pblk->rl.nr_secs)) { 1907 if (!(lba < pblk->rl.nr_secs)) {
1900 WARN(1, "pblk: corrupted L2P map request\n"); 1908 WARN(1, "pblk: corrupted L2P map request\n");
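The hunk above changes pblk_update_map_gc() to compare the current L2P entry against the exact physical address GC read the data from (ppa_gc) and to back off if a newer user write already remapped the LBA. A minimal userspace sketch of that check, assuming simplified stand-in types and a plain mutex in place of trans_lock; none of these names are the pblk structures themselves:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define N_LBAS 1024

struct l2p_model {
	pthread_mutex_t lock;          /* stand-in for pblk->trans_lock */
	uint64_t map[N_LBAS];          /* lba -> packed physical/cache address */
};

/* Returns true if GC installed its cache address, false if a newer user
 * write already moved the LBA and GC must drop its stale copy. */
static bool gc_update_map(struct l2p_model *l2p, uint64_t lba,
			  uint64_t cache_addr, uint64_t gc_src_addr)
{
	bool updated = false;

	pthread_mutex_lock(&l2p->lock);
	if (l2p->map[lba] == gc_src_addr) {    /* mapping unchanged since GC read it */
		l2p->map[lba] = cache_addr;    /* point the LBA at the GC copy in cache */
		updated = true;
	}
	pthread_mutex_unlock(&l2p->lock);

	return updated;
}

int main(void)
{
	static struct l2p_model l2p;

	pthread_mutex_init(&l2p.lock, NULL);
	l2p.map[7] = 0x100;                                     /* GC read LBA 7 from 0x100 */
	printf("%d\n", gc_update_map(&l2p, 7, 0xc0de, 0x100));  /* 1: applied */
	printf("%d\n", gc_update_map(&l2p, 7, 0xc0de, 0x100));  /* 0: mapping moved already */
	pthread_mutex_destroy(&l2p.lock);
	return 0;
}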
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 6090d28f7995..00d5698d64a9 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -20,7 +20,8 @@
20 20
21static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) 21static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
22{ 22{
23 vfree(gc_rq->data); 23 if (gc_rq->data)
24 vfree(gc_rq->data);
24 kfree(gc_rq); 25 kfree(gc_rq);
25} 26}
26 27
@@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk)
41 spin_unlock(&gc->w_lock); 42 spin_unlock(&gc->w_lock);
42 43
43 list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { 44 list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
44 pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, 45 pblk_write_gc_to_cache(pblk, gc_rq);
45 gc_rq->nr_secs, gc_rq->secs_to_gc,
46 gc_rq->line, PBLK_IOTYPE_GC);
47
48 list_del(&gc_rq->list); 46 list_del(&gc_rq->list);
49 kref_put(&gc_rq->line->ref, pblk_line_put); 47 kref_put(&gc_rq->line->ref, pblk_line_put);
50 pblk_gc_free_gc_rq(gc_rq); 48 pblk_gc_free_gc_rq(gc_rq);
@@ -58,42 +56,59 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc)
58 wake_up_process(gc->gc_writer_ts); 56 wake_up_process(gc->gc_writer_ts);
59} 57}
60 58
61/* 59static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
62 * Responsible for managing all memory related to a gc request. Also in case of 60{
63 * failure 61 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
64 */ 62 struct list_head *move_list;
65static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) 63
64 spin_lock(&line->lock);
65 WARN_ON(line->state != PBLK_LINESTATE_GC);
66 line->state = PBLK_LINESTATE_CLOSED;
67 move_list = pblk_line_gc_list(pblk, line);
68 spin_unlock(&line->lock);
69
70 if (move_list) {
71 spin_lock(&l_mg->gc_lock);
72 list_add_tail(&line->list, move_list);
73 spin_unlock(&l_mg->gc_lock);
74 }
75}
76
77static void pblk_gc_line_ws(struct work_struct *work)
66{ 78{
79 struct pblk_line_ws *gc_rq_ws = container_of(work,
80 struct pblk_line_ws, ws);
81 struct pblk *pblk = gc_rq_ws->pblk;
67 struct nvm_tgt_dev *dev = pblk->dev; 82 struct nvm_tgt_dev *dev = pblk->dev;
68 struct nvm_geo *geo = &dev->geo; 83 struct nvm_geo *geo = &dev->geo;
69 struct pblk_gc *gc = &pblk->gc; 84 struct pblk_gc *gc = &pblk->gc;
70 struct pblk_line *line = gc_rq->line; 85 struct pblk_line *line = gc_rq_ws->line;
71 void *data; 86 struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
72 unsigned int secs_to_gc; 87 int ret;
73 int ret = 0;
74 88
75 data = vmalloc(gc_rq->nr_secs * geo->sec_size); 89 up(&gc->gc_sem);
76 if (!data) { 90
77 ret = -ENOMEM; 91 gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size);
92 if (!gc_rq->data) {
93 pr_err("pblk: could not GC line:%d (%d/%d)\n",
94 line->id, *line->vsc, gc_rq->nr_secs);
78 goto out; 95 goto out;
79 } 96 }
80 97
81 /* Read from GC victim block */ 98 /* Read from GC victim block */
82 if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, 99 ret = pblk_submit_read_gc(pblk, gc_rq);
83 &secs_to_gc, line)) { 100 if (ret) {
84 ret = -EFAULT; 101 pr_err("pblk: failed GC read in line:%d (err:%d)\n",
85 goto free_data; 102 line->id, ret);
103 goto out;
86 } 104 }
87 105
88 if (!secs_to_gc) 106 if (!gc_rq->secs_to_gc)
89 goto free_rq; 107 goto out;
90
91 gc_rq->data = data;
92 gc_rq->secs_to_gc = secs_to_gc;
93 108
94retry: 109retry:
95 spin_lock(&gc->w_lock); 110 spin_lock(&gc->w_lock);
96 if (gc->w_entries >= PBLK_GC_W_QD) { 111 if (gc->w_entries >= PBLK_GC_RQ_QD) {
97 spin_unlock(&gc->w_lock); 112 spin_unlock(&gc->w_lock);
98 pblk_gc_writer_kick(&pblk->gc); 113 pblk_gc_writer_kick(&pblk->gc);
99 usleep_range(128, 256); 114 usleep_range(128, 256);
@@ -105,53 +120,13 @@ retry:
105 120
106 pblk_gc_writer_kick(&pblk->gc); 121 pblk_gc_writer_kick(&pblk->gc);
107 122
108 return 0; 123 kfree(gc_rq_ws);
124 return;
109 125
110free_rq:
111 kfree(gc_rq);
112free_data:
113 vfree(data);
114out: 126out:
127 pblk_gc_free_gc_rq(gc_rq);
115 kref_put(&line->ref, pblk_line_put); 128 kref_put(&line->ref, pblk_line_put);
116 return ret; 129 kfree(gc_rq_ws);
117}
118
119static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
120{
121 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
122 struct list_head *move_list;
123
124 spin_lock(&line->lock);
125 WARN_ON(line->state != PBLK_LINESTATE_GC);
126 line->state = PBLK_LINESTATE_CLOSED;
127 move_list = pblk_line_gc_list(pblk, line);
128 spin_unlock(&line->lock);
129
130 if (move_list) {
131 spin_lock(&l_mg->gc_lock);
132 list_add_tail(&line->list, move_list);
133 spin_unlock(&l_mg->gc_lock);
134 }
135}
136
137static void pblk_gc_line_ws(struct work_struct *work)
138{
139 struct pblk_line_ws *line_rq_ws = container_of(work,
140 struct pblk_line_ws, ws);
141 struct pblk *pblk = line_rq_ws->pblk;
142 struct pblk_gc *gc = &pblk->gc;
143 struct pblk_line *line = line_rq_ws->line;
144 struct pblk_gc_rq *gc_rq = line_rq_ws->priv;
145
146 up(&gc->gc_sem);
147
148 if (pblk_gc_move_valid_secs(pblk, gc_rq)) {
149 pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n",
150 line->id, *line->vsc,
151 gc_rq->nr_secs);
152 }
153
154 mempool_free(line_rq_ws, pblk->line_ws_pool);
155} 130}
156 131
157static void pblk_gc_line_prepare_ws(struct work_struct *work) 132static void pblk_gc_line_prepare_ws(struct work_struct *work)
@@ -164,17 +139,24 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
164 struct pblk_line_meta *lm = &pblk->lm; 139 struct pblk_line_meta *lm = &pblk->lm;
165 struct pblk_gc *gc = &pblk->gc; 140 struct pblk_gc *gc = &pblk->gc;
166 struct line_emeta *emeta_buf; 141 struct line_emeta *emeta_buf;
167 struct pblk_line_ws *line_rq_ws; 142 struct pblk_line_ws *gc_rq_ws;
168 struct pblk_gc_rq *gc_rq; 143 struct pblk_gc_rq *gc_rq;
169 __le64 *lba_list; 144 __le64 *lba_list;
145 unsigned long *invalid_bitmap;
170 int sec_left, nr_secs, bit; 146 int sec_left, nr_secs, bit;
171 int ret; 147 int ret;
172 148
149 invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
150 if (!invalid_bitmap) {
151 pr_err("pblk: could not allocate GC invalid bitmap\n");
152 goto fail_free_ws;
153 }
154
173 emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, 155 emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
174 GFP_KERNEL); 156 GFP_KERNEL);
175 if (!emeta_buf) { 157 if (!emeta_buf) {
176 pr_err("pblk: cannot use GC emeta\n"); 158 pr_err("pblk: cannot use GC emeta\n");
177 return; 159 goto fail_free_bitmap;
178 } 160 }
179 161
180 ret = pblk_line_read_emeta(pblk, line, emeta_buf); 162 ret = pblk_line_read_emeta(pblk, line, emeta_buf);
@@ -193,7 +175,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
193 goto fail_free_emeta; 175 goto fail_free_emeta;
194 } 176 }
195 177
178 spin_lock(&line->lock);
179 bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line);
196 sec_left = pblk_line_vsc(line); 180 sec_left = pblk_line_vsc(line);
181 spin_unlock(&line->lock);
182
197 if (sec_left < 0) { 183 if (sec_left < 0) {
198 pr_err("pblk: corrupted GC line (%d)\n", line->id); 184 pr_err("pblk: corrupted GC line (%d)\n", line->id);
199 goto fail_free_emeta; 185 goto fail_free_emeta;
@@ -207,11 +193,12 @@ next_rq:
207 193
208 nr_secs = 0; 194 nr_secs = 0;
209 do { 195 do {
210 bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, 196 bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line,
211 bit + 1); 197 bit + 1);
212 if (bit > line->emeta_ssec) 198 if (bit > line->emeta_ssec)
213 break; 199 break;
214 200
201 gc_rq->paddr_list[nr_secs] = bit;
215 gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); 202 gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]);
216 } while (nr_secs < pblk->max_write_pgs); 203 } while (nr_secs < pblk->max_write_pgs);
217 204
@@ -223,19 +210,25 @@ next_rq:
223 gc_rq->nr_secs = nr_secs; 210 gc_rq->nr_secs = nr_secs;
224 gc_rq->line = line; 211 gc_rq->line = line;
225 212
226 line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); 213 gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
227 if (!line_rq_ws) 214 if (!gc_rq_ws)
228 goto fail_free_gc_rq; 215 goto fail_free_gc_rq;
229 216
230 line_rq_ws->pblk = pblk; 217 gc_rq_ws->pblk = pblk;
231 line_rq_ws->line = line; 218 gc_rq_ws->line = line;
232 line_rq_ws->priv = gc_rq; 219 gc_rq_ws->priv = gc_rq;
220
221 /* The write GC path can be much slower than the read GC one due to
222 * the budget imposed by the rate-limiter. Balance in case that we get
223 * back pressure from the write GC path.
224 */
225 while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000)))
226 io_schedule();
233 227
234 down(&gc->gc_sem);
235 kref_get(&line->ref); 228 kref_get(&line->ref);
236 229
237 INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); 230 INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws);
238 queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); 231 queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws);
239 232
240 sec_left -= nr_secs; 233 sec_left -= nr_secs;
241 if (sec_left > 0) 234 if (sec_left > 0)
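The down_timeout() loop above lets the GC reader wait for a writer-queue slot without sleeping unboundedly when the write side applies back pressure. A userspace sketch of the same pattern, assuming POSIX semaphores (sem_timedwait) as a stand-in for the kernel semaphore and keeping the 30-second budget from the patch:

#include <errno.h>
#include <sched.h>
#include <semaphore.h>
#include <time.h>

/* Block until a GC queue slot is free, waking every 30s to yield the CPU
 * rather than sleeping forever if the writer side is backed up. */
static void gc_wait_for_slot(sem_t *gc_sem)
{
	struct timespec ts;

	for (;;) {
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 30;                     /* same 30s budget as the patch */

		if (sem_timedwait(gc_sem, &ts) == 0)
			return;                      /* got a slot */
		if (errno != ETIMEDOUT)
			return;                      /* unexpected error: give up */

		sched_yield();                       /* back off, then retry */
	}
}

int main(void)
{
	sem_t gc_sem;

	sem_init(&gc_sem, 0, 1);     /* one free slot, so the wait returns at once */
	gc_wait_for_slot(&gc_sem);
	sem_destroy(&gc_sem);
	return 0;
}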
@@ -243,10 +236,11 @@ next_rq:
243 236
244out: 237out:
245 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); 238 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
246 mempool_free(line_ws, pblk->line_ws_pool); 239 kfree(line_ws);
240 kfree(invalid_bitmap);
247 241
248 kref_put(&line->ref, pblk_line_put); 242 kref_put(&line->ref, pblk_line_put);
249 atomic_dec(&gc->inflight_gc); 243 atomic_dec(&gc->read_inflight_gc);
250 244
251 return; 245 return;
252 246
@@ -254,10 +248,14 @@ fail_free_gc_rq:
254 kfree(gc_rq); 248 kfree(gc_rq);
255fail_free_emeta: 249fail_free_emeta:
256 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); 250 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
251fail_free_bitmap:
252 kfree(invalid_bitmap);
253fail_free_ws:
254 kfree(line_ws);
255
257 pblk_put_line_back(pblk, line); 256 pblk_put_line_back(pblk, line);
258 kref_put(&line->ref, pblk_line_put); 257 kref_put(&line->ref, pblk_line_put);
259 mempool_free(line_ws, pblk->line_ws_pool); 258 atomic_dec(&gc->read_inflight_gc);
260 atomic_dec(&gc->inflight_gc);
261 259
262 pr_err("pblk: Failed to GC line %d\n", line->id); 260 pr_err("pblk: Failed to GC line %d\n", line->id);
263} 261}
@@ -269,19 +267,40 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
269 267
270 pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); 268 pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
271 269
272 line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); 270 line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
273 if (!line_ws) 271 if (!line_ws)
274 return -ENOMEM; 272 return -ENOMEM;
275 273
276 line_ws->pblk = pblk; 274 line_ws->pblk = pblk;
277 line_ws->line = line; 275 line_ws->line = line;
278 276
277 atomic_inc(&gc->pipeline_gc);
279 INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); 278 INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws);
280 queue_work(gc->gc_reader_wq, &line_ws->ws); 279 queue_work(gc->gc_reader_wq, &line_ws->ws);
281 280
282 return 0; 281 return 0;
283} 282}
284 283
284static void pblk_gc_reader_kick(struct pblk_gc *gc)
285{
286 wake_up_process(gc->gc_reader_ts);
287}
288
289static void pblk_gc_kick(struct pblk *pblk)
290{
291 struct pblk_gc *gc = &pblk->gc;
292
293 pblk_gc_writer_kick(gc);
294 pblk_gc_reader_kick(gc);
295
296 /* If we're shutting down GC, let's not start it up again */
297 if (gc->gc_enabled) {
298 wake_up_process(gc->gc_ts);
299 mod_timer(&gc->gc_timer,
300 jiffies + msecs_to_jiffies(GC_TIME_MSECS));
301 }
302}
303
285static int pblk_gc_read(struct pblk *pblk) 304static int pblk_gc_read(struct pblk *pblk)
286{ 305{
287 struct pblk_gc *gc = &pblk->gc; 306 struct pblk_gc *gc = &pblk->gc;
@@ -305,11 +324,6 @@ static int pblk_gc_read(struct pblk *pblk)
305 return 0; 324 return 0;
306} 325}
307 326
308static void pblk_gc_reader_kick(struct pblk_gc *gc)
309{
310 wake_up_process(gc->gc_reader_ts);
311}
312
313static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, 327static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
314 struct list_head *group_list) 328 struct list_head *group_list)
315{ 329{
@@ -338,26 +352,17 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
338 return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); 352 return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free));
339} 353}
340 354
341/* 355void pblk_gc_free_full_lines(struct pblk *pblk)
342 * Lines with no valid sectors will be returned to the free list immediately. If
343 * GC is activated - either because the free block count is under the determined
344 * threshold, or because it is being forced from user space - only lines with a
345 * high count of invalid sectors will be recycled.
346 */
347static void pblk_gc_run(struct pblk *pblk)
348{ 356{
349 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 357 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
350 struct pblk_gc *gc = &pblk->gc; 358 struct pblk_gc *gc = &pblk->gc;
351 struct pblk_line *line; 359 struct pblk_line *line;
352 struct list_head *group_list;
353 bool run_gc;
354 int inflight_gc, gc_group = 0, prev_group = 0;
355 360
356 do { 361 do {
357 spin_lock(&l_mg->gc_lock); 362 spin_lock(&l_mg->gc_lock);
358 if (list_empty(&l_mg->gc_full_list)) { 363 if (list_empty(&l_mg->gc_full_list)) {
359 spin_unlock(&l_mg->gc_lock); 364 spin_unlock(&l_mg->gc_lock);
360 break; 365 return;
361 } 366 }
362 367
363 line = list_first_entry(&l_mg->gc_full_list, 368 line = list_first_entry(&l_mg->gc_full_list,
@@ -371,11 +376,30 @@ static void pblk_gc_run(struct pblk *pblk)
371 list_del(&line->list); 376 list_del(&line->list);
372 spin_unlock(&l_mg->gc_lock); 377 spin_unlock(&l_mg->gc_lock);
373 378
379 atomic_inc(&gc->pipeline_gc);
374 kref_put(&line->ref, pblk_line_put); 380 kref_put(&line->ref, pblk_line_put);
375 } while (1); 381 } while (1);
382}
383
384/*
385 * Lines with no valid sectors will be returned to the free list immediately. If
386 * GC is activated - either because the free block count is under the determined
387 * threshold, or because it is being forced from user space - only lines with a
388 * high count of invalid sectors will be recycled.
389 */
390static void pblk_gc_run(struct pblk *pblk)
391{
392 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
393 struct pblk_gc *gc = &pblk->gc;
394 struct pblk_line *line;
395 struct list_head *group_list;
396 bool run_gc;
397 int read_inflight_gc, gc_group = 0, prev_group = 0;
398
399 pblk_gc_free_full_lines(pblk);
376 400
377 run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); 401 run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
378 if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) 402 if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD))
379 return; 403 return;
380 404
381next_gc_group: 405next_gc_group:
@@ -402,14 +426,14 @@ next_gc_group:
402 list_add_tail(&line->list, &gc->r_list); 426 list_add_tail(&line->list, &gc->r_list);
403 spin_unlock(&gc->r_lock); 427 spin_unlock(&gc->r_lock);
404 428
405 inflight_gc = atomic_inc_return(&gc->inflight_gc); 429 read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc);
406 pblk_gc_reader_kick(gc); 430 pblk_gc_reader_kick(gc);
407 431
408 prev_group = 1; 432 prev_group = 1;
409 433
410 /* No need to queue up more GC lines than we can handle */ 434 /* No need to queue up more GC lines than we can handle */
411 run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); 435 run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
412 if (!run_gc || inflight_gc >= PBLK_GC_L_QD) 436 if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD)
413 break; 437 break;
414 } while (1); 438 } while (1);
415 439
@@ -418,16 +442,6 @@ next_gc_group:
418 goto next_gc_group; 442 goto next_gc_group;
419} 443}
420 444
421void pblk_gc_kick(struct pblk *pblk)
422{
423 struct pblk_gc *gc = &pblk->gc;
424
425 wake_up_process(gc->gc_ts);
426 pblk_gc_writer_kick(gc);
427 pblk_gc_reader_kick(gc);
428 mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
429}
430
431static void pblk_gc_timer(unsigned long data) 445static void pblk_gc_timer(unsigned long data)
432{ 446{
433 struct pblk *pblk = (struct pblk *)data; 447 struct pblk *pblk = (struct pblk *)data;
@@ -465,6 +479,7 @@ static int pblk_gc_writer_ts(void *data)
465static int pblk_gc_reader_ts(void *data) 479static int pblk_gc_reader_ts(void *data)
466{ 480{
467 struct pblk *pblk = data; 481 struct pblk *pblk = data;
482 struct pblk_gc *gc = &pblk->gc;
468 483
469 while (!kthread_should_stop()) { 484 while (!kthread_should_stop()) {
470 if (!pblk_gc_read(pblk)) 485 if (!pblk_gc_read(pblk))
@@ -473,6 +488,18 @@ static int pblk_gc_reader_ts(void *data)
473 io_schedule(); 488 io_schedule();
474 } 489 }
475 490
491#ifdef CONFIG_NVM_DEBUG
492 pr_info("pblk: flushing gc pipeline, %d lines left\n",
493 atomic_read(&gc->pipeline_gc));
494#endif
495
496 do {
497 if (!atomic_read(&gc->pipeline_gc))
498 break;
499
500 schedule();
501 } while (1);
502
476 return 0; 503 return 0;
477} 504}
478 505
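The new shutdown path above has the reader thread wait until pipeline_gc drops to zero before exiting. A userspace model of that drain using C11 atomics; the names are illustrative:

#include <sched.h>
#include <stdatomic.h>

/* Spin (politely) until every line still in the GC pipeline has been put
 * back, mirroring the drain the reader thread now does before exiting. */
static void gc_drain_pipeline(atomic_int *pipeline_gc)
{
	while (atomic_load(pipeline_gc) > 0)
		sched_yield();
}

int main(void)
{
	atomic_int pipeline_gc = 0;

	gc_drain_pipeline(&pipeline_gc);   /* nothing in flight: returns immediately */
	return 0;
}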
@@ -486,10 +513,10 @@ void pblk_gc_should_start(struct pblk *pblk)
486{ 513{
487 struct pblk_gc *gc = &pblk->gc; 514 struct pblk_gc *gc = &pblk->gc;
488 515
489 if (gc->gc_enabled && !gc->gc_active) 516 if (gc->gc_enabled && !gc->gc_active) {
490 pblk_gc_start(pblk); 517 pblk_gc_start(pblk);
491 518 pblk_gc_kick(pblk);
492 pblk_gc_kick(pblk); 519 }
493} 520}
494 521
495/* 522/*
@@ -510,6 +537,11 @@ void pblk_gc_should_stop(struct pblk *pblk)
510 pblk_gc_stop(pblk, 0); 537 pblk_gc_stop(pblk, 0);
511} 538}
512 539
540void pblk_gc_should_kick(struct pblk *pblk)
541{
542 pblk_rl_update_rates(&pblk->rl);
543}
544
513void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, 545void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
514 int *gc_active) 546 int *gc_active)
515{ 547{
@@ -576,7 +608,8 @@ int pblk_gc_init(struct pblk *pblk)
576 gc->gc_forced = 0; 608 gc->gc_forced = 0;
577 gc->gc_enabled = 1; 609 gc->gc_enabled = 1;
578 gc->w_entries = 0; 610 gc->w_entries = 0;
579 atomic_set(&gc->inflight_gc, 0); 611 atomic_set(&gc->read_inflight_gc, 0);
612 atomic_set(&gc->pipeline_gc, 0);
580 613
581 /* Workqueue that reads valid sectors from a line and submit them to the 614 /* Workqueue that reads valid sectors from a line and submit them to the
582 * GC writer to be recycled. 615 * GC writer to be recycled.
@@ -602,7 +635,7 @@ int pblk_gc_init(struct pblk *pblk)
602 spin_lock_init(&gc->w_lock); 635 spin_lock_init(&gc->w_lock);
603 spin_lock_init(&gc->r_lock); 636 spin_lock_init(&gc->r_lock);
604 637
605 sema_init(&gc->gc_sem, 128); 638 sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);
606 639
607 INIT_LIST_HEAD(&gc->w_list); 640 INIT_LIST_HEAD(&gc->w_list);
608 INIT_LIST_HEAD(&gc->r_list); 641 INIT_LIST_HEAD(&gc->r_list);
@@ -625,24 +658,24 @@ void pblk_gc_exit(struct pblk *pblk)
625{ 658{
626 struct pblk_gc *gc = &pblk->gc; 659 struct pblk_gc *gc = &pblk->gc;
627 660
628 flush_workqueue(gc->gc_reader_wq); 661 gc->gc_enabled = 0;
629 flush_workqueue(gc->gc_line_reader_wq); 662 del_timer_sync(&gc->gc_timer);
630
631 del_timer(&gc->gc_timer);
632 pblk_gc_stop(pblk, 1); 663 pblk_gc_stop(pblk, 1);
633 664
634 if (gc->gc_ts) 665 if (gc->gc_ts)
635 kthread_stop(gc->gc_ts); 666 kthread_stop(gc->gc_ts);
636 667
668 if (gc->gc_reader_ts)
669 kthread_stop(gc->gc_reader_ts);
670
671 flush_workqueue(gc->gc_reader_wq);
637 if (gc->gc_reader_wq) 672 if (gc->gc_reader_wq)
638 destroy_workqueue(gc->gc_reader_wq); 673 destroy_workqueue(gc->gc_reader_wq);
639 674
675 flush_workqueue(gc->gc_line_reader_wq);
640 if (gc->gc_line_reader_wq) 676 if (gc->gc_line_reader_wq)
641 destroy_workqueue(gc->gc_line_reader_wq); 677 destroy_workqueue(gc->gc_line_reader_wq);
642 678
643 if (gc->gc_writer_ts) 679 if (gc->gc_writer_ts)
644 kthread_stop(gc->gc_writer_ts); 680 kthread_stop(gc->gc_writer_ts);
645
646 if (gc->gc_reader_ts)
647 kthread_stop(gc->gc_reader_ts);
648} 681}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 1b0f61233c21..f62112ba5482 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -20,8 +20,8 @@
20 20
21#include "pblk.h" 21#include "pblk.h"
22 22
23static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, 23static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
24 *pblk_w_rq_cache, *pblk_line_meta_cache; 24 *pblk_w_rq_cache;
25static DECLARE_RWSEM(pblk_lock); 25static DECLARE_RWSEM(pblk_lock);
26struct bio_set *pblk_bio_set; 26struct bio_set *pblk_bio_set;
27 27
@@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
46 * user I/Os. Unless stalled, the rate limiter leaves at least 256KB 46 * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
47 * available for user I/O. 47 * available for user I/O.
48 */ 48 */
49 if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) 49 if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
50 blk_queue_split(q, &bio); 50 blk_queue_split(q, &bio);
51 51
52 return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); 52 return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
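The check above splits a write bio when it asks for more sectors than the rate limiter grants in one go (pblk_rl_max_io). A small illustrative sketch of processing an oversized request in budget-sized chunks; the sector counts are made up:

#include <stdio.h>

/* Submit an IO in pieces no larger than the rate limiter's per-IO budget. */
static void submit_in_chunks(unsigned int nr_secs, unsigned int max_io)
{
	unsigned int off = 0;

	while (off < nr_secs) {
		unsigned int chunk = nr_secs - off;

		if (chunk > max_io)
			chunk = max_io;              /* split: stay within the budget */
		printf("submit sectors [%u, %u)\n", off, off + chunk);
		off += chunk;
	}
}

int main(void)
{
	submit_in_chunks(300, 128);   /* 300 sectors against a 128-sector budget */
	return 0;
}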
@@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
76 return BLK_QC_T_NONE; 76 return BLK_QC_T_NONE;
77} 77}
78 78
79static size_t pblk_trans_map_size(struct pblk *pblk)
80{
81 int entry_size = 8;
82
83 if (pblk->ppaf_bitsize < 32)
84 entry_size = 4;
85
86 return entry_size * pblk->rl.nr_secs;
87}
88
89#ifdef CONFIG_NVM_DEBUG
90static u32 pblk_l2p_crc(struct pblk *pblk)
91{
92 size_t map_size;
93 u32 crc = ~(u32)0;
94
95 map_size = pblk_trans_map_size(pblk);
96 crc = crc32_le(crc, pblk->trans_map, map_size);
97 return crc;
98}
99#endif
100
79static void pblk_l2p_free(struct pblk *pblk) 101static void pblk_l2p_free(struct pblk *pblk)
80{ 102{
81 vfree(pblk->trans_map); 103 vfree(pblk->trans_map);
@@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk)
85{ 107{
86 sector_t i; 108 sector_t i;
87 struct ppa_addr ppa; 109 struct ppa_addr ppa;
88 int entry_size = 8; 110 size_t map_size;
89 111
90 if (pblk->ppaf_bitsize < 32) 112 map_size = pblk_trans_map_size(pblk);
91 entry_size = 4; 113 pblk->trans_map = vmalloc(map_size);
92
93 pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
94 if (!pblk->trans_map) 114 if (!pblk->trans_map)
95 return -ENOMEM; 115 return -ENOMEM;
96 116
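The two hunks above size the L2P table from a 4- or 8-byte entry (depending on whether the packed address fits in 32 bits) and, under CONFIG_NVM_DEBUG, CRC the whole table at init and exit. A userspace model of both; zlib's crc32() stands in for the kernel's crc32_le() and the sector count is arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>

/* 4-byte entries when the packed address fits in 32 bits, 8 bytes otherwise. */
static size_t trans_map_size(int ppaf_bitsize, uint64_t nr_secs)
{
	size_t entry_size = (ppaf_bitsize < 32) ? 4 : 8;

	return entry_size * nr_secs;
}

int main(void)
{
	uint64_t nr_secs = 1 << 20;                    /* arbitrary: 1M sectors */
	size_t map_size = trans_map_size(28, nr_secs);
	unsigned char *map = calloc(1, map_size);

	if (!map)
		return 1;

	/* Same shape as the debug check: one CRC over the whole table. */
	printf("L2P CRC: %lx\n", crc32(crc32(0L, Z_NULL, 0), map, map_size));
	free(map);
	return 0;
}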
@@ -132,7 +152,6 @@ static int pblk_rwb_init(struct pblk *pblk)
132} 152}
133 153
134/* Minimum pages needed within a lun */ 154/* Minimum pages needed within a lun */
135#define PAGE_POOL_SIZE 16
136#define ADDR_POOL_SIZE 64 155#define ADDR_POOL_SIZE 64
137 156
138static int pblk_set_ppaf(struct pblk *pblk) 157static int pblk_set_ppaf(struct pblk *pblk)
@@ -182,12 +201,10 @@ static int pblk_set_ppaf(struct pblk *pblk)
182 201
183static int pblk_init_global_caches(struct pblk *pblk) 202static int pblk_init_global_caches(struct pblk *pblk)
184{ 203{
185 char cache_name[PBLK_CACHE_NAME_LEN];
186
187 down_write(&pblk_lock); 204 down_write(&pblk_lock);
188 pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", 205 pblk_ws_cache = kmem_cache_create("pblk_blk_ws",
189 sizeof(struct pblk_line_ws), 0, 0, NULL); 206 sizeof(struct pblk_line_ws), 0, 0, NULL);
190 if (!pblk_blk_ws_cache) { 207 if (!pblk_ws_cache) {
191 up_write(&pblk_lock); 208 up_write(&pblk_lock);
192 return -ENOMEM; 209 return -ENOMEM;
193 } 210 }
@@ -195,7 +212,7 @@ static int pblk_init_global_caches(struct pblk *pblk)
195 pblk_rec_cache = kmem_cache_create("pblk_rec", 212 pblk_rec_cache = kmem_cache_create("pblk_rec",
196 sizeof(struct pblk_rec_ctx), 0, 0, NULL); 213 sizeof(struct pblk_rec_ctx), 0, 0, NULL);
197 if (!pblk_rec_cache) { 214 if (!pblk_rec_cache) {
198 kmem_cache_destroy(pblk_blk_ws_cache); 215 kmem_cache_destroy(pblk_ws_cache);
199 up_write(&pblk_lock); 216 up_write(&pblk_lock);
200 return -ENOMEM; 217 return -ENOMEM;
201 } 218 }
@@ -203,7 +220,7 @@ static int pblk_init_global_caches(struct pblk *pblk)
203 pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 220 pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
204 0, 0, NULL); 221 0, 0, NULL);
205 if (!pblk_g_rq_cache) { 222 if (!pblk_g_rq_cache) {
206 kmem_cache_destroy(pblk_blk_ws_cache); 223 kmem_cache_destroy(pblk_ws_cache);
207 kmem_cache_destroy(pblk_rec_cache); 224 kmem_cache_destroy(pblk_rec_cache);
208 up_write(&pblk_lock); 225 up_write(&pblk_lock);
209 return -ENOMEM; 226 return -ENOMEM;
@@ -212,30 +229,25 @@ static int pblk_init_global_caches(struct pblk *pblk)
212 pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, 229 pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
213 0, 0, NULL); 230 0, 0, NULL);
214 if (!pblk_w_rq_cache) { 231 if (!pblk_w_rq_cache) {
215 kmem_cache_destroy(pblk_blk_ws_cache); 232 kmem_cache_destroy(pblk_ws_cache);
216 kmem_cache_destroy(pblk_rec_cache); 233 kmem_cache_destroy(pblk_rec_cache);
217 kmem_cache_destroy(pblk_g_rq_cache); 234 kmem_cache_destroy(pblk_g_rq_cache);
218 up_write(&pblk_lock); 235 up_write(&pblk_lock);
219 return -ENOMEM; 236 return -ENOMEM;
220 } 237 }
221
222 snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
223 pblk->disk->disk_name);
224 pblk_line_meta_cache = kmem_cache_create(cache_name,
225 pblk->lm.sec_bitmap_len, 0, 0, NULL);
226 if (!pblk_line_meta_cache) {
227 kmem_cache_destroy(pblk_blk_ws_cache);
228 kmem_cache_destroy(pblk_rec_cache);
229 kmem_cache_destroy(pblk_g_rq_cache);
230 kmem_cache_destroy(pblk_w_rq_cache);
231 up_write(&pblk_lock);
232 return -ENOMEM;
233 }
234 up_write(&pblk_lock); 238 up_write(&pblk_lock);
235 239
236 return 0; 240 return 0;
237} 241}
238 242
243static void pblk_free_global_caches(struct pblk *pblk)
244{
245 kmem_cache_destroy(pblk_ws_cache);
246 kmem_cache_destroy(pblk_rec_cache);
247 kmem_cache_destroy(pblk_g_rq_cache);
248 kmem_cache_destroy(pblk_w_rq_cache);
249}
250
239static int pblk_core_init(struct pblk *pblk) 251static int pblk_core_init(struct pblk *pblk)
240{ 252{
241 struct nvm_tgt_dev *dev = pblk->dev; 253 struct nvm_tgt_dev *dev = pblk->dev;
@@ -247,70 +259,80 @@ static int pblk_core_init(struct pblk *pblk)
247 if (pblk_init_global_caches(pblk)) 259 if (pblk_init_global_caches(pblk))
248 return -ENOMEM; 260 return -ENOMEM;
249 261
250 pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); 262 /* Internal bios can be at most the sectors signaled by the device. */
251 if (!pblk->page_pool) 263 pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev),
252 return -ENOMEM; 264 0);
265 if (!pblk->page_bio_pool)
266 goto free_global_caches;
253 267
254 pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, 268 pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE,
255 pblk_blk_ws_cache); 269 pblk_ws_cache);
256 if (!pblk->line_ws_pool) 270 if (!pblk->gen_ws_pool)
257 goto free_page_pool; 271 goto free_page_bio_pool;
258 272
259 pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); 273 pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
260 if (!pblk->rec_pool) 274 if (!pblk->rec_pool)
261 goto free_blk_ws_pool; 275 goto free_gen_ws_pool;
262 276
263 pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, 277 pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns,
264 pblk_g_rq_cache); 278 pblk_g_rq_cache);
265 if (!pblk->g_rq_pool) 279 if (!pblk->r_rq_pool)
266 goto free_rec_pool; 280 goto free_rec_pool;
267 281
268 pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, 282 pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns,
283 pblk_g_rq_cache);
284 if (!pblk->e_rq_pool)
285 goto free_r_rq_pool;
286
287 pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns,
269 pblk_w_rq_cache); 288 pblk_w_rq_cache);
270 if (!pblk->w_rq_pool) 289 if (!pblk->w_rq_pool)
271 goto free_g_rq_pool; 290 goto free_e_rq_pool;
272
273 pblk->line_meta_pool =
274 mempool_create_slab_pool(PBLK_META_POOL_SIZE,
275 pblk_line_meta_cache);
276 if (!pblk->line_meta_pool)
277 goto free_w_rq_pool;
278 291
279 pblk->close_wq = alloc_workqueue("pblk-close-wq", 292 pblk->close_wq = alloc_workqueue("pblk-close-wq",
280 WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); 293 WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
281 if (!pblk->close_wq) 294 if (!pblk->close_wq)
282 goto free_line_meta_pool; 295 goto free_w_rq_pool;
283 296
284 pblk->bb_wq = alloc_workqueue("pblk-bb-wq", 297 pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
285 WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 298 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
286 if (!pblk->bb_wq) 299 if (!pblk->bb_wq)
287 goto free_close_wq; 300 goto free_close_wq;
288 301
289 if (pblk_set_ppaf(pblk)) 302 pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
303 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
304 if (!pblk->r_end_wq)
290 goto free_bb_wq; 305 goto free_bb_wq;
291 306
307 if (pblk_set_ppaf(pblk))
308 goto free_r_end_wq;
309
292 if (pblk_rwb_init(pblk)) 310 if (pblk_rwb_init(pblk))
293 goto free_bb_wq; 311 goto free_r_end_wq;
294 312
295 INIT_LIST_HEAD(&pblk->compl_list); 313 INIT_LIST_HEAD(&pblk->compl_list);
296 return 0; 314 return 0;
297 315
316free_r_end_wq:
317 destroy_workqueue(pblk->r_end_wq);
298free_bb_wq: 318free_bb_wq:
299 destroy_workqueue(pblk->bb_wq); 319 destroy_workqueue(pblk->bb_wq);
300free_close_wq: 320free_close_wq:
301 destroy_workqueue(pblk->close_wq); 321 destroy_workqueue(pblk->close_wq);
302free_line_meta_pool:
303 mempool_destroy(pblk->line_meta_pool);
304free_w_rq_pool: 322free_w_rq_pool:
305 mempool_destroy(pblk->w_rq_pool); 323 mempool_destroy(pblk->w_rq_pool);
306free_g_rq_pool: 324free_e_rq_pool:
307 mempool_destroy(pblk->g_rq_pool); 325 mempool_destroy(pblk->e_rq_pool);
326free_r_rq_pool:
327 mempool_destroy(pblk->r_rq_pool);
308free_rec_pool: 328free_rec_pool:
309 mempool_destroy(pblk->rec_pool); 329 mempool_destroy(pblk->rec_pool);
310free_blk_ws_pool: 330free_gen_ws_pool:
311 mempool_destroy(pblk->line_ws_pool); 331 mempool_destroy(pblk->gen_ws_pool);
312free_page_pool: 332free_page_bio_pool:
313 mempool_destroy(pblk->page_pool); 333 mempool_destroy(pblk->page_bio_pool);
334free_global_caches:
335 pblk_free_global_caches(pblk);
314 return -ENOMEM; 336 return -ENOMEM;
315} 337}
316 338
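pblk_core_init() above acquires its pools and workqueues in order and unwinds them in reverse through goto labels on failure, now including the global caches. A generic sketch of that unwinding pattern with plain malloc()s standing in for the mempools and workqueues:

#include <stdlib.h>

struct core {
	void *pool_a;
	void *pool_b;
	void *queue;
};

/* Acquire in order, release in reverse order through goto labels, so a
 * failure at any step frees exactly what was already set up. */
int core_init(struct core *c)
{
	c->pool_a = malloc(64);
	if (!c->pool_a)
		goto fail;

	c->pool_b = malloc(64);
	if (!c->pool_b)
		goto free_pool_a;

	c->queue = malloc(64);
	if (!c->queue)
		goto free_pool_b;

	return 0;

free_pool_b:
	free(c->pool_b);
free_pool_a:
	free(c->pool_a);
fail:
	return -1;
}

int main(void)
{
	struct core c;

	return core_init(&c);   /* 0 on success, -1 after full unwinding */
}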
@@ -319,21 +341,20 @@ static void pblk_core_free(struct pblk *pblk)
319 if (pblk->close_wq) 341 if (pblk->close_wq)
320 destroy_workqueue(pblk->close_wq); 342 destroy_workqueue(pblk->close_wq);
321 343
344 if (pblk->r_end_wq)
345 destroy_workqueue(pblk->r_end_wq);
346
322 if (pblk->bb_wq) 347 if (pblk->bb_wq)
323 destroy_workqueue(pblk->bb_wq); 348 destroy_workqueue(pblk->bb_wq);
324 349
325 mempool_destroy(pblk->page_pool); 350 mempool_destroy(pblk->page_bio_pool);
326 mempool_destroy(pblk->line_ws_pool); 351 mempool_destroy(pblk->gen_ws_pool);
327 mempool_destroy(pblk->rec_pool); 352 mempool_destroy(pblk->rec_pool);
328 mempool_destroy(pblk->g_rq_pool); 353 mempool_destroy(pblk->r_rq_pool);
354 mempool_destroy(pblk->e_rq_pool);
329 mempool_destroy(pblk->w_rq_pool); 355 mempool_destroy(pblk->w_rq_pool);
330 mempool_destroy(pblk->line_meta_pool);
331 356
332 kmem_cache_destroy(pblk_blk_ws_cache); 357 pblk_free_global_caches(pblk);
333 kmem_cache_destroy(pblk_rec_cache);
334 kmem_cache_destroy(pblk_g_rq_cache);
335 kmem_cache_destroy(pblk_w_rq_cache);
336 kmem_cache_destroy(pblk_line_meta_cache);
337} 358}
338 359
339static void pblk_luns_free(struct pblk *pblk) 360static void pblk_luns_free(struct pblk *pblk)
@@ -372,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk)
372 kfree(l_mg->bb_aux); 393 kfree(l_mg->bb_aux);
373 kfree(l_mg->vsc_list); 394 kfree(l_mg->vsc_list);
374 395
375 spin_lock(&l_mg->free_lock);
376 for (i = 0; i < PBLK_DATA_LINES; i++) { 396 for (i = 0; i < PBLK_DATA_LINES; i++) {
377 kfree(l_mg->sline_meta[i]); 397 kfree(l_mg->sline_meta[i]);
378 pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); 398 pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
379 kfree(l_mg->eline_meta[i]); 399 kfree(l_mg->eline_meta[i]);
380 } 400 }
381 spin_unlock(&l_mg->free_lock);
382 401
383 kfree(pblk->lines); 402 kfree(pblk->lines);
384} 403}
@@ -507,6 +526,13 @@ static int pblk_lines_configure(struct pblk *pblk, int flags)
507 } 526 }
508 } 527 }
509 528
529#ifdef CONFIG_NVM_DEBUG
530 pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
531#endif
532
533 /* Free full lines directly as GC has not been started yet */
534 pblk_gc_free_full_lines(pblk);
535
510 if (!line) { 536 if (!line) {
511 /* Configure next line for user data */ 537 /* Configure next line for user data */
512 line = pblk_line_get_first_data(pblk); 538 line = pblk_line_get_first_data(pblk);
@@ -630,7 +656,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk)
630 656
631fail_free_emeta: 657fail_free_emeta:
632 while (--i >= 0) { 658 while (--i >= 0) {
633 vfree(l_mg->eline_meta[i]->buf); 659 if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
660 vfree(l_mg->eline_meta[i]->buf);
661 else
662 kfree(l_mg->eline_meta[i]->buf);
634 kfree(l_mg->eline_meta[i]); 663 kfree(l_mg->eline_meta[i]);
635 } 664 }
636 665
@@ -681,8 +710,8 @@ static int pblk_lines_init(struct pblk *pblk)
681 lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 710 lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
682 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); 711 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
683 lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 712 lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
684 lm->high_thrs = lm->sec_per_line / 2; 713 lm->mid_thrs = lm->sec_per_line / 2;
685 lm->mid_thrs = lm->sec_per_line / 4; 714 lm->high_thrs = lm->sec_per_line / 4;
686 lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; 715 lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs;
687 716
688 /* Calculate necessary pages for smeta. See comment over struct 717 /* Calculate necessary pages for smeta. See comment over struct
@@ -713,9 +742,13 @@ add_emeta_page:
713 goto add_emeta_page; 742 goto add_emeta_page;
714 } 743 }
715 744
716 lm->emeta_bb = geo->nr_luns - i; 745 lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0;
717 lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], 746
718 geo->sec_per_blk); 747 lm->min_blk_line = 1;
748 if (geo->nr_luns > 1)
749 lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
750 lm->emeta_sec[0], geo->sec_per_blk);
751
719 if (lm->min_blk_line > lm->blk_per_line) { 752 if (lm->min_blk_line > lm->blk_per_line) {
720 pr_err("pblk: config. not supported. Min. LUN in line:%d\n", 753 pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
721 lm->blk_per_line); 754 lm->blk_per_line);
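The hunk above clamps emeta_bb at zero and computes min_blk_line as one data block plus the rounded-up metadata blocks, skipping the metadata term on single-LUN geometries. A worked sketch of that arithmetic with made-up sector counts:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* One data block per line, plus the blocks needed for start/end metadata;
 * the metadata term is skipped for single-LUN geometries. */
static int min_blk_line(int nr_luns, int smeta_sec, int emeta_sec,
			int sec_per_blk)
{
	int min = 1;

	if (nr_luns > 1)
		min += DIV_ROUND_UP(smeta_sec + emeta_sec, sec_per_blk);
	return min;
}

int main(void)
{
	/* e.g. 8 smeta + 64 emeta sectors, 256 sectors per block -> 1 + 1 = 2 */
	printf("%d\n", min_blk_line(16, 8, 64, 256));
	return 0;
}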
@@ -890,6 +923,11 @@ static void pblk_exit(void *private)
890 down_write(&pblk_lock); 923 down_write(&pblk_lock);
891 pblk_gc_exit(pblk); 924 pblk_gc_exit(pblk);
892 pblk_tear_down(pblk); 925 pblk_tear_down(pblk);
926
927#ifdef CONFIG_NVM_DEBUG
928 pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
929#endif
930
893 pblk_free(pblk); 931 pblk_free(pblk);
894 up_write(&pblk_lock); 932 up_write(&pblk_lock);
895} 933}
@@ -911,7 +949,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
911 int ret; 949 int ret;
912 950
913 if (dev->identity.dom & NVM_RSP_L2P) { 951 if (dev->identity.dom & NVM_RSP_L2P) {
914 pr_err("pblk: device-side L2P table not supported. (%x)\n", 952 pr_err("pblk: host-side L2P table not supported. (%x)\n",
915 dev->identity.dom); 953 dev->identity.dom);
916 return ERR_PTR(-EINVAL); 954 return ERR_PTR(-EINVAL);
917 } 955 }
@@ -923,6 +961,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
923 pblk->dev = dev; 961 pblk->dev = dev;
924 pblk->disk = tdisk; 962 pblk->disk = tdisk;
925 pblk->state = PBLK_STATE_RUNNING; 963 pblk->state = PBLK_STATE_RUNNING;
964 pblk->gc.gc_enabled = 0;
926 965
927 spin_lock_init(&pblk->trans_lock); 966 spin_lock_init(&pblk->trans_lock);
928 spin_lock_init(&pblk->lock); 967 spin_lock_init(&pblk->lock);
@@ -944,6 +983,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
944 atomic_long_set(&pblk->recov_writes, 0); 983 atomic_long_set(&pblk->recov_writes, 0);
945 atomic_long_set(&pblk->recov_writes, 0); 984 atomic_long_set(&pblk->recov_writes, 0);
946 atomic_long_set(&pblk->recov_gc_writes, 0); 985 atomic_long_set(&pblk->recov_gc_writes, 0);
986 atomic_long_set(&pblk->recov_gc_reads, 0);
947#endif 987#endif
948 988
949 atomic_long_set(&pblk->read_failed, 0); 989 atomic_long_set(&pblk->read_failed, 0);
@@ -1012,6 +1052,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1012 pblk->rwb.nr_entries); 1052 pblk->rwb.nr_entries);
1013 1053
1014 wake_up_process(pblk->writer_ts); 1054 wake_up_process(pblk->writer_ts);
1055
1056 /* Check if we need to start GC */
1057 pblk_gc_should_kick(pblk);
1058
1015 return pblk; 1059 return pblk;
1016 1060
1017fail_stop_writer: 1061fail_stop_writer:
@@ -1044,6 +1088,7 @@ static struct nvm_tgt_type tt_pblk = {
1044 1088
1045 .sysfs_init = pblk_sysfs_init, 1089 .sysfs_init = pblk_sysfs_init,
1046 .sysfs_exit = pblk_sysfs_exit, 1090 .sysfs_exit = pblk_sysfs_exit,
1091 .owner = THIS_MODULE,
1047}; 1092};
1048 1093
1049static int __init pblk_module_init(void) 1094static int __init pblk_module_init(void)
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index fddb924f6dde..6f3ecde2140f 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -25,16 +25,28 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
25 unsigned int valid_secs) 25 unsigned int valid_secs)
26{ 26{
27 struct pblk_line *line = pblk_line_get_data(pblk); 27 struct pblk_line *line = pblk_line_get_data(pblk);
28 struct pblk_emeta *emeta = line->emeta; 28 struct pblk_emeta *emeta;
29 struct pblk_w_ctx *w_ctx; 29 struct pblk_w_ctx *w_ctx;
30 __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); 30 __le64 *lba_list;
31 u64 paddr; 31 u64 paddr;
32 int nr_secs = pblk->min_write_pgs; 32 int nr_secs = pblk->min_write_pgs;
33 int i; 33 int i;
34 34
35 if (pblk_line_is_full(line)) {
36 struct pblk_line *prev_line = line;
37
38 line = pblk_line_replace_data(pblk);
39 pblk_line_close_meta(pblk, prev_line);
40 }
41
42 emeta = line->emeta;
43 lba_list = emeta_to_lbas(pblk, emeta->buf);
44
35 paddr = pblk_alloc_page(pblk, line, nr_secs); 45 paddr = pblk_alloc_page(pblk, line, nr_secs);
36 46
37 for (i = 0; i < nr_secs; i++, paddr++) { 47 for (i = 0; i < nr_secs; i++, paddr++) {
48 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
49
38 /* ppa to be sent to the device */ 50 /* ppa to be sent to the device */
39 ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); 51 ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
40 52
@@ -51,22 +63,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
51 w_ctx->ppa = ppa_list[i]; 63 w_ctx->ppa = ppa_list[i];
52 meta_list[i].lba = cpu_to_le64(w_ctx->lba); 64 meta_list[i].lba = cpu_to_le64(w_ctx->lba);
53 lba_list[paddr] = cpu_to_le64(w_ctx->lba); 65 lba_list[paddr] = cpu_to_le64(w_ctx->lba);
54 line->nr_valid_lbas++; 66 if (lba_list[paddr] != addr_empty)
67 line->nr_valid_lbas++;
55 } else { 68 } else {
56 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
57
58 lba_list[paddr] = meta_list[i].lba = addr_empty; 69 lba_list[paddr] = meta_list[i].lba = addr_empty;
59 __pblk_map_invalidate(pblk, line, paddr); 70 __pblk_map_invalidate(pblk, line, paddr);
60 } 71 }
61 } 72 }
62 73
63 if (pblk_line_is_full(line)) {
64 struct pblk_line *prev_line = line;
65
66 pblk_line_replace_data(pblk);
67 pblk_line_close_meta(pblk, prev_line);
68 }
69
70 pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); 74 pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
71} 75}
72 76
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 9bc32578a766..b8f78e401482 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
201 return subm; 201 return subm;
202} 202}
203 203
204static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, 204static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
205 unsigned int to_update)
206{ 205{
207 struct pblk *pblk = container_of(rb, struct pblk, rwb); 206 struct pblk *pblk = container_of(rb, struct pblk, rwb);
208 struct pblk_line *line; 207 struct pblk_line *line;
@@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
213 int flags; 212 int flags;
214 213
215 for (i = 0; i < to_update; i++) { 214 for (i = 0; i < to_update; i++) {
216 entry = &rb->entries[*l2p_upd]; 215 entry = &rb->entries[rb->l2p_update];
217 w_ctx = &entry->w_ctx; 216 w_ctx = &entry->w_ctx;
218 217
219 flags = READ_ONCE(entry->w_ctx.flags); 218 flags = READ_ONCE(entry->w_ctx.flags);
@@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
230 line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; 229 line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
231 kref_put(&line->ref, pblk_line_put); 230 kref_put(&line->ref, pblk_line_put);
232 clean_wctx(w_ctx); 231 clean_wctx(w_ctx);
233 *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); 232 rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
234 } 233 }
235 234
236 pblk_rl_out(&pblk->rl, user_io, gc_io); 235 pblk_rl_out(&pblk->rl, user_io, gc_io);
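__pblk_rb_update_l2p() above now advances rb->l2p_update itself, wrapping with nr_entries - 1, which assumes a power-of-two ring size. A minimal model of that index walk; the fields and the flush step are illustrative:

#include <assert.h>

struct ring {
	unsigned int nr_entries;    /* must be a power of two */
	unsigned int l2p_update;    /* next entry whose mapping gets synced */
};

/* Advance the L2P update pointer in place, wrapping with a mask instead of
 * threading a caller-owned index through, as the hunk above now does. */
static void ring_update_l2p(struct ring *rb, unsigned int to_update)
{
	assert((rb->nr_entries & (rb->nr_entries - 1)) == 0);

	while (to_update--) {
		/* ...sync entry rb->l2p_update to the L2P table here... */
		rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
	}
}

int main(void)
{
	struct ring rb = { .nr_entries = 8, .l2p_update = 6 };

	ring_update_l2p(&rb, 4);          /* 6 -> 7 -> 0 -> 1 -> 2 */
	return rb.l2p_update == 2 ? 0 : 1;
}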
@@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
258 257
259 count = nr_entries - space; 258 count = nr_entries - space;
260 /* l2p_update used exclusively under rb->w_lock */ 259 /* l2p_update used exclusively under rb->w_lock */
261 ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); 260 ret = __pblk_rb_update_l2p(rb, count);
262 261
263out: 262out:
264 return ret; 263 return ret;
@@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb)
280 sync = smp_load_acquire(&rb->sync); 279 sync = smp_load_acquire(&rb->sync);
281 280
282 to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); 281 to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
283 __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); 282 __pblk_rb_update_l2p(rb, to_update);
284 283
285 spin_unlock(&rb->w_lock); 284 spin_unlock(&rb->w_lock);
286} 285}
@@ -325,8 +324,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
325} 324}
326 325
327void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, 326void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
328 struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, 327 struct pblk_w_ctx w_ctx, struct pblk_line *line,
329 unsigned int ring_pos) 328 u64 paddr, unsigned int ring_pos)
330{ 329{
331 struct pblk *pblk = container_of(rb, struct pblk, rwb); 330 struct pblk *pblk = container_of(rb, struct pblk, rwb);
332 struct pblk_rb_entry *entry; 331 struct pblk_rb_entry *entry;
@@ -341,7 +340,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
341 340
342 __pblk_rb_write_entry(rb, data, w_ctx, entry); 341 __pblk_rb_write_entry(rb, data, w_ctx, entry);
343 342
344 if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) 343 if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr))
345 entry->w_ctx.lba = ADDR_EMPTY; 344 entry->w_ctx.lba = ADDR_EMPTY;
346 345
347 flags = w_ctx.flags | PBLK_WRITTEN_DATA; 346 flags = w_ctx.flags | PBLK_WRITTEN_DATA;
@@ -355,7 +354,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
355{ 354{
356 struct pblk_rb_entry *entry; 355 struct pblk_rb_entry *entry;
357 unsigned int subm, sync_point; 356 unsigned int subm, sync_point;
358 int flags;
359 357
360 subm = READ_ONCE(rb->subm); 358 subm = READ_ONCE(rb->subm);
361 359
@@ -369,12 +367,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
369 sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); 367 sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
370 entry = &rb->entries[sync_point]; 368 entry = &rb->entries[sync_point];
371 369
372 flags = READ_ONCE(entry->w_ctx.flags);
373 flags |= PBLK_FLUSH_ENTRY;
374
375 /* Release flags on context. Protect from writes */
376 smp_store_release(&entry->w_ctx.flags, flags);
377
378 /* Protect syncs */ 370 /* Protect syncs */
379 smp_store_release(&rb->sync_point, sync_point); 371 smp_store_release(&rb->sync_point, sync_point);
380 372
@@ -454,6 +446,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
454 446
455 /* Protect from read count */ 447 /* Protect from read count */
456 smp_store_release(&rb->mem, mem); 448 smp_store_release(&rb->mem, mem);
449
457 return 1; 450 return 1;
458} 451}
459 452
@@ -558,12 +551,13 @@ out:
558 * persist data on the write buffer to the media. 551 * persist data on the write buffer to the media.
559 */ 552 */
560unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, 553unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
561 struct bio *bio, unsigned int pos, 554 unsigned int pos, unsigned int nr_entries,
562 unsigned int nr_entries, unsigned int count) 555 unsigned int count)
563{ 556{
564 struct pblk *pblk = container_of(rb, struct pblk, rwb); 557 struct pblk *pblk = container_of(rb, struct pblk, rwb);
565 struct request_queue *q = pblk->dev->q; 558 struct request_queue *q = pblk->dev->q;
566 struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); 559 struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
560 struct bio *bio = rqd->bio;
567 struct pblk_rb_entry *entry; 561 struct pblk_rb_entry *entry;
568 struct page *page; 562 struct page *page;
569 unsigned int pad = 0, to_read = nr_entries; 563 unsigned int pad = 0, to_read = nr_entries;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d682e89e6493..ca79d8fb3e60 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,21 +39,15 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
39} 39}
40 40
41static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, 41static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
42 unsigned long *read_bitmap) 42 sector_t blba, unsigned long *read_bitmap)
43{ 43{
44 struct pblk_sec_meta *meta_list = rqd->meta_list;
44 struct bio *bio = rqd->bio; 45 struct bio *bio = rqd->bio;
45 struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; 46 struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
46 sector_t blba = pblk_get_lba(bio);
47 int nr_secs = rqd->nr_ppas; 47 int nr_secs = rqd->nr_ppas;
48 bool advanced_bio = false; 48 bool advanced_bio = false;
49 int i, j = 0; 49 int i, j = 0;
50 50
51 /* logic error: lba out-of-bounds. Ignore read request */
52 if (blba + nr_secs >= pblk->rl.nr_secs) {
53 WARN(1, "pblk: read lbas out of bounds\n");
54 return;
55 }
56
57 pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); 51 pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
58 52
59 for (i = 0; i < nr_secs; i++) { 53 for (i = 0; i < nr_secs; i++) {
@@ -63,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
63retry: 57retry:
64 if (pblk_ppa_empty(p)) { 58 if (pblk_ppa_empty(p)) {
65 WARN_ON(test_and_set_bit(i, read_bitmap)); 59 WARN_ON(test_and_set_bit(i, read_bitmap));
60 meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
66 61
67 if (unlikely(!advanced_bio)) { 62 if (unlikely(!advanced_bio)) {
68 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); 63 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -82,6 +77,7 @@ retry:
82 goto retry; 77 goto retry;
83 } 78 }
84 WARN_ON(test_and_set_bit(i, read_bitmap)); 79 WARN_ON(test_and_set_bit(i, read_bitmap));
80 meta_list[i].lba = cpu_to_le64(lba);
85 advanced_bio = true; 81 advanced_bio = true;
86#ifdef CONFIG_NVM_DEBUG 82#ifdef CONFIG_NVM_DEBUG
87 atomic_long_inc(&pblk->cache_reads); 83 atomic_long_inc(&pblk->cache_reads);
@@ -117,10 +113,51 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
117 return NVM_IO_OK; 113 return NVM_IO_OK;
118} 114}
119 115
120static void pblk_end_io_read(struct nvm_rq *rqd) 116static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
117 sector_t blba)
118{
119 struct pblk_sec_meta *meta_list = rqd->meta_list;
120 int nr_lbas = rqd->nr_ppas;
121 int i;
122
123 for (i = 0; i < nr_lbas; i++) {
124 u64 lba = le64_to_cpu(meta_list[i].lba);
125
126 if (lba == ADDR_EMPTY)
127 continue;
128
129 WARN(lba != blba + i, "pblk: corrupted read LBA\n");
130 }
131}
132
133static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
134{
135 struct ppa_addr *ppa_list;
136 int i;
137
138 ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
139
140 for (i = 0; i < rqd->nr_ppas; i++) {
141 struct ppa_addr ppa = ppa_list[i];
142 struct pblk_line *line;
143
144 line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
145 kref_put(&line->ref, pblk_line_put_wq);
146 }
147}
148
149static void pblk_end_user_read(struct bio *bio)
150{
151#ifdef CONFIG_NVM_DEBUG
152 WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
153#endif
154 bio_endio(bio);
155 bio_put(bio);
156}
157
158static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
159 bool put_line)
121{ 160{
122 struct pblk *pblk = rqd->private;
123 struct nvm_tgt_dev *dev = pblk->dev;
124 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); 161 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
125 struct bio *bio = rqd->bio; 162 struct bio *bio = rqd->bio;
126 163
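pblk_read_check() above verifies that the LBA recorded in each sector's metadata matches the LBA the read asked for, skipping empty entries. A userspace sketch of the same check over a plain array; ADDR_EMPTY here is a stand-in for pblk's empty-address marker:

#include <stdint.h>
#include <stdio.h>

#define ADDR_EMPTY ((uint64_t)~0ull)   /* stand-in for pblk's empty marker */

/* Each sector's metadata must carry the LBA that was actually requested;
 * empty (unmapped/padded) entries are skipped. Returns the mismatch count. */
static int read_check(const uint64_t *meta_lbas, int nr_lbas, uint64_t blba)
{
	int bad = 0;

	for (int i = 0; i < nr_lbas; i++) {
		if (meta_lbas[i] == ADDR_EMPTY)
			continue;
		if (meta_lbas[i] != blba + i) {
			fprintf(stderr, "corrupted read LBA at %d\n", i);
			bad++;
		}
	}
	return bad;
}

int main(void)
{
	uint64_t meta[4] = { 100, 101, ADDR_EMPTY, 103 };

	return read_check(meta, 4, 100);   /* 0: all LBAs consistent */
}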
@@ -131,47 +168,51 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
131 WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); 168 WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
132#endif 169#endif
133 170
134 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); 171 pblk_read_check(pblk, rqd, r_ctx->lba);
135 172
136 bio_put(bio); 173 bio_put(bio);
137 if (r_ctx->private) { 174 if (r_ctx->private)
138 struct bio *orig_bio = r_ctx->private; 175 pblk_end_user_read((struct bio *)r_ctx->private);
139 176
140#ifdef CONFIG_NVM_DEBUG 177 if (put_line)
141 WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); 178 pblk_read_put_rqd_kref(pblk, rqd);
142#endif
143 bio_endio(orig_bio);
144 bio_put(orig_bio);
145 }
146 179
147#ifdef CONFIG_NVM_DEBUG 180#ifdef CONFIG_NVM_DEBUG
148 atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); 181 atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
149 atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); 182 atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
150#endif 183#endif
151 184
152 pblk_free_rqd(pblk, rqd, READ); 185 pblk_free_rqd(pblk, rqd, PBLK_READ);
153 atomic_dec(&pblk->inflight_io); 186 atomic_dec(&pblk->inflight_io);
154} 187}
155 188
189static void pblk_end_io_read(struct nvm_rq *rqd)
190{
191 struct pblk *pblk = rqd->private;
192
193 __pblk_end_io_read(pblk, rqd, true);
194}
195
156static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, 196static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
157 unsigned int bio_init_idx, 197 unsigned int bio_init_idx,
158 unsigned long *read_bitmap) 198 unsigned long *read_bitmap)
159{ 199{
160 struct bio *new_bio, *bio = rqd->bio; 200 struct bio *new_bio, *bio = rqd->bio;
201 struct pblk_sec_meta *meta_list = rqd->meta_list;
161 struct bio_vec src_bv, dst_bv; 202 struct bio_vec src_bv, dst_bv;
162 void *ppa_ptr = NULL; 203 void *ppa_ptr = NULL;
163 void *src_p, *dst_p; 204 void *src_p, *dst_p;
164 dma_addr_t dma_ppa_list = 0; 205 dma_addr_t dma_ppa_list = 0;
206 __le64 *lba_list_mem, *lba_list_media;
165 int nr_secs = rqd->nr_ppas; 207 int nr_secs = rqd->nr_ppas;
166 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); 208 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
167 int i, ret, hole; 209 int i, ret, hole;
168 DECLARE_COMPLETION_ONSTACK(wait); 210
211 /* Re-use allocated memory for intermediate lbas */
212 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
213 lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
169 214
170 new_bio = bio_alloc(GFP_KERNEL, nr_holes); 215 new_bio = bio_alloc(GFP_KERNEL, nr_holes);
171 if (!new_bio) {
172 pr_err("pblk: could not alloc read bio\n");
173 return NVM_IO_ERR;
174 }
175 216
176 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) 217 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
177 goto err; 218 goto err;
@@ -181,34 +222,29 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
181 goto err; 222 goto err;
182 } 223 }
183 224
225 for (i = 0; i < nr_secs; i++)
226 lba_list_mem[i] = meta_list[i].lba;
227
184 new_bio->bi_iter.bi_sector = 0; /* internal bio */ 228 new_bio->bi_iter.bi_sector = 0; /* internal bio */
185 bio_set_op_attrs(new_bio, REQ_OP_READ, 0); 229 bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
186 new_bio->bi_private = &wait;
187 new_bio->bi_end_io = pblk_end_bio_sync;
188 230
189 rqd->bio = new_bio; 231 rqd->bio = new_bio;
190 rqd->nr_ppas = nr_holes; 232 rqd->nr_ppas = nr_holes;
191 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); 233 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
192 rqd->end_io = NULL;
193 234
194 if (unlikely(nr_secs > 1 && nr_holes == 1)) { 235 if (unlikely(nr_holes == 1)) {
195 ppa_ptr = rqd->ppa_list; 236 ppa_ptr = rqd->ppa_list;
196 dma_ppa_list = rqd->dma_ppa_list; 237 dma_ppa_list = rqd->dma_ppa_list;
197 rqd->ppa_addr = rqd->ppa_list[0]; 238 rqd->ppa_addr = rqd->ppa_list[0];
198 } 239 }
199 240
200 ret = pblk_submit_read_io(pblk, rqd); 241 ret = pblk_submit_io_sync(pblk, rqd);
201 if (ret) { 242 if (ret) {
202 bio_put(rqd->bio); 243 bio_put(rqd->bio);
203 pr_err("pblk: read IO submission failed\n"); 244 pr_err("pblk: sync read IO submission failed\n");
204 goto err; 245 goto err;
205 } 246 }
206 247
207 if (!wait_for_completion_io_timeout(&wait,
208 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
209 pr_err("pblk: partial read I/O timed out\n");
210 }
211
212 if (rqd->error) { 248 if (rqd->error) {
213 atomic_long_inc(&pblk->read_failed); 249 atomic_long_inc(&pblk->read_failed);
214#ifdef CONFIG_NVM_DEBUG 250#ifdef CONFIG_NVM_DEBUG
@@ -216,15 +252,31 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
216#endif 252#endif
217 } 253 }
218 254
219 if (unlikely(nr_secs > 1 && nr_holes == 1)) { 255 if (unlikely(nr_holes == 1)) {
256 struct ppa_addr ppa;
257
258 ppa = rqd->ppa_addr;
220 rqd->ppa_list = ppa_ptr; 259 rqd->ppa_list = ppa_ptr;
221 rqd->dma_ppa_list = dma_ppa_list; 260 rqd->dma_ppa_list = dma_ppa_list;
261 rqd->ppa_list[0] = ppa;
262 }
263
264 for (i = 0; i < nr_secs; i++) {
265 lba_list_media[i] = meta_list[i].lba;
266 meta_list[i].lba = lba_list_mem[i];
222 } 267 }
223 268
224 /* Fill the holes in the original bio */ 269 /* Fill the holes in the original bio */
225 i = 0; 270 i = 0;
226 hole = find_first_zero_bit(read_bitmap, nr_secs); 271 hole = find_first_zero_bit(read_bitmap, nr_secs);
227 do { 272 do {
273 int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]);
274 struct pblk_line *line = &pblk->lines[line_id];
275
276 kref_put(&line->ref, pblk_line_put);
277
278 meta_list[hole].lba = lba_list_media[i];
279
228 src_bv = new_bio->bi_io_vec[i++]; 280 src_bv = new_bio->bi_io_vec[i++];
229 dst_bv = bio->bi_io_vec[bio_init_idx + hole]; 281 dst_bv = bio->bi_io_vec[bio_init_idx + hole];
230 282
@@ -238,7 +290,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
238 kunmap_atomic(src_p); 290 kunmap_atomic(src_p);
239 kunmap_atomic(dst_p); 291 kunmap_atomic(dst_p);
240 292
241 mempool_free(src_bv.bv_page, pblk->page_pool); 293 mempool_free(src_bv.bv_page, pblk->page_bio_pool);
242 294
243 hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); 295 hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
244 } while (hole < nr_secs); 296 } while (hole < nr_secs);
@@ -246,34 +298,26 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
246 bio_put(new_bio); 298 bio_put(new_bio);
247 299
248 /* Complete the original bio and associated request */ 300 /* Complete the original bio and associated request */
301 bio_endio(bio);
249 rqd->bio = bio; 302 rqd->bio = bio;
250 rqd->nr_ppas = nr_secs; 303 rqd->nr_ppas = nr_secs;
251 rqd->private = pblk;
252 304
253 bio_endio(bio); 305 __pblk_end_io_read(pblk, rqd, false);
254 pblk_end_io_read(rqd);
255 return NVM_IO_OK; 306 return NVM_IO_OK;
256 307
257err: 308err:
258 /* Free allocated pages in new bio */ 309 /* Free allocated pages in new bio */
259 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); 310 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
260 rqd->private = pblk; 311 __pblk_end_io_read(pblk, rqd, false);
261 pblk_end_io_read(rqd);
262 return NVM_IO_ERR; 312 return NVM_IO_ERR;
263} 313}
264 314
265static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, 315static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
266 unsigned long *read_bitmap) 316 sector_t lba, unsigned long *read_bitmap)
267{ 317{
318 struct pblk_sec_meta *meta_list = rqd->meta_list;
268 struct bio *bio = rqd->bio; 319 struct bio *bio = rqd->bio;
269 struct ppa_addr ppa; 320 struct ppa_addr ppa;
270 sector_t lba = pblk_get_lba(bio);
271
272 /* logic error: lba out-of-bounds. Ignore read request */
273 if (lba >= pblk->rl.nr_secs) {
274 WARN(1, "pblk: read lba out of bounds\n");
275 return;
276 }
277 321
278 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); 322 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
279 323
@@ -284,6 +328,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
284retry: 328retry:
285 if (pblk_ppa_empty(ppa)) { 329 if (pblk_ppa_empty(ppa)) {
286 WARN_ON(test_and_set_bit(0, read_bitmap)); 330 WARN_ON(test_and_set_bit(0, read_bitmap));
331 meta_list[0].lba = cpu_to_le64(ADDR_EMPTY);
287 return; 332 return;
288 } 333 }
289 334
@@ -295,9 +340,12 @@ retry:
295 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); 340 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
296 goto retry; 341 goto retry;
297 } 342 }
343
298 WARN_ON(test_and_set_bit(0, read_bitmap)); 344 WARN_ON(test_and_set_bit(0, read_bitmap));
345 meta_list[0].lba = cpu_to_le64(lba);
346
299#ifdef CONFIG_NVM_DEBUG 347#ifdef CONFIG_NVM_DEBUG
300 atomic_long_inc(&pblk->cache_reads); 348 atomic_long_inc(&pblk->cache_reads);
301#endif 349#endif
302 } else { 350 } else {
303 rqd->ppa_addr = ppa; 351 rqd->ppa_addr = ppa;
@@ -309,22 +357,24 @@ retry:
309int pblk_submit_read(struct pblk *pblk, struct bio *bio) 357int pblk_submit_read(struct pblk *pblk, struct bio *bio)
310{ 358{
311 struct nvm_tgt_dev *dev = pblk->dev; 359 struct nvm_tgt_dev *dev = pblk->dev;
360 sector_t blba = pblk_get_lba(bio);
312 unsigned int nr_secs = pblk_get_secs(bio); 361 unsigned int nr_secs = pblk_get_secs(bio);
362 struct pblk_g_ctx *r_ctx;
313 struct nvm_rq *rqd; 363 struct nvm_rq *rqd;
314 unsigned long read_bitmap; /* Max 64 ppas per request */
315 unsigned int bio_init_idx; 364 unsigned int bio_init_idx;
365 unsigned long read_bitmap; /* Max 64 ppas per request */
316 int ret = NVM_IO_ERR; 366 int ret = NVM_IO_ERR;
317 367
318 if (nr_secs > PBLK_MAX_REQ_ADDRS) 368 /* logic error: lba out-of-bounds. Ignore read request */
369 if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) {
370 WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n",
371 (unsigned long long)blba, nr_secs);
319 return NVM_IO_ERR; 372 return NVM_IO_ERR;
373 }
320 374
321 bitmap_zero(&read_bitmap, nr_secs); 375 bitmap_zero(&read_bitmap, nr_secs);
322 376
323 rqd = pblk_alloc_rqd(pblk, READ); 377 rqd = pblk_alloc_rqd(pblk, PBLK_READ);
324 if (IS_ERR(rqd)) {
325 pr_err_ratelimited("pblk: not able to alloc rqd");
326 return NVM_IO_ERR;
327 }
328 378
329 rqd->opcode = NVM_OP_PREAD; 379 rqd->opcode = NVM_OP_PREAD;
330 rqd->bio = bio; 380 rqd->bio = bio;
@@ -332,6 +382,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
332 rqd->private = pblk; 382 rqd->private = pblk;
333 rqd->end_io = pblk_end_io_read; 383 rqd->end_io = pblk_end_io_read;
334 384
385 r_ctx = nvm_rq_to_pdu(rqd);
386 r_ctx->lba = blba;
387
335 /* Save the index for this bio's start. This is needed in case 388 /* Save the index for this bio's start. This is needed in case
336 * we need to fill a partial read. 389 * we need to fill a partial read.
337 */ 390 */
@@ -348,23 +401,22 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
348 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; 401 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
349 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; 402 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
350 403
351 pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); 404 pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap);
352 } else { 405 } else {
353 pblk_read_rq(pblk, rqd, &read_bitmap); 406 pblk_read_rq(pblk, rqd, blba, &read_bitmap);
354 } 407 }
355 408
356 bio_get(bio); 409 bio_get(bio);
357 if (bitmap_full(&read_bitmap, nr_secs)) { 410 if (bitmap_full(&read_bitmap, nr_secs)) {
358 bio_endio(bio); 411 bio_endio(bio);
359 atomic_inc(&pblk->inflight_io); 412 atomic_inc(&pblk->inflight_io);
360 pblk_end_io_read(rqd); 413 __pblk_end_io_read(pblk, rqd, false);
361 return NVM_IO_OK; 414 return NVM_IO_OK;
362 } 415 }
363 416
364 /* All sectors are to be read from the device */ 417 /* All sectors are to be read from the device */
365 if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { 418 if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
366 struct bio *int_bio = NULL; 419 struct bio *int_bio = NULL;
367 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
368 420
369 /* Clone read bio to deal with read errors internally */ 421 /* Clone read bio to deal with read errors internally */
370 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); 422 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
@@ -399,40 +451,46 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
399 return NVM_IO_OK; 451 return NVM_IO_OK;
400 452
401fail_rqd_free: 453fail_rqd_free:
402 pblk_free_rqd(pblk, rqd, READ); 454 pblk_free_rqd(pblk, rqd, PBLK_READ);
403 return ret; 455 return ret;
404} 456}
405 457
406static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, 458static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
407 struct pblk_line *line, u64 *lba_list, 459 struct pblk_line *line, u64 *lba_list,
408 unsigned int nr_secs) 460 u64 *paddr_list_gc, unsigned int nr_secs)
409{ 461{
410 struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; 462 struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS];
463 struct ppa_addr ppa_gc;
411 int valid_secs = 0; 464 int valid_secs = 0;
412 int i; 465 int i;
413 466
414 pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); 467 pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs);
415 468
416 for (i = 0; i < nr_secs; i++) { 469 for (i = 0; i < nr_secs; i++) {
417 if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || 470 if (lba_list[i] == ADDR_EMPTY)
418 pblk_ppa_empty(ppas[i])) { 471 continue;
419 lba_list[i] = ADDR_EMPTY; 472
473 ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id);
474 if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) {
475 paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY;
420 continue; 476 continue;
421 } 477 }
422 478
423 rqd->ppa_list[valid_secs++] = ppas[i]; 479 rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
424 } 480 }
425 481
426#ifdef CONFIG_NVM_DEBUG 482#ifdef CONFIG_NVM_DEBUG
427 atomic_long_add(valid_secs, &pblk->inflight_reads); 483 atomic_long_add(valid_secs, &pblk->inflight_reads);
428#endif 484#endif
485
429 return valid_secs; 486 return valid_secs;
430} 487}
431 488
432static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, 489static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
433 struct pblk_line *line, sector_t lba) 490 struct pblk_line *line, sector_t lba,
491 u64 paddr_gc)
434{ 492{
435 struct ppa_addr ppa; 493 struct ppa_addr ppa_l2p, ppa_gc;
436 int valid_secs = 0; 494 int valid_secs = 0;
437 495
438 if (lba == ADDR_EMPTY) 496 if (lba == ADDR_EMPTY)
@@ -445,15 +503,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
445 } 503 }
446 504
447 spin_lock(&pblk->trans_lock); 505 spin_lock(&pblk->trans_lock);
448 ppa = pblk_trans_map_get(pblk, lba); 506 ppa_l2p = pblk_trans_map_get(pblk, lba);
449 spin_unlock(&pblk->trans_lock); 507 spin_unlock(&pblk->trans_lock);
450 508
451 /* Ignore updated values until the moment */ 509 ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id);
452 if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || 510 if (!pblk_ppa_comp(ppa_l2p, ppa_gc))
453 pblk_ppa_empty(ppa))
454 goto out; 511 goto out;
455 512
456 rqd->ppa_addr = ppa; 513 rqd->ppa_addr = ppa_l2p;
457 valid_secs = 1; 514 valid_secs = 1;
458 515
459#ifdef CONFIG_NVM_DEBUG 516#ifdef CONFIG_NVM_DEBUG
@@ -464,42 +521,44 @@ out:
464 return valid_secs; 521 return valid_secs;
465} 522}
466 523
467int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, 524int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
468 unsigned int nr_secs, unsigned int *secs_to_gc,
469 struct pblk_line *line)
470{ 525{
471 struct nvm_tgt_dev *dev = pblk->dev; 526 struct nvm_tgt_dev *dev = pblk->dev;
472 struct nvm_geo *geo = &dev->geo; 527 struct nvm_geo *geo = &dev->geo;
473 struct bio *bio; 528 struct bio *bio;
474 struct nvm_rq rqd; 529 struct nvm_rq rqd;
475 int ret, data_len; 530 int data_len;
476 DECLARE_COMPLETION_ONSTACK(wait); 531 int ret = NVM_IO_OK;
477 532
478 memset(&rqd, 0, sizeof(struct nvm_rq)); 533 memset(&rqd, 0, sizeof(struct nvm_rq));
479 534
480 rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, 535 rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
481 &rqd.dma_meta_list); 536 &rqd.dma_meta_list);
482 if (!rqd.meta_list) 537 if (!rqd.meta_list)
483 return NVM_IO_ERR; 538 return -ENOMEM;
484 539
485 if (nr_secs > 1) { 540 if (gc_rq->nr_secs > 1) {
486 rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; 541 rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
487 rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; 542 rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
488 543
489 *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, 544 gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line,
490 nr_secs); 545 gc_rq->lba_list,
491 if (*secs_to_gc == 1) 546 gc_rq->paddr_list,
547 gc_rq->nr_secs);
548 if (gc_rq->secs_to_gc == 1)
492 rqd.ppa_addr = rqd.ppa_list[0]; 549 rqd.ppa_addr = rqd.ppa_list[0];
493 } else { 550 } else {
494 *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); 551 gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line,
552 gc_rq->lba_list[0],
553 gc_rq->paddr_list[0]);
495 } 554 }
496 555
497 if (!(*secs_to_gc)) 556 if (!(gc_rq->secs_to_gc))
498 goto out; 557 goto out;
499 558
500 data_len = (*secs_to_gc) * geo->sec_size; 559 data_len = (gc_rq->secs_to_gc) * geo->sec_size;
501 bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, 560 bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
502 PBLK_KMALLOC_META, GFP_KERNEL); 561 PBLK_VMALLOC_META, GFP_KERNEL);
503 if (IS_ERR(bio)) { 562 if (IS_ERR(bio)) {
504 pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); 563 pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
505 goto err_free_dma; 564 goto err_free_dma;
@@ -509,23 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
509 bio_set_op_attrs(bio, REQ_OP_READ, 0); 568 bio_set_op_attrs(bio, REQ_OP_READ, 0);
510 569
511 rqd.opcode = NVM_OP_PREAD; 570 rqd.opcode = NVM_OP_PREAD;
512 rqd.end_io = pblk_end_io_sync; 571 rqd.nr_ppas = gc_rq->secs_to_gc;
513 rqd.private = &wait;
514 rqd.nr_ppas = *secs_to_gc;
515 rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); 572 rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
516 rqd.bio = bio; 573 rqd.bio = bio;
517 574
518 ret = pblk_submit_read_io(pblk, &rqd); 575 if (pblk_submit_io_sync(pblk, &rqd)) {
519 if (ret) { 576 ret = -EIO;
520 bio_endio(bio);
521 pr_err("pblk: GC read request failed\n"); 577 pr_err("pblk: GC read request failed\n");
522 goto err_free_dma; 578 goto err_free_bio;
523 } 579 }
524 580
525 if (!wait_for_completion_io_timeout(&wait,
526 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
527 pr_err("pblk: GC read I/O timed out\n");
528 }
529 atomic_dec(&pblk->inflight_io); 581 atomic_dec(&pblk->inflight_io);
530 582
531 if (rqd.error) { 583 if (rqd.error) {
@@ -536,16 +588,18 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
536 } 588 }
537 589
538#ifdef CONFIG_NVM_DEBUG 590#ifdef CONFIG_NVM_DEBUG
539 atomic_long_add(*secs_to_gc, &pblk->sync_reads); 591 atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
540 atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); 592 atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
541 atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); 593 atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
542#endif 594#endif
543 595
544out: 596out:
545 nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); 597 nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
546 return NVM_IO_OK; 598 return ret;
547 599
600err_free_bio:
601 bio_put(bio);
548err_free_dma: 602err_free_dma:
549 nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); 603 nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
550 return NVM_IO_ERR; 604 return ret;
551} 605}
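
The partial-read path above now stashes the per-sector LBAs in the spare tail of the DMA-allocated ppa_list buffer, submits the hole read with pblk_submit_io_sync(), and then copies each page of the internal bio back into the original bio at every zero bit of read_bitmap. Below is a rough userspace sketch of that bitmap walk only; next_zero_bit() and the fixed 64-bit map are stand-ins for the kernel's find_first_zero_bit()/find_next_zero_bit(), which is enough here because pblk caps a request at 64 sectors (PBLK_MAX_REQ_ADDRS).

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for find_first_zero_bit()/find_next_zero_bit() on a
 * single 64-bit word; pblk requests carry at most 64 sectors. */
static unsigned int next_zero_bit(uint64_t map, unsigned int nbits,
				  unsigned int start)
{
	for (unsigned int i = start; i < nbits; i++)
		if (!(map & (1ULL << i)))
			return i;
	return nbits;	/* no hole left */
}

int main(void)
{
	uint64_t read_bitmap = 0xadULL;	/* 1-bits: sectors served from cache */
	unsigned int nr_secs = 8;
	unsigned int i = 0;		/* index into the internal (hole) bio */
	unsigned int hole = next_zero_bit(read_bitmap, nr_secs, 0);

	/* Mirror of the do/while loop in pblk_fill_partial_read_bio(): one
	 * page of the internal bio goes into each hole of the original bio;
	 * here the page copy is replaced by a printout of the mapping. */
	do {
		printf("internal vec %u -> original sector %u\n", i++, hole);
		hole = next_zero_bit(read_bitmap, nr_secs, hole + 1);
	} while (hole < nr_secs);

	return 0;
}

With read_bitmap = 0xad the holes are sectors 1, 4 and 6, so the three pages of the internal bio land in exactly those slots of the caller's bio, which is the property the kernel loop relies on.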
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index cb556e06673e..eadb3eb5d4dc 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work)
34 max_secs); 34 max_secs);
35 35
36 bio = bio_alloc(GFP_KERNEL, nr_rec_secs); 36 bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
37 if (!bio) {
38 pr_err("pblk: not able to create recovery bio\n");
39 return;
40 }
41 37
42 bio->bi_iter.bi_sector = 0; 38 bio->bi_iter.bi_sector = 0;
43 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 39 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -71,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work)
71 67
72err: 68err:
73 bio_put(bio); 69 bio_put(bio);
74 pblk_free_rqd(pblk, rqd, WRITE); 70 pblk_free_rqd(pblk, rqd, PBLK_WRITE);
75} 71}
76 72
77int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, 73int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
@@ -84,12 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
84 struct pblk_c_ctx *rec_ctx; 80 struct pblk_c_ctx *rec_ctx;
85 int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; 81 int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
86 82
87 rec_rqd = pblk_alloc_rqd(pblk, WRITE); 83 rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
88 if (IS_ERR(rec_rqd)) {
89 pr_err("pblk: could not create recovery req.\n");
90 return -ENOMEM;
91 }
92
93 rec_ctx = nvm_rq_to_pdu(rec_rqd); 84 rec_ctx = nvm_rq_to_pdu(rec_rqd);
94 85
95 /* Copy completion bitmap, but exclude the first X completed entries */ 86 /* Copy completion bitmap, but exclude the first X completed entries */
@@ -142,19 +133,19 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
142 struct pblk_emeta *emeta = line->emeta; 133 struct pblk_emeta *emeta = line->emeta;
143 struct line_emeta *emeta_buf = emeta->buf; 134 struct line_emeta *emeta_buf = emeta->buf;
144 __le64 *lba_list; 135 __le64 *lba_list;
145 int data_start; 136 u64 data_start, data_end;
146 int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; 137 u64 nr_valid_lbas, nr_lbas = 0;
147 int i; 138 u64 i;
148 139
149 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); 140 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
150 if (!lba_list) 141 if (!lba_list)
151 return 1; 142 return 1;
152 143
153 data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; 144 data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
154 nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; 145 data_end = line->emeta_ssec;
155 nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); 146 nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas);
156 147
157 for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { 148 for (i = data_start; i < data_end; i++) {
158 struct ppa_addr ppa; 149 struct ppa_addr ppa;
159 int pos; 150 int pos;
160 151
@@ -181,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
181 } 172 }
182 173
183 if (nr_valid_lbas != nr_lbas) 174 if (nr_valid_lbas != nr_lbas)
184 pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", 175 pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n",
185 line->id, emeta_buf->nr_valid_lbas, nr_lbas); 176 line->id, nr_valid_lbas, nr_lbas);
186 177
187 line->left_msecs = 0; 178 line->left_msecs = 0;
188 179
@@ -225,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
225 int rq_ppas, rq_len; 216 int rq_ppas, rq_len;
226 int i, j; 217 int i, j;
227 int ret = 0; 218 int ret = 0;
228 DECLARE_COMPLETION_ONSTACK(wait);
229 219
230 ppa_list = p.ppa_list; 220 ppa_list = p.ppa_list;
231 meta_list = p.meta_list; 221 meta_list = p.meta_list;
@@ -262,8 +252,6 @@ next_read_rq:
262 rqd->ppa_list = ppa_list; 252 rqd->ppa_list = ppa_list;
263 rqd->dma_ppa_list = dma_ppa_list; 253 rqd->dma_ppa_list = dma_ppa_list;
264 rqd->dma_meta_list = dma_meta_list; 254 rqd->dma_meta_list = dma_meta_list;
265 rqd->end_io = pblk_end_io_sync;
266 rqd->private = &wait;
267 255
268 if (pblk_io_aligned(pblk, rq_ppas)) 256 if (pblk_io_aligned(pblk, rq_ppas))
269 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 257 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -289,19 +277,13 @@ next_read_rq:
289 } 277 }
290 278
291 /* If read fails, more padding is needed */ 279 /* If read fails, more padding is needed */
292 ret = pblk_submit_io(pblk, rqd); 280 ret = pblk_submit_io_sync(pblk, rqd);
293 if (ret) { 281 if (ret) {
294 pr_err("pblk: I/O submission failed: %d\n", ret); 282 pr_err("pblk: I/O submission failed: %d\n", ret);
295 return ret; 283 return ret;
296 } 284 }
297 285
298 if (!wait_for_completion_io_timeout(&wait,
299 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
300 pr_err("pblk: L2P recovery read timed out\n");
301 return -EINTR;
302 }
303 atomic_dec(&pblk->inflight_io); 286 atomic_dec(&pblk->inflight_io);
304 reinit_completion(&wait);
305 287
306 /* At this point, the read should not fail. If it does, it is a problem 288 /* At this point, the read should not fail. If it does, it is a problem
307 * we cannot recover from here. Need FTL log. 289 * we cannot recover from here. Need FTL log.
@@ -338,13 +320,10 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
338{ 320{
339 struct pblk_pad_rq *pad_rq = rqd->private; 321 struct pblk_pad_rq *pad_rq = rqd->private;
340 struct pblk *pblk = pad_rq->pblk; 322 struct pblk *pblk = pad_rq->pblk;
341 struct nvm_tgt_dev *dev = pblk->dev;
342 323
343 pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 324 pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
344 325
345 bio_put(rqd->bio); 326 pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
346 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
347 pblk_free_rqd(pblk, rqd, WRITE);
348 327
349 atomic_dec(&pblk->inflight_io); 328 atomic_dec(&pblk->inflight_io);
350 kref_put(&pad_rq->ref, pblk_recov_complete); 329 kref_put(&pad_rq->ref, pblk_recov_complete);
@@ -404,25 +383,21 @@ next_pad_rq:
404 ppa_list = (void *)(meta_list) + pblk_dma_meta_size; 383 ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
405 dma_ppa_list = dma_meta_list + pblk_dma_meta_size; 384 dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
406 385
407 rqd = pblk_alloc_rqd(pblk, WRITE);
408 if (IS_ERR(rqd)) {
409 ret = PTR_ERR(rqd);
410 goto fail_free_meta;
411 }
412
413 bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, 386 bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
414 PBLK_VMALLOC_META, GFP_KERNEL); 387 PBLK_VMALLOC_META, GFP_KERNEL);
415 if (IS_ERR(bio)) { 388 if (IS_ERR(bio)) {
416 ret = PTR_ERR(bio); 389 ret = PTR_ERR(bio);
417 goto fail_free_rqd; 390 goto fail_free_meta;
418 } 391 }
419 392
420 bio->bi_iter.bi_sector = 0; /* internal bio */ 393 bio->bi_iter.bi_sector = 0; /* internal bio */
421 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 394 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
422 395
396 rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
397
423 rqd->bio = bio; 398 rqd->bio = bio;
424 rqd->opcode = NVM_OP_PWRITE; 399 rqd->opcode = NVM_OP_PWRITE;
425 rqd->flags = pblk_set_progr_mode(pblk, WRITE); 400 rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
426 rqd->meta_list = meta_list; 401 rqd->meta_list = meta_list;
427 rqd->nr_ppas = rq_ppas; 402 rqd->nr_ppas = rq_ppas;
428 rqd->ppa_list = ppa_list; 403 rqd->ppa_list = ppa_list;
@@ -490,8 +465,6 @@ free_rq:
490 465
491fail_free_bio: 466fail_free_bio:
492 bio_put(bio); 467 bio_put(bio);
493fail_free_rqd:
494 pblk_free_rqd(pblk, rqd, WRITE);
495fail_free_meta: 468fail_free_meta:
496 nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); 469 nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
497fail_free_pad: 470fail_free_pad:
@@ -522,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
522 int ret = 0; 495 int ret = 0;
523 int rec_round; 496 int rec_round;
524 int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; 497 int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
525 DECLARE_COMPLETION_ONSTACK(wait);
526 498
527 ppa_list = p.ppa_list; 499 ppa_list = p.ppa_list;
528 meta_list = p.meta_list; 500 meta_list = p.meta_list;
@@ -557,8 +529,6 @@ next_rq:
557 rqd->ppa_list = ppa_list; 529 rqd->ppa_list = ppa_list;
558 rqd->dma_ppa_list = dma_ppa_list; 530 rqd->dma_ppa_list = dma_ppa_list;
559 rqd->dma_meta_list = dma_meta_list; 531 rqd->dma_meta_list = dma_meta_list;
560 rqd->end_io = pblk_end_io_sync;
561 rqd->private = &wait;
562 532
563 if (pblk_io_aligned(pblk, rq_ppas)) 533 if (pblk_io_aligned(pblk, rq_ppas))
564 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 534 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -584,18 +554,13 @@ next_rq:
584 addr_to_gen_ppa(pblk, w_ptr, line->id); 554 addr_to_gen_ppa(pblk, w_ptr, line->id);
585 } 555 }
586 556
587 ret = pblk_submit_io(pblk, rqd); 557 ret = pblk_submit_io_sync(pblk, rqd);
588 if (ret) { 558 if (ret) {
589 pr_err("pblk: I/O submission failed: %d\n", ret); 559 pr_err("pblk: I/O submission failed: %d\n", ret);
590 return ret; 560 return ret;
591 } 561 }
592 562
593 if (!wait_for_completion_io_timeout(&wait,
594 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
595 pr_err("pblk: L2P recovery read timed out\n");
596 }
597 atomic_dec(&pblk->inflight_io); 563 atomic_dec(&pblk->inflight_io);
598 reinit_completion(&wait);
599 564
600 /* This should not happen since the read failed during normal recovery, 565 /* This should not happen since the read failed during normal recovery,
601 * but the media works funny sometimes... 566 * but the media works funny sometimes...
@@ -663,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
663 int i, j; 628 int i, j;
664 int ret = 0; 629 int ret = 0;
665 int left_ppas = pblk_calc_sec_in_line(pblk, line); 630 int left_ppas = pblk_calc_sec_in_line(pblk, line);
666 DECLARE_COMPLETION_ONSTACK(wait);
667 631
668 ppa_list = p.ppa_list; 632 ppa_list = p.ppa_list;
669 meta_list = p.meta_list; 633 meta_list = p.meta_list;
@@ -696,8 +660,6 @@ next_rq:
696 rqd->ppa_list = ppa_list; 660 rqd->ppa_list = ppa_list;
697 rqd->dma_ppa_list = dma_ppa_list; 661 rqd->dma_ppa_list = dma_ppa_list;
698 rqd->dma_meta_list = dma_meta_list; 662 rqd->dma_meta_list = dma_meta_list;
699 rqd->end_io = pblk_end_io_sync;
700 rqd->private = &wait;
701 663
702 if (pblk_io_aligned(pblk, rq_ppas)) 664 if (pblk_io_aligned(pblk, rq_ppas))
703 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 665 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -723,19 +685,14 @@ next_rq:
723 addr_to_gen_ppa(pblk, paddr, line->id); 685 addr_to_gen_ppa(pblk, paddr, line->id);
724 } 686 }
725 687
726 ret = pblk_submit_io(pblk, rqd); 688 ret = pblk_submit_io_sync(pblk, rqd);
727 if (ret) { 689 if (ret) {
728 pr_err("pblk: I/O submission failed: %d\n", ret); 690 pr_err("pblk: I/O submission failed: %d\n", ret);
729 bio_put(bio); 691 bio_put(bio);
730 return ret; 692 return ret;
731 } 693 }
732 694
733 if (!wait_for_completion_io_timeout(&wait,
734 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
735 pr_err("pblk: L2P recovery read timed out\n");
736 }
737 atomic_dec(&pblk->inflight_io); 695 atomic_dec(&pblk->inflight_io);
738 reinit_completion(&wait);
739 696
740 /* Reached the end of the written line */ 697 /* Reached the end of the written line */
741 if (rqd->error) { 698 if (rqd->error) {
@@ -785,15 +742,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
785 dma_addr_t dma_ppa_list, dma_meta_list; 742 dma_addr_t dma_ppa_list, dma_meta_list;
786 int done, ret = 0; 743 int done, ret = 0;
787 744
788 rqd = pblk_alloc_rqd(pblk, READ);
789 if (IS_ERR(rqd))
790 return PTR_ERR(rqd);
791
792 meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); 745 meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
793 if (!meta_list) { 746 if (!meta_list)
794 ret = -ENOMEM; 747 return -ENOMEM;
795 goto free_rqd;
796 }
797 748
798 ppa_list = (void *)(meta_list) + pblk_dma_meta_size; 749 ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
799 dma_ppa_list = dma_meta_list + pblk_dma_meta_size; 750 dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
@@ -804,6 +755,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
804 goto free_meta_list; 755 goto free_meta_list;
805 } 756 }
806 757
758 rqd = pblk_alloc_rqd(pblk, PBLK_READ);
759
807 p.ppa_list = ppa_list; 760 p.ppa_list = ppa_list;
808 p.meta_list = meta_list; 761 p.meta_list = meta_list;
809 p.rqd = rqd; 762 p.rqd = rqd;
@@ -832,8 +785,6 @@ out:
832 kfree(data); 785 kfree(data);
833free_meta_list: 786free_meta_list:
834 nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); 787 nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
835free_rqd:
836 pblk_free_rqd(pblk, rqd, READ);
837 788
838 return ret; 789 return ret;
839} 790}
@@ -851,11 +802,33 @@ static void pblk_recov_line_add_ordered(struct list_head *head,
851 __list_add(&line->list, t->list.prev, &t->list); 802 __list_add(&line->list, t->list.prev, &t->list);
852} 803}
853 804
854struct pblk_line *pblk_recov_l2p(struct pblk *pblk) 805static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
855{ 806{
856 struct nvm_tgt_dev *dev = pblk->dev; 807 struct nvm_tgt_dev *dev = pblk->dev;
857 struct nvm_geo *geo = &dev->geo; 808 struct nvm_geo *geo = &dev->geo;
858 struct pblk_line_meta *lm = &pblk->lm; 809 struct pblk_line_meta *lm = &pblk->lm;
810 unsigned int emeta_secs;
811 u64 emeta_start;
812 struct ppa_addr ppa;
813 int pos;
814
815 emeta_secs = lm->emeta_sec[0];
816 emeta_start = lm->sec_per_line;
817
818 while (emeta_secs) {
819 emeta_start--;
820 ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id);
821 pos = pblk_ppa_to_pos(geo, ppa);
822 if (!test_bit(pos, line->blk_bitmap))
823 emeta_secs--;
824 }
825
826 return emeta_start;
827}
828
829struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
830{
831 struct pblk_line_meta *lm = &pblk->lm;
859 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 832 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
860 struct pblk_line *line, *tline, *data_line = NULL; 833 struct pblk_line *line, *tline, *data_line = NULL;
861 struct pblk_smeta *smeta; 834 struct pblk_smeta *smeta;
@@ -900,9 +873,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
900 if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) 873 if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
901 continue; 874 continue;
902 875
903 if (le16_to_cpu(smeta_buf->header.version) != 1) { 876 if (smeta_buf->header.version != SMETA_VERSION) {
904 pr_err("pblk: found incompatible line version %u\n", 877 pr_err("pblk: found incompatible line version %u\n",
905 smeta_buf->header.version); 878 le16_to_cpu(smeta_buf->header.version));
906 return ERR_PTR(-EINVAL); 879 return ERR_PTR(-EINVAL);
907 } 880 }
908 881
@@ -954,15 +927,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
954 927
955 /* Verify closed blocks and recover this portion of L2P table*/ 928 /* Verify closed blocks and recover this portion of L2P table*/
956 list_for_each_entry_safe(line, tline, &recov_list, list) { 929 list_for_each_entry_safe(line, tline, &recov_list, list) {
957 int off, nr_bb;
958
959 recovered_lines++; 930 recovered_lines++;
960 /* Calculate where emeta starts based on the line bb */
961 off = lm->sec_per_line - lm->emeta_sec[0];
962 nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
963 off -= nr_bb * geo->sec_per_pl;
964 931
965 line->emeta_ssec = off; 932 line->emeta_ssec = pblk_line_emeta_start(pblk, line);
966 line->emeta = emeta; 933 line->emeta = emeta;
967 memset(line->emeta->buf, 0, lm->emeta_len[0]); 934 memset(line->emeta->buf, 0, lm->emeta_len[0]);
968 935
@@ -987,7 +954,7 @@ next:
987 list_move_tail(&line->list, move_list); 954 list_move_tail(&line->list, move_list);
988 spin_unlock(&l_mg->gc_lock); 955 spin_unlock(&l_mg->gc_lock);
989 956
990 mempool_free(line->map_bitmap, pblk->line_meta_pool); 957 kfree(line->map_bitmap);
991 line->map_bitmap = NULL; 958 line->map_bitmap = NULL;
992 line->smeta = NULL; 959 line->smeta = NULL;
993 line->emeta = NULL; 960 line->emeta = NULL;
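
Throughout the recovery code the pattern of declaring an on-stack completion, setting rqd->end_io = pblk_end_io_sync and waiting with a timeout is replaced by a single pblk_submit_io_sync() call. The sketch below is only a generic userspace analogue of folding that submit-plus-wait into one blocking helper (the real helper lives in pblk-core.c, outside this excerpt); struct fake_rq, io_worker() and submit_io_sync() are invented names, not pblk or lightnvm API.

#include <pthread.h>
#include <stdio.h>

/* Minimal analogue of turning "submit async + wait on an on-stack
 * completion" into one blocking call. */
struct fake_rq {
	int error;
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void *io_worker(void *arg)
{
	struct fake_rq *rq = arg;

	/* The "device" completes the request. */
	pthread_mutex_lock(&rq->lock);
	rq->error = 0;
	rq->done = 1;
	pthread_cond_signal(&rq->cond);
	pthread_mutex_unlock(&rq->lock);
	return NULL;
}

/* Submit and wait: callers no longer manage their own completion. */
static int submit_io_sync(struct fake_rq *rq)
{
	pthread_t t;

	if (pthread_create(&t, NULL, io_worker, rq))
		return -1;

	pthread_mutex_lock(&rq->lock);
	while (!rq->done)
		pthread_cond_wait(&rq->cond, &rq->lock);
	pthread_mutex_unlock(&rq->lock);

	pthread_join(t, NULL);
	return rq->error;
}

int main(void)
{
	struct fake_rq rq = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};

	printf("sync submit returned %d\n", submit_io_sync(&rq));
	return 0;
}

Centralising the wait this way is what lets the hunks above delete every DECLARE_COMPLETION_ONSTACK(), reinit_completion() and timeout-handling block from the callers.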
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index 2e6a5361baf0..abae31fd434e 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
96 * 96 *
97 * Only the total number of free blocks is used to configure the rate limiter. 97 * Only the total number of free blocks is used to configure the rate limiter.
98 */ 98 */
99static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) 99void pblk_rl_update_rates(struct pblk_rl *rl)
100{ 100{
101 struct pblk *pblk = container_of(rl, struct pblk, rl);
101 unsigned long free_blocks = pblk_rl_nr_free_blks(rl); 102 unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
103 int max = rl->rb_budget;
102 104
103 if (free_blocks >= rl->high) { 105 if (free_blocks >= rl->high) {
104 rl->rb_user_max = max; 106 rl->rb_user_max = max;
@@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
124 rl->rb_state = PBLK_RL_LOW; 126 rl->rb_state = PBLK_RL_LOW;
125 } 127 }
126 128
127 return rl->rb_state; 129 if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW))
130 pblk_gc_should_start(pblk);
131 else
132 pblk_gc_should_stop(pblk);
128} 133}
129 134
130void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) 135void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
131{ 136{
132 struct pblk *pblk = container_of(rl, struct pblk, rl);
133 int blk_in_line = atomic_read(&line->blk_in_line); 137 int blk_in_line = atomic_read(&line->blk_in_line);
134 int ret;
135 138
136 atomic_add(blk_in_line, &rl->free_blocks); 139 atomic_add(blk_in_line, &rl->free_blocks);
137 /* Rates will not change that often - no need to lock update */ 140 pblk_rl_update_rates(rl);
138 ret = pblk_rl_update_rates(rl, rl->rb_budget);
139
140 if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
141 pblk_gc_should_start(pblk);
142 else
143 pblk_gc_should_stop(pblk);
144} 141}
145 142
146void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) 143void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
@@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
148 int blk_in_line = atomic_read(&line->blk_in_line); 145 int blk_in_line = atomic_read(&line->blk_in_line);
149 146
150 atomic_sub(blk_in_line, &rl->free_blocks); 147 atomic_sub(blk_in_line, &rl->free_blocks);
151} 148 pblk_rl_update_rates(rl);
152
153void pblk_gc_should_kick(struct pblk *pblk)
154{
155 struct pblk_rl *rl = &pblk->rl;
156 int ret;
157
158 /* Rates will not change that often - no need to lock update */
159 ret = pblk_rl_update_rates(rl, rl->rb_budget);
160 if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
161 pblk_gc_should_start(pblk);
162 else
163 pblk_gc_should_stop(pblk);
164} 149}
165 150
166int pblk_rl_high_thrs(struct pblk_rl *rl) 151int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -168,14 +153,9 @@ int pblk_rl_high_thrs(struct pblk_rl *rl)
168 return rl->high; 153 return rl->high;
169} 154}
170 155
171int pblk_rl_low_thrs(struct pblk_rl *rl) 156int pblk_rl_max_io(struct pblk_rl *rl)
172{
173 return rl->low;
174}
175
176int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
177{ 157{
178 return rl->rb_user_max; 158 return rl->rb_max_io;
179} 159}
180 160
181static void pblk_rl_u_timer(unsigned long data) 161static void pblk_rl_u_timer(unsigned long data)
@@ -214,6 +194,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
214 /* To start with, all buffer is available to user I/O writers */ 194 /* To start with, all buffer is available to user I/O writers */
215 rl->rb_budget = budget; 195 rl->rb_budget = budget;
216 rl->rb_user_max = budget; 196 rl->rb_user_max = budget;
197 rl->rb_max_io = budget >> 1;
217 rl->rb_gc_max = 0; 198 rl->rb_gc_max = 0;
218 rl->rb_state = PBLK_RL_HIGH; 199 rl->rb_state = PBLK_RL_HIGH;
219 200
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 95fb434e2f01..cd49e8875d4e 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
253 sz += snprintf(page + sz, PAGE_SIZE - sz, 253 sz += snprintf(page + sz, PAGE_SIZE - sz,
254 "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", 254 "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
255 gc_full, gc_high, gc_mid, gc_low, gc_empty, 255 gc_full, gc_high, gc_mid, gc_low, gc_empty,
256 atomic_read(&pblk->gc.inflight_gc)); 256 atomic_read(&pblk->gc.read_inflight_gc));
257 257
258 sz += snprintf(page + sz, PAGE_SIZE - sz, 258 sz += snprintf(page + sz, PAGE_SIZE - sz,
259 "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", 259 "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3ad9e56d2473..6c1cafafef53 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -20,7 +20,6 @@
20static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, 20static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
21 struct pblk_c_ctx *c_ctx) 21 struct pblk_c_ctx *c_ctx)
22{ 22{
23 struct nvm_tgt_dev *dev = pblk->dev;
24 struct bio *original_bio; 23 struct bio *original_bio;
25 unsigned long ret; 24 unsigned long ret;
26 int i; 25 int i;
@@ -33,16 +32,18 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
33 bio_endio(original_bio); 32 bio_endio(original_bio);
34 } 33 }
35 34
35 if (c_ctx->nr_padded)
36 pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
37 c_ctx->nr_padded);
38
36#ifdef CONFIG_NVM_DEBUG 39#ifdef CONFIG_NVM_DEBUG
37 atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); 40 atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
38#endif 41#endif
39 42
40 ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); 43 ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
41 44
42 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
43
44 bio_put(rqd->bio); 45 bio_put(rqd->bio);
45 pblk_free_rqd(pblk, rqd, WRITE); 46 pblk_free_rqd(pblk, rqd, PBLK_WRITE);
46 47
47 return ret; 48 return ret;
48} 49}
@@ -107,10 +108,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
107 ppa_list = &rqd->ppa_addr; 108 ppa_list = &rqd->ppa_addr;
108 109
109 recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); 110 recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
110 if (!recovery) { 111
111 pr_err("pblk: could not allocate recovery context\n");
112 return;
113 }
114 INIT_LIST_HEAD(&recovery->failed); 112 INIT_LIST_HEAD(&recovery->failed);
115 113
116 bit = -1; 114 bit = -1;
@@ -175,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
175static void pblk_end_io_write_meta(struct nvm_rq *rqd) 173static void pblk_end_io_write_meta(struct nvm_rq *rqd)
176{ 174{
177 struct pblk *pblk = rqd->private; 175 struct pblk *pblk = rqd->private;
178 struct nvm_tgt_dev *dev = pblk->dev;
179 struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); 176 struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
180 struct pblk_line *line = m_ctx->private; 177 struct pblk_line *line = m_ctx->private;
181 struct pblk_emeta *emeta = line->emeta; 178 struct pblk_emeta *emeta = line->emeta;
@@ -187,19 +184,13 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
187 pblk_log_write_err(pblk, rqd); 184 pblk_log_write_err(pblk, rqd);
188 pr_err("pblk: metadata I/O failed. Line %d\n", line->id); 185 pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
189 } 186 }
190#ifdef CONFIG_NVM_DEBUG
191 else
192 WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
193#endif
194 187
195 sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); 188 sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
196 if (sync == emeta->nr_entries) 189 if (sync == emeta->nr_entries)
197 pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, 190 pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
198 pblk->close_wq); 191 GFP_ATOMIC, pblk->close_wq);
199 192
200 bio_put(rqd->bio); 193 pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
201 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
202 pblk_free_rqd(pblk, rqd, READ);
203 194
204 atomic_dec(&pblk->inflight_io); 195 atomic_dec(&pblk->inflight_io);
205} 196}
@@ -213,7 +204,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
213 /* Setup write request */ 204 /* Setup write request */
214 rqd->opcode = NVM_OP_PWRITE; 205 rqd->opcode = NVM_OP_PWRITE;
215 rqd->nr_ppas = nr_secs; 206 rqd->nr_ppas = nr_secs;
216 rqd->flags = pblk_set_progr_mode(pblk, WRITE); 207 rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
217 rqd->private = pblk; 208 rqd->private = pblk;
218 rqd->end_io = end_io; 209 rqd->end_io = end_io;
219 210
@@ -229,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
229} 220}
230 221
231static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, 222static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
232 struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) 223 struct ppa_addr *erase_ppa)
233{ 224{
234 struct pblk_line_meta *lm = &pblk->lm; 225 struct pblk_line_meta *lm = &pblk->lm;
235 struct pblk_line *e_line = pblk_line_get_erase(pblk); 226 struct pblk_line *e_line = pblk_line_get_erase(pblk);
227 struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
236 unsigned int valid = c_ctx->nr_valid; 228 unsigned int valid = c_ctx->nr_valid;
237 unsigned int padded = c_ctx->nr_padded; 229 unsigned int padded = c_ctx->nr_padded;
238 unsigned int nr_secs = valid + padded; 230 unsigned int nr_secs = valid + padded;
239 unsigned long *lun_bitmap; 231 unsigned long *lun_bitmap;
240 int ret = 0; 232 int ret;
241 233
242 lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); 234 lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
243 if (!lun_bitmap) 235 if (!lun_bitmap)
@@ -279,7 +271,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
279 pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); 271 pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
280 272
281 rqd->ppa_status = (u64)0; 273 rqd->ppa_status = (u64)0;
282 rqd->flags = pblk_set_progr_mode(pblk, WRITE); 274 rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
283 275
284 return ret; 276 return ret;
285} 277}
@@ -303,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
303 return secs_to_sync; 295 return secs_to_sync;
304} 296}
305 297
306static inline int pblk_valid_meta_ppa(struct pblk *pblk,
307 struct pblk_line *meta_line,
308 struct ppa_addr *ppa_list, int nr_ppas)
309{
310 struct nvm_tgt_dev *dev = pblk->dev;
311 struct nvm_geo *geo = &dev->geo;
312 struct pblk_line *data_line;
313 struct ppa_addr ppa, ppa_opt;
314 u64 paddr;
315 int i;
316
317 data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])];
318 paddr = pblk_lookup_page(pblk, meta_line);
319 ppa = addr_to_gen_ppa(pblk, paddr, 0);
320
321 if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap))
322 return 1;
323
324 /* Schedule a metadata I/O that is half the distance from the data I/O
325 * with regards to the number of LUNs forming the pblk instance. This
326 * balances LUN conflicts across every I/O.
327 *
328 * When the LUN configuration changes (e.g., due to GC), this distance
329 * can align, which would result on a LUN deadlock. In this case, modify
330 * the distance to not be optimal, but allow metadata I/Os to succeed.
331 */
332 ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
333 if (unlikely(ppa_opt.ppa == ppa.ppa)) {
334 data_line->meta_distance--;
335 return 0;
336 }
337
338 for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
339 if (ppa_list[i].g.ch == ppa_opt.g.ch &&
340 ppa_list[i].g.lun == ppa_opt.g.lun)
341 return 1;
342
343 if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) {
344 for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
345 if (ppa_list[i].g.ch == ppa.g.ch &&
346 ppa_list[i].g.lun == ppa.g.lun)
347 return 0;
348
349 return 1;
350 }
351
352 return 0;
353}
354
355int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) 298int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
356{ 299{
357 struct nvm_tgt_dev *dev = pblk->dev; 300 struct nvm_tgt_dev *dev = pblk->dev;
@@ -370,11 +313,8 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
370 int i, j; 313 int i, j;
371 int ret; 314 int ret;
372 315
373 rqd = pblk_alloc_rqd(pblk, READ); 316 rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
374 if (IS_ERR(rqd)) { 317
375 pr_err("pblk: cannot allocate write req.\n");
376 return PTR_ERR(rqd);
377 }
378 m_ctx = nvm_rq_to_pdu(rqd); 318 m_ctx = nvm_rq_to_pdu(rqd);
379 m_ctx->private = meta_line; 319 m_ctx->private = meta_line;
380 320
@@ -407,8 +347,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
407 if (emeta->mem >= lm->emeta_len[0]) { 347 if (emeta->mem >= lm->emeta_len[0]) {
408 spin_lock(&l_mg->close_lock); 348 spin_lock(&l_mg->close_lock);
409 list_del(&meta_line->list); 349 list_del(&meta_line->list);
410 WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line),
411 "pblk: corrupt meta line %d\n", meta_line->id);
412 spin_unlock(&l_mg->close_lock); 350 spin_unlock(&l_mg->close_lock);
413 } 351 }
414 352
@@ -428,18 +366,51 @@ fail_rollback:
428 pblk_dealloc_page(pblk, meta_line, rq_ppas); 366 pblk_dealloc_page(pblk, meta_line, rq_ppas);
429 list_add(&meta_line->list, &meta_line->list); 367 list_add(&meta_line->list, &meta_line->list);
430 spin_unlock(&l_mg->close_lock); 368 spin_unlock(&l_mg->close_lock);
431
432 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
433fail_free_bio: 369fail_free_bio:
434 if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) 370 bio_put(bio);
435 bio_put(bio);
436fail_free_rqd: 371fail_free_rqd:
437 pblk_free_rqd(pblk, rqd, READ); 372 pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
438 return ret; 373 return ret;
439} 374}
440 375
441static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, 376static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
442 int prev_n) 377 struct pblk_line *meta_line,
378 struct nvm_rq *data_rqd)
379{
380 struct nvm_tgt_dev *dev = pblk->dev;
381 struct nvm_geo *geo = &dev->geo;
382 struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
383 struct pblk_line *data_line = pblk_line_get_data(pblk);
384 struct ppa_addr ppa, ppa_opt;
385 u64 paddr;
386 int pos_opt;
387
388 /* Schedule a metadata I/O that is half the distance from the data I/O
389 * with regards to the number of LUNs forming the pblk instance. This
390 * balances LUN conflicts across every I/O.
391 *
392 * When the LUN configuration changes (e.g., due to GC), this distance
393 * can align, which would result on metadata and data I/Os colliding. In
394 * this case, modify the distance to not be optimal, but move the
395 * optimal in the right direction.
396 */
397 paddr = pblk_lookup_page(pblk, meta_line);
398 ppa = addr_to_gen_ppa(pblk, paddr, 0);
399 ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
400 pos_opt = pblk_ppa_to_pos(geo, ppa_opt);
401
402 if (test_bit(pos_opt, data_c_ctx->lun_bitmap) ||
403 test_bit(pos_opt, data_line->blk_bitmap))
404 return true;
405
406 if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
407 data_line->meta_distance--;
408
409 return false;
410}
411
412static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
413 struct nvm_rq *data_rqd)
443{ 414{
444 struct pblk_line_meta *lm = &pblk->lm; 415 struct pblk_line_meta *lm = &pblk->lm;
445 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 416 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -449,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list,
449retry: 420retry:
450 if (list_empty(&l_mg->emeta_list)) { 421 if (list_empty(&l_mg->emeta_list)) {
451 spin_unlock(&l_mg->close_lock); 422 spin_unlock(&l_mg->close_lock);
452 return 0; 423 return NULL;
453 } 424 }
454 meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); 425 meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list);
455 if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) 426 if (meta_line->emeta->mem >= lm->emeta_len[0])
456 goto retry; 427 goto retry;
457 spin_unlock(&l_mg->close_lock); 428 spin_unlock(&l_mg->close_lock);
458 429
459 if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) 430 if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
460 return 0; 431 return NULL;
461 432
462 return pblk_submit_meta_io(pblk, meta_line); 433 return meta_line;
463} 434}
464 435
465static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) 436static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
466{ 437{
467 struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
468 struct ppa_addr erase_ppa; 438 struct ppa_addr erase_ppa;
439 struct pblk_line *meta_line;
469 int err; 440 int err;
470 441
471 ppa_set_empty(&erase_ppa); 442 ppa_set_empty(&erase_ppa);
472 443
473 /* Assign lbas to ppas and populate request structure */ 444 /* Assign lbas to ppas and populate request structure */
474 err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); 445 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
475 if (err) { 446 if (err) {
476 pr_err("pblk: could not setup write request: %d\n", err); 447 pr_err("pblk: could not setup write request: %d\n", err);
477 return NVM_IO_ERR; 448 return NVM_IO_ERR;
478 } 449 }
479 450
480 if (likely(ppa_empty(erase_ppa))) { 451 meta_line = pblk_should_submit_meta_io(pblk, rqd);
481 /* Submit metadata write for previous data line */
482 err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas);
483 if (err) {
484 pr_err("pblk: metadata I/O submission failed: %d", err);
485 return NVM_IO_ERR;
486 }
487 452
488 /* Submit data write for current data line */ 453 /* Submit data write for current data line */
489 err = pblk_submit_io(pblk, rqd); 454 err = pblk_submit_io(pblk, rqd);
490 if (err) { 455 if (err) {
491 pr_err("pblk: data I/O submission failed: %d\n", err); 456 pr_err("pblk: data I/O submission failed: %d\n", err);
492 return NVM_IO_ERR; 457 return NVM_IO_ERR;
493 } 458 }
494 } else {
495 /* Submit data write for current data line */
496 err = pblk_submit_io(pblk, rqd);
497 if (err) {
498 pr_err("pblk: data I/O submission failed: %d\n", err);
499 return NVM_IO_ERR;
500 }
501 459
502 /* Submit available erase for next data line */ 460 if (!ppa_empty(erase_ppa)) {
461 /* Submit erase for next data line */
503 if (pblk_blk_erase_async(pblk, erase_ppa)) { 462 if (pblk_blk_erase_async(pblk, erase_ppa)) {
504 struct pblk_line *e_line = pblk_line_get_erase(pblk); 463 struct pblk_line *e_line = pblk_line_get_erase(pblk);
505 struct nvm_tgt_dev *dev = pblk->dev; 464 struct nvm_tgt_dev *dev = pblk->dev;
@@ -512,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
512 } 471 }
513 } 472 }
514 473
474 if (meta_line) {
475 /* Submit metadata write for previous data line */
476 err = pblk_submit_meta_io(pblk, meta_line);
477 if (err) {
478 pr_err("pblk: metadata I/O submission failed: %d", err);
479 return NVM_IO_ERR;
480 }
481 }
482
515 return NVM_IO_OK; 483 return NVM_IO_OK;
516} 484}
517 485
@@ -521,7 +489,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
521 struct bio *bio = rqd->bio; 489 struct bio *bio = rqd->bio;
522 490
523 if (c_ctx->nr_padded) 491 if (c_ctx->nr_padded)
524 pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); 492 pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid,
493 c_ctx->nr_padded);
525} 494}
526 495
527static int pblk_submit_write(struct pblk *pblk) 496static int pblk_submit_write(struct pblk *pblk)
@@ -543,31 +512,24 @@ static int pblk_submit_write(struct pblk *pblk)
543 if (!secs_to_flush && secs_avail < pblk->min_write_pgs) 512 if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
544 return 1; 513 return 1;
545 514
546 rqd = pblk_alloc_rqd(pblk, WRITE);
547 if (IS_ERR(rqd)) {
548 pr_err("pblk: cannot allocate write req.\n");
549 return 1;
550 }
551
552 bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
553 if (!bio) {
554 pr_err("pblk: cannot allocate write bio\n");
555 goto fail_free_rqd;
556 }
557 bio->bi_iter.bi_sector = 0; /* internal bio */
558 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
559 rqd->bio = bio;
560
561 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); 515 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
562 if (secs_to_sync > pblk->max_write_pgs) { 516 if (secs_to_sync > pblk->max_write_pgs) {
563 pr_err("pblk: bad buffer sync calculation\n"); 517 pr_err("pblk: bad buffer sync calculation\n");
564 goto fail_put_bio; 518 return 1;
565 } 519 }
566 520
567 secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; 521 secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
568 pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); 522 pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
569 523
570 if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, 524 bio = bio_alloc(GFP_KERNEL, secs_to_sync);
525
526 bio->bi_iter.bi_sector = 0; /* internal bio */
527 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
528
529 rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
530 rqd->bio = bio;
531
532 if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
571 secs_avail)) { 533 secs_avail)) {
572 pr_err("pblk: corrupted write bio\n"); 534 pr_err("pblk: corrupted write bio\n");
573 goto fail_put_bio; 535 goto fail_put_bio;
@@ -586,8 +548,7 @@ fail_free_bio:
586 pblk_free_write_rqd(pblk, rqd); 548 pblk_free_write_rqd(pblk, rqd);
587fail_put_bio: 549fail_put_bio:
588 bio_put(bio); 550 bio_put(bio);
589fail_free_rqd: 551 pblk_free_rqd(pblk, rqd, PBLK_WRITE);
590 pblk_free_rqd(pblk, rqd, WRITE);
591 552
592 return 1; 553 return 1;
593} 554}
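
The metadata-scheduling rework keeps the idea spelled out in the comment above: place the emeta write roughly half the LUN distance away from the data write, and when the "optimal" position collides with the data I/O's LUNs (now checked against the data request's lun_bitmap and the line's bad-block bitmap) shrink the distance instead of deadlocking. The sketch below is a deliberately simplified model of that placement; pick_meta_lun() and the plain modulo arithmetic are illustrative only and do not reproduce addr_to_gen_ppa()/pblk_ppa_to_pos().

#include <stdint.h>
#include <stdio.h>

#define NR_LUNS 8

/* Pick a LUN "meta_distance" away from the data LUN, skipping busy LUNs
 * and shrinking the distance when it aligns with the data I/O itself. */
static int pick_meta_lun(uint32_t busy_bitmap, int data_lun,
			 int *meta_distance)
{
	while (*meta_distance > 0) {
		int lun = (data_lun + *meta_distance) % NR_LUNS;

		if (lun == data_lun) {
			/* Distances aligned: make it non-optimal but usable. */
			(*meta_distance)--;
			continue;
		}
		if (!(busy_bitmap & (1u << lun)))
			return lun;	/* free LUN found */
		(*meta_distance)--;
	}
	return -1;	/* nothing free: defer the metadata I/O */
}

int main(void)
{
	int meta_distance = NR_LUNS / 2;
	uint32_t busy = (1u << 4) | (1u << 3);	/* LUNs 3 and 4 in use */
	int lun = pick_meta_lun(busy, 0, &meta_distance);

	if (lun < 0)
		printf("no free LUN, defer metadata I/O\n");
	else
		printf("metadata goes to LUN %d (distance now %d)\n",
		       lun, meta_distance);
	return 0;
}

Deferring instead of blocking matches the new flow in pblk_submit_io_set(), where pblk_should_submit_meta_io() may simply return NULL and the metadata write is retried on a later data I/O.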
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 67e623bd5c2d..90961033a79f 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -40,10 +40,6 @@
40#define PBLK_MAX_REQ_ADDRS (64) 40#define PBLK_MAX_REQ_ADDRS (64)
41#define PBLK_MAX_REQ_ADDRS_PW (6) 41#define PBLK_MAX_REQ_ADDRS_PW (6)
42 42
43#define PBLK_WS_POOL_SIZE (128)
44#define PBLK_META_POOL_SIZE (128)
45#define PBLK_READ_REQ_POOL_SIZE (1024)
46
47#define PBLK_NR_CLOSE_JOBS (4) 43#define PBLK_NR_CLOSE_JOBS (4)
48 44
49#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) 45#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
@@ -59,7 +55,15 @@
59 for ((i) = 0, rlun = &(pblk)->luns[0]; \ 55 for ((i) = 0, rlun = &(pblk)->luns[0]; \
60 (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) 56 (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
61 57
62#define ERASE 2 /* READ = 0, WRITE = 1 */ 58/* Static pool sizes */
59#define PBLK_GEN_WS_POOL_SIZE (2)
60
61enum {
62 PBLK_READ = READ,
63 PBLK_WRITE = WRITE,/* Write from write buffer */
64 PBLK_WRITE_INT, /* Internal write - no write buffer */
65 PBLK_ERASE,
66};
63 67
64enum { 68enum {
65 /* IO Types */ 69 /* IO Types */
@@ -95,6 +99,7 @@ enum {
95}; 99};
96 100
97#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) 101#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
102#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS)
98 103
99/* write buffer completion context */ 104/* write buffer completion context */
100struct pblk_c_ctx { 105struct pblk_c_ctx {
@@ -106,9 +111,10 @@ struct pblk_c_ctx {
106 unsigned int nr_padded; 111 unsigned int nr_padded;
107}; 112};
108 113
109/* generic context */ 114/* read context */
110struct pblk_g_ctx { 115struct pblk_g_ctx {
111 void *private; 116 void *private;
117 u64 lba;
112}; 118};
113 119
114/* Pad context */ 120/* Pad context */
@@ -207,6 +213,7 @@ struct pblk_lun {
207struct pblk_gc_rq { 213struct pblk_gc_rq {
208 struct pblk_line *line; 214 struct pblk_line *line;
209 void *data; 215 void *data;
216 u64 paddr_list[PBLK_MAX_REQ_ADDRS];
210 u64 lba_list[PBLK_MAX_REQ_ADDRS]; 217 u64 lba_list[PBLK_MAX_REQ_ADDRS];
211 int nr_secs; 218 int nr_secs;
212 int secs_to_gc; 219 int secs_to_gc;
@@ -231,7 +238,10 @@ struct pblk_gc {
231 struct timer_list gc_timer; 238 struct timer_list gc_timer;
232 239
233 struct semaphore gc_sem; 240 struct semaphore gc_sem;
234 atomic_t inflight_gc; 241 atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */
242 atomic_t pipeline_gc; /* Number of lines in the GC pipeline -
243 * started reads to finished writes
244 */
235 int w_entries; 245 int w_entries;
236 246
237 struct list_head w_list; 247 struct list_head w_list;
@@ -267,6 +277,7 @@ struct pblk_rl {
267 int rb_gc_max; /* Max buffer entries available for GC I/O */ 277 int rb_gc_max; /* Max buffer entries available for GC I/O */
268 int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ 278 int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
269 int rb_state; /* Rate-limiter current state */ 279 int rb_state; /* Rate-limiter current state */
280 int rb_max_io; /* Maximum size for an I/O giving the config */
270 281
271 atomic_t rb_user_cnt; /* User I/O buffer counter */ 282 atomic_t rb_user_cnt; /* User I/O buffer counter */
272 atomic_t rb_gc_cnt; /* GC I/O buffer counter */ 283 atomic_t rb_gc_cnt; /* GC I/O buffer counter */
@@ -310,6 +321,7 @@ enum {
310}; 321};
311 322
312#define PBLK_MAGIC 0x70626c6b /*pblk*/ 323#define PBLK_MAGIC 0x70626c6b /*pblk*/
324#define SMETA_VERSION cpu_to_le16(1)
313 325
314struct line_header { 326struct line_header {
315 __le32 crc; 327 __le32 crc;
@@ -618,15 +630,16 @@ struct pblk {
618 630
619 struct list_head compl_list; 631 struct list_head compl_list;
620 632
621 mempool_t *page_pool; 633 mempool_t *page_bio_pool;
622 mempool_t *line_ws_pool; 634 mempool_t *gen_ws_pool;
623 mempool_t *rec_pool; 635 mempool_t *rec_pool;
624 mempool_t *g_rq_pool; 636 mempool_t *r_rq_pool;
625 mempool_t *w_rq_pool; 637 mempool_t *w_rq_pool;
626 mempool_t *line_meta_pool; 638 mempool_t *e_rq_pool;
627 639
628 struct workqueue_struct *close_wq; 640 struct workqueue_struct *close_wq;
629 struct workqueue_struct *bb_wq; 641 struct workqueue_struct *bb_wq;
642 struct workqueue_struct *r_end_wq;
630 643
631 struct timer_list wtimer; 644 struct timer_list wtimer;
632 645
@@ -657,15 +670,15 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
657void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, 670void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
658 struct pblk_w_ctx w_ctx, unsigned int pos); 671 struct pblk_w_ctx w_ctx, unsigned int pos);
659void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, 672void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
660 struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, 673 struct pblk_w_ctx w_ctx, struct pblk_line *line,
661 unsigned int pos); 674 u64 paddr, unsigned int pos);
662struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); 675struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
663void pblk_rb_flush(struct pblk_rb *rb); 676void pblk_rb_flush(struct pblk_rb *rb);
664 677
665void pblk_rb_sync_l2p(struct pblk_rb *rb); 678void pblk_rb_sync_l2p(struct pblk_rb *rb);
666unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, 679unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
667 struct bio *bio, unsigned int pos, 680 unsigned int pos, unsigned int nr_entries,
668 unsigned int nr_entries, unsigned int count); 681 unsigned int count);
669unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, 682unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
670 struct list_head *list, 683 struct list_head *list,
671 unsigned int max); 684 unsigned int max);
@@ -692,24 +705,23 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
692/* 705/*
693 * pblk core 706 * pblk core
694 */ 707 */
695struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); 708struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type);
709void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type);
696void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); 710void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write);
697int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, 711int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
698 struct pblk_c_ctx *c_ctx); 712 struct pblk_c_ctx *c_ctx);
699void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
700void pblk_wait_for_meta(struct pblk *pblk);
701struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
702void pblk_discard(struct pblk *pblk, struct bio *bio); 713void pblk_discard(struct pblk *pblk, struct bio *bio);
703void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); 714void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
704void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); 715void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
705int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); 716int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
717int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
706int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); 718int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
707struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, 719struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
708 unsigned int nr_secs, unsigned int len, 720 unsigned int nr_secs, unsigned int len,
709 int alloc_type, gfp_t gfp_mask); 721 int alloc_type, gfp_t gfp_mask);
710struct pblk_line *pblk_line_get(struct pblk *pblk); 722struct pblk_line *pblk_line_get(struct pblk *pblk);
711struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); 723struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
712void pblk_line_replace_data(struct pblk *pblk); 724struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
713int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); 725int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
714void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); 726void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
715struct pblk_line *pblk_line_get_data(struct pblk *pblk); 727struct pblk_line *pblk_line_get_data(struct pblk *pblk);
@@ -719,19 +731,18 @@ int pblk_line_is_full(struct pblk_line *line);
719void pblk_line_free(struct pblk *pblk, struct pblk_line *line); 731void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
720void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); 732void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
721void pblk_line_close(struct pblk *pblk, struct pblk_line *line); 733void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
722void pblk_line_close_meta_sync(struct pblk *pblk);
723void pblk_line_close_ws(struct work_struct *work); 734void pblk_line_close_ws(struct work_struct *work);
724void pblk_pipeline_stop(struct pblk *pblk); 735void pblk_pipeline_stop(struct pblk *pblk);
725void pblk_line_mark_bb(struct work_struct *work); 736void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
726void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, 737 void (*work)(struct work_struct *), gfp_t gfp_mask,
727 void (*work)(struct work_struct *), 738 struct workqueue_struct *wq);
728 struct workqueue_struct *wq);
729u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); 739u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
730int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); 740int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
731int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, 741int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
732 void *emeta_buf); 742 void *emeta_buf);
733int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); 743int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
734void pblk_line_put(struct kref *ref); 744void pblk_line_put(struct kref *ref);
745void pblk_line_put_wq(struct kref *ref);
735struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); 746struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
736u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); 747u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line);
737void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); 748void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
@@ -745,7 +756,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
745void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); 756void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
746void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, 757void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
747 unsigned long *lun_bitmap); 758 unsigned long *lun_bitmap);
748void pblk_end_bio_sync(struct bio *bio);
749void pblk_end_io_sync(struct nvm_rq *rqd); 759void pblk_end_io_sync(struct nvm_rq *rqd);
750int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, 760int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
751 int nr_pages); 761 int nr_pages);
@@ -760,7 +770,7 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
760void pblk_update_map_dev(struct pblk *pblk, sector_t lba, 770void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
761 struct ppa_addr ppa, struct ppa_addr entry_line); 771 struct ppa_addr ppa, struct ppa_addr entry_line);
762int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, 772int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
763 struct pblk_line *gc_line); 773 struct pblk_line *gc_line, u64 paddr);
764void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, 774void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
765 u64 *lba_list, int nr_secs); 775 u64 *lba_list, int nr_secs);
766void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, 776void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
@@ -771,9 +781,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
771 */ 781 */
772int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, 782int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
773 unsigned long flags); 783 unsigned long flags);
774int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, 784int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
775 unsigned int nr_entries, unsigned int nr_rec_entries,
776 struct pblk_line *gc_line, unsigned long flags);
777 785
778/* 786/*
779 * pblk map 787 * pblk map
@@ -797,9 +805,7 @@ void pblk_write_should_kick(struct pblk *pblk);
797 */ 805 */
798extern struct bio_set *pblk_bio_set; 806extern struct bio_set *pblk_bio_set;
799int pblk_submit_read(struct pblk *pblk, struct bio *bio); 807int pblk_submit_read(struct pblk *pblk, struct bio *bio);
800int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, 808int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
801 unsigned int nr_secs, unsigned int *secs_to_gc,
802 struct pblk_line *line);
803/* 809/*
804 * pblk recovery 810 * pblk recovery
805 */ 811 */
@@ -815,7 +821,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
815 * pblk gc 821 * pblk gc
816 */ 822 */
817#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ 823#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
818#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ 824#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */
819#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ 825#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
820#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ 826#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
821 827
@@ -824,7 +830,7 @@ void pblk_gc_exit(struct pblk *pblk);
824void pblk_gc_should_start(struct pblk *pblk); 830void pblk_gc_should_start(struct pblk *pblk);
825void pblk_gc_should_stop(struct pblk *pblk); 831void pblk_gc_should_stop(struct pblk *pblk);
826void pblk_gc_should_kick(struct pblk *pblk); 832void pblk_gc_should_kick(struct pblk *pblk);
827void pblk_gc_kick(struct pblk *pblk); 833void pblk_gc_free_full_lines(struct pblk *pblk);
828void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, 834void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
829 int *gc_active); 835 int *gc_active);
830int pblk_gc_sysfs_force(struct pblk *pblk, int force); 836int pblk_gc_sysfs_force(struct pblk *pblk, int force);
@@ -834,8 +840,8 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force);
834 */ 840 */
835void pblk_rl_init(struct pblk_rl *rl, int budget); 841void pblk_rl_init(struct pblk_rl *rl, int budget);
836void pblk_rl_free(struct pblk_rl *rl); 842void pblk_rl_free(struct pblk_rl *rl);
843void pblk_rl_update_rates(struct pblk_rl *rl);
837int pblk_rl_high_thrs(struct pblk_rl *rl); 844int pblk_rl_high_thrs(struct pblk_rl *rl);
838int pblk_rl_low_thrs(struct pblk_rl *rl);
839unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); 845unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
840int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); 846int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
841void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); 847void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
@@ -843,10 +849,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
843int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); 849int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
844void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); 850void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
845void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); 851void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
846int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); 852int pblk_rl_max_io(struct pblk_rl *rl);
847void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); 853void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
848void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); 854void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
849void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left);
850int pblk_rl_is_limit(struct pblk_rl *rl); 855int pblk_rl_is_limit(struct pblk_rl *rl);
851 856
852/* 857/*
@@ -892,13 +897,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta)
892 897
893static inline int pblk_line_vsc(struct pblk_line *line) 898static inline int pblk_line_vsc(struct pblk_line *line)
894{ 899{
895 int vsc; 900 return le32_to_cpu(*line->vsc);
896
897 spin_lock(&line->lock);
898 vsc = le32_to_cpu(*line->vsc);
899 spin_unlock(&line->lock);
900
901 return vsc;
902} 901}
903 902
904#define NVM_MEM_PAGE_WRITE (8) 903#define NVM_MEM_PAGE_WRITE (8)
@@ -1140,7 +1139,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
1140 1139
1141 flags = geo->plane_mode >> 1; 1140 flags = geo->plane_mode >> 1;
1142 1141
1143 if (type == WRITE) 1142 if (type == PBLK_WRITE)
1144 flags |= NVM_IO_SCRAMBLE_ENABLE; 1143 flags |= NVM_IO_SCRAMBLE_ENABLE;
1145 1144
1146 return flags; 1145 return flags;
@@ -1200,7 +1199,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
1200 1199
1201 pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); 1200 pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
1202} 1201}
1203#endif
1204 1202
1205static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, 1203static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
1206 struct ppa_addr *ppas, int nr_ppas) 1204 struct ppa_addr *ppas, int nr_ppas)
@@ -1221,14 +1219,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
1221 ppa->g.sec < geo->sec_per_pg) 1219 ppa->g.sec < geo->sec_per_pg)
1222 continue; 1220 continue;
1223 1221
1224#ifdef CONFIG_NVM_DEBUG
1225 print_ppa(ppa, "boundary", i); 1222 print_ppa(ppa, "boundary", i);
1226#endif 1223
1227 return 1; 1224 return 1;
1228 } 1225 }
1229 return 0; 1226 return 0;
1230} 1227}
1231 1228
1229static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
1230{
1231 struct nvm_tgt_dev *dev = pblk->dev;
1232 struct ppa_addr *ppa_list;
1233
1234 ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
1235
1236 if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
1237 WARN_ON(1);
1238 return -EINVAL;
1239 }
1240
1241 if (rqd->opcode == NVM_OP_PWRITE) {
1242 struct pblk_line *line;
1243 struct ppa_addr ppa;
1244 int i;
1245
1246 for (i = 0; i < rqd->nr_ppas; i++) {
1247 ppa = ppa_list[i];
1248 line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
1249
1250 spin_lock(&line->lock);
1251 if (line->state != PBLK_LINESTATE_OPEN) {
1252 pr_err("pblk: bad ppa: line:%d,state:%d\n",
1253 line->id, line->state);
1254 WARN_ON(1);
1255 spin_unlock(&line->lock);
1256 return -EINVAL;
1257 }
1258 spin_unlock(&line->lock);
1259 }
1260 }
1261
1262 return 0;
1263}
1264#endif
1265
1232static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) 1266static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
1233{ 1267{
1234 struct pblk_line_meta *lm = &pblk->lm; 1268 struct pblk_line_meta *lm = &pblk->lm;
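
For reference, a minimal caller sketch of the typed request alloc/free pair declared above: pblk_alloc_rqd()/pblk_free_rqd() now take a PBLK_READ/PBLK_WRITE/PBLK_WRITE_INT/PBLK_ERASE type instead of a raw rw flag, and pblk_submit_io_sync() is the new synchronous submission helper. This is an illustration only, not part of the patch; request setup is omitted and the pool mapping (e.g. r_rq_pool backing PBLK_READ) is assumed from the renames above.

/* Hypothetical illustration only, not part of the patch. */
static int pblk_example_sync_read(struct pblk *pblk)
{
        struct nvm_rq *rqd;
        int ret;

        rqd = pblk_alloc_rqd(pblk, PBLK_READ);  /* presumably served from r_rq_pool */

        /* ppa list, bio and metadata setup omitted in this sketch */

        ret = pblk_submit_io_sync(pblk, rqd);   /* synchronous submission helper */

        pblk_free_rqd(pblk, rqd, PBLK_READ);    /* type must match the allocation */
        return ret;
}
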
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 08035634795c..a27d85232ce1 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -407,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
407 407
408 finish_wait(&ca->set->bucket_wait, &w); 408 finish_wait(&ca->set->bucket_wait, &w);
409out: 409out:
410 wake_up_process(ca->alloc_thread); 410 if (ca->alloc_thread)
411 wake_up_process(ca->alloc_thread);
411 412
412 trace_bcache_alloc(ca, reserve); 413 trace_bcache_alloc(ca, reserve);
413 414
@@ -442,6 +443,11 @@ out:
442 b->prio = INITIAL_PRIO; 443 b->prio = INITIAL_PRIO;
443 } 444 }
444 445
446 if (ca->set->avail_nbuckets > 0) {
447 ca->set->avail_nbuckets--;
448 bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
449 }
450
445 return r; 451 return r;
446} 452}
447 453
@@ -449,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
449{ 455{
450 SET_GC_MARK(b, 0); 456 SET_GC_MARK(b, 0);
451 SET_GC_SECTORS_USED(b, 0); 457 SET_GC_SECTORS_USED(b, 0);
458
459 if (ca->set->avail_nbuckets < ca->set->nbuckets) {
460 ca->set->avail_nbuckets++;
461 bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
462 }
452} 463}
453 464
454void bch_bucket_free(struct cache_set *c, struct bkey *k) 465void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -601,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
601 612
602 /* 613 /*
603 * If we had to allocate, we might race and not need to allocate the 614 * If we had to allocate, we might race and not need to allocate the
604 * second time we call find_data_bucket(). If we allocated a bucket but 615 * second time we call pick_data_bucket(). If we allocated a bucket but
605 * didn't use it, drop the refcount bch_bucket_alloc_set() took: 616 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
606 */ 617 */
607 if (KEY_PTRS(&alloc.key)) 618 if (KEY_PTRS(&alloc.key))
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index abd31e847f96..843877e017e1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -185,6 +185,7 @@
185#include <linux/mutex.h> 185#include <linux/mutex.h>
186#include <linux/rbtree.h> 186#include <linux/rbtree.h>
187#include <linux/rwsem.h> 187#include <linux/rwsem.h>
188#include <linux/refcount.h>
188#include <linux/types.h> 189#include <linux/types.h>
189#include <linux/workqueue.h> 190#include <linux/workqueue.h>
190 191
@@ -266,9 +267,6 @@ struct bcache_device {
266 atomic_t *stripe_sectors_dirty; 267 atomic_t *stripe_sectors_dirty;
267 unsigned long *full_dirty_stripes; 268 unsigned long *full_dirty_stripes;
268 269
269 unsigned long sectors_dirty_last;
270 long sectors_dirty_derivative;
271
272 struct bio_set *bio_split; 270 struct bio_set *bio_split;
273 271
274 unsigned data_csum:1; 272 unsigned data_csum:1;
@@ -300,7 +298,7 @@ struct cached_dev {
300 struct semaphore sb_write_mutex; 298 struct semaphore sb_write_mutex;
301 299
302 /* Refcount on the cache set. Always nonzero when we're caching. */ 300 /* Refcount on the cache set. Always nonzero when we're caching. */
303 atomic_t count; 301 refcount_t count;
304 struct work_struct detach; 302 struct work_struct detach;
305 303
306 /* 304 /*
@@ -363,12 +361,14 @@ struct cached_dev {
363 361
364 uint64_t writeback_rate_target; 362 uint64_t writeback_rate_target;
365 int64_t writeback_rate_proportional; 363 int64_t writeback_rate_proportional;
366 int64_t writeback_rate_derivative; 364 int64_t writeback_rate_integral;
367 int64_t writeback_rate_change; 365 int64_t writeback_rate_integral_scaled;
366 int32_t writeback_rate_change;
368 367
369 unsigned writeback_rate_update_seconds; 368 unsigned writeback_rate_update_seconds;
370 unsigned writeback_rate_d_term; 369 unsigned writeback_rate_i_term_inverse;
371 unsigned writeback_rate_p_term_inverse; 370 unsigned writeback_rate_p_term_inverse;
371 unsigned writeback_rate_minimum;
372}; 372};
373 373
374enum alloc_reserve { 374enum alloc_reserve {
@@ -582,6 +582,7 @@ struct cache_set {
582 uint8_t need_gc; 582 uint8_t need_gc;
583 struct gc_stat gc_stats; 583 struct gc_stat gc_stats;
584 size_t nbuckets; 584 size_t nbuckets;
585 size_t avail_nbuckets;
585 586
586 struct task_struct *gc_thread; 587 struct task_struct *gc_thread;
587 /* Where in the btree gc currently is */ 588 /* Where in the btree gc currently is */
@@ -807,13 +808,13 @@ do { \
807 808
808static inline void cached_dev_put(struct cached_dev *dc) 809static inline void cached_dev_put(struct cached_dev *dc)
809{ 810{
810 if (atomic_dec_and_test(&dc->count)) 811 if (refcount_dec_and_test(&dc->count))
811 schedule_work(&dc->detach); 812 schedule_work(&dc->detach);
812} 813}
813 814
814static inline bool cached_dev_get(struct cached_dev *dc) 815static inline bool cached_dev_get(struct cached_dev *dc)
815{ 816{
816 if (!atomic_inc_not_zero(&dc->count)) 817 if (!refcount_inc_not_zero(&dc->count))
817 return false; 818 return false;
818 819
819 /* Paired with the mb in cached_dev_attach */ 820 /* Paired with the mb in cached_dev_attach */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 658c54b3b07a..11c5503d31dc 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1241,6 +1241,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1241 __bch_btree_mark_key(c, level, k); 1241 __bch_btree_mark_key(c, level, k);
1242} 1242}
1243 1243
1244void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
1245{
1246 stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
1247}
1248
1244static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 1249static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1245{ 1250{
1246 uint8_t stale = 0; 1251 uint8_t stale = 0;
@@ -1652,9 +1657,8 @@ static void btree_gc_start(struct cache_set *c)
1652 mutex_unlock(&c->bucket_lock); 1657 mutex_unlock(&c->bucket_lock);
1653} 1658}
1654 1659
1655static size_t bch_btree_gc_finish(struct cache_set *c) 1660static void bch_btree_gc_finish(struct cache_set *c)
1656{ 1661{
1657 size_t available = 0;
1658 struct bucket *b; 1662 struct bucket *b;
1659 struct cache *ca; 1663 struct cache *ca;
1660 unsigned i; 1664 unsigned i;
@@ -1691,6 +1695,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
1691 } 1695 }
1692 rcu_read_unlock(); 1696 rcu_read_unlock();
1693 1697
1698 c->avail_nbuckets = 0;
1694 for_each_cache(ca, c, i) { 1699 for_each_cache(ca, c, i) {
1695 uint64_t *i; 1700 uint64_t *i;
1696 1701
@@ -1712,18 +1717,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
1712 BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); 1717 BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1713 1718
1714 if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) 1719 if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1715 available++; 1720 c->avail_nbuckets++;
1716 } 1721 }
1717 } 1722 }
1718 1723
1719 mutex_unlock(&c->bucket_lock); 1724 mutex_unlock(&c->bucket_lock);
1720 return available;
1721} 1725}
1722 1726
1723static void bch_btree_gc(struct cache_set *c) 1727static void bch_btree_gc(struct cache_set *c)
1724{ 1728{
1725 int ret; 1729 int ret;
1726 unsigned long available;
1727 struct gc_stat stats; 1730 struct gc_stat stats;
1728 struct closure writes; 1731 struct closure writes;
1729 struct btree_op op; 1732 struct btree_op op;
@@ -1746,14 +1749,14 @@ static void bch_btree_gc(struct cache_set *c)
1746 pr_warn("gc failed!"); 1749 pr_warn("gc failed!");
1747 } while (ret); 1750 } while (ret);
1748 1751
1749 available = bch_btree_gc_finish(c); 1752 bch_btree_gc_finish(c);
1750 wake_up_allocators(c); 1753 wake_up_allocators(c);
1751 1754
1752 bch_time_stats_update(&c->btree_gc_time, start_time); 1755 bch_time_stats_update(&c->btree_gc_time, start_time);
1753 1756
1754 stats.key_bytes *= sizeof(uint64_t); 1757 stats.key_bytes *= sizeof(uint64_t);
1755 stats.data <<= 9; 1758 stats.data <<= 9;
1756 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1759 bch_update_bucket_in_use(c, &stats);
1757 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1760 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1758 1761
1759 trace_bcache_gc_end(c); 1762 trace_bcache_gc_end(c);
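
The bucket accounting above replaces the per-GC "available" count with a cache_set-wide avail_nbuckets, so gc_stats.in_use can be refreshed from the allocator as well as from GC. A minimal userspace sketch of the percentage computation mirrored from bch_update_bucket_in_use(), with made-up numbers:

#include <stdio.h>

int main(void)
{
        size_t nbuckets = 1000, avail_nbuckets = 250;   /* hypothetical cache set */
        unsigned in_use = (nbuckets - avail_nbuckets) * 100 / nbuckets;

        printf("in_use = %u%%\n", in_use);              /* prints 75 */
        return 0;
}
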
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 42204d61bc95..d211e2c25b6b 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -306,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
306struct keybuf_key *bch_keybuf_next(struct keybuf *); 306struct keybuf_key *bch_keybuf_next(struct keybuf *);
307struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, 307struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
308 struct bkey *, keybuf_pred_fn *); 308 struct bkey *, keybuf_pred_fn *);
309 309void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
310#endif 310#endif
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 965907ce1e20..ccfbea6f9f6b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -252,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
252static inline void closure_queue(struct closure *cl) 252static inline void closure_queue(struct closure *cl)
253{ 253{
254 struct workqueue_struct *wq = cl->wq; 254 struct workqueue_struct *wq = cl->wq;
255 /**
256 * Changes made to closure, work_struct, or a couple of other structs
 257 * may cause work.func to point to the wrong location.
258 */
259 BUILD_BUG_ON(offsetof(struct closure, fn)
260 != offsetof(struct work_struct, func));
255 if (wq) { 261 if (wq) {
256 INIT_WORK(&cl->work, cl->work.func); 262 INIT_WORK(&cl->work, cl->work.func);
257 BUG_ON(!queue_work(wq, &cl->work)); 263 BUG_ON(!queue_work(wq, &cl->work));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3475d6628e21..3a7aed7282b2 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -27,12 +27,12 @@ struct kmem_cache *bch_search_cache;
27 27
28static void bch_data_insert_start(struct closure *); 28static void bch_data_insert_start(struct closure *);
29 29
30static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) 30static unsigned cache_mode(struct cached_dev *dc)
31{ 31{
32 return BDEV_CACHE_MODE(&dc->sb); 32 return BDEV_CACHE_MODE(&dc->sb);
33} 33}
34 34
35static bool verify(struct cached_dev *dc, struct bio *bio) 35static bool verify(struct cached_dev *dc)
36{ 36{
37 return dc->verify; 37 return dc->verify;
38} 38}
@@ -370,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
370static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) 370static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
371{ 371{
372 struct cache_set *c = dc->disk.c; 372 struct cache_set *c = dc->disk.c;
373 unsigned mode = cache_mode(dc, bio); 373 unsigned mode = cache_mode(dc);
374 unsigned sectors, congested = bch_get_congested(c); 374 unsigned sectors, congested = bch_get_congested(c);
375 struct task_struct *task = current; 375 struct task_struct *task = current;
376 struct io *i; 376 struct io *i;
@@ -385,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
385 op_is_write(bio_op(bio)))) 385 op_is_write(bio_op(bio))))
386 goto skip; 386 goto skip;
387 387
388 /*
389 * Flag for bypass if the IO is for read-ahead or background,
390 * unless the read-ahead request is for metadata (eg, for gfs2).
391 */
392 if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
393 !(bio->bi_opf & REQ_META))
394 goto skip;
395
388 if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || 396 if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
389 bio_sectors(bio) & (c->sb.block_size - 1)) { 397 bio_sectors(bio) & (c->sb.block_size - 1)) {
390 pr_debug("skipping unaligned io"); 398 pr_debug("skipping unaligned io");
@@ -463,6 +471,7 @@ struct search {
463 unsigned recoverable:1; 471 unsigned recoverable:1;
464 unsigned write:1; 472 unsigned write:1;
465 unsigned read_dirty_data:1; 473 unsigned read_dirty_data:1;
474 unsigned cache_missed:1;
466 475
467 unsigned long start_time; 476 unsigned long start_time;
468 477
@@ -649,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio,
649 658
650 s->orig_bio = bio; 659 s->orig_bio = bio;
651 s->cache_miss = NULL; 660 s->cache_miss = NULL;
661 s->cache_missed = 0;
652 s->d = d; 662 s->d = d;
653 s->recoverable = 1; 663 s->recoverable = 1;
654 s->write = op_is_write(bio_op(bio)); 664 s->write = op_is_write(bio_op(bio));
@@ -698,8 +708,16 @@ static void cached_dev_read_error(struct closure *cl)
698{ 708{
699 struct search *s = container_of(cl, struct search, cl); 709 struct search *s = container_of(cl, struct search, cl);
700 struct bio *bio = &s->bio.bio; 710 struct bio *bio = &s->bio.bio;
711 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
701 712
702 if (s->recoverable) { 713 /*
714 * If cache device is dirty (dc->has_dirty is non-zero), then
715 * recovery a failed read request from cached device may get a
716 * stale data back. So read failure recovery is only permitted
717 * when cache device is clean.
718 */
719 if (s->recoverable &&
720 (dc && !atomic_read(&dc->has_dirty))) {
703 /* Retry from the backing device: */ 721 /* Retry from the backing device: */
704 trace_bcache_read_retry(s->orig_bio); 722 trace_bcache_read_retry(s->orig_bio);
705 723
@@ -740,7 +758,7 @@ static void cached_dev_read_done(struct closure *cl)
740 s->cache_miss = NULL; 758 s->cache_miss = NULL;
741 } 759 }
742 760
743 if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) 761 if (verify(dc) && s->recoverable && !s->read_dirty_data)
744 bch_data_verify(dc, s->orig_bio); 762 bch_data_verify(dc, s->orig_bio);
745 763
746 bio_complete(s); 764 bio_complete(s);
@@ -760,12 +778,12 @@ static void cached_dev_read_done_bh(struct closure *cl)
760 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 778 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
761 779
762 bch_mark_cache_accounting(s->iop.c, s->d, 780 bch_mark_cache_accounting(s->iop.c, s->d,
763 !s->cache_miss, s->iop.bypass); 781 !s->cache_missed, s->iop.bypass);
764 trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); 782 trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
765 783
766 if (s->iop.status) 784 if (s->iop.status)
767 continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); 785 continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
768 else if (s->iop.bio || verify(dc, &s->bio.bio)) 786 else if (s->iop.bio || verify(dc))
769 continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); 787 continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
770 else 788 else
771 continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); 789 continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -779,6 +797,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
779 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 797 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
780 struct bio *miss, *cache_bio; 798 struct bio *miss, *cache_bio;
781 799
800 s->cache_missed = 1;
801
782 if (s->cache_miss || s->iop.bypass) { 802 if (s->cache_miss || s->iop.bypass) {
783 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); 803 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
784 ret = miss == bio ? MAP_DONE : MAP_CONTINUE; 804 ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@ -892,7 +912,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
892 s->iop.bypass = true; 912 s->iop.bypass = true;
893 913
894 if (should_writeback(dc, s->orig_bio, 914 if (should_writeback(dc, s->orig_bio,
895 cache_mode(dc, bio), 915 cache_mode(dc),
896 s->iop.bypass)) { 916 s->iop.bypass)) {
897 s->iop.bypass = false; 917 s->iop.bypass = false;
898 s->iop.writeback = true; 918 s->iop.writeback = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fc0a31b13ac4..b4d28928dec5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets);
53static LIST_HEAD(uncached_devices); 53static LIST_HEAD(uncached_devices);
54 54
55static int bcache_major; 55static int bcache_major;
56static DEFINE_IDA(bcache_minor); 56static DEFINE_IDA(bcache_device_idx);
57static wait_queue_head_t unregister_wait; 57static wait_queue_head_t unregister_wait;
58struct workqueue_struct *bcache_wq; 58struct workqueue_struct *bcache_wq;
59 59
60#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) 60#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
 61#define BCACHE_MINORS 16 /* partition support */ 61/* limit on the number of partitions per bcache device */
62#define BCACHE_MINORS 128
 63/* limit on the number of bcache devices in a single system */
64#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
62 65
63/* Superblock */ 66/* Superblock */
64 67
@@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
721 closure_get(&c->caching); 724 closure_get(&c->caching);
722} 725}
723 726
727static inline int first_minor_to_idx(int first_minor)
728{
729 return (first_minor/BCACHE_MINORS);
730}
731
732static inline int idx_to_first_minor(int idx)
733{
734 return (idx * BCACHE_MINORS);
735}
736
724static void bcache_device_free(struct bcache_device *d) 737static void bcache_device_free(struct bcache_device *d)
725{ 738{
726 lockdep_assert_held(&bch_register_lock); 739 lockdep_assert_held(&bch_register_lock);
@@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d)
734 if (d->disk && d->disk->queue) 747 if (d->disk && d->disk->queue)
735 blk_cleanup_queue(d->disk->queue); 748 blk_cleanup_queue(d->disk->queue);
736 if (d->disk) { 749 if (d->disk) {
737 ida_simple_remove(&bcache_minor, d->disk->first_minor); 750 ida_simple_remove(&bcache_device_idx,
751 first_minor_to_idx(d->disk->first_minor));
738 put_disk(d->disk); 752 put_disk(d->disk);
739 } 753 }
740 754
@@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
751{ 765{
752 struct request_queue *q; 766 struct request_queue *q;
753 size_t n; 767 size_t n;
754 int minor; 768 int idx;
755 769
756 if (!d->stripe_size) 770 if (!d->stripe_size)
757 d->stripe_size = 1 << 31; 771 d->stripe_size = 1 << 31;
@@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
776 if (!d->full_dirty_stripes) 790 if (!d->full_dirty_stripes)
777 return -ENOMEM; 791 return -ENOMEM;
778 792
779 minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); 793 idx = ida_simple_get(&bcache_device_idx, 0,
780 if (minor < 0) 794 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
781 return minor; 795 if (idx < 0)
782 796 return idx;
783 minor *= BCACHE_MINORS;
784 797
785 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), 798 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
786 BIOSET_NEED_BVECS | 799 BIOSET_NEED_BVECS |
787 BIOSET_NEED_RESCUER)) || 800 BIOSET_NEED_RESCUER)) ||
788 !(d->disk = alloc_disk(BCACHE_MINORS))) { 801 !(d->disk = alloc_disk(BCACHE_MINORS))) {
789 ida_simple_remove(&bcache_minor, minor); 802 ida_simple_remove(&bcache_device_idx, idx);
790 return -ENOMEM; 803 return -ENOMEM;
791 } 804 }
792 805
793 set_capacity(d->disk, sectors); 806 set_capacity(d->disk, sectors);
794 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); 807 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
795 808
796 d->disk->major = bcache_major; 809 d->disk->major = bcache_major;
797 d->disk->first_minor = minor; 810 d->disk->first_minor = idx_to_first_minor(idx);
798 d->disk->fops = &bcache_ops; 811 d->disk->fops = &bcache_ops;
799 d->disk->private_data = d; 812 d->disk->private_data = d;
800 813
@@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
889 closure_init_stack(&cl); 902 closure_init_stack(&cl);
890 903
891 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); 904 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
892 BUG_ON(atomic_read(&dc->count)); 905 BUG_ON(refcount_read(&dc->count));
893 906
894 mutex_lock(&bch_register_lock); 907 mutex_lock(&bch_register_lock);
895 908
@@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
1016 * dc->c must be set before dc->count != 0 - paired with the mb in 1029 * dc->c must be set before dc->count != 0 - paired with the mb in
1017 * cached_dev_get() 1030 * cached_dev_get()
1018 */ 1031 */
1019 atomic_set(&dc->count, 1); 1032 refcount_set(&dc->count, 1);
1020 1033
1021 /* Block writeback thread, but spawn it */ 1034 /* Block writeback thread, but spawn it */
1022 down_write(&dc->writeback_lock); 1035 down_write(&dc->writeback_lock);
@@ -1028,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
1028 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { 1041 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1029 bch_sectors_dirty_init(&dc->disk); 1042 bch_sectors_dirty_init(&dc->disk);
1030 atomic_set(&dc->has_dirty, 1); 1043 atomic_set(&dc->has_dirty, 1);
1031 atomic_inc(&dc->count); 1044 refcount_inc(&dc->count);
1032 bch_writeback_queue(dc); 1045 bch_writeback_queue(dc);
1033 } 1046 }
1034 1047
@@ -1129,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1129 if (ret) 1142 if (ret)
1130 return ret; 1143 return ret;
1131 1144
1132 set_capacity(dc->disk.disk,
1133 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1134
1135 dc->disk.disk->queue->backing_dev_info->ra_pages = 1145 dc->disk.disk->queue->backing_dev_info->ra_pages =
1136 max(dc->disk.disk->queue->backing_dev_info->ra_pages, 1146 max(dc->disk.disk->queue->backing_dev_info->ra_pages,
1137 q->backing_dev_info->ra_pages); 1147 q->backing_dev_info->ra_pages);
@@ -2085,6 +2095,7 @@ static void bcache_exit(void)
2085 if (bcache_major) 2095 if (bcache_major)
2086 unregister_blkdev(bcache_major, "bcache"); 2096 unregister_blkdev(bcache_major, "bcache");
2087 unregister_reboot_notifier(&reboot); 2097 unregister_reboot_notifier(&reboot);
2098 mutex_destroy(&bch_register_lock);
2088} 2099}
2089 2100
2090static int __init bcache_init(void) 2101static int __init bcache_init(void)
@@ -2103,14 +2114,15 @@ static int __init bcache_init(void)
2103 bcache_major = register_blkdev(0, "bcache"); 2114 bcache_major = register_blkdev(0, "bcache");
2104 if (bcache_major < 0) { 2115 if (bcache_major < 0) {
2105 unregister_reboot_notifier(&reboot); 2116 unregister_reboot_notifier(&reboot);
2117 mutex_destroy(&bch_register_lock);
2106 return bcache_major; 2118 return bcache_major;
2107 } 2119 }
2108 2120
2109 if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || 2121 if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
2110 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2122 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
2111 sysfs_create_files(bcache_kobj, files) ||
2112 bch_request_init() || 2123 bch_request_init() ||
2113 bch_debug_init(bcache_kobj)) 2124 bch_debug_init(bcache_kobj) ||
2125 sysfs_create_files(bcache_kobj, files))
2114 goto err; 2126 goto err;
2115 2127
2116 return 0; 2128 return 0;
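
The minor-number rework above maps each bcache device to an index in bcache_device_idx and reserves BCACHE_MINORS (now 128) minors per device for partitions. A userspace sketch of the idx <-> first_minor helpers and the resulting device limit; MINORBITS is 20 in the kernel, so the cap works out to 8192 devices, and the values below are illustrative only:

#include <stdio.h>

#define MINORBITS               20
#define BCACHE_MINORS           128
#define BCACHE_DEVICE_IDX_MAX   ((1U << MINORBITS) / BCACHE_MINORS)

static int idx_to_first_minor(int idx)          { return idx * BCACHE_MINORS; }
static int first_minor_to_idx(int first_minor)  { return first_minor / BCACHE_MINORS; }

int main(void)
{
        /* bcache2 gets first_minor 256 and can expose up to 128 partition minors */
        printf("idx 2 -> first_minor %d\n", idx_to_first_minor(2));
        printf("first_minor 256 -> idx %d\n", first_minor_to_idx(256));
        printf("max devices: %u\n", BCACHE_DEVICE_IDX_MAX);     /* 8192 */
        return 0;
}
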
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 234b2f5b286d..b4184092c727 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -82,8 +82,9 @@ rw_attribute(writeback_delay);
82rw_attribute(writeback_rate); 82rw_attribute(writeback_rate);
83 83
84rw_attribute(writeback_rate_update_seconds); 84rw_attribute(writeback_rate_update_seconds);
85rw_attribute(writeback_rate_d_term); 85rw_attribute(writeback_rate_i_term_inverse);
86rw_attribute(writeback_rate_p_term_inverse); 86rw_attribute(writeback_rate_p_term_inverse);
87rw_attribute(writeback_rate_minimum);
87read_attribute(writeback_rate_debug); 88read_attribute(writeback_rate_debug);
88 89
89read_attribute(stripe_size); 90read_attribute(stripe_size);
@@ -131,15 +132,16 @@ SHOW(__bch_cached_dev)
131 sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); 132 sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
132 133
133 var_print(writeback_rate_update_seconds); 134 var_print(writeback_rate_update_seconds);
134 var_print(writeback_rate_d_term); 135 var_print(writeback_rate_i_term_inverse);
135 var_print(writeback_rate_p_term_inverse); 136 var_print(writeback_rate_p_term_inverse);
137 var_print(writeback_rate_minimum);
136 138
137 if (attr == &sysfs_writeback_rate_debug) { 139 if (attr == &sysfs_writeback_rate_debug) {
138 char rate[20]; 140 char rate[20];
139 char dirty[20]; 141 char dirty[20];
140 char target[20]; 142 char target[20];
141 char proportional[20]; 143 char proportional[20];
142 char derivative[20]; 144 char integral[20];
143 char change[20]; 145 char change[20];
144 s64 next_io; 146 s64 next_io;
145 147
@@ -147,7 +149,7 @@ SHOW(__bch_cached_dev)
147 bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); 149 bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
148 bch_hprint(target, dc->writeback_rate_target << 9); 150 bch_hprint(target, dc->writeback_rate_target << 9);
149 bch_hprint(proportional,dc->writeback_rate_proportional << 9); 151 bch_hprint(proportional,dc->writeback_rate_proportional << 9);
150 bch_hprint(derivative, dc->writeback_rate_derivative << 9); 152 bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
151 bch_hprint(change, dc->writeback_rate_change << 9); 153 bch_hprint(change, dc->writeback_rate_change << 9);
152 154
153 next_io = div64_s64(dc->writeback_rate.next - local_clock(), 155 next_io = div64_s64(dc->writeback_rate.next - local_clock(),
@@ -158,11 +160,11 @@ SHOW(__bch_cached_dev)
158 "dirty:\t\t%s\n" 160 "dirty:\t\t%s\n"
159 "target:\t\t%s\n" 161 "target:\t\t%s\n"
160 "proportional:\t%s\n" 162 "proportional:\t%s\n"
161 "derivative:\t%s\n" 163 "integral:\t%s\n"
162 "change:\t\t%s/sec\n" 164 "change:\t\t%s/sec\n"
163 "next io:\t%llims\n", 165 "next io:\t%llims\n",
164 rate, dirty, target, proportional, 166 rate, dirty, target, proportional,
165 derivative, change, next_io); 167 integral, change, next_io);
166 } 168 }
167 169
168 sysfs_hprint(dirty_data, 170 sysfs_hprint(dirty_data,
@@ -214,7 +216,7 @@ STORE(__cached_dev)
214 dc->writeback_rate.rate, 1, INT_MAX); 216 dc->writeback_rate.rate, 1, INT_MAX);
215 217
216 d_strtoul_nonzero(writeback_rate_update_seconds); 218 d_strtoul_nonzero(writeback_rate_update_seconds);
217 d_strtoul(writeback_rate_d_term); 219 d_strtoul(writeback_rate_i_term_inverse);
218 d_strtoul_nonzero(writeback_rate_p_term_inverse); 220 d_strtoul_nonzero(writeback_rate_p_term_inverse);
219 221
220 d_strtoi_h(sequential_cutoff); 222 d_strtoi_h(sequential_cutoff);
@@ -320,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = {
320 &sysfs_writeback_percent, 322 &sysfs_writeback_percent,
321 &sysfs_writeback_rate, 323 &sysfs_writeback_rate,
322 &sysfs_writeback_rate_update_seconds, 324 &sysfs_writeback_rate_update_seconds,
323 &sysfs_writeback_rate_d_term, 325 &sysfs_writeback_rate_i_term_inverse,
324 &sysfs_writeback_rate_p_term_inverse, 326 &sysfs_writeback_rate_p_term_inverse,
325 &sysfs_writeback_rate_debug, 327 &sysfs_writeback_rate_debug,
326 &sysfs_dirty_data, 328 &sysfs_dirty_data,
@@ -746,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = {
746}; 748};
747KTYPE(bch_cache_set_internal); 749KTYPE(bch_cache_set_internal);
748 750
751static int __bch_cache_cmp(const void *l, const void *r)
752{
753 return *((uint16_t *)r) - *((uint16_t *)l);
754}
755
749SHOW(__bch_cache) 756SHOW(__bch_cache)
750{ 757{
751 struct cache *ca = container_of(kobj, struct cache, kobj); 758 struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -770,9 +777,6 @@ SHOW(__bch_cache)
770 CACHE_REPLACEMENT(&ca->sb)); 777 CACHE_REPLACEMENT(&ca->sb));
771 778
772 if (attr == &sysfs_priority_stats) { 779 if (attr == &sysfs_priority_stats) {
773 int cmp(const void *l, const void *r)
774 { return *((uint16_t *) r) - *((uint16_t *) l); }
775
776 struct bucket *b; 780 struct bucket *b;
777 size_t n = ca->sb.nbuckets, i; 781 size_t n = ca->sb.nbuckets, i;
778 size_t unused = 0, available = 0, dirty = 0, meta = 0; 782 size_t unused = 0, available = 0, dirty = 0, meta = 0;
@@ -801,7 +805,7 @@ SHOW(__bch_cache)
801 p[i] = ca->buckets[i].prio; 805 p[i] = ca->buckets[i].prio;
802 mutex_unlock(&ca->set->bucket_lock); 806 mutex_unlock(&ca->set->bucket_lock);
803 807
804 sort(p, n, sizeof(uint16_t), cmp, NULL); 808 sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL);
805 809
806 while (n && 810 while (n &&
807 !cached[n - 1]) 811 !cached[n - 1])
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 176d3c2ef5f5..e548b8b51322 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
232 232
233 d->next += div_u64(done * NSEC_PER_SEC, d->rate); 233 d->next += div_u64(done * NSEC_PER_SEC, d->rate);
234 234
235 if (time_before64(now + NSEC_PER_SEC, d->next)) 235 /* Bound the time. Don't let us fall further than 2 seconds behind
236 d->next = now + NSEC_PER_SEC; 236 * (this prevents unnecessary backlog that would make it impossible
237 * to catch up). If we're ahead of the desired writeback rate,
238 * don't let us sleep more than 2.5 seconds (so we can notice/respond
239 * if the control system tells us to speed up!).
240 */
241 if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next))
242 d->next = now + NSEC_PER_SEC * 5LLU / 2LLU;
237 243
238 if (time_after64(now - NSEC_PER_SEC * 2, d->next)) 244 if (time_after64(now - NSEC_PER_SEC * 2, d->next))
239 d->next = now - NSEC_PER_SEC * 2; 245 d->next = now - NSEC_PER_SEC * 2;
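
The hunk above widens the bch_next_delay() clamp so the schedule may run up to 2.5 seconds ahead of "now" while still being capped at 2 seconds behind. A standalone sketch of that window with hypothetical values, where plain comparisons stand in for time_before64()/time_after64():

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint64_t now = 100 * NSEC_PER_SEC;
        uint64_t next = 110 * NSEC_PER_SEC;     /* schedule is far ahead of real time */

        if (now + NSEC_PER_SEC * 5 / 2 < next)  /* stand-in for time_before64() */
                next = now + NSEC_PER_SEC * 5 / 2;
        if (now - NSEC_PER_SEC * 2 > next)      /* stand-in for time_after64() */
                next = now - NSEC_PER_SEC * 2;

        printf("sleep %.1fs before next IO\n",
               (double)(next - now) / NSEC_PER_SEC);    /* 2.5s, not 10s */
        return 0;
}
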
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f54b58282f77..ed5e8a412eb8 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,10 +442,10 @@ struct bch_ratelimit {
442 uint64_t next; 442 uint64_t next;
443 443
444 /* 444 /*
445 * Rate at which we want to do work, in units per nanosecond 445 * Rate at which we want to do work, in units per second
446 * The units here correspond to the units passed to bch_next_delay() 446 * The units here correspond to the units passed to bch_next_delay()
447 */ 447 */
448 unsigned rate; 448 uint32_t rate;
449}; 449};
450 450
451static inline void bch_ratelimit_reset(struct bch_ratelimit *d) 451static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 70454f2ad2fa..56a37884ca8b 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc)
26 bcache_flash_devs_sectors_dirty(c); 26 bcache_flash_devs_sectors_dirty(c);
27 uint64_t cache_dirty_target = 27 uint64_t cache_dirty_target =
28 div_u64(cache_sectors * dc->writeback_percent, 100); 28 div_u64(cache_sectors * dc->writeback_percent, 100);
29
30 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), 29 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
31 c->cached_dev_sectors); 30 c->cached_dev_sectors);
32 31
33 /* PD controller */ 32 /*
34 33 * PI controller:
34 * Figures out the amount that should be written per second.
35 *
36 * First, the error (number of sectors that are dirty beyond our
37 * target) is calculated. The error is accumulated (numerically
38 * integrated).
39 *
40 * Then, the proportional value and integral value are scaled
41 * based on configured values. These are stored as inverses to
42 * avoid fixed point math and to make configuration easy-- e.g.
43 * the default value of 40 for writeback_rate_p_term_inverse
44 * attempts to write at a rate that would retire all the dirty
45 * blocks in 40 seconds.
46 *
 47 * The writeback_rate_i_term_inverse value of 10000 means that 1/10000th
48 * of the error is accumulated in the integral term per second.
49 * This acts as a slow, long-term average that is not subject to
50 * variations in usage like the p term.
51 */
35 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); 52 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
36 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 53 int64_t error = dirty - target;
37 int64_t proportional = dirty - target; 54 int64_t proportional_scaled =
38 int64_t change; 55 div_s64(error, dc->writeback_rate_p_term_inverse);
39 56 int64_t integral_scaled;
40 dc->disk.sectors_dirty_last = dirty; 57 uint32_t new_rate;
41 58
42 /* Scale to sectors per second */ 59 if ((error < 0 && dc->writeback_rate_integral > 0) ||
43 60 (error > 0 && time_before64(local_clock(),
44 proportional *= dc->writeback_rate_update_seconds; 61 dc->writeback_rate.next + NSEC_PER_MSEC))) {
45 proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); 62 /*
46 63 * Only decrease the integral term if it's more than
47 derivative = div_s64(derivative, dc->writeback_rate_update_seconds); 64 * zero. Only increase the integral term if the device
48 65 * is keeping up. (Don't wind up the integral
49 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, 66 * ineffectively in either case).
50 (dc->writeback_rate_d_term / 67 *
51 dc->writeback_rate_update_seconds) ?: 1, 0); 68 * It's necessary to scale this by
52 69 * writeback_rate_update_seconds to keep the integral
53 derivative *= dc->writeback_rate_d_term; 70 * term dimensioned properly.
54 derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); 71 */
55 72 dc->writeback_rate_integral += error *
56 change = proportional + derivative; 73 dc->writeback_rate_update_seconds;
74 }
57 75
58 /* Don't increase writeback rate if the device isn't keeping up */ 76 integral_scaled = div_s64(dc->writeback_rate_integral,
59 if (change > 0 && 77 dc->writeback_rate_i_term_inverse);
60 time_after64(local_clock(),
61 dc->writeback_rate.next + NSEC_PER_MSEC))
62 change = 0;
63 78
64 dc->writeback_rate.rate = 79 new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
65 clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, 80 dc->writeback_rate_minimum, NSEC_PER_SEC);
66 1, NSEC_PER_MSEC);
67 81
68 dc->writeback_rate_proportional = proportional; 82 dc->writeback_rate_proportional = proportional_scaled;
69 dc->writeback_rate_derivative = derivative; 83 dc->writeback_rate_integral_scaled = integral_scaled;
70 dc->writeback_rate_change = change; 84 dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
85 dc->writeback_rate.rate = new_rate;
71 dc->writeback_rate_target = target; 86 dc->writeback_rate_target = target;
72} 87}
73 88
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl)
180 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 195 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
181 struct keybuf_key *w = io->bio.bi_private; 196 struct keybuf_key *w = io->bio.bi_private;
182 197
183 dirty_init(w); 198 /*
184 bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); 199 * IO errors are signalled using the dirty bit on the key.
185 io->bio.bi_iter.bi_sector = KEY_START(&w->key); 200 * If we failed to read, we should not attempt to write to the
186 bio_set_dev(&io->bio, io->dc->bdev); 201 * backing device. Instead, immediately go to write_dirty_finish
187 io->bio.bi_end_io = dirty_endio; 202 * to clean up.
203 */
204 if (KEY_DIRTY(&w->key)) {
205 dirty_init(w);
206 bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
207 io->bio.bi_iter.bi_sector = KEY_START(&w->key);
208 bio_set_dev(&io->bio, io->dc->bdev);
209 io->bio.bi_end_io = dirty_endio;
188 210
189 closure_bio_submit(&io->bio, cl); 211 closure_bio_submit(&io->bio, cl);
212 }
190 213
191 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); 214 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
192} 215}
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg)
418 struct cached_dev *dc = arg; 441 struct cached_dev *dc = arg;
419 bool searched_full_index; 442 bool searched_full_index;
420 443
444 bch_ratelimit_reset(&dc->writeback_rate);
445
421 while (!kthread_should_stop()) { 446 while (!kthread_should_stop()) {
422 down_write(&dc->writeback_lock); 447 down_write(&dc->writeback_lock);
423 if (!atomic_read(&dc->has_dirty) || 448 if (!atomic_read(&dc->has_dirty) ||
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg)
445 470
446 up_write(&dc->writeback_lock); 471 up_write(&dc->writeback_lock);
447 472
448 bch_ratelimit_reset(&dc->writeback_rate);
449 read_dirty(dc); 473 read_dirty(dc);
450 474
451 if (searched_full_index) { 475 if (searched_full_index) {
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg)
455 !kthread_should_stop() && 479 !kthread_should_stop() &&
456 !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) 480 !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
457 delay = schedule_timeout_interruptible(delay); 481 delay = schedule_timeout_interruptible(delay);
482
483 bch_ratelimit_reset(&dc->writeback_rate);
458 } 484 }
459 } 485 }
460 486
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
492 518
493 bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), 519 bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
494 sectors_dirty_init_fn, 0); 520 sectors_dirty_init_fn, 0);
495
496 d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
497} 521}
498 522
499void bch_cached_dev_writeback_init(struct cached_dev *dc) 523void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
507 dc->writeback_percent = 10; 531 dc->writeback_percent = 10;
508 dc->writeback_delay = 30; 532 dc->writeback_delay = 30;
509 dc->writeback_rate.rate = 1024; 533 dc->writeback_rate.rate = 1024;
534 dc->writeback_rate_minimum = 8;
510 535
511 dc->writeback_rate_update_seconds = 5; 536 dc->writeback_rate_update_seconds = 5;
512 dc->writeback_rate_d_term = 30; 537 dc->writeback_rate_p_term_inverse = 40;
513 dc->writeback_rate_p_term_inverse = 6000; 538 dc->writeback_rate_i_term_inverse = 10000;
514 539
515 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 540 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
516} 541}
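
A userspace sketch of one step of the new PI controller, mirroring the arithmetic of __update_writeback_rate() with the defaults set above (p_term_inverse 40, i_term_inverse 10000, 5 second update interval, minimum rate 8). The dirty/target numbers are made up and the anti-windup check is ignored for brevity:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        int64_t dirty = 600000, target = 500000;        /* sectors, hypothetical */
        int64_t integral = 0;
        int64_t p_inv = 40, i_inv = 10000, update_secs = 5, minimum = 8;

        int64_t error = dirty - target;                 /* 100000 sectors over target */
        int64_t p = error / p_inv;                      /* 2500: retire the error in ~40s */

        integral += error * update_secs;                /* accumulate the error over time */
        int64_t i = integral / i_inv;                   /* 50: slow long-term correction */

        int64_t rate = clamp64(p + i, minimum, NSEC_PER_SEC);

        printf("writeback rate = %lld sectors/sec\n", (long long)rate); /* 2550 */
        return 0;
}
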
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 151544740148..a9e3ffb4b03c 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -77,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
77 if (would_skip) 77 if (would_skip)
78 return false; 78 return false;
79 79
80 return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; 80 return (op_is_sync(bio->bi_opf) ||
81 bio->bi_opf & (REQ_META|REQ_PRIO) ||
82 in_use <= CUTOFF_WRITEBACK);
81} 83}
82 84
83static inline void bch_writeback_queue(struct cached_dev *dc) 85static inline void bch_writeback_queue(struct cached_dev *dc)
@@ -90,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
90{ 92{
91 if (!atomic_read(&dc->has_dirty) && 93 if (!atomic_read(&dc->has_dirty) &&
92 !atomic_xchg(&dc->has_dirty, 1)) { 94 !atomic_xchg(&dc->has_dirty, 1)) {
93 atomic_inc(&dc->count); 95 refcount_inc(&dc->count);
94 96
95 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { 97 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
96 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); 98 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d2121637b4ab..4d8ed74efadf 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index,
368 pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, 368 pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
369 (unsigned long long)index << PAGE_SHIFT); 369 (unsigned long long)index << PAGE_SHIFT);
370 370
371 bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); 371 bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
372 if (!bh) { 372 if (!bh) {
373 ret = -ENOMEM; 373 ret = -ENOMEM;
374 goto out; 374 goto out;
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index eadfcfd106ff..9d32f25489c2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
56 56
57int dm_request_based(struct mapped_device *md) 57int dm_request_based(struct mapped_device *md)
58{ 58{
59 return blk_queue_stackable(md->queue); 59 return queue_is_rq_based(md->queue);
60} 60}
61 61
62static void dm_old_start_queue(struct request_queue *q) 62static void dm_old_start_queue(struct request_queue *q)
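
queue_is_rq_based() is an existing helper from <linux/blkdev.h>; it asks whether the queue is request based at all (blk-mq or legacy ->request_fn), roughly:

/* paraphrased from include/linux/blkdev.h of this era */
static inline bool queue_is_rq_based(struct request_queue *q)
{
	return q->mq_ops || q->request_fn;
}

That test is a better fit for dm_request_based() than the old QUEUE_FLAG_STACKABLE check, since the flag itself is removed by the dm-table.c and dm.c hunks below.
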
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ef7b8f201f73..75281828f2cb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1000,7 +1000,7 @@ verify_rq_based:
1000 list_for_each_entry(dd, devices, list) { 1000 list_for_each_entry(dd, devices, list) {
1001 struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); 1001 struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
1002 1002
1003 if (!blk_queue_stackable(q)) { 1003 if (!queue_is_rq_based(q)) {
1004 DMERR("table load rejected: including" 1004 DMERR("table load rejected: including"
1005 " non-request-stackable devices"); 1005 " non-request-stackable devices");
1006 return -EINVAL; 1006 return -EINVAL;
@@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1847 */ 1847 */
1848 if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) 1848 if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
1849 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); 1849 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
1850
1851 /*
1852 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1853 * visible to other CPUs because, once the flag is set, incoming bios
1854 * are processed by request-based dm, which refers to the queue
1855 * settings.
1856 * Until the flag set, bios are passed to bio-based dm and queued to
1857 * md->deferred where queue settings are not needed yet.
1858 * Those bios are passed to request-based dm at the resume time.
1859 */
1860 smp_mb();
1861 if (dm_table_request_based(t))
1862 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
1863} 1850}
1864 1851
1865unsigned int dm_table_get_num_targets(struct dm_table *t) 1852unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8aaffa19b29a..a3f8cbb98dd5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1619,17 +1619,6 @@ static void dm_wq_work(struct work_struct *work);
1619void dm_init_md_queue(struct mapped_device *md) 1619void dm_init_md_queue(struct mapped_device *md)
1620{ 1620{
1621 /* 1621 /*
1622 * Request-based dm devices cannot be stacked on top of bio-based dm
1623 * devices. The type of this dm device may not have been decided yet.
1624 * The type is decided at the first table loading time.
1625 * To prevent problematic device stacking, clear the queue flag
1626 * for request stacking support until then.
1627 *
1628 * This queue is new, so no concurrency on the queue_flags.
1629 */
1630 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1631
1632 /*
1633 * Initialize data that will only be used by a non-blk-mq DM queue 1622 * Initialize data that will only be used by a non-blk-mq DM queue
1634 * - must do so here (in alloc_dev callchain) before queue is used 1623 * - must do so here (in alloc_dev callchain) before queue is used
1635 */ 1624 */
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index b7c78a5b1f7a..04008e0bbe81 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,2 +1,6 @@
1menu "NVME Support"
2
1source "drivers/nvme/host/Kconfig" 3source "drivers/nvme/host/Kconfig"
2source "drivers/nvme/target/Kconfig" 4source "drivers/nvme/target/Kconfig"
5
6endmenu
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 46d6cb1e03bd..b979cf3bce65 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -13,6 +13,15 @@ config BLK_DEV_NVME
13 To compile this driver as a module, choose M here: the 13 To compile this driver as a module, choose M here: the
14 module will be called nvme. 14 module will be called nvme.
15 15
16config NVME_MULTIPATH
17 bool "NVMe multipath support"
18 depends on NVME_CORE
19 ---help---
20 This option enables support for multipath access to NVMe
21 subsystems. If this option is enabled, only a single
22 /dev/nvmeXnY device will show up for each NVMe namespace,
23 even if it is accessible through multiple controllers.
24
16config NVME_FABRICS 25config NVME_FABRICS
17 tristate 26 tristate
18 27
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 7b96e4588a12..a25fd43650ad 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
6obj-$(CONFIG_NVME_FC) += nvme-fc.o 6obj-$(CONFIG_NVME_FC) += nvme-fc.o
7 7
8nvme-core-y := core.o 8nvme-core-y := core.o
9nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
9nvme-core-$(CONFIG_NVM) += lightnvm.o 10nvme-core-$(CONFIG_NVM) += lightnvm.o
10 11
11nvme-y += pci.o 12nvme-y += pci.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 37f9039bb9ca..25da74d310d1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -34,13 +34,13 @@
34 34
35#define NVME_MINORS (1U << MINORBITS) 35#define NVME_MINORS (1U << MINORBITS)
36 36
37unsigned char admin_timeout = 60; 37unsigned int admin_timeout = 60;
38module_param(admin_timeout, byte, 0644); 38module_param(admin_timeout, uint, 0644);
39MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 39MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
40EXPORT_SYMBOL_GPL(admin_timeout); 40EXPORT_SYMBOL_GPL(admin_timeout);
41 41
42unsigned char nvme_io_timeout = 30; 42unsigned int nvme_io_timeout = 30;
43module_param_named(io_timeout, nvme_io_timeout, byte, 0644); 43module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
44MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 44MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
45EXPORT_SYMBOL_GPL(nvme_io_timeout); 45EXPORT_SYMBOL_GPL(nvme_io_timeout);
46 46
@@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5;
52module_param_named(max_retries, nvme_max_retries, byte, 0644); 52module_param_named(max_retries, nvme_max_retries, byte, 0644);
53MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); 53MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
54 54
55static int nvme_char_major;
56module_param(nvme_char_major, int, 0);
57
58static unsigned long default_ps_max_latency_us = 100000; 55static unsigned long default_ps_max_latency_us = 100000;
59module_param(default_ps_max_latency_us, ulong, 0644); 56module_param(default_ps_max_latency_us, ulong, 0644);
60MODULE_PARM_DESC(default_ps_max_latency_us, 57MODULE_PARM_DESC(default_ps_max_latency_us,
@@ -71,10 +68,17 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
71struct workqueue_struct *nvme_wq; 68struct workqueue_struct *nvme_wq;
72EXPORT_SYMBOL_GPL(nvme_wq); 69EXPORT_SYMBOL_GPL(nvme_wq);
73 70
74static LIST_HEAD(nvme_ctrl_list); 71static DEFINE_IDA(nvme_subsystems_ida);
75static DEFINE_SPINLOCK(dev_list_lock); 72static LIST_HEAD(nvme_subsystems);
73static DEFINE_MUTEX(nvme_subsystems_lock);
76 74
75static DEFINE_IDA(nvme_instance_ida);
76static dev_t nvme_chr_devt;
77static struct class *nvme_class; 77static struct class *nvme_class;
78static struct class *nvme_subsys_class;
79
80static void nvme_ns_remove(struct nvme_ns *ns);
81static int nvme_revalidate_disk(struct gendisk *disk);
78 82
79static __le32 nvme_get_log_dw10(u8 lid, size_t size) 83static __le32 nvme_get_log_dw10(u8 lid, size_t size)
80{ 84{
@@ -101,6 +105,51 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
101 return ret; 105 return ret;
102} 106}
103 107
108static void nvme_delete_ctrl_work(struct work_struct *work)
109{
110 struct nvme_ctrl *ctrl =
111 container_of(work, struct nvme_ctrl, delete_work);
112
113 flush_work(&ctrl->reset_work);
114 nvme_stop_ctrl(ctrl);
115 nvme_remove_namespaces(ctrl);
116 ctrl->ops->delete_ctrl(ctrl);
117 nvme_uninit_ctrl(ctrl);
118 nvme_put_ctrl(ctrl);
119}
120
121int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
122{
123 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
124 return -EBUSY;
125 if (!queue_work(nvme_wq, &ctrl->delete_work))
126 return -EBUSY;
127 return 0;
128}
129EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
130
131int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
132{
133 int ret = 0;
134
135 /*
136 * Keep a reference until the work is flushed since ->delete_ctrl
137 * can free the controller.
138 */
139 nvme_get_ctrl(ctrl);
140 ret = nvme_delete_ctrl(ctrl);
141 if (!ret)
142 flush_work(&ctrl->delete_work);
143 nvme_put_ctrl(ctrl);
144 return ret;
145}
146EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
147
148static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
149{
150 return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
151}
152
104static blk_status_t nvme_error_status(struct request *req) 153static blk_status_t nvme_error_status(struct request *req)
105{ 154{
106 switch (nvme_req(req)->status & 0x7ff) { 155 switch (nvme_req(req)->status & 0x7ff) {
@@ -142,9 +191,16 @@ static inline bool nvme_req_needs_retry(struct request *req)
142void nvme_complete_rq(struct request *req) 191void nvme_complete_rq(struct request *req)
143{ 192{
144 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { 193 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
145 nvme_req(req)->retries++; 194 if (nvme_req_needs_failover(req)) {
146 blk_mq_requeue_request(req, true); 195 nvme_failover_req(req);
147 return; 196 return;
197 }
198
199 if (!blk_queue_dying(req->q)) {
200 nvme_req(req)->retries++;
201 blk_mq_requeue_request(req, true);
202 return;
203 }
148 } 204 }
149 205
150 blk_mq_end_request(req, nvme_error_status(req)); 206 blk_mq_end_request(req, nvme_error_status(req));
@@ -153,18 +209,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
153 209
154void nvme_cancel_request(struct request *req, void *data, bool reserved) 210void nvme_cancel_request(struct request *req, void *data, bool reserved)
155{ 211{
156 int status;
157
158 if (!blk_mq_request_started(req)) 212 if (!blk_mq_request_started(req))
159 return; 213 return;
160 214
161 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 215 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
162 "Cancelling I/O %d", req->tag); 216 "Cancelling I/O %d", req->tag);
163 217
164 status = NVME_SC_ABORT_REQ; 218 nvme_req(req)->status = NVME_SC_ABORT_REQ;
165 if (blk_queue_dying(req->q))
166 status |= NVME_SC_DNR;
167 nvme_req(req)->status = status;
168 blk_mq_complete_request(req); 219 blk_mq_complete_request(req);
169 220
170} 221}
@@ -205,6 +256,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
205 case NVME_CTRL_RECONNECTING: 256 case NVME_CTRL_RECONNECTING:
206 switch (old_state) { 257 switch (old_state) {
207 case NVME_CTRL_LIVE: 258 case NVME_CTRL_LIVE:
259 case NVME_CTRL_RESETTING:
208 changed = true; 260 changed = true;
209 /* FALLTHRU */ 261 /* FALLTHRU */
210 default: 262 default:
@@ -239,11 +291,29 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
239 ctrl->state = new_state; 291 ctrl->state = new_state;
240 292
241 spin_unlock_irqrestore(&ctrl->lock, flags); 293 spin_unlock_irqrestore(&ctrl->lock, flags);
242 294 if (changed && ctrl->state == NVME_CTRL_LIVE)
295 nvme_kick_requeue_lists(ctrl);
243 return changed; 296 return changed;
244} 297}
245EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 298EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
246 299
300static void nvme_free_ns_head(struct kref *ref)
301{
302 struct nvme_ns_head *head =
303 container_of(ref, struct nvme_ns_head, ref);
304
305 nvme_mpath_remove_disk(head);
306 ida_simple_remove(&head->subsys->ns_ida, head->instance);
307 list_del_init(&head->entry);
308 cleanup_srcu_struct(&head->srcu);
309 kfree(head);
310}
311
312static void nvme_put_ns_head(struct nvme_ns_head *head)
313{
314 kref_put(&head->ref, nvme_free_ns_head);
315}
316
247static void nvme_free_ns(struct kref *kref) 317static void nvme_free_ns(struct kref *kref)
248{ 318{
249 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 319 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
@@ -251,14 +321,8 @@ static void nvme_free_ns(struct kref *kref)
251 if (ns->ndev) 321 if (ns->ndev)
252 nvme_nvm_unregister(ns); 322 nvme_nvm_unregister(ns);
253 323
254 if (ns->disk) {
255 spin_lock(&dev_list_lock);
256 ns->disk->private_data = NULL;
257 spin_unlock(&dev_list_lock);
258 }
259
260 put_disk(ns->disk); 324 put_disk(ns->disk);
261 ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); 325 nvme_put_ns_head(ns->head);
262 nvme_put_ctrl(ns->ctrl); 326 nvme_put_ctrl(ns->ctrl);
263 kfree(ns); 327 kfree(ns);
264} 328}
@@ -268,31 +332,8 @@ static void nvme_put_ns(struct nvme_ns *ns)
268 kref_put(&ns->kref, nvme_free_ns); 332 kref_put(&ns->kref, nvme_free_ns);
269} 333}
270 334
271static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
272{
273 struct nvme_ns *ns;
274
275 spin_lock(&dev_list_lock);
276 ns = disk->private_data;
277 if (ns) {
278 if (!kref_get_unless_zero(&ns->kref))
279 goto fail;
280 if (!try_module_get(ns->ctrl->ops->module))
281 goto fail_put_ns;
282 }
283 spin_unlock(&dev_list_lock);
284
285 return ns;
286
287fail_put_ns:
288 kref_put(&ns->kref, nvme_free_ns);
289fail:
290 spin_unlock(&dev_list_lock);
291 return NULL;
292}
293
294struct request *nvme_alloc_request(struct request_queue *q, 335struct request *nvme_alloc_request(struct request_queue *q,
295 struct nvme_command *cmd, unsigned int flags, int qid) 336 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
296{ 337{
297 unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; 338 unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
298 struct request *req; 339 struct request *req;
@@ -417,7 +458,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
417{ 458{
418 memset(cmnd, 0, sizeof(*cmnd)); 459 memset(cmnd, 0, sizeof(*cmnd));
419 cmnd->common.opcode = nvme_cmd_flush; 460 cmnd->common.opcode = nvme_cmd_flush;
420 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 461 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
421} 462}
422 463
423static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, 464static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
@@ -448,7 +489,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
448 489
449 memset(cmnd, 0, sizeof(*cmnd)); 490 memset(cmnd, 0, sizeof(*cmnd));
450 cmnd->dsm.opcode = nvme_cmd_dsm; 491 cmnd->dsm.opcode = nvme_cmd_dsm;
451 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 492 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
452 cmnd->dsm.nr = cpu_to_le32(segments - 1); 493 cmnd->dsm.nr = cpu_to_le32(segments - 1);
453 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 494 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
454 495
@@ -467,16 +508,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
467 u16 control = 0; 508 u16 control = 0;
468 u32 dsmgmt = 0; 509 u32 dsmgmt = 0;
469 510
470 /*
471 * If formated with metadata, require the block layer provide a buffer
472 * unless this namespace is formated such that the metadata can be
473 * stripped/generated by the controller with PRACT=1.
474 */
475 if (ns && ns->ms &&
476 (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
477 !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
478 return BLK_STS_NOTSUPP;
479
480 if (req->cmd_flags & REQ_FUA) 511 if (req->cmd_flags & REQ_FUA)
481 control |= NVME_RW_FUA; 512 control |= NVME_RW_FUA;
482 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 513 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
@@ -487,7 +518,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
487 518
488 memset(cmnd, 0, sizeof(*cmnd)); 519 memset(cmnd, 0, sizeof(*cmnd));
489 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); 520 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
490 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 521 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
491 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 522 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
492 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 523 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
493 524
@@ -495,6 +526,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
495 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); 526 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
496 527
497 if (ns->ms) { 528 if (ns->ms) {
529 /*
530 * If formatted with metadata, the block layer always provides a
531 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
532 * we enable the PRACT bit for protection information or set the
533 * namespace capacity to zero to prevent any I/O.
534 */
535 if (!blk_integrity_rq(req)) {
536 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
537 return BLK_STS_NOTSUPP;
538 control |= NVME_RW_PRINFO_PRACT;
539 }
540
498 switch (ns->pi_type) { 541 switch (ns->pi_type) {
499 case NVME_NS_DPS_PI_TYPE3: 542 case NVME_NS_DPS_PI_TYPE3:
500 control |= NVME_RW_PRINFO_PRCHK_GUARD; 543 control |= NVME_RW_PRINFO_PRCHK_GUARD;
@@ -507,8 +550,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
507 nvme_block_nr(ns, blk_rq_pos(req))); 550 nvme_block_nr(ns, blk_rq_pos(req)));
508 break; 551 break;
509 } 552 }
510 if (!blk_integrity_rq(req))
511 control |= NVME_RW_PRINFO_PRACT;
512 } 553 }
513 554
514 cmnd->rw.control = cpu_to_le16(control); 555 cmnd->rw.control = cpu_to_le16(control);
@@ -560,7 +601,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
560 */ 601 */
561int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 602int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
562 union nvme_result *result, void *buffer, unsigned bufflen, 603 union nvme_result *result, void *buffer, unsigned bufflen,
563 unsigned timeout, int qid, int at_head, int flags) 604 unsigned timeout, int qid, int at_head,
605 blk_mq_req_flags_t flags)
564{ 606{
565 struct request *req; 607 struct request *req;
566 int ret; 608 int ret;
@@ -778,7 +820,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
778} 820}
779 821
780static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, 822static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
781 u8 *eui64, u8 *nguid, uuid_t *uuid) 823 struct nvme_ns_ids *ids)
782{ 824{
783 struct nvme_command c = { }; 825 struct nvme_command c = { };
784 int status; 826 int status;
@@ -814,7 +856,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
814 goto free_data; 856 goto free_data;
815 } 857 }
816 len = NVME_NIDT_EUI64_LEN; 858 len = NVME_NIDT_EUI64_LEN;
817 memcpy(eui64, data + pos + sizeof(*cur), len); 859 memcpy(ids->eui64, data + pos + sizeof(*cur), len);
818 break; 860 break;
819 case NVME_NIDT_NGUID: 861 case NVME_NIDT_NGUID:
820 if (cur->nidl != NVME_NIDT_NGUID_LEN) { 862 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
@@ -824,7 +866,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
824 goto free_data; 866 goto free_data;
825 } 867 }
826 len = NVME_NIDT_NGUID_LEN; 868 len = NVME_NIDT_NGUID_LEN;
827 memcpy(nguid, data + pos + sizeof(*cur), len); 869 memcpy(ids->nguid, data + pos + sizeof(*cur), len);
828 break; 870 break;
829 case NVME_NIDT_UUID: 871 case NVME_NIDT_UUID:
830 if (cur->nidl != NVME_NIDT_UUID_LEN) { 872 if (cur->nidl != NVME_NIDT_UUID_LEN) {
@@ -834,7 +876,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
834 goto free_data; 876 goto free_data;
835 } 877 }
836 len = NVME_NIDT_UUID_LEN; 878 len = NVME_NIDT_UUID_LEN;
837 uuid_copy(uuid, data + pos + sizeof(*cur)); 879 uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
838 break; 880 break;
839 default: 881 default:
840 /* Skip unnkown types */ 882 /* Skip unnkown types */
@@ -968,7 +1010,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
968 memset(&c, 0, sizeof(c)); 1010 memset(&c, 0, sizeof(c));
969 c.rw.opcode = io.opcode; 1011 c.rw.opcode = io.opcode;
970 c.rw.flags = io.flags; 1012 c.rw.flags = io.flags;
971 c.rw.nsid = cpu_to_le32(ns->ns_id); 1013 c.rw.nsid = cpu_to_le32(ns->head->ns_id);
972 c.rw.slba = cpu_to_le64(io.slba); 1014 c.rw.slba = cpu_to_le64(io.slba);
973 c.rw.length = cpu_to_le16(io.nblocks); 1015 c.rw.length = cpu_to_le16(io.nblocks);
974 c.rw.control = cpu_to_le16(io.control); 1016 c.rw.control = cpu_to_le16(io.control);
@@ -982,12 +1024,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
982 metadata, meta_len, io.slba, NULL, 0); 1024 metadata, meta_len, io.slba, NULL, 0);
983} 1025}
984 1026
1027static u32 nvme_known_admin_effects(u8 opcode)
1028{
1029 switch (opcode) {
1030 case nvme_admin_format_nvm:
1031 return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1032 NVME_CMD_EFFECTS_CSE_MASK;
1033 case nvme_admin_sanitize_nvm:
1034 return NVME_CMD_EFFECTS_CSE_MASK;
1035 default:
1036 break;
1037 }
1038 return 0;
1039}
1040
1041static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1042 u8 opcode)
1043{
1044 u32 effects = 0;
1045
1046 if (ns) {
1047 if (ctrl->effects)
1048 effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1049 if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1050 dev_warn(ctrl->device,
1051 "IO command:%02x has unhandled effects:%08x\n",
1052 opcode, effects);
1053 return 0;
1054 }
1055
1056 if (ctrl->effects)
1057 effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1058 else
1059 effects = nvme_known_admin_effects(opcode);
1060
1061 /*
1062 * For simplicity, IO to all namespaces is quiesced even if the command
1063 * effects say only one namespace is affected.
1064 */
1065 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1066 nvme_start_freeze(ctrl);
1067 nvme_wait_freeze(ctrl);
1068 }
1069 return effects;
1070}
1071
1072static void nvme_update_formats(struct nvme_ctrl *ctrl)
1073{
1074 struct nvme_ns *ns;
1075
1076 mutex_lock(&ctrl->namespaces_mutex);
1077 list_for_each_entry(ns, &ctrl->namespaces, list) {
1078 if (ns->disk && nvme_revalidate_disk(ns->disk))
1079 nvme_ns_remove(ns);
1080 }
1081 mutex_unlock(&ctrl->namespaces_mutex);
1082}
1083
1084static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1085{
1086 /*
1087 * Revalidate LBA changes prior to unfreezing. This is necessary to
1088 * prevent memory corruption if a logical block size was changed by
1089 * this command.
1090 */
1091 if (effects & NVME_CMD_EFFECTS_LBCC)
1092 nvme_update_formats(ctrl);
1093 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
1094 nvme_unfreeze(ctrl);
1095 if (effects & NVME_CMD_EFFECTS_CCC)
1096 nvme_init_identify(ctrl);
1097 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1098 nvme_queue_scan(ctrl);
1099}
1100
985static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1101static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
986 struct nvme_passthru_cmd __user *ucmd) 1102 struct nvme_passthru_cmd __user *ucmd)
987{ 1103{
988 struct nvme_passthru_cmd cmd; 1104 struct nvme_passthru_cmd cmd;
989 struct nvme_command c; 1105 struct nvme_command c;
990 unsigned timeout = 0; 1106 unsigned timeout = 0;
1107 u32 effects;
991 int status; 1108 int status;
992 1109
993 if (!capable(CAP_SYS_ADMIN)) 1110 if (!capable(CAP_SYS_ADMIN))
@@ -1013,10 +1130,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1013 if (cmd.timeout_ms) 1130 if (cmd.timeout_ms)
1014 timeout = msecs_to_jiffies(cmd.timeout_ms); 1131 timeout = msecs_to_jiffies(cmd.timeout_ms);
1015 1132
1133 effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
1016 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1134 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1017 (void __user *)(uintptr_t)cmd.addr, cmd.data_len, 1135 (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1018 (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, 1136 (void __user *)(uintptr_t)cmd.metadata, cmd.metadata,
1019 0, &cmd.result, timeout); 1137 0, &cmd.result, timeout);
1138 nvme_passthru_end(ctrl, effects);
1139
1020 if (status >= 0) { 1140 if (status >= 0) {
1021 if (put_user(cmd.result, &ucmd->result)) 1141 if (put_user(cmd.result, &ucmd->result))
1022 return -EFAULT; 1142 return -EFAULT;
@@ -1025,15 +1145,37 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1025 return status; 1145 return status;
1026} 1146}
1027 1147
1028static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 1148/*
1029 unsigned int cmd, unsigned long arg) 1149 * Issue ioctl requests on the first available path. Note that unlike normal
1150 * block layer requests, we will not retry a failed request on another controller.
1151 */
1152static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1153 struct nvme_ns_head **head, int *srcu_idx)
1030{ 1154{
1031 struct nvme_ns *ns = bdev->bd_disk->private_data; 1155#ifdef CONFIG_NVME_MULTIPATH
1156 if (disk->fops == &nvme_ns_head_ops) {
1157 *head = disk->private_data;
1158 *srcu_idx = srcu_read_lock(&(*head)->srcu);
1159 return nvme_find_path(*head);
1160 }
1161#endif
1162 *head = NULL;
1163 *srcu_idx = -1;
1164 return disk->private_data;
1165}
1032 1166
1167static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1168{
1169 if (head)
1170 srcu_read_unlock(&head->srcu, idx);
1171}
1172
1173static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
1174{
1033 switch (cmd) { 1175 switch (cmd) {
1034 case NVME_IOCTL_ID: 1176 case NVME_IOCTL_ID:
1035 force_successful_syscall_return(); 1177 force_successful_syscall_return();
1036 return ns->ns_id; 1178 return ns->head->ns_id;
1037 case NVME_IOCTL_ADMIN_CMD: 1179 case NVME_IOCTL_ADMIN_CMD:
1038 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); 1180 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1039 case NVME_IOCTL_IO_CMD: 1181 case NVME_IOCTL_IO_CMD:
@@ -1052,27 +1194,39 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1052 } 1194 }
1053} 1195}
1054 1196
1055#ifdef CONFIG_COMPAT 1197static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1056static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1198 unsigned int cmd, unsigned long arg)
1057 unsigned int cmd, unsigned long arg)
1058{ 1199{
1059 return nvme_ioctl(bdev, mode, cmd, arg); 1200 struct nvme_ns_head *head = NULL;
1201 struct nvme_ns *ns;
1202 int srcu_idx, ret;
1203
1204 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1205 if (unlikely(!ns))
1206 ret = -EWOULDBLOCK;
1207 else
1208 ret = nvme_ns_ioctl(ns, cmd, arg);
1209 nvme_put_ns_from_disk(head, srcu_idx);
1210 return ret;
1060} 1211}
1061#else
1062#define nvme_compat_ioctl NULL
1063#endif
1064 1212
1065static int nvme_open(struct block_device *bdev, fmode_t mode) 1213static int nvme_open(struct block_device *bdev, fmode_t mode)
1066{ 1214{
1067 return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; 1215 struct nvme_ns *ns = bdev->bd_disk->private_data;
1216
1217#ifdef CONFIG_NVME_MULTIPATH
1218 /* should never be called due to GENHD_FL_HIDDEN */
1219 if (WARN_ON_ONCE(ns->head->disk))
1220 return -ENXIO;
1221#endif
1222 if (!kref_get_unless_zero(&ns->kref))
1223 return -ENXIO;
1224 return 0;
1068} 1225}
1069 1226
1070static void nvme_release(struct gendisk *disk, fmode_t mode) 1227static void nvme_release(struct gendisk *disk, fmode_t mode)
1071{ 1228{
1072 struct nvme_ns *ns = disk->private_data; 1229 nvme_put_ns(disk->private_data);
1073
1074 module_put(ns->ctrl->ops->module);
1075 nvme_put_ns(ns);
1076} 1230}
1077 1231
1078static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1232static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
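
nvme_find_path(), called from nvme_get_ns_from_disk() above, lives in the new drivers/nvme/host/multipath.c rather than in this file. Conceptually it returns the first namespace on the shared head whose controller is live; a simplified sketch with assumed sibling-list field names (the real helper also caches the most recently used path):

/* sketch only; the caller holds head->srcu for reading */
static struct nvme_ns *demo_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state == NVME_CTRL_LIVE &&
		    !test_bit(NVME_NS_DEAD, &ns->flags))
			return ns;
	}
	return NULL;
}
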
@@ -1085,35 +1239,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1085} 1239}
1086 1240
1087#ifdef CONFIG_BLK_DEV_INTEGRITY 1241#ifdef CONFIG_BLK_DEV_INTEGRITY
1088static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, 1242static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1089 u16 bs)
1090{
1091 struct nvme_ns *ns = disk->private_data;
1092 u16 old_ms = ns->ms;
1093 u8 pi_type = 0;
1094
1095 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1096 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1097
1098 /* PI implementation requires metadata equal t10 pi tuple size */
1099 if (ns->ms == sizeof(struct t10_pi_tuple))
1100 pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1101
1102 if (blk_get_integrity(disk) &&
1103 (ns->pi_type != pi_type || ns->ms != old_ms ||
1104 bs != queue_logical_block_size(disk->queue) ||
1105 (ns->ms && ns->ext)))
1106 blk_integrity_unregister(disk);
1107
1108 ns->pi_type = pi_type;
1109}
1110
1111static void nvme_init_integrity(struct nvme_ns *ns)
1112{ 1243{
1113 struct blk_integrity integrity; 1244 struct blk_integrity integrity;
1114 1245
1115 memset(&integrity, 0, sizeof(integrity)); 1246 memset(&integrity, 0, sizeof(integrity));
1116 switch (ns->pi_type) { 1247 switch (pi_type) {
1117 case NVME_NS_DPS_PI_TYPE3: 1248 case NVME_NS_DPS_PI_TYPE3:
1118 integrity.profile = &t10_pi_type3_crc; 1249 integrity.profile = &t10_pi_type3_crc;
1119 integrity.tag_size = sizeof(u16) + sizeof(u32); 1250 integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1129,16 +1260,12 @@ static void nvme_init_integrity(struct nvme_ns *ns)
1129 integrity.profile = NULL; 1260 integrity.profile = NULL;
1130 break; 1261 break;
1131 } 1262 }
1132 integrity.tuple_size = ns->ms; 1263 integrity.tuple_size = ms;
1133 blk_integrity_register(ns->disk, &integrity); 1264 blk_integrity_register(disk, &integrity);
1134 blk_queue_max_integrity_segments(ns->queue, 1); 1265 blk_queue_max_integrity_segments(disk->queue, 1);
1135} 1266}
1136#else 1267#else
1137static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, 1268static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1138 u16 bs)
1139{
1140}
1141static void nvme_init_integrity(struct nvme_ns *ns)
1142{ 1269{
1143} 1270}
1144#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1271#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -1149,53 +1276,89 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
1149 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); 1276 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1150} 1277}
1151 1278
1152static void nvme_config_discard(struct nvme_ns *ns) 1279static void nvme_config_discard(struct nvme_ctrl *ctrl,
1280 unsigned stream_alignment, struct request_queue *queue)
1153{ 1281{
1154 struct nvme_ctrl *ctrl = ns->ctrl; 1282 u32 size = queue_logical_block_size(queue);
1155 u32 logical_block_size = queue_logical_block_size(ns->queue); 1283
1284 if (stream_alignment)
1285 size *= stream_alignment;
1156 1286
1157 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < 1287 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1158 NVME_DSM_MAX_RANGES); 1288 NVME_DSM_MAX_RANGES);
1159 1289
1160 if (ctrl->nr_streams && ns->sws && ns->sgs) { 1290 queue->limits.discard_alignment = size;
1161 unsigned int sz = logical_block_size * ns->sws * ns->sgs; 1291 queue->limits.discard_granularity = size;
1162 1292
1163 ns->queue->limits.discard_alignment = sz; 1293 blk_queue_max_discard_sectors(queue, UINT_MAX);
1164 ns->queue->limits.discard_granularity = sz; 1294 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1165 } else { 1295 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
1166 ns->queue->limits.discard_alignment = logical_block_size;
1167 ns->queue->limits.discard_granularity = logical_block_size;
1168 }
1169 blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
1170 blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
1171 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
1172 1296
1173 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 1297 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1174 blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); 1298 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1175} 1299}
1176 1300
1177static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, 1301static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1178 struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) 1302 struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1179{ 1303{
1304 memset(ids, 0, sizeof(*ids));
1305
1180 if (ctrl->vs >= NVME_VS(1, 1, 0)) 1306 if (ctrl->vs >= NVME_VS(1, 1, 0))
1181 memcpy(eui64, id->eui64, sizeof(id->eui64)); 1307 memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1182 if (ctrl->vs >= NVME_VS(1, 2, 0)) 1308 if (ctrl->vs >= NVME_VS(1, 2, 0))
1183 memcpy(nguid, id->nguid, sizeof(id->nguid)); 1309 memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1184 if (ctrl->vs >= NVME_VS(1, 3, 0)) { 1310 if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1185 /* Don't treat error as fatal we potentially 1311 /* Don't treat error as fatal we potentially
1186 * already have a NGUID or EUI-64 1312 * already have a NGUID or EUI-64
1187 */ 1313 */
1188 if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) 1314 if (nvme_identify_ns_descs(ctrl, nsid, ids))
1189 dev_warn(ctrl->device, 1315 dev_warn(ctrl->device,
1190 "%s: Identify Descriptors failed\n", __func__); 1316 "%s: Identify Descriptors failed\n", __func__);
1191 } 1317 }
1192} 1318}
1193 1319
1320static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1321{
1322 return !uuid_is_null(&ids->uuid) ||
1323 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1324 memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1325}
1326
1327static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1328{
1329 return uuid_equal(&a->uuid, &b->uuid) &&
1330 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1331 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1332}
1333
1334static void nvme_update_disk_info(struct gendisk *disk,
1335 struct nvme_ns *ns, struct nvme_id_ns *id)
1336{
1337 sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1338 unsigned stream_alignment = 0;
1339
1340 if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
1341 stream_alignment = ns->sws * ns->sgs;
1342
1343 blk_mq_freeze_queue(disk->queue);
1344 blk_integrity_unregister(disk);
1345
1346 blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
1347 if (ns->ms && !ns->ext &&
1348 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1349 nvme_init_integrity(disk, ns->ms, ns->pi_type);
1350 if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
1351 capacity = 0;
1352 set_capacity(disk, capacity);
1353
1354 if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
1355 nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
1356 blk_mq_unfreeze_queue(disk->queue);
1357}
1358
1194static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) 1359static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1195{ 1360{
1196 struct nvme_ns *ns = disk->private_data; 1361 struct nvme_ns *ns = disk->private_data;
1197 struct nvme_ctrl *ctrl = ns->ctrl;
1198 u16 bs;
1199 1362
1200 /* 1363 /*
1201 * If identify namespace failed, use default 512 byte block size so 1364 * If identify namespace failed, use default 512 byte block size so
@@ -1204,26 +1367,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1204 ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; 1367 ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1205 if (ns->lba_shift == 0) 1368 if (ns->lba_shift == 0)
1206 ns->lba_shift = 9; 1369 ns->lba_shift = 9;
1207 bs = 1 << ns->lba_shift;
1208 ns->noiob = le16_to_cpu(id->noiob); 1370 ns->noiob = le16_to_cpu(id->noiob);
1371 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1372 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1373 /* the PI implementation requires metadata equal t10 pi tuple size */
1374 if (ns->ms == sizeof(struct t10_pi_tuple))
1375 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1376 else
1377 ns->pi_type = 0;
1209 1378
1210 blk_mq_freeze_queue(disk->queue);
1211
1212 if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
1213 nvme_prep_integrity(disk, id, bs);
1214 blk_queue_logical_block_size(ns->queue, bs);
1215 if (ns->noiob) 1379 if (ns->noiob)
1216 nvme_set_chunk_size(ns); 1380 nvme_set_chunk_size(ns);
1217 if (ns->ms && !blk_get_integrity(disk) && !ns->ext) 1381 nvme_update_disk_info(disk, ns, id);
1218 nvme_init_integrity(ns); 1382#ifdef CONFIG_NVME_MULTIPATH
1219 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) 1383 if (ns->head->disk)
1220 set_capacity(disk, 0); 1384 nvme_update_disk_info(ns->head->disk, ns, id);
1221 else 1385#endif
1222 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1223
1224 if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1225 nvme_config_discard(ns);
1226 blk_mq_unfreeze_queue(disk->queue);
1227} 1386}
1228 1387
1229static int nvme_revalidate_disk(struct gendisk *disk) 1388static int nvme_revalidate_disk(struct gendisk *disk)
@@ -1231,8 +1390,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
1231 struct nvme_ns *ns = disk->private_data; 1390 struct nvme_ns *ns = disk->private_data;
1232 struct nvme_ctrl *ctrl = ns->ctrl; 1391 struct nvme_ctrl *ctrl = ns->ctrl;
1233 struct nvme_id_ns *id; 1392 struct nvme_id_ns *id;
1234 u8 eui64[8] = { 0 }, nguid[16] = { 0 }; 1393 struct nvme_ns_ids ids;
1235 uuid_t uuid = uuid_null;
1236 int ret = 0; 1394 int ret = 0;
1237 1395
1238 if (test_bit(NVME_NS_DEAD, &ns->flags)) { 1396 if (test_bit(NVME_NS_DEAD, &ns->flags)) {
@@ -1240,7 +1398,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
1240 return -ENODEV; 1398 return -ENODEV;
1241 } 1399 }
1242 1400
1243 id = nvme_identify_ns(ctrl, ns->ns_id); 1401 id = nvme_identify_ns(ctrl, ns->head->ns_id);
1244 if (!id) 1402 if (!id)
1245 return -ENODEV; 1403 return -ENODEV;
1246 1404
@@ -1250,12 +1408,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
1250 } 1408 }
1251 1409
1252 __nvme_revalidate_disk(disk, id); 1410 __nvme_revalidate_disk(disk, id);
1253 nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); 1411 nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1254 if (!uuid_equal(&ns->uuid, &uuid) || 1412 if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1255 memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
1256 memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
1257 dev_err(ctrl->device, 1413 dev_err(ctrl->device,
1258 "identifiers changed for nsid %d\n", ns->ns_id); 1414 "identifiers changed for nsid %d\n", ns->head->ns_id);
1259 ret = -ENODEV; 1415 ret = -ENODEV;
1260 } 1416 }
1261 1417
@@ -1287,8 +1443,10 @@ static char nvme_pr_type(enum pr_type type)
1287static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 1443static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1288 u64 key, u64 sa_key, u8 op) 1444 u64 key, u64 sa_key, u8 op)
1289{ 1445{
1290 struct nvme_ns *ns = bdev->bd_disk->private_data; 1446 struct nvme_ns_head *head = NULL;
1447 struct nvme_ns *ns;
1291 struct nvme_command c; 1448 struct nvme_command c;
1449 int srcu_idx, ret;
1292 u8 data[16] = { 0, }; 1450 u8 data[16] = { 0, };
1293 1451
1294 put_unaligned_le64(key, &data[0]); 1452 put_unaligned_le64(key, &data[0]);
@@ -1296,10 +1454,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1296 1454
1297 memset(&c, 0, sizeof(c)); 1455 memset(&c, 0, sizeof(c));
1298 c.common.opcode = op; 1456 c.common.opcode = op;
1299 c.common.nsid = cpu_to_le32(ns->ns_id); 1457 c.common.nsid = cpu_to_le32(head->ns_id);
1300 c.common.cdw10[0] = cpu_to_le32(cdw10); 1458 c.common.cdw10[0] = cpu_to_le32(cdw10);
1301 1459
1302 return nvme_submit_sync_cmd(ns->queue, &c, data, 16); 1460 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1461 if (unlikely(!ns))
1462 ret = -EWOULDBLOCK;
1463 else
1464 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1465 nvme_put_ns_from_disk(head, srcu_idx);
1466 return ret;
1303} 1467}
1304 1468
1305static int nvme_pr_register(struct block_device *bdev, u64 old, 1469static int nvme_pr_register(struct block_device *bdev, u64 old,
@@ -1381,7 +1545,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);
1381static const struct block_device_operations nvme_fops = { 1545static const struct block_device_operations nvme_fops = {
1382 .owner = THIS_MODULE, 1546 .owner = THIS_MODULE,
1383 .ioctl = nvme_ioctl, 1547 .ioctl = nvme_ioctl,
1384 .compat_ioctl = nvme_compat_ioctl, 1548 .compat_ioctl = nvme_ioctl,
1385 .open = nvme_open, 1549 .open = nvme_open,
1386 .release = nvme_release, 1550 .release = nvme_release,
1387 .getgeo = nvme_getgeo, 1551 .getgeo = nvme_getgeo,
@@ -1389,6 +1553,32 @@ static const struct block_device_operations nvme_fops = {
1389 .pr_ops = &nvme_pr_ops, 1553 .pr_ops = &nvme_pr_ops,
1390}; 1554};
1391 1555
1556#ifdef CONFIG_NVME_MULTIPATH
1557static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
1558{
1559 struct nvme_ns_head *head = bdev->bd_disk->private_data;
1560
1561 if (!kref_get_unless_zero(&head->ref))
1562 return -ENXIO;
1563 return 0;
1564}
1565
1566static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
1567{
1568 nvme_put_ns_head(disk->private_data);
1569}
1570
1571const struct block_device_operations nvme_ns_head_ops = {
1572 .owner = THIS_MODULE,
1573 .open = nvme_ns_head_open,
1574 .release = nvme_ns_head_release,
1575 .ioctl = nvme_ioctl,
1576 .compat_ioctl = nvme_ioctl,
1577 .getgeo = nvme_getgeo,
1578 .pr_ops = &nvme_pr_ops,
1579};
1580#endif /* CONFIG_NVME_MULTIPATH */
1581
1392static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) 1582static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1393{ 1583{
1394 unsigned long timeout = 1584 unsigned long timeout =
@@ -1737,14 +1927,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id,
1737 string_matches(id->fr, q->fr, sizeof(id->fr)); 1927 string_matches(id->fr, q->fr, sizeof(id->fr));
1738} 1928}
1739 1929
1740static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 1930static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
1931 struct nvme_id_ctrl *id)
1741{ 1932{
1742 size_t nqnlen; 1933 size_t nqnlen;
1743 int off; 1934 int off;
1744 1935
1745 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 1936 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
1746 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 1937 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
1747 strcpy(ctrl->subnqn, id->subnqn); 1938 strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
1748 return; 1939 return;
1749 } 1940 }
1750 1941
@@ -1752,14 +1943,222 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1752 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 1943 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
1753 1944
1754 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 1945 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
1755 off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, 1946 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
1756 "nqn.2014.08.org.nvmexpress:%4x%4x", 1947 "nqn.2014.08.org.nvmexpress:%4x%4x",
1757 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); 1948 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
1758 memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); 1949 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
1759 off += sizeof(id->sn); 1950 off += sizeof(id->sn);
1760 memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); 1951 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
1761 off += sizeof(id->mn); 1952 off += sizeof(id->mn);
1762 memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); 1953 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
1954}
1955
1956static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
1957{
1958 ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
1959 kfree(subsys);
1960}
1961
1962static void nvme_release_subsystem(struct device *dev)
1963{
1964 __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
1965}
1966
1967static void nvme_destroy_subsystem(struct kref *ref)
1968{
1969 struct nvme_subsystem *subsys =
1970 container_of(ref, struct nvme_subsystem, ref);
1971
1972 mutex_lock(&nvme_subsystems_lock);
1973 list_del(&subsys->entry);
1974 mutex_unlock(&nvme_subsystems_lock);
1975
1976 ida_destroy(&subsys->ns_ida);
1977 device_del(&subsys->dev);
1978 put_device(&subsys->dev);
1979}
1980
1981static void nvme_put_subsystem(struct nvme_subsystem *subsys)
1982{
1983 kref_put(&subsys->ref, nvme_destroy_subsystem);
1984}
1985
1986static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
1987{
1988 struct nvme_subsystem *subsys;
1989
1990 lockdep_assert_held(&nvme_subsystems_lock);
1991
1992 list_for_each_entry(subsys, &nvme_subsystems, entry) {
1993 if (strcmp(subsys->subnqn, subsysnqn))
1994 continue;
1995 if (!kref_get_unless_zero(&subsys->ref))
1996 continue;
1997 return subsys;
1998 }
1999
2000 return NULL;
2001}
2002
2003#define SUBSYS_ATTR_RO(_name, _mode, _show) \
2004 struct device_attribute subsys_attr_##_name = \
2005 __ATTR(_name, _mode, _show, NULL)
2006
2007static ssize_t nvme_subsys_show_nqn(struct device *dev,
2008 struct device_attribute *attr,
2009 char *buf)
2010{
2011 struct nvme_subsystem *subsys =
2012 container_of(dev, struct nvme_subsystem, dev);
2013
2014 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2015}
2016static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2017
2018#define nvme_subsys_show_str_function(field) \
2019static ssize_t subsys_##field##_show(struct device *dev, \
2020 struct device_attribute *attr, char *buf) \
2021{ \
2022 struct nvme_subsystem *subsys = \
2023 container_of(dev, struct nvme_subsystem, dev); \
2024 return sprintf(buf, "%.*s\n", \
2025 (int)sizeof(subsys->field), subsys->field); \
2026} \
2027static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2028
2029nvme_subsys_show_str_function(model);
2030nvme_subsys_show_str_function(serial);
2031nvme_subsys_show_str_function(firmware_rev);
2032
2033static struct attribute *nvme_subsys_attrs[] = {
2034 &subsys_attr_model.attr,
2035 &subsys_attr_serial.attr,
2036 &subsys_attr_firmware_rev.attr,
2037 &subsys_attr_subsysnqn.attr,
2038 NULL,
2039};
2040
2041static struct attribute_group nvme_subsys_attrs_group = {
2042 .attrs = nvme_subsys_attrs,
2043};
2044
2045static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2046 &nvme_subsys_attrs_group,
2047 NULL,
2048};
2049
2050static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2051{
2052 struct nvme_subsystem *subsys, *found;
2053 int ret;
2054
2055 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2056 if (!subsys)
2057 return -ENOMEM;
2058 ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2059 if (ret < 0) {
2060 kfree(subsys);
2061 return ret;
2062 }
2063 subsys->instance = ret;
2064 mutex_init(&subsys->lock);
2065 kref_init(&subsys->ref);
2066 INIT_LIST_HEAD(&subsys->ctrls);
2067 INIT_LIST_HEAD(&subsys->nsheads);
2068 nvme_init_subnqn(subsys, ctrl, id);
2069 memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2070 memcpy(subsys->model, id->mn, sizeof(subsys->model));
2071 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2072 subsys->vendor_id = le16_to_cpu(id->vid);
2073 subsys->cmic = id->cmic;
2074
2075 subsys->dev.class = nvme_subsys_class;
2076 subsys->dev.release = nvme_release_subsystem;
2077 subsys->dev.groups = nvme_subsys_attrs_groups;
2078 dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2079 device_initialize(&subsys->dev);
2080
2081 mutex_lock(&nvme_subsystems_lock);
2082 found = __nvme_find_get_subsystem(subsys->subnqn);
2083 if (found) {
2084 /*
2085 * Verify that the subsystem actually supports multiple
2086 * controllers, else bail out.
2087 */
2088 if (!(id->cmic & (1 << 1))) {
2089 dev_err(ctrl->device,
2090 "ignoring ctrl due to duplicate subnqn (%s).\n",
2091 found->subnqn);
2092 nvme_put_subsystem(found);
2093 ret = -EINVAL;
2094 goto out_unlock;
2095 }
2096
2097 __nvme_release_subsystem(subsys);
2098 subsys = found;
2099 } else {
2100 ret = device_add(&subsys->dev);
2101 if (ret) {
2102 dev_err(ctrl->device,
2103 "failed to register subsystem device.\n");
2104 goto out_unlock;
2105 }
2106 ida_init(&subsys->ns_ida);
2107 list_add_tail(&subsys->entry, &nvme_subsystems);
2108 }
2109
2110 ctrl->subsys = subsys;
2111 mutex_unlock(&nvme_subsystems_lock);
2112
2113 if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2114 dev_name(ctrl->device))) {
2115 dev_err(ctrl->device,
2116 "failed to create sysfs link from subsystem.\n");
2117 /* the transport driver will eventually put the subsystem */
2118 return -EINVAL;
2119 }
2120
2121 mutex_lock(&subsys->lock);
2122 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2123 mutex_unlock(&subsys->lock);
2124
2125 return 0;
2126
2127out_unlock:
2128 mutex_unlock(&nvme_subsystems_lock);
2129 put_device(&subsys->dev);
2130 return ret;
2131}
2132
2133static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
2134 size_t size)
2135{
2136 struct nvme_command c = { };
2137
2138 c.common.opcode = nvme_admin_get_log_page;
2139 c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2140 c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
2141
2142 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2143}
2144
2145static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2146{
2147 int ret;
2148
2149 if (!ctrl->effects)
2150 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
2151
2152 if (!ctrl->effects)
2153 return 0;
2154
2155 ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
2156 sizeof(*ctrl->effects));
2157 if (ret) {
2158 kfree(ctrl->effects);
2159 ctrl->effects = NULL;
2160 }
2161 return ret;
1763} 2162}
1764 2163
1765/* 2164/*
@@ -1797,9 +2196,19 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1797 return -EIO; 2196 return -EIO;
1798 } 2197 }
1799 2198
1800 nvme_init_subnqn(ctrl, id); 2199 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2200 ret = nvme_get_effects_log(ctrl);
2201 if (ret < 0)
2202 return ret;
2203 }
1801 2204
1802 if (!ctrl->identified) { 2205 if (!ctrl->identified) {
2206 int i;
2207
2208 ret = nvme_init_subsystem(ctrl, id);
2209 if (ret)
2210 goto out_free;
2211
1803 /* 2212 /*
1804 * Check for quirks. Quirk can depend on firmware version, 2213 * Check for quirks. Quirk can depend on firmware version,
1805 * so, in principle, the set of quirks present can change 2214 * so, in principle, the set of quirks present can change
@@ -1808,9 +2217,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1808 * the device, but we'd have to make sure that the driver 2217 * the device, but we'd have to make sure that the driver
1809 * behaves intelligently if the quirks change. 2218 * behaves intelligently if the quirks change.
1810 */ 2219 */
1811
1812 int i;
1813
1814 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { 2220 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
1815 if (quirk_matches(id, &core_quirks[i])) 2221 if (quirk_matches(id, &core_quirks[i]))
1816 ctrl->quirks |= core_quirks[i].quirks; 2222 ctrl->quirks |= core_quirks[i].quirks;
@@ -1823,14 +2229,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
1823 } 2229 }
1824 2230
1825 ctrl->oacs = le16_to_cpu(id->oacs); 2231 ctrl->oacs = le16_to_cpu(id->oacs);
1826 ctrl->vid = le16_to_cpu(id->vid);
1827 ctrl->oncs = le16_to_cpup(&id->oncs); 2232 ctrl->oncs = le16_to_cpup(&id->oncs);
1828 atomic_set(&ctrl->abort_limit, id->acl + 1); 2233 atomic_set(&ctrl->abort_limit, id->acl + 1);
1829 ctrl->vwc = id->vwc; 2234 ctrl->vwc = id->vwc;
1830 ctrl->cntlid = le16_to_cpup(&id->cntlid); 2235 ctrl->cntlid = le16_to_cpup(&id->cntlid);
1831 memcpy(ctrl->serial, id->sn, sizeof(id->sn));
1832 memcpy(ctrl->model, id->mn, sizeof(id->mn));
1833 memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
1834 if (id->mdts) 2236 if (id->mdts)
1835 max_hw_sectors = 1 << (id->mdts + page_shift - 9); 2237 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
1836 else 2238 else
@@ -1931,33 +2333,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify);
1931 2333
1932static int nvme_dev_open(struct inode *inode, struct file *file) 2334static int nvme_dev_open(struct inode *inode, struct file *file)
1933{ 2335{
1934 struct nvme_ctrl *ctrl; 2336 struct nvme_ctrl *ctrl =
1935 int instance = iminor(inode); 2337 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
1936 int ret = -ENODEV;
1937
1938 spin_lock(&dev_list_lock);
1939 list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
1940 if (ctrl->instance != instance)
1941 continue;
1942
1943 if (!ctrl->admin_q) {
1944 ret = -EWOULDBLOCK;
1945 break;
1946 }
1947 if (!kref_get_unless_zero(&ctrl->kref))
1948 break;
1949 file->private_data = ctrl;
1950 ret = 0;
1951 break;
1952 }
1953 spin_unlock(&dev_list_lock);
1954
1955 return ret;
1956}
1957 2338
1958static int nvme_dev_release(struct inode *inode, struct file *file) 2339 if (ctrl->state != NVME_CTRL_LIVE)
1959{ 2340 return -EWOULDBLOCK;
1960 nvme_put_ctrl(file->private_data); 2341 file->private_data = ctrl;
1961 return 0; 2342 return 0;
1962} 2343}
1963 2344
@@ -2021,7 +2402,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
2021static const struct file_operations nvme_dev_fops = { 2402static const struct file_operations nvme_dev_fops = {
2022 .owner = THIS_MODULE, 2403 .owner = THIS_MODULE,
2023 .open = nvme_dev_open, 2404 .open = nvme_dev_open,
2024 .release = nvme_dev_release,
2025 .unlocked_ioctl = nvme_dev_ioctl, 2405 .unlocked_ioctl = nvme_dev_ioctl,
2026 .compat_ioctl = nvme_dev_ioctl, 2406 .compat_ioctl = nvme_dev_ioctl,
2027}; 2407};
@@ -2051,77 +2431,86 @@ static ssize_t nvme_sysfs_rescan(struct device *dev,
2051} 2431}
2052static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); 2432static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
2053 2433
2434static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
2435{
2436 struct gendisk *disk = dev_to_disk(dev);
2437
2438 if (disk->fops == &nvme_fops)
2439 return nvme_get_ns_from_dev(dev)->head;
2440 else
2441 return disk->private_data;
2442}
2443
2054static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 2444static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2055 char *buf) 2445 char *buf)
2056{ 2446{
2057 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2447 struct nvme_ns_head *head = dev_to_ns_head(dev);
2058 struct nvme_ctrl *ctrl = ns->ctrl; 2448 struct nvme_ns_ids *ids = &head->ids;
2059 int serial_len = sizeof(ctrl->serial); 2449 struct nvme_subsystem *subsys = head->subsys;
2060 int model_len = sizeof(ctrl->model); 2450 int serial_len = sizeof(subsys->serial);
2451 int model_len = sizeof(subsys->model);
2061 2452
2062 if (!uuid_is_null(&ns->uuid)) 2453 if (!uuid_is_null(&ids->uuid))
2063 return sprintf(buf, "uuid.%pU\n", &ns->uuid); 2454 return sprintf(buf, "uuid.%pU\n", &ids->uuid);
2064 2455
2065 if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) 2456 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2066 return sprintf(buf, "eui.%16phN\n", ns->nguid); 2457 return sprintf(buf, "eui.%16phN\n", ids->nguid);
2067 2458
2068 if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) 2459 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2069 return sprintf(buf, "eui.%8phN\n", ns->eui); 2460 return sprintf(buf, "eui.%8phN\n", ids->eui64);
2070 2461
2071 while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' || 2462 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
2072 ctrl->serial[serial_len - 1] == '\0')) 2463 subsys->serial[serial_len - 1] == '\0'))
2073 serial_len--; 2464 serial_len--;
2074 while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' || 2465 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
2075 ctrl->model[model_len - 1] == '\0')) 2466 subsys->model[model_len - 1] == '\0'))
2076 model_len--; 2467 model_len--;
2077 2468
2078 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, 2469 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
2079 serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); 2470 serial_len, subsys->serial, model_len, subsys->model,
2471 head->ns_id);
2080} 2472}
2081static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); 2473static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
2082 2474
2083static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, 2475static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2084 char *buf) 2476 char *buf)
2085{ 2477{
2086 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2478 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
2087 return sprintf(buf, "%pU\n", ns->nguid);
2088} 2479}
2089static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); 2480static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
2090 2481
2091static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 2482static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2092 char *buf) 2483 char *buf)
2093{ 2484{
2094 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2485 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2095 2486
2096 /* For backward compatibility expose the NGUID to userspace if 2487 /* For backward compatibility expose the NGUID to userspace if
2097 * we have no UUID set 2488 * we have no UUID set
2098 */ 2489 */
2099 if (uuid_is_null(&ns->uuid)) { 2490 if (uuid_is_null(&ids->uuid)) {
2100 printk_ratelimited(KERN_WARNING 2491 printk_ratelimited(KERN_WARNING
2101 "No UUID available providing old NGUID\n"); 2492 "No UUID available providing old NGUID\n");
2102 return sprintf(buf, "%pU\n", ns->nguid); 2493 return sprintf(buf, "%pU\n", ids->nguid);
2103 } 2494 }
2104 return sprintf(buf, "%pU\n", &ns->uuid); 2495 return sprintf(buf, "%pU\n", &ids->uuid);
2105} 2496}
2106static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); 2497static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
2107 2498
2108static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 2499static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2109 char *buf) 2500 char *buf)
2110{ 2501{
2111 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2502 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
2112 return sprintf(buf, "%8phd\n", ns->eui);
2113} 2503}
2114static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); 2504static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
2115 2505
2116static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 2506static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2117 char *buf) 2507 char *buf)
2118{ 2508{
2119 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2509 return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
2120 return sprintf(buf, "%d\n", ns->ns_id);
2121} 2510}
2122static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); 2511static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
2123 2512
2124static struct attribute *nvme_ns_attrs[] = { 2513static struct attribute *nvme_ns_id_attrs[] = {
2125 &dev_attr_wwid.attr, 2514 &dev_attr_wwid.attr,
2126 &dev_attr_uuid.attr, 2515 &dev_attr_uuid.attr,
2127 &dev_attr_nguid.attr, 2516 &dev_attr_nguid.attr,
@@ -2130,31 +2519,31 @@ static struct attribute *nvme_ns_attrs[] = {
2130 NULL, 2519 NULL,
2131}; 2520};
2132 2521
2133static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, 2522static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
2134 struct attribute *a, int n) 2523 struct attribute *a, int n)
2135{ 2524{
2136 struct device *dev = container_of(kobj, struct device, kobj); 2525 struct device *dev = container_of(kobj, struct device, kobj);
2137 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 2526 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2138 2527
2139 if (a == &dev_attr_uuid.attr) { 2528 if (a == &dev_attr_uuid.attr) {
2140 if (uuid_is_null(&ns->uuid) && 2529 if (uuid_is_null(&ids->uuid) &&
2141 !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) 2530 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2142 return 0; 2531 return 0;
2143 } 2532 }
2144 if (a == &dev_attr_nguid.attr) { 2533 if (a == &dev_attr_nguid.attr) {
2145 if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) 2534 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2146 return 0; 2535 return 0;
2147 } 2536 }
2148 if (a == &dev_attr_eui.attr) { 2537 if (a == &dev_attr_eui.attr) {
2149 if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) 2538 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2150 return 0; 2539 return 0;
2151 } 2540 }
2152 return a->mode; 2541 return a->mode;
2153} 2542}
2154 2543
2155static const struct attribute_group nvme_ns_attr_group = { 2544const struct attribute_group nvme_ns_id_attr_group = {
2156 .attrs = nvme_ns_attrs, 2545 .attrs = nvme_ns_id_attrs,
2157 .is_visible = nvme_ns_attrs_are_visible, 2546 .is_visible = nvme_ns_id_attrs_are_visible,
2158}; 2547};
2159 2548
2160#define nvme_show_str_function(field) \ 2549#define nvme_show_str_function(field) \
@@ -2162,10 +2551,15 @@ static ssize_t field##_show(struct device *dev, \
2162 struct device_attribute *attr, char *buf) \ 2551 struct device_attribute *attr, char *buf) \
2163{ \ 2552{ \
2164 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 2553 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2165 return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ 2554 return sprintf(buf, "%.*s\n", \
2555 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
2166} \ 2556} \
2167static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 2557static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2168 2558
2559nvme_show_str_function(model);
2560nvme_show_str_function(serial);
2561nvme_show_str_function(firmware_rev);
2562
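For reference, nvme_show_str_function(model) now expands to roughly the following, reading the string through ctrl->subsys instead of the controller itself (expansion shown for clarity only, reformatted onto separate lines):

static ssize_t model_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	return sprintf(buf, "%.*s\n",
		(int)sizeof(ctrl->subsys->model), ctrl->subsys->model);
}
static DEVICE_ATTR(model, S_IRUGO, model_show, NULL);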
2169#define nvme_show_int_function(field) \ 2563#define nvme_show_int_function(field) \
2170static ssize_t field##_show(struct device *dev, \ 2564static ssize_t field##_show(struct device *dev, \
2171 struct device_attribute *attr, char *buf) \ 2565 struct device_attribute *attr, char *buf) \
@@ -2175,9 +2569,6 @@ static ssize_t field##_show(struct device *dev, \
2175} \ 2569} \
2176static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 2570static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2177 2571
2178nvme_show_str_function(model);
2179nvme_show_str_function(serial);
2180nvme_show_str_function(firmware_rev);
2181nvme_show_int_function(cntlid); 2572nvme_show_int_function(cntlid);
2182 2573
2183static ssize_t nvme_sysfs_delete(struct device *dev, 2574static ssize_t nvme_sysfs_delete(struct device *dev,
@@ -2187,7 +2578,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
2187 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2578 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2188 2579
2189 if (device_remove_file_self(dev, attr)) 2580 if (device_remove_file_self(dev, attr))
2190 ctrl->ops->delete_ctrl(ctrl); 2581 nvme_delete_ctrl_sync(ctrl);
2191 return count; 2582 return count;
2192} 2583}
2193static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); 2584static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
@@ -2231,7 +2622,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2231{ 2622{
2232 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2623 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2233 2624
2234 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); 2625 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
2235} 2626}
2236static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); 2627static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2237 2628
@@ -2284,12 +2675,128 @@ static const struct attribute_group *nvme_dev_attr_groups[] = {
2284 NULL, 2675 NULL,
2285}; 2676};
2286 2677
2678static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
2679 unsigned nsid)
2680{
2681 struct nvme_ns_head *h;
2682
2683 lockdep_assert_held(&subsys->lock);
2684
2685 list_for_each_entry(h, &subsys->nsheads, entry) {
2686 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
2687 return h;
2688 }
2689
2690 return NULL;
2691}
2692
2693static int __nvme_check_ids(struct nvme_subsystem *subsys,
2694 struct nvme_ns_head *new)
2695{
2696 struct nvme_ns_head *h;
2697
2698 lockdep_assert_held(&subsys->lock);
2699
2700 list_for_each_entry(h, &subsys->nsheads, entry) {
2701 if (nvme_ns_ids_valid(&new->ids) &&
2702 nvme_ns_ids_equal(&new->ids, &h->ids))
2703 return -EINVAL;
2704 }
2705
2706 return 0;
2707}
2708
2709static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
2710 unsigned nsid, struct nvme_id_ns *id)
2711{
2712 struct nvme_ns_head *head;
2713 int ret = -ENOMEM;
2714
2715 head = kzalloc(sizeof(*head), GFP_KERNEL);
2716 if (!head)
2717 goto out;
2718 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
2719 if (ret < 0)
2720 goto out_free_head;
2721 head->instance = ret;
2722 INIT_LIST_HEAD(&head->list);
2723 init_srcu_struct(&head->srcu);
2724 head->subsys = ctrl->subsys;
2725 head->ns_id = nsid;
2726 kref_init(&head->ref);
2727
2728 nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
2729
2730 ret = __nvme_check_ids(ctrl->subsys, head);
2731 if (ret) {
2732 dev_err(ctrl->device,
2733 "duplicate IDs for nsid %d\n", nsid);
2734 goto out_cleanup_srcu;
2735 }
2736
2737 ret = nvme_mpath_alloc_disk(ctrl, head);
2738 if (ret)
2739 goto out_cleanup_srcu;
2740
2741 list_add_tail(&head->entry, &ctrl->subsys->nsheads);
2742 return head;
2743out_cleanup_srcu:
2744 cleanup_srcu_struct(&head->srcu);
2745 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
2746out_free_head:
2747 kfree(head);
2748out:
2749 return ERR_PTR(ret);
2750}
2751
2752static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
2753 struct nvme_id_ns *id, bool *new)
2754{
2755 struct nvme_ctrl *ctrl = ns->ctrl;
2756 bool is_shared = id->nmic & (1 << 0);
2757 struct nvme_ns_head *head = NULL;
2758 int ret = 0;
2759
2760 mutex_lock(&ctrl->subsys->lock);
2761 if (is_shared)
2762 head = __nvme_find_ns_head(ctrl->subsys, nsid);
2763 if (!head) {
2764 head = nvme_alloc_ns_head(ctrl, nsid, id);
2765 if (IS_ERR(head)) {
2766 ret = PTR_ERR(head);
2767 goto out_unlock;
2768 }
2769
2770 *new = true;
2771 } else {
2772 struct nvme_ns_ids ids;
2773
2774 nvme_report_ns_ids(ctrl, nsid, id, &ids);
2775 if (!nvme_ns_ids_equal(&head->ids, &ids)) {
2776 dev_err(ctrl->device,
2777 "IDs don't match for shared namespace %d\n",
2778 nsid);
2779 ret = -EINVAL;
2780 goto out_unlock;
2781 }
2782
2783 *new = false;
2784 }
2785
2786 list_add_tail(&ns->siblings, &head->list);
2787 ns->head = head;
2788
2789out_unlock:
2790 mutex_unlock(&ctrl->subsys->lock);
2791 return ret;
2792}
2793
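nvme_init_ns_head() above is a find-or-allocate keyed by NSID: under the subsystem lock an existing head is reused only if a reference can still be taken, otherwise a new head is allocated and its identifiers are checked against the rest of the subsystem. A simplified, standalone sketch of that pattern (illustrative types, single-threaded, no locking):

#include <stdio.h>
#include <stdlib.h>

struct ns_head {
	unsigned nsid;
	int ref;			/* stands in for the kref */
	struct ns_head *next;
};

static struct ns_head *heads;

static struct ns_head *find_or_alloc_head(unsigned nsid)
{
	struct ns_head *h;

	for (h = heads; h; h = h->next) {
		/* mirrors kref_get_unless_zero(): reuse only live heads */
		if (h->nsid == nsid && h->ref > 0) {
			h->ref++;
			return h;
		}
	}

	h = calloc(1, sizeof(*h));
	if (!h)
		return NULL;
	h->nsid = nsid;
	h->ref = 1;
	h->next = heads;
	heads = h;
	return h;
}

int main(void)
{
	struct ns_head *a = find_or_alloc_head(1);
	struct ns_head *b = find_or_alloc_head(1);	/* second path to same namespace */

	printf("same head: %s, refs: %d\n", a == b ? "yes" : "no", a->ref);
	return 0;
}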
2287static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 2794static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2288{ 2795{
2289 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 2796 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2290 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 2797 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2291 2798
2292 return nsa->ns_id - nsb->ns_id; 2799 return nsa->head->ns_id - nsb->head->ns_id;
2293} 2800}
2294 2801
2295static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) 2802static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
@@ -2298,12 +2805,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2298 2805
2299 mutex_lock(&ctrl->namespaces_mutex); 2806 mutex_lock(&ctrl->namespaces_mutex);
2300 list_for_each_entry(ns, &ctrl->namespaces, list) { 2807 list_for_each_entry(ns, &ctrl->namespaces, list) {
2301 if (ns->ns_id == nsid) { 2808 if (ns->head->ns_id == nsid) {
2302 kref_get(&ns->kref); 2809 if (!kref_get_unless_zero(&ns->kref))
2810 continue;
2303 ret = ns; 2811 ret = ns;
2304 break; 2812 break;
2305 } 2813 }
2306 if (ns->ns_id > nsid) 2814 if (ns->head->ns_id > nsid)
2307 break; 2815 break;
2308 } 2816 }
2309 mutex_unlock(&ctrl->namespaces_mutex); 2817 mutex_unlock(&ctrl->namespaces_mutex);
@@ -2318,7 +2826,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2318 if (!ctrl->nr_streams) 2826 if (!ctrl->nr_streams)
2319 return 0; 2827 return 0;
2320 2828
2321 ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); 2829 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
2322 if (ret) 2830 if (ret)
2323 return ret; 2831 return ret;
2324 2832
@@ -2342,33 +2850,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2342 struct gendisk *disk; 2850 struct gendisk *disk;
2343 struct nvme_id_ns *id; 2851 struct nvme_id_ns *id;
2344 char disk_name[DISK_NAME_LEN]; 2852 char disk_name[DISK_NAME_LEN];
2345 int node = dev_to_node(ctrl->dev); 2853 int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
2854 bool new = true;
2346 2855
2347 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 2856 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2348 if (!ns) 2857 if (!ns)
2349 return; 2858 return;
2350 2859
2351 ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
2352 if (ns->instance < 0)
2353 goto out_free_ns;
2354
2355 ns->queue = blk_mq_init_queue(ctrl->tagset); 2860 ns->queue = blk_mq_init_queue(ctrl->tagset);
2356 if (IS_ERR(ns->queue)) 2861 if (IS_ERR(ns->queue))
2357 goto out_release_instance; 2862 goto out_free_ns;
2358 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 2863 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2359 ns->queue->queuedata = ns; 2864 ns->queue->queuedata = ns;
2360 ns->ctrl = ctrl; 2865 ns->ctrl = ctrl;
2361 2866
2362 kref_init(&ns->kref); 2867 kref_init(&ns->kref);
2363 ns->ns_id = nsid;
2364 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 2868 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2365 2869
2366 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 2870 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2367 nvme_set_queue_limits(ctrl, ns->queue); 2871 nvme_set_queue_limits(ctrl, ns->queue);
2368 nvme_setup_streams_ns(ctrl, ns); 2872 nvme_setup_streams_ns(ctrl, ns);
2369 2873
2370 sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
2371
2372 id = nvme_identify_ns(ctrl, nsid); 2874 id = nvme_identify_ns(ctrl, nsid);
2373 if (!id) 2875 if (!id)
2374 goto out_free_queue; 2876 goto out_free_queue;
@@ -2376,23 +2878,49 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2376 if (id->ncap == 0) 2878 if (id->ncap == 0)
2377 goto out_free_id; 2879 goto out_free_id;
2378 2880
2379 nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); 2881 if (nvme_init_ns_head(ns, nsid, id, &new))
2882 goto out_free_id;
2883
2884#ifdef CONFIG_NVME_MULTIPATH
2885 /*
2886 * If multipathing is enabled we need to always use the subsystem
2887 * instance number for numbering our devices to avoid conflicts
2888 * between subsystems that have multiple controllers and thus use
2889 * the multipath-aware subsystem node and those that have a single
2890 * controller and use the controller node directly.
2891 */
2892 if (ns->head->disk) {
2893 sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
2894 ctrl->cntlid, ns->head->instance);
2895 flags = GENHD_FL_HIDDEN;
2896 } else {
2897 sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
2898 ns->head->instance);
2899 }
2900#else
2901 /*
 2902 * But without the multipath code enabled, multiple controllers per
 2903 * subsystem are visible as devices and thus we cannot use the
2904 * subsystem instance.
2905 */
2906 sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
2907#endif
2380 2908
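With multipathing enabled, each per-controller block device becomes a hidden nvme<subsys>c<cntlid>n<head> node while the multipath node keeps the familiar nvme<subsys>n<head> name. A trivial sketch of the two formats with made-up instance numbers:

#include <stdio.h>

int main(void)
{
	char name[32];
	int subsys_instance = 0, cntlid = 1, head_instance = 2;

	/* per-controller (hidden) node when a multipath disk exists */
	snprintf(name, sizeof(name), "nvme%dc%dn%d",
		 subsys_instance, cntlid, head_instance);
	printf("%s\n", name);		/* nvme0c1n2 */

	/* namespace node (multipath disk, or single-controller case) */
	snprintf(name, sizeof(name), "nvme%dn%d",
		 subsys_instance, head_instance);
	printf("%s\n", name);		/* nvme0n2 */
	return 0;
}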
2381 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { 2909 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
2382 if (nvme_nvm_register(ns, disk_name, node)) { 2910 if (nvme_nvm_register(ns, disk_name, node)) {
2383 dev_warn(ctrl->device, "LightNVM init failure\n"); 2911 dev_warn(ctrl->device, "LightNVM init failure\n");
2384 goto out_free_id; 2912 goto out_unlink_ns;
2385 } 2913 }
2386 } 2914 }
2387 2915
2388 disk = alloc_disk_node(0, node); 2916 disk = alloc_disk_node(0, node);
2389 if (!disk) 2917 if (!disk)
2390 goto out_free_id; 2918 goto out_unlink_ns;
2391 2919
2392 disk->fops = &nvme_fops; 2920 disk->fops = &nvme_fops;
2393 disk->private_data = ns; 2921 disk->private_data = ns;
2394 disk->queue = ns->queue; 2922 disk->queue = ns->queue;
2395 disk->flags = GENHD_FL_EXT_DEVT; 2923 disk->flags = flags;
2396 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 2924 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
2397 ns->disk = disk; 2925 ns->disk = disk;
2398 2926
@@ -2402,49 +2930,65 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2402 list_add_tail(&ns->list, &ctrl->namespaces); 2930 list_add_tail(&ns->list, &ctrl->namespaces);
2403 mutex_unlock(&ctrl->namespaces_mutex); 2931 mutex_unlock(&ctrl->namespaces_mutex);
2404 2932
2405 kref_get(&ctrl->kref); 2933 nvme_get_ctrl(ctrl);
2406 2934
2407 kfree(id); 2935 kfree(id);
2408 2936
2409 device_add_disk(ctrl->device, ns->disk); 2937 device_add_disk(ctrl->device, ns->disk);
2410 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, 2938 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
2411 &nvme_ns_attr_group)) 2939 &nvme_ns_id_attr_group))
2412 pr_warn("%s: failed to create sysfs group for identification\n", 2940 pr_warn("%s: failed to create sysfs group for identification\n",
2413 ns->disk->disk_name); 2941 ns->disk->disk_name);
2414 if (ns->ndev && nvme_nvm_register_sysfs(ns)) 2942 if (ns->ndev && nvme_nvm_register_sysfs(ns))
2415 pr_warn("%s: failed to register lightnvm sysfs group for identification\n", 2943 pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
2416 ns->disk->disk_name); 2944 ns->disk->disk_name);
2945
2946 if (new)
2947 nvme_mpath_add_disk(ns->head);
2948 nvme_mpath_add_disk_links(ns);
2417 return; 2949 return;
2950 out_unlink_ns:
2951 mutex_lock(&ctrl->subsys->lock);
2952 list_del_rcu(&ns->siblings);
2953 mutex_unlock(&ctrl->subsys->lock);
2418 out_free_id: 2954 out_free_id:
2419 kfree(id); 2955 kfree(id);
2420 out_free_queue: 2956 out_free_queue:
2421 blk_cleanup_queue(ns->queue); 2957 blk_cleanup_queue(ns->queue);
2422 out_release_instance:
2423 ida_simple_remove(&ctrl->ns_ida, ns->instance);
2424 out_free_ns: 2958 out_free_ns:
2425 kfree(ns); 2959 kfree(ns);
2426} 2960}
2427 2961
2428static void nvme_ns_remove(struct nvme_ns *ns) 2962static void nvme_ns_remove(struct nvme_ns *ns)
2429{ 2963{
2964 struct nvme_ns_head *head = ns->head;
2965
2430 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 2966 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
2431 return; 2967 return;
2432 2968
2433 if (ns->disk && ns->disk->flags & GENHD_FL_UP) { 2969 if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
2434 if (blk_get_integrity(ns->disk)) 2970 if (blk_get_integrity(ns->disk))
2435 blk_integrity_unregister(ns->disk); 2971 blk_integrity_unregister(ns->disk);
2972 nvme_mpath_remove_disk_links(ns);
2436 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, 2973 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
2437 &nvme_ns_attr_group); 2974 &nvme_ns_id_attr_group);
2438 if (ns->ndev) 2975 if (ns->ndev)
2439 nvme_nvm_unregister_sysfs(ns); 2976 nvme_nvm_unregister_sysfs(ns);
2440 del_gendisk(ns->disk); 2977 del_gendisk(ns->disk);
2441 blk_cleanup_queue(ns->queue); 2978 blk_cleanup_queue(ns->queue);
2442 } 2979 }
2443 2980
2981 mutex_lock(&ns->ctrl->subsys->lock);
2982 nvme_mpath_clear_current_path(ns);
2983 if (head)
2984 list_del_rcu(&ns->siblings);
2985 mutex_unlock(&ns->ctrl->subsys->lock);
2986
2444 mutex_lock(&ns->ctrl->namespaces_mutex); 2987 mutex_lock(&ns->ctrl->namespaces_mutex);
2445 list_del_init(&ns->list); 2988 list_del_init(&ns->list);
2446 mutex_unlock(&ns->ctrl->namespaces_mutex); 2989 mutex_unlock(&ns->ctrl->namespaces_mutex);
2447 2990
2991 synchronize_srcu(&head->srcu);
2448 nvme_put_ns(ns); 2992 nvme_put_ns(ns);
2449} 2993}
2450 2994
@@ -2467,7 +3011,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
2467 struct nvme_ns *ns, *next; 3011 struct nvme_ns *ns, *next;
2468 3012
2469 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 3013 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
2470 if (ns->ns_id > nsid) 3014 if (ns->head->ns_id > nsid)
2471 nvme_ns_remove(ns); 3015 nvme_ns_remove(ns);
2472 } 3016 }
2473} 3017}
@@ -2583,20 +3127,29 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
2583} 3127}
2584EXPORT_SYMBOL_GPL(nvme_remove_namespaces); 3128EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
2585 3129
3130static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
3131{
3132 char *envp[2] = { NULL, NULL };
3133 u32 aen_result = ctrl->aen_result;
3134
3135 ctrl->aen_result = 0;
3136 if (!aen_result)
3137 return;
3138
3139 envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
3140 if (!envp[0])
3141 return;
3142 kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
3143 kfree(envp[0]);
3144}
3145
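Userspace sees the raw completion dword in the NVME_AEN variable of the resulting change uevent. A hypothetical udev-run helper (not part of the patch) could read it like this:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *aen = getenv("NVME_AEN");	/* set by the uevent above */

	if (!aen)
		return 0;
	/* base 0 handles the "0x..." form produced by "%#08x" */
	printf("AEN completion dword 0: %#lx\n", strtoul(aen, NULL, 0));
	return 0;
}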
2586static void nvme_async_event_work(struct work_struct *work) 3146static void nvme_async_event_work(struct work_struct *work)
2587{ 3147{
2588 struct nvme_ctrl *ctrl = 3148 struct nvme_ctrl *ctrl =
2589 container_of(work, struct nvme_ctrl, async_event_work); 3149 container_of(work, struct nvme_ctrl, async_event_work);
2590 3150
2591 spin_lock_irq(&ctrl->lock); 3151 nvme_aen_uevent(ctrl);
2592 while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { 3152 ctrl->ops->submit_async_event(ctrl);
2593 int aer_idx = --ctrl->event_limit;
2594
2595 spin_unlock_irq(&ctrl->lock);
2596 ctrl->ops->submit_async_event(ctrl, aer_idx);
2597 spin_lock_irq(&ctrl->lock);
2598 }
2599 spin_unlock_irq(&ctrl->lock);
2600} 3153}
2601 3154
2602static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) 3155static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
@@ -2615,18 +3168,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
2615 3168
2616static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) 3169static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
2617{ 3170{
2618 struct nvme_command c = { };
2619 struct nvme_fw_slot_info_log *log; 3171 struct nvme_fw_slot_info_log *log;
2620 3172
2621 log = kmalloc(sizeof(*log), GFP_KERNEL); 3173 log = kmalloc(sizeof(*log), GFP_KERNEL);
2622 if (!log) 3174 if (!log)
2623 return; 3175 return;
2624 3176
2625 c.common.opcode = nvme_admin_get_log_page; 3177 if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
2626 c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2627 c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
2628
2629 if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
2630 dev_warn(ctrl->device, 3178 dev_warn(ctrl->device,
2631 "Get FW SLOT INFO log error\n"); 3179 "Get FW SLOT INFO log error\n");
2632 kfree(log); 3180 kfree(log);
@@ -2660,7 +3208,7 @@ static void nvme_fw_act_work(struct work_struct *work)
2660 return; 3208 return;
2661 3209
2662 nvme_start_queues(ctrl); 3210 nvme_start_queues(ctrl);
2663 /* read FW slot informationi to clear the AER*/ 3211 /* read FW slot information to clear the AER */
2664 nvme_get_fw_slot_info(ctrl); 3212 nvme_get_fw_slot_info(ctrl);
2665} 3213}
2666 3214
@@ -2668,24 +3216,21 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
2668 union nvme_result *res) 3216 union nvme_result *res)
2669{ 3217{
2670 u32 result = le32_to_cpu(res->u32); 3218 u32 result = le32_to_cpu(res->u32);
2671 bool done = true;
2672 3219
2673 switch (le16_to_cpu(status) >> 1) { 3220 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
2674 case NVME_SC_SUCCESS: 3221 return;
2675 done = false; 3222
2676 /*FALLTHRU*/ 3223 switch (result & 0x7) {
2677 case NVME_SC_ABORT_REQ: 3224 case NVME_AER_ERROR:
2678 ++ctrl->event_limit; 3225 case NVME_AER_SMART:
2679 if (ctrl->state == NVME_CTRL_LIVE) 3226 case NVME_AER_CSS:
2680 queue_work(nvme_wq, &ctrl->async_event_work); 3227 case NVME_AER_VS:
3228 ctrl->aen_result = result;
2681 break; 3229 break;
2682 default: 3230 default:
2683 break; 3231 break;
2684 } 3232 }
2685 3233
2686 if (done)
2687 return;
2688
2689 switch (result & 0xff07) { 3234 switch (result & 0xff07) {
2690 case NVME_AER_NOTICE_NS_CHANGED: 3235 case NVME_AER_NOTICE_NS_CHANGED:
2691 dev_info(ctrl->device, "rescanning\n"); 3236 dev_info(ctrl->device, "rescanning\n");
@@ -2697,44 +3242,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
2697 default: 3242 default:
2698 dev_warn(ctrl->device, "async event result %08x\n", result); 3243 dev_warn(ctrl->device, "async event result %08x\n", result);
2699 } 3244 }
2700}
2701EXPORT_SYMBOL_GPL(nvme_complete_async_event);
2702
2703void nvme_queue_async_events(struct nvme_ctrl *ctrl)
2704{
2705 ctrl->event_limit = NVME_NR_AERS;
2706 queue_work(nvme_wq, &ctrl->async_event_work); 3245 queue_work(nvme_wq, &ctrl->async_event_work);
2707} 3246}
2708EXPORT_SYMBOL_GPL(nvme_queue_async_events); 3247EXPORT_SYMBOL_GPL(nvme_complete_async_event);
2709
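The decode above follows the NVMe layout of the AEN completion dword: bits 2:0 carry the event type, bits 15:8 the event information, and bits 23:16 the associated log page, which is why the notice cases compare against result & 0xff07. A standalone decode sketch with an example value (Firmware Activation Starting):

#include <stdio.h>

int main(void)
{
	unsigned int result = 0x0102;			/* example: notice, info 0x01 */
	unsigned int type = result & 0x7;		/* bits 2:0, event type */
	unsigned int info = (result >> 8) & 0xff;	/* bits 15:8, event information */
	unsigned int log_page = (result >> 16) & 0xff;	/* bits 23:16, log page id */

	printf("type %u info %#x log page %#x (type+info mask: %#06x)\n",
	       type, info, log_page, result & 0xff07);
	return 0;
}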
2710static DEFINE_IDA(nvme_instance_ida);
2711
2712static int nvme_set_instance(struct nvme_ctrl *ctrl)
2713{
2714 int instance, error;
2715
2716 do {
2717 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
2718 return -ENODEV;
2719
2720 spin_lock(&dev_list_lock);
2721 error = ida_get_new(&nvme_instance_ida, &instance);
2722 spin_unlock(&dev_list_lock);
2723 } while (error == -EAGAIN);
2724
2725 if (error)
2726 return -ENODEV;
2727
2728 ctrl->instance = instance;
2729 return 0;
2730}
2731
2732static void nvme_release_instance(struct nvme_ctrl *ctrl)
2733{
2734 spin_lock(&dev_list_lock);
2735 ida_remove(&nvme_instance_ida, ctrl->instance);
2736 spin_unlock(&dev_list_lock);
2737}
2738 3248
2739void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 3249void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
2740{ 3250{
@@ -2752,7 +3262,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
2752 3262
2753 if (ctrl->queue_count > 1) { 3263 if (ctrl->queue_count > 1) {
2754 nvme_queue_scan(ctrl); 3264 nvme_queue_scan(ctrl);
2755 nvme_queue_async_events(ctrl); 3265 queue_work(nvme_wq, &ctrl->async_event_work);
2756 nvme_start_queues(ctrl); 3266 nvme_start_queues(ctrl);
2757 } 3267 }
2758} 3268}
@@ -2760,30 +3270,31 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
2760 3270
2761void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 3271void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
2762{ 3272{
2763 device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); 3273 cdev_device_del(&ctrl->cdev, ctrl->device);
2764
2765 spin_lock(&dev_list_lock);
2766 list_del(&ctrl->node);
2767 spin_unlock(&dev_list_lock);
2768} 3274}
2769EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); 3275EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
2770 3276
2771static void nvme_free_ctrl(struct kref *kref) 3277static void nvme_free_ctrl(struct device *dev)
2772{ 3278{
2773 struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); 3279 struct nvme_ctrl *ctrl =
3280 container_of(dev, struct nvme_ctrl, ctrl_device);
3281 struct nvme_subsystem *subsys = ctrl->subsys;
2774 3282
2775 put_device(ctrl->device); 3283 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
2776 nvme_release_instance(ctrl); 3284 kfree(ctrl->effects);
2777 ida_destroy(&ctrl->ns_ida); 3285
3286 if (subsys) {
3287 mutex_lock(&subsys->lock);
3288 list_del(&ctrl->subsys_entry);
3289 mutex_unlock(&subsys->lock);
3290 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
3291 }
2778 3292
2779 ctrl->ops->free_ctrl(ctrl); 3293 ctrl->ops->free_ctrl(ctrl);
2780}
2781 3294
2782void nvme_put_ctrl(struct nvme_ctrl *ctrl) 3295 if (subsys)
2783{ 3296 nvme_put_subsystem(subsys);
2784 kref_put(&ctrl->kref, nvme_free_ctrl);
2785} 3297}
2786EXPORT_SYMBOL_GPL(nvme_put_ctrl);
2787 3298
2788/* 3299/*
2789 * Initialize a NVMe controller structures. This needs to be called during 3300 * Initialize a NVMe controller structures. This needs to be called during
@@ -2799,32 +3310,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
2799 spin_lock_init(&ctrl->lock); 3310 spin_lock_init(&ctrl->lock);
2800 INIT_LIST_HEAD(&ctrl->namespaces); 3311 INIT_LIST_HEAD(&ctrl->namespaces);
2801 mutex_init(&ctrl->namespaces_mutex); 3312 mutex_init(&ctrl->namespaces_mutex);
2802 kref_init(&ctrl->kref);
2803 ctrl->dev = dev; 3313 ctrl->dev = dev;
2804 ctrl->ops = ops; 3314 ctrl->ops = ops;
2805 ctrl->quirks = quirks; 3315 ctrl->quirks = quirks;
2806 INIT_WORK(&ctrl->scan_work, nvme_scan_work); 3316 INIT_WORK(&ctrl->scan_work, nvme_scan_work);
2807 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 3317 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
2808 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 3318 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
3319 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
2809 3320
2810 ret = nvme_set_instance(ctrl); 3321 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
2811 if (ret) 3322 if (ret < 0)
2812 goto out; 3323 goto out;
2813 3324 ctrl->instance = ret;
2814 ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, 3325
2815 MKDEV(nvme_char_major, ctrl->instance), 3326 device_initialize(&ctrl->ctrl_device);
2816 ctrl, nvme_dev_attr_groups, 3327 ctrl->device = &ctrl->ctrl_device;
2817 "nvme%d", ctrl->instance); 3328 ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
2818 if (IS_ERR(ctrl->device)) { 3329 ctrl->device->class = nvme_class;
2819 ret = PTR_ERR(ctrl->device); 3330 ctrl->device->parent = ctrl->dev;
3331 ctrl->device->groups = nvme_dev_attr_groups;
3332 ctrl->device->release = nvme_free_ctrl;
3333 dev_set_drvdata(ctrl->device, ctrl);
3334 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
3335 if (ret)
2820 goto out_release_instance; 3336 goto out_release_instance;
2821 }
2822 get_device(ctrl->device);
2823 ida_init(&ctrl->ns_ida);
2824 3337
2825 spin_lock(&dev_list_lock); 3338 cdev_init(&ctrl->cdev, &nvme_dev_fops);
2826 list_add_tail(&ctrl->node, &nvme_ctrl_list); 3339 ctrl->cdev.owner = ops->module;
2827 spin_unlock(&dev_list_lock); 3340 ret = cdev_device_add(&ctrl->cdev, ctrl->device);
3341 if (ret)
3342 goto out_free_name;
2828 3343
2829 /* 3344 /*
2830 * Initialize latency tolerance controls. The sysfs files won't 3345 * Initialize latency tolerance controls. The sysfs files won't
@@ -2835,8 +3350,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
2835 min(default_ps_max_latency_us, (unsigned long)S32_MAX)); 3350 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
2836 3351
2837 return 0; 3352 return 0;
3353out_free_name:
3354 kfree_const(dev->kobj.name);
2838out_release_instance: 3355out_release_instance:
2839 nvme_release_instance(ctrl); 3356 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
2840out: 3357out:
2841 return ret; 3358 return ret;
2842} 3359}
@@ -2945,6 +3462,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
2945} 3462}
2946EXPORT_SYMBOL_GPL(nvme_start_queues); 3463EXPORT_SYMBOL_GPL(nvme_start_queues);
2947 3464
3465int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
3466{
3467 if (!ctrl->ops->reinit_request)
3468 return 0;
3469
3470 return blk_mq_tagset_iter(set, set->driver_data,
3471 ctrl->ops->reinit_request);
3472}
3473EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3474
2948int __init nvme_core_init(void) 3475int __init nvme_core_init(void)
2949{ 3476{
2950 int result; 3477 int result;
@@ -2954,12 +3481,9 @@ int __init nvme_core_init(void)
2954 if (!nvme_wq) 3481 if (!nvme_wq)
2955 return -ENOMEM; 3482 return -ENOMEM;
2956 3483
2957 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", 3484 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
2958 &nvme_dev_fops);
2959 if (result < 0) 3485 if (result < 0)
2960 goto destroy_wq; 3486 goto destroy_wq;
2961 else if (result > 0)
2962 nvme_char_major = result;
2963 3487
2964 nvme_class = class_create(THIS_MODULE, "nvme"); 3488 nvme_class = class_create(THIS_MODULE, "nvme");
2965 if (IS_ERR(nvme_class)) { 3489 if (IS_ERR(nvme_class)) {
@@ -2967,10 +3491,17 @@ int __init nvme_core_init(void)
2967 goto unregister_chrdev; 3491 goto unregister_chrdev;
2968 } 3492 }
2969 3493
3494 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
3495 if (IS_ERR(nvme_subsys_class)) {
3496 result = PTR_ERR(nvme_subsys_class);
3497 goto destroy_class;
3498 }
2970 return 0; 3499 return 0;
2971 3500
3501destroy_class:
3502 class_destroy(nvme_class);
2972unregister_chrdev: 3503unregister_chrdev:
2973 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3504 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
2974destroy_wq: 3505destroy_wq:
2975 destroy_workqueue(nvme_wq); 3506 destroy_workqueue(nvme_wq);
2976 return result; 3507 return result;
@@ -2978,8 +3509,10 @@ destroy_wq:
2978 3509
2979void nvme_core_exit(void) 3510void nvme_core_exit(void)
2980{ 3511{
3512 ida_destroy(&nvme_subsystems_ida);
3513 class_destroy(nvme_subsys_class);
2981 class_destroy(nvme_class); 3514 class_destroy(nvme_class);
2982 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3515 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
2983 destroy_workqueue(nvme_wq); 3516 destroy_workqueue(nvme_wq);
2984} 3517}
2985 3518
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 555c976cc2ee..76b4fe6816a0 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -548,6 +548,7 @@ static const match_table_t opt_tokens = {
548 { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, 548 { NVMF_OPT_HOSTNQN, "hostnqn=%s" },
549 { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, 549 { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
550 { NVMF_OPT_HOST_ID, "hostid=%s" }, 550 { NVMF_OPT_HOST_ID, "hostid=%s" },
551 { NVMF_OPT_DUP_CONNECT, "duplicate_connect" },
551 { NVMF_OPT_ERR, NULL } 552 { NVMF_OPT_ERR, NULL }
552}; 553};
553 554
@@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
566 opts->nr_io_queues = num_online_cpus(); 567 opts->nr_io_queues = num_online_cpus();
567 opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; 568 opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
568 opts->kato = NVME_DEFAULT_KATO; 569 opts->kato = NVME_DEFAULT_KATO;
570 opts->duplicate_connect = false;
569 571
570 options = o = kstrdup(buf, GFP_KERNEL); 572 options = o = kstrdup(buf, GFP_KERNEL);
571 if (!options) 573 if (!options)
@@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
742 goto out; 744 goto out;
743 } 745 }
744 break; 746 break;
747 case NVMF_OPT_DUP_CONNECT:
748 opts->duplicate_connect = true;
749 break;
745 default: 750 default:
746 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", 751 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
747 p); 752 p);
@@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
823#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) 828#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
824#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ 829#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
825 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ 830 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
826 NVMF_OPT_HOST_ID) 831 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
827 832
828static struct nvme_ctrl * 833static struct nvme_ctrl *
829nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) 834nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
@@ -841,6 +846,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
841 if (ret) 846 if (ret)
842 goto out_free_opts; 847 goto out_free_opts;
843 848
849
850 request_module("nvme-%s", opts->transport);
851
844 /* 852 /*
845 * Check the generic options first as we need a valid transport for 853 * Check the generic options first as we need a valid transport for
846 * the lookup below. Then clear the generic flags so that transport 854 * the lookup below. Then clear the generic flags so that transport
@@ -874,12 +882,12 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
874 goto out_unlock; 882 goto out_unlock;
875 } 883 }
876 884
877 if (strcmp(ctrl->subnqn, opts->subsysnqn)) { 885 if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
878 dev_warn(ctrl->device, 886 dev_warn(ctrl->device,
879 "controller returned incorrect NQN: \"%s\".\n", 887 "controller returned incorrect NQN: \"%s\".\n",
880 ctrl->subnqn); 888 ctrl->subsys->subnqn);
881 up_read(&nvmf_transports_rwsem); 889 up_read(&nvmf_transports_rwsem);
882 ctrl->ops->delete_ctrl(ctrl); 890 nvme_delete_ctrl_sync(ctrl);
883 return ERR_PTR(-EINVAL); 891 return ERR_PTR(-EINVAL);
884 } 892 }
885 893
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index bf33663218cd..42232e731f19 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -57,6 +57,7 @@ enum {
57 NVMF_OPT_HOST_TRADDR = 1 << 10, 57 NVMF_OPT_HOST_TRADDR = 1 << 10,
58 NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, 58 NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
59 NVMF_OPT_HOST_ID = 1 << 12, 59 NVMF_OPT_HOST_ID = 1 << 12,
60 NVMF_OPT_DUP_CONNECT = 1 << 13,
60}; 61};
61 62
62/** 63/**
@@ -96,6 +97,7 @@ struct nvmf_ctrl_options {
96 unsigned int nr_io_queues; 97 unsigned int nr_io_queues;
97 unsigned int reconnect_delay; 98 unsigned int reconnect_delay;
98 bool discovery_nqn; 99 bool discovery_nqn;
100 bool duplicate_connect;
99 unsigned int kato; 101 unsigned int kato;
100 struct nvmf_host *host; 102 struct nvmf_host *host;
101 int max_reconnects; 103 int max_reconnects;
@@ -131,6 +133,18 @@ struct nvmf_transport_ops {
131 struct nvmf_ctrl_options *opts); 133 struct nvmf_ctrl_options *opts);
132}; 134};
133 135
136static inline bool
137nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
138 struct nvmf_ctrl_options *opts)
139{
140 if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
141 strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
142 memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t)))
143 return false;
144
145 return true;
146}
147
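nvmf_ctlr_matches_baseopts() gives transports a common check for duplicate connection requests (same subsystem NQN, host NQN and host ID), so a create can be rejected unless the new duplicate_connect option was passed. A simplified userspace sketch of that comparison — field names and values are illustrative, the 16-byte ID mirrors uuid_t:

#include <stdio.h>
#include <string.h>

struct opts {
	const char *subsysnqn;
	const char *hostnqn;
	unsigned char host_id[16];
	int duplicate_connect;
};

static int matches_baseopts(const struct opts *existing, const struct opts *new)
{
	if (strcmp(new->subsysnqn, existing->subsysnqn) ||
	    strcmp(new->hostnqn, existing->hostnqn) ||
	    memcmp(new->host_id, existing->host_id, sizeof(new->host_id)))
		return 0;
	return 1;
}

int main(void)
{
	struct opts a = { "nqn.2017-11.org.example:subsys1",
			  "nqn.2017-11.org.example:host1", { 1 }, 0 };
	struct opts b = a;	/* same base options, duplicate_connect not set */

	if (matches_baseopts(&a, &b) && !b.duplicate_connect)
		printf("rejecting duplicate connect\n");
	return 0;
}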
134int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); 148int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
135int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); 149int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
136int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); 150int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index be49d0f79381..7ab0be55c7d0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -30,27 +30,19 @@
30/* *************************** Data Structures/Defines ****************** */ 30/* *************************** Data Structures/Defines ****************** */
31 31
32 32
33/*
34 * We handle AEN commands ourselves and don't even let the
35 * block layer know about them.
36 */
37#define NVME_FC_NR_AEN_COMMANDS 1
38#define NVME_FC_AQ_BLKMQ_DEPTH \
39 (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
40#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1)
41
42enum nvme_fc_queue_flags { 33enum nvme_fc_queue_flags {
43 NVME_FC_Q_CONNECTED = (1 << 0), 34 NVME_FC_Q_CONNECTED = (1 << 0),
44}; 35};
45 36
46#define NVMEFC_QUEUE_DELAY 3 /* ms units */ 37#define NVMEFC_QUEUE_DELAY 3 /* ms units */
47 38
39#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
40
48struct nvme_fc_queue { 41struct nvme_fc_queue {
49 struct nvme_fc_ctrl *ctrl; 42 struct nvme_fc_ctrl *ctrl;
50 struct device *dev; 43 struct device *dev;
51 struct blk_mq_hw_ctx *hctx; 44 struct blk_mq_hw_ctx *hctx;
52 void *lldd_handle; 45 void *lldd_handle;
53 int queue_size;
54 size_t cmnd_capsule_len; 46 size_t cmnd_capsule_len;
55 u32 qnum; 47 u32 qnum;
56 u32 rqcnt; 48 u32 rqcnt;
@@ -124,6 +116,7 @@ struct nvme_fc_lport {
124 struct device *dev; /* physical device for dma */ 116 struct device *dev; /* physical device for dma */
125 struct nvme_fc_port_template *ops; 117 struct nvme_fc_port_template *ops;
126 struct kref ref; 118 struct kref ref;
119 atomic_t act_rport_cnt;
127} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ 120} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
128 121
129struct nvme_fc_rport { 122struct nvme_fc_rport {
@@ -136,6 +129,8 @@ struct nvme_fc_rport {
136 struct nvme_fc_lport *lport; 129 struct nvme_fc_lport *lport;
137 spinlock_t lock; 130 spinlock_t lock;
138 struct kref ref; 131 struct kref ref;
132 atomic_t act_ctrl_cnt;
133 unsigned long dev_loss_end;
139} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ 134} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
140 135
141enum nvme_fcctrl_flags { 136enum nvme_fcctrl_flags {
@@ -150,6 +145,7 @@ struct nvme_fc_ctrl {
150 struct nvme_fc_rport *rport; 145 struct nvme_fc_rport *rport;
151 u32 cnum; 146 u32 cnum;
152 147
148 bool assoc_active;
153 u64 association_id; 149 u64 association_id;
154 150
155 struct list_head ctrl_list; /* rport->ctrl_list */ 151 struct list_head ctrl_list; /* rport->ctrl_list */
@@ -157,7 +153,6 @@ struct nvme_fc_ctrl {
157 struct blk_mq_tag_set admin_tag_set; 153 struct blk_mq_tag_set admin_tag_set;
158 struct blk_mq_tag_set tag_set; 154 struct blk_mq_tag_set tag_set;
159 155
160 struct work_struct delete_work;
161 struct delayed_work connect_work; 156 struct delayed_work connect_work;
162 157
163 struct kref ref; 158 struct kref ref;
@@ -165,7 +160,7 @@ struct nvme_fc_ctrl {
165 u32 iocnt; 160 u32 iocnt;
166 wait_queue_head_t ioabort_wait; 161 wait_queue_head_t ioabort_wait;
167 162
168 struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; 163 struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
169 164
170 struct nvme_ctrl ctrl; 165 struct nvme_ctrl ctrl;
171}; 166};
@@ -213,10 +208,16 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
213 208
214 209
215 210
211/*
212 * These items are short-term. They will eventually be moved into
213 * a generic FC class. See comments in module init.
214 */
215static struct class *fc_class;
216static struct device *fc_udev_device;
217
216 218
217/* *********************** FC-NVME Port Management ************************ */ 219/* *********************** FC-NVME Port Management ************************ */
218 220
219static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *);
220static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, 221static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
221 struct nvme_fc_queue *, unsigned int); 222 struct nvme_fc_queue *, unsigned int);
222 223
@@ -235,9 +236,6 @@ nvme_fc_free_lport(struct kref *ref)
235 list_del(&lport->port_list); 236 list_del(&lport->port_list);
236 spin_unlock_irqrestore(&nvme_fc_lock, flags); 237 spin_unlock_irqrestore(&nvme_fc_lock, flags);
237 238
238 /* let the LLDD know we've finished tearing it down */
239 lport->ops->localport_delete(&lport->localport);
240
241 ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); 239 ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
242 ida_destroy(&lport->endp_cnt); 240 ida_destroy(&lport->endp_cnt);
243 241
@@ -260,7 +258,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport)
260 258
261 259
262static struct nvme_fc_lport * 260static struct nvme_fc_lport *
263nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) 261nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo,
262 struct nvme_fc_port_template *ops,
263 struct device *dev)
264{ 264{
265 struct nvme_fc_lport *lport; 265 struct nvme_fc_lport *lport;
266 unsigned long flags; 266 unsigned long flags;
@@ -272,6 +272,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
272 lport->localport.port_name != pinfo->port_name) 272 lport->localport.port_name != pinfo->port_name)
273 continue; 273 continue;
274 274
275 if (lport->dev != dev) {
276 lport = ERR_PTR(-EXDEV);
277 goto out_done;
278 }
279
275 if (lport->localport.port_state != FC_OBJSTATE_DELETED) { 280 if (lport->localport.port_state != FC_OBJSTATE_DELETED) {
276 lport = ERR_PTR(-EEXIST); 281 lport = ERR_PTR(-EEXIST);
277 goto out_done; 282 goto out_done;
@@ -288,6 +293,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
288 293
289 /* resume the lport */ 294 /* resume the lport */
290 295
296 lport->ops = ops;
291 lport->localport.port_role = pinfo->port_role; 297 lport->localport.port_role = pinfo->port_role;
292 lport->localport.port_id = pinfo->port_id; 298 lport->localport.port_id = pinfo->port_id;
293 lport->localport.port_state = FC_OBJSTATE_ONLINE; 299 lport->localport.port_state = FC_OBJSTATE_ONLINE;
@@ -348,7 +354,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
348 * expired, we can simply re-enable the localport. Remoteports 354 * expired, we can simply re-enable the localport. Remoteports
349 * and controller reconnections should resume naturally. 355 * and controller reconnections should resume naturally.
350 */ 356 */
351 newrec = nvme_fc_attach_to_unreg_lport(pinfo); 357 newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev);
352 358
353 /* found an lport, but something about its state is bad */ 359 /* found an lport, but something about its state is bad */
354 if (IS_ERR(newrec)) { 360 if (IS_ERR(newrec)) {
@@ -384,6 +390,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
384 INIT_LIST_HEAD(&newrec->port_list); 390 INIT_LIST_HEAD(&newrec->port_list);
385 INIT_LIST_HEAD(&newrec->endp_list); 391 INIT_LIST_HEAD(&newrec->endp_list);
386 kref_init(&newrec->ref); 392 kref_init(&newrec->ref);
393 atomic_set(&newrec->act_rport_cnt, 0);
387 newrec->ops = template; 394 newrec->ops = template;
388 newrec->dev = dev; 395 newrec->dev = dev;
389 ida_init(&newrec->endp_cnt); 396 ida_init(&newrec->endp_cnt);
@@ -446,12 +453,177 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr)
446 453
447 spin_unlock_irqrestore(&nvme_fc_lock, flags); 454 spin_unlock_irqrestore(&nvme_fc_lock, flags);
448 455
456 if (atomic_read(&lport->act_rport_cnt) == 0)
457 lport->ops->localport_delete(&lport->localport);
458
449 nvme_fc_lport_put(lport); 459 nvme_fc_lport_put(lport);
450 460
451 return 0; 461 return 0;
452} 462}
453EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); 463EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport);
454 464
465/*
 466 * TRADDR strings, per FC-NVME, are fixed format:
467 * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters
468 * udev event will only differ by prefix of what field is
469 * being specified:
470 * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters
471 * 19 + 43 + null_fudge = 64 characters
472 */
473#define FCNVME_TRADDR_LENGTH 64
474
475static void
476nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport,
477 struct nvme_fc_rport *rport)
478{
479 char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/
480 char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/
481 char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL };
482
483 if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY))
484 return;
485
486 snprintf(hostaddr, sizeof(hostaddr),
487 "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx",
488 lport->localport.node_name, lport->localport.port_name);
489 snprintf(tgtaddr, sizeof(tgtaddr),
490 "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx",
491 rport->remoteport.node_name, rport->remoteport.port_name);
492 kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp);
493}
494
495static void
496nvme_fc_free_rport(struct kref *ref)
497{
498 struct nvme_fc_rport *rport =
499 container_of(ref, struct nvme_fc_rport, ref);
500 struct nvme_fc_lport *lport =
501 localport_to_lport(rport->remoteport.localport);
502 unsigned long flags;
503
504 WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
505 WARN_ON(!list_empty(&rport->ctrl_list));
506
507 /* remove from lport list */
508 spin_lock_irqsave(&nvme_fc_lock, flags);
509 list_del(&rport->endp_list);
510 spin_unlock_irqrestore(&nvme_fc_lock, flags);
511
512 ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
513
514 kfree(rport);
515
516 nvme_fc_lport_put(lport);
517}
518
519static void
520nvme_fc_rport_put(struct nvme_fc_rport *rport)
521{
522 kref_put(&rport->ref, nvme_fc_free_rport);
523}
524
525static int
526nvme_fc_rport_get(struct nvme_fc_rport *rport)
527{
528 return kref_get_unless_zero(&rport->ref);
529}
530
531static void
532nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
533{
534 switch (ctrl->ctrl.state) {
535 case NVME_CTRL_NEW:
536 case NVME_CTRL_RECONNECTING:
537 /*
538 * As all reconnects were suppressed, schedule a
539 * connect.
540 */
541 dev_info(ctrl->ctrl.device,
542 "NVME-FC{%d}: connectivity re-established. "
543 "Attempting reconnect\n", ctrl->cnum);
544
545 queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
546 break;
547
548 case NVME_CTRL_RESETTING:
549 /*
550 * Controller is already in the process of terminating the
551 * association. No need to do anything further. The reconnect
552 * step will naturally occur after the reset completes.
553 */
554 break;
555
556 default:
557 /* no action to take - let it delete */
558 break;
559 }
560}
561
562static struct nvme_fc_rport *
563nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport,
564 struct nvme_fc_port_info *pinfo)
565{
566 struct nvme_fc_rport *rport;
567 struct nvme_fc_ctrl *ctrl;
568 unsigned long flags;
569
570 spin_lock_irqsave(&nvme_fc_lock, flags);
571
572 list_for_each_entry(rport, &lport->endp_list, endp_list) {
573 if (rport->remoteport.node_name != pinfo->node_name ||
574 rport->remoteport.port_name != pinfo->port_name)
575 continue;
576
577 if (!nvme_fc_rport_get(rport)) {
578 rport = ERR_PTR(-ENOLCK);
579 goto out_done;
580 }
581
582 spin_unlock_irqrestore(&nvme_fc_lock, flags);
583
584 spin_lock_irqsave(&rport->lock, flags);
585
586 /* has it been unregistered */
587 if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) {
588 /* means lldd called us twice */
589 spin_unlock_irqrestore(&rport->lock, flags);
590 nvme_fc_rport_put(rport);
591 return ERR_PTR(-ESTALE);
592 }
593
594 rport->remoteport.port_state = FC_OBJSTATE_ONLINE;
595 rport->dev_loss_end = 0;
596
597 /*
598 * kick off a reconnect attempt on all associations to the
 600 * remote port. A successful reconnect will resume i/o.
600 */
601 list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
602 nvme_fc_resume_controller(ctrl);
603
604 spin_unlock_irqrestore(&rport->lock, flags);
605
606 return rport;
607 }
608
609 rport = NULL;
610
611out_done:
612 spin_unlock_irqrestore(&nvme_fc_lock, flags);
613
614 return rport;
615}
616
617static inline void
618__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport,
619 struct nvme_fc_port_info *pinfo)
620{
621 if (pinfo->dev_loss_tmo)
622 rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo;
623 else
624 rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO;
625}
626
455/** 627/**
456 * nvme_fc_register_remoteport - transport entry point called by an 628 * nvme_fc_register_remoteport - transport entry point called by an
457 * LLDD to register the existence of a NVME 629 * LLDD to register the existence of a NVME
@@ -478,28 +650,52 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
478 unsigned long flags; 650 unsigned long flags;
479 int ret, idx; 651 int ret, idx;
480 652
653 if (!nvme_fc_lport_get(lport)) {
654 ret = -ESHUTDOWN;
655 goto out_reghost_failed;
656 }
657
658 /*
659 * look to see if there is already a remoteport that is waiting
660 * for a reconnect (within dev_loss_tmo) with the same WWN's.
661 * If so, transition to it and reconnect.
662 */
663 newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo);
664
665 /* found an rport, but something about its state is bad */
666 if (IS_ERR(newrec)) {
667 ret = PTR_ERR(newrec);
668 goto out_lport_put;
669
670 /* found existing rport, which was resumed */
671 } else if (newrec) {
672 nvme_fc_lport_put(lport);
673 __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
674 nvme_fc_signal_discovery_scan(lport, newrec);
675 *portptr = &newrec->remoteport;
676 return 0;
677 }
678
679 /* nothing found - allocate a new remoteport struct */
680
481 newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), 681 newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
482 GFP_KERNEL); 682 GFP_KERNEL);
483 if (!newrec) { 683 if (!newrec) {
484 ret = -ENOMEM; 684 ret = -ENOMEM;
485 goto out_reghost_failed; 685 goto out_lport_put;
486 }
487
488 if (!nvme_fc_lport_get(lport)) {
489 ret = -ESHUTDOWN;
490 goto out_kfree_rport;
491 } 686 }
492 687
493 idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); 688 idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL);
494 if (idx < 0) { 689 if (idx < 0) {
495 ret = -ENOSPC; 690 ret = -ENOSPC;
496 goto out_lport_put; 691 goto out_kfree_rport;
497 } 692 }
498 693
499 INIT_LIST_HEAD(&newrec->endp_list); 694 INIT_LIST_HEAD(&newrec->endp_list);
500 INIT_LIST_HEAD(&newrec->ctrl_list); 695 INIT_LIST_HEAD(&newrec->ctrl_list);
501 INIT_LIST_HEAD(&newrec->ls_req_list); 696 INIT_LIST_HEAD(&newrec->ls_req_list);
502 kref_init(&newrec->ref); 697 kref_init(&newrec->ref);
698 atomic_set(&newrec->act_ctrl_cnt, 0);
503 spin_lock_init(&newrec->lock); 699 spin_lock_init(&newrec->lock);
504 newrec->remoteport.localport = &lport->localport; 700 newrec->remoteport.localport = &lport->localport;
505 newrec->dev = lport->dev; 701 newrec->dev = lport->dev;
@@ -511,63 +707,27 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
511 newrec->remoteport.port_id = pinfo->port_id; 707 newrec->remoteport.port_id = pinfo->port_id;
512 newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; 708 newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
513 newrec->remoteport.port_num = idx; 709 newrec->remoteport.port_num = idx;
710 __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
514 711
515 spin_lock_irqsave(&nvme_fc_lock, flags); 712 spin_lock_irqsave(&nvme_fc_lock, flags);
516 list_add_tail(&newrec->endp_list, &lport->endp_list); 713 list_add_tail(&newrec->endp_list, &lport->endp_list);
517 spin_unlock_irqrestore(&nvme_fc_lock, flags); 714 spin_unlock_irqrestore(&nvme_fc_lock, flags);
518 715
716 nvme_fc_signal_discovery_scan(lport, newrec);
717
519 *portptr = &newrec->remoteport; 718 *portptr = &newrec->remoteport;
520 return 0; 719 return 0;
521 720
522out_lport_put:
523 nvme_fc_lport_put(lport);
524out_kfree_rport: 721out_kfree_rport:
525 kfree(newrec); 722 kfree(newrec);
723out_lport_put:
724 nvme_fc_lport_put(lport);
526out_reghost_failed: 725out_reghost_failed:
527 *portptr = NULL; 726 *portptr = NULL;
528 return ret; 727 return ret;
529} 728}
530EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); 729EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
531 730
532static void
533nvme_fc_free_rport(struct kref *ref)
534{
535 struct nvme_fc_rport *rport =
536 container_of(ref, struct nvme_fc_rport, ref);
537 struct nvme_fc_lport *lport =
538 localport_to_lport(rport->remoteport.localport);
539 unsigned long flags;
540
541 WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
542 WARN_ON(!list_empty(&rport->ctrl_list));
543
544 /* remove from lport list */
545 spin_lock_irqsave(&nvme_fc_lock, flags);
546 list_del(&rport->endp_list);
547 spin_unlock_irqrestore(&nvme_fc_lock, flags);
548
549 /* let the LLDD know we've finished tearing it down */
550 lport->ops->remoteport_delete(&rport->remoteport);
551
552 ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
553
554 kfree(rport);
555
556 nvme_fc_lport_put(lport);
557}
558
559static void
560nvme_fc_rport_put(struct nvme_fc_rport *rport)
561{
562 kref_put(&rport->ref, nvme_fc_free_rport);
563}
564
565static int
566nvme_fc_rport_get(struct nvme_fc_rport *rport)
567{
568 return kref_get_unless_zero(&rport->ref);
569}
570
571static int 731static int
572nvme_fc_abort_lsops(struct nvme_fc_rport *rport) 732nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
573{ 733{
@@ -592,6 +752,58 @@ restart:
592 return 0; 752 return 0;
593} 753}
594 754
755static void
756nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
757{
758 dev_info(ctrl->ctrl.device,
759 "NVME-FC{%d}: controller connectivity lost. Awaiting "
760 "Reconnect", ctrl->cnum);
761
762 switch (ctrl->ctrl.state) {
763 case NVME_CTRL_NEW:
764 case NVME_CTRL_LIVE:
765 /*
766 * Schedule a controller reset. The reset will terminate the
767 * association and schedule the reconnect timer. Reconnects
768 * will be attempted until either the ctlr_loss_tmo
769 * (max_retries * connect_delay) expires or the remoteport's
770 * dev_loss_tmo expires.
771 */
772 if (nvme_reset_ctrl(&ctrl->ctrl)) {
773 dev_warn(ctrl->ctrl.device,
774 "NVME-FC{%d}: Couldn't schedule reset. "
775 "Deleting controller.\n",
776 ctrl->cnum);
777 nvme_delete_ctrl(&ctrl->ctrl);
778 }
779 break;
780
781 case NVME_CTRL_RECONNECTING:
782 /*
783 * The association has already been terminated and the
784 * controller is attempting reconnects. No need to do anything
 785 * further. Reconnects will be attempted until either the
786 * ctlr_loss_tmo (max_retries * connect_delay) expires or the
787 * remoteport's dev_loss_tmo expires.
788 */
789 break;
790
791 case NVME_CTRL_RESETTING:
792 /*
793 * Controller is already in the process of terminating the
794 * association. No need to do anything further. The reconnect
795 * step will kick in naturally after the association is
796 * terminated.
797 */
798 break;
799
800 case NVME_CTRL_DELETING:
801 default:
802 /* no action to take - let it delete */
803 break;
804 }
805}
806
595/** 807/**
596 * nvme_fc_unregister_remoteport - transport entry point called by an 808 * nvme_fc_unregister_remoteport - transport entry point called by an
597 * LLDD to deregister/remove a previously 809 * LLDD to deregister/remove a previously
@@ -621,19 +833,78 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
621 } 833 }
622 portptr->port_state = FC_OBJSTATE_DELETED; 834 portptr->port_state = FC_OBJSTATE_DELETED;
623 835
624 /* tear down all associations to the remote port */ 836 rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ);
625 list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) 837
626 __nvme_fc_del_ctrl(ctrl); 838 list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
839 /* if dev_loss_tmo==0, dev loss is immediate */
840 if (!portptr->dev_loss_tmo) {
841 dev_warn(ctrl->ctrl.device,
842 "NVME-FC{%d}: controller connectivity lost. "
843 "Deleting controller.\n",
844 ctrl->cnum);
845 nvme_delete_ctrl(&ctrl->ctrl);
846 } else
847 nvme_fc_ctrl_connectivity_loss(ctrl);
848 }
627 849
628 spin_unlock_irqrestore(&rport->lock, flags); 850 spin_unlock_irqrestore(&rport->lock, flags);
629 851
630 nvme_fc_abort_lsops(rport); 852 nvme_fc_abort_lsops(rport);
631 853
854 if (atomic_read(&rport->act_ctrl_cnt) == 0)
855 rport->lport->ops->remoteport_delete(portptr);
856
857 /*
 858 * release the reference; once all controllers go away, which
 859 * should only occur after dev_loss_tmo expires, the rport can
 860 * be torn down.
861 */
632 nvme_fc_rport_put(rport); 862 nvme_fc_rport_put(rport);
863
633 return 0; 864 return 0;
634} 865}
635EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); 866EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport);
636 867
868/**
869 * nvme_fc_rescan_remoteport - transport entry point called by an
870 * LLDD to request a nvme device rescan.
871 * @remoteport: pointer to the (registered) remote port that is to be
872 * rescanned.
873 *
874 * Returns: N/A
875 */
876void
877nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport)
878{
879 struct nvme_fc_rport *rport = remoteport_to_rport(remoteport);
880
881 nvme_fc_signal_discovery_scan(rport->lport, rport);
882}
883EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport);
884
885int
886nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr,
887 u32 dev_loss_tmo)
888{
889 struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
890 unsigned long flags;
891
892 spin_lock_irqsave(&rport->lock, flags);
893
894 if (portptr->port_state != FC_OBJSTATE_ONLINE) {
895 spin_unlock_irqrestore(&rport->lock, flags);
896 return -EINVAL;
897 }
898
899 /* a dev_loss_tmo of 0 (immediate) is allowed to be set */
900 rport->remoteport.dev_loss_tmo = dev_loss_tmo;
901
902 spin_unlock_irqrestore(&rport->lock, flags);
903
904 return 0;
905}
906EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss);
907
637 908
638/* *********************** FC-NVME DMA Handling **************************** */ 909/* *********************** FC-NVME DMA Handling **************************** */
639 910
@@ -723,7 +994,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
723 dma_unmap_sg(dev, sg, nents, dir); 994 dma_unmap_sg(dev, sg, nents, dir);
724} 995}
725 996
726
727/* *********************** FC-NVME LS Handling **************************** */ 997/* *********************** FC-NVME LS Handling **************************** */
728 998
729static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); 999static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
@@ -1266,7 +1536,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
1266 unsigned long flags; 1536 unsigned long flags;
1267 int i, ret; 1537 int i, ret;
1268 1538
1269 for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { 1539 for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
1270 if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) 1540 if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
1271 continue; 1541 continue;
1272 1542
@@ -1331,7 +1601,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
1331 struct nvme_command *sqe = &op->cmd_iu.sqe; 1601 struct nvme_command *sqe = &op->cmd_iu.sqe;
1332 __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); 1602 __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
1333 union nvme_result result; 1603 union nvme_result result;
1334 bool complete_rq, terminate_assoc = true; 1604 bool terminate_assoc = true;
1335 1605
1336 /* 1606 /*
1337 * WARNING: 1607 * WARNING:
@@ -1373,8 +1643,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
1373 fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, 1643 fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
1374 sizeof(op->rsp_iu), DMA_FROM_DEVICE); 1644 sizeof(op->rsp_iu), DMA_FROM_DEVICE);
1375 1645
1376 if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) 1646 if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
1377 status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); 1647 op->flags & FCOP_FLAGS_TERMIO)
1648 status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
1378 else if (freq->status) 1649 else if (freq->status)
1379 status = cpu_to_le16(NVME_SC_INTERNAL << 1); 1650 status = cpu_to_le16(NVME_SC_INTERNAL << 1);
1380 1651
@@ -1438,23 +1709,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
1438done: 1709done:
1439 if (op->flags & FCOP_FLAGS_AEN) { 1710 if (op->flags & FCOP_FLAGS_AEN) {
1440 nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); 1711 nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
1441 complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); 1712 __nvme_fc_fcpop_chk_teardowns(ctrl, op);
1442 atomic_set(&op->state, FCPOP_STATE_IDLE); 1713 atomic_set(&op->state, FCPOP_STATE_IDLE);
1443 op->flags = FCOP_FLAGS_AEN; /* clear other flags */ 1714 op->flags = FCOP_FLAGS_AEN; /* clear other flags */
1444 nvme_fc_ctrl_put(ctrl); 1715 nvme_fc_ctrl_put(ctrl);
1445 goto check_error; 1716 goto check_error;
1446 } 1717 }
1447 1718
1448 complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); 1719 /*
1449 if (!complete_rq) { 1720 * Force failures of commands if we're killing the controller
1450 if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { 1721 * or have an error on a command used to create a new association
1451 status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); 1722 */
1452 if (blk_queue_dying(rq->q)) 1723 if (status &&
1453 status |= cpu_to_le16(NVME_SC_DNR << 1); 1724 (blk_queue_dying(rq->q) ||
1454 } 1725 ctrl->ctrl.state == NVME_CTRL_NEW ||
1455 nvme_end_request(rq, status, result); 1726 ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
1456 } else 1727 status |= cpu_to_le16(NVME_SC_DNR << 1);
1728
1729 if (__nvme_fc_fcpop_chk_teardowns(ctrl, op))
1457 __nvme_fc_final_op_cleanup(rq); 1730 __nvme_fc_final_op_cleanup(rq);
1731 else
1732 nvme_end_request(rq, status, result);
1458 1733
1459check_error: 1734check_error:
1460 if (terminate_assoc) 1735 if (terminate_assoc)
@@ -1531,7 +1806,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
1531 int i, ret; 1806 int i, ret;
1532 1807
1533 aen_op = ctrl->aen_ops; 1808 aen_op = ctrl->aen_ops;
1534 for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { 1809 for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
1535 private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, 1810 private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
1536 GFP_KERNEL); 1811 GFP_KERNEL);
1537 if (!private) 1812 if (!private)
@@ -1541,7 +1816,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
1541 sqe = &cmdiu->sqe; 1816 sqe = &cmdiu->sqe;
1542 ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], 1817 ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
1543 aen_op, (struct request *)NULL, 1818 aen_op, (struct request *)NULL,
1544 (AEN_CMDID_BASE + i)); 1819 (NVME_AQ_BLK_MQ_DEPTH + i));
1545 if (ret) { 1820 if (ret) {
1546 kfree(private); 1821 kfree(private);
1547 return ret; 1822 return ret;
@@ -1554,7 +1829,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
1554 memset(sqe, 0, sizeof(*sqe)); 1829 memset(sqe, 0, sizeof(*sqe));
1555 sqe->common.opcode = nvme_admin_async_event; 1830 sqe->common.opcode = nvme_admin_async_event;
1556 /* Note: core layer may overwrite the sqe.command_id value */ 1831 /* Note: core layer may overwrite the sqe.command_id value */
1557 sqe->common.command_id = AEN_CMDID_BASE + i; 1832 sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i;
1558 } 1833 }
1559 return 0; 1834 return 0;
1560} 1835}
@@ -1566,7 +1841,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
1566 int i; 1841 int i;
1567 1842
1568 aen_op = ctrl->aen_ops; 1843 aen_op = ctrl->aen_ops;
1569 for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { 1844 for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
1570 if (!aen_op->fcp_req.private) 1845 if (!aen_op->fcp_req.private)
1571 continue; 1846 continue;
1572 1847
@@ -1610,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
1610} 1885}
1611 1886
1612static void 1887static void
1613nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) 1888nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx)
1614{ 1889{
1615 struct nvme_fc_queue *queue; 1890 struct nvme_fc_queue *queue;
1616 1891
@@ -1626,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
1626 else 1901 else
1627 queue->cmnd_capsule_len = sizeof(struct nvme_command); 1902 queue->cmnd_capsule_len = sizeof(struct nvme_command);
1628 1903
1629 queue->queue_size = queue_size;
1630
1631 /* 1904 /*
1632 * Considered whether we should allocate buffers for all SQEs 1905 * Considered whether we should allocate buffers for all SQEs
1633 * and CQEs and dma map them - mapping their respective entries 1906 * and CQEs and dma map them - mapping their respective entries
@@ -1751,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
1751 int i; 2024 int i;
1752 2025
1753 for (i = 1; i < ctrl->ctrl.queue_count; i++) 2026 for (i = 1; i < ctrl->ctrl.queue_count; i++)
1754 nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); 2027 nvme_fc_init_queue(ctrl, i);
1755} 2028}
1756 2029
1757static void 2030static void
@@ -1825,13 +2098,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
1825 dev_warn(ctrl->ctrl.device, 2098 dev_warn(ctrl->ctrl.device,
1826 "NVME-FC{%d}: resetting controller\n", ctrl->cnum); 2099 "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
1827 2100
1828 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
1829 dev_err(ctrl->ctrl.device,
1830 "NVME-FC{%d}: error_recovery: Couldn't change state "
1831 "to RECONNECTING\n", ctrl->cnum);
1832 return;
1833 }
1834
1835 nvme_reset_ctrl(&ctrl->ctrl); 2101 nvme_reset_ctrl(&ctrl->ctrl);
1836} 2102}
1837 2103
@@ -1842,13 +2108,14 @@ nvme_fc_timeout(struct request *rq, bool reserved)
1842 struct nvme_fc_ctrl *ctrl = op->ctrl; 2108 struct nvme_fc_ctrl *ctrl = op->ctrl;
1843 int ret; 2109 int ret;
1844 2110
1845 if (reserved) 2111 if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
2112 atomic_read(&op->state) == FCPOP_STATE_ABORTED)
1846 return BLK_EH_RESET_TIMER; 2113 return BLK_EH_RESET_TIMER;
1847 2114
1848 ret = __nvme_fc_abort_op(ctrl, op); 2115 ret = __nvme_fc_abort_op(ctrl, op);
1849 if (ret) 2116 if (ret)
1850 /* io wasn't active to abort consider it done */ 2117 /* io wasn't active to abort */
1851 return BLK_EH_HANDLED; 2118 return BLK_EH_NOT_HANDLED;
1852 2119
1853 /* 2120 /*
1854 * we can't individually ABTS an io without affecting the queue, 2121 * we can't individually ABTS an io without affecting the queue,
@@ -1859,7 +2126,12 @@ nvme_fc_timeout(struct request *rq, bool reserved)
1859 */ 2126 */
1860 nvme_fc_error_recovery(ctrl, "io timeout error"); 2127 nvme_fc_error_recovery(ctrl, "io timeout error");
1861 2128
1862 return BLK_EH_HANDLED; 2129 /*
2130 * the io abort has been initiated. Have the reset timer
2131 * restarted and the abort completion will complete the io
2132 * shortly. Avoids a synchronous wait while the abort finishes.
2133 */
2134 return BLK_EH_RESET_TIMER;
1863} 2135}
1864 2136
1865static int 2137static int
@@ -2110,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
2110} 2382}
2111 2383
2112static void 2384static void
2113nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) 2385nvme_fc_submit_async_event(struct nvme_ctrl *arg)
2114{ 2386{
2115 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); 2387 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
2116 struct nvme_fc_fcp_op *aen_op; 2388 struct nvme_fc_fcp_op *aen_op;
@@ -2118,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
2118 bool terminating = false; 2390 bool terminating = false;
2119 blk_status_t ret; 2391 blk_status_t ret;
2120 2392
2121 if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
2122 return;
2123
2124 spin_lock_irqsave(&ctrl->lock, flags); 2393 spin_lock_irqsave(&ctrl->lock, flags);
2125 if (ctrl->flags & FCCTRL_TERMIO) 2394 if (ctrl->flags & FCCTRL_TERMIO)
2126 terminating = true; 2395 terminating = true;
@@ -2129,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
2129 if (terminating) 2398 if (terminating)
2130 return; 2399 return;
2131 2400
2132 aen_op = &ctrl->aen_ops[aer_idx]; 2401 aen_op = &ctrl->aen_ops[0];
2133 2402
2134 ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, 2403 ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
2135 NVMEFC_FCP_NODATA); 2404 NVMEFC_FCP_NODATA);
2136 if (ret) 2405 if (ret)
2137 dev_err(ctrl->ctrl.device, 2406 dev_err(ctrl->ctrl.device,
2138 "failed async event work [%d]\n", aer_idx); 2407 "failed async event work\n");
2139} 2408}
2140 2409
2141static void 2410static void
@@ -2337,7 +2606,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl)
2337 2606
2338 nvme_fc_init_io_queues(ctrl); 2607 nvme_fc_init_io_queues(ctrl);
2339 2608
2340 ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); 2609 ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
2341 if (ret) 2610 if (ret)
2342 goto out_free_io_queues; 2611 goto out_free_io_queues;
2343 2612
@@ -2360,6 +2629,61 @@ out_free_io_queues:
2360 return ret; 2629 return ret;
2361} 2630}
2362 2631
2632static void
2633nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport)
2634{
2635 struct nvme_fc_lport *lport = rport->lport;
2636
2637 atomic_inc(&lport->act_rport_cnt);
2638}
2639
2640static void
2641nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport)
2642{
2643 struct nvme_fc_lport *lport = rport->lport;
2644 u32 cnt;
2645
2646 cnt = atomic_dec_return(&lport->act_rport_cnt);
2647 if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED)
2648 lport->ops->localport_delete(&lport->localport);
2649}
2650
2651static int
2652nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl)
2653{
2654 struct nvme_fc_rport *rport = ctrl->rport;
2655 u32 cnt;
2656
2657 if (ctrl->assoc_active)
2658 return 1;
2659
2660 ctrl->assoc_active = true;
2661 cnt = atomic_inc_return(&rport->act_ctrl_cnt);
2662 if (cnt == 1)
2663 nvme_fc_rport_active_on_lport(rport);
2664
2665 return 0;
2666}
2667
2668static int
2669nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl)
2670{
2671 struct nvme_fc_rport *rport = ctrl->rport;
2672 struct nvme_fc_lport *lport = rport->lport;
2673 u32 cnt;
2674
2675 /* ctrl->assoc_active=false will be set independently */
2676
2677 cnt = atomic_dec_return(&rport->act_ctrl_cnt);
2678 if (cnt == 0) {
2679 if (rport->remoteport.port_state == FC_OBJSTATE_DELETED)
2680 lport->ops->remoteport_delete(&rport->remoteport);
2681 nvme_fc_rport_inactive_on_lport(rport);
2682 }
2683
2684 return 0;
2685}
2686
2363/* 2687/*
2364 * This routine restarts the controller on the host side, and 2688 * This routine restarts the controller on the host side, and
2365 * on the link side, recreates the controller association. 2689 * on the link side, recreates the controller association.
@@ -2368,26 +2692,31 @@ static int
2368nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) 2692nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
2369{ 2693{
2370 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2694 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2371 u32 segs;
2372 int ret; 2695 int ret;
2373 bool changed; 2696 bool changed;
2374 2697
2375 ++ctrl->ctrl.nr_reconnects; 2698 ++ctrl->ctrl.nr_reconnects;
2376 2699
2700 if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
2701 return -ENODEV;
2702
2703 if (nvme_fc_ctlr_active_on_rport(ctrl))
2704 return -ENOTUNIQ;
2705
2377 /* 2706 /*
2378 * Create the admin queue 2707 * Create the admin queue
2379 */ 2708 */
2380 2709
2381 nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); 2710 nvme_fc_init_queue(ctrl, 0);
2382 2711
2383 ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, 2712 ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
2384 NVME_FC_AQ_BLKMQ_DEPTH); 2713 NVME_AQ_BLK_MQ_DEPTH);
2385 if (ret) 2714 if (ret)
2386 goto out_free_queue; 2715 goto out_free_queue;
2387 2716
2388 ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], 2717 ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
2389 NVME_FC_AQ_BLKMQ_DEPTH, 2718 NVME_AQ_BLK_MQ_DEPTH,
2390 (NVME_FC_AQ_BLKMQ_DEPTH / 4)); 2719 (NVME_AQ_BLK_MQ_DEPTH / 4));
2391 if (ret) 2720 if (ret)
2392 goto out_delete_hw_queue; 2721 goto out_delete_hw_queue;
2393 2722
@@ -2419,9 +2748,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
2419 if (ret) 2748 if (ret)
2420 goto out_disconnect_admin_queue; 2749 goto out_disconnect_admin_queue;
2421 2750
2422 segs = min_t(u32, NVME_FC_MAX_SEGMENTS, 2751 ctrl->ctrl.max_hw_sectors =
2423 ctrl->lport->ops->max_sgl_segments); 2752 (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
2424 ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
2425 2753
2426 ret = nvme_init_identify(&ctrl->ctrl); 2754 ret = nvme_init_identify(&ctrl->ctrl);
2427 if (ret) 2755 if (ret)
@@ -2465,11 +2793,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
2465 } 2793 }
2466 2794
2467 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 2795 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
2468 WARN_ON_ONCE(!changed);
2469 2796
2470 ctrl->ctrl.nr_reconnects = 0; 2797 ctrl->ctrl.nr_reconnects = 0;
2471 2798
2472 nvme_start_ctrl(&ctrl->ctrl); 2799 if (changed)
2800 nvme_start_ctrl(&ctrl->ctrl);
2473 2801
2474 return 0; /* Success */ 2802 return 0; /* Success */
2475 2803
@@ -2482,6 +2810,8 @@ out_delete_hw_queue:
2482 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); 2810 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
2483out_free_queue: 2811out_free_queue:
2484 nvme_fc_free_queue(&ctrl->queues[0]); 2812 nvme_fc_free_queue(&ctrl->queues[0]);
2813 ctrl->assoc_active = false;
2814 nvme_fc_ctlr_inactive_on_rport(ctrl);
2485 2815
2486 return ret; 2816 return ret;
2487} 2817}
@@ -2497,6 +2827,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
2497{ 2827{
2498 unsigned long flags; 2828 unsigned long flags;
2499 2829
2830 if (!ctrl->assoc_active)
2831 return;
2832 ctrl->assoc_active = false;
2833
2500 spin_lock_irqsave(&ctrl->lock, flags); 2834 spin_lock_irqsave(&ctrl->lock, flags);
2501 ctrl->flags |= FCCTRL_TERMIO; 2835 ctrl->flags |= FCCTRL_TERMIO;
2502 ctrl->iocnt = 0; 2836 ctrl->iocnt = 0;
@@ -2537,7 +2871,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
2537 * use blk_mq_tagset_busy_itr() and the transport routine to 2871 * use blk_mq_tagset_busy_itr() and the transport routine to
2538 * terminate the exchanges. 2872 * terminate the exchanges.
2539 */ 2873 */
2540 blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 2874 if (ctrl->ctrl.state != NVME_CTRL_NEW)
2875 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
2541 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 2876 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
2542 nvme_fc_terminate_exchange, &ctrl->ctrl); 2877 nvme_fc_terminate_exchange, &ctrl->ctrl);
2543 2878
@@ -2568,102 +2903,64 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
2568 2903
2569 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); 2904 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
2570 nvme_fc_free_queue(&ctrl->queues[0]); 2905 nvme_fc_free_queue(&ctrl->queues[0]);
2906
2907 nvme_fc_ctlr_inactive_on_rport(ctrl);
2571} 2908}
2572 2909
2573static void 2910static void
2574nvme_fc_delete_ctrl_work(struct work_struct *work) 2911nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
2575{ 2912{
2576 struct nvme_fc_ctrl *ctrl = 2913 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
2577 container_of(work, struct nvme_fc_ctrl, delete_work);
2578 2914
2579 cancel_work_sync(&ctrl->ctrl.reset_work);
2580 cancel_delayed_work_sync(&ctrl->connect_work); 2915 cancel_delayed_work_sync(&ctrl->connect_work);
2581 nvme_stop_ctrl(&ctrl->ctrl);
2582 nvme_remove_namespaces(&ctrl->ctrl);
2583 /* 2916 /*
2584 * kill the association on the link side. this will block 2917 * kill the association on the link side. this will block
2585 * waiting for io to terminate 2918 * waiting for io to terminate
2586 */ 2919 */
2587 nvme_fc_delete_association(ctrl); 2920 nvme_fc_delete_association(ctrl);
2588
2589 /*
2590 * tear down the controller
2591 * After the last reference on the nvme ctrl is removed,
2592 * the transport nvme_fc_nvme_ctrl_freed() callback will be
2593 * invoked. From there, the transport will tear down it's
2594 * logical queues and association.
2595 */
2596 nvme_uninit_ctrl(&ctrl->ctrl);
2597
2598 nvme_put_ctrl(&ctrl->ctrl);
2599}
2600
2601static bool
2602__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl)
2603{
2604 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
2605 return true;
2606
2607 if (!queue_work(nvme_wq, &ctrl->delete_work))
2608 return true;
2609
2610 return false;
2611}
2612
2613static int
2614__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
2615{
2616 return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0;
2617}
2618
2619/*
2620 * Request from nvme core layer to delete the controller
2621 */
2622static int
2623nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
2624{
2625 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
2626 int ret;
2627
2628 if (!kref_get_unless_zero(&ctrl->ctrl.kref))
2629 return -EBUSY;
2630
2631 ret = __nvme_fc_del_ctrl(ctrl);
2632
2633 if (!ret)
2634 flush_workqueue(nvme_wq);
2635
2636 nvme_put_ctrl(&ctrl->ctrl);
2637
2638 return ret;
2639} 2921}
2640 2922
2641static void 2923static void
2642nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) 2924nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
2643{ 2925{
2644 /* If we are resetting/deleting then do nothing */ 2926 struct nvme_fc_rport *rport = ctrl->rport;
2645 if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { 2927 struct nvme_fc_remote_port *portptr = &rport->remoteport;
2646 WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || 2928 unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
2647 ctrl->ctrl.state == NVME_CTRL_LIVE); 2929 bool recon = true;
2648 return;
2649 }
2650 2930
2651 dev_info(ctrl->ctrl.device, 2931 if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING)
2652 "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", 2932 return;
2653 ctrl->cnum, status);
2654 2933
2655 if (nvmf_should_reconnect(&ctrl->ctrl)) { 2934 if (portptr->port_state == FC_OBJSTATE_ONLINE)
2656 dev_info(ctrl->ctrl.device, 2935 dev_info(ctrl->ctrl.device,
2657 "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", 2936 "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
2658 ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); 2937 ctrl->cnum, status);
2659 queue_delayed_work(nvme_wq, &ctrl->connect_work, 2938 else if (time_after_eq(jiffies, rport->dev_loss_end))
2660 ctrl->ctrl.opts->reconnect_delay * HZ); 2939 recon = false;
2940
2941 if (recon && nvmf_should_reconnect(&ctrl->ctrl)) {
2942 if (portptr->port_state == FC_OBJSTATE_ONLINE)
2943 dev_info(ctrl->ctrl.device,
2944 "NVME-FC{%d}: Reconnect attempt in %ld "
2945 "seconds\n",
2946 ctrl->cnum, recon_delay / HZ);
2947 else if (time_after(jiffies + recon_delay, rport->dev_loss_end))
2948 recon_delay = rport->dev_loss_end - jiffies;
2949
2950 queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay);
2661 } else { 2951 } else {
2662 dev_warn(ctrl->ctrl.device, 2952 if (portptr->port_state == FC_OBJSTATE_ONLINE)
2953 dev_warn(ctrl->ctrl.device,
2663 "NVME-FC{%d}: Max reconnect attempts (%d) " 2954 "NVME-FC{%d}: Max reconnect attempts (%d) "
2664 "reached. Removing controller\n", 2955 "reached. Removing controller\n",
2665 ctrl->cnum, ctrl->ctrl.nr_reconnects); 2956 ctrl->cnum, ctrl->ctrl.nr_reconnects);
2666 WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); 2957 else
2958 dev_warn(ctrl->ctrl.device,
2959 "NVME-FC{%d}: dev_loss_tmo (%d) expired "
2960 "while waiting for remoteport connectivity. "
2961 "Removing controller\n", ctrl->cnum,
2962 portptr->dev_loss_tmo);
2963 WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
2667 } 2964 }
2668} 2965}
2669 2966
@@ -2675,15 +2972,28 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
2675 int ret; 2972 int ret;
2676 2973
2677 nvme_stop_ctrl(&ctrl->ctrl); 2974 nvme_stop_ctrl(&ctrl->ctrl);
2975
2678 /* will block while waiting for io to terminate */ 2976
2679 nvme_fc_delete_association(ctrl); 2977 nvme_fc_delete_association(ctrl);
2680 2978
2681 ret = nvme_fc_create_association(ctrl); 2979 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
2980 dev_err(ctrl->ctrl.device,
2981 "NVME-FC{%d}: error_recovery: Couldn't change state "
2982 "to RECONNECTING\n", ctrl->cnum);
2983 return;
2984 }
2985
2986 if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
2987 ret = nvme_fc_create_association(ctrl);
2988 else
2989 ret = -ENOTCONN;
2990
2682 if (ret) 2991 if (ret)
2683 nvme_fc_reconnect_or_delete(ctrl, ret); 2992 nvme_fc_reconnect_or_delete(ctrl, ret);
2684 else 2993 else
2685 dev_info(ctrl->ctrl.device, 2994 dev_info(ctrl->ctrl.device,
2686 "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); 2995 "NVME-FC{%d}: controller reset complete\n",
2996 ctrl->cnum);
2687} 2997}
2688 2998
2689static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { 2999static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
@@ -2695,8 +3005,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
2695 .reg_write32 = nvmf_reg_write32, 3005 .reg_write32 = nvmf_reg_write32,
2696 .free_ctrl = nvme_fc_nvme_ctrl_freed, 3006 .free_ctrl = nvme_fc_nvme_ctrl_freed,
2697 .submit_async_event = nvme_fc_submit_async_event, 3007 .submit_async_event = nvme_fc_submit_async_event,
2698 .delete_ctrl = nvme_fc_del_nvme_ctrl, 3008 .delete_ctrl = nvme_fc_delete_ctrl,
2699 .get_address = nvmf_get_address, 3009 .get_address = nvmf_get_address,
3010 .reinit_request = nvme_fc_reinit_request,
2700}; 3011};
2701 3012
2702static void 3013static void
@@ -2728,6 +3039,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
2728}; 3039};
2729 3040
2730 3041
3042/*
3043 * Fails a controller request if it matches an existing controller
3044 * (association) with the same tuple:
3045 * <Host NQN, Host ID, local FC port, remote FC port, SUBSYS NQN>
3046 *
3047 * The ports don't need to be compared as they are intrinsically
3048 * already matched by the port pointers supplied.
3049 */
3050static bool
3051nvme_fc_existing_controller(struct nvme_fc_rport *rport,
3052 struct nvmf_ctrl_options *opts)
3053{
3054 struct nvme_fc_ctrl *ctrl;
3055 unsigned long flags;
3056 bool found = false;
3057
3058 spin_lock_irqsave(&rport->lock, flags);
3059 list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
3060 found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts);
3061 if (found)
3062 break;
3063 }
3064 spin_unlock_irqrestore(&rport->lock, flags);
3065
3066 return found;
3067}
3068
2731static struct nvme_ctrl * 3069static struct nvme_ctrl *
2732nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, 3070nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2733 struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) 3071 struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
@@ -2742,6 +3080,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2742 goto out_fail; 3080 goto out_fail;
2743 } 3081 }
2744 3082
3083 if (!opts->duplicate_connect &&
3084 nvme_fc_existing_controller(rport, opts)) {
3085 ret = -EALREADY;
3086 goto out_fail;
3087 }
3088
2745 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); 3089 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2746 if (!ctrl) { 3090 if (!ctrl) {
2747 ret = -ENOMEM; 3091 ret = -ENOMEM;
@@ -2760,12 +3104,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2760 ctrl->rport = rport; 3104 ctrl->rport = rport;
2761 ctrl->dev = lport->dev; 3105 ctrl->dev = lport->dev;
2762 ctrl->cnum = idx; 3106 ctrl->cnum = idx;
3107 ctrl->assoc_active = false;
2763 init_waitqueue_head(&ctrl->ioabort_wait); 3108 init_waitqueue_head(&ctrl->ioabort_wait);
2764 3109
2765 get_device(ctrl->dev); 3110 get_device(ctrl->dev);
2766 kref_init(&ctrl->ref); 3111 kref_init(&ctrl->ref);
2767 3112
2768 INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
2769 INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); 3113 INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
2770 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); 3114 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
2771 spin_lock_init(&ctrl->lock); 3115 spin_lock_init(&ctrl->lock);
@@ -2787,7 +3131,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2787 3131
2788 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); 3132 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
2789 ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; 3133 ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
2790 ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; 3134 ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
2791 ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ 3135 ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
2792 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; 3136 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
2793 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + 3137 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
@@ -2797,6 +3141,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2797 ctrl->admin_tag_set.driver_data = ctrl; 3141 ctrl->admin_tag_set.driver_data = ctrl;
2798 ctrl->admin_tag_set.nr_hw_queues = 1; 3142 ctrl->admin_tag_set.nr_hw_queues = 1;
2799 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; 3143 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
3144 ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
2800 3145
2801 ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); 3146 ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
2802 if (ret) 3147 if (ret)
@@ -2878,7 +3223,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
2878 return ERR_PTR(ret); 3223 return ERR_PTR(ret);
2879 } 3224 }
2880 3225
2881 kref_get(&ctrl->ctrl.kref); 3226 nvme_get_ctrl(&ctrl->ctrl);
2882 3227
2883 dev_info(ctrl->ctrl.device, 3228 dev_info(ctrl->ctrl.device,
2884 "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", 3229 "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
@@ -3026,7 +3371,50 @@ static struct nvmf_transport_ops nvme_fc_transport = {
3026 3371
3027static int __init nvme_fc_init_module(void) 3372static int __init nvme_fc_init_module(void)
3028{ 3373{
3029 return nvmf_register_transport(&nvme_fc_transport); 3374 int ret;
3375
3376 /*
3377 * NOTE:
3378 * It is expected that in the future the kernel will combine
3379 * the FC-isms that are currently under scsi and now being
3380 * added to by NVME into a new standalone FC class. The SCSI
3381 * and NVME protocols and their devices would be under this
3382 * new FC class.
3383 *
3384 * As we need something to post FC-specific udev events to,
3385 * specifically for nvme probe events, start by creating the
3386 * new device class. When the new standalone FC class is
3387 * put in place, this code will move to a more generic
3388 * location for the class.
3389 */
3390 fc_class = class_create(THIS_MODULE, "fc");
3391 if (IS_ERR(fc_class)) {
3392 pr_err("couldn't register class fc\n");
3393 return PTR_ERR(fc_class);
3394 }
3395
3396 /*
3397 * Create a device for the FC-centric udev events
3398 */
3399 fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL,
3400 "fc_udev_device");
3401 if (IS_ERR(fc_udev_device)) {
3402 pr_err("couldn't create fc_udev device!\n");
3403 ret = PTR_ERR(fc_udev_device);
3404 goto out_destroy_class;
3405 }
3406
3407 ret = nvmf_register_transport(&nvme_fc_transport);
3408 if (ret)
3409 goto out_destroy_device;
3410
3411 return 0;
3412
3413out_destroy_device:
3414 device_destroy(fc_class, MKDEV(0, 0));
3415out_destroy_class:
3416 class_destroy(fc_class);
3417 return ret;
3030} 3418}
3031 3419
3032static void __exit nvme_fc_exit_module(void) 3420static void __exit nvme_fc_exit_module(void)
@@ -3039,6 +3427,9 @@ static void __exit nvme_fc_exit_module(void)
3039 3427
3040 ida_destroy(&nvme_fc_local_port_cnt); 3428 ida_destroy(&nvme_fc_local_port_cnt);
3041 ida_destroy(&nvme_fc_ctrl_cnt); 3429 ida_destroy(&nvme_fc_ctrl_cnt);
3430
3431 device_destroy(fc_class, MKDEV(0, 0));
3432 class_destroy(fc_class);
3042} 3433}
3043 3434
3044module_init(nvme_fc_init_module); 3435module_init(nvme_fc_init_module);
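For orientation, the reworked nvme_fc_reconnect_or_delete() above clamps the next reconnect delay so the final attempt still lands inside the remoteport's dev_loss window. Below is a minimal standalone sketch of that clamping arithmetic, not kernel code: next_reconnect_delay(), now, hz, reconnect_delay_s and dev_loss_end are hypothetical stand-ins for jiffies, HZ, opts->reconnect_delay and rport->dev_loss_end, and the plain comparison ignores the jiffies wraparound that time_after() handles.

#include <stdio.h>

/* Compute the delay before the next reconnect attempt, shortened so it
 * does not run past the dev_loss deadline. */
static unsigned long next_reconnect_delay(unsigned long now, unsigned long hz,
					  unsigned long reconnect_delay_s,
					  unsigned long dev_loss_end)
{
	unsigned long delay = reconnect_delay_s * hz;

	/* If the regular delay would overshoot dev_loss_end, clamp it so the
	 * last reconnect attempt still happens inside the dev_loss window. */
	if (now + delay > dev_loss_end)
		delay = dev_loss_end - now;

	return delay;
}

int main(void)
{
	/* 10s reconnect delay, 100 ticks per second, only 4s left until
	 * dev_loss_end: prints 400, i.e. the delay is cut to the window. */
	printf("next delay: %lu jiffies\n",
	       next_reconnect_delay(1000, 100, 10, 1400));
	return 0;
}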
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 1f79e3f141e6..ba3d7f3349e5 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
305 int ret; 305 int ret;
306 306
307 c.identity.opcode = nvme_nvm_admin_identity; 307 c.identity.opcode = nvme_nvm_admin_identity;
308 c.identity.nsid = cpu_to_le32(ns->ns_id); 308 c.identity.nsid = cpu_to_le32(ns->head->ns_id);
309 c.identity.chnl_off = 0; 309 c.identity.chnl_off = 0;
310 310
311 nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); 311 nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL);
@@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
344 int ret = 0; 344 int ret = 0;
345 345
346 c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; 346 c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
347 c.l2p.nsid = cpu_to_le32(ns->ns_id); 347 c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
348 entries = kmalloc(len, GFP_KERNEL); 348 entries = kmalloc(len, GFP_KERNEL);
349 if (!entries) 349 if (!entries)
350 return -ENOMEM; 350 return -ENOMEM;
@@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
402 int ret = 0; 402 int ret = 0;
403 403
404 c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; 404 c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl;
405 c.get_bb.nsid = cpu_to_le32(ns->ns_id); 405 c.get_bb.nsid = cpu_to_le32(ns->head->ns_id);
406 c.get_bb.spba = cpu_to_le64(ppa.ppa); 406 c.get_bb.spba = cpu_to_le64(ppa.ppa);
407 407
408 bb_tbl = kzalloc(tblsz, GFP_KERNEL); 408 bb_tbl = kzalloc(tblsz, GFP_KERNEL);
@@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
452 int ret = 0; 452 int ret = 0;
453 453
454 c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; 454 c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl;
455 c.set_bb.nsid = cpu_to_le32(ns->ns_id); 455 c.set_bb.nsid = cpu_to_le32(ns->head->ns_id);
456 c.set_bb.spba = cpu_to_le64(ppas->ppa); 456 c.set_bb.spba = cpu_to_le64(ppas->ppa);
457 c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); 457 c.set_bb.nlb = cpu_to_le16(nr_ppas - 1);
458 c.set_bb.value = type; 458 c.set_bb.value = type;
@@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
469 struct nvme_nvm_command *c) 469 struct nvme_nvm_command *c)
470{ 470{
471 c->ph_rw.opcode = rqd->opcode; 471 c->ph_rw.opcode = rqd->opcode;
472 c->ph_rw.nsid = cpu_to_le32(ns->ns_id); 472 c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
473 c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); 473 c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
474 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); 474 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
475 c->ph_rw.control = cpu_to_le16(rqd->flags); 475 c->ph_rw.control = cpu_to_le16(rqd->flags);
@@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
492 blk_mq_free_request(rq); 492 blk_mq_free_request(rq);
493} 493}
494 494
495static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) 495static struct request *nvme_nvm_alloc_request(struct request_queue *q,
496 struct nvm_rq *rqd,
497 struct nvme_nvm_command *cmd)
496{ 498{
497 struct request_queue *q = dev->q;
498 struct nvme_ns *ns = q->queuedata; 499 struct nvme_ns *ns = q->queuedata;
499 struct request *rq; 500 struct request *rq;
500 struct bio *bio = rqd->bio;
501 struct nvme_nvm_command *cmd;
502
503 cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
504 if (!cmd)
505 return -ENOMEM;
506 501
507 nvme_nvm_rqtocmd(rqd, ns, cmd); 502 nvme_nvm_rqtocmd(rqd, ns, cmd);
508 503
509 rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); 504 rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY);
510 if (IS_ERR(rq)) { 505 if (IS_ERR(rq))
511 kfree(cmd); 506 return rq;
512 return PTR_ERR(rq); 507
513 }
514 rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; 508 rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
515 509
516 if (bio) { 510 if (rqd->bio) {
517 blk_init_request_from_bio(rq, bio); 511 blk_init_request_from_bio(rq, rqd->bio);
518 } else { 512 } else {
519 rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); 513 rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
520 rq->__data_len = 0; 514 rq->__data_len = 0;
521 } 515 }
522 516
517 return rq;
518}
519
520static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
521{
522 struct request_queue *q = dev->q;
523 struct nvme_nvm_command *cmd;
524 struct request *rq;
525
526 cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
527 if (!cmd)
528 return -ENOMEM;
529
530 rq = nvme_nvm_alloc_request(q, rqd, cmd);
531 if (IS_ERR(rq)) {
532 kfree(cmd);
533 return PTR_ERR(rq);
534 }
535
523 rq->end_io_data = rqd; 536 rq->end_io_data = rqd;
524 537
525 blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); 538 blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
@@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
527 return 0; 540 return 0;
528} 541}
529 542
543static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
544{
545 struct request_queue *q = dev->q;
546 struct request *rq;
547 struct nvme_nvm_command cmd;
548 int ret = 0;
549
550 memset(&cmd, 0, sizeof(struct nvme_nvm_command));
551
552 rq = nvme_nvm_alloc_request(q, rqd, &cmd);
553 if (IS_ERR(rq))
554 return PTR_ERR(rq);
555
556 /* I/Os can fail and the error is signaled through rqd. Callers must
557 * handle the error accordingly.
558 */
559 blk_execute_rq(q, NULL, rq, 0);
560 if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
561 ret = -EINTR;
562
563 rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
564 rqd->error = nvme_req(rq)->status;
565
566 blk_mq_free_request(rq);
567
568 return ret;
569}
570
530static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) 571static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
531{ 572{
532 struct nvme_ns *ns = nvmdev->q->queuedata; 573 struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
562 .set_bb_tbl = nvme_nvm_set_bb_tbl, 603 .set_bb_tbl = nvme_nvm_set_bb_tbl,
563 604
564 .submit_io = nvme_nvm_submit_io, 605 .submit_io = nvme_nvm_submit_io,
606 .submit_io_sync = nvme_nvm_submit_io_sync,
565 607
566 .create_dma_pool = nvme_nvm_create_dma_pool, 608 .create_dma_pool = nvme_nvm_create_dma_pool,
567 .destroy_dma_pool = nvme_nvm_destroy_dma_pool, 609 .destroy_dma_pool = nvme_nvm_destroy_dma_pool,
@@ -600,8 +642,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
600 642
601 rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; 643 rq->timeout = timeout ? timeout : ADMIN_TIMEOUT;
602 644
603 rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
604
605 if (ppa_buf && ppa_len) { 645 if (ppa_buf && ppa_len) {
606 ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); 646 ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma);
607 if (!ppa_list) { 647 if (!ppa_list) {
@@ -691,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns,
691 731
692 memset(&c, 0, sizeof(c)); 732 memset(&c, 0, sizeof(c));
693 c.ph_rw.opcode = vio.opcode; 733 c.ph_rw.opcode = vio.opcode;
694 c.ph_rw.nsid = cpu_to_le32(ns->ns_id); 734 c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
695 c.ph_rw.control = cpu_to_le16(vio.control); 735 c.ph_rw.control = cpu_to_le16(vio.control);
696 c.ph_rw.length = cpu_to_le16(vio.nppas); 736 c.ph_rw.length = cpu_to_le16(vio.nppas);
697 737
@@ -728,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
728 768
729 memset(&c, 0, sizeof(c)); 769 memset(&c, 0, sizeof(c));
730 c.common.opcode = vcmd.opcode; 770 c.common.opcode = vcmd.opcode;
731 c.common.nsid = cpu_to_le32(ns->ns_id); 771 c.common.nsid = cpu_to_le32(ns->head->ns_id);
732 c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); 772 c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2);
733 c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); 773 c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
734 /* cdw11-12 */ 774 /* cdw11-12 */
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
new file mode 100644
index 000000000000..78d92151a904
--- /dev/null
+++ b/drivers/nvme/host/multipath.c
@@ -0,0 +1,291 @@
1/*
2 * Copyright (c) 2017 Christoph Hellwig.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <linux/moduleparam.h>
15#include "nvme.h"
16
17static bool multipath = true;
18module_param(multipath, bool, 0644);
19MODULE_PARM_DESC(multipath,
20 "turn on native support for multiple controllers per subsystem");
21
22void nvme_failover_req(struct request *req)
23{
24 struct nvme_ns *ns = req->q->queuedata;
25 unsigned long flags;
26
27 spin_lock_irqsave(&ns->head->requeue_lock, flags);
28 blk_steal_bios(&ns->head->requeue_list, req);
29 spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
30 blk_mq_end_request(req, 0);
31
32 nvme_reset_ctrl(ns->ctrl);
33 kblockd_schedule_work(&ns->head->requeue_work);
34}
35
36bool nvme_req_needs_failover(struct request *req)
37{
38 if (!(req->cmd_flags & REQ_NVME_MPATH))
39 return false;
40
41 switch (nvme_req(req)->status & 0x7ff) {
42 /*
43 * Generic command status:
44 */
45 case NVME_SC_INVALID_OPCODE:
46 case NVME_SC_INVALID_FIELD:
47 case NVME_SC_INVALID_NS:
48 case NVME_SC_LBA_RANGE:
49 case NVME_SC_CAP_EXCEEDED:
50 case NVME_SC_RESERVATION_CONFLICT:
51 return false;
52
53 /*
54 * I/O command set specific error. Unfortunately these values are
55 * reused for fabrics commands, but those should never get here.
56 */
57 case NVME_SC_BAD_ATTRIBUTES:
58 case NVME_SC_INVALID_PI:
59 case NVME_SC_READ_ONLY:
60 case NVME_SC_ONCS_NOT_SUPPORTED:
61 WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
62 nvme_fabrics_command);
63 return false;
64
65 /*
66 * Media and Data Integrity Errors:
67 */
68 case NVME_SC_WRITE_FAULT:
69 case NVME_SC_READ_ERROR:
70 case NVME_SC_GUARD_CHECK:
71 case NVME_SC_APPTAG_CHECK:
72 case NVME_SC_REFTAG_CHECK:
73 case NVME_SC_COMPARE_FAILED:
74 case NVME_SC_ACCESS_DENIED:
75 case NVME_SC_UNWRITTEN_BLOCK:
76 return false;
77 }
78
79 /* Everything else could be a path failure, so should be retried */
80 return true;
81}
82
83void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
84{
85 struct nvme_ns *ns;
86
87 mutex_lock(&ctrl->namespaces_mutex);
88 list_for_each_entry(ns, &ctrl->namespaces, list) {
89 if (ns->head->disk)
90 kblockd_schedule_work(&ns->head->requeue_work);
91 }
92 mutex_unlock(&ctrl->namespaces_mutex);
93}
94
95static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
96{
97 struct nvme_ns *ns;
98
99 list_for_each_entry_rcu(ns, &head->list, siblings) {
100 if (ns->ctrl->state == NVME_CTRL_LIVE) {
101 rcu_assign_pointer(head->current_path, ns);
102 return ns;
103 }
104 }
105
106 return NULL;
107}
108
109inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
110{
111 struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
112
113 if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
114 ns = __nvme_find_path(head);
115 return ns;
116}
117
118static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
119 struct bio *bio)
120{
121 struct nvme_ns_head *head = q->queuedata;
122 struct device *dev = disk_to_dev(head->disk);
123 struct nvme_ns *ns;
124 blk_qc_t ret = BLK_QC_T_NONE;
125 int srcu_idx;
126
127 srcu_idx = srcu_read_lock(&head->srcu);
128 ns = nvme_find_path(head);
129 if (likely(ns)) {
130 bio->bi_disk = ns->disk;
131 bio->bi_opf |= REQ_NVME_MPATH;
132 ret = direct_make_request(bio);
133 } else if (!list_empty_careful(&head->list)) {
 134 dev_warn_ratelimited(dev, "no path available - requeueing I/O\n");
135
136 spin_lock_irq(&head->requeue_lock);
137 bio_list_add(&head->requeue_list, bio);
138 spin_unlock_irq(&head->requeue_lock);
139 } else {
140 dev_warn_ratelimited(dev, "no path - failing I/O\n");
141
142 bio->bi_status = BLK_STS_IOERR;
143 bio_endio(bio);
144 }
145
146 srcu_read_unlock(&head->srcu, srcu_idx);
147 return ret;
148}
149
150static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
151{
152 struct nvme_ns_head *head = q->queuedata;
153 struct nvme_ns *ns;
154 bool found = false;
155 int srcu_idx;
156
157 srcu_idx = srcu_read_lock(&head->srcu);
158 ns = srcu_dereference(head->current_path, &head->srcu);
159 if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
160 found = ns->queue->poll_fn(q, qc);
161 srcu_read_unlock(&head->srcu, srcu_idx);
162 return found;
163}
164
165static void nvme_requeue_work(struct work_struct *work)
166{
167 struct nvme_ns_head *head =
168 container_of(work, struct nvme_ns_head, requeue_work);
169 struct bio *bio, *next;
170
171 spin_lock_irq(&head->requeue_lock);
172 next = bio_list_get(&head->requeue_list);
173 spin_unlock_irq(&head->requeue_lock);
174
175 while ((bio = next) != NULL) {
176 next = bio->bi_next;
177 bio->bi_next = NULL;
178
179 /*
180 * Reset disk to the mpath node and resubmit to select a new
181 * path.
182 */
183 bio->bi_disk = head->disk;
184 generic_make_request(bio);
185 }
186}
187
188int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
189{
190 struct request_queue *q;
191 bool vwc = false;
192
193 bio_list_init(&head->requeue_list);
194 spin_lock_init(&head->requeue_lock);
195 INIT_WORK(&head->requeue_work, nvme_requeue_work);
196
197 /*
 198 * Add a multipath node if the subsystem supports multiple controllers.
199 * We also do this for private namespaces as the namespace sharing data could
200 * change after a rescan.
201 */
202 if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
203 return 0;
204
205 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
206 if (!q)
207 goto out;
208 q->queuedata = head;
209 blk_queue_make_request(q, nvme_ns_head_make_request);
210 q->poll_fn = nvme_ns_head_poll;
211 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 212 /* set to a default value of 512 until the disk is validated */
213 blk_queue_logical_block_size(q, 512);
214
 215 /* we need to propagate up the VWC settings */
216 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
217 vwc = true;
218 blk_queue_write_cache(q, vwc, vwc);
219
220 head->disk = alloc_disk(0);
221 if (!head->disk)
222 goto out_cleanup_queue;
223 head->disk->fops = &nvme_ns_head_ops;
224 head->disk->private_data = head;
225 head->disk->queue = q;
226 head->disk->flags = GENHD_FL_EXT_DEVT;
227 sprintf(head->disk->disk_name, "nvme%dn%d",
228 ctrl->subsys->instance, head->instance);
229 return 0;
230
231out_cleanup_queue:
232 blk_cleanup_queue(q);
233out:
234 return -ENOMEM;
235}
236
237void nvme_mpath_add_disk(struct nvme_ns_head *head)
238{
239 if (!head->disk)
240 return;
241 device_add_disk(&head->subsys->dev, head->disk);
242 if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
243 &nvme_ns_id_attr_group))
244 pr_warn("%s: failed to create sysfs group for identification\n",
245 head->disk->disk_name);
246}
247
248void nvme_mpath_add_disk_links(struct nvme_ns *ns)
249{
250 struct kobject *slave_disk_kobj, *holder_disk_kobj;
251
252 if (!ns->head->disk)
253 return;
254
255 slave_disk_kobj = &disk_to_dev(ns->disk)->kobj;
256 if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj,
257 kobject_name(slave_disk_kobj)))
258 return;
259
260 holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj;
261 if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj,
262 kobject_name(holder_disk_kobj)))
263 sysfs_remove_link(ns->head->disk->slave_dir,
264 kobject_name(slave_disk_kobj));
265}
266
267void nvme_mpath_remove_disk(struct nvme_ns_head *head)
268{
269 if (!head->disk)
270 return;
271 sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
272 &nvme_ns_id_attr_group);
273 del_gendisk(head->disk);
274 blk_set_queue_dying(head->disk->queue);
275 /* make sure all pending bios are cleaned up */
276 kblockd_schedule_work(&head->requeue_work);
277 flush_work(&head->requeue_work);
278 blk_cleanup_queue(head->disk->queue);
279 put_disk(head->disk);
280}
281
282void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
283{
284 if (!ns->head->disk)
285 return;
286
287 sysfs_remove_link(ns->disk->part0.holder_dir,
288 kobject_name(&disk_to_dev(ns->head->disk)->kobj));
289 sysfs_remove_link(ns->head->disk->slave_dir,
290 kobject_name(&disk_to_dev(ns->disk)->kobj));
291}
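As a side note on the new multipath code above, its path selection boils down to: reuse the cached path if its controller is LIVE, otherwise take the first LIVE sibling, otherwise requeue the bio until a path comes back. A standalone sketch of that policy follows; it is plain C rather than kernel code, and path, path_state, pick_path and the device names are hypothetical stand-ins for struct nvme_ns, ctrl->state, nvme_find_path() and the per-controller block nodes.

#include <stdio.h>
#include <stddef.h>

enum path_state { PATH_LIVE, PATH_RESETTING, PATH_DELETED };

struct path {
	const char *name;
	enum path_state state;
};

/* Return the first live path, in the spirit of __nvme_find_path(): the
 * multipath node does not load-balance, it simply routes I/O to the first
 * controller that is currently LIVE. */
static struct path *pick_path(struct path *paths, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (paths[i].state == PATH_LIVE)
			return &paths[i];

	return NULL;	/* no usable path: the bio would be requeued */
}

int main(void)
{
	struct path paths[] = {
		{ "path0", PATH_RESETTING },
		{ "path1", PATH_LIVE },
	};
	struct path *p = pick_path(paths, 2);

	printf("I/O routed via %s\n", p ? p->name : "none (requeue)");
	return 0;
}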
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d3f3c4447515..c0873a68872f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -15,16 +15,17 @@
15#define _NVME_H 15#define _NVME_H
16 16
17#include <linux/nvme.h> 17#include <linux/nvme.h>
18#include <linux/cdev.h>
18#include <linux/pci.h> 19#include <linux/pci.h>
19#include <linux/kref.h> 20#include <linux/kref.h>
20#include <linux/blk-mq.h> 21#include <linux/blk-mq.h>
21#include <linux/lightnvm.h> 22#include <linux/lightnvm.h>
22#include <linux/sed-opal.h> 23#include <linux/sed-opal.h>
23 24
24extern unsigned char nvme_io_timeout; 25extern unsigned int nvme_io_timeout;
25#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) 26#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
26 27
27extern unsigned char admin_timeout; 28extern unsigned int admin_timeout;
28#define ADMIN_TIMEOUT (admin_timeout * HZ) 29#define ADMIN_TIMEOUT (admin_timeout * HZ)
29 30
30#define NVME_DEFAULT_KATO 5 31#define NVME_DEFAULT_KATO 5
@@ -94,6 +95,11 @@ struct nvme_request {
94 u16 status; 95 u16 status;
95}; 96};
96 97
98/*
99 * Mark a bio as coming in through the mpath node.
100 */
101#define REQ_NVME_MPATH REQ_DRV
102
97enum { 103enum {
98 NVME_REQ_CANCELLED = (1 << 0), 104 NVME_REQ_CANCELLED = (1 << 0),
99}; 105};
@@ -127,24 +133,23 @@ struct nvme_ctrl {
127 struct request_queue *admin_q; 133 struct request_queue *admin_q;
128 struct request_queue *connect_q; 134 struct request_queue *connect_q;
129 struct device *dev; 135 struct device *dev;
130 struct kref kref;
131 int instance; 136 int instance;
132 struct blk_mq_tag_set *tagset; 137 struct blk_mq_tag_set *tagset;
133 struct blk_mq_tag_set *admin_tagset; 138 struct blk_mq_tag_set *admin_tagset;
134 struct list_head namespaces; 139 struct list_head namespaces;
135 struct mutex namespaces_mutex; 140 struct mutex namespaces_mutex;
141 struct device ctrl_device;
136 struct device *device; /* char device */ 142 struct device *device; /* char device */
137 struct list_head node; 143 struct cdev cdev;
138 struct ida ns_ida;
139 struct work_struct reset_work; 144 struct work_struct reset_work;
145 struct work_struct delete_work;
146
147 struct nvme_subsystem *subsys;
148 struct list_head subsys_entry;
140 149
141 struct opal_dev *opal_dev; 150 struct opal_dev *opal_dev;
142 151
143 char name[12]; 152 char name[12];
144 char serial[20];
145 char model[40];
146 char firmware_rev[8];
147 char subnqn[NVMF_NQN_SIZE];
148 u16 cntlid; 153 u16 cntlid;
149 154
150 u32 ctrl_config; 155 u32 ctrl_config;
@@ -155,23 +160,23 @@ struct nvme_ctrl {
155 u32 page_size; 160 u32 page_size;
156 u32 max_hw_sectors; 161 u32 max_hw_sectors;
157 u16 oncs; 162 u16 oncs;
158 u16 vid;
159 u16 oacs; 163 u16 oacs;
160 u16 nssa; 164 u16 nssa;
161 u16 nr_streams; 165 u16 nr_streams;
162 atomic_t abort_limit; 166 atomic_t abort_limit;
163 u8 event_limit;
164 u8 vwc; 167 u8 vwc;
165 u32 vs; 168 u32 vs;
166 u32 sgls; 169 u32 sgls;
167 u16 kas; 170 u16 kas;
168 u8 npss; 171 u8 npss;
169 u8 apsta; 172 u8 apsta;
173 u32 aen_result;
170 unsigned int shutdown_timeout; 174 unsigned int shutdown_timeout;
171 unsigned int kato; 175 unsigned int kato;
172 bool subsystem; 176 bool subsystem;
173 unsigned long quirks; 177 unsigned long quirks;
174 struct nvme_id_power_state psd[32]; 178 struct nvme_id_power_state psd[32];
179 struct nvme_effects_log *effects;
175 struct work_struct scan_work; 180 struct work_struct scan_work;
176 struct work_struct async_event_work; 181 struct work_struct async_event_work;
177 struct delayed_work ka_work; 182 struct delayed_work ka_work;
@@ -197,21 +202,72 @@ struct nvme_ctrl {
197 struct nvmf_ctrl_options *opts; 202 struct nvmf_ctrl_options *opts;
198}; 203};
199 204
205struct nvme_subsystem {
206 int instance;
207 struct device dev;
208 /*
209 * Because we unregister the device on the last put we need
210 * a separate refcount.
211 */
212 struct kref ref;
213 struct list_head entry;
214 struct mutex lock;
215 struct list_head ctrls;
216 struct list_head nsheads;
217 char subnqn[NVMF_NQN_SIZE];
218 char serial[20];
219 char model[40];
220 char firmware_rev[8];
221 u8 cmic;
222 u16 vendor_id;
223 struct ida ns_ida;
224};
225
226/*
 227 * Container structure for unique namespace identifiers.
228 */
229struct nvme_ns_ids {
230 u8 eui64[8];
231 u8 nguid[16];
232 uuid_t uuid;
233};
234
235/*
236 * Anchor structure for namespaces. There is one for each namespace in a
237 * NVMe subsystem that any of our controllers can see, and the namespace
 238 * structure for each controller is chained off it. For private
 239 * namespaces there is a 1:1 relation to our namespace structures,
 240 * that is, ->list only ever has a single entry.
241 */
242struct nvme_ns_head {
243#ifdef CONFIG_NVME_MULTIPATH
244 struct gendisk *disk;
245 struct nvme_ns __rcu *current_path;
246 struct bio_list requeue_list;
247 spinlock_t requeue_lock;
248 struct work_struct requeue_work;
249#endif
250 struct list_head list;
251 struct srcu_struct srcu;
252 struct nvme_subsystem *subsys;
253 unsigned ns_id;
254 struct nvme_ns_ids ids;
255 struct list_head entry;
256 struct kref ref;
257 int instance;
258};
259
200struct nvme_ns { 260struct nvme_ns {
201 struct list_head list; 261 struct list_head list;
202 262
203 struct nvme_ctrl *ctrl; 263 struct nvme_ctrl *ctrl;
204 struct request_queue *queue; 264 struct request_queue *queue;
205 struct gendisk *disk; 265 struct gendisk *disk;
266 struct list_head siblings;
206 struct nvm_dev *ndev; 267 struct nvm_dev *ndev;
207 struct kref kref; 268 struct kref kref;
208 int instance; 269 struct nvme_ns_head *head;
209 270
210 u8 eui[8];
211 u8 nguid[16];
212 uuid_t uuid;
213
214 unsigned ns_id;
215 int lba_shift; 271 int lba_shift;
216 u16 ms; 272 u16 ms;
217 u16 sgs; 273 u16 sgs;
@@ -234,9 +290,10 @@ struct nvme_ctrl_ops {
234 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); 290 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
235 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); 291 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
236 void (*free_ctrl)(struct nvme_ctrl *ctrl); 292 void (*free_ctrl)(struct nvme_ctrl *ctrl);
237 void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); 293 void (*submit_async_event)(struct nvme_ctrl *ctrl);
238 int (*delete_ctrl)(struct nvme_ctrl *ctrl); 294 void (*delete_ctrl)(struct nvme_ctrl *ctrl);
239 int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); 295 int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
296 int (*reinit_request)(void *data, struct request *rq);
240}; 297};
241 298
242static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) 299static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
@@ -278,6 +335,16 @@ static inline void nvme_end_request(struct request *req, __le16 status,
278 blk_mq_complete_request(req); 335 blk_mq_complete_request(req);
279} 336}
280 337
338static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl)
339{
340 get_device(ctrl->device);
341}
342
343static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
344{
345 put_device(ctrl->device);
346}
347
281void nvme_complete_rq(struct request *req); 348void nvme_complete_rq(struct request *req);
282void nvme_cancel_request(struct request *req, void *data, bool reserved); 349void nvme_cancel_request(struct request *req, void *data, bool reserved);
283bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 350bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
@@ -299,10 +366,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
299int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, 366int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
300 bool send); 367 bool send);
301 368
302#define NVME_NR_AERS 1
303void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, 369void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
304 union nvme_result *res); 370 union nvme_result *res);
305void nvme_queue_async_events(struct nvme_ctrl *ctrl);
306 371
307void nvme_stop_queues(struct nvme_ctrl *ctrl); 372void nvme_stop_queues(struct nvme_ctrl *ctrl);
308void nvme_start_queues(struct nvme_ctrl *ctrl); 373void nvme_start_queues(struct nvme_ctrl *ctrl);
@@ -311,21 +376,79 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl);
311void nvme_wait_freeze(struct nvme_ctrl *ctrl); 376void nvme_wait_freeze(struct nvme_ctrl *ctrl);
312void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); 377void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
313void nvme_start_freeze(struct nvme_ctrl *ctrl); 378void nvme_start_freeze(struct nvme_ctrl *ctrl);
379int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set);
314 380
315#define NVME_QID_ANY -1 381#define NVME_QID_ANY -1
316struct request *nvme_alloc_request(struct request_queue *q, 382struct request *nvme_alloc_request(struct request_queue *q,
317 struct nvme_command *cmd, unsigned int flags, int qid); 383 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
318blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 384blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
319 struct nvme_command *cmd); 385 struct nvme_command *cmd);
320int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 386int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
321 void *buf, unsigned bufflen); 387 void *buf, unsigned bufflen);
322int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 388int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
323 union nvme_result *result, void *buffer, unsigned bufflen, 389 union nvme_result *result, void *buffer, unsigned bufflen,
324 unsigned timeout, int qid, int at_head, int flags); 390 unsigned timeout, int qid, int at_head,
391 blk_mq_req_flags_t flags);
325int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); 392int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
326void nvme_start_keep_alive(struct nvme_ctrl *ctrl); 393void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
327void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 394void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
328int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 395int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
396int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
397int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
398
399extern const struct attribute_group nvme_ns_id_attr_group;
400extern const struct block_device_operations nvme_ns_head_ops;
401
402#ifdef CONFIG_NVME_MULTIPATH
403void nvme_failover_req(struct request *req);
404bool nvme_req_needs_failover(struct request *req);
405void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
406int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head);
407void nvme_mpath_add_disk(struct nvme_ns_head *head);
408void nvme_mpath_add_disk_links(struct nvme_ns *ns);
409void nvme_mpath_remove_disk(struct nvme_ns_head *head);
410void nvme_mpath_remove_disk_links(struct nvme_ns *ns);
411
412static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
413{
414 struct nvme_ns_head *head = ns->head;
415
416 if (head && ns == srcu_dereference(head->current_path, &head->srcu))
417 rcu_assign_pointer(head->current_path, NULL);
418}
419struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
420#else
421static inline void nvme_failover_req(struct request *req)
422{
423}
424static inline bool nvme_req_needs_failover(struct request *req)
425{
426 return false;
427}
428static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
429{
430}
431static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
432 struct nvme_ns_head *head)
433{
434 return 0;
435}
436static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
437{
438}
439static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
440{
441}
442static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns)
443{
444}
445static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
446{
447}
448static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
449{
450}
451#endif /* CONFIG_NVME_MULTIPATH */
329 452
330#ifdef CONFIG_NVM 453#ifdef CONFIG_NVM
331int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); 454int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
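
For orientation, the sketch below is a standalone userspace model (not the kernel code) of the path-selection idea behind nvme_ns_head and nvme_find_path(): reuse the cached current path while it is live, otherwise scan the sibling paths, and signal "requeue" when no path is usable. All types and the path_is_live() helper are invented for the model.

/* Simplified userspace model of per-namespace path selection, loosely
 * following the nvme_ns_head/nvme_find_path() idea from this series.
 * All types and the path_is_live() predicate are invented for the sketch.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct model_path {
	const char *ctrl_name;	/* which controller this path goes through */
	bool live;		/* controller state allows I/O */
};

struct model_ns_head {
	struct model_path *paths;
	size_t nr_paths;
	struct model_path *current_path;	/* cached last good path */
};

static bool path_is_live(const struct model_path *p)
{
	return p && p->live;
}

/* Return the cached path if still usable, otherwise scan the siblings. */
static struct model_path *model_find_path(struct model_ns_head *head)
{
	size_t i;

	if (path_is_live(head->current_path))
		return head->current_path;

	for (i = 0; i < head->nr_paths; i++) {
		if (path_is_live(&head->paths[i])) {
			head->current_path = &head->paths[i];
			return head->current_path;
		}
	}
	return NULL;	/* no usable path: the real code requeues the bio */
}

int main(void)
{
	struct model_path paths[] = {
		{ .ctrl_name = "nvme0", .live = false },
		{ .ctrl_name = "nvme1", .live = true },
	};
	struct model_ns_head head = { .paths = paths, .nr_paths = 2 };
	struct model_path *p = model_find_path(&head);

	printf("I/O routed via %s\n", p ? p->ctrl_name : "none (requeue)");
	return 0;
}
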
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3f5a04c586ce..a11cfd470089 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -13,7 +13,6 @@
13 */ 13 */
14 14
15#include <linux/aer.h> 15#include <linux/aer.h>
16#include <linux/bitops.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
18#include <linux/blk-mq.h> 17#include <linux/blk-mq.h>
19#include <linux/blk-mq-pci.h> 18#include <linux/blk-mq-pci.h>
@@ -26,12 +25,9 @@
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/once.h> 26#include <linux/once.h>
28#include <linux/pci.h> 27#include <linux/pci.h>
29#include <linux/poison.h>
30#include <linux/t10-pi.h> 28#include <linux/t10-pi.h>
31#include <linux/timer.h>
32#include <linux/types.h> 29#include <linux/types.h>
33#include <linux/io-64-nonatomic-lo-hi.h> 30#include <linux/io-64-nonatomic-lo-hi.h>
34#include <asm/unaligned.h>
35#include <linux/sed-opal.h> 31#include <linux/sed-opal.h>
36 32
37#include "nvme.h" 33#include "nvme.h"
@@ -39,11 +35,7 @@
39#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 35#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
40#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 36#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
41 37
42/* 38#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
43 * We handle AEN commands ourselves and don't even let the
44 * block layer know about them.
45 */
46#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS)
47 39
48static int use_threaded_interrupts; 40static int use_threaded_interrupts;
49module_param(use_threaded_interrupts, int, 0); 41module_param(use_threaded_interrupts, int, 0);
@@ -57,6 +49,12 @@ module_param(max_host_mem_size_mb, uint, 0444);
57MODULE_PARM_DESC(max_host_mem_size_mb, 49MODULE_PARM_DESC(max_host_mem_size_mb,
58 "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); 50 "Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
59 51
52static unsigned int sgl_threshold = SZ_32K;
53module_param(sgl_threshold, uint, 0644);
54MODULE_PARM_DESC(sgl_threshold,
55 "Use SGLs when average request segment size is larger or equal to "
56 "this size. Use 0 to disable SGLs.");
57
60static int io_queue_depth_set(const char *val, const struct kernel_param *kp); 58static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
61static const struct kernel_param_ops io_queue_depth_ops = { 59static const struct kernel_param_ops io_queue_depth_ops = {
62 .set = io_queue_depth_set, 60 .set = io_queue_depth_set,
@@ -178,6 +176,7 @@ struct nvme_queue {
178struct nvme_iod { 176struct nvme_iod {
179 struct nvme_request req; 177 struct nvme_request req;
180 struct nvme_queue *nvmeq; 178 struct nvme_queue *nvmeq;
179 bool use_sgl;
181 int aborted; 180 int aborted;
182 int npages; /* In the PRP list. 0 means small pool in use */ 181 int npages; /* In the PRP list. 0 means small pool in use */
183 int nents; /* Used in scatterlist */ 182 int nents; /* Used in scatterlist */
@@ -331,17 +330,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
331 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 330 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
332} 331}
333 332
334static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, 333/*
335 unsigned int size, unsigned int nseg) 334 * Calculates the number of pages needed for the SGL segments. For example a 4k
335 * page can accommodate 256 SGL descriptors.
336 */
337static int nvme_pci_npages_sgl(unsigned int num_seg)
336{ 338{
337 return sizeof(__le64 *) * nvme_npages(size, dev) + 339 return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
338 sizeof(struct scatterlist) * nseg;
339} 340}
340 341
341static unsigned int nvme_cmd_size(struct nvme_dev *dev) 342static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
343 unsigned int size, unsigned int nseg, bool use_sgl)
342{ 344{
343 return sizeof(struct nvme_iod) + 345 size_t alloc_size;
344 nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); 346
347 if (use_sgl)
348 alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
349 else
350 alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
351
352 return alloc_size + sizeof(struct scatterlist) * nseg;
353}
354
355static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
356{
357 unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
358 NVME_INT_BYTES(dev), NVME_INT_PAGES,
359 use_sgl);
360
361 return sizeof(struct nvme_iod) + alloc_size;
345} 362}
346 363
347static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 364static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
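
The comment above notes that a 4k page holds 256 SGL descriptors; the standalone sketch below redoes that sizing arithmetic for both the SGL and PRP layouts, assuming a 4096-byte page, 16-byte SGL descriptors and 8-byte PRP entries with one entry reserved for chaining. It illustrates the math only, not the driver's allocator.

/* Back-of-the-envelope model of the descriptor-page math above.
 * Assumes a 4096-byte page, the 16-byte NVMe SGL descriptor size, and
 * 8-byte PRP entries with one entry per page reserved for chaining.
 * This is an illustration, not the driver's allocation code.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE	4096u
#define SGL_DESC_SIZE	16u	/* size of an NVMe SGL descriptor */
#define PRP_ENTRY_SIZE	8u	/* sizeof(__le64) */

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Pages of SGL descriptors needed for num_seg segments. */
static unsigned int model_npages_sgl(unsigned int num_seg)
{
	return DIV_ROUND_UP(num_seg * SGL_DESC_SIZE, MODEL_PAGE_SIZE);
}

/* Pages of PRP entries needed for a transfer of 'size' bytes. */
static unsigned int model_npages_prp(unsigned int size)
{
	unsigned int nprps = DIV_ROUND_UP(size + MODEL_PAGE_SIZE, MODEL_PAGE_SIZE);

	/* each PRP page holds PAGE_SIZE/8 entries, minus one link entry */
	return DIV_ROUND_UP(PRP_ENTRY_SIZE * nprps,
			    MODEL_PAGE_SIZE - PRP_ENTRY_SIZE);
}

int main(void)
{
	printf("256 segments -> %u SGL page(s)\n", model_npages_sgl(256));
	printf("257 segments -> %u SGL page(s)\n", model_npages_sgl(257));
	printf("1 MiB transfer -> %u PRP page(s)\n", model_npages_prp(1024 * 1024));
	return 0;
}
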
@@ -425,10 +442,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
425 nvmeq->sq_tail = tail; 442 nvmeq->sq_tail = tail;
426} 443}
427 444
428static __le64 **iod_list(struct request *req) 445static void **nvme_pci_iod_list(struct request *req)
429{ 446{
430 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 447 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
431 return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); 448 return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
432} 449}
433 450
434static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) 451static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
@@ -438,7 +455,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
438 unsigned int size = blk_rq_payload_bytes(rq); 455 unsigned int size = blk_rq_payload_bytes(rq);
439 456
440 if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { 457 if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
441 iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); 458 size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
459 iod->use_sgl);
460
461 iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
442 if (!iod->sg) 462 if (!iod->sg)
443 return BLK_STS_RESOURCE; 463 return BLK_STS_RESOURCE;
444 } else { 464 } else {
@@ -456,18 +476,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
456static void nvme_free_iod(struct nvme_dev *dev, struct request *req) 476static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
457{ 477{
458 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 478 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
459 const int last_prp = dev->ctrl.page_size / 8 - 1; 479 const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
480 dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
481
460 int i; 482 int i;
461 __le64 **list = iod_list(req);
462 dma_addr_t prp_dma = iod->first_dma;
463 483
464 if (iod->npages == 0) 484 if (iod->npages == 0)
465 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 485 dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
486 dma_addr);
487
466 for (i = 0; i < iod->npages; i++) { 488 for (i = 0; i < iod->npages; i++) {
467 __le64 *prp_list = list[i]; 489 void *addr = nvme_pci_iod_list(req)[i];
468 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 490
469 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 491 if (iod->use_sgl) {
470 prp_dma = next_prp_dma; 492 struct nvme_sgl_desc *sg_list = addr;
493
494 next_dma_addr =
495 le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
496 } else {
497 __le64 *prp_list = addr;
498
499 next_dma_addr = le64_to_cpu(prp_list[last_prp]);
500 }
501
502 dma_pool_free(dev->prp_page_pool, addr, dma_addr);
503 dma_addr = next_dma_addr;
471 } 504 }
472 505
473 if (iod->sg != iod->inline_sg) 506 if (iod->sg != iod->inline_sg)
@@ -555,7 +588,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
555 } 588 }
556} 589}
557 590
558static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) 591static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
592 struct request *req, struct nvme_rw_command *cmnd)
559{ 593{
560 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 594 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
561 struct dma_pool *pool; 595 struct dma_pool *pool;
@@ -566,14 +600,16 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
566 u32 page_size = dev->ctrl.page_size; 600 u32 page_size = dev->ctrl.page_size;
567 int offset = dma_addr & (page_size - 1); 601 int offset = dma_addr & (page_size - 1);
568 __le64 *prp_list; 602 __le64 *prp_list;
569 __le64 **list = iod_list(req); 603 void **list = nvme_pci_iod_list(req);
570 dma_addr_t prp_dma; 604 dma_addr_t prp_dma;
571 int nprps, i; 605 int nprps, i;
572 606
607 iod->use_sgl = false;
608
573 length -= (page_size - offset); 609 length -= (page_size - offset);
574 if (length <= 0) { 610 if (length <= 0) {
575 iod->first_dma = 0; 611 iod->first_dma = 0;
576 return BLK_STS_OK; 612 goto done;
577 } 613 }
578 614
579 dma_len -= (page_size - offset); 615 dma_len -= (page_size - offset);
@@ -587,7 +623,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
587 623
588 if (length <= page_size) { 624 if (length <= page_size) {
589 iod->first_dma = dma_addr; 625 iod->first_dma = dma_addr;
590 return BLK_STS_OK; 626 goto done;
591 } 627 }
592 628
593 nprps = DIV_ROUND_UP(length, page_size); 629 nprps = DIV_ROUND_UP(length, page_size);
@@ -634,6 +670,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
634 dma_len = sg_dma_len(sg); 670 dma_len = sg_dma_len(sg);
635 } 671 }
636 672
673done:
674 cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
675 cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
676
637 return BLK_STS_OK; 677 return BLK_STS_OK;
638 678
639 bad_sgl: 679 bad_sgl:
@@ -643,6 +683,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
643 return BLK_STS_IOERR; 683 return BLK_STS_IOERR;
644} 684}
645 685
686static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
687 struct scatterlist *sg)
688{
689 sge->addr = cpu_to_le64(sg_dma_address(sg));
690 sge->length = cpu_to_le32(sg_dma_len(sg));
691 sge->type = NVME_SGL_FMT_DATA_DESC << 4;
692}
693
694static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
695 dma_addr_t dma_addr, int entries)
696{
697 sge->addr = cpu_to_le64(dma_addr);
698 if (entries < SGES_PER_PAGE) {
699 sge->length = cpu_to_le32(entries * sizeof(*sge));
700 sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
701 } else {
702 sge->length = cpu_to_le32(PAGE_SIZE);
703 sge->type = NVME_SGL_FMT_SEG_DESC << 4;
704 }
705}
706
707static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
708 struct request *req, struct nvme_rw_command *cmd)
709{
710 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
711 int length = blk_rq_payload_bytes(req);
712 struct dma_pool *pool;
713 struct nvme_sgl_desc *sg_list;
714 struct scatterlist *sg = iod->sg;
715 int entries = iod->nents, i = 0;
716 dma_addr_t sgl_dma;
717
718 iod->use_sgl = true;
719
720 /* setting the transfer type as SGL */
721 cmd->flags = NVME_CMD_SGL_METABUF;
722
723 if (length == sg_dma_len(sg)) {
724 nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
725 return BLK_STS_OK;
726 }
727
728 if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
729 pool = dev->prp_small_pool;
730 iod->npages = 0;
731 } else {
732 pool = dev->prp_page_pool;
733 iod->npages = 1;
734 }
735
736 sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
737 if (!sg_list) {
738 iod->npages = -1;
739 return BLK_STS_RESOURCE;
740 }
741
742 nvme_pci_iod_list(req)[0] = sg_list;
743 iod->first_dma = sgl_dma;
744
745 nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
746
747 do {
748 if (i == SGES_PER_PAGE) {
749 struct nvme_sgl_desc *old_sg_desc = sg_list;
750 struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
751
752 sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
753 if (!sg_list)
754 return BLK_STS_RESOURCE;
755
756 i = 0;
757 nvme_pci_iod_list(req)[iod->npages++] = sg_list;
758 sg_list[i++] = *link;
759 nvme_pci_sgl_set_seg(link, sgl_dma, entries);
760 }
761
762 nvme_pci_sgl_set_data(&sg_list[i++], sg);
763
764 length -= sg_dma_len(sg);
765 sg = sg_next(sg);
766 entries--;
767 } while (length > 0);
768
769 WARN_ON(entries > 0);
770 return BLK_STS_OK;
771}
772
773static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
774{
775 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
776 unsigned int avg_seg_size;
777
778 avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
779 blk_rq_nr_phys_segments(req));
780
781 if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
782 return false;
783 if (!iod->nvmeq->qid)
784 return false;
785 if (!sgl_threshold || avg_seg_size < sgl_threshold)
786 return false;
787 return true;
788}
789
646static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, 790static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
647 struct nvme_command *cmnd) 791 struct nvme_command *cmnd)
648{ 792{
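
The sgl_threshold heuristic in nvme_pci_use_sgls() boils down to three checks: the controller must advertise SGL support, the admin queue never uses SGLs, and the average segment size of the request must reach the threshold. Below is a self-contained model of that decision, with the relevant fields passed in as plain parameters instead of being read from the device structures.

/* Userspace sketch of the SGL-vs-PRP decision made by nvme_pci_use_sgls()
 * above.  The controller's SGLS support bits, the queue id and the module
 * parameter are explicit arguments here.
 */
#include <stdbool.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static bool model_use_sgls(unsigned int ctrl_sgls_bits, unsigned int qid,
			   unsigned int sgl_threshold,
			   unsigned int payload_bytes, unsigned int nr_segs)
{
	unsigned int avg_seg_size = DIV_ROUND_UP(payload_bytes, nr_segs);

	/* controller must advertise SGL support (Identify SGLS bits 0/1) */
	if (!(ctrl_sgls_bits & ((1 << 0) | (1 << 1))))
		return false;
	/* admin queue (qid 0) always uses PRPs */
	if (!qid)
		return false;
	/* module parameter: 0 disables SGLs entirely */
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

int main(void)
{
	/* 512 KiB in 8 segments -> 64 KiB average, above the 32 KiB default */
	printf("large segments: %s\n",
	       model_use_sgls(0x1, 1, 32768, 512 * 1024, 8) ? "SGL" : "PRP");
	/* 64 KiB in 16 segments -> 4 KiB average, below the threshold */
	printf("small segments: %s\n",
	       model_use_sgls(0x1, 1, 32768, 64 * 1024, 16) ? "SGL" : "PRP");
	return 0;
}
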
@@ -662,7 +806,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
662 DMA_ATTR_NO_WARN)) 806 DMA_ATTR_NO_WARN))
663 goto out; 807 goto out;
664 808
665 ret = nvme_setup_prps(dev, req); 809 if (nvme_pci_use_sgls(dev, req))
810 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
811 else
812 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
813
666 if (ret != BLK_STS_OK) 814 if (ret != BLK_STS_OK)
667 goto out_unmap; 815 goto out_unmap;
668 816
@@ -682,8 +830,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
682 goto out_unmap; 830 goto out_unmap;
683 } 831 }
684 832
685 cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
686 cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
687 if (blk_integrity_rq(req)) 833 if (blk_integrity_rq(req))
688 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); 834 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
689 return BLK_STS_OK; 835 return BLK_STS_OK;
@@ -804,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
804 * for them but rather special case them here. 950 * for them but rather special case them here.
805 */ 951 */
806 if (unlikely(nvmeq->qid == 0 && 952 if (unlikely(nvmeq->qid == 0 &&
807 cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { 953 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
808 nvme_complete_async_event(&nvmeq->dev->ctrl, 954 nvme_complete_async_event(&nvmeq->dev->ctrl,
809 cqe->status, &cqe->result); 955 cqe->status, &cqe->result);
810 return; 956 return;
@@ -897,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
897 return __nvme_poll(nvmeq, tag); 1043 return __nvme_poll(nvmeq, tag);
898} 1044}
899 1045
900static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) 1046static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
901{ 1047{
902 struct nvme_dev *dev = to_nvme_dev(ctrl); 1048 struct nvme_dev *dev = to_nvme_dev(ctrl);
903 struct nvme_queue *nvmeq = dev->queues[0]; 1049 struct nvme_queue *nvmeq = dev->queues[0];
@@ -905,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
905 1051
906 memset(&c, 0, sizeof(c)); 1052 memset(&c, 0, sizeof(c));
907 c.common.opcode = nvme_admin_async_event; 1053 c.common.opcode = nvme_admin_async_event;
908 c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; 1054 c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
909 1055
910 spin_lock_irq(&nvmeq->q_lock); 1056 spin_lock_irq(&nvmeq->q_lock);
911 __nvme_submit_cmd(nvmeq, &c); 1057 __nvme_submit_cmd(nvmeq, &c);
@@ -930,7 +1076,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
930 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 1076 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
931 1077
932 /* 1078 /*
933 * Note: we (ab)use the fact the the prp fields survive if no data 1079 * Note: we (ab)use the fact that the prp fields survive if no data
934 * is attached to the request. 1080 * is attached to the request.
935 */ 1081 */
936 memset(&c, 0, sizeof(c)); 1082 memset(&c, 0, sizeof(c));
@@ -951,7 +1097,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
951 int flags = NVME_QUEUE_PHYS_CONTIG; 1097 int flags = NVME_QUEUE_PHYS_CONTIG;
952 1098
953 /* 1099 /*
954 * Note: we (ab)use the fact the the prp fields survive if no data 1100 * Note: we (ab)use the fact that the prp fields survive if no data
955 * is attached to the request. 1101 * is attached to the request.
956 */ 1102 */
957 memset(&c, 0, sizeof(c)); 1103 memset(&c, 0, sizeof(c));
@@ -1372,14 +1518,10 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1372 dev->admin_tagset.ops = &nvme_mq_admin_ops; 1518 dev->admin_tagset.ops = &nvme_mq_admin_ops;
1373 dev->admin_tagset.nr_hw_queues = 1; 1519 dev->admin_tagset.nr_hw_queues = 1;
1374 1520
1375 /* 1521 dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1376 * Subtract one to leave an empty queue entry for 'Full Queue'
1377 * condition. See NVM-Express 1.2 specification, section 4.1.2.
1378 */
1379 dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
1380 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1522 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1381 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1523 dev->admin_tagset.numa_node = dev_to_node(dev->dev);
1382 dev->admin_tagset.cmd_size = nvme_cmd_size(dev); 1524 dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
1383 dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; 1525 dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
1384 dev->admin_tagset.driver_data = dev; 1526 dev->admin_tagset.driver_data = dev;
1385 1527
@@ -1906,7 +2048,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
1906 dev->tagset.numa_node = dev_to_node(dev->dev); 2048 dev->tagset.numa_node = dev_to_node(dev->dev);
1907 dev->tagset.queue_depth = 2049 dev->tagset.queue_depth =
1908 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2050 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
1909 dev->tagset.cmd_size = nvme_cmd_size(dev); 2051 dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
2052 if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
2053 dev->tagset.cmd_size = max(dev->tagset.cmd_size,
2054 nvme_pci_cmd_size(dev, true));
2055 }
1910 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2056 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
1911 dev->tagset.driver_data = dev; 2057 dev->tagset.driver_data = dev;
1912 2058
@@ -2132,9 +2278,9 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
2132{ 2278{
2133 dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); 2279 dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
2134 2280
2135 kref_get(&dev->ctrl.kref); 2281 nvme_get_ctrl(&dev->ctrl);
2136 nvme_dev_disable(dev, false); 2282 nvme_dev_disable(dev, false);
2137 if (!schedule_work(&dev->remove_work)) 2283 if (!queue_work(nvme_wq, &dev->remove_work))
2138 nvme_put_ctrl(&dev->ctrl); 2284 nvme_put_ctrl(&dev->ctrl);
2139} 2285}
2140 2286
@@ -2557,6 +2703,7 @@ static int __init nvme_init(void)
2557static void __exit nvme_exit(void) 2703static void __exit nvme_exit(void)
2558{ 2704{
2559 pci_unregister_driver(&nvme_driver); 2705 pci_unregister_driver(&nvme_driver);
2706 flush_workqueue(nvme_wq);
2560 _nvme_check_size(); 2707 _nvme_check_size();
2561} 2708}
2562 2709
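
With the per-AER bookkeeping gone, the single outstanding AEN is identified purely by its command id: it is submitted as NVME_AQ_BLK_MQ_DEPTH, one past the admin queue's blk-mq tag space, and the completion path treats any admin-queue completion with an id at or above that value as an AEN. The sketch below models the convention; the 32-entry admin queue depth and the depth-minus-one definition of NVME_AQ_BLK_MQ_DEPTH are assumptions taken from the usual header values and are not shown in this diff.

/* Sketch of the admin-queue command-id convention used above.  The exact
 * constants live in the nvme headers; the values below assume a 32-entry
 * admin queue with one slot set aside for the single AEN command
 * (NVME_AQ_BLK_MQ_DEPTH == NVME_AQ_DEPTH - 1), which is an assumption of
 * this model rather than something shown in the diff.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_AQ_DEPTH		32
#define MODEL_AQ_BLK_MQ_DEPTH	(MODEL_AQ_DEPTH - 1)	/* ids below this: blk-mq tags */

/* The lone AEN is submitted with a command id outside the blk-mq tag range. */
static unsigned int model_aen_command_id(void)
{
	return MODEL_AQ_BLK_MQ_DEPTH;
}

/* Completion handler: anything at or above the tag range on qid 0 is an AEN. */
static bool model_is_aen_completion(unsigned int qid, unsigned int command_id)
{
	return qid == 0 && command_id >= MODEL_AQ_BLK_MQ_DEPTH;
}

int main(void)
{
	printf("AEN command id: %u\n", model_aen_command_id());
	printf("id 12 on qid 0 is AEN: %d\n", model_is_aen_completion(0, 12));
	printf("id 31 on qid 0 is AEN: %d\n", model_is_aen_completion(0, 31));
	return 0;
}
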
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0ebb539f3bd3..4f9bf2f815c3 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -41,17 +41,9 @@
41 41
42#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 42#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
43 43
44/*
45 * We handle AEN commands ourselves and don't even let the
46 * block layer know about them.
47 */
48#define NVME_RDMA_NR_AEN_COMMANDS 1
49#define NVME_RDMA_AQ_BLKMQ_DEPTH \
50 (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
51
52struct nvme_rdma_device { 44struct nvme_rdma_device {
53 struct ib_device *dev; 45 struct ib_device *dev;
54 struct ib_pd *pd; 46 struct ib_pd *pd;
55 struct kref ref; 47 struct kref ref;
56 struct list_head entry; 48 struct list_head entry;
57}; 49};
@@ -79,8 +71,8 @@ struct nvme_rdma_request {
79}; 71};
80 72
81enum nvme_rdma_queue_flags { 73enum nvme_rdma_queue_flags {
82 NVME_RDMA_Q_LIVE = 0, 74 NVME_RDMA_Q_ALLOCATED = 0,
83 NVME_RDMA_Q_DELETING = 1, 75 NVME_RDMA_Q_LIVE = 1,
84}; 76};
85 77
86struct nvme_rdma_queue { 78struct nvme_rdma_queue {
@@ -105,7 +97,6 @@ struct nvme_rdma_ctrl {
105 97
106 /* other member variables */ 98 /* other member variables */
107 struct blk_mq_tag_set tag_set; 99 struct blk_mq_tag_set tag_set;
108 struct work_struct delete_work;
109 struct work_struct err_work; 100 struct work_struct err_work;
110 101
111 struct nvme_rdma_qe async_event_sqe; 102 struct nvme_rdma_qe async_event_sqe;
@@ -274,6 +265,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
274 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 265 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
275 int ret = 0; 266 int ret = 0;
276 267
268 if (WARN_ON_ONCE(!req->mr))
269 return 0;
270
277 ib_dereg_mr(req->mr); 271 ib_dereg_mr(req->mr);
278 272
279 req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, 273 req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
@@ -434,11 +428,9 @@ out_err:
434 428
435static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) 429static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
436{ 430{
437 struct nvme_rdma_device *dev; 431 struct nvme_rdma_device *dev = queue->device;
438 struct ib_device *ibdev; 432 struct ib_device *ibdev = dev->dev;
439 433
440 dev = queue->device;
441 ibdev = dev->dev;
442 rdma_destroy_qp(queue->cm_id); 434 rdma_destroy_qp(queue->cm_id);
443 ib_free_cq(queue->ib_cq); 435 ib_free_cq(queue->ib_cq);
444 436
@@ -493,7 +485,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
493 return 0; 485 return 0;
494 486
495out_destroy_qp: 487out_destroy_qp:
496 ib_destroy_qp(queue->qp); 488 rdma_destroy_qp(queue->cm_id);
497out_destroy_ib_cq: 489out_destroy_ib_cq:
498 ib_free_cq(queue->ib_cq); 490 ib_free_cq(queue->ib_cq);
499out_put_dev: 491out_put_dev:
@@ -544,11 +536,11 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
544 ret = nvme_rdma_wait_for_cm(queue); 536 ret = nvme_rdma_wait_for_cm(queue);
545 if (ret) { 537 if (ret) {
546 dev_info(ctrl->ctrl.device, 538 dev_info(ctrl->ctrl.device,
547 "rdma_resolve_addr wait failed (%d).\n", ret); 539 "rdma connection establishment failed (%d)\n", ret);
548 goto out_destroy_cm_id; 540 goto out_destroy_cm_id;
549 } 541 }
550 542
551 clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); 543 set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
552 544
553 return 0; 545 return 0;
554 546
@@ -568,7 +560,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
568 560
569static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) 561static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
570{ 562{
571 if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) 563 if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
572 return; 564 return;
573 565
574 if (nvme_rdma_queue_idx(queue) == 0) { 566 if (nvme_rdma_queue_idx(queue) == 0) {
@@ -676,11 +668,10 @@ out_free_queues:
676 return ret; 668 return ret;
677} 669}
678 670
679static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) 671static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
672 struct blk_mq_tag_set *set)
680{ 673{
681 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 674 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
682 struct blk_mq_tag_set *set = admin ?
683 &ctrl->admin_tag_set : &ctrl->tag_set;
684 675
685 blk_mq_free_tag_set(set); 676 blk_mq_free_tag_set(set);
686 nvme_rdma_dev_put(ctrl->device); 677 nvme_rdma_dev_put(ctrl->device);
@@ -697,7 +688,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
697 set = &ctrl->admin_tag_set; 688 set = &ctrl->admin_tag_set;
698 memset(set, 0, sizeof(*set)); 689 memset(set, 0, sizeof(*set));
699 set->ops = &nvme_rdma_admin_mq_ops; 690 set->ops = &nvme_rdma_admin_mq_ops;
700 set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; 691 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
701 set->reserved_tags = 2; /* connect + keep-alive */ 692 set->reserved_tags = 2; /* connect + keep-alive */
702 set->numa_node = NUMA_NO_NODE; 693 set->numa_node = NUMA_NO_NODE;
703 set->cmd_size = sizeof(struct nvme_rdma_request) + 694 set->cmd_size = sizeof(struct nvme_rdma_request) +
@@ -705,6 +696,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
705 set->driver_data = ctrl; 696 set->driver_data = ctrl;
706 set->nr_hw_queues = 1; 697 set->nr_hw_queues = 1;
707 set->timeout = ADMIN_TIMEOUT; 698 set->timeout = ADMIN_TIMEOUT;
699 set->flags = BLK_MQ_F_NO_SCHED;
708 } else { 700 } else {
709 set = &ctrl->tag_set; 701 set = &ctrl->tag_set;
710 memset(set, 0, sizeof(*set)); 702 memset(set, 0, sizeof(*set));
@@ -748,7 +740,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
748 nvme_rdma_stop_queue(&ctrl->queues[0]); 740 nvme_rdma_stop_queue(&ctrl->queues[0]);
749 if (remove) { 741 if (remove) {
750 blk_cleanup_queue(ctrl->ctrl.admin_q); 742 blk_cleanup_queue(ctrl->ctrl.admin_q);
751 nvme_rdma_free_tagset(&ctrl->ctrl, true); 743 nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
752 } 744 }
753 nvme_rdma_free_queue(&ctrl->queues[0]); 745 nvme_rdma_free_queue(&ctrl->queues[0]);
754} 746}
@@ -780,8 +772,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
780 goto out_free_tagset; 772 goto out_free_tagset;
781 } 773 }
782 } else { 774 } else {
783 error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, 775 error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
784 nvme_rdma_reinit_request);
785 if (error) 776 if (error)
786 goto out_free_queue; 777 goto out_free_queue;
787 } 778 }
@@ -825,7 +816,7 @@ out_cleanup_queue:
825 blk_cleanup_queue(ctrl->ctrl.admin_q); 816 blk_cleanup_queue(ctrl->ctrl.admin_q);
826out_free_tagset: 817out_free_tagset:
827 if (new) 818 if (new)
828 nvme_rdma_free_tagset(&ctrl->ctrl, true); 819 nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
829out_free_queue: 820out_free_queue:
830 nvme_rdma_free_queue(&ctrl->queues[0]); 821 nvme_rdma_free_queue(&ctrl->queues[0]);
831 return error; 822 return error;
@@ -837,7 +828,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
837 nvme_rdma_stop_io_queues(ctrl); 828 nvme_rdma_stop_io_queues(ctrl);
838 if (remove) { 829 if (remove) {
839 blk_cleanup_queue(ctrl->ctrl.connect_q); 830 blk_cleanup_queue(ctrl->ctrl.connect_q);
840 nvme_rdma_free_tagset(&ctrl->ctrl, false); 831 nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
841 } 832 }
842 nvme_rdma_free_io_queues(ctrl); 833 nvme_rdma_free_io_queues(ctrl);
843} 834}
@@ -863,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
863 goto out_free_tag_set; 854 goto out_free_tag_set;
864 } 855 }
865 } else { 856 } else {
866 ret = blk_mq_reinit_tagset(&ctrl->tag_set, 857 ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
867 nvme_rdma_reinit_request);
868 if (ret) 858 if (ret)
869 goto out_free_io_queues; 859 goto out_free_io_queues;
870 860
@@ -883,7 +873,7 @@ out_cleanup_connect_q:
883 blk_cleanup_queue(ctrl->ctrl.connect_q); 873 blk_cleanup_queue(ctrl->ctrl.connect_q);
884out_free_tag_set: 874out_free_tag_set:
885 if (new) 875 if (new)
886 nvme_rdma_free_tagset(&ctrl->ctrl, false); 876 nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
887out_free_io_queues: 877out_free_io_queues:
888 nvme_rdma_free_io_queues(ctrl); 878 nvme_rdma_free_io_queues(ctrl);
889 return ret; 879 return ret;
@@ -922,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
922 ctrl->ctrl.opts->reconnect_delay * HZ); 912 ctrl->ctrl.opts->reconnect_delay * HZ);
923 } else { 913 } else {
924 dev_info(ctrl->ctrl.device, "Removing controller...\n"); 914 dev_info(ctrl->ctrl.device, "Removing controller...\n");
925 queue_work(nvme_wq, &ctrl->delete_work); 915 nvme_delete_ctrl(&ctrl->ctrl);
926 } 916 }
927} 917}
928 918
@@ -935,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
935 925
936 ++ctrl->ctrl.nr_reconnects; 926 ++ctrl->ctrl.nr_reconnects;
937 927
938 if (ctrl->ctrl.queue_count > 1)
939 nvme_rdma_destroy_io_queues(ctrl, false);
940
941 nvme_rdma_destroy_admin_queue(ctrl, false);
942 ret = nvme_rdma_configure_admin_queue(ctrl, false); 928 ret = nvme_rdma_configure_admin_queue(ctrl, false);
943 if (ret) 929 if (ret)
944 goto requeue; 930 goto requeue;
@@ -946,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
946 if (ctrl->ctrl.queue_count > 1) { 932 if (ctrl->ctrl.queue_count > 1) {
947 ret = nvme_rdma_configure_io_queues(ctrl, false); 933 ret = nvme_rdma_configure_io_queues(ctrl, false);
948 if (ret) 934 if (ret)
949 goto requeue; 935 goto destroy_admin;
950 } 936 }
951 937
952 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 938 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
@@ -956,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
956 return; 942 return;
957 } 943 }
958 944
959 ctrl->ctrl.nr_reconnects = 0;
960
961 nvme_start_ctrl(&ctrl->ctrl); 945 nvme_start_ctrl(&ctrl->ctrl);
962 946
963 dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); 947 dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
948 ctrl->ctrl.nr_reconnects);
949
950 ctrl->ctrl.nr_reconnects = 0;
964 951
965 return; 952 return;
966 953
954destroy_admin:
955 nvme_rdma_destroy_admin_queue(ctrl, false);
967requeue: 956requeue:
968 dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", 957 dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
969 ctrl->ctrl.nr_reconnects); 958 ctrl->ctrl.nr_reconnects);
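
The reordered reconnect path configures the admin queue first, then the I/O queues, and tears the admin queue back down only when I/O queue setup fails; the attempt count is now reported before being reset. A control-flow sketch with stand-in configure/destroy helpers, not the actual driver functions:

/* Control-flow sketch of the reordered reconnect path above: configure the
 * admin queue, then the I/O queues, and only tear the admin queue back down
 * if the I/O queue setup fails.  The model_configure_*() helpers are
 * stand-ins that simply report success or failure.
 */
#include <stdbool.h>
#include <stdio.h>

static bool model_configure_admin_queue(void)  { return true; }
static bool model_configure_io_queues(bool ok) { return ok; }
static void model_destroy_admin_queue(void)    { puts("tearing admin queue back down"); }

static bool model_reconnect(int *nr_reconnects, bool io_ok)
{
	++*nr_reconnects;

	if (!model_configure_admin_queue())
		goto requeue;
	if (!model_configure_io_queues(io_ok))
		goto destroy_admin;

	printf("Successfully reconnected (%d attempts)\n", *nr_reconnects);
	*nr_reconnects = 0;
	return true;

destroy_admin:
	model_destroy_admin_queue();
requeue:
	printf("Failed reconnect attempt %d\n", *nr_reconnects);
	return false;
}

int main(void)
{
	int attempts = 0;

	model_reconnect(&attempts, false);	/* I/O queues fail: admin torn down */
	model_reconnect(&attempts, true);	/* next attempt succeeds */
	return 0;
}
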
@@ -979,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
979 968
980 if (ctrl->ctrl.queue_count > 1) { 969 if (ctrl->ctrl.queue_count > 1) {
981 nvme_stop_queues(&ctrl->ctrl); 970 nvme_stop_queues(&ctrl->ctrl);
982 nvme_rdma_stop_io_queues(ctrl);
983 }
984 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
985 nvme_rdma_stop_queue(&ctrl->queues[0]);
986
987 /* We must take care of fastfail/requeue all our inflight requests */
988 if (ctrl->ctrl.queue_count > 1)
989 blk_mq_tagset_busy_iter(&ctrl->tag_set, 971 blk_mq_tagset_busy_iter(&ctrl->tag_set,
990 nvme_cancel_request, &ctrl->ctrl); 972 nvme_cancel_request, &ctrl->ctrl);
973 nvme_rdma_destroy_io_queues(ctrl, false);
974 }
975
976 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
991 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 977 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
992 nvme_cancel_request, &ctrl->ctrl); 978 nvme_cancel_request, &ctrl->ctrl);
979 nvme_rdma_destroy_admin_queue(ctrl, false);
993 980
994 /* 981 /*
995	 * queues are not alive anymore, so restart the queues to fail fast 982
@@ -1065,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
1065 if (!blk_rq_bytes(rq)) 1052 if (!blk_rq_bytes(rq))
1066 return; 1053 return;
1067 1054
1068 if (req->mr->need_inval) { 1055 if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
1069 res = nvme_rdma_inv_rkey(queue, req); 1056 res = nvme_rdma_inv_rkey(queue, req);
1070 if (unlikely(res < 0)) { 1057 if (unlikely(res < 0)) {
1071 dev_err(ctrl->ctrl.device, 1058 dev_err(ctrl->ctrl.device,
@@ -1314,7 +1301,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1314 return queue->ctrl->tag_set.tags[queue_idx - 1]; 1301 return queue->ctrl->tag_set.tags[queue_idx - 1];
1315} 1302}
1316 1303
1317static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) 1304static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1318{ 1305{
1319 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); 1306 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1320 struct nvme_rdma_queue *queue = &ctrl->queues[0]; 1307 struct nvme_rdma_queue *queue = &ctrl->queues[0];
@@ -1324,14 +1311,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1324 struct ib_sge sge; 1311 struct ib_sge sge;
1325 int ret; 1312 int ret;
1326 1313
1327 if (WARN_ON_ONCE(aer_idx != 0))
1328 return;
1329
1330 ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); 1314 ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1331 1315
1332 memset(cmd, 0, sizeof(*cmd)); 1316 memset(cmd, 0, sizeof(*cmd));
1333 cmd->common.opcode = nvme_admin_async_event; 1317 cmd->common.opcode = nvme_admin_async_event;
1334 cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; 1318 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1335 cmd->common.flags |= NVME_CMD_SGL_METABUF; 1319 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1336 nvme_rdma_set_sg_null(cmd); 1320 nvme_rdma_set_sg_null(cmd);
1337 1321
@@ -1393,7 +1377,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1393 * for them but rather special case them here. 1377 * for them but rather special case them here.
1394 */ 1378 */
1395 if (unlikely(nvme_rdma_queue_idx(queue) == 0 && 1379 if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1396 cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) 1380 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
1397 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 1381 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1398 &cqe->result); 1382 &cqe->result);
1399 else 1383 else
@@ -1590,6 +1574,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
1590{ 1574{
1591 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1575 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1592 1576
1577 dev_warn(req->queue->ctrl->ctrl.device,
1578 "I/O %d QID %d timeout, reset controller\n",
1579 rq->tag, nvme_rdma_queue_idx(req->queue));
1580
1593 /* queue error recovery */ 1581 /* queue error recovery */
1594 nvme_rdma_error_recovery(req->queue->ctrl); 1582 nvme_rdma_error_recovery(req->queue->ctrl);
1595 1583
@@ -1767,50 +1755,9 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
1767 nvme_rdma_destroy_admin_queue(ctrl, shutdown); 1755 nvme_rdma_destroy_admin_queue(ctrl, shutdown);
1768} 1756}
1769 1757
1770static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) 1758static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
1771{ 1759{
1772 nvme_remove_namespaces(&ctrl->ctrl); 1760 nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
1773 nvme_rdma_shutdown_ctrl(ctrl, true);
1774 nvme_uninit_ctrl(&ctrl->ctrl);
1775 nvme_put_ctrl(&ctrl->ctrl);
1776}
1777
1778static void nvme_rdma_del_ctrl_work(struct work_struct *work)
1779{
1780 struct nvme_rdma_ctrl *ctrl = container_of(work,
1781 struct nvme_rdma_ctrl, delete_work);
1782
1783 nvme_stop_ctrl(&ctrl->ctrl);
1784 nvme_rdma_remove_ctrl(ctrl);
1785}
1786
1787static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
1788{
1789 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
1790 return -EBUSY;
1791
1792 if (!queue_work(nvme_wq, &ctrl->delete_work))
1793 return -EBUSY;
1794
1795 return 0;
1796}
1797
1798static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
1799{
1800 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1801 int ret = 0;
1802
1803 /*
1804 * Keep a reference until all work is flushed since
1805 * __nvme_rdma_del_ctrl can free the ctrl mem
1806 */
1807 if (!kref_get_unless_zero(&ctrl->ctrl.kref))
1808 return -EBUSY;
1809 ret = __nvme_rdma_del_ctrl(ctrl);
1810 if (!ret)
1811 flush_work(&ctrl->delete_work);
1812 nvme_put_ctrl(&ctrl->ctrl);
1813 return ret;
1814} 1761}
1815 1762
1816static void nvme_rdma_reset_ctrl_work(struct work_struct *work) 1763static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
@@ -1834,7 +1781,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1834 } 1781 }
1835 1782
1836 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 1783 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1837 WARN_ON_ONCE(!changed); 1784 if (!changed) {
1785 /* state change failure is ok if we're in DELETING state */
1786 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1787 return;
1788 }
1838 1789
1839 nvme_start_ctrl(&ctrl->ctrl); 1790 nvme_start_ctrl(&ctrl->ctrl);
1840 1791
@@ -1842,7 +1793,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1842 1793
1843out_fail: 1794out_fail:
1844 dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); 1795 dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
1845 nvme_rdma_remove_ctrl(ctrl); 1796 nvme_remove_namespaces(&ctrl->ctrl);
1797 nvme_rdma_shutdown_ctrl(ctrl, true);
1798 nvme_uninit_ctrl(&ctrl->ctrl);
1799 nvme_put_ctrl(&ctrl->ctrl);
1846} 1800}
1847 1801
1848static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { 1802static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
@@ -1854,10 +1808,88 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
1854 .reg_write32 = nvmf_reg_write32, 1808 .reg_write32 = nvmf_reg_write32,
1855 .free_ctrl = nvme_rdma_free_ctrl, 1809 .free_ctrl = nvme_rdma_free_ctrl,
1856 .submit_async_event = nvme_rdma_submit_async_event, 1810 .submit_async_event = nvme_rdma_submit_async_event,
1857 .delete_ctrl = nvme_rdma_del_ctrl, 1811 .delete_ctrl = nvme_rdma_delete_ctrl,
1858 .get_address = nvmf_get_address, 1812 .get_address = nvmf_get_address,
1813 .reinit_request = nvme_rdma_reinit_request,
1859}; 1814};
1860 1815
1816static inline bool
1817__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
1818 struct nvmf_ctrl_options *opts)
1819{
1820 char *stdport = __stringify(NVME_RDMA_IP_PORT);
1821
1822
1823 if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
1824 strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
1825 return false;
1826
1827 if (opts->mask & NVMF_OPT_TRSVCID &&
1828 ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1829 if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
1830 return false;
1831 } else if (opts->mask & NVMF_OPT_TRSVCID) {
1832 if (strcmp(opts->trsvcid, stdport))
1833 return false;
1834 } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1835 if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
1836 return false;
1837 }
1838 /* else, it's a match as both have stdport. Fall to next checks */
1839
1840 /*
1841 * checking the local address is rough. In most cases, one
1842 * is not specified and the host port is selected by the stack.
1843 *
1844 * Assume no match if:
1845 * local address is specified and address is not the same
1846 * local address is not specified but remote is, or vice versa
1847 * (admin using specific host_traddr when it matters).
1848 */
1849 if (opts->mask & NVMF_OPT_HOST_TRADDR &&
1850 ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1851 if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
1852 return false;
1853 } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
1854 ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
1855 return false;
1856 /*
1857	 * if neither controller had a host port specified, assume it's
1858 * a match as everything else matched.
1859 */
1860
1861 return true;
1862}
1863
1864/*
1865 * Fails a connection request if it matches an existing controller
1866 * (association) with the same tuple:
1867 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
1868 *
1869 * if local address is not specified in the request, it will match an
1870 * existing controller with all the other parameters the same and no
1871 * local port address specified as well.
1872 *
1873 * The ports don't need to be compared as they are intrinsically
1874 * already matched by the port pointers supplied.
1875 */
1876static bool
1877nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
1878{
1879 struct nvme_rdma_ctrl *ctrl;
1880 bool found = false;
1881
1882 mutex_lock(&nvme_rdma_ctrl_mutex);
1883 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
1884 found = __nvme_rdma_options_match(ctrl, opts);
1885 if (found)
1886 break;
1887 }
1888 mutex_unlock(&nvme_rdma_ctrl_mutex);
1889
1890 return found;
1891}
1892
1861static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, 1893static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1862 struct nvmf_ctrl_options *opts) 1894 struct nvmf_ctrl_options *opts)
1863{ 1895{
@@ -1894,6 +1926,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1894 } 1926 }
1895 } 1927 }
1896 1928
1929 if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
1930 ret = -EALREADY;
1931 goto out_free_ctrl;
1932 }
1933
1897 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 1934 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
1898 0 /* no quirks, we're perfect! */); 1935 0 /* no quirks, we're perfect! */);
1899 if (ret) 1936 if (ret)
@@ -1902,7 +1939,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1902 INIT_DELAYED_WORK(&ctrl->reconnect_work, 1939 INIT_DELAYED_WORK(&ctrl->reconnect_work,
1903 nvme_rdma_reconnect_ctrl_work); 1940 nvme_rdma_reconnect_ctrl_work);
1904 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); 1941 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
1905 INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
1906 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); 1942 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
1907 1943
1908 ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ 1944 ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
@@ -1961,7 +1997,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1961 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", 1997 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
1962 ctrl->ctrl.opts->subsysnqn, &ctrl->addr); 1998 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
1963 1999
1964 kref_get(&ctrl->ctrl.kref); 2000 nvme_get_ctrl(&ctrl->ctrl);
1965 2001
1966 mutex_lock(&nvme_rdma_ctrl_mutex); 2002 mutex_lock(&nvme_rdma_ctrl_mutex);
1967 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); 2003 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
@@ -2006,7 +2042,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2006 dev_info(ctrl->ctrl.device, 2042 dev_info(ctrl->ctrl.device,
2007 "Removing ctrl: NQN \"%s\", addr %pISp\n", 2043 "Removing ctrl: NQN \"%s\", addr %pISp\n",
2008 ctrl->ctrl.opts->subsysnqn, &ctrl->addr); 2044 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2009 __nvme_rdma_del_ctrl(ctrl); 2045 nvme_delete_ctrl(&ctrl->ctrl);
2010 } 2046 }
2011 mutex_unlock(&nvme_rdma_ctrl_mutex); 2047 mutex_unlock(&nvme_rdma_ctrl_mutex);
2012 2048
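
The duplicate-connect check compares the target address, the service id (falling back to the default RDMA port when only one side specified one) and the optional host address. Below is a rough userspace model of that matching, where "4420" stands in for __stringify(NVME_RDMA_IP_PORT) and a NULL string means the option was not given; the field-by-field layout is simplified relative to nvmf_ctrl_options.

/* Userspace model of the duplicate-connect check added above.  Option
 * fields are plain strings here, with NULL meaning "not specified".
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MODEL_DEFAULT_TRSVCID "4420"	/* stands in for NVME_RDMA_IP_PORT */

static bool model_str_match(const char *a, const char *b, const char *dflt)
{
	/* fall back to the default port when only one side specified one */
	const char *ea = a ? a : dflt;
	const char *eb = b ? b : dflt;

	return strcmp(ea, eb) == 0;
}

static bool model_options_match(const char *traddr_a, const char *trsvcid_a,
				const char *host_traddr_a,
				const char *traddr_b, const char *trsvcid_b,
				const char *host_traddr_b)
{
	if (strcmp(traddr_a, traddr_b))
		return false;
	if (!model_str_match(trsvcid_a, trsvcid_b, MODEL_DEFAULT_TRSVCID))
		return false;
	/* host_traddr must either match exactly or be absent on both sides */
	if (host_traddr_a && host_traddr_b)
		return strcmp(host_traddr_a, host_traddr_b) == 0;
	return !host_traddr_a && !host_traddr_b;
}

int main(void)
{
	printf("same target, default port: %d\n",
	       model_options_match("10.0.0.1", NULL, NULL,
				   "10.0.0.1", "4420", NULL));
	printf("different host_traddr: %d\n",
	       model_options_match("10.0.0.1", NULL, "10.0.0.9",
				   "10.0.0.1", NULL, NULL));
	return 0;
}
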
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c4a0bf36e752..90dcdc40ac71 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
35static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, 35static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
36 struct nvme_smart_log *slog) 36 struct nvme_smart_log *slog)
37{ 37{
38 u16 status;
39 struct nvmet_ns *ns; 38 struct nvmet_ns *ns;
40 u64 host_reads, host_writes, data_units_read, data_units_written; 39 u64 host_reads, host_writes, data_units_read, data_units_written;
41 40
42 status = NVME_SC_SUCCESS;
43 ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); 41 ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
44 if (!ns) { 42 if (!ns) {
45 status = NVME_SC_INVALID_NS;
46 pr_err("nvmet : Could not find namespace id : %d\n", 43 pr_err("nvmet : Could not find namespace id : %d\n",
47 le32_to_cpu(req->cmd->get_log_page.nsid)); 44 le32_to_cpu(req->cmd->get_log_page.nsid));
48 goto out; 45 return NVME_SC_INVALID_NS;
49 } 46 }
50 47
51 host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); 48 host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
@@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
58 put_unaligned_le64(host_writes, &slog->host_writes[0]); 55 put_unaligned_le64(host_writes, &slog->host_writes[0]);
59 put_unaligned_le64(data_units_written, &slog->data_units_written[0]); 56 put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
60 nvmet_put_namespace(ns); 57 nvmet_put_namespace(ns);
61out: 58
62 return status; 59 return NVME_SC_SUCCESS;
63} 60}
64 61
65static u16 nvmet_get_smart_log_all(struct nvmet_req *req, 62static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
66 struct nvme_smart_log *slog) 63 struct nvme_smart_log *slog)
67{ 64{
68 u16 status;
69 u64 host_reads = 0, host_writes = 0; 65 u64 host_reads = 0, host_writes = 0;
70 u64 data_units_read = 0, data_units_written = 0; 66 u64 data_units_read = 0, data_units_written = 0;
71 struct nvmet_ns *ns; 67 struct nvmet_ns *ns;
72 struct nvmet_ctrl *ctrl; 68 struct nvmet_ctrl *ctrl;
73 69
74 status = NVME_SC_SUCCESS;
75 ctrl = req->sq->ctrl; 70 ctrl = req->sq->ctrl;
76 71
77 rcu_read_lock(); 72 rcu_read_lock();
@@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
91 put_unaligned_le64(host_writes, &slog->host_writes[0]); 86 put_unaligned_le64(host_writes, &slog->host_writes[0]);
92 put_unaligned_le64(data_units_written, &slog->data_units_written[0]); 87 put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
93 88
94 return status; 89 return NVME_SC_SUCCESS;
95} 90}
96 91
97static u16 nvmet_get_smart_log(struct nvmet_req *req, 92static u16 nvmet_get_smart_log(struct nvmet_req *req,
@@ -144,10 +139,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
144 } 139 }
145 smart_log = buf; 140 smart_log = buf;
146 status = nvmet_get_smart_log(req, smart_log); 141 status = nvmet_get_smart_log(req, smart_log);
147 if (status) { 142 if (status)
148 memset(buf, '\0', data_len);
149 goto err; 143 goto err;
150 }
151 break; 144 break;
152 case NVME_LOG_FW_SLOT: 145 case NVME_LOG_FW_SLOT:
153 /* 146 /*
@@ -300,7 +293,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
300 } 293 }
301 294
302 /* 295 /*
303 * nuse = ncap = nsze isn't aways true, but we have no way to find 296 * nuse = ncap = nsze isn't always true, but we have no way to find
304 * that out from the underlying device. 297 * that out from the underlying device.
305 */ 298 */
306 id->ncap = id->nuse = id->nsze = 299 id->ncap = id->nuse = id->nsze =
@@ -424,7 +417,7 @@ out:
424} 417}
425 418
426/* 419/*
427 * A "mimimum viable" abort implementation: the command is mandatory in the 420 * A "minimum viable" abort implementation: the command is mandatory in the
428 * spec, but we are not required to do any useful work. We couldn't really 421 * spec, but we are not required to do any useful work. We couldn't really
429 * do a useful abort, so don't bother even with waiting for the command 422 * do a useful abort, so don't bother even with waiting for the command
430 * to be executed and return immediately telling the command to abort 423
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 645ba7eee35d..b54748ad5f48 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
57 return 0; 57 return 0;
58} 58}
59 59
60static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
61{
62 struct nvmet_ns *ns;
63
64 if (list_empty(&subsys->namespaces))
65 return 0;
66
67 ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
68 return ns->nsid;
69}
70
60static u32 nvmet_async_event_result(struct nvmet_async_event *aen) 71static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
61{ 72{
62 return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); 73 return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
@@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
334 345
335 ns->enabled = false; 346 ns->enabled = false;
336 list_del_rcu(&ns->dev_link); 347 list_del_rcu(&ns->dev_link);
348 if (ns->nsid == subsys->max_nsid)
349 subsys->max_nsid = nvmet_max_nsid(subsys);
337 mutex_unlock(&subsys->lock); 350 mutex_unlock(&subsys->lock);
338 351
339 /* 352 /*
@@ -497,6 +510,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
497 req->ops = ops; 510 req->ops = ops;
498 req->sg = NULL; 511 req->sg = NULL;
499 req->sg_cnt = 0; 512 req->sg_cnt = 0;
513 req->transfer_len = 0;
500 req->rsp->status = 0; 514 req->rsp->status = 0;
501 515
502 /* no support for fused commands yet */ 516 /* no support for fused commands yet */
@@ -546,6 +560,15 @@ void nvmet_req_uninit(struct nvmet_req *req)
546} 560}
547EXPORT_SYMBOL_GPL(nvmet_req_uninit); 561EXPORT_SYMBOL_GPL(nvmet_req_uninit);
548 562
563void nvmet_req_execute(struct nvmet_req *req)
564{
565 if (unlikely(req->data_len != req->transfer_len))
566 nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
567 else
568 req->execute(req);
569}
570EXPORT_SYMBOL_GPL(nvmet_req_execute);
571
549static inline bool nvmet_cc_en(u32 cc) 572static inline bool nvmet_cc_en(u32 cc)
550{ 573{
551 return (cc >> NVME_CC_EN_SHIFT) & 0x1; 574 return (cc >> NVME_CC_EN_SHIFT) & 0x1;
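
nvmet_req_execute() now rejects a command whose declared data length does not match what the transport actually transferred, completing it with SGL Invalid Data plus DNR instead of running it. Below is a minimal model of that gate; the status values are stand-ins for the real protocol constants (NVME_SC_SGL_INVALID_DATA, NVME_SC_DNR).

/* Minimal model of the nvmet_req_execute() length check introduced above.
 * Status codes are stand-in values for this sketch; the real ones come
 * from the nvme protocol headers.
 */
#include <stdio.h>

#define MODEL_SC_SUCCESS		0x0
#define MODEL_SC_SGL_INVALID_DATA	0xf	/* stand-in value */
#define MODEL_SC_DNR			0x4000

struct model_req {
	unsigned int data_len;		/* length the command claims */
	unsigned int transfer_len;	/* length the transport delivered */
	void (*execute)(struct model_req *req);
	unsigned int status;
};

static void model_complete(struct model_req *req, unsigned int status)
{
	req->status = status;
}

/* Run the handler only if the declared and transferred lengths agree. */
static void model_req_execute(struct model_req *req)
{
	if (req->data_len != req->transfer_len)
		model_complete(req, MODEL_SC_SGL_INVALID_DATA | MODEL_SC_DNR);
	else
		req->execute(req);
}

static void model_read(struct model_req *req)
{
	model_complete(req, MODEL_SC_SUCCESS);
}

int main(void)
{
	struct model_req good = { 4096, 4096, model_read, 0 };
	struct model_req bad  = { 4096, 2048, model_read, 0 };

	model_req_execute(&good);
	model_req_execute(&bad);
	printf("matching lengths  -> status 0x%x\n", good.status);
	printf("mismatched length -> status 0x%x\n", bad.status);
	return 0;
}
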
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 58e010bdda3e..739b8feadc7d 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod {
76 dma_addr_t rspdma; 76 dma_addr_t rspdma;
77 struct scatterlist *data_sg; 77 struct scatterlist *data_sg;
78 int data_sg_cnt; 78 int data_sg_cnt;
79 u32 total_length;
80 u32 offset; 79 u32 offset;
81 enum nvmet_fcp_datadir io_dir; 80 enum nvmet_fcp_datadir io_dir;
82 bool active; 81 bool active;
@@ -150,6 +149,7 @@ struct nvmet_fc_tgt_assoc {
150 struct list_head a_list; 149 struct list_head a_list;
151 struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; 150 struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
152 struct kref ref; 151 struct kref ref;
152 struct work_struct del_work;
153}; 153};
154 154
155 155
@@ -232,6 +232,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport);
232static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); 232static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport);
233static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, 233static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
234 struct nvmet_fc_fcp_iod *fod); 234 struct nvmet_fc_fcp_iod *fod);
235static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc);
235 236
236 237
237/* *********************** FC-NVME DMA Handling **************************** */ 238/* *********************** FC-NVME DMA Handling **************************** */
@@ -802,6 +803,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport,
802 return NULL; 803 return NULL;
803} 804}
804 805
806static void
807nvmet_fc_delete_assoc(struct work_struct *work)
808{
809 struct nvmet_fc_tgt_assoc *assoc =
810 container_of(work, struct nvmet_fc_tgt_assoc, del_work);
811
812 nvmet_fc_delete_target_assoc(assoc);
813 nvmet_fc_tgt_a_put(assoc);
814}
815
805static struct nvmet_fc_tgt_assoc * 816static struct nvmet_fc_tgt_assoc *
806nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) 817nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport)
807{ 818{
@@ -826,6 +837,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport)
826 assoc->a_id = idx; 837 assoc->a_id = idx;
827 INIT_LIST_HEAD(&assoc->a_list); 838 INIT_LIST_HEAD(&assoc->a_list);
828 kref_init(&assoc->ref); 839 kref_init(&assoc->ref);
840 INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
829 841
830 while (needrandom) { 842 while (needrandom) {
831 get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); 843 get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID);
@@ -1118,8 +1130,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
1118 nvmet_fc_tgtport_put(tgtport); 1130 nvmet_fc_tgtport_put(tgtport);
1119 1131
1120 if (found_ctrl) { 1132 if (found_ctrl) {
1121 nvmet_fc_delete_target_assoc(assoc); 1133 schedule_work(&assoc->del_work);
1122 nvmet_fc_tgt_a_put(assoc);
1123 return; 1134 return;
1124 } 1135 }
1125 1136
@@ -1688,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1688 u32 page_len, length; 1699 u32 page_len, length;
1689 int i = 0; 1700 int i = 0;
1690 1701
1691 length = fod->total_length; 1702 length = fod->req.transfer_len;
1692 nent = DIV_ROUND_UP(length, PAGE_SIZE); 1703 nent = DIV_ROUND_UP(length, PAGE_SIZE);
1693 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); 1704 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
1694 if (!sg) 1705 if (!sg)
@@ -1777,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
1777 u32 rsn, rspcnt, xfr_length; 1788 u32 rsn, rspcnt, xfr_length;
1778 1789
1779 if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) 1790 if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP)
1780 xfr_length = fod->total_length; 1791 xfr_length = fod->req.transfer_len;
1781 else 1792 else
1782 xfr_length = fod->offset; 1793 xfr_length = fod->offset;
1783 1794
@@ -1803,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
1803 rspcnt = atomic_inc_return(&fod->queue->zrspcnt); 1814 rspcnt = atomic_inc_return(&fod->queue->zrspcnt);
1804 if (!(rspcnt % fod->queue->ersp_ratio) || 1815 if (!(rspcnt % fod->queue->ersp_ratio) ||
1805 sqe->opcode == nvme_fabrics_command || 1816 sqe->opcode == nvme_fabrics_command ||
1806 xfr_length != fod->total_length || 1817 xfr_length != fod->req.transfer_len ||
1807 (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || 1818 (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
1808 (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || 1819 (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
1809 queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) 1820 queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head)))
@@ -1880,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
1880 fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; 1891 fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC;
1881 1892
1882 tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, 1893 tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE,
1883 (fod->total_length - fod->offset)); 1894 (fod->req.transfer_len - fod->offset));
1884 fcpreq->transfer_length = tlen; 1895 fcpreq->transfer_length = tlen;
1885 fcpreq->transferred_length = 0; 1896 fcpreq->transferred_length = 0;
1886 fcpreq->fcp_error = 0; 1897 fcpreq->fcp_error = 0;
@@ -1894,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
1894 * combined xfr with response. 1905 * combined xfr with response.
1895 */ 1906 */
1896 if ((op == NVMET_FCOP_READDATA) && 1907 if ((op == NVMET_FCOP_READDATA) &&
1897 ((fod->offset + fcpreq->transfer_length) == fod->total_length) && 1908 ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) &&
1898 (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { 1909 (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) {
1899 fcpreq->op = NVMET_FCOP_READDATA_RSP; 1910 fcpreq->op = NVMET_FCOP_READDATA_RSP;
1900 nvmet_fc_prep_fcp_rsp(tgtport, fod); 1911 nvmet_fc_prep_fcp_rsp(tgtport, fod);
@@ -1974,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
1974 } 1985 }
1975 1986
1976 fod->offset += fcpreq->transferred_length; 1987 fod->offset += fcpreq->transferred_length;
1977 if (fod->offset != fod->total_length) { 1988 if (fod->offset != fod->req.transfer_len) {
1978 spin_lock_irqsave(&fod->flock, flags); 1989 spin_lock_irqsave(&fod->flock, flags);
1979 fod->writedataactive = true; 1990 fod->writedataactive = true;
1980 spin_unlock_irqrestore(&fod->flock, flags); 1991 spin_unlock_irqrestore(&fod->flock, flags);
@@ -1986,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
1986 } 1997 }
1987 1998
1988 /* data transfer complete, resume with nvmet layer */ 1999 /* data transfer complete, resume with nvmet layer */
1989 2000 nvmet_req_execute(&fod->req);
1990 fod->req.execute(&fod->req);
1991
1992 break; 2001 break;
1993 2002
1994 case NVMET_FCOP_READDATA: 2003 case NVMET_FCOP_READDATA:
@@ -2011,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
2011 } 2020 }
2012 2021
2013 fod->offset += fcpreq->transferred_length; 2022 fod->offset += fcpreq->transferred_length;
2014 if (fod->offset != fod->total_length) { 2023 if (fod->offset != fod->req.transfer_len) {
2015 /* transfer the next chunk */ 2024 /* transfer the next chunk */
2016 nvmet_fc_transfer_fcp_data(tgtport, fod, 2025 nvmet_fc_transfer_fcp_data(tgtport, fod,
2017 NVMET_FCOP_READDATA); 2026 NVMET_FCOP_READDATA);
@@ -2148,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2148 2157
2149 fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; 2158 fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done;
2150 2159
2151 fod->total_length = be32_to_cpu(cmdiu->data_len); 2160 fod->req.transfer_len = be32_to_cpu(cmdiu->data_len);
2152 if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { 2161 if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) {
2153 fod->io_dir = NVMET_FCP_WRITE; 2162 fod->io_dir = NVMET_FCP_WRITE;
2154 if (!nvme_is_write(&cmdiu->sqe)) 2163 if (!nvme_is_write(&cmdiu->sqe))
@@ -2159,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2159 goto transport_error; 2168 goto transport_error;
2160 } else { 2169 } else {
2161 fod->io_dir = NVMET_FCP_NODATA; 2170 fod->io_dir = NVMET_FCP_NODATA;
2162 if (fod->total_length) 2171 if (fod->req.transfer_len)
2163 goto transport_error; 2172 goto transport_error;
2164 } 2173 }
2165 2174
@@ -2167,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2167 fod->req.rsp = &fod->rspiubuf.cqe; 2176 fod->req.rsp = &fod->rspiubuf.cqe;
2168 fod->req.port = fod->queue->port; 2177 fod->req.port = fod->queue->port;
2169 2178
2170 /* ensure nvmet handlers will set cmd handler callback */
2171 fod->req.execute = NULL;
2172
2173 /* clear any response payload */ 2179 /* clear any response payload */
2174 memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); 2180 memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf));
2175 2181
@@ -2189,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2189 /* keep a running counter of tail position */ 2195 /* keep a running counter of tail position */
2190 atomic_inc(&fod->queue->sqtail); 2196 atomic_inc(&fod->queue->sqtail);
2191 2197
2192 if (fod->total_length) { 2198 if (fod->req.transfer_len) {
2193 ret = nvmet_fc_alloc_tgt_pgs(fod); 2199 ret = nvmet_fc_alloc_tgt_pgs(fod);
2194 if (ret) { 2200 if (ret) {
2195 nvmet_req_complete(&fod->req, ret); 2201 nvmet_req_complete(&fod->req, ret);
@@ -2212,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2212 * can invoke the nvmet_layer now. If read data, cmd completion will 2218 * can invoke the nvmet_layer now. If read data, cmd completion will
2213 * push the data 2219 * push the data
2214 */ 2220 */
2215 2221 nvmet_req_execute(&fod->req);
2216 fod->req.execute(&fod->req);
2217
2218 return; 2222 return;
2219 2223
2220transport_error: 2224transport_error:
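
Editor's note: the fc.c hunks above move association teardown out of the LLDD callback path and onto a workqueue (INIT_WORK + schedule_work on the new del_work member). The deferred handler recovers the association from the embedded work item with container_of(). A small userspace sketch of that recovery, with made-up type names and the handler invoked directly rather than from a workqueue:

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins; the real code embeds a struct work_struct and uses
 * the kernel's container_of(). */
struct work_struct { int pending; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct tgt_assoc {
	int association_id;
	struct work_struct del_work;	/* queued instead of tearing down inline */
};

static void delete_assoc_work(struct work_struct *work)
{
	struct tgt_assoc *assoc = container_of(work, struct tgt_assoc, del_work);

	printf("deleting association %d from workqueue context\n",
	       assoc->association_id);
}

int main(void)
{
	struct tgt_assoc a = { .association_id = 7 };

	/* In the kernel this would be schedule_work(&a.del_work); here the
	 * handler is called directly just to show the container_of() recovery. */
	delete_assoc_work(&a.del_work);
	return 0;
}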
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 0d4c23dc4532..0a4372a016f2 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
33 req->ns->blksize_shift; 33 req->ns->blksize_shift;
34} 34}
35 35
36static void nvmet_inline_bio_init(struct nvmet_req *req)
37{
38 struct bio *bio = &req->inline_bio;
39
40 bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC);
41}
42
43static void nvmet_execute_rw(struct nvmet_req *req) 36static void nvmet_execute_rw(struct nvmet_req *req)
44{ 37{
45 int sg_cnt = req->sg_cnt; 38 int sg_cnt = req->sg_cnt;
39 struct bio *bio = &req->inline_bio;
46 struct scatterlist *sg; 40 struct scatterlist *sg;
47 struct bio *bio;
48 sector_t sector; 41 sector_t sector;
49 blk_qc_t cookie; 42 blk_qc_t cookie;
50 int op, op_flags = 0, i; 43 int op, op_flags = 0, i;
@@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req)
66 sector = le64_to_cpu(req->cmd->rw.slba); 59 sector = le64_to_cpu(req->cmd->rw.slba);
67 sector <<= (req->ns->blksize_shift - 9); 60 sector <<= (req->ns->blksize_shift - 9);
68 61
69 nvmet_inline_bio_init(req); 62 bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
70 bio = &req->inline_bio;
71 bio_set_dev(bio, req->ns->bdev); 63 bio_set_dev(bio, req->ns->bdev);
72 bio->bi_iter.bi_sector = sector; 64 bio->bi_iter.bi_sector = sector;
73 bio->bi_private = req; 65 bio->bi_private = req;
@@ -94,16 +86,14 @@ static void nvmet_execute_rw(struct nvmet_req *req)
94 86
95 cookie = submit_bio(bio); 87 cookie = submit_bio(bio);
96 88
97 blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); 89 blk_poll(bdev_get_queue(req->ns->bdev), cookie);
98} 90}
99 91
100static void nvmet_execute_flush(struct nvmet_req *req) 92static void nvmet_execute_flush(struct nvmet_req *req)
101{ 93{
102 struct bio *bio; 94 struct bio *bio = &req->inline_bio;
103
104 nvmet_inline_bio_init(req);
105 bio = &req->inline_bio;
106 95
96 bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
107 bio_set_dev(bio, req->ns->bdev); 97 bio_set_dev(bio, req->ns->bdev);
108 bio->bi_private = req; 98 bio->bi_private = req;
109 bio->bi_end_io = nvmet_bio_done; 99 bio->bi_end_io = nvmet_bio_done;
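
Editor's note: the io-cmd.c hunks drop the nvmet_inline_bio_init() helper and open-code bio_init() with ARRAY_SIZE() over the inline bio_vec array. A tiny sketch of that sizing idiom; the vector length of 8 is an assumption used only for illustration:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct vec { void *page; unsigned int len, offset; };	/* stand-in for bio_vec */
struct breq {
	struct vec inline_bvec[8];	/* assumed inline-vector count, purely illustrative */
};

static void init_with_inline_vecs(struct breq *req)
{
	/* Models bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)):
	 * the capacity comes from the array itself, not a separate constant. */
	printf("initialising with %zu inline vectors\n",
	       ARRAY_SIZE(req->inline_bvec));
}

int main(void)
{
	struct breq req;

	init_with_inline_vecs(&req);
	return 0;
}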
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 92628c432926..96d390416789 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -23,14 +23,6 @@
23 23
24#define NVME_LOOP_MAX_SEGMENTS 256 24#define NVME_LOOP_MAX_SEGMENTS 256
25 25
26/*
27 * We handle AEN commands ourselves and don't even let the
28 * block layer know about them.
29 */
30#define NVME_LOOP_NR_AEN_COMMANDS 1
31#define NVME_LOOP_AQ_BLKMQ_DEPTH \
32 (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS)
33
34struct nvme_loop_iod { 26struct nvme_loop_iod {
35 struct nvme_request nvme_req; 27 struct nvme_request nvme_req;
36 struct nvme_command cmd; 28 struct nvme_command cmd;
@@ -53,7 +45,6 @@ struct nvme_loop_ctrl {
53 struct nvme_ctrl ctrl; 45 struct nvme_ctrl ctrl;
54 46
55 struct nvmet_ctrl *target_ctrl; 47 struct nvmet_ctrl *target_ctrl;
56 struct work_struct delete_work;
57}; 48};
58 49
59static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) 50static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -113,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
113 * for them but rather special case them here. 104 * for them but rather special case them here.
114 */ 105 */
115 if (unlikely(nvme_loop_queue_idx(queue) == 0 && 106 if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
116 cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { 107 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
117 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 108 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
118 &cqe->result); 109 &cqe->result);
119 } else { 110 } else {
@@ -136,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work)
136 struct nvme_loop_iod *iod = 127 struct nvme_loop_iod *iod =
137 container_of(work, struct nvme_loop_iod, work); 128 container_of(work, struct nvme_loop_iod, work);
138 129
139 iod->req.execute(&iod->req); 130 nvmet_req_execute(&iod->req);
140} 131}
141 132
142static enum blk_eh_timer_return 133static enum blk_eh_timer_return
@@ -185,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
185 176
186 iod->req.sg = iod->sg_table.sgl; 177 iod->req.sg = iod->sg_table.sgl;
187 iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); 178 iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
179 iod->req.transfer_len = blk_rq_bytes(req);
188 } 180 }
189 181
190 blk_mq_start_request(req); 182 blk_mq_start_request(req);
@@ -193,7 +185,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
193 return BLK_STS_OK; 185 return BLK_STS_OK;
194} 186}
195 187
196static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) 188static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
197{ 189{
198 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); 190 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg);
199 struct nvme_loop_queue *queue = &ctrl->queues[0]; 191 struct nvme_loop_queue *queue = &ctrl->queues[0];
@@ -201,7 +193,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
201 193
202 memset(&iod->cmd, 0, sizeof(iod->cmd)); 194 memset(&iod->cmd, 0, sizeof(iod->cmd));
203 iod->cmd.common.opcode = nvme_admin_async_event; 195 iod->cmd.common.opcode = nvme_admin_async_event;
204 iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; 196 iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
205 iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; 197 iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
206 198
207 if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, 199 if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
@@ -357,7 +349,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
357 349
358 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); 350 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
359 ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; 351 ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
360 ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; 352 ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
361 ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ 353 ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
362 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; 354 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
363 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + 355 ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
@@ -365,6 +357,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
365 ctrl->admin_tag_set.driver_data = ctrl; 357 ctrl->admin_tag_set.driver_data = ctrl;
366 ctrl->admin_tag_set.nr_hw_queues = 1; 358 ctrl->admin_tag_set.nr_hw_queues = 1;
367 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; 359 ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
360 ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
368 361
369 ctrl->queues[0].ctrl = ctrl; 362 ctrl->queues[0].ctrl = ctrl;
370 error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); 363 error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
@@ -438,41 +431,9 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
438 nvme_loop_destroy_admin_queue(ctrl); 431 nvme_loop_destroy_admin_queue(ctrl);
439} 432}
440 433
441static void nvme_loop_del_ctrl_work(struct work_struct *work) 434static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl)
442{ 435{
443 struct nvme_loop_ctrl *ctrl = container_of(work, 436 nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl));
444 struct nvme_loop_ctrl, delete_work);
445
446 nvme_stop_ctrl(&ctrl->ctrl);
447 nvme_remove_namespaces(&ctrl->ctrl);
448 nvme_loop_shutdown_ctrl(ctrl);
449 nvme_uninit_ctrl(&ctrl->ctrl);
450 nvme_put_ctrl(&ctrl->ctrl);
451}
452
453static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl)
454{
455 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
456 return -EBUSY;
457
458 if (!queue_work(nvme_wq, &ctrl->delete_work))
459 return -EBUSY;
460
461 return 0;
462}
463
464static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl)
465{
466 struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
467 int ret;
468
469 ret = __nvme_loop_del_ctrl(ctrl);
470 if (ret)
471 return ret;
472
473 flush_work(&ctrl->delete_work);
474
475 return 0;
476} 437}
477 438
478static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) 439static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
@@ -482,7 +443,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
482 mutex_lock(&nvme_loop_ctrl_mutex); 443 mutex_lock(&nvme_loop_ctrl_mutex);
483 list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { 444 list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) {
484 if (ctrl->ctrl.cntlid == nctrl->cntlid) 445 if (ctrl->ctrl.cntlid == nctrl->cntlid)
485 __nvme_loop_del_ctrl(ctrl); 446 nvme_delete_ctrl(&ctrl->ctrl);
486 } 447 }
487 mutex_unlock(&nvme_loop_ctrl_mutex); 448 mutex_unlock(&nvme_loop_ctrl_mutex);
488} 449}
@@ -538,7 +499,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
538 .reg_write32 = nvmf_reg_write32, 499 .reg_write32 = nvmf_reg_write32,
539 .free_ctrl = nvme_loop_free_ctrl, 500 .free_ctrl = nvme_loop_free_ctrl,
540 .submit_async_event = nvme_loop_submit_async_event, 501 .submit_async_event = nvme_loop_submit_async_event,
541 .delete_ctrl = nvme_loop_del_ctrl, 502 .delete_ctrl = nvme_loop_delete_ctrl_host,
542}; 503};
543 504
544static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) 505static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -600,7 +561,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
600 ctrl->ctrl.opts = opts; 561 ctrl->ctrl.opts = opts;
601 INIT_LIST_HEAD(&ctrl->list); 562 INIT_LIST_HEAD(&ctrl->list);
602 563
603 INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work);
604 INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); 564 INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work);
605 565
606 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 566 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
@@ -641,7 +601,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
641 dev_info(ctrl->ctrl.device, 601 dev_info(ctrl->ctrl.device,
642 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); 602 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
643 603
644 kref_get(&ctrl->ctrl.kref); 604 nvme_get_ctrl(&ctrl->ctrl);
645 605
646 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 606 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
647 WARN_ON_ONCE(!changed); 607 WARN_ON_ONCE(!changed);
@@ -730,7 +690,7 @@ static void __exit nvme_loop_cleanup_module(void)
730 690
731 mutex_lock(&nvme_loop_ctrl_mutex); 691 mutex_lock(&nvme_loop_ctrl_mutex);
732 list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) 692 list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list)
733 __nvme_loop_del_ctrl(ctrl); 693 nvme_delete_ctrl(&ctrl->ctrl);
734 mutex_unlock(&nvme_loop_ctrl_mutex); 694 mutex_unlock(&nvme_loop_ctrl_mutex);
735 695
736 flush_workqueue(nvme_wq); 696 flush_workqueue(nvme_wq);
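
Editor's note: the loop.c hunks replace the driver-private AQ_BLKMQ_DEPTH with the shared NVME_AQ_BLK_MQ_DEPTH and hand controller deletion to the core nvme_delete_ctrl(). The admin queue keeps one command slot outside blk-mq for the driver-issued async event, so a completion whose command id falls past the blk-mq depth must be an AEN. A rough userspace model of that classification; the depth values are assumptions for illustration only:

#include <stdio.h>

/* Illustrative values: a 32-entry admin queue with one slot reserved
 * outside blk-mq for the async event request. */
#define MODEL_AQ_DEPTH		32
#define MODEL_AQ_BLK_MQ_DEPTH	(MODEL_AQ_DEPTH - 1)

static int completion_is_async_event(unsigned int qid, unsigned int command_id)
{
	/* Mirrors the check in nvme_loop_queue_response(): only admin-queue
	 * completions with an out-of-range command id belong to the AEN slot. */
	return qid == 0 && command_id >= MODEL_AQ_BLK_MQ_DEPTH;
}

int main(void)
{
	printf("blk-mq tags on the admin queue: %d\n", MODEL_AQ_BLK_MQ_DEPTH);
	printf("cid 31 on qid 0 is AEN: %d\n", completion_is_async_event(0, 31));
	printf("cid 10 on qid 0 is AEN: %d\n", completion_is_async_event(0, 10));
	return 0;
}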
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 87e429bfcd8a..417f6c0331cc 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -223,7 +223,10 @@ struct nvmet_req {
223 struct bio inline_bio; 223 struct bio inline_bio;
224 struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; 224 struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
225 int sg_cnt; 225 int sg_cnt;
226 /* data length as parsed from the command: */
226 size_t data_len; 227 size_t data_len;
228 /* data length as parsed from the SGL descriptor: */
229 size_t transfer_len;
227 230
228 struct nvmet_port *port; 231 struct nvmet_port *port;
229 232
@@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
266bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, 269bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
267 struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); 270 struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
268void nvmet_req_uninit(struct nvmet_req *req); 271void nvmet_req_uninit(struct nvmet_req *req);
272void nvmet_req_execute(struct nvmet_req *req);
269void nvmet_req_complete(struct nvmet_req *req, u16 status); 273void nvmet_req_complete(struct nvmet_req *req, u16 status);
270 274
271void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, 275void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@@ -314,7 +318,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
314u32 nvmet_get_log_page_len(struct nvme_command *cmd); 318u32 nvmet_get_log_page_len(struct nvme_command *cmd);
315 319
316#define NVMET_QUEUE_SIZE 1024 320#define NVMET_QUEUE_SIZE 1024
317#define NVMET_NR_QUEUES 64 321#define NVMET_NR_QUEUES 128
318#define NVMET_MAX_CMD NVMET_QUEUE_SIZE 322#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
319#define NVMET_KAS 10 323#define NVMET_KAS 10
320#define NVMET_DISC_KATO 120 324#define NVMET_DISC_KATO 120
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 76d2bb793afe..49912909c298 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p)
148static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) 148static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
149{ 149{
150 return nvme_is_write(rsp->req.cmd) && 150 return nvme_is_write(rsp->req.cmd) &&
151 rsp->req.data_len && 151 rsp->req.transfer_len &&
152 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); 152 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
153} 153}
154 154
155static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) 155static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
156{ 156{
157 return !nvme_is_write(rsp->req.cmd) && 157 return !nvme_is_write(rsp->req.cmd) &&
158 rsp->req.data_len && 158 rsp->req.transfer_len &&
159 !rsp->req.rsp->status && 159 !rsp->req.rsp->status &&
160 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); 160 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
161} 161}
@@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
577 return; 577 return;
578 } 578 }
579 579
580 rsp->req.execute(&rsp->req); 580 nvmet_req_execute(&rsp->req);
581} 581}
582 582
583static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, 583static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
@@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
609 609
610 nvmet_rdma_use_inline_sg(rsp, len, off); 610 nvmet_rdma_use_inline_sg(rsp, len, off);
611 rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; 611 rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
612 rsp->req.transfer_len += len;
612 return 0; 613 return 0;
613} 614}
614 615
@@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
636 nvmet_data_dir(&rsp->req)); 637 nvmet_data_dir(&rsp->req));
637 if (ret < 0) 638 if (ret < 0)
638 return NVME_SC_INTERNAL; 639 return NVME_SC_INTERNAL;
640 rsp->req.transfer_len += len;
639 rsp->n_rdma += ret; 641 rsp->n_rdma += ret;
640 642
641 if (invalidate) { 643 if (invalidate) {
@@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
693 queue->cm_id->port_num, &rsp->read_cqe, NULL)) 695 queue->cm_id->port_num, &rsp->read_cqe, NULL))
694 nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); 696 nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
695 } else { 697 } else {
696 rsp->req.execute(&rsp->req); 698 nvmet_req_execute(&rsp->req);
697 } 699 }
698 700
699 return true; 701 return true;
@@ -1512,15 +1514,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = {
1512 1514
1513static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) 1515static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1514{ 1516{
1515 struct nvmet_rdma_queue *queue; 1517 struct nvmet_rdma_queue *queue, *tmp;
1516 1518
1517 /* Device is being removed, delete all queues using this device */ 1519 /* Device is being removed, delete all queues using this device */
1518 mutex_lock(&nvmet_rdma_queue_mutex); 1520 mutex_lock(&nvmet_rdma_queue_mutex);
1519 list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { 1521 list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1522 queue_list) {
1520 if (queue->dev->device != ib_device) 1523 if (queue->dev->device != ib_device)
1521 continue; 1524 continue;
1522 1525
1523 pr_info("Removing queue %d\n", queue->idx); 1526 pr_info("Removing queue %d\n", queue->idx);
1527 list_del_init(&queue->queue_list);
1524 __nvmet_rdma_queue_disconnect(queue); 1528 __nvmet_rdma_queue_disconnect(queue);
1525 } 1529 }
1526 mutex_unlock(&nvmet_rdma_queue_mutex); 1530 mutex_unlock(&nvmet_rdma_queue_mutex);
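
Editor's note: nvmet_rdma_remove_one() above now uses list_for_each_entry_safe() plus list_del_init() because it unlinks queues while walking the list; the plain iterator would dereference a just-removed entry. A self-contained sketch of the same delete-while-iterating pattern on a toy singly linked list:

#include <stdio.h>
#include <stdlib.h>

/* Toy model: keep a pointer to the next node before removing the current
 * one, which is what list_for_each_entry_safe() does for kernel lists. */
struct node { int idx; struct node *next; };

static void remove_matching(struct node **head, int match)
{
	struct node *cur = *head, *tmp;

	while (cur) {
		tmp = cur->next;	/* saved before cur may be freed */
		if (cur->idx == match) {
			*head = tmp;	/* unlink, as list_del_init() would */
			free(cur);
		} else {
			head = &cur->next;
		}
		cur = tmp;
	}
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 3; i >= 1; i--) {
		struct node *n = malloc(sizeof(*n));

		n->idx = i;
		n->next = head;
		head = n;
	}
	remove_matching(&head, 2);
	for (struct node *n = head; n; n = n->next)
		printf("remaining queue %d\n", n->idx);
	return 0;
}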
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 41366339b950..766955318005 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -130,7 +130,8 @@ config CHR_DEV_OSST
130 130
131config BLK_DEV_SR 131config BLK_DEV_SR
132 tristate "SCSI CDROM support" 132 tristate "SCSI CDROM support"
133 depends on SCSI 133 depends on SCSI && BLK_DEV
134 select CDROM
134 ---help--- 135 ---help---
135 If you want to use a CD or DVD drive attached to your computer 136 If you want to use a CD or DVD drive attached to your computer
136 by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO 137 by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index c17677f494af..3e02bc3a7c3f 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport)
3246 continue; 3246 continue;
3247 if (ndlp->rport) 3247 if (ndlp->rport)
3248 ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; 3248 ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo;
3249#if (IS_ENABLED(CONFIG_NVME_FC))
3250 if (ndlp->nrport)
3251 nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport,
3252 vport->cfg_devloss_tmo);
3253#endif
3249 } 3254 }
3250 spin_unlock_irq(shost->host_lock); 3255 spin_unlock_irq(shost->host_lock);
3251} 3256}
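
Editor's note: the lpfc hunk propagates a devloss_tmo change to the FC-NVMe remote port as well as the SCSI rport, but only when the FC-NVMe transport is built in, hence the IS_ENABLED(CONFIG_NVME_FC) guard. A minimal userspace sketch of that compile-time gate; the macro name is a stand-in for the Kconfig-driven IS_ENABLED():

#include <stdio.h>

#define MODEL_NVME_FC_ENABLED 1		/* assumed stand-in for CONFIG_NVME_FC */

static void update_devloss(int has_nvme_rport, unsigned int devloss_tmo)
{
#if MODEL_NVME_FC_ENABLED
	if (has_nvme_rport)
		printf("propagating devloss_tmo=%u to the NVMe remote port\n",
		       devloss_tmo);
#endif
	printf("updated SCSI rport devloss_tmo=%u\n", devloss_tmo);
}

int main(void)
{
	update_devloss(1, 30);
	update_devloss(0, 60);
	return 0;
}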
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index bcc1694cebcd..54de24c785dd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
252 struct scsi_request *rq; 252 struct scsi_request *rq;
253 int ret = DRIVER_ERROR << 24; 253 int ret = DRIVER_ERROR << 24;
254 254
255 req = blk_get_request(sdev->request_queue, 255 req = blk_get_request_flags(sdev->request_queue,
256 data_direction == DMA_TO_DEVICE ? 256 data_direction == DMA_TO_DEVICE ?
257 REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); 257 REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT);
258 if (IS_ERR(req)) 258 if (IS_ERR(req))
259 return ret; 259 return ret;
260 rq = scsi_req(req); 260 rq = scsi_req(req);
@@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
268 rq->retries = retries; 268 rq->retries = retries;
269 req->timeout = timeout; 269 req->timeout = timeout;
270 req->cmd_flags |= flags; 270 req->cmd_flags |= flags;
271 req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; 271 req->rq_flags |= rq_flags | RQF_QUIET;
272 272
273 /* 273 /*
274 * head injection *required* here otherwise quiesce won't work 274 * head injection *required* here otherwise quiesce won't work
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
1301 /* 1301 /*
1302 * If the devices is blocked we defer normal commands. 1302 * If the devices is blocked we defer normal commands.
1303 */ 1303 */
1304 if (!(req->rq_flags & RQF_PREEMPT)) 1304 if (req && !(req->rq_flags & RQF_PREEMPT))
1305 ret = BLKPREP_DEFER; 1305 ret = BLKPREP_DEFER;
1306 break; 1306 break;
1307 default: 1307 default:
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
1310 * special commands. In particular any user initiated 1310 * special commands. In particular any user initiated
1311 * command is not allowed. 1311 * command is not allowed.
1312 */ 1312 */
1313 if (!(req->rq_flags & RQF_PREEMPT)) 1313 if (req && !(req->rq_flags & RQF_PREEMPT))
1314 ret = BLKPREP_KILL; 1314 ret = BLKPREP_KILL;
1315 break; 1315 break;
1316 } 1316 }
@@ -1940,6 +1940,33 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
1940 blk_mq_complete_request(cmd->request); 1940 blk_mq_complete_request(cmd->request);
1941} 1941}
1942 1942
1943static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
1944{
1945 struct request_queue *q = hctx->queue;
1946 struct scsi_device *sdev = q->queuedata;
1947
1948 atomic_dec(&sdev->device_busy);
1949 put_device(&sdev->sdev_gendev);
1950}
1951
1952static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
1953{
1954 struct request_queue *q = hctx->queue;
1955 struct scsi_device *sdev = q->queuedata;
1956
1957 if (!get_device(&sdev->sdev_gendev))
1958 goto out;
1959 if (!scsi_dev_queue_ready(q, sdev))
1960 goto out_put_device;
1961
1962 return true;
1963
1964out_put_device:
1965 put_device(&sdev->sdev_gendev);
1966out:
1967 return false;
1968}
1969
1943static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, 1970static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
1944 const struct blk_mq_queue_data *bd) 1971 const struct blk_mq_queue_data *bd)
1945{ 1972{
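
Editor's note: the hunk above adds the new ->get_budget()/->put_budget() blk-mq hooks for scsi-mq, so the driver claims per-device capacity before a request is dispatched and releases it when dispatch cannot proceed. A toy userspace model of that budget handshake; the counters stand in for sdev->device_busy and the device reference, and the names are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct sdev_model { int device_busy; int queue_depth; };

static bool get_budget(struct sdev_model *sdev)
{
	if (sdev->device_busy >= sdev->queue_depth)
		return false;		/* no budget: blk-mq holds the request back */
	sdev->device_busy++;
	return true;
}

static void put_budget(struct sdev_model *sdev)
{
	sdev->device_busy--;		/* dispatch failed, give the slot back */
}

int main(void)
{
	struct sdev_model sdev = { .device_busy = 0, .queue_depth = 2 };

	printf("first:  %d\n", get_budget(&sdev));
	printf("second: %d\n", get_budget(&sdev));
	printf("third:  %d\n", get_budget(&sdev));	/* rejected, device is full */
	put_budget(&sdev);
	printf("after put: %d\n", get_budget(&sdev));
	return 0;
}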
@@ -1953,16 +1980,11 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
1953 1980
1954 ret = prep_to_mq(scsi_prep_state_check(sdev, req)); 1981 ret = prep_to_mq(scsi_prep_state_check(sdev, req));
1955 if (ret != BLK_STS_OK) 1982 if (ret != BLK_STS_OK)
1956 goto out; 1983 goto out_put_budget;
1957 1984
1958 ret = BLK_STS_RESOURCE; 1985 ret = BLK_STS_RESOURCE;
1959 if (!get_device(&sdev->sdev_gendev))
1960 goto out;
1961
1962 if (!scsi_dev_queue_ready(q, sdev))
1963 goto out_put_device;
1964 if (!scsi_target_queue_ready(shost, sdev)) 1986 if (!scsi_target_queue_ready(shost, sdev))
1965 goto out_dec_device_busy; 1987 goto out_put_budget;
1966 if (!scsi_host_queue_ready(q, shost, sdev)) 1988 if (!scsi_host_queue_ready(q, shost, sdev))
1967 goto out_dec_target_busy; 1989 goto out_dec_target_busy;
1968 1990
@@ -1993,15 +2015,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
1993 return BLK_STS_OK; 2015 return BLK_STS_OK;
1994 2016
1995out_dec_host_busy: 2017out_dec_host_busy:
1996 atomic_dec(&shost->host_busy); 2018 atomic_dec(&shost->host_busy);
1997out_dec_target_busy: 2019out_dec_target_busy:
1998 if (scsi_target(sdev)->can_queue > 0) 2020 if (scsi_target(sdev)->can_queue > 0)
1999 atomic_dec(&scsi_target(sdev)->target_busy); 2021 atomic_dec(&scsi_target(sdev)->target_busy);
2000out_dec_device_busy: 2022out_put_budget:
2001 atomic_dec(&sdev->device_busy); 2023 scsi_mq_put_budget(hctx);
2002out_put_device:
2003 put_device(&sdev->sdev_gendev);
2004out:
2005 switch (ret) { 2024 switch (ret) {
2006 case BLK_STS_OK: 2025 case BLK_STS_OK:
2007 break; 2026 break;
@@ -2205,6 +2224,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
2205} 2224}
2206 2225
2207static const struct blk_mq_ops scsi_mq_ops = { 2226static const struct blk_mq_ops scsi_mq_ops = {
2227 .get_budget = scsi_mq_get_budget,
2228 .put_budget = scsi_mq_put_budget,
2208 .queue_rq = scsi_queue_rq, 2229 .queue_rq = scsi_queue_rq,
2209 .complete = scsi_softirq_done, 2230 .complete = scsi_softirq_done,
2210 .timeout = scsi_timeout, 2231 .timeout = scsi_timeout,
@@ -2919,21 +2940,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
2919int 2940int
2920scsi_device_quiesce(struct scsi_device *sdev) 2941scsi_device_quiesce(struct scsi_device *sdev)
2921{ 2942{
2943 struct request_queue *q = sdev->request_queue;
2922 int err; 2944 int err;
2923 2945
2946 /*
2947 * It is allowed to call scsi_device_quiesce() multiple times from
2948 * the same context but concurrent scsi_device_quiesce() calls are
2949 * not allowed.
2950 */
2951 WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
2952
2953 blk_set_preempt_only(q);
2954
2955 blk_mq_freeze_queue(q);
2956 /*
2957 * Ensure that the effect of blk_set_preempt_only() will be visible
2958 * for percpu_ref_tryget() callers that occur after the queue
2959 * unfreeze even if the queue was already frozen before this function
2960 * was called. See also https://lwn.net/Articles/573497/.
2961 */
2962 synchronize_rcu();
2963 blk_mq_unfreeze_queue(q);
2964
2924 mutex_lock(&sdev->state_mutex); 2965 mutex_lock(&sdev->state_mutex);
2925 err = scsi_device_set_state(sdev, SDEV_QUIESCE); 2966 err = scsi_device_set_state(sdev, SDEV_QUIESCE);
2967 if (err == 0)
2968 sdev->quiesced_by = current;
2969 else
2970 blk_clear_preempt_only(q);
2926 mutex_unlock(&sdev->state_mutex); 2971 mutex_unlock(&sdev->state_mutex);
2927 2972
2928 if (err) 2973 return err;
2929 return err;
2930
2931 scsi_run_queue(sdev->request_queue);
2932 while (atomic_read(&sdev->device_busy)) {
2933 msleep_interruptible(200);
2934 scsi_run_queue(sdev->request_queue);
2935 }
2936 return 0;
2937} 2974}
2938EXPORT_SYMBOL(scsi_device_quiesce); 2975EXPORT_SYMBOL(scsi_device_quiesce);
2939 2976
@@ -2953,9 +2990,11 @@ void scsi_device_resume(struct scsi_device *sdev)
2953 * device deleted during suspend) 2990 * device deleted during suspend)
2954 */ 2991 */
2955 mutex_lock(&sdev->state_mutex); 2992 mutex_lock(&sdev->state_mutex);
2956 if (sdev->sdev_state == SDEV_QUIESCE && 2993 WARN_ON_ONCE(!sdev->quiesced_by);
2957 scsi_device_set_state(sdev, SDEV_RUNNING) == 0) 2994 sdev->quiesced_by = NULL;
2958 scsi_run_queue(sdev->request_queue); 2995 blk_clear_preempt_only(sdev->request_queue);
2996 if (sdev->sdev_state == SDEV_QUIESCE)
2997 scsi_device_set_state(sdev, SDEV_RUNNING);
2959 mutex_unlock(&sdev->state_mutex); 2998 mutex_unlock(&sdev->state_mutex);
2960} 2999}
2961EXPORT_SYMBOL(scsi_device_resume); 3000EXPORT_SYMBOL(scsi_device_resume);
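
Editor's note: the quiesce/resume hunks above replace the old "run the queue and poll device_busy" loop with a preempt-only gate: scsi_device_quiesce() marks the queue preempt-only and freezes/unfreezes it so the flag is observed, after which only BLK_MQ_REQ_PREEMPT requests (power management, SPI domain validation) are admitted until scsi_device_resume() clears the flag. A toy userspace model of the admission check, with made-up names:

#include <stdbool.h>
#include <stdio.h>

struct queue_model { bool preempt_only; };

static bool queue_enter(struct queue_model *q, bool preempt_request)
{
	if (q->preempt_only && !preempt_request)
		return false;		/* normal I/O waits until resume */
	return true;			/* admitted */
}

int main(void)
{
	struct queue_model q = { .preempt_only = false };

	printf("normal I/O before quiesce:     %d\n", queue_enter(&q, false));
	q.preempt_only = true;			/* models scsi_device_quiesce() */
	printf("normal I/O while quiesced:     %d\n", queue_enter(&q, false));
	printf("PREEMPT request while quiesced: %d\n", queue_enter(&q, true));
	q.preempt_only = false;			/* models scsi_device_resume() */
	printf("normal I/O after resume:       %d\n", queue_enter(&q, false));
	return 0;
}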
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index aa28874e8fb9..f098877eed4a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd)
217 if (sfp->parentdp->device->type == TYPE_SCANNER) 217 if (sfp->parentdp->device->type == TYPE_SCANNER)
218 return 0; 218 return 0;
219 219
220 return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); 220 return blk_verify_command(cmd, filp->f_mode);
221} 221}
222 222
223static int 223static int
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 789f55e851ae..4a181fcb5175 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode)
54} 54}
55EXPORT_SYMBOL(I_BDEV); 55EXPORT_SYMBOL(I_BDEV);
56 56
57void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
58{
59 struct va_format vaf;
60 va_list args;
61
62 va_start(args, fmt);
63 vaf.fmt = fmt;
64 vaf.va = &args;
65 printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
66 va_end(args);
67}
68
69static void bdev_write_inode(struct block_device *bdev) 57static void bdev_write_inode(struct block_device *bdev)
70{ 58{
71 struct inode *inode = bdev->bd_inode; 59 struct inode *inode = bdev->bd_inode;
@@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
249 if (!READ_ONCE(bio.bi_private)) 237 if (!READ_ONCE(bio.bi_private))
250 break; 238 break;
251 if (!(iocb->ki_flags & IOCB_HIPRI) || 239 if (!(iocb->ki_flags & IOCB_HIPRI) ||
252 !blk_mq_poll(bdev_get_queue(bdev), qc)) 240 !blk_poll(bdev_get_queue(bdev), qc))
253 io_schedule(); 241 io_schedule();
254 } 242 }
255 __set_current_state(TASK_RUNNING); 243 __set_current_state(TASK_RUNNING);
@@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
414 break; 402 break;
415 403
416 if (!(iocb->ki_flags & IOCB_HIPRI) || 404 if (!(iocb->ki_flags & IOCB_HIPRI) ||
417 !blk_mq_poll(bdev_get_queue(bdev), qc)) 405 !blk_poll(bdev_get_queue(bdev), qc))
418 io_schedule(); 406 io_schedule();
419 } 407 }
420 __set_current_state(TASK_RUNNING); 408 __set_current_state(TASK_RUNNING);
@@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
674 if (!ops->rw_page || bdev_get_integrity(bdev)) 662 if (!ops->rw_page || bdev_get_integrity(bdev))
675 return result; 663 return result;
676 664
677 result = blk_queue_enter(bdev->bd_queue, false); 665 result = blk_queue_enter(bdev->bd_queue, 0);
678 if (result) 666 if (result)
679 return result; 667 return result;
680 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); 668 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
710 698
711 if (!ops->rw_page || bdev_get_integrity(bdev)) 699 if (!ops->rw_page || bdev_get_integrity(bdev))
712 return -EOPNOTSUPP; 700 return -EOPNOTSUPP;
713 result = blk_queue_enter(bdev->bd_queue, false); 701 result = blk_queue_enter(bdev->bd_queue, 0);
714 if (result) 702 if (result)
715 return result; 703 return result;
716 704
diff --git a/fs/buffer.c b/fs/buffer.c
index 49b7e9bdcd1d..1c18a22a6013 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -253,27 +253,6 @@ out:
253} 253}
254 254
255/* 255/*
256 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
257 */
258static void free_more_memory(void)
259{
260 struct zoneref *z;
261 int nid;
262
263 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
264 yield();
265
266 for_each_online_node(nid) {
267
268 z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
269 gfp_zone(GFP_NOFS), NULL);
270 if (z->zone)
271 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
272 GFP_NOFS, NULL);
273 }
274}
275
276/*
277 * I/O completion handler for block_read_full_page() - pages 256 * I/O completion handler for block_read_full_page() - pages
278 * which come unlocked at the end of I/O. 257 * which come unlocked at the end of I/O.
279 */ 258 */
@@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode)
861 * which may not fail from ordinary buffer allocations. 840 * which may not fail from ordinary buffer allocations.
862 */ 841 */
863struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 842struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
864 int retry) 843 bool retry)
865{ 844{
866 struct buffer_head *bh, *head; 845 struct buffer_head *bh, *head;
846 gfp_t gfp = GFP_NOFS;
867 long offset; 847 long offset;
868 848
869try_again: 849 if (retry)
850 gfp |= __GFP_NOFAIL;
851
870 head = NULL; 852 head = NULL;
871 offset = PAGE_SIZE; 853 offset = PAGE_SIZE;
872 while ((offset -= size) >= 0) { 854 while ((offset -= size) >= 0) {
873 bh = alloc_buffer_head(GFP_NOFS); 855 bh = alloc_buffer_head(gfp);
874 if (!bh) 856 if (!bh)
875 goto no_grow; 857 goto no_grow;
876 858
@@ -896,23 +878,7 @@ no_grow:
896 } while (head); 878 } while (head);
897 } 879 }
898 880
899 /* 881 return NULL;
900 * Return failure for non-async IO requests. Async IO requests
901 * are not allowed to fail, so we have to wait until buffer heads
902 * become available. But we don't want tasks sleeping with
903 * partially complete buffers, so all were released above.
904 */
905 if (!retry)
906 return NULL;
907
908 /* We're _really_ low on memory. Now we just
909 * wait for old buffer heads to become free due to
910 * finishing IO. Since this is an async request and
911 * the reserve list is empty, we're sure there are
912 * async buffer heads in use.
913 */
914 free_more_memory();
915 goto try_again;
916} 882}
917EXPORT_SYMBOL_GPL(alloc_page_buffers); 883EXPORT_SYMBOL_GPL(alloc_page_buffers);
918 884
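
Editor's note: the alloc_page_buffers() hunks above drop the free_more_memory()/goto try_again retry loop; callers that must not fail now pass retry=true, which simply adds __GFP_NOFAIL to the allocation. A rough userspace model of that contract, using malloc() as a stand-in for the buffer-head allocation:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static void *alloc_head(size_t size, bool retry)
{
	void *p = malloc(size);

	while (!p && retry)	/* models __GFP_NOFAIL: keep trying, never return NULL */
		p = malloc(size);
	return p;		/* may be NULL only when retry == false */
}

int main(void)
{
	void *must_succeed = alloc_head(64, true);
	void *may_fail = alloc_head(64, false);

	printf("must_succeed=%p may_fail=%p\n", must_succeed, may_fail);
	free(must_succeed);
	free(may_fail);
	return 0;
}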
@@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1001 gfp_mask |= __GFP_NOFAIL; 967 gfp_mask |= __GFP_NOFAIL;
1002 968
1003 page = find_or_create_page(inode->i_mapping, index, gfp_mask); 969 page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1004 if (!page)
1005 return ret;
1006 970
1007 BUG_ON(!PageLocked(page)); 971 BUG_ON(!PageLocked(page));
1008 972
@@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1021 /* 985 /*
1022 * Allocate some buffers for this page 986 * Allocate some buffers for this page
1023 */ 987 */
1024 bh = alloc_page_buffers(page, size, 0); 988 bh = alloc_page_buffers(page, size, true);
1025 if (!bh)
1026 goto failed;
1027 989
1028 /* 990 /*
1029 * Link the page to the buffers and initialise them. Take the 991 * Link the page to the buffers and initialise them. Take the
@@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block,
1103 ret = grow_buffers(bdev, block, size, gfp); 1065 ret = grow_buffers(bdev, block, size, gfp);
1104 if (ret < 0) 1066 if (ret < 0)
1105 return NULL; 1067 return NULL;
1106 if (ret == 0)
1107 free_more_memory();
1108 } 1068 }
1109} 1069}
1110 1070
@@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page,
1575{ 1535{
1576 struct buffer_head *bh, *head, *tail; 1536 struct buffer_head *bh, *head, *tail;
1577 1537
1578 head = alloc_page_buffers(page, blocksize, 1); 1538 head = alloc_page_buffers(page, blocksize, true);
1579 bh = head; 1539 bh = head;
1580 do { 1540 do {
1581 bh->b_state |= b_state; 1541 bh->b_state |= b_state;
@@ -2639,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping,
2639 * Be careful: the buffer linked list is a NULL terminated one, rather 2599 * Be careful: the buffer linked list is a NULL terminated one, rather
2640 * than the circular one we're used to. 2600 * than the circular one we're used to.
2641 */ 2601 */
2642 head = alloc_page_buffers(page, blocksize, 0); 2602 head = alloc_page_buffers(page, blocksize, false);
2643 if (!head) { 2603 if (!head) {
2644 ret = -ENOMEM; 2604 ret = -ENOMEM;
2645 goto out_release; 2605 goto out_release;
@@ -3056,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio)
3056 sector_t maxsector; 3016 sector_t maxsector;
3057 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; 3017 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
3058 unsigned truncated_bytes; 3018 unsigned truncated_bytes;
3019 struct hd_struct *part;
3020
3021 rcu_read_lock();
3022 part = __disk_get_part(bio->bi_disk, bio->bi_partno);
3023 if (part)
3024 maxsector = part_nr_sects_read(part);
3025 else
3026 maxsector = get_capacity(bio->bi_disk);
3027 rcu_read_unlock();
3059 3028
3060 maxsector = get_capacity(bio->bi_disk);
3061 if (!maxsector) 3029 if (!maxsector)
3062 return; 3030 return;
3063 3031
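
Editor's note: the guard_bio_eod() hunk above looks up the target partition under RCU and clamps against the partition size, falling back to whole-disk capacity only when no partition is found, instead of always using get_capacity(). A small userspace sketch of that selection; the struct layout and field names are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct part_model { uint64_t nr_sects; };
struct disk_model { uint64_t capacity; struct part_model *parts[4]; };

static uint64_t max_sector(const struct disk_model *disk, int partno)
{
	const struct part_model *part = NULL;

	if (partno > 0 && partno < 4)
		part = disk->parts[partno];	/* __disk_get_part() under RCU in the kernel */
	return part ? part->nr_sects : disk->capacity;
}

int main(void)
{
	struct part_model p1 = { .nr_sects = 2048 };
	struct disk_model d = { .capacity = 1 << 20, .parts = { NULL, &p1 } };

	printf("partition 1 limit: %llu\n", (unsigned long long)max_sector(&d, 1));
	printf("whole-disk limit:  %llu\n", (unsigned long long)max_sector(&d, 0));
	return 0;
}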
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe1325da9d..3aafb3343a65 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio)
497 dio->waiter = current; 497 dio->waiter = current;
498 spin_unlock_irqrestore(&dio->bio_lock, flags); 498 spin_unlock_irqrestore(&dio->bio_lock, flags);
499 if (!(dio->iocb->ki_flags & IOCB_HIPRI) || 499 if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
500 !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) 500 !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
501 io_schedule(); 501 io_schedule();
502 /* wake up sets us TASK_RUNNING */ 502 /* wake up sets us TASK_RUNNING */
503 spin_lock_irqsave(&dio->bio_lock, flags); 503 spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 245c430a2e41..08f5debd07d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
933 933
934#endif /* CONFIG_CGROUP_WRITEBACK */ 934#endif /* CONFIG_CGROUP_WRITEBACK */
935 935
936void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, 936/*
937 bool range_cyclic, enum wb_reason reason) 937 * Add in the number of potentially dirty inodes, because each inode
938 * write can dirty pagecache in the underlying blockdev.
939 */
940static unsigned long get_nr_dirty_pages(void)
938{ 941{
939 struct wb_writeback_work *work; 942 return global_node_page_state(NR_FILE_DIRTY) +
943 global_node_page_state(NR_UNSTABLE_NFS) +
944 get_nr_dirty_inodes();
945}
940 946
947static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
948{
941 if (!wb_has_dirty_io(wb)) 949 if (!wb_has_dirty_io(wb))
942 return; 950 return;
943 951
944 /* 952 /*
945 * This is WB_SYNC_NONE writeback, so if allocation fails just 953 * All callers of this function want to start writeback of all
946 * wakeup the thread for old dirty data writeback 954 * dirty pages. Places like vmscan can call this at a very
955 * high frequency, causing pointless allocations of tons of
956 * work items and keeping the flusher threads busy retrieving
957 * that work. Ensure that we only allow one of them pending and
958 * inflight at the time.
947 */ 959 */
948 work = kzalloc(sizeof(*work), 960 if (test_bit(WB_start_all, &wb->state) ||
949 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); 961 test_and_set_bit(WB_start_all, &wb->state))
950 if (!work) {
951 trace_writeback_nowork(wb);
952 wb_wakeup(wb);
953 return; 962 return;
954 }
955
956 work->sync_mode = WB_SYNC_NONE;
957 work->nr_pages = nr_pages;
958 work->range_cyclic = range_cyclic;
959 work->reason = reason;
960 work->auto_free = 1;
961 963
962 wb_queue_work(wb, work); 964 wb->start_all_reason = reason;
965 wb_wakeup(wb);
963} 966}
964 967
965/** 968/**
@@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1814 return work; 1817 return work;
1815} 1818}
1816 1819
1817/*
1818 * Add in the number of potentially dirty inodes, because each inode
1819 * write can dirty pagecache in the underlying blockdev.
1820 */
1821static unsigned long get_nr_dirty_pages(void)
1822{
1823 return global_node_page_state(NR_FILE_DIRTY) +
1824 global_node_page_state(NR_UNSTABLE_NFS) +
1825 get_nr_dirty_inodes();
1826}
1827
1828static long wb_check_background_flush(struct bdi_writeback *wb) 1820static long wb_check_background_flush(struct bdi_writeback *wb)
1829{ 1821{
1830 if (wb_over_bg_thresh(wb)) { 1822 if (wb_over_bg_thresh(wb)) {
@@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
1877 return 0; 1869 return 0;
1878} 1870}
1879 1871
1872static long wb_check_start_all(struct bdi_writeback *wb)
1873{
1874 long nr_pages;
1875
1876 if (!test_bit(WB_start_all, &wb->state))
1877 return 0;
1878
1879 nr_pages = get_nr_dirty_pages();
1880 if (nr_pages) {
1881 struct wb_writeback_work work = {
1882 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
1883 .sync_mode = WB_SYNC_NONE,
1884 .range_cyclic = 1,
1885 .reason = wb->start_all_reason,
1886 };
1887
1888 nr_pages = wb_writeback(wb, &work);
1889 }
1890
1891 clear_bit(WB_start_all, &wb->state);
1892 return nr_pages;
1893}
1894
1895
1880/* 1896/*
1881 * Retrieve work items and do the writeback they describe 1897 * Retrieve work items and do the writeback they describe
1882 */ 1898 */
@@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
1893 } 1909 }
1894 1910
1895 /* 1911 /*
1912 * Check for a flush-everything request
1913 */
1914 wrote += wb_check_start_all(wb);
1915
1916 /*
1896 * Check for periodic writeback, kupdated() style 1917 * Check for periodic writeback, kupdated() style
1897 */ 1918 */
1898 wrote += wb_check_old_data_flush(wb); 1919 wrote += wb_check_old_data_flush(wb);
@@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work)
1947} 1968}
1948 1969
1949/* 1970/*
1950 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 1971 * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
1951 * the whole world. 1972 * write back the whole world.
1952 */ 1973 */
1953void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) 1974static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
1975 enum wb_reason reason)
1976{
1977 struct bdi_writeback *wb;
1978
1979 if (!bdi_has_dirty_io(bdi))
1980 return;
1981
1982 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
1983 wb_start_writeback(wb, reason);
1984}
1985
1986void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
1987 enum wb_reason reason)
1988{
1989 rcu_read_lock();
1990 __wakeup_flusher_threads_bdi(bdi, reason);
1991 rcu_read_unlock();
1992}
1993
1994/*
1995 * Wakeup the flusher threads to start writeback of all currently dirty pages
1996 */
1997void wakeup_flusher_threads(enum wb_reason reason)
1954{ 1998{
1955 struct backing_dev_info *bdi; 1999 struct backing_dev_info *bdi;
1956 2000
@@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1960 if (blk_needs_flush_plug(current)) 2004 if (blk_needs_flush_plug(current))
1961 blk_schedule_flush_plug(current); 2005 blk_schedule_flush_plug(current);
1962 2006
1963 if (!nr_pages)
1964 nr_pages = get_nr_dirty_pages();
1965
1966 rcu_read_lock(); 2007 rcu_read_lock();
1967 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 2008 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
1968 struct bdi_writeback *wb; 2009 __wakeup_flusher_threads_bdi(bdi, reason);
1969
1970 if (!bdi_has_dirty_io(bdi))
1971 continue;
1972
1973 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
1974 wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
1975 false, reason);
1976 }
1977 rcu_read_unlock(); 2010 rcu_read_unlock();
1978} 2011}
1979 2012
@@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2343EXPORT_SYMBOL(writeback_inodes_sb); 2376EXPORT_SYMBOL(writeback_inodes_sb);
2344 2377
2345/** 2378/**
2346 * try_to_writeback_inodes_sb_nr - try to start writeback if none underway 2379 * try_to_writeback_inodes_sb - try to start writeback if none underway
2347 * @sb: the superblock 2380 * @sb: the superblock
2348 * @nr: the number of pages to write 2381 * @reason: reason why some writeback work was initiated
2349 * @reason: the reason of writeback
2350 * 2382 *
2351 * Invoke writeback_inodes_sb_nr if no writeback is currently underway. 2383 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
2352 * Returns 1 if writeback was started, 0 if not.
2353 */ 2384 */
2354bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, 2385void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2355 enum wb_reason reason)
2356{ 2386{
2357 if (!down_read_trylock(&sb->s_umount)) 2387 if (!down_read_trylock(&sb->s_umount))
2358 return false; 2388 return;
2359 2389
2360 __writeback_inodes_sb_nr(sb, nr, reason, true); 2390 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2361 up_read(&sb->s_umount); 2391 up_read(&sb->s_umount);
2362 return true;
2363}
2364EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
2365
2366/**
2367 * try_to_writeback_inodes_sb - try to start writeback if none underway
2368 * @sb: the superblock
2369 * @reason: reason why some writeback work was initiated
2370 *
2371 * Implement by try_to_writeback_inodes_sb_nr()
2372 * Returns 1 if writeback was started, 0 if not.
2373 */
2374bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2375{
2376 return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2377} 2392}
2378EXPORT_SYMBOL(try_to_writeback_inodes_sb); 2393EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2379 2394
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a964a550..b9f74803e56c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1057,7 +1057,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1057 1057
1058 if (!(iocb->ki_flags & IOCB_HIPRI) || 1058 if (!(iocb->ki_flags & IOCB_HIPRI) ||
1059 !dio->submit.last_queue || 1059 !dio->submit.last_queue ||
1060 !blk_mq_poll(dio->submit.last_queue, 1060 !blk_poll(dio->submit.last_queue,
1061 dio->submit.cookie)) 1061 dio->submit.cookie))
1062 io_schedule(); 1062 io_schedule();
1063 } 1063 }
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cc91856b5e2d..3a2e509c77c5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
1739 spin_lock(&mapping->private_lock); 1739 spin_lock(&mapping->private_lock);
1740 if (unlikely(!page_has_buffers(page))) { 1740 if (unlikely(!page_has_buffers(page))) {
1741 spin_unlock(&mapping->private_lock); 1741 spin_unlock(&mapping->private_lock);
1742 bh = head = alloc_page_buffers(page, bh_size, 1); 1742 bh = head = alloc_page_buffers(page, bh_size, true);
1743 spin_lock(&mapping->private_lock); 1743 spin_lock(&mapping->private_lock);
1744 if (likely(!page_has_buffers(page))) { 1744 if (likely(!page_has_buffers(page))) {
1745 struct buffer_head *tail; 1745 struct buffer_head *tail;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b6f402194f02..ee8392aee9f6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
507 if (unlikely(!page_has_buffers(page))) { 507 if (unlikely(!page_has_buffers(page))) {
508 struct buffer_head *tail; 508 struct buffer_head *tail;
509 509
510 bh = head = alloc_page_buffers(page, blocksize, 1); 510 bh = head = alloc_page_buffers(page, blocksize, true);
511 do { 511 do {
512 set_buffer_uptodate(bh); 512 set_buffer_uptodate(bh);
513 tail = bh; 513 tail = bh;
diff --git a/fs/sync.c b/fs/sync.c
index 83ac79a960dd..6e0a2cbaf6de 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync)
109{ 109{
110 int nowait = 0, wait = 1; 110 int nowait = 0, wait = 1;
111 111
112 wakeup_flusher_threads(0, WB_REASON_SYNC); 112 wakeup_flusher_threads(WB_REASON_SYNC);
113 iterate_supers(sync_inodes_one_sb, NULL); 113 iterate_supers(sync_inodes_one_sb, NULL);
114 iterate_supers(sync_fs_one_sb, &nowait); 114 iterate_supers(sync_fs_one_sb, &nowait);
115 iterate_supers(sync_fs_one_sb, &wait); 115 iterate_supers(sync_fs_one_sb, &wait);
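
Editor's note: the fs-writeback and sync hunks above drop the nr_pages argument from wakeup_flusher_threads() and stop allocating a work item per caller; a single WB_start_all bit on the bdi_writeback now collapses concurrent "flush everything" requests into one pending pass. A toy userspace model of that arming/clearing scheme, with a bool standing in for the state bit:

#include <stdbool.h>
#include <stdio.h>

struct wb_model { bool start_all; int wakeups; };

static void start_writeback(struct wb_model *wb)
{
	/* models test_and_set_bit(WB_start_all, &wb->state) */
	if (wb->start_all)
		return;			/* one request is already pending */
	wb->start_all = true;
	wb->wakeups++;			/* models wb_wakeup(wb) */
}

static void flusher_runs(struct wb_model *wb)
{
	if (!wb->start_all)
		return;
	/* ...write back everything dirty (wb_check_start_all)... */
	wb->start_all = false;		/* models clear_bit() after the pass */
}

int main(void)
{
	struct wb_model wb = { 0 };

	start_writeback(&wb);		/* e.g. sync(2) */
	start_writeback(&wb);		/* e.g. vmscan, collapses into the pending one */
	printf("wakeups queued: %d\n", wb.wakeups);
	flusher_runs(&wb);
	start_writeback(&wb);		/* a later request arms it again */
	printf("wakeups queued: %d\n", wb.wakeups);
	return 0;
}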
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index fff4cfa0c21d..bfe86b54f6c1 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -25,6 +25,7 @@ enum wb_state {
25 WB_shutting_down, /* wb_shutdown() in progress */ 25 WB_shutting_down, /* wb_shutdown() in progress */
26 WB_writeback_running, /* Writeback is in progress */ 26 WB_writeback_running, /* Writeback is in progress */
27 WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ 27 WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
28 WB_start_all, /* nr_pages == 0 (all) work pending */
28}; 29};
29 30
30enum wb_congested_state { 31enum wb_congested_state {
@@ -45,6 +46,28 @@ enum wb_stat_item {
45#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) 46#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
46 47
47/* 48/*
49 * why some writeback work was initiated
50 */
51enum wb_reason {
52 WB_REASON_BACKGROUND,
53 WB_REASON_VMSCAN,
54 WB_REASON_SYNC,
55 WB_REASON_PERIODIC,
56 WB_REASON_LAPTOP_TIMER,
57 WB_REASON_FREE_MORE_MEM,
58 WB_REASON_FS_FREE_SPACE,
59 /*
60 * There is no bdi forker thread any more and works are done
61 * by emergency worker, however, this is TPs userland visible
62 * and we'll be exposing exactly the same information,
63 * so it has a mismatch name.
64 */
65 WB_REASON_FORKER_THREAD,
66
67 WB_REASON_MAX,
68};
69
70/*
48 * For cgroup writeback, multiple wb's may map to the same blkcg. Those 71 * For cgroup writeback, multiple wb's may map to the same blkcg. Those
49 * wb's can operate mostly independently but should share the congested 72 * wb's can operate mostly independently but should share the congested
50 * state. To facilitate such sharing, the congested state is tracked using 73 * state. To facilitate such sharing, the congested state is tracked using
@@ -116,6 +139,7 @@ struct bdi_writeback {
116 139
117 struct fprop_local_percpu completions; 140 struct fprop_local_percpu completions;
118 int dirty_exceeded; 141 int dirty_exceeded;
142 enum wb_reason start_all_reason;
119 143
120 spinlock_t work_lock; /* protects work_list & dwork scheduling */ 144 spinlock_t work_lock; /* protects work_list & dwork scheduling */
121 struct list_head work_list; 145 struct list_head work_list;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 16621579a3db..f41ca8486e02 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -39,8 +39,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
39 return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); 39 return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
40} 40}
41 41
42void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
43 bool range_cyclic, enum wb_reason reason);
44void wb_start_background_writeback(struct bdi_writeback *wb); 42void wb_start_background_writeback(struct bdi_writeback *wb);
45void wb_workfn(struct work_struct *work); 43void wb_workfn(struct work_struct *work);
46void wb_wakeup_delayed(struct bdi_writeback *wb); 44void wb_wakeup_delayed(struct bdi_writeback *wb);
@@ -175,8 +173,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
175 173
176long congestion_wait(int sync, long timeout); 174long congestion_wait(int sync, long timeout);
177long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); 175long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
178int pdflush_proc_obsolete(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180 176
181static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) 177static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
182{ 178{
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 275c91c99516..d4eec19a6d3c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -129,18 +129,6 @@ static inline void *bio_data(struct bio *bio)
129#define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) 129#define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
130 130
131/* 131/*
132 * queues that have highmem support enabled may still need to revert to
133 * PIO transfers occasionally and thus map high pages temporarily. For
134 * permanent PIO fall back, user is probably better off disabling highmem
135 * I/O completely on that queue (see ide-dma for example)
136 */
137#define __bio_kmap_atomic(bio, iter) \
138 (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \
139 bio_iter_iovec((bio), (iter)).bv_offset)
140
141#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
142
143/*
144 * merge helpers etc 132 * merge helpers etc
145 */ 133 */
146 134
@@ -522,13 +510,11 @@ do { \
522 510
523#ifdef CONFIG_BLK_CGROUP 511#ifdef CONFIG_BLK_CGROUP
524int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); 512int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
525int bio_associate_current(struct bio *bio);
526void bio_disassociate_task(struct bio *bio); 513void bio_disassociate_task(struct bio *bio);
527void bio_clone_blkcg_association(struct bio *dst, struct bio *src); 514void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
528#else /* CONFIG_BLK_CGROUP */ 515#else /* CONFIG_BLK_CGROUP */
529static inline int bio_associate_blkcg(struct bio *bio, 516static inline int bio_associate_blkcg(struct bio *bio,
530 struct cgroup_subsys_state *blkcg_css) { return 0; } 517 struct cgroup_subsys_state *blkcg_css) { return 0; }
531static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
532static inline void bio_disassociate_task(struct bio *bio) { } 518static inline void bio_disassociate_task(struct bio *bio) { }
533static inline void bio_clone_blkcg_association(struct bio *dst, 519static inline void bio_clone_blkcg_association(struct bio *dst,
534 struct bio *src) { } 520 struct bio *src) { }
@@ -575,17 +561,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
575} 561}
576#endif 562#endif
577 563
578static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter,
579 unsigned long *flags)
580{
581 return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags);
582}
583#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
584
585#define bio_kmap_irq(bio, flags) \
586 __bio_kmap_irq((bio), (bio)->bi_iter, (flags))
587#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
588
589/* 564/*
590 * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. 565 * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
591 * 566 *
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 8bbc3716507a..e9825ff57b15 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -20,6 +20,7 @@
20#include <linux/radix-tree.h> 20#include <linux/radix-tree.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/kthread.h>
23 24
24/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ 25/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
25#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) 26#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
@@ -224,22 +225,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
224 return css ? container_of(css, struct blkcg, css) : NULL; 225 return css ? container_of(css, struct blkcg, css) : NULL;
225} 226}
226 227
227static inline struct blkcg *task_blkcg(struct task_struct *tsk)
228{
229 return css_to_blkcg(task_css(tsk, io_cgrp_id));
230}
231
232static inline struct blkcg *bio_blkcg(struct bio *bio) 228static inline struct blkcg *bio_blkcg(struct bio *bio)
233{ 229{
230 struct cgroup_subsys_state *css;
231
234 if (bio && bio->bi_css) 232 if (bio && bio->bi_css)
235 return css_to_blkcg(bio->bi_css); 233 return css_to_blkcg(bio->bi_css);
236 return task_blkcg(current); 234 css = kthread_blkcg();
237} 235 if (css)
238 236 return css_to_blkcg(css);
239static inline struct cgroup_subsys_state * 237 return css_to_blkcg(task_css(current, io_cgrp_id));
240task_get_blkcg_css(struct task_struct *task)
241{
242 return task_get_css(task, io_cgrp_id);
243} 238}
244 239
245/** 240/**
@@ -736,12 +731,6 @@ struct blkcg_policy {
736 731
737#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) 732#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
738 733
739static inline struct cgroup_subsys_state *
740task_get_blkcg_css(struct task_struct *task)
741{
742 return NULL;
743}
744
745#ifdef CONFIG_BLOCK 734#ifdef CONFIG_BLOCK
746 735
747static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 736static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 994cbb0f7ffc..95c9a5c862e2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -31,10 +31,12 @@ struct blk_mq_hw_ctx {
31 31
32 struct sbitmap ctx_map; 32 struct sbitmap ctx_map;
33 33
34 struct blk_mq_ctx *dispatch_from;
35
34 struct blk_mq_ctx **ctxs; 36 struct blk_mq_ctx **ctxs;
35 unsigned int nr_ctx; 37 unsigned int nr_ctx;
36 38
37 wait_queue_entry_t dispatch_wait; 39 wait_queue_entry_t dispatch_wait;
38 atomic_t wait_index; 40 atomic_t wait_index;
39 41
40 struct blk_mq_tags *tags; 42 struct blk_mq_tags *tags;
@@ -91,6 +93,8 @@ struct blk_mq_queue_data {
91 93
92typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, 94typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
93 const struct blk_mq_queue_data *); 95 const struct blk_mq_queue_data *);
96typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
97typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
94typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); 98typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
95typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 99typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
96typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 100typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -113,6 +117,15 @@ struct blk_mq_ops {
113 queue_rq_fn *queue_rq; 117 queue_rq_fn *queue_rq;
114 118
115 /* 119 /*
120 * Reserve budget before queue request, once .queue_rq is
121 * run, it is driver's responsibility to release the
122 * reserved budget. Also we have to handle failure case
123 * of .get_budget for avoiding I/O deadlock.
124 */
125 get_budget_fn *get_budget;
126 put_budget_fn *put_budget;
127
128 /*
116 * Called on request timeout 129 * Called on request timeout
117 */ 130 */
118 timeout_fn *timeout; 131 timeout_fn *timeout;
@@ -169,8 +182,7 @@ enum {
169 BLK_MQ_S_STOPPED = 0, 182 BLK_MQ_S_STOPPED = 0,
170 BLK_MQ_S_TAG_ACTIVE = 1, 183 BLK_MQ_S_TAG_ACTIVE = 1,
171 BLK_MQ_S_SCHED_RESTART = 2, 184 BLK_MQ_S_SCHED_RESTART = 2,
172 BLK_MQ_S_TAG_WAITING = 3, 185 BLK_MQ_S_START_ON_RUN = 3,
173 BLK_MQ_S_START_ON_RUN = 4,
174 186
175 BLK_MQ_MAX_DEPTH = 10240, 187 BLK_MQ_MAX_DEPTH = 10240,
176 188
@@ -198,15 +210,21 @@ void blk_mq_free_request(struct request *rq);
198bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 210bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
199 211
200enum { 212enum {
201 BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ 213 /* return when out of requests */
202 BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ 214 BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0),
203 BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ 215 /* allocate from reserved pool */
216 BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
217 /* allocate internal/sched tag */
218 BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2),
219 /* set RQF_PREEMPT */
220 BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3),
204}; 221};
205 222
206struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 223struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
207 unsigned int flags); 224 blk_mq_req_flags_t flags);
208struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 225struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
209 unsigned int op, unsigned int flags, unsigned int hctx_idx); 226 unsigned int op, blk_mq_req_flags_t flags,
227 unsigned int hctx_idx);
210struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); 228struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
211 229
212enum { 230enum {
@@ -249,7 +267,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
249void blk_mq_quiesce_queue(struct request_queue *q); 267void blk_mq_quiesce_queue(struct request_queue *q);
250void blk_mq_unquiesce_queue(struct request_queue *q); 268void blk_mq_unquiesce_queue(struct request_queue *q);
251void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 269void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
252void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 270bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
253void blk_mq_run_hw_queues(struct request_queue *q, bool async); 271void blk_mq_run_hw_queues(struct request_queue *q, bool async);
254void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 272void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
255void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, 273void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@ -260,8 +278,8 @@ void blk_freeze_queue_start(struct request_queue *q);
260void blk_mq_freeze_queue_wait(struct request_queue *q); 278void blk_mq_freeze_queue_wait(struct request_queue *q);
261int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 279int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
262 unsigned long timeout); 280 unsigned long timeout);
263int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, 281int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data,
264 int (reinit_request)(void *, struct request *)); 282 int (reinit_request)(void *, struct request *));
265 283
266int blk_mq_map_queues(struct blk_mq_tag_set *set); 284int blk_mq_map_queues(struct blk_mq_tag_set *set);
267void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); 285void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
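
The get_budget/put_budget comment above is terse, so a hedged sketch of how a driver might wire the new hooks may help; example_dev, its busy/max_cmds fields and example_queue_rq() are placeholders invented for illustration, not part of this series:

        static bool example_get_budget(struct blk_mq_hw_ctx *hctx)
        {
                struct example_dev *dev = hctx->queue->queuedata;

                /* true only while a command slot is free; false tells blk-mq to back off */
                return atomic_add_unless(&dev->busy, 1, dev->max_cmds);
        }

        static void example_put_budget(struct blk_mq_hw_ctx *hctx)
        {
                struct example_dev *dev = hctx->queue->queuedata;

                atomic_dec(&dev->busy);
        }

        static const struct blk_mq_ops example_mq_ops = {
                .queue_rq       = example_queue_rq,     /* releases the budget once it has run */
                .get_budget     = example_get_budget,
                .put_budget     = example_put_budget,
        };
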
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 96ac3815542c..a1e628e032da 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -163,6 +163,8 @@ struct bio {
163 */ 163 */
164#define BIO_RESET_BITS BVEC_POOL_OFFSET 164#define BIO_RESET_BITS BVEC_POOL_OFFSET
165 165
166typedef __u32 __bitwise blk_mq_req_flags_t;
167
166/* 168/*
167 * Operations and flags common to the bio and request structures. 169 * Operations and flags common to the bio and request structures.
168 * We use 8 bits for encoding the operation, and the remaining 24 for flags. 170 * We use 8 bits for encoding the operation, and the remaining 24 for flags.
@@ -225,11 +227,14 @@ enum req_flag_bits {
225 __REQ_PREFLUSH, /* request for cache flush */ 227 __REQ_PREFLUSH, /* request for cache flush */
226 __REQ_RAHEAD, /* read ahead, can fail anytime */ 228 __REQ_RAHEAD, /* read ahead, can fail anytime */
227 __REQ_BACKGROUND, /* background IO */ 229 __REQ_BACKGROUND, /* background IO */
230 __REQ_NOWAIT, /* Don't wait if request will block */
228 231
229 /* command specific flags for REQ_OP_WRITE_ZEROES: */ 232 /* command specific flags for REQ_OP_WRITE_ZEROES: */
230 __REQ_NOUNMAP, /* do not free blocks when zeroing */ 233 __REQ_NOUNMAP, /* do not free blocks when zeroing */
231 234
232 __REQ_NOWAIT, /* Don't wait if request will block */ 235 /* for driver use */
236 __REQ_DRV,
237
233 __REQ_NR_BITS, /* stops here */ 238 __REQ_NR_BITS, /* stops here */
234}; 239};
235 240
@@ -246,9 +251,11 @@ enum req_flag_bits {
246#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) 251#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
247#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 252#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
248#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) 253#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
254#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
249 255
250#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) 256#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
251#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) 257
258#define REQ_DRV (1ULL << __REQ_DRV)
252 259
253#define REQ_FAILFAST_MASK \ 260#define REQ_FAILFAST_MASK \
254 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) 261 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -330,11 +337,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
330} 337}
331 338
332struct blk_rq_stat { 339struct blk_rq_stat {
333 s64 mean; 340 u64 mean;
334 u64 min; 341 u64 min;
335 u64 max; 342 u64 max;
336 s32 nr_samples; 343 u32 nr_samples;
337 s32 nr_batch;
338 u64 batch; 344 u64 batch;
339}; 345};
340 346
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8da66379f7ea..8089ca17db9a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -267,6 +267,7 @@ struct blk_queue_ctx;
267 267
268typedef void (request_fn_proc) (struct request_queue *q); 268typedef void (request_fn_proc) (struct request_queue *q);
269typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); 269typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
270typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
270typedef int (prep_rq_fn) (struct request_queue *, struct request *); 271typedef int (prep_rq_fn) (struct request_queue *, struct request *);
271typedef void (unprep_rq_fn) (struct request_queue *, struct request *); 272typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
272 273
@@ -409,6 +410,7 @@ struct request_queue {
409 410
410 request_fn_proc *request_fn; 411 request_fn_proc *request_fn;
411 make_request_fn *make_request_fn; 412 make_request_fn *make_request_fn;
413 poll_q_fn *poll_fn;
412 prep_rq_fn *prep_rq_fn; 414 prep_rq_fn *prep_rq_fn;
413 unprep_rq_fn *unprep_rq_fn; 415 unprep_rq_fn *unprep_rq_fn;
414 softirq_done_fn *softirq_done_fn; 416 softirq_done_fn *softirq_done_fn;
@@ -610,7 +612,6 @@ struct request_queue {
610#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ 612#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */
611#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ 613#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */
612#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ 614#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */
613#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */
614#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ 615#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */
615#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ 616#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */
616#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ 617#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */
@@ -632,14 +633,13 @@ struct request_queue {
632#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ 633#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */
633#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ 634#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */
634#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ 635#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */
636#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */
635 637
636#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 638#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
637 (1 << QUEUE_FLAG_STACKABLE) | \
638 (1 << QUEUE_FLAG_SAME_COMP) | \ 639 (1 << QUEUE_FLAG_SAME_COMP) | \
639 (1 << QUEUE_FLAG_ADD_RANDOM)) 640 (1 << QUEUE_FLAG_ADD_RANDOM))
640 641
641#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 642#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
642 (1 << QUEUE_FLAG_STACKABLE) | \
643 (1 << QUEUE_FLAG_SAME_COMP) | \ 643 (1 << QUEUE_FLAG_SAME_COMP) | \
644 (1 << QUEUE_FLAG_POLL)) 644 (1 << QUEUE_FLAG_POLL))
645 645
@@ -723,8 +723,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
723#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) 723#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
724#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) 724#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
725#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) 725#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
726#define blk_queue_stackable(q) \
727 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
728#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) 726#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
729#define blk_queue_secure_erase(q) \ 727#define blk_queue_secure_erase(q) \
730 (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) 728 (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
@@ -736,6 +734,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
736 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ 734 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
737 REQ_FAILFAST_DRIVER)) 735 REQ_FAILFAST_DRIVER))
738#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) 736#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
737#define blk_queue_preempt_only(q) \
738 test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags)
739
740extern int blk_set_preempt_only(struct request_queue *q);
741extern void blk_clear_preempt_only(struct request_queue *q);
739 742
740static inline bool blk_account_rq(struct request *rq) 743static inline bool blk_account_rq(struct request *rq)
741{ 744{
@@ -923,24 +926,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
923} 926}
924#endif 927#endif
925 928
926#ifdef CONFIG_PRINTK
927#define vfs_msg(sb, level, fmt, ...) \
928 __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
929#else
930#define vfs_msg(sb, level, fmt, ...) \
931do { \
932 no_printk(fmt, ##__VA_ARGS__); \
933 __vfs_msg(sb, "", " "); \
934} while (0)
935#endif
936
937extern int blk_register_queue(struct gendisk *disk); 929extern int blk_register_queue(struct gendisk *disk);
938extern void blk_unregister_queue(struct gendisk *disk); 930extern void blk_unregister_queue(struct gendisk *disk);
939extern blk_qc_t generic_make_request(struct bio *bio); 931extern blk_qc_t generic_make_request(struct bio *bio);
932extern blk_qc_t direct_make_request(struct bio *bio);
940extern void blk_rq_init(struct request_queue *q, struct request *rq); 933extern void blk_rq_init(struct request_queue *q, struct request *rq);
941extern void blk_init_request_from_bio(struct request *req, struct bio *bio); 934extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
942extern void blk_put_request(struct request *); 935extern void blk_put_request(struct request *);
943extern void __blk_put_request(struct request_queue *, struct request *); 936extern void __blk_put_request(struct request_queue *, struct request *);
937extern struct request *blk_get_request_flags(struct request_queue *,
938 unsigned int op,
939 blk_mq_req_flags_t flags);
944extern struct request *blk_get_request(struct request_queue *, unsigned int op, 940extern struct request *blk_get_request(struct request_queue *, unsigned int op,
945 gfp_t gfp_mask); 941 gfp_t gfp_mask);
946extern void blk_requeue_request(struct request_queue *, struct request *); 942extern void blk_requeue_request(struct request_queue *, struct request *);
@@ -964,7 +960,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
964extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, 960extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
965 struct scsi_ioctl_command __user *); 961 struct scsi_ioctl_command __user *);
966 962
967extern int blk_queue_enter(struct request_queue *q, bool nowait); 963extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
968extern void blk_queue_exit(struct request_queue *q); 964extern void blk_queue_exit(struct request_queue *q);
969extern void blk_start_queue(struct request_queue *q); 965extern void blk_start_queue(struct request_queue *q);
970extern void blk_start_queue_async(struct request_queue *q); 966extern void blk_start_queue_async(struct request_queue *q);
@@ -991,7 +987,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
991int blk_status_to_errno(blk_status_t status); 987int blk_status_to_errno(blk_status_t status);
992blk_status_t errno_to_blk_status(int errno); 988blk_status_t errno_to_blk_status(int errno);
993 989
994bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); 990bool blk_poll(struct request_queue *q, blk_qc_t cookie);
995 991
996static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 992static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
997{ 993{
@@ -1110,6 +1106,8 @@ extern struct request *blk_peek_request(struct request_queue *q);
1110extern void blk_start_request(struct request *rq); 1106extern void blk_start_request(struct request *rq);
1111extern struct request *blk_fetch_request(struct request_queue *q); 1107extern struct request *blk_fetch_request(struct request_queue *q);
1112 1108
1109void blk_steal_bios(struct bio_list *list, struct request *rq);
1110
1113/* 1111/*
1114 * Request completion related functions. 1112 * Request completion related functions.
1115 * 1113 *
@@ -1372,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
1372 gfp_mask, 0); 1370 gfp_mask, 0);
1373} 1371}
1374 1372
1375extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 1373extern int blk_verify_command(unsigned char *cmd, fmode_t mode);
1376 1374
1377enum blk_default_limits { 1375enum blk_default_limits {
1378 BLK_MAX_SEGMENTS = 128, 1376 BLK_MAX_SEGMENTS = 128,
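
QUEUE_FLAG_PREEMPT_ONLY, the flag-taking blk_queue_enter() and blk_get_request_flags() are the building blocks of the SCSI quiesce rework: while a queue is marked preempt-only, only allocations carrying BLK_MQ_REQ_PREEMPT (which sets RQF_PREEMPT) get past blk_queue_enter(). The following is only a sketch of that intended calling pattern, not code from this series; q is an assumed request_queue pointer and REQ_OP_DRV_IN is just one plausible opcode:

        struct request *rq;

        blk_set_preempt_only(q);        /* ordinary allocations are now held off in blk_queue_enter() */

        rq = blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
        if (!IS_ERR(rq)) {
                /* issue the power-management / domain-validation command here */
                blk_put_request(rq);
        }

        blk_clear_preempt_only(q);      /* resume normal request allocation */
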
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index afa37f807f12..8b1bf8d3d4a2 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -157,7 +157,7 @@ void set_bh_page(struct buffer_head *bh,
157 struct page *page, unsigned long offset); 157 struct page *page, unsigned long offset);
158int try_to_free_buffers(struct page *); 158int try_to_free_buffers(struct page *);
159struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 159struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
160 int retry); 160 bool retry);
161void create_empty_buffers(struct page *, unsigned long, 161void create_empty_buffers(struct page *, unsigned long,
162 unsigned long b_state); 162 unsigned long b_state);
163void end_buffer_read_sync(struct buffer_head *bh, int uptodate); 163void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ddb7632d73b9..3d794b3dc532 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -145,6 +145,7 @@ struct elevator_type
145 size_t icq_align; /* ditto */ 145 size_t icq_align; /* ditto */
146 struct elv_fs_entry *elevator_attrs; 146 struct elv_fs_entry *elevator_attrs;
147 char elevator_name[ELV_NAME_MAX]; 147 char elevator_name[ELV_NAME_MAX];
148 const char *elevator_alias;
148 struct module *elevator_owner; 149 struct module *elevator_owner;
149 bool uses_mq; 150 bool uses_mq;
150#ifdef CONFIG_BLK_DEBUG_FS 151#ifdef CONFIG_BLK_DEBUG_FS
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index eaefb7a62f83..5144ebe046c9 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -141,6 +141,7 @@ struct hd_struct {
141#define GENHD_FL_NATIVE_CAPACITY 128 141#define GENHD_FL_NATIVE_CAPACITY 128
142#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 142#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256
143#define GENHD_FL_NO_PART_SCAN 512 143#define GENHD_FL_NO_PART_SCAN 512
144#define GENHD_FL_HIDDEN 1024
144 145
145enum { 146enum {
146 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ 147 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
@@ -236,7 +237,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk)
236 237
237static inline dev_t disk_devt(struct gendisk *disk) 238static inline dev_t disk_devt(struct gendisk *disk)
238{ 239{
239 return disk_to_dev(disk)->devt; 240 return MKDEV(disk->major, disk->first_minor);
240} 241}
241 242
242static inline dev_t part_devt(struct hd_struct *part) 243static inline dev_t part_devt(struct hd_struct *part)
@@ -244,6 +245,7 @@ static inline dev_t part_devt(struct hd_struct *part)
244 return part_to_dev(part)->devt; 245 return part_to_dev(part)->devt;
245} 246}
246 247
248extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
247extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); 249extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
248 250
249static inline void disk_put_part(struct hd_struct *part) 251static inline void disk_put_part(struct hd_struct *part)
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 86d53a3cb497..3203e36b2ee8 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,6 +4,7 @@
4/* Simple interface for creating and stopping kernel threads without mess. */ 4/* Simple interface for creating and stopping kernel threads without mess. */
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/cgroup.h>
7 8
8__printf(4, 5) 9__printf(4, 5)
9struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 10struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -199,4 +200,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
199 200
200void kthread_destroy_worker(struct kthread_worker *worker); 201void kthread_destroy_worker(struct kthread_worker *worker);
201 202
203#ifdef CONFIG_BLK_CGROUP
204void kthread_associate_blkcg(struct cgroup_subsys_state *css);
205struct cgroup_subsys_state *kthread_blkcg(void);
206#else
207static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
208static inline struct cgroup_subsys_state *kthread_blkcg(void)
209{
210 return NULL;
211}
212#endif
202#endif /* _LINUX_KTHREAD_H */ 213#endif /* _LINUX_KTHREAD_H */
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index a29a8db5cc2f..2d1d9de06728 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -57,6 +57,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
57typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); 57typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
58typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); 58typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
59typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); 59typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
60typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
60typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); 61typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
61typedef void (nvm_destroy_dma_pool_fn)(void *); 62typedef void (nvm_destroy_dma_pool_fn)(void *);
62typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, 63typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -70,6 +71,7 @@ struct nvm_dev_ops {
70 nvm_op_set_bb_fn *set_bb_tbl; 71 nvm_op_set_bb_fn *set_bb_tbl;
71 72
72 nvm_submit_io_fn *submit_io; 73 nvm_submit_io_fn *submit_io;
74 nvm_submit_io_sync_fn *submit_io_sync;
73 75
74 nvm_create_dma_pool_fn *create_dma_pool; 76 nvm_create_dma_pool_fn *create_dma_pool;
75 nvm_destroy_dma_pool_fn *destroy_dma_pool; 77 nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@ -461,10 +463,9 @@ struct nvm_tgt_type {
461 463
462 /* For internal use */ 464 /* For internal use */
463 struct list_head list; 465 struct list_head list;
466 struct module *owner;
464}; 467};
465 468
466extern struct nvm_tgt_type *nvm_find_target_type(const char *, int);
467
468extern int nvm_register_tgt_type(struct nvm_tgt_type *); 469extern int nvm_register_tgt_type(struct nvm_tgt_type *);
469extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); 470extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
470 471
@@ -479,10 +480,8 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
479 int, int); 480 int, int);
480extern int nvm_max_phys_sects(struct nvm_tgt_dev *); 481extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
481extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); 482extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
483extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
482extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); 484extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
483extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
484 const struct ppa_addr *, int, int);
485extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
486extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, 485extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
487 void *); 486 void *);
488extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); 487extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
@@ -491,8 +490,6 @@ extern void nvm_end_io(struct nvm_rq *);
491extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); 490extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
492extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); 491extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
493 492
494extern int nvm_dev_factory(struct nvm_dev *, int flags);
495
496extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); 493extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
497 494
498#else /* CONFIG_NVM */ 495#else /* CONFIG_NVM */
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index a726f96010d5..496ff759f84c 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -40,6 +40,8 @@
40 * @node_name: FC WWNN for the port 40 * @node_name: FC WWNN for the port
41 * @port_name: FC WWPN for the port 41 * @port_name: FC WWPN for the port
42 * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) 42 * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx)
43 * @dev_loss_tmo: maximum delay for reconnects to an association on
44 * this device. Used only on a remoteport.
43 * 45 *
44 * Initialization values for dynamic port fields: 46 * Initialization values for dynamic port fields:
45 * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must 47 * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must
@@ -50,6 +52,7 @@ struct nvme_fc_port_info {
50 u64 port_name; 52 u64 port_name;
51 u32 port_role; 53 u32 port_role;
52 u32 port_id; 54 u32 port_id;
55 u32 dev_loss_tmo;
53}; 56};
54 57
55 58
@@ -102,8 +105,6 @@ enum nvmefc_fcp_datadir {
102}; 105};
103 106
104 107
105#define NVME_FC_MAX_SEGMENTS 256
106
107/** 108/**
108 * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport 109 * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport
109 * to LLDD in order to perform a NVME FCP IO operation. 110 * to LLDD in order to perform a NVME FCP IO operation.
@@ -202,6 +203,9 @@ enum nvme_fc_obj_state {
202 * The length of the buffer corresponds to the local_priv_sz 203 * The length of the buffer corresponds to the local_priv_sz
203 * value specified in the nvme_fc_port_template supplied by 204 * value specified in the nvme_fc_port_template supplied by
204 * the LLDD. 205 * the LLDD.
206 * @dev_loss_tmo: maximum delay for reconnects to an association on
207 * this device. To modify, lldd must call
208 * nvme_fc_set_remoteport_devloss().
205 * 209 *
206 * Fields with dynamic values. Values may change base on link state. LLDD 210 * Fields with dynamic values. Values may change base on link state. LLDD
207 * may reference fields directly to change them. Initialized by the 211 * may reference fields directly to change them. Initialized by the
@@ -259,10 +263,9 @@ struct nvme_fc_remote_port {
259 u32 port_role; 263 u32 port_role;
260 u64 node_name; 264 u64 node_name;
261 u64 port_name; 265 u64 port_name;
262
263 struct nvme_fc_local_port *localport; 266 struct nvme_fc_local_port *localport;
264
265 void *private; 267 void *private;
268 u32 dev_loss_tmo;
266 269
267 /* dynamic fields */ 270 /* dynamic fields */
268 u32 port_id; 271 u32 port_id;
@@ -446,6 +449,10 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
446 449
447int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); 450int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport);
448 451
452void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport);
453
454int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport,
455 u32 dev_loss_tmo);
449 456
450 457
451/* 458/*
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9310ce77d8e1..aea87f0d917b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -90,6 +90,14 @@ enum {
90}; 90};
91 91
92#define NVME_AQ_DEPTH 32 92#define NVME_AQ_DEPTH 32
93#define NVME_NR_AEN_COMMANDS 1
94#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
95
96/*
97 * Subtract one to leave an empty queue entry for 'Full Queue' condition. See
98 * NVM-Express 1.2 specification, section 4.1.2.
99 */
100#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1)
93 101
94enum { 102enum {
95 NVME_REG_CAP = 0x0000, /* Controller Capabilities */ 103 NVME_REG_CAP = 0x0000, /* Controller Capabilities */
@@ -267,6 +275,7 @@ enum {
267 NVME_CTRL_OACS_SEC_SUPP = 1 << 0, 275 NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
268 NVME_CTRL_OACS_DIRECTIVES = 1 << 5, 276 NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
269 NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, 277 NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8,
278 NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1,
270}; 279};
271 280
272struct nvme_lbaf { 281struct nvme_lbaf {
@@ -396,6 +405,21 @@ struct nvme_fw_slot_info_log {
396}; 405};
397 406
398enum { 407enum {
408 NVME_CMD_EFFECTS_CSUPP = 1 << 0,
409 NVME_CMD_EFFECTS_LBCC = 1 << 1,
410 NVME_CMD_EFFECTS_NCC = 1 << 2,
411 NVME_CMD_EFFECTS_NIC = 1 << 3,
412 NVME_CMD_EFFECTS_CCC = 1 << 4,
413 NVME_CMD_EFFECTS_CSE_MASK = 3 << 16,
414};
415
416struct nvme_effects_log {
417 __le32 acs[256];
418 __le32 iocs[256];
419 __u8 resv[2048];
420};
421
422enum {
399 NVME_SMART_CRIT_SPARE = 1 << 0, 423 NVME_SMART_CRIT_SPARE = 1 << 0,
400 NVME_SMART_CRIT_TEMPERATURE = 1 << 1, 424 NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
401 NVME_SMART_CRIT_RELIABILITY = 1 << 2, 425 NVME_SMART_CRIT_RELIABILITY = 1 << 2,
@@ -404,6 +428,10 @@ enum {
404}; 428};
405 429
406enum { 430enum {
431 NVME_AER_ERROR = 0,
432 NVME_AER_SMART = 1,
433 NVME_AER_CSS = 6,
434 NVME_AER_VS = 7,
407 NVME_AER_NOTICE_NS_CHANGED = 0x0002, 435 NVME_AER_NOTICE_NS_CHANGED = 0x0002,
408 NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, 436 NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102,
409}; 437};
@@ -681,6 +709,7 @@ enum nvme_admin_opcode {
681 nvme_admin_format_nvm = 0x80, 709 nvme_admin_format_nvm = 0x80,
682 nvme_admin_security_send = 0x81, 710 nvme_admin_security_send = 0x81,
683 nvme_admin_security_recv = 0x82, 711 nvme_admin_security_recv = 0x82,
712 nvme_admin_sanitize_nvm = 0x84,
684}; 713};
685 714
686enum { 715enum {
@@ -712,6 +741,7 @@ enum {
712 NVME_LOG_ERROR = 0x01, 741 NVME_LOG_ERROR = 0x01,
713 NVME_LOG_SMART = 0x02, 742 NVME_LOG_SMART = 0x02,
714 NVME_LOG_FW_SLOT = 0x03, 743 NVME_LOG_FW_SLOT = 0x03,
744 NVME_LOG_CMD_EFFECTS = 0x05,
715 NVME_LOG_DISC = 0x70, 745 NVME_LOG_DISC = 0x70,
716 NVME_LOG_RESERVATION = 0x80, 746 NVME_LOG_RESERVATION = 0x80,
717 NVME_FWACT_REPL = (0 << 3), 747 NVME_FWACT_REPL = (0 << 3),
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a1904aadbc45..0dcc60e820de 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb);
211 */ 211 */
212bool sbitmap_any_bit_clear(const struct sbitmap *sb); 212bool sbitmap_any_bit_clear(const struct sbitmap *sb);
213 213
214#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
215#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
216
214typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); 217typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
215 218
216/** 219/**
217 * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. 220 * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
221 * @start: Where to start the iteration.
218 * @sb: Bitmap to iterate over. 222 * @sb: Bitmap to iterate over.
219 * @fn: Callback. Should return true to continue or false to break early. 223 * @fn: Callback. Should return true to continue or false to break early.
220 * @data: Pointer to pass to callback. 224 * @data: Pointer to pass to callback.
@@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
222 * This is inline even though it's non-trivial so that the function calls to the 226 * This is inline even though it's non-trivial so that the function calls to the
223 * callback will hopefully get optimized away. 227 * callback will hopefully get optimized away.
224 */ 228 */
225static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, 229static inline void __sbitmap_for_each_set(struct sbitmap *sb,
226 void *data) 230 unsigned int start,
231 sb_for_each_fn fn, void *data)
227{ 232{
228 unsigned int i; 233 unsigned int index;
234 unsigned int nr;
235 unsigned int scanned = 0;
229 236
230 for (i = 0; i < sb->map_nr; i++) { 237 if (start >= sb->depth)
231 struct sbitmap_word *word = &sb->map[i]; 238 start = 0;
232 unsigned int off, nr; 239 index = SB_NR_TO_INDEX(sb, start);
240 nr = SB_NR_TO_BIT(sb, start);
233 241
234 if (!word->word) 242 while (scanned < sb->depth) {
235 continue; 243 struct sbitmap_word *word = &sb->map[index];
244 unsigned int depth = min_t(unsigned int, word->depth - nr,
245 sb->depth - scanned);
236 246
237 nr = 0; 247 scanned += depth;
238 off = i << sb->shift; 248 if (!word->word)
249 goto next;
250
251 /*
252 * On the first iteration of the outer loop, we need to add the
253 * bit offset back to the size of the word for find_next_bit().
254 * On all other iterations, nr is zero, so this is a noop.
255 */
256 depth += nr;
239 while (1) { 257 while (1) {
240 nr = find_next_bit(&word->word, word->depth, nr); 258 nr = find_next_bit(&word->word, depth, nr);
241 if (nr >= word->depth) 259 if (nr >= depth)
242 break; 260 break;
243 261 if (!fn(sb, (index << sb->shift) + nr, data))
244 if (!fn(sb, off + nr, data))
245 return; 262 return;
246 263
247 nr++; 264 nr++;
248 } 265 }
266next:
267 nr = 0;
268 if (++index >= sb->map_nr)
269 index = 0;
249 } 270 }
250} 271}
251 272
252#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) 273/**
253#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) 274 * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
275 * @sb: Bitmap to iterate over.
276 * @fn: Callback. Should return true to continue or false to break early.
277 * @data: Pointer to pass to callback.
278 */
279static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
280 void *data)
281{
282 __sbitmap_for_each_set(sb, 0, fn, data);
283}
254 284
255static inline unsigned long *__sbitmap_word(struct sbitmap *sb, 285static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
256 unsigned int bitnr) 286 unsigned int bitnr)
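
The rewritten iterator above is the interesting part of the sbitmap change: __sbitmap_for_each_set() starts at an arbitrary bit, wraps around, and visits every bit exactly once, while sbitmap_for_each_set() becomes the start == 0 special case. A small usage sketch; the bitmap and the start_hint value are assumed to have been set up elsewhere:

        static bool count_set_bit(struct sbitmap *sb, unsigned int bitnr, void *data)
        {
                unsigned int *count = data;

                (*count)++;
                return true;    /* returning false would stop the walk early */
        }

        static unsigned int count_from(struct sbitmap *sb, unsigned int start_hint)
        {
                unsigned int count = 0;

                __sbitmap_for_each_set(sb, start_hint, count_set_bit, &count);
                return count;
        }
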
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e12d92808e98..f42d85631d17 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -43,28 +43,6 @@ enum writeback_sync_modes {
43}; 43};
44 44
45/* 45/*
46 * why some writeback work was initiated
47 */
48enum wb_reason {
49 WB_REASON_BACKGROUND,
50 WB_REASON_VMSCAN,
51 WB_REASON_SYNC,
52 WB_REASON_PERIODIC,
53 WB_REASON_LAPTOP_TIMER,
54 WB_REASON_FREE_MORE_MEM,
55 WB_REASON_FS_FREE_SPACE,
56 /*
57 * There is no bdi forker thread any more and works are done
58 * by emergency worker, however, this is TPs userland visible
59 * and we'll be exposing exactly the same information,
60 * so it has a mismatch name.
61 */
62 WB_REASON_FORKER_THREAD,
63
64 WB_REASON_MAX,
65};
66
67/*
68 * A control structure which tells the writeback code what to do. These are 46 * A control structure which tells the writeback code what to do. These are
69 * always on the stack, and hence need no locking. They are always initialised 47 * always on the stack, and hence need no locking. They are always initialised
70 * in a manner such that unspecified fields are set to zero. 48 * in a manner such that unspecified fields are set to zero.
@@ -186,11 +164,11 @@ struct bdi_writeback;
186void writeback_inodes_sb(struct super_block *, enum wb_reason reason); 164void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
187void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, 165void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
188 enum wb_reason reason); 166 enum wb_reason reason);
189bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); 167void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
190bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
191 enum wb_reason reason);
192void sync_inodes_sb(struct super_block *); 168void sync_inodes_sb(struct super_block *);
193void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); 169void wakeup_flusher_threads(enum wb_reason reason);
170void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
171 enum wb_reason reason);
194void inode_wait_for_writeback(struct inode *inode); 172void inode_wait_for_writeback(struct inode *inode);
195 173
196/* writeback.h requires fs.h; it, too, is not included from here. */ 174/* writeback.h requires fs.h; it, too, is not included from here. */
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 571ddb49b926..73af87dfbff8 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -221,6 +221,7 @@ struct scsi_device {
221 unsigned char access_state; 221 unsigned char access_state;
222 struct mutex state_mutex; 222 struct mutex state_mutex;
223 enum scsi_device_state sdev_state; 223 enum scsi_device_state sdev_state;
224 struct task_struct *quiesced_by;
224 unsigned long sdev_data[0]; 225 unsigned long sdev_data[0];
225} __attribute__((aligned(sizeof(unsigned long)))); 226} __attribute__((aligned(sizeof(unsigned long))));
226 227
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 2e1fa7910306..32db72c7c055 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -287,7 +287,6 @@ DEFINE_EVENT(writeback_class, name, \
287 TP_PROTO(struct bdi_writeback *wb), \ 287 TP_PROTO(struct bdi_writeback *wb), \
288 TP_ARGS(wb)) 288 TP_ARGS(wb))
289 289
290DEFINE_WRITEBACK_EVENT(writeback_nowork);
291DEFINE_WRITEBACK_EVENT(writeback_wake_background); 290DEFINE_WRITEBACK_EVENT(writeback_wake_background);
292 291
293TRACE_EVENT(writeback_bdi_register, 292TRACE_EVENT(writeback_bdi_register,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ba3992c8c375..8af313081b0d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/cgroup.h>
24#include <trace/events/sched.h> 23#include <trace/events/sched.h>
25 24
26static DEFINE_SPINLOCK(kthread_create_lock); 25static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
47 void *data; 46 void *data;
48 struct completion parked; 47 struct completion parked;
49 struct completion exited; 48 struct completion exited;
49#ifdef CONFIG_BLK_CGROUP
50 struct cgroup_subsys_state *blkcg_css;
51#endif
50}; 52};
51 53
52enum KTHREAD_BITS { 54enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
74 76
75void free_kthread_struct(struct task_struct *k) 77void free_kthread_struct(struct task_struct *k)
76{ 78{
79 struct kthread *kthread;
80
77 /* 81 /*
78 * Can be NULL if this kthread was created by kernel_thread() 82 * Can be NULL if this kthread was created by kernel_thread()
79 * or if kmalloc() in kthread() failed. 83 * or if kmalloc() in kthread() failed.
80 */ 84 */
81 kfree(to_kthread(k)); 85 kthread = to_kthread(k);
86#ifdef CONFIG_BLK_CGROUP
87 WARN_ON_ONCE(kthread && kthread->blkcg_css);
88#endif
89 kfree(kthread);
82} 90}
83 91
84/** 92/**
@@ -196,7 +204,7 @@ static int kthread(void *_create)
196 struct kthread *self; 204 struct kthread *self;
197 int ret; 205 int ret;
198 206
199 self = kmalloc(sizeof(*self), GFP_KERNEL); 207 self = kzalloc(sizeof(*self), GFP_KERNEL);
200 set_kthread_struct(self); 208 set_kthread_struct(self);
201 209
202 /* If user was SIGKILLed, I release the structure. */ 210 /* If user was SIGKILLed, I release the structure. */
@@ -212,7 +220,6 @@ static int kthread(void *_create)
212 do_exit(-ENOMEM); 220 do_exit(-ENOMEM);
213 } 221 }
214 222
215 self->flags = 0;
216 self->data = data; 223 self->data = data;
217 init_completion(&self->exited); 224 init_completion(&self->exited);
218 init_completion(&self->parked); 225 init_completion(&self->parked);
@@ -1152,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
1152 kfree(worker); 1159 kfree(worker);
1153} 1160}
1154EXPORT_SYMBOL(kthread_destroy_worker); 1161EXPORT_SYMBOL(kthread_destroy_worker);
1162
1163#ifdef CONFIG_BLK_CGROUP
1164/**
1165 * kthread_associate_blkcg - associate blkcg to current kthread
1166 * @css: the cgroup info
1167 *
1168 * Current thread must be a kthread. The thread is running jobs on behalf of
1169 * other threads. In some cases, we expect the jobs attach cgroup info of
1170 * original threads instead of that of current thread. This function stores
1171 * original thread's cgroup info in current kthread context for later
1172 * retrieval.
1173 */
1174void kthread_associate_blkcg(struct cgroup_subsys_state *css)
1175{
1176 struct kthread *kthread;
1177
1178 if (!(current->flags & PF_KTHREAD))
1179 return;
1180 kthread = to_kthread(current);
1181 if (!kthread)
1182 return;
1183
1184 if (kthread->blkcg_css) {
1185 css_put(kthread->blkcg_css);
1186 kthread->blkcg_css = NULL;
1187 }
1188 if (css) {
1189 css_get(css);
1190 kthread->blkcg_css = css;
1191 }
1192}
1193EXPORT_SYMBOL(kthread_associate_blkcg);
1194
1195/**
1196 * kthread_blkcg - get associated blkcg css of current kthread
1197 *
1198 * Current thread must be a kthread.
1199 */
1200struct cgroup_subsys_state *kthread_blkcg(void)
1201{
1202 struct kthread *kthread;
1203
1204 if (current->flags & PF_KTHREAD) {
1205 kthread = to_kthread(current);
1206 if (kthread)
1207 return kthread->blkcg_css;
1208 }
1209 return NULL;
1210}
1211EXPORT_SYMBOL(kthread_blkcg);
1212#endif
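
kthread_associate_blkcg()/kthread_blkcg() give a kthread a way to temporarily carry the blkcg of the task it is working for, and bio_blkcg() (see the blk-cgroup.h hunk earlier) now consults that association before falling back to current's css. A hedged sketch of the expected pattern from inside a worker kthread; work->blkcg_css and do_work_io() are placeholders:

        /* adopt the originating task's blkcg; this takes a css reference */
        kthread_associate_blkcg(work->blkcg_css);

        do_work_io(work);       /* bios submitted here are attributed to that blkcg */

        /* drop the association and the reference taken above */
        kthread_associate_blkcg(NULL);
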
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc2eaea..9576bd582d4a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1342,11 +1342,6 @@ static struct ctl_table vm_table[] = {
1342 .extra1 = &zero, 1342 .extra1 = &zero,
1343 }, 1343 },
1344 { 1344 {
1345 .procname = "nr_pdflush_threads",
1346 .mode = 0444 /* read-only */,
1347 .proc_handler = pdflush_proc_obsolete,
1348 },
1349 {
1350 .procname = "swappiness", 1345 .procname = "swappiness",
1351 .data = &vm_swappiness, 1346 .data = &vm_swappiness,
1352 .maxlen = sizeof(vm_swappiness), 1347 .maxlen = sizeof(vm_swappiness),
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 45a3928544ce..206e0e2ace53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
66}; 66};
67 67
68/* Global reference count of probes */ 68/* Global reference count of probes */
69static atomic_t blk_probes_ref = ATOMIC_INIT(0); 69static DEFINE_MUTEX(blk_probe_mutex);
70static int blk_probes_ref;
70 71
71static void blk_register_tracepoints(void); 72static void blk_register_tracepoints(void);
72static void blk_unregister_tracepoints(void); 73static void blk_unregister_tracepoints(void);
@@ -329,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt)
329 kfree(bt); 330 kfree(bt);
330} 331}
331 332
333static void get_probe_ref(void)
334{
335 mutex_lock(&blk_probe_mutex);
336 if (++blk_probes_ref == 1)
337 blk_register_tracepoints();
338 mutex_unlock(&blk_probe_mutex);
339}
340
341static void put_probe_ref(void)
342{
343 mutex_lock(&blk_probe_mutex);
344 if (!--blk_probes_ref)
345 blk_unregister_tracepoints();
346 mutex_unlock(&blk_probe_mutex);
347}
348
332static void blk_trace_cleanup(struct blk_trace *bt) 349static void blk_trace_cleanup(struct blk_trace *bt)
333{ 350{
334 blk_trace_free(bt); 351 blk_trace_free(bt);
335 if (atomic_dec_and_test(&blk_probes_ref)) 352 put_probe_ref();
336 blk_unregister_tracepoints();
337} 353}
338 354
339int blk_trace_remove(struct request_queue *q) 355static int __blk_trace_remove(struct request_queue *q)
340{ 356{
341 struct blk_trace *bt; 357 struct blk_trace *bt;
342 358
@@ -349,6 +365,17 @@ int blk_trace_remove(struct request_queue *q)
349 365
350 return 0; 366 return 0;
351} 367}
368
369int blk_trace_remove(struct request_queue *q)
370{
371 int ret;
372
373 mutex_lock(&q->blk_trace_mutex);
374 ret = __blk_trace_remove(q);
375 mutex_unlock(&q->blk_trace_mutex);
376
377 return ret;
378}
352EXPORT_SYMBOL_GPL(blk_trace_remove); 379EXPORT_SYMBOL_GPL(blk_trace_remove);
353 380
354static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 381static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -538,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
538 if (cmpxchg(&q->blk_trace, NULL, bt)) 565 if (cmpxchg(&q->blk_trace, NULL, bt))
539 goto err; 566 goto err;
540 567
541 if (atomic_inc_return(&blk_probes_ref) == 1) 568 get_probe_ref();
542 blk_register_tracepoints();
543 569
544 ret = 0; 570 ret = 0;
545err: 571err:
@@ -550,9 +576,8 @@ err:
550 return ret; 576 return ret;
551} 577}
552 578
553int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 579static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
554 struct block_device *bdev, 580 struct block_device *bdev, char __user *arg)
555 char __user *arg)
556{ 581{
 	struct blk_user_trace_setup buts;
 	int ret;
@@ -571,6 +596,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 	return 0;
 }
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
+		    char __user *arg)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_setup(q, name, dev, bdev, arg);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -607,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 }
 #endif
 
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
 	struct blk_trace *bt = q->blk_trace;
@@ -646,6 +684,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
 
 	return ret;
 }
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_startstop(q, start);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
 /*
@@ -676,7 +725,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 	case BLKTRACESETUP32:
@@ -687,10 +736,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
-		ret = blk_trace_startstop(q, start);
+		ret = __blk_trace_startstop(q, start);
 		break;
 	case BLKTRACETEARDOWN:
-		ret = blk_trace_remove(q);
+		ret = __blk_trace_remove(q);
 		break;
 	default:
 		ret = -ENOTTY;
@@ -708,10 +757,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
+	mutex_lock(&q->blk_trace_mutex);
+
 	if (q->blk_trace) {
-		blk_trace_startstop(q, 0);
-		blk_trace_remove(q);
+		__blk_trace_startstop(q, 0);
+		__blk_trace_remove(q);
 	}
+
+	mutex_unlock(&q->blk_trace_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
@@ -1558,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-
+	put_probe_ref();
 	blk_trace_free(bt);
 	return 0;
 }
@@ -1591,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto free_bt;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 	return 0;
 
 free_bt:
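
The blktrace hunks above all follow one serialization pattern: each existing entry point becomes an unlocked __ helper, and a thin exported wrapper takes q->blk_trace_mutex around it, so setup, start/stop, and teardown can no longer race one another. The ioctl path, which already serializes on the same mutex, switches to the unlocked __ variants so the lock is not taken twice. A minimal sketch of that pattern with hypothetical "foo" names (not code from this series):

#include <linux/errno.h>
#include <linux/mutex.h>

struct foo_queue {
	struct mutex	foo_mutex;	/* plays the role of blk_trace_mutex */
	void		*foo_state;	/* plays the role of q->blk_trace */
};

/* Unlocked helper: callers must hold foo_mutex, like __blk_trace_setup(). */
static int __foo_teardown(struct foo_queue *q)
{
	if (!q->foo_state)
		return -EINVAL;
	q->foo_state = NULL;
	return 0;
}

/* Exported-style wrapper: serializes against all other foo operations. */
int foo_teardown(struct foo_queue *q)
{
	int ret;

	mutex_lock(&q->foo_mutex);
	ret = __foo_teardown(q);
	mutex_unlock(&q->foo_mutex);

	return ret;
}
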
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e19606bb41a0..74b52dfd5852 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1072,23 +1072,3 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(wait_iff_congested);
-
-int pdflush_proc_obsolete(struct ctl_table *table, int write,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	char kbuf[] = "0\n";
-
-	if (*ppos || *lenp < sizeof(kbuf)) {
-		*lenp = 0;
-		return 0;
-	}
-
-	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
-		return -EFAULT;
-	pr_warn_once("%s exported in /proc is scheduled for removal\n",
-		     table->procname);
-
-	*lenp = 2;
-	*ppos += *lenp;
-	return 2;
-}
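
The removal above retires pdflush_proc_obsolete(), a stub whose only job was to report "0" for sysctls left over from the pdflush era and warn once that they were scheduled for removal. For reference, a ctl_table entry wired to a handler of that kind looks roughly like the sketch below; this is illustrative only, not the exact kernel/sysctl.c entry dropped elsewhere in this series:

#include <linux/sysctl.h>

/* Sketch: an obsolete, read-only knob bound to the stub removed above. */
static struct ctl_table obsolete_vm_table[] = {
	{
		.procname	= "nr_pdflush_threads",	/* e.g. the old pdflush knob */
		.mode		= 0444,
		.proc_handler	= pdflush_proc_obsolete,
	},
	{ }
};
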
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b9c5cbe8eba..c518c845f202 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1972,31 +1972,31 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, buffer, length, ppos);
-	return 0;
+	unsigned int old_interval = dirty_writeback_interval;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, length, ppos);
+
+	/*
+	 * Writing 0 to dirty_writeback_interval will disable periodic writeback
+	 * and a different non-zero value will wakeup the writeback threads.
+	 * wb_wakeup_delayed() would be more appropriate, but it's a pain to
+	 * iterate over all bdis and wbs.
+	 * The reason we do this is to make the change take effect immediately.
+	 */
+	if (!ret && write && dirty_writeback_interval &&
+		dirty_writeback_interval != old_interval)
+		wakeup_flusher_threads(WB_REASON_PERIODIC);
+
+	return ret;
 }
 
 #ifdef CONFIG_BLOCK
 void laptop_mode_timer_fn(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
-	int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
-			global_node_page_state(NR_UNSTABLE_NFS);
-	struct bdi_writeback *wb;
 
-	/*
-	 * We want to write everything out, not just down to the dirty
-	 * threshold
-	 */
-	if (!bdi_has_dirty_io(q->backing_dev_info))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
-		if (wb_has_dirty_io(wb))
-			wb_start_writeback(wb, nr_pages, true,
-					   WB_REASON_LAPTOP_TIMER);
-	rcu_read_unlock();
+	wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER);
 }
 
 /*
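
Both page-writeback.c hunks route through the reworked flusher wakeups: the sysctl handler now kicks the writeback threads whenever a write actually changes the interval (so a new dirty_writeback_centisecs value takes effect immediately, and 0 disables periodic writeback), while laptop_mode_timer_fn() delegates the whole bdi to wakeup_flusher_threads_bdi() instead of walking the wb list by hand. The handler change is an instance of a reusable "act only when the value really changed" pattern; a hedged sketch of that pattern with hypothetical names (my_interval, my_interval_handler), not the kernel function itself:

#include <linux/sysctl.h>
#include <linux/writeback.h>

static unsigned int my_interval;	/* stands in for dirty_writeback_interval */

int my_interval_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned int old = my_interval;
	int ret;

	/* Let the generic parser update my_interval via table->data. */
	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	/* Only kick the workers on a successful write that changed the value. */
	if (!ret && write && my_interval && my_interval != old)
		wakeup_flusher_threads(WB_REASON_PERIODIC);

	return ret;
}
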
diff --git a/mm/page_io.c b/mm/page_io.c
index 5d882de3fbfd..cd52b9cc169b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -408,7 +408,7 @@ int swap_readpage(struct page *page, bool do_poll)
 		if (!READ_ONCE(bio->bi_private))
 			break;
 
-		if (!blk_mq_poll(disk->queue, qc))
+		if (!blk_poll(disk->queue, qc))
 			break;
 	}
 	__set_current_state(TASK_RUNNING);
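
The swap_readpage() hunk is a mechanical rename that follows the block core: blk_mq_poll() becomes blk_poll(). The surrounding code is the usual synchronous polled-read idiom; a hedged sketch of that idiom, consolidated into one hypothetical helper (wait_for_polled_bio) where bi_private acts as the completion flag cleared by the bio's end_io handler:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

/* Sketch only: busy-poll a submitted bio to completion, swap_readpage()-style. */
static void wait_for_polled_bio(struct gendisk *disk, struct bio *bio,
				blk_qc_t qc)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		/* the end_io handler clears bi_private once the read is done */
		if (!READ_ONCE(bio->bi_private))
			break;
		/* nothing left to poll on this queue: stop spinning */
		if (!blk_poll(disk->queue, qc))
			break;
	}
	__set_current_state(TASK_RUNNING);
}
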
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb2f0315b8c0..15b483ef6440 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1868,7 +1868,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		 * also allow kswapd to start writing pages during reclaim.
 		 */
 		if (stat.nr_unqueued_dirty == nr_taken) {
-			wakeup_flusher_threads(0, WB_REASON_VMSCAN);
+			wakeup_flusher_threads(WB_REASON_VMSCAN);
 			set_bit(PGDAT_DIRTY, &pgdat->flags);
 		}
 
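
The vmscan.c hunk reflects the same writeback rework as the page-writeback.c changes: wakeup_flusher_threads() drops its page-count argument, since full-flush wakeups now always write everything back. As implied by the call sites above, the entry points this series converges on look roughly like the following prototypes (a sketch, not verbatim declarations; the definitions live in the fs-writeback changes earlier in this merge):

/* Wake all flusher threads for the given reason. */
void wakeup_flusher_threads(enum wb_reason reason);

/* Wake only the flusher threads backing one bdi, as laptop mode now does. */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
				enum wb_reason reason);
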