author     Linus Torvalds <torvalds@linux-foundation.org>  2017-11-14 18:32:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-11-14 18:32:19 -0500
commit     e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
tree       b97a90170c45211bcc437761653aa8016c34afcd
parent     abc36be236358162202e86ad88616ff95a755101 (diff)
parent     a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
"This is the main pull request for block storage for 4.15-rc1.
Nothing out of the ordinary in here, and no API changes or anything
like that. Just various new features for drivers, core changes, etc.
In particular, this pull request contains:
- A patch series from Bart, closing the hole in blk/scsi-mq queue
quiescing.
- A series from Christoph, building towards hidden gendisks (for
multipath) and ability to move bio chains around.
- NVMe
    - Support for native multipath for NVMe (Christoph).
    - Userspace notifications for AENs (Keith).
    - Command side-effects support (Keith).
    - SGL support (Chaitanya Kulkarni)
    - FC fixes and improvements (James Smart)
    - Lots of fixes and tweaks (Various)
- bcache
    - New maintainer (Michael Lyle)
    - Writeback control improvements (Michael)
    - Various fixes (Coly, Elena, Eric, Liang, et al)
- lightnvm updates, mostly centered around the pblk interface
(Javier, Hans, and Rakesh).
- Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)
- Writeback series that fix the much discussed hundreds of millions
of sync-all units. This goes all the way, as discussed previously
(me).
- Fix for missing wakeup on writeback timer adjustments (Yafang
Shao).
- Fix laptop mode on blk-mq (me).
- {mq,name} tuple lookup for IO schedulers, allowing us to have
alias names. This means you can use 'deadline' on both !mq and on
mq (where it's called mq-deadline). (me).
- blktrace race fix, oopsing on sg load (me).
- blk-mq optimizations (me).
- Obscure waitqueue race fix for kyber (Omar).
- NBD fixes (Josef).
- Disable writeback throttling by default on bfq, like we do on cfq
(Luca Miccio).
- Series from Ming that enable us to treat flush requests on blk-mq
like any other request. This is a really nice cleanup.
- Series from Ming that improves merging on blk-mq with schedulers,
getting us closer to flipping the switch on scsi-mq again.
- BFQ updates (Paolo).
- blk-mq atomic flags memory ordering fixes (Peter Z).
- Loop cgroup support (Shaohua).
- Lots of minor fixes from lots of different folks, both for core and
driver code"
* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
nvme: fix visibility of "uuid" ns attribute
blk-mq: fixup some comment typos and lengths
ide: ide-atapi: fix compile error with defining macro DEBUG
blk-mq: improve tag waiting setup for non-shared tags
brd: remove unused brd_mutex
blk-mq: only run the hardware queue if IO is pending
block: avoid null pointer dereference on null disk
fs: guard_bio_eod() needs to consider partitions
xtensa/simdisk: fix compile error
nvme: expose subsys attribute to sysfs
nvme: create 'slaves' and 'holders' entries for hidden controllers
block: create 'slaves' and 'holders' entries for hidden gendisks
nvme: also expose the namespace identification sysfs files for mpath nodes
nvme: implement multipath access to nvme subsystems
nvme: track shared namespaces
nvme: introduce a nvme_ns_ids structure
nvme: track subsystems
block, nvme: Introduce blk_mq_req_flags_t
block, scsi: Make SCSI quiesce and resume work reliably
block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
...
131 files changed, 5470 insertions, 3089 deletions
diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
deleted file mode 100644
index b0b0eeb20fe3..000000000000
--- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
+++ /dev/null
@@ -1,5 +0,0 @@
-What:		/proc/sys/vm/nr_pdflush_threads
-Date:		June 2012
-Contact:	Wanpeng Li <liwp@linux.vnet.ibm.com>
-Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush
-             exported in /proc/sys/vm/ should be removed.
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 9490f2845f06..86927029a52d 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in
 which case a virtual mapping of the page is required. For SCSI it is also
 done in some scenarios where the low level driver cannot be trusted to
 handle a single sg entry correctly. The driver is expected to perform the
-kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq
-routines as appropriate. A driver could also use the blk_queue_bounce()
-routine on its own to bounce highmem i/o to low memory for specific requests
-if so desired.
+kmaps as needed on such occasions as appropriate. A driver could also use
+the blk_queue_bounce() routine on its own to bounce highmem i/o to low
+memory for specific requests if so desired.
 
 iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
 
@@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For
 PIO drivers (or drivers that need to revert to PIO transfer once in a
 while (IDE for example)), where the CPU is doing the actual data
 transfer a virtual mapping is needed. If the driver supports highmem I/O,
-(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to
-temporarily map a bio into the virtual address space.
+(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
+a bio into the virtual address space.
 
 
 8. Prior/Related/Impacted patches
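For a driver that previously leaned on __bio_kmap_atomic(), the replacement the updated text alludes to is an open-coded kmap_atomic()/kunmap_atomic() pair per bio segment; the xtensa simdisk hunk later in this diff makes exactly this conversion in tree. A minimal sketch, with the helper name and the per-segment work purely illustrative:

	#include <linux/bio.h>
	#include <linux/highmem.h>

	/* Illustrative only: map each segment of a bio for CPU access. */
	static void example_touch_bio(struct bio *bio)
	{
		struct bio_vec bvec;
		struct bvec_iter iter;

		bio_for_each_segment(bvec, bio, iter) {
			/* kmap_atomic() maps the page; add the offset by hand. */
			char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset;

			/* ... operate on buffer for bvec.bv_len bytes ... */

			kunmap_atomic(buffer);
		}
	}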
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 3140dbd860d8..733927a7b501 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -38,7 +38,7 @@ gb=[Size in GB]: Default: 250GB
 bs=[Block size (in bytes)]: Default: 512 bytes
   The block size reported to the system.
 
-nr_devices=[Number of devices]: Default: 2
+nr_devices=[Number of devices]: Default: 1
   Number of block devices instantiated. They are instantiated as /dev/nullb0,
   etc.
 
@@ -52,13 +52,13 @@ irqmode=[0-2]: Default: 1-Soft-irq
   2: Timer: Waits a specific period (completion_nsec) for each IO before
      completion.
 
-completion_nsec=[ns]: Default: 10.000ns
+completion_nsec=[ns]: Default: 10,000ns
   Combined with irqmode=2 (timer). The time each completion event must wait.
 
-submit_queues=[0..nr_cpus]:
+submit_queues=[1..nr_cpus]:
   The number of submission queues attached to the device driver. If unset, it
-  defaults to 1 on single-queue and bio-based instances. For multi-queue,
-  it is ignored when use_per_node_hctx module parameter is 1.
+  defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
+  parameter is 1.
 
 hw_queue_depth=[0..qdepth]: Default: 64
   The hardware queue depth of the device.
@@ -73,3 +73,12 @@ use_per_node_hctx=[0/1]: Default: 0
 
 use_lightnvm=[0/1]: Default: 0
   Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
+
+no_sched=[0/1]: Default: 0
+  0: nullb* use default blk-mq io scheduler.
+  1: nullb* doesn't use io scheduler.
+
+shared_tags=[0/1]: Default: 0
+  0: Tag set is not shared.
+  1: Tag set shared between devices for blk-mq. Only makes sense with
+     nr_devices > 1, otherwise there's no tag set to share.
diff --git a/MAINTAINERS b/MAINTAINERS
index e372994747b7..ba3d8c197d92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2562,10 +2562,12 @@ S:	Maintained
 F:	drivers/net/hamradio/baycom*
 
 BCACHE (BLOCK LAYER CACHE)
+M:	Michael Lyle <mlyle@lyle.org>
 M:	Kent Overstreet <kent.overstreet@gmail.com>
 L:	linux-bcache@vger.kernel.org
 W:	http://bcache.evilpiepirate.org
-S:	Orphan
+C:	irc://irc.oftc.net/bcache
+S:	Maintained
 F:	drivers/md/bcache/
 
 BDISP ST MEDIA DRIVER
@@ -12085,7 +12087,6 @@ F:	drivers/mmc/host/sdhci-omap.c
 SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
 M:	Scott Bauer <scott.bauer@intel.com>
 M:	Jonathan Derrick <jonathan.derrick@intel.com>
-M:	Rafael Antognolli <rafael.antognolli@intel.com>
 L:	linux-block@vger.kernel.org
 S:	Supported
 F:	block/sed*
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index c45b90bb9339..1b6418407467 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio)
 	sector_t sector = bio->bi_iter.bi_sector;
 
 	bio_for_each_segment(bvec, bio, iter) {
-		char *buffer = __bio_kmap_atomic(bio, iter);
+		char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset;
 		unsigned len = bvec.bv_len >> SECTOR_SHIFT;
 
 		simdisk_transfer(dev, sector, len, buffer,
 				bio_data_dir(bio) == WRITE);
 		sector += len;
-		__bio_kunmap_atomic(buffer);
+		kunmap_atomic(buffer);
 	}
 
 	bio_endio(bio);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index a4783da90ba8..889a8549d97f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -108,6 +108,7 @@
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
 #include "bfq-iosched.h"
+#include "blk-wbt.h"
 
 #define BFQ_BFQQ_FNS(name)						\
 void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)			\
@@ -724,6 +725,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 	}
 }
 
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	/*
+	 * Limit duration between 3 and 13 seconds. Tests show that
+	 * higher values than 13 seconds often yield the opposite of
+	 * the desired result, i.e., worsen responsiveness by letting
+	 * non-interactive and non-soft-real-time applications
+	 * preserve weight raising for a too long time interval.
+	 *
+	 * On the other end, lower values than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
+	 */
+	if (dur > msecs_to_jiffies(13000))
+		dur = msecs_to_jiffies(13000);
+	else if (dur < msecs_to_jiffies(3000))
+		dur = msecs_to_jiffies(3000);
+
+	return dur;
+}
+
+/* switch back from soft real-time to interactive weight raising */
+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
+					  struct bfq_data *bfqd)
+{
+	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+	bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
+}
+
 static void
 bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 		      struct bfq_io_cq *bic, bool bfq_already_existing)
@@ -750,10 +789,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
 	    time_is_before_jiffies(bfqq->last_wr_start_finish +
 				   bfqq->wr_cur_max_time))) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "resume state: switching off wr");
-
-		bfqq->wr_coeff = 1;
+		if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+		    !bfq_bfqq_in_large_burst(bfqq) &&
+		    time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
+					     bfq_wr_duration(bfqd))) {
+			switch_back_to_interactive_wr(bfqq, bfqd);
+		} else {
+			bfqq->wr_coeff = 1;
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "resume state: switching off wr");
+		}
 	}
 
 	/* make sure weight will be updated, however we got here */
@@ -1173,33 +1218,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 	return wr_or_deserves_wr;
 }
 
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
 {
-	u64 dur;
-
-	if (bfqd->bfq_wr_max_time > 0)
-		return bfqd->bfq_wr_max_time;
-
-	dur = bfqd->RT_prod;
-	do_div(dur, bfqd->peak_rate);
-
-	/*
-	 * Limit duration between 3 and 13 seconds. Tests show that
-	 * higher values than 13 seconds often yield the opposite of
-	 * the desired result, i.e., worsen responsiveness by letting
-	 * non-interactive and non-soft-real-time applications
-	 * preserve weight raising for a too long time interval.
-	 *
-	 * On the other end, lower values than 3 seconds make it
-	 * difficult for most interactive tasks to complete their jobs
-	 * before weight-raising finishes.
-	 */
-	if (dur > msecs_to_jiffies(13000))
-		dur = msecs_to_jiffies(13000);
-	else if (dur < msecs_to_jiffies(3000))
-		dur = msecs_to_jiffies(3000);
+	return jiffies + MAX_JIFFY_OFFSET;
+}
 
-	return dur;
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+	return jiffies - MAX_JIFFY_OFFSET;
 }
 
 static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
@@ -1216,7 +1250,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 		} else {
-			bfqq->wr_start_at_switch_to_srt = jiffies;
+			/*
+			 * No interactive weight raising in progress
+			 * here: assign minus infinity to
+			 * wr_start_at_switch_to_srt, to make sure
+			 * that, at the end of the soft-real-time
+			 * weight raising periods that is starting
+			 * now, no interactive weight-raising period
+			 * may be wrongly considered as still in
+			 * progress (and thus actually started by
+			 * mistake).
+			 */
+			bfqq->wr_start_at_switch_to_srt =
+				bfq_smallest_from_now();
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
 				BFQ_SOFTRT_WEIGHT_FACTOR;
 			bfqq->wr_cur_max_time =
@@ -2016,10 +2062,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
 	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
 	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
-	bic->saved_wr_coeff = bfqq->wr_coeff;
-	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
-	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
-	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+	if (unlikely(bfq_bfqq_just_created(bfqq) &&
+		     !bfq_bfqq_in_large_burst(bfqq))) {
+		/*
+		 * bfqq being merged right after being created: bfqq
+		 * would have deserved interactive weight raising, but
+		 * did not make it to be set in a weight-raised state,
+		 * because of this early merge. Store directly the
+		 * weight-raising state that would have been assigned
+		 * to bfqq, so that to avoid that bfqq unjustly fails
+		 * to enjoy weight raising if split soon.
+		 */
+		bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+		bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
+		bic->saved_last_wr_start_finish = jiffies;
+	} else {
+		bic->saved_wr_coeff = bfqq->wr_coeff;
+		bic->saved_wr_start_at_switch_to_srt =
+			bfqq->wr_start_at_switch_to_srt;
+		bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+		bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+	}
 }
 
 static void
@@ -2897,24 +2960,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
 		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 
-/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
-	return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
- * Return the farthest past time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_smallest_from_now(void)
-{
-	return jiffies - MAX_JIFFY_OFFSET;
-}
-
 /**
  * bfq_bfqq_expire - expire a queue.
  * @bfqd: device owning the queue.
@@ -3489,11 +3534,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 					       bfq_wr_duration(bfqd)))
 				bfq_bfqq_end_wr(bfqq);
 			else {
-				/* switch back to interactive wr */
-				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-				bfqq->last_wr_start_finish =
-					bfqq->wr_start_at_switch_to_srt;
+				switch_back_to_interactive_wr(bfqq, bfqd);
 				bfqq->entity.prio_changed = 1;
 			}
 		}
@@ -3685,16 +3726,37 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->ref)
 		return;
 
-	if (bfq_bfqq_sync(bfqq))
+	if (!hlist_unhashed(&bfqq->burst_list_node)) {
+		hlist_del_init(&bfqq->burst_list_node);
 		/*
-		 * The fact that this queue is being destroyed does not
-		 * invalidate the fact that this queue may have been
-		 * activated during the current burst. As a consequence,
-		 * although the queue does not exist anymore, and hence
-		 * needs to be removed from the burst list if there,
-		 * the burst size has not to be decremented.
+		 * Decrement also burst size after the removal, if the
+		 * process associated with bfqq is exiting, and thus
+		 * does not contribute to the burst any longer. This
+		 * decrement helps filter out false positives of large
+		 * bursts, when some short-lived process (often due to
+		 * the execution of commands by some service) happens
+		 * to start and exit while a complex application is
+		 * starting, and thus spawning several processes that
+		 * do I/O (and that *must not* be treated as a large
+		 * burst, see comments on bfq_handle_burst).
+		 *
+		 * In particular, the decrement is performed only if:
+		 * 1) bfqq is not a merged queue, because, if it is,
+		 * then this free of bfqq is not triggered by the exit
+		 * of the process bfqq is associated with, but exactly
+		 * by the fact that bfqq has just been merged.
+		 * 2) burst_size is greater than 0, to handle
+		 * unbalanced decrements. Unbalanced decrements may
+		 * happen in te following case: bfqq is inserted into
+		 * the current burst list--without incrementing
+		 * bust_size--because of a split, but the current
+		 * burst list is not the burst list bfqq belonged to
+		 * (see comments on the case of a split in
+		 * bfq_set_request).
 		 */
-		hlist_del_init(&bfqq->burst_list_node);
+		if (bfqq->bic && bfqq->bfqd->burst_size > 0)
+			bfqq->bfqd->burst_size--;
+	}
 
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -4127,7 +4189,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 		new_bfqq->allocated++;
 		bfqq->allocated--;
 		new_bfqq->ref++;
-		bfq_clear_bfqq_just_created(bfqq);
 		/*
 		 * If the bic associated with the process
 		 * issuing this request still points to bfqq
@@ -4139,6 +4200,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 		if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
 			bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
 					bfqq, new_bfqq);
+
+		bfq_clear_bfqq_just_created(bfqq);
 		/*
 		 * rq is about to be enqueued into new_bfqq,
 		 * release rq reference on bfqq
@@ -4424,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 		else {
 			bfq_clear_bfqq_in_large_burst(bfqq);
 			if (bic->was_in_burst_list)
+				/*
+				 * If bfqq was in the current
+				 * burst list before being
+				 * merged, then we have to add
+				 * it back. And we do not need
+				 * to increase burst_size, as
+				 * we did not decrement
+				 * burst_size when we removed
+				 * bfqq from the burst list as
+				 * a consequence of a merge
+				 * (see comments in
+				 * bfq_put_queue). In this
+				 * respect, it would be rather
+				 * costly to know whether the
+				 * current burst list is still
+				 * the same burst list from
+				 * which bfqq was removed on
+				 * the merge. To avoid this
+				 * cost, if bfqq was in a
+				 * burst list, then we add
+				 * bfqq to the current burst
+				 * list without any further
+				 * check. This can cause
+				 * inappropriate insertions,
+				 * but rarely enough to not
+				 * harm the detection of large
+				 * bursts significantly.
+				 */
 				hlist_add_head(&bfqq->burst_list_node,
 					       &bfqd->burst_list);
 		}
@@ -4775,7 +4866,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
 
-
+	wbt_disable_default(q);
 	return 0;
 
 out_free:
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5df32907ff3b..23b42e8aa03e 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
 
 void bioset_integrity_free(struct bio_set *bs)
 {
-	if (bs->bio_integrity_pool)
-		mempool_destroy(bs->bio_integrity_pool);
-
-	if (bs->bvec_integrity_pool)
-		mempool_destroy(bs->bvec_integrity_pool);
+	mempool_destroy(bs->bio_integrity_pool);
+	mempool_destroy(bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
diff --git a/block/bio.c b/block/bio.c
index cc60213e56d8..b94a802f8ba3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 
 /**
  * bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask:   the GFP_ mask given to the slab allocator
+ * @gfp_mask:   the GFP_* mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
  * @bs:		the bio_set to allocate from.
  *
@@ -1931,11 +1931,8 @@ void bioset_free(struct bio_set *bs)
 	if (bs->rescue_workqueue)
 		destroy_workqueue(bs->rescue_workqueue);
 
-	if (bs->bio_pool)
-		mempool_destroy(bs->bio_pool);
-
-	if (bs->bvec_pool)
-		mempool_destroy(bs->bvec_pool);
+	mempool_destroy(bs->bio_pool);
+	mempool_destroy(bs->bvec_pool);
 
 	bioset_integrity_free(bs);
 	bio_put_slab(bs);
@@ -2036,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet. Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released. The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
-	struct io_context *ioc;
-
-	if (bio->bi_css)
-		return -EBUSY;
-
-	ioc = current->io_context;
-	if (!ioc)
-		return -ENOENT;
-
-	get_io_context_active(ioc);
-	bio->bi_ioc = ioc;
-	bio->bi_css = task_get_css(current, io_cgrp_id);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_current);
-
-/**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
  */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56baee936..4117524ca45b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	if (i >= BLKCG_MAX_POLS)
 		goto err_unlock;
 
+	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
+	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+		goto err_unlock;
+
 	/* register @pol */
 	pol->plid = i;
 	blkcg_policy[pol->plid] = pol;
@@ -1452,7 +1457,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	return 0;
 
 err_free_cpds:
-	if (pol->cpd_alloc_fn) {
+	if (pol->cpd_free_fn) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
 			if (blkcg->cpd[pol->plid]) {
 				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1497,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
 
-	if (pol->cpd_alloc_fn) {
+	if (pol->cpd_free_fn) {
 		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
 			if (blkcg->cpd[pol->plid]) {
 				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
diff --git a/block/blk-core.c b/block/blk-core.c
index 048be4aa6024..7c54c195e79e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -333,11 +333,13 @@ EXPORT_SYMBOL(blk_stop_queue);
 void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->timeout);
+	cancel_work_sync(&q->timeout_work);
 
 	if (q->mq_ops) {
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
+		cancel_delayed_work_sync(&q->requeue_work);
 		queue_for_each_hw_ctx(q, hctx, i)
 			cancel_delayed_work_sync(&hctx->run_work);
 	} else {
@@ -347,6 +349,37 @@ void blk_sync_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_sync_queue);
 
 /**
+ * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
+ * @q: request queue pointer
+ *
+ * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
+ * set and 1 if the flag was already set.
+ */
+int blk_set_preempt_only(struct request_queue *q)
+{
+	unsigned long flags;
+	int res;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(blk_set_preempt_only);
+
+void blk_clear_preempt_only(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+	wake_up_all(&q->mq_freeze_wq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
+
+/**
  * __blk_run_queue_uncond - run a queue whether or not it has been stopped
  * @q:	The queue to run
  *
@@ -610,6 +643,9 @@ void blk_set_queue_dying(struct request_queue *q)
 		}
 		spin_unlock_irq(q->queue_lock);
 	}
+
+	/* Make blk_queue_enter() reexamine the DYING flag. */
+	wake_up_all(&q->mq_freeze_wq);
 }
 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
 
@@ -718,7 +754,7 @@ static void free_request_size(void *element, void *data)
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
-	if (unlikely(rl->rq_pool))
+	if (unlikely(rl->rq_pool) || q->mq_ops)
 		return 0;
 
 	rl->q = q;
@@ -760,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
-int blk_queue_enter(struct request_queue *q, bool nowait)
+/**
+ * blk_queue_enter() - try to increase q->q_usage_counter
+ * @q: request queue pointer
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ */
+int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 {
+	const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+
 	while (true) {
+		bool success = false;
 		int ret;
 
-		if (percpu_ref_tryget_live(&q->q_usage_counter))
+		rcu_read_lock_sched();
+		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+			/*
+			 * The code that sets the PREEMPT_ONLY flag is
+			 * responsible for ensuring that that flag is globally
+			 * visible before the queue is unfrozen.
+			 */
+			if (preempt || !blk_queue_preempt_only(q)) {
+				success = true;
+			} else {
+				percpu_ref_put(&q->q_usage_counter);
+			}
+		}
+		rcu_read_unlock_sched();
+
+		if (success)
 			return 0;
 
-		if (nowait)
+		if (flags & BLK_MQ_REQ_NOWAIT)
 			return -EBUSY;
 
 		/*
@@ -781,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
 		smp_rmb();
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!atomic_read(&q->mq_freeze_depth) ||
+				(atomic_read(&q->mq_freeze_depth) == 0 &&
+				 (preempt || !blk_queue_preempt_only(q))) ||
 				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
@@ -844,6 +904,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+	INIT_WORK(&q->timeout_work, NULL);
 	INIT_LIST_HEAD(&q->queue_head);
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
@@ -1154,7 +1215,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 * @rl: request list to allocate from
 * @op: operation and flags
 * @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLQ_MQ_REQ_* flags
 *
 * Get a free request from @q.  This function may fail under memory
 * pressure or if @q is dead.
@@ -1164,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 * Returns request pointer on success, with @q->queue_lock *not held*.
 */
 static struct request *__get_request(struct request_list *rl, unsigned int op,
-				     struct bio *bio, gfp_t gfp_mask)
+				     struct bio *bio, blk_mq_req_flags_t flags)
 {
 	struct request_queue *q = rl->q;
 	struct request *rq;
@@ -1173,6 +1234,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	struct io_cq *icq = NULL;
 	const bool is_sync = op_is_sync(op);
 	int may_queue;
+	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+			 __GFP_DIRECT_RECLAIM;
 	req_flags_t rq_flags = RQF_ALLOCED;
 
 	lockdep_assert_held(q->queue_lock);
@@ -1255,6 +1318,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	blk_rq_set_rl(rq, rl);
 	rq->cmd_flags = op;
 	rq->rq_flags = rq_flags;
+	if (flags & BLK_MQ_REQ_PREEMPT)
+		rq->rq_flags |= RQF_PREEMPT;
 
 	/* init elvpriv */
 	if (rq_flags & RQF_ELVPRIV) {
@@ -1333,7 +1398,7 @@ rq_starved:
 * @q: request_queue to allocate request from
 * @op: operation and flags
 * @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLK_MQ_REQ_* flags.
 *
 * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
 * this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1343,7 +1408,7 @@ rq_starved:
 * Returns request pointer on success, with @q->queue_lock *not held*.
 */
 static struct request *get_request(struct request_queue *q, unsigned int op,
-				   struct bio *bio, gfp_t gfp_mask)
+				   struct bio *bio, blk_mq_req_flags_t flags)
 {
 	const bool is_sync = op_is_sync(op);
 	DEFINE_WAIT(wait);
@@ -1355,7 +1420,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
 
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(rl, op, bio, gfp_mask);
+	rq = __get_request(rl, op, bio, flags);
 	if (!IS_ERR(rq))
 		return rq;
 
@@ -1364,7 +1429,7 @@ retry:
 		return ERR_PTR(-EAGAIN);
 	}
 
-	if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
+	if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
 		return rq;
 	}
@@ -1391,20 +1456,28 @@ retry:
 	goto retry;
 }
 
+/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
 static struct request *blk_old_get_request(struct request_queue *q,
-					   unsigned int op, gfp_t gfp_mask)
+				unsigned int op, blk_mq_req_flags_t flags)
 {
 	struct request *rq;
+	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+			 __GFP_DIRECT_RECLAIM;
+	int ret = 0;
 
 	WARN_ON_ONCE(q->mq_ops);
 
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
+	ret = blk_queue_enter(q, flags);
+	if (ret)
+		return ERR_PTR(ret);
 	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, op, NULL, gfp_mask);
+	rq = get_request(q, op, NULL, flags);
 	if (IS_ERR(rq)) {
 		spin_unlock_irq(q->queue_lock);
+		blk_queue_exit(q);
 		return rq;
 	}
 
@@ -1415,25 +1488,40 @@ static struct request *blk_old_get_request(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-				gfp_t gfp_mask)
+/**
+ * blk_get_request_flags - allocate a request
+ * @q: request queue to allocate a request for
+ * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
+ * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
+ */
+struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
+				      blk_mq_req_flags_t flags)
 {
 	struct request *req;
 
+	WARN_ON_ONCE(op & REQ_NOWAIT);
+	WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
+
 	if (q->mq_ops) {
-		req = blk_mq_alloc_request(q, op,
-				(gfp_mask & __GFP_DIRECT_RECLAIM) ?
-				0 : BLK_MQ_REQ_NOWAIT);
+		req = blk_mq_alloc_request(q, op, flags);
 		if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
 			q->mq_ops->initialize_rq_fn(req);
 	} else {
-		req = blk_old_get_request(q, op, gfp_mask);
+		req = blk_old_get_request(q, op, flags);
 		if (!IS_ERR(req) && q->initialize_rq_fn)
 			q->initialize_rq_fn(req);
 	}
 
 	return req;
 }
+EXPORT_SYMBOL(blk_get_request_flags);
+
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+				gfp_t gfp_mask)
+{
+	return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
+				     0 : BLK_MQ_REQ_NOWAIT);
+}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
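Together, QUEUE_FLAG_PREEMPT_ONLY, BLK_MQ_REQ_PREEMPT and blk_get_request_flags() are what the SCSI quiesce/resume rework in this series builds on: normal allocation is held back while RQF_PREEMPT requests (for example power-management commands) still get through. A condensed sketch of that usage, with error handling and the real SCSI state machine left out, and the function names purely illustrative:

	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>

	/* Quiesce: from now on only RQF_PREEMPT requests may enter the queue. */
	static void example_quiesce(struct request_queue *q)
	{
		blk_set_preempt_only(q);
		/* Wait for requests that entered before the flag was set. */
		blk_mq_freeze_queue(q);
		blk_mq_unfreeze_queue(q);
	}

	/* A power-management command is still allowed through. */
	static struct request *example_alloc_pm_request(struct request_queue *q)
	{
		return blk_get_request_flags(q, REQ_OP_DRV_OUT,
					     BLK_MQ_REQ_PREEMPT);
	}

	/* Resume: let normal requests in again. */
	static void example_resume(struct request_queue *q)
	{
		blk_clear_preempt_only(q);
	}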
@@ -1576,6 +1664,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
1576 | blk_free_request(rl, req); | 1664 | blk_free_request(rl, req); |
1577 | freed_request(rl, sync, rq_flags); | 1665 | freed_request(rl, sync, rq_flags); |
1578 | blk_put_rl(rl); | 1666 | blk_put_rl(rl); |
1667 | blk_queue_exit(q); | ||
1579 | } | 1668 | } |
1580 | } | 1669 | } |
1581 | EXPORT_SYMBOL_GPL(__blk_put_request); | 1670 | EXPORT_SYMBOL_GPL(__blk_put_request); |
@@ -1857,8 +1946,10 @@ get_rq: | |||
1857 | * Grab a free request. This is might sleep but can not fail. | 1946 | * Grab a free request. This is might sleep but can not fail. |
1858 | * Returns with the queue unlocked. | 1947 | * Returns with the queue unlocked. |
1859 | */ | 1948 | */ |
1860 | req = get_request(q, bio->bi_opf, bio, GFP_NOIO); | 1949 | blk_queue_enter_live(q); |
1950 | req = get_request(q, bio->bi_opf, bio, 0); | ||
1861 | if (IS_ERR(req)) { | 1951 | if (IS_ERR(req)) { |
1952 | blk_queue_exit(q); | ||
1862 | __wbt_done(q->rq_wb, wb_acct); | 1953 | __wbt_done(q->rq_wb, wb_acct); |
1863 | if (PTR_ERR(req) == -ENOMEM) | 1954 | if (PTR_ERR(req) == -ENOMEM) |
1864 | bio->bi_status = BLK_STS_RESOURCE; | 1955 | bio->bi_status = BLK_STS_RESOURCE; |
@@ -2200,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) | |||
2200 | current->bio_list = bio_list_on_stack; | 2291 | current->bio_list = bio_list_on_stack; |
2201 | do { | 2292 | do { |
2202 | struct request_queue *q = bio->bi_disk->queue; | 2293 | struct request_queue *q = bio->bi_disk->queue; |
2294 | blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? | ||
2295 | BLK_MQ_REQ_NOWAIT : 0; | ||
2203 | 2296 | ||
2204 | if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { | 2297 | if (likely(blk_queue_enter(q, flags) == 0)) { |
2205 | struct bio_list lower, same; | 2298 | struct bio_list lower, same; |
2206 | 2299 | ||
2207 | /* Create a fresh bio_list for all subordinate requests */ | 2300 | /* Create a fresh bio_list for all subordinate requests */ |
@@ -2242,6 +2335,40 @@ out: | |||
2242 | EXPORT_SYMBOL(generic_make_request); | 2335 | EXPORT_SYMBOL(generic_make_request); |
2243 | 2336 | ||
2244 | /** | 2337 | /** |
2338 | * direct_make_request - hand a buffer directly to its device driver for I/O | ||
2339 | * @bio: The bio describing the location in memory and on the device. | ||
2340 | * | ||
2341 | * This function behaves like generic_make_request(), but does not protect | ||
2342 | * against recursion. Must only be used if the called driver is known | ||
2343 | * to not call generic_make_request (or direct_make_request) again from | ||
2344 | * its make_request function. (Calling direct_make_request again from | ||
2345 | * a workqueue is perfectly fine as that doesn't recurse). | ||
2346 | */ | ||
2347 | blk_qc_t direct_make_request(struct bio *bio) | ||
2348 | { | ||
2349 | struct request_queue *q = bio->bi_disk->queue; | ||
2350 | bool nowait = bio->bi_opf & REQ_NOWAIT; | ||
2351 | blk_qc_t ret; | ||
2352 | |||
2353 | if (!generic_make_request_checks(bio)) | ||
2354 | return BLK_QC_T_NONE; | ||
2355 | |||
2356 | if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { | ||
2357 | if (nowait && !blk_queue_dying(q)) | ||
2358 | bio->bi_status = BLK_STS_AGAIN; | ||
2359 | else | ||
2360 | bio->bi_status = BLK_STS_IOERR; | ||
2361 | bio_endio(bio); | ||
2362 | return BLK_QC_T_NONE; | ||
2363 | } | ||
2364 | |||
2365 | ret = q->make_request_fn(q, bio); | ||
2366 | blk_queue_exit(q); | ||
2367 | return ret; | ||
2368 | } | ||
2369 | EXPORT_SYMBOL_GPL(direct_make_request); | ||
2370 | |||
2371 | /** | ||
2245 | * submit_bio - submit a bio to the block device layer for I/O | 2372 | * submit_bio - submit a bio to the block device layer for I/O |
2246 | * @bio: The &struct bio which describes the I/O | 2373 | * @bio: The &struct bio which describes the I/O |
2247 | * | 2374 | * |
@@ -2285,6 +2412,17 @@ blk_qc_t submit_bio(struct bio *bio) | |||
2285 | } | 2412 | } |
2286 | EXPORT_SYMBOL(submit_bio); | 2413 | EXPORT_SYMBOL(submit_bio); |
2287 | 2414 | ||
2415 | bool blk_poll(struct request_queue *q, blk_qc_t cookie) | ||
2416 | { | ||
2417 | if (!q->poll_fn || !blk_qc_t_valid(cookie)) | ||
2418 | return false; | ||
2419 | |||
2420 | if (current->plug) | ||
2421 | blk_flush_plug_list(current->plug, false); | ||
2422 | return q->poll_fn(q, cookie); | ||
2423 | } | ||
2424 | EXPORT_SYMBOL_GPL(blk_poll); | ||
2425 | |||
2288 | /** | 2426 | /** |
2289 | * blk_cloned_rq_check_limits - Helper function to check a cloned request | 2427 | * blk_cloned_rq_check_limits - Helper function to check a cloned request |
2290 | * for new the queue limits | 2428 | * for new the queue limits |
@@ -2350,7 +2488,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * | |||
2350 | * bypass a potential scheduler on the bottom device for | 2488 | * bypass a potential scheduler on the bottom device for |
2351 | * insert. | 2489 | * insert. |
2352 | */ | 2490 | */ |
2353 | blk_mq_request_bypass_insert(rq); | 2491 | blk_mq_request_bypass_insert(rq, true); |
2354 | return BLK_STS_OK; | 2492 | return BLK_STS_OK; |
2355 | } | 2493 | } |
2356 | 2494 | ||
@@ -2464,20 +2602,22 @@ void blk_account_io_done(struct request *req) | |||
2464 | * Don't process normal requests when queue is suspended | 2602 | * Don't process normal requests when queue is suspended |
2465 | * or in the process of suspending/resuming | 2603 | * or in the process of suspending/resuming |
2466 | */ | 2604 | */ |
2467 | static struct request *blk_pm_peek_request(struct request_queue *q, | 2605 | static bool blk_pm_allow_request(struct request *rq) |
2468 | struct request *rq) | ||
2469 | { | 2606 | { |
2470 | if (q->dev && (q->rpm_status == RPM_SUSPENDED || | 2607 | switch (rq->q->rpm_status) { |
2471 | (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) | 2608 | case RPM_RESUMING: |
2472 | return NULL; | 2609 | case RPM_SUSPENDING: |
2473 | else | 2610 | return rq->rq_flags & RQF_PM; |
2474 | return rq; | 2611 | case RPM_SUSPENDED: |
2612 | return false; | ||
2613 | } | ||
2614 | |||
2615 | return true; | ||
2475 | } | 2616 | } |
2476 | #else | 2617 | #else |
2477 | static inline struct request *blk_pm_peek_request(struct request_queue *q, | 2618 | static bool blk_pm_allow_request(struct request *rq) |
2478 | struct request *rq) | ||
2479 | { | 2619 | { |
2480 | return rq; | 2620 | return true; |
2481 | } | 2621 | } |
2482 | #endif | 2622 | #endif |
2483 | 2623 | ||
@@ -2517,6 +2657,48 @@ void blk_account_io_start(struct request *rq, bool new_io) | |||
2517 | part_stat_unlock(); | 2657 | part_stat_unlock(); |
2518 | } | 2658 | } |
2519 | 2659 | ||
2660 | static struct request *elv_next_request(struct request_queue *q) | ||
2661 | { | ||
2662 | struct request *rq; | ||
2663 | struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); | ||
2664 | |||
2665 | WARN_ON_ONCE(q->mq_ops); | ||
2666 | |||
2667 | while (1) { | ||
2668 | list_for_each_entry(rq, &q->queue_head, queuelist) { | ||
2669 | if (blk_pm_allow_request(rq)) | ||
2670 | return rq; | ||
2671 | |||
2672 | if (rq->rq_flags & RQF_SOFTBARRIER) | ||
2673 | break; | ||
2674 | } | ||
2675 | |||
2676 | /* | ||
2677 | * Flush request is running and flush request isn't queueable | ||
2678 | * in the drive, we can hold the queue till flush request is | ||
2679 | * finished. Even we don't do this, driver can't dispatch next | ||
2680 | * requests and will requeue them. And this can improve | ||
2681 | * throughput too. For example, we have request flush1, write1, | ||
2682 | * flush 2. flush1 is dispatched, then queue is hold, write1 | ||
2683 | * isn't inserted to queue. After flush1 is finished, flush2 | ||
2684 | * will be dispatched. Since disk cache is already clean, | ||
2685 | * flush2 will be finished very soon, so looks like flush2 is | ||
2686 | * folded to flush1. | ||
2687 | * Since the queue is hold, a flag is set to indicate the queue | ||
2688 | * should be restarted later. Please see flush_end_io() for | ||
2689 | * details. | ||
2690 | */ | ||
2691 | if (fq->flush_pending_idx != fq->flush_running_idx && | ||
2692 | !queue_flush_queueable(q)) { | ||
2693 | fq->flush_queue_delayed = 1; | ||
2694 | return NULL; | ||
2695 | } | ||
2696 | if (unlikely(blk_queue_bypass(q)) || | ||
2697 | !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) | ||
2698 | return NULL; | ||
2699 | } | ||
2700 | } | ||
2701 | |||
2520 | /** | 2702 | /** |
2521 | * blk_peek_request - peek at the top of a request queue | 2703 | * blk_peek_request - peek at the top of a request queue |
2522 | * @q: request queue to peek at | 2704 | * @q: request queue to peek at |
@@ -2538,12 +2720,7 @@ struct request *blk_peek_request(struct request_queue *q) | |||
2538 | lockdep_assert_held(q->queue_lock); | 2720 | lockdep_assert_held(q->queue_lock); |
2539 | WARN_ON_ONCE(q->mq_ops); | 2721 | WARN_ON_ONCE(q->mq_ops); |
2540 | 2722 | ||
2541 | while ((rq = __elv_next_request(q)) != NULL) { | 2723 | while ((rq = elv_next_request(q)) != NULL) { |
2542 | |||
2543 | rq = blk_pm_peek_request(q, rq); | ||
2544 | if (!rq) | ||
2545 | break; | ||
2546 | |||
2547 | if (!(rq->rq_flags & RQF_STARTED)) { | 2724 | if (!(rq->rq_flags & RQF_STARTED)) { |
2548 | /* | 2725 | /* |
2549 | * This is the first time the device driver | 2726 | * This is the first time the device driver |
@@ -2695,6 +2872,27 @@ struct request *blk_fetch_request(struct request_queue *q) | |||
2695 | } | 2872 | } |
2696 | EXPORT_SYMBOL(blk_fetch_request); | 2873 | EXPORT_SYMBOL(blk_fetch_request); |
2697 | 2874 | ||
2875 | /* | ||
2876 | * Steal bios from a request and add them to a bio list. | ||
2877 | * The request must not have been partially completed before. | ||
2878 | */ | ||
2879 | void blk_steal_bios(struct bio_list *list, struct request *rq) | ||
2880 | { | ||
2881 | if (rq->bio) { | ||
2882 | if (list->tail) | ||
2883 | list->tail->bi_next = rq->bio; | ||
2884 | else | ||
2885 | list->head = rq->bio; | ||
2886 | list->tail = rq->biotail; | ||
2887 | |||
2888 | rq->bio = NULL; | ||
2889 | rq->biotail = NULL; | ||
2890 | } | ||
2891 | |||
2892 | rq->__data_len = 0; | ||
2893 | } | ||
2894 | EXPORT_SYMBOL_GPL(blk_steal_bios); | ||
2895 | |||
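blk_steal_bios() empties a request so its bios can be resubmitted elsewhere, which is what the hidden-gendisk/multipath work builds on. A hypothetical caller sketch (the foo_* failover path is an assumption, not code from this series):

	static void foo_failover_rq(struct request *rq)
	{
		struct bio_list requeue_list;
		struct bio *bio;

		bio_list_init(&requeue_list);
		blk_steal_bios(&requeue_list, rq);	/* rq now carries no bios and 0 bytes */
		blk_mq_end_request(rq, BLK_STS_OK);	/* retire the emptied request */

		while ((bio = bio_list_pop(&requeue_list)))
			generic_make_request(bio);	/* resubmit, e.g. down another path */
	}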
2698 | /** | 2896 | /** |
2699 | * blk_update_request - Special helper function for request stacking drivers | 2897 | * blk_update_request - Special helper function for request stacking drivers |
2700 | * @req: the request being processed | 2898 | * @req: the request being processed |
diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec8cfef..f17170675917 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c | |||
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) | |||
231 | /* release the tag's ownership to the req cloned from */ | 231 | /* release the tag's ownership to the req cloned from */ |
232 | spin_lock_irqsave(&fq->mq_flush_lock, flags); | 232 | spin_lock_irqsave(&fq->mq_flush_lock, flags); |
233 | hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); | 233 | hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); |
234 | blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); | 234 | if (!q->elevator) { |
235 | flush_rq->tag = -1; | 235 | blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); |
236 | flush_rq->tag = -1; | ||
237 | } else { | ||
238 | blk_mq_put_driver_tag_hctx(hctx, flush_rq); | ||
239 | flush_rq->internal_tag = -1; | ||
240 | } | ||
236 | } | 241 | } |
237 | 242 | ||
238 | running = &fq->flush_queue[fq->flush_running_idx]; | 243 | running = &fq->flush_queue[fq->flush_running_idx]; |
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) | |||
318 | blk_rq_init(q, flush_rq); | 323 | blk_rq_init(q, flush_rq); |
319 | 324 | ||
320 | /* | 325 | /* |
321 | * Borrow tag from the first request since they can't | 326 | * In case of none scheduler, borrow tag from the first request |
322 | * be in flight at the same time. And acquire the tag's | 327 | * since they can't be in flight at the same time. And acquire |
323 | * ownership for flush req. | 328 | * the tag's ownership for flush req. |
329 | * | ||
330 | * In case of IO scheduler, flush rq need to borrow scheduler tag | ||
331 | * just for cheating put/get driver tag. | ||
324 | */ | 332 | */ |
325 | if (q->mq_ops) { | 333 | if (q->mq_ops) { |
326 | struct blk_mq_hw_ctx *hctx; | 334 | struct blk_mq_hw_ctx *hctx; |
327 | 335 | ||
328 | flush_rq->mq_ctx = first_rq->mq_ctx; | 336 | flush_rq->mq_ctx = first_rq->mq_ctx; |
329 | flush_rq->tag = first_rq->tag; | ||
330 | fq->orig_rq = first_rq; | ||
331 | 337 | ||
332 | hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); | 338 | if (!q->elevator) { |
333 | blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); | 339 | fq->orig_rq = first_rq; |
340 | flush_rq->tag = first_rq->tag; | ||
341 | hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); | ||
342 | blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); | ||
343 | } else { | ||
344 | flush_rq->internal_tag = first_rq->internal_tag; | ||
345 | } | ||
334 | } | 346 | } |
335 | 347 | ||
336 | flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; | 348 | flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; |
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) | |||
394 | 406 | ||
395 | hctx = blk_mq_map_queue(q, ctx->cpu); | 407 | hctx = blk_mq_map_queue(q, ctx->cpu); |
396 | 408 | ||
409 | if (q->elevator) { | ||
410 | WARN_ON(rq->tag < 0); | ||
411 | blk_mq_put_driver_tag_hctx(hctx, rq); | ||
412 | } | ||
413 | |||
397 | /* | 414 | /* |
398 | * After populating an empty queue, kick it to avoid stall. Read | 415 | * After populating an empty queue, kick it to avoid stall. Read |
399 | * the comment in flush_end_io(). | 416 | * the comment in flush_end_io(). |
@@ -463,7 +480,7 @@ void blk_insert_flush(struct request *rq) | |||
463 | if ((policy & REQ_FSEQ_DATA) && | 480 | if ((policy & REQ_FSEQ_DATA) && |
464 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | 481 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { |
465 | if (q->mq_ops) | 482 | if (q->mq_ops) |
466 | blk_mq_sched_insert_request(rq, false, true, false, false); | 483 | blk_mq_request_bypass_insert(rq, false); |
467 | else | 484 | else |
468 | list_add_tail(&rq->queuelist, &q->queue_head); | 485 | list_add_tail(&rq->queuelist, &q->queue_head); |
469 | return; | 486 | return; |
diff --git a/block/blk-lib.c b/block/blk-lib.c index 63fb971d6574..2bc544ce3d2e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -275,6 +275,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) | |||
275 | return min(pages, (sector_t)BIO_MAX_PAGES); | 275 | return min(pages, (sector_t)BIO_MAX_PAGES); |
276 | } | 276 | } |
277 | 277 | ||
278 | static int __blkdev_issue_zero_pages(struct block_device *bdev, | ||
279 | sector_t sector, sector_t nr_sects, gfp_t gfp_mask, | ||
280 | struct bio **biop) | ||
281 | { | ||
282 | struct request_queue *q = bdev_get_queue(bdev); | ||
283 | struct bio *bio = *biop; | ||
284 | int bi_size = 0; | ||
285 | unsigned int sz; | ||
286 | |||
287 | if (!q) | ||
288 | return -ENXIO; | ||
289 | |||
290 | while (nr_sects != 0) { | ||
291 | bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), | ||
292 | gfp_mask); | ||
293 | bio->bi_iter.bi_sector = sector; | ||
294 | bio_set_dev(bio, bdev); | ||
295 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | ||
296 | |||
297 | while (nr_sects != 0) { | ||
298 | sz = min((sector_t) PAGE_SIZE, nr_sects << 9); | ||
299 | bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); | ||
300 | nr_sects -= bi_size >> 9; | ||
301 | sector += bi_size >> 9; | ||
302 | if (bi_size < sz) | ||
303 | break; | ||
304 | } | ||
305 | cond_resched(); | ||
306 | } | ||
307 | |||
308 | *biop = bio; | ||
309 | return 0; | ||
310 | } | ||
311 | |||
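As a quick check of the arithmetic in the inner loop above: every bio_add_page() call maps at most PAGE_SIZE bytes of ZERO_PAGE(0), and nr_sects is in 512-byte units. An illustrative helper, not part of the series:

	static inline sector_t nr_zero_page_refs(sector_t nr_sects)
	{
		/* e.g. 2048 sectors (1 MiB) needs 256 references with 4 KiB pages,
		 * split across bios of at most BIO_MAX_PAGES pages each */
		return DIV_ROUND_UP(nr_sects << 9, PAGE_SIZE);
	}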
278 | /** | 312 | /** |
279 | * __blkdev_issue_zeroout - generate number of zero filed write bios | 313 | * __blkdev_issue_zeroout - generate number of zero filed write bios |
280 | * @bdev: blockdev to issue | 314 | * @bdev: blockdev to issue |
@@ -288,12 +322,6 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) | |||
288 | * Zero-fill a block range, either using hardware offload or by explicitly | 322 | * Zero-fill a block range, either using hardware offload or by explicitly |
289 | * writing zeroes to the device. | 323 | * writing zeroes to the device. |
290 | * | 324 | * |
291 | * Note that this function may fail with -EOPNOTSUPP if the driver signals | ||
292 | * zeroing offload support, but the device fails to process the command (for | ||
293 | * some devices there is no non-destructive way to verify whether this | ||
294 | * operation is actually supported). In this case the caller should call | ||
295 | * retry the call to blkdev_issue_zeroout() and the fallback path will be used. | ||
296 | * | ||
297 | * If a device is using logical block provisioning, the underlying space will | 325 | * If a device is using logical block provisioning, the underlying space will |
298 | * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. | 326 | * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. |
299 | * | 327 | * |
@@ -305,9 +333,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
305 | unsigned flags) | 333 | unsigned flags) |
306 | { | 334 | { |
307 | int ret; | 335 | int ret; |
308 | int bi_size = 0; | ||
309 | struct bio *bio = *biop; | ||
310 | unsigned int sz; | ||
311 | sector_t bs_mask; | 336 | sector_t bs_mask; |
312 | 337 | ||
313 | bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; | 338 | bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; |
@@ -317,30 +342,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
317 | ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, | 342 | ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, |
318 | biop, flags); | 343 | biop, flags); |
319 | if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) | 344 | if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) |
320 | goto out; | 345 | return ret; |
321 | |||
322 | ret = 0; | ||
323 | while (nr_sects != 0) { | ||
324 | bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), | ||
325 | gfp_mask); | ||
326 | bio->bi_iter.bi_sector = sector; | ||
327 | bio_set_dev(bio, bdev); | ||
328 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | ||
329 | |||
330 | while (nr_sects != 0) { | ||
331 | sz = min((sector_t) PAGE_SIZE, nr_sects << 9); | ||
332 | bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); | ||
333 | nr_sects -= bi_size >> 9; | ||
334 | sector += bi_size >> 9; | ||
335 | if (bi_size < sz) | ||
336 | break; | ||
337 | } | ||
338 | cond_resched(); | ||
339 | } | ||
340 | 346 | ||
341 | *biop = bio; | 347 | return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, |
342 | out: | 348 | biop); |
343 | return ret; | ||
344 | } | 349 | } |
345 | EXPORT_SYMBOL(__blkdev_issue_zeroout); | 350 | EXPORT_SYMBOL(__blkdev_issue_zeroout); |
346 | 351 | ||
@@ -360,18 +365,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); | |||
360 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 365 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
361 | sector_t nr_sects, gfp_t gfp_mask, unsigned flags) | 366 | sector_t nr_sects, gfp_t gfp_mask, unsigned flags) |
362 | { | 367 | { |
363 | int ret; | 368 | int ret = 0; |
364 | struct bio *bio = NULL; | 369 | sector_t bs_mask; |
370 | struct bio *bio; | ||
365 | struct blk_plug plug; | 371 | struct blk_plug plug; |
372 | bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); | ||
366 | 373 | ||
374 | bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; | ||
375 | if ((sector | nr_sects) & bs_mask) | ||
376 | return -EINVAL; | ||
377 | |||
378 | retry: | ||
379 | bio = NULL; | ||
367 | blk_start_plug(&plug); | 380 | blk_start_plug(&plug); |
368 | ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, | 381 | if (try_write_zeroes) { |
369 | &bio, flags); | 382 | ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, |
383 | gfp_mask, &bio, flags); | ||
384 | } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { | ||
385 | ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, | ||
386 | gfp_mask, &bio); | ||
387 | } else { | ||
388 | /* No zeroing offload support */ | ||
389 | ret = -EOPNOTSUPP; | ||
390 | } | ||
370 | if (ret == 0 && bio) { | 391 | if (ret == 0 && bio) { |
371 | ret = submit_bio_wait(bio); | 392 | ret = submit_bio_wait(bio); |
372 | bio_put(bio); | 393 | bio_put(bio); |
373 | } | 394 | } |
374 | blk_finish_plug(&plug); | 395 | blk_finish_plug(&plug); |
396 | if (ret && try_write_zeroes) { | ||
397 | if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { | ||
398 | try_write_zeroes = false; | ||
399 | goto retry; | ||
400 | } | ||
401 | if (!bdev_write_zeroes_sectors(bdev)) { | ||
402 | /* | ||
403 | * Zeroing offload support was indicated, but the | ||
404 | * device reported ILLEGAL REQUEST (for some devices | ||
405 | * there is no non-destructive way to verify whether | ||
406 | * WRITE ZEROES is actually supported). | ||
407 | */ | ||
408 | ret = -EOPNOTSUPP; | ||
409 | } | ||
410 | } | ||
375 | 411 | ||
376 | return ret; | 412 | return ret; |
377 | } | 413 | } |
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index de294d775acf..b56a4f35720d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { | |||
54 | QUEUE_FLAG_NAME(NOMERGES), | 54 | QUEUE_FLAG_NAME(NOMERGES), |
55 | QUEUE_FLAG_NAME(SAME_COMP), | 55 | QUEUE_FLAG_NAME(SAME_COMP), |
56 | QUEUE_FLAG_NAME(FAIL_IO), | 56 | QUEUE_FLAG_NAME(FAIL_IO), |
57 | QUEUE_FLAG_NAME(STACKABLE), | ||
58 | QUEUE_FLAG_NAME(NONROT), | 57 | QUEUE_FLAG_NAME(NONROT), |
59 | QUEUE_FLAG_NAME(IO_STAT), | 58 | QUEUE_FLAG_NAME(IO_STAT), |
60 | QUEUE_FLAG_NAME(DISCARD), | 59 | QUEUE_FLAG_NAME(DISCARD), |
@@ -75,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { | |||
75 | QUEUE_FLAG_NAME(REGISTERED), | 74 | QUEUE_FLAG_NAME(REGISTERED), |
76 | QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), | 75 | QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), |
77 | QUEUE_FLAG_NAME(QUIESCED), | 76 | QUEUE_FLAG_NAME(QUIESCED), |
77 | QUEUE_FLAG_NAME(PREEMPT_ONLY), | ||
78 | }; | 78 | }; |
79 | #undef QUEUE_FLAG_NAME | 79 | #undef QUEUE_FLAG_NAME |
80 | 80 | ||
@@ -180,7 +180,6 @@ static const char *const hctx_state_name[] = { | |||
180 | HCTX_STATE_NAME(STOPPED), | 180 | HCTX_STATE_NAME(STOPPED), |
181 | HCTX_STATE_NAME(TAG_ACTIVE), | 181 | HCTX_STATE_NAME(TAG_ACTIVE), |
182 | HCTX_STATE_NAME(SCHED_RESTART), | 182 | HCTX_STATE_NAME(SCHED_RESTART), |
183 | HCTX_STATE_NAME(TAG_WAITING), | ||
184 | HCTX_STATE_NAME(START_ON_RUN), | 183 | HCTX_STATE_NAME(START_ON_RUN), |
185 | }; | 184 | }; |
186 | #undef HCTX_STATE_NAME | 185 | #undef HCTX_STATE_NAME |
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c..c117bd8fd1f6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
@@ -81,20 +81,103 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) | |||
81 | } else | 81 | } else |
82 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | 82 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
83 | 83 | ||
84 | if (blk_mq_hctx_has_pending(hctx)) { | 84 | return blk_mq_run_hw_queue(hctx, true); |
85 | blk_mq_run_hw_queue(hctx, true); | 85 | } |
86 | return true; | ||
87 | } | ||
88 | 86 | ||
89 | return false; | 87 | /* |
88 | * Only SCSI implements .get_budget and .put_budget, and SCSI restarts | ||
89 | * its queue by itself in its completion handler, so we don't need to | ||
90 | * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. | ||
91 | */ | ||
92 | static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) | ||
93 | { | ||
94 | struct request_queue *q = hctx->queue; | ||
95 | struct elevator_queue *e = q->elevator; | ||
96 | LIST_HEAD(rq_list); | ||
97 | |||
98 | do { | ||
99 | struct request *rq; | ||
100 | |||
101 | if (e->type->ops.mq.has_work && | ||
102 | !e->type->ops.mq.has_work(hctx)) | ||
103 | break; | ||
104 | |||
105 | if (!blk_mq_get_dispatch_budget(hctx)) | ||
106 | break; | ||
107 | |||
108 | rq = e->type->ops.mq.dispatch_request(hctx); | ||
109 | if (!rq) { | ||
110 | blk_mq_put_dispatch_budget(hctx); | ||
111 | break; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Now this rq owns the budget which has to be released | ||
116 | * if this rq won't be queued to driver via .queue_rq() | ||
117 | * in blk_mq_dispatch_rq_list(). | ||
118 | */ | ||
119 | list_add(&rq->queuelist, &rq_list); | ||
120 | } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); | ||
90 | } | 121 | } |
91 | 122 | ||
123 | static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, | ||
124 | struct blk_mq_ctx *ctx) | ||
125 | { | ||
126 | unsigned idx = ctx->index_hw; | ||
127 | |||
128 | if (++idx == hctx->nr_ctx) | ||
129 | idx = 0; | ||
130 | |||
131 | return hctx->ctxs[idx]; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Only SCSI implements .get_budget and .put_budget, and SCSI restarts | ||
136 | * its queue by itself in its completion handler, so we don't need to | ||
137 | * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. | ||
138 | */ | ||
139 | static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) | ||
140 | { | ||
141 | struct request_queue *q = hctx->queue; | ||
142 | LIST_HEAD(rq_list); | ||
143 | struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); | ||
144 | |||
145 | do { | ||
146 | struct request *rq; | ||
147 | |||
148 | if (!sbitmap_any_bit_set(&hctx->ctx_map)) | ||
149 | break; | ||
150 | |||
151 | if (!blk_mq_get_dispatch_budget(hctx)) | ||
152 | break; | ||
153 | |||
154 | rq = blk_mq_dequeue_from_ctx(hctx, ctx); | ||
155 | if (!rq) { | ||
156 | blk_mq_put_dispatch_budget(hctx); | ||
157 | break; | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Now this rq owns the budget which has to be released | ||
162 | * if this rq won't be queued to driver via .queue_rq() | ||
163 | * in blk_mq_dispatch_rq_list(). | ||
164 | */ | ||
165 | list_add(&rq->queuelist, &rq_list); | ||
166 | |||
167 | /* round robin for fair dispatch */ | ||
168 | ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); | ||
169 | |||
170 | } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); | ||
171 | |||
172 | WRITE_ONCE(hctx->dispatch_from, ctx); | ||
173 | } | ||
174 | |||
175 | /* return true if the hw queue needs to be run again */ | ||
92 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | 176 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) |
93 | { | 177 | { |
94 | struct request_queue *q = hctx->queue; | 178 | struct request_queue *q = hctx->queue; |
95 | struct elevator_queue *e = q->elevator; | 179 | struct elevator_queue *e = q->elevator; |
96 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; | 180 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; |
97 | bool did_work = false; | ||
98 | LIST_HEAD(rq_list); | 181 | LIST_HEAD(rq_list); |
99 | 182 | ||
100 | /* RCU or SRCU read lock is needed before checking quiesced flag */ | 183 | /* RCU or SRCU read lock is needed before checking quiesced flag */ |
@@ -122,29 +205,34 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
122 | * scheduler, we can no longer merge or sort them. So it's best to | 205 | * scheduler, we can no longer merge or sort them. So it's best to |
123 | * leave them there for as long as we can. Mark the hw queue as | 206 | * leave them there for as long as we can. Mark the hw queue as |
124 | * needing a restart in that case. | 207 | * needing a restart in that case. |
208 | * | ||
209 | * We want to dispatch from the scheduler if there was nothing | ||
210 | * on the dispatch list or we were able to dispatch from the | ||
211 | * dispatch list. | ||
125 | */ | 212 | */ |
126 | if (!list_empty(&rq_list)) { | 213 | if (!list_empty(&rq_list)) { |
127 | blk_mq_sched_mark_restart_hctx(hctx); | 214 | blk_mq_sched_mark_restart_hctx(hctx); |
128 | did_work = blk_mq_dispatch_rq_list(q, &rq_list); | 215 | if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { |
129 | } else if (!has_sched_dispatch) { | 216 | if (has_sched_dispatch) |
217 | blk_mq_do_dispatch_sched(hctx); | ||
218 | else | ||
219 | blk_mq_do_dispatch_ctx(hctx); | ||
220 | } | ||
221 | } else if (has_sched_dispatch) { | ||
222 | blk_mq_do_dispatch_sched(hctx); | ||
223 | } else if (q->mq_ops->get_budget) { | ||
224 | /* | ||
225 | * If a budget has to be acquired before queueing a request, we | ||
226 | * dequeue requests one by one from the sw queue so that I/O | ||
227 | * merging isn't disturbed when dispatch runs out of resources. | ||
228 | * | ||
229 | * TODO: acquire several budgets and dequeue several requests at | ||
230 | * a time. | ||
231 | */ | ||
232 | blk_mq_do_dispatch_ctx(hctx); | ||
233 | } else { | ||
130 | blk_mq_flush_busy_ctxs(hctx, &rq_list); | 234 | blk_mq_flush_busy_ctxs(hctx, &rq_list); |
131 | blk_mq_dispatch_rq_list(q, &rq_list); | 235 | blk_mq_dispatch_rq_list(q, &rq_list, false); |
132 | } | ||
133 | |||
134 | /* | ||
135 | * We want to dispatch from the scheduler if we had no work left | ||
136 | * on the dispatch list, OR if we did have work but weren't able | ||
137 | * to make progress. | ||
138 | */ | ||
139 | if (!did_work && has_sched_dispatch) { | ||
140 | do { | ||
141 | struct request *rq; | ||
142 | |||
143 | rq = e->type->ops.mq.dispatch_request(hctx); | ||
144 | if (!rq) | ||
145 | break; | ||
146 | list_add(&rq->queuelist, &rq_list); | ||
147 | } while (blk_mq_dispatch_rq_list(q, &rq_list)); | ||
148 | } | 236 | } |
149 | } | 237 | } |
150 | 238 | ||
@@ -260,21 +348,21 @@ void blk_mq_sched_request_inserted(struct request *rq) | |||
260 | EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); | 348 | EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); |
261 | 349 | ||
262 | static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, | 350 | static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, |
351 | bool has_sched, | ||
263 | struct request *rq) | 352 | struct request *rq) |
264 | { | 353 | { |
265 | if (rq->tag == -1) { | 354 | /* dispatch flush rq directly */ |
266 | rq->rq_flags |= RQF_SORTED; | 355 | if (rq->rq_flags & RQF_FLUSH_SEQ) { |
267 | return false; | 356 | spin_lock(&hctx->lock); |
357 | list_add(&rq->queuelist, &hctx->dispatch); | ||
358 | spin_unlock(&hctx->lock); | ||
359 | return true; | ||
268 | } | 360 | } |
269 | 361 | ||
270 | /* | 362 | if (has_sched) |
271 | * If we already have a real request tag, send directly to | 363 | rq->rq_flags |= RQF_SORTED; |
272 | * the dispatch list. | 364 | |
273 | */ | 365 | return false; |
274 | spin_lock(&hctx->lock); | ||
275 | list_add(&rq->queuelist, &hctx->dispatch); | ||
276 | spin_unlock(&hctx->lock); | ||
277 | return true; | ||
278 | } | 366 | } |
279 | 367 | ||
280 | /** | 368 | /** |
@@ -339,21 +427,6 @@ done: | |||
339 | } | 427 | } |
340 | } | 428 | } |
341 | 429 | ||
342 | /* | ||
343 | * Add flush/fua to the queue. If we fail getting a driver tag, then | ||
344 | * punt to the requeue list. Requeue will re-invoke us from a context | ||
345 | * that's safe to block from. | ||
346 | */ | ||
347 | static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, | ||
348 | struct request *rq, bool can_block) | ||
349 | { | ||
350 | if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { | ||
351 | blk_insert_flush(rq); | ||
352 | blk_mq_run_hw_queue(hctx, true); | ||
353 | } else | ||
354 | blk_mq_add_to_requeue_list(rq, false, true); | ||
355 | } | ||
356 | |||
357 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 430 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
358 | bool run_queue, bool async, bool can_block) | 431 | bool run_queue, bool async, bool can_block) |
359 | { | 432 | { |
@@ -362,12 +435,15 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, | |||
362 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 435 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
363 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 436 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); |
364 | 437 | ||
365 | if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { | 438 | /* a flush rq from the flush machinery needs to be dispatched directly */ |
366 | blk_mq_sched_insert_flush(hctx, rq, can_block); | 439 | if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { |
367 | return; | 440 | blk_insert_flush(rq); |
441 | goto run; | ||
368 | } | 442 | } |
369 | 443 | ||
370 | if (e && blk_mq_sched_bypass_insert(hctx, rq)) | 444 | WARN_ON(e && (rq->tag != -1)); |
445 | |||
446 | if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) | ||
371 | goto run; | 447 | goto run; |
372 | 448 | ||
373 | if (e && e->type->ops.mq.insert_requests) { | 449 | if (e && e->type->ops.mq.insert_requests) { |
@@ -393,23 +469,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, | |||
393 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 469 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); |
394 | struct elevator_queue *e = hctx->queue->elevator; | 470 | struct elevator_queue *e = hctx->queue->elevator; |
395 | 471 | ||
396 | if (e) { | ||
397 | struct request *rq, *next; | ||
398 | |||
399 | /* | ||
400 | * We bypass requests that already have a driver tag assigned, | ||
401 | * which should only be flushes. Flushes are only ever inserted | ||
402 | * as single requests, so we shouldn't ever hit the | ||
403 | * WARN_ON_ONCE() below (but let's handle it just in case). | ||
404 | */ | ||
405 | list_for_each_entry_safe(rq, next, list, queuelist) { | ||
406 | if (WARN_ON_ONCE(rq->tag != -1)) { | ||
407 | list_del_init(&rq->queuelist); | ||
408 | blk_mq_sched_bypass_insert(hctx, rq); | ||
409 | } | ||
410 | } | ||
411 | } | ||
412 | |||
413 | if (e && e->type->ops.mq.insert_requests) | 472 | if (e && e->type->ops.mq.insert_requests) |
414 | e->type->ops.mq.insert_requests(hctx, list, false); | 473 | e->type->ops.mq.insert_requests(hctx, list, false); |
415 | else | 474 | else |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c7..c81b40ecd3f1 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
@@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, | |||
298 | } | 298 | } |
299 | EXPORT_SYMBOL(blk_mq_tagset_busy_iter); | 299 | EXPORT_SYMBOL(blk_mq_tagset_busy_iter); |
300 | 300 | ||
301 | int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, | 301 | int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, |
302 | int (reinit_request)(void *, struct request *)) | 302 | int (fn)(void *, struct request *)) |
303 | { | 303 | { |
304 | int i, j, ret = 0; | 304 | int i, j, ret = 0; |
305 | 305 | ||
306 | if (WARN_ON_ONCE(!reinit_request)) | 306 | if (WARN_ON_ONCE(!fn)) |
307 | goto out; | 307 | goto out; |
308 | 308 | ||
309 | for (i = 0; i < set->nr_hw_queues; i++) { | 309 | for (i = 0; i < set->nr_hw_queues; i++) { |
@@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, | |||
316 | if (!tags->static_rqs[j]) | 316 | if (!tags->static_rqs[j]) |
317 | continue; | 317 | continue; |
318 | 318 | ||
319 | ret = reinit_request(set->driver_data, | 319 | ret = fn(data, tags->static_rqs[j]); |
320 | tags->static_rqs[j]); | ||
321 | if (ret) | 320 | if (ret) |
322 | goto out; | 321 | goto out; |
323 | } | 322 | } |
@@ -326,7 +325,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, | |||
326 | out: | 325 | out: |
327 | return ret; | 326 | return ret; |
328 | } | 327 | } |
329 | EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); | 328 | EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); |
330 | 329 | ||
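The rename reflects that the helper is now a generic iterator over a tag set's static requests rather than a reinit-only hook. A hypothetical caller sketch (struct foo_ctrl, struct foo_cmd, the ctrl->tagset member and the reset scenario are all assumptions):

	static int foo_reinit_request(void *data, struct request *rq)
	{
		struct foo_ctrl *ctrl = data;
		struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);

		memset(cmd, 0, sizeof(*cmd));	/* wipe per-request driver state */
		cmd->ctrl = ctrl;		/* hypothetical back-pointer */
		return 0;			/* non-zero aborts the walk */
	}

		/* e.g. after a controller reset: */
		ret = blk_mq_tagset_iter(&ctrl->tagset, ctrl, foo_reinit_request);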
331 | void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, | 330 | void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, |
332 | void *priv) | 331 | void *priv) |
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index c190165d92ea..61deab0b5a5a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h | |||
@@ -45,13 +45,8 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, | |||
45 | } | 45 | } |
46 | 46 | ||
47 | enum { | 47 | enum { |
48 | BLK_MQ_TAG_CACHE_MIN = 1, | ||
49 | BLK_MQ_TAG_CACHE_MAX = 64, | ||
50 | }; | ||
51 | |||
52 | enum { | ||
53 | BLK_MQ_TAG_FAIL = -1U, | 48 | BLK_MQ_TAG_FAIL = -1U, |
54 | BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, | 49 | BLK_MQ_TAG_MIN = 1, |
55 | BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, | 50 | BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, |
56 | }; | 51 | }; |
57 | 52 | ||
diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a18609755e..b600463791ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include "blk-wbt.h" | 37 | #include "blk-wbt.h" |
38 | #include "blk-mq-sched.h" | 38 | #include "blk-mq-sched.h" |
39 | 39 | ||
40 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); | ||
40 | static void blk_mq_poll_stats_start(struct request_queue *q); | 41 | static void blk_mq_poll_stats_start(struct request_queue *q); |
41 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); | 42 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); |
42 | 43 | ||
@@ -60,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) | |||
60 | /* | 61 | /* |
61 | * Check if any of the ctx's have pending work in this hardware queue | 62 | * Check if any of the ctx's have pending work in this hardware queue |
62 | */ | 63 | */ |
63 | bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) | 64 | static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) |
64 | { | 65 | { |
65 | return sbitmap_any_bit_set(&hctx->ctx_map) || | 66 | return !list_empty_careful(&hctx->dispatch) || |
66 | !list_empty_careful(&hctx->dispatch) || | 67 | sbitmap_any_bit_set(&hctx->ctx_map) || |
67 | blk_mq_sched_has_work(hctx); | 68 | blk_mq_sched_has_work(hctx); |
68 | } | 69 | } |
69 | 70 | ||
@@ -125,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) | |||
125 | freeze_depth = atomic_inc_return(&q->mq_freeze_depth); | 126 | freeze_depth = atomic_inc_return(&q->mq_freeze_depth); |
126 | if (freeze_depth == 1) { | 127 | if (freeze_depth == 1) { |
127 | percpu_ref_kill(&q->q_usage_counter); | 128 | percpu_ref_kill(&q->q_usage_counter); |
128 | blk_mq_run_hw_queues(q, false); | 129 | if (q->mq_ops) |
130 | blk_mq_run_hw_queues(q, false); | ||
129 | } | 131 | } |
130 | } | 132 | } |
131 | EXPORT_SYMBOL_GPL(blk_freeze_queue_start); | 133 | EXPORT_SYMBOL_GPL(blk_freeze_queue_start); |
@@ -255,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) | |||
255 | queue_for_each_hw_ctx(q, hctx, i) | 257 | queue_for_each_hw_ctx(q, hctx, i) |
256 | if (blk_mq_hw_queue_mapped(hctx)) | 258 | if (blk_mq_hw_queue_mapped(hctx)) |
257 | blk_mq_tag_wakeup_all(hctx->tags, true); | 259 | blk_mq_tag_wakeup_all(hctx->tags, true); |
258 | |||
259 | /* | ||
260 | * If we are called because the queue has now been marked as | ||
261 | * dying, we need to ensure that processes currently waiting on | ||
262 | * the queue are notified as well. | ||
263 | */ | ||
264 | wake_up_all(&q->mq_freeze_wq); | ||
265 | } | 260 | } |
266 | 261 | ||
267 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) | 262 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) |
@@ -296,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
296 | rq->q = data->q; | 291 | rq->q = data->q; |
297 | rq->mq_ctx = data->ctx; | 292 | rq->mq_ctx = data->ctx; |
298 | rq->cmd_flags = op; | 293 | rq->cmd_flags = op; |
294 | if (data->flags & BLK_MQ_REQ_PREEMPT) | ||
295 | rq->rq_flags |= RQF_PREEMPT; | ||
299 | if (blk_queue_io_stat(data->q)) | 296 | if (blk_queue_io_stat(data->q)) |
300 | rq->rq_flags |= RQF_IO_STAT; | 297 | rq->rq_flags |= RQF_IO_STAT; |
301 | /* do not touch atomic flags, it needs atomic ops against the timer */ | 298 | /* do not touch atomic flags, it needs atomic ops against the timer */ |
@@ -336,12 +333,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
336 | struct elevator_queue *e = q->elevator; | 333 | struct elevator_queue *e = q->elevator; |
337 | struct request *rq; | 334 | struct request *rq; |
338 | unsigned int tag; | 335 | unsigned int tag; |
339 | struct blk_mq_ctx *local_ctx = NULL; | 336 | bool put_ctx_on_error = false; |
340 | 337 | ||
341 | blk_queue_enter_live(q); | 338 | blk_queue_enter_live(q); |
342 | data->q = q; | 339 | data->q = q; |
343 | if (likely(!data->ctx)) | 340 | if (likely(!data->ctx)) { |
344 | data->ctx = local_ctx = blk_mq_get_ctx(q); | 341 | data->ctx = blk_mq_get_ctx(q); |
342 | put_ctx_on_error = true; | ||
343 | } | ||
345 | if (likely(!data->hctx)) | 344 | if (likely(!data->hctx)) |
346 | data->hctx = blk_mq_map_queue(q, data->ctx->cpu); | 345 | data->hctx = blk_mq_map_queue(q, data->ctx->cpu); |
347 | if (op & REQ_NOWAIT) | 346 | if (op & REQ_NOWAIT) |
@@ -360,8 +359,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
360 | 359 | ||
361 | tag = blk_mq_get_tag(data); | 360 | tag = blk_mq_get_tag(data); |
362 | if (tag == BLK_MQ_TAG_FAIL) { | 361 | if (tag == BLK_MQ_TAG_FAIL) { |
363 | if (local_ctx) { | 362 | if (put_ctx_on_error) { |
364 | blk_mq_put_ctx(local_ctx); | 363 | blk_mq_put_ctx(data->ctx); |
365 | data->ctx = NULL; | 364 | data->ctx = NULL; |
366 | } | 365 | } |
367 | blk_queue_exit(q); | 366 | blk_queue_exit(q); |
@@ -384,13 +383,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
384 | } | 383 | } |
385 | 384 | ||
386 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | 385 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, |
387 | unsigned int flags) | 386 | blk_mq_req_flags_t flags) |
388 | { | 387 | { |
389 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; | 388 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; |
390 | struct request *rq; | 389 | struct request *rq; |
391 | int ret; | 390 | int ret; |
392 | 391 | ||
393 | ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); | 392 | ret = blk_queue_enter(q, flags); |
394 | if (ret) | 393 | if (ret) |
395 | return ERR_PTR(ret); | 394 | return ERR_PTR(ret); |
396 | 395 | ||
@@ -410,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | |||
410 | EXPORT_SYMBOL(blk_mq_alloc_request); | 409 | EXPORT_SYMBOL(blk_mq_alloc_request); |
411 | 410 | ||
412 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | 411 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, |
413 | unsigned int op, unsigned int flags, unsigned int hctx_idx) | 412 | unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) |
414 | { | 413 | { |
415 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; | 414 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; |
416 | struct request *rq; | 415 | struct request *rq; |
@@ -429,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | |||
429 | if (hctx_idx >= q->nr_hw_queues) | 428 | if (hctx_idx >= q->nr_hw_queues) |
430 | return ERR_PTR(-EIO); | 429 | return ERR_PTR(-EIO); |
431 | 430 | ||
432 | ret = blk_queue_enter(q, true); | 431 | ret = blk_queue_enter(q, flags); |
433 | if (ret) | 432 | if (ret) |
434 | return ERR_PTR(ret); | 433 | return ERR_PTR(ret); |
435 | 434 | ||
@@ -476,8 +475,14 @@ void blk_mq_free_request(struct request *rq) | |||
476 | if (rq->rq_flags & RQF_MQ_INFLIGHT) | 475 | if (rq->rq_flags & RQF_MQ_INFLIGHT) |
477 | atomic_dec(&hctx->nr_active); | 476 | atomic_dec(&hctx->nr_active); |
478 | 477 | ||
478 | if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) | ||
479 | laptop_io_completion(q->backing_dev_info); | ||
480 | |||
479 | wbt_done(q->rq_wb, &rq->issue_stat); | 481 | wbt_done(q->rq_wb, &rq->issue_stat); |
480 | 482 | ||
483 | if (blk_rq_rl(rq)) | ||
484 | blk_put_rl(blk_rq_rl(rq)); | ||
485 | |||
481 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 486 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
482 | clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); | 487 | clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); |
483 | if (rq->tag != -1) | 488 | if (rq->tag != -1) |
@@ -593,22 +598,32 @@ void blk_mq_start_request(struct request *rq) | |||
593 | 598 | ||
594 | blk_add_timer(rq); | 599 | blk_add_timer(rq); |
595 | 600 | ||
596 | /* | 601 | WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); |
597 | * Ensure that ->deadline is visible before set the started | ||
598 | * flag and clear the completed flag. | ||
599 | */ | ||
600 | smp_mb__before_atomic(); | ||
601 | 602 | ||
602 | /* | 603 | /* |
603 | * Mark us as started and clear complete. Complete might have been | 604 | * Mark us as started and clear complete. Complete might have been |
604 | * set if requeue raced with timeout, which then marked it as | 605 | * set if requeue raced with timeout, which then marked it as |
605 | * complete. So be sure to clear complete again when we start | 606 | * complete. So be sure to clear complete again when we start |
606 | * the request, otherwise we'll ignore the completion event. | 607 | * the request, otherwise we'll ignore the completion event. |
608 | * | ||
609 | * Ensure that ->deadline is visible before we set STARTED, such that | ||
610 | * blk_mq_check_expired() is guaranteed to observe our ->deadline when | ||
611 | * it observes STARTED. | ||
607 | */ | 612 | */ |
608 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 613 | smp_wmb(); |
609 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 614 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
610 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) | 615 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { |
616 | /* | ||
617 | * Coherence order guarantees these consecutive stores to a | ||
618 | * single variable propagate in the specified order. Thus the | ||
619 | * clear_bit() is ordered _after_ the set_bit(). See | ||
620 | * blk_mq_check_expired(). | ||
621 | * | ||
622 | * (the bits must be part of the same byte for this to be | ||
623 | * true). | ||
624 | */ | ||
611 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | 625 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); |
626 | } | ||
612 | 627 | ||
613 | if (q->dma_drain_size && blk_rq_bytes(rq)) { | 628 | if (q->dma_drain_size && blk_rq_bytes(rq)) { |
614 | /* | 629 | /* |
@@ -634,6 +649,8 @@ static void __blk_mq_requeue_request(struct request *rq) | |||
634 | { | 649 | { |
635 | struct request_queue *q = rq->q; | 650 | struct request_queue *q = rq->q; |
636 | 651 | ||
652 | blk_mq_put_driver_tag(rq); | ||
653 | |||
637 | trace_block_rq_requeue(q, rq); | 654 | trace_block_rq_requeue(q, rq); |
638 | wbt_requeue(q->rq_wb, &rq->issue_stat); | 655 | wbt_requeue(q->rq_wb, &rq->issue_stat); |
639 | blk_mq_sched_requeue_request(rq); | 656 | blk_mq_sched_requeue_request(rq); |
@@ -690,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, | |||
690 | 707 | ||
691 | /* | 708 | /* |
692 | * We abuse this flag that is otherwise used by the I/O scheduler to | 709 | * We abuse this flag that is otherwise used by the I/O scheduler to |
693 | * request head insertation from the workqueue. | 710 | * request head insertion from the workqueue. |
694 | */ | 711 | */ |
695 | BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); | 712 | BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); |
696 | 713 | ||
@@ -778,11 +795,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
778 | struct request *rq, void *priv, bool reserved) | 795 | struct request *rq, void *priv, bool reserved) |
779 | { | 796 | { |
780 | struct blk_mq_timeout_data *data = priv; | 797 | struct blk_mq_timeout_data *data = priv; |
798 | unsigned long deadline; | ||
781 | 799 | ||
782 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 800 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) |
783 | return; | 801 | return; |
784 | 802 | ||
785 | /* | 803 | /* |
804 | * Ensures that if we see STARTED we must also see our | ||
805 | * up-to-date deadline, see blk_mq_start_request(). | ||
806 | */ | ||
807 | smp_rmb(); | ||
808 | |||
809 | deadline = READ_ONCE(rq->deadline); | ||
810 | |||
811 | /* | ||
786 | * The rq being checked may have been freed and reallocated | 812 | * The rq being checked may have been freed and reallocated |
787 | * out already here, we avoid this race by checking rq->deadline | 813 | * out already here, we avoid this race by checking rq->deadline |
788 | * and REQ_ATOM_COMPLETE flag together: | 814 | * and REQ_ATOM_COMPLETE flag together: |
@@ -795,11 +821,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
795 | * and clearing the flag in blk_mq_start_request(), so | 821 | * and clearing the flag in blk_mq_start_request(), so |
796 | * this rq won't be timed out too. | 822 | * this rq won't be timed out too. |
797 | */ | 823 | */ |
798 | if (time_after_eq(jiffies, rq->deadline)) { | 824 | if (time_after_eq(jiffies, deadline)) { |
799 | if (!blk_mark_rq_complete(rq)) | 825 | if (!blk_mark_rq_complete(rq)) { |
826 | /* | ||
827 | * Again coherence order ensures that consecutive reads | ||
828 | * from the same variable must be in that order. This | ||
829 | * ensures that if we see COMPLETE clear, we must then | ||
830 | * see STARTED set and we'll ignore this timeout. | ||
831 | * | ||
832 | * (There's also the MB implied by the test_and_clear()) | ||
833 | */ | ||
800 | blk_mq_rq_timed_out(rq, reserved); | 834 | blk_mq_rq_timed_out(rq, reserved); |
801 | } else if (!data->next_set || time_after(data->next, rq->deadline)) { | 835 | } |
802 | data->next = rq->deadline; | 836 | } else if (!data->next_set || time_after(data->next, deadline)) { |
837 | data->next = deadline; | ||
803 | data->next_set = 1; | 838 | data->next_set = 1; |
804 | } | 839 | } |
805 | } | 840 | } |
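The two comments above describe a publisher/observer pairing. Reduced to its bones it looks roughly like the sketch below (the deadline store really happens inside blk_add_timer(); this is an illustration, not the in-tree code):

	/* blk_mq_start_request() side: publish the deadline, then STARTED */
	WRITE_ONCE(rq->deadline, jiffies + rq->timeout);
	smp_wmb();				/* deadline visible before STARTED */
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	/* blk_mq_check_expired() side: if STARTED is seen, so is the deadline */
	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		unsigned long deadline;

		smp_rmb();			/* pairs with the smp_wmb() above */
		deadline = READ_ONCE(rq->deadline);
		/* deadline is at least as new as the one published above */
	}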
@@ -880,6 +915,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
880 | } | 915 | } |
881 | EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); | 916 | EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); |
882 | 917 | ||
918 | struct dispatch_rq_data { | ||
919 | struct blk_mq_hw_ctx *hctx; | ||
920 | struct request *rq; | ||
921 | }; | ||
922 | |||
923 | static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, | ||
924 | void *data) | ||
925 | { | ||
926 | struct dispatch_rq_data *dispatch_data = data; | ||
927 | struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; | ||
928 | struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; | ||
929 | |||
930 | spin_lock(&ctx->lock); | ||
931 | if (unlikely(!list_empty(&ctx->rq_list))) { | ||
932 | dispatch_data->rq = list_entry_rq(ctx->rq_list.next); | ||
933 | list_del_init(&dispatch_data->rq->queuelist); | ||
934 | if (list_empty(&ctx->rq_list)) | ||
935 | sbitmap_clear_bit(sb, bitnr); | ||
936 | } | ||
937 | spin_unlock(&ctx->lock); | ||
938 | |||
939 | return !dispatch_data->rq; | ||
940 | } | ||
941 | |||
942 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, | ||
943 | struct blk_mq_ctx *start) | ||
944 | { | ||
945 | unsigned off = start ? start->index_hw : 0; | ||
946 | struct dispatch_rq_data data = { | ||
947 | .hctx = hctx, | ||
948 | .rq = NULL, | ||
949 | }; | ||
950 | |||
951 | __sbitmap_for_each_set(&hctx->ctx_map, off, | ||
952 | dispatch_rq_from_ctx, &data); | ||
953 | |||
954 | return data.rq; | ||
955 | } | ||
956 | |||
883 | static inline unsigned int queued_to_index(unsigned int queued) | 957 | static inline unsigned int queued_to_index(unsigned int queued) |
884 | { | 958 | { |
885 | if (!queued) | 959 | if (!queued) |
@@ -920,109 +994,95 @@ done: | |||
920 | return rq->tag != -1; | 994 | return rq->tag != -1; |
921 | } | 995 | } |
922 | 996 | ||
923 | static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, | 997 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, |
924 | struct request *rq) | 998 | int flags, void *key) |
925 | { | ||
926 | blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); | ||
927 | rq->tag = -1; | ||
928 | |||
929 | if (rq->rq_flags & RQF_MQ_INFLIGHT) { | ||
930 | rq->rq_flags &= ~RQF_MQ_INFLIGHT; | ||
931 | atomic_dec(&hctx->nr_active); | ||
932 | } | ||
933 | } | ||
934 | |||
935 | static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, | ||
936 | struct request *rq) | ||
937 | { | ||
938 | if (rq->tag == -1 || rq->internal_tag == -1) | ||
939 | return; | ||
940 | |||
941 | __blk_mq_put_driver_tag(hctx, rq); | ||
942 | } | ||
943 | |||
944 | static void blk_mq_put_driver_tag(struct request *rq) | ||
945 | { | 999 | { |
946 | struct blk_mq_hw_ctx *hctx; | 1000 | struct blk_mq_hw_ctx *hctx; |
947 | 1001 | ||
948 | if (rq->tag == -1 || rq->internal_tag == -1) | 1002 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); |
949 | return; | ||
950 | 1003 | ||
951 | hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); | 1004 | list_del_init(&wait->entry); |
952 | __blk_mq_put_driver_tag(hctx, rq); | 1005 | blk_mq_run_hw_queue(hctx, true); |
1006 | return 1; | ||
953 | } | 1007 | } |
954 | 1008 | ||
955 | /* | 1009 | /* |
956 | * If we fail getting a driver tag because all the driver tags are already | 1010 | * Mark us waiting for a tag. For shared tags, this involves hooking us into |
957 | * assigned and on the dispatch list, BUT the first entry does not have a | 1011 | * the tag wakeups. For non-shared tags, we can simply mark us needing a |
958 | * tag, then we could deadlock. For that case, move entries with assigned | 1012 | * restart. For both cases, take care to check the condition again after |
959 | * driver tags to the front, leaving the set of tagged requests in the | 1013 | * marking us as waiting. |
960 | * same order, and the untagged set in the same order. | ||
961 | */ | 1014 | */ |
962 | static bool reorder_tags_to_front(struct list_head *list) | 1015 | static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, |
963 | { | 1016 | struct request *rq) |
964 | struct request *rq, *tmp, *first = NULL; | ||
965 | |||
966 | list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { | ||
967 | if (rq == first) | ||
968 | break; | ||
969 | if (rq->tag != -1) { | ||
970 | list_move(&rq->queuelist, list); | ||
971 | if (!first) | ||
972 | first = rq; | ||
973 | } | ||
974 | } | ||
975 | |||
976 | return first != NULL; | ||
977 | } | ||
978 | |||
979 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, | ||
980 | void *key) | ||
981 | { | 1017 | { |
982 | struct blk_mq_hw_ctx *hctx; | 1018 | struct blk_mq_hw_ctx *this_hctx = *hctx; |
1019 | bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; | ||
1020 | struct sbq_wait_state *ws; | ||
1021 | wait_queue_entry_t *wait; | ||
1022 | bool ret; | ||
983 | 1023 | ||
984 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); | 1024 | if (!shared_tags) { |
1025 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) | ||
1026 | set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); | ||
1027 | } else { | ||
1028 | wait = &this_hctx->dispatch_wait; | ||
1029 | if (!list_empty_careful(&wait->entry)) | ||
1030 | return false; | ||
985 | 1031 | ||
986 | list_del(&wait->entry); | 1032 | spin_lock(&this_hctx->lock); |
987 | clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); | 1033 | if (!list_empty(&wait->entry)) { |
988 | blk_mq_run_hw_queue(hctx, true); | 1034 | spin_unlock(&this_hctx->lock); |
989 | return 1; | 1035 | return false; |
990 | } | 1036 | } |
991 | 1037 | ||
992 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) | 1038 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); |
993 | { | 1039 | add_wait_queue(&ws->wait, wait); |
994 | struct sbq_wait_state *ws; | 1040 | } |
995 | 1041 | ||
996 | /* | 1042 | /* |
997 | * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. | 1043 | * It's possible that a tag was freed in the window between the |
998 | * The thread which wins the race to grab this bit adds the hardware | 1044 | * allocation failure and adding the hardware queue to the wait |
999 | * queue to the wait queue. | 1045 | * queue. |
1000 | */ | 1046 | */ |
1001 | if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || | 1047 | ret = blk_mq_get_driver_tag(rq, hctx, false); |
1002 | test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) | ||
1003 | return false; | ||
1004 | 1048 | ||
1005 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | 1049 | if (!shared_tags) { |
1006 | ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); | 1050 | /* |
1051 | * Don't clear RESTART here, someone else could have set it. | ||
1052 | * At most this will cost an extra queue run. | ||
1053 | */ | ||
1054 | return ret; | ||
1055 | } else { | ||
1056 | if (!ret) { | ||
1057 | spin_unlock(&this_hctx->lock); | ||
1058 | return false; | ||
1059 | } | ||
1007 | 1060 | ||
1008 | /* | 1061 | /* |
1009 | * As soon as this returns, it's no longer safe to fiddle with | 1062 | * We got a tag, remove ourselves from the wait queue to ensure |
1010 | * hctx->dispatch_wait, since a completion can wake up the wait queue | 1063 | * someone else gets the wakeup. |
1011 | * and unlock the bit. | 1064 | */ |
1012 | */ | 1065 | spin_lock_irq(&ws->wait.lock); |
1013 | add_wait_queue(&ws->wait, &hctx->dispatch_wait); | 1066 | list_del_init(&wait->entry); |
1014 | return true; | 1067 | spin_unlock_irq(&ws->wait.lock); |
1068 | spin_unlock(&this_hctx->lock); | ||
1069 | return true; | ||
1070 | } | ||
1015 | } | 1071 | } |
1016 | 1072 | ||
1017 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | 1073 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, |
1074 | bool got_budget) | ||
1018 | { | 1075 | { |
1019 | struct blk_mq_hw_ctx *hctx; | 1076 | struct blk_mq_hw_ctx *hctx; |
1020 | struct request *rq; | 1077 | struct request *rq, *nxt; |
1078 | bool no_tag = false; | ||
1021 | int errors, queued; | 1079 | int errors, queued; |
1022 | 1080 | ||
1023 | if (list_empty(list)) | 1081 | if (list_empty(list)) |
1024 | return false; | 1082 | return false; |
1025 | 1083 | ||
1084 | WARN_ON(!list_is_singular(list) && got_budget); | ||
1085 | |||
1026 | /* | 1086 | /* |
1027 | * Now process all the entries, sending them to the driver. | 1087 | * Now process all the entries, sending them to the driver. |
1028 | */ | 1088 | */ |
@@ -1033,23 +1093,29 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | |||
1033 | 1093 | ||
1034 | rq = list_first_entry(list, struct request, queuelist); | 1094 | rq = list_first_entry(list, struct request, queuelist); |
1035 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { | 1095 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { |
1036 | if (!queued && reorder_tags_to_front(list)) | ||
1037 | continue; | ||
1038 | |||
1039 | /* | 1096 | /* |
1040 | * The initial allocation attempt failed, so we need to | 1097 | * The initial allocation attempt failed, so we need to |
1041 | * rerun the hardware queue when a tag is freed. | 1098 | * rerun the hardware queue when a tag is freed. The |
1099 | * waitqueue takes care of that. If the queue is run | ||
1100 | * before we add this entry back on the dispatch list, | ||
1101 | * we'll re-run it below. | ||
1042 | */ | 1102 | */ |
1043 | if (!blk_mq_dispatch_wait_add(hctx)) | 1103 | if (!blk_mq_mark_tag_wait(&hctx, rq)) { |
1104 | if (got_budget) | ||
1105 | blk_mq_put_dispatch_budget(hctx); | ||
1106 | /* | ||
1107 | * For non-shared tags, the RESTART check | ||
1108 | * will suffice. | ||
1109 | */ | ||
1110 | if (hctx->flags & BLK_MQ_F_TAG_SHARED) | ||
1111 | no_tag = true; | ||
1044 | break; | 1112 | break; |
1113 | } | ||
1114 | } | ||
1045 | 1115 | ||
1046 | /* | 1116 | if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { |
1047 | * It's possible that a tag was freed in the window | 1117 | blk_mq_put_driver_tag(rq); |
1048 | * between the allocation failure and adding the | 1118 | break; |
1049 | * hardware queue to the wait queue. | ||
1050 | */ | ||
1051 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) | ||
1052 | break; | ||
1053 | } | 1119 | } |
1054 | 1120 | ||
1055 | list_del_init(&rq->queuelist); | 1121 | list_del_init(&rq->queuelist); |
@@ -1063,15 +1129,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | |||
1063 | if (list_empty(list)) | 1129 | if (list_empty(list)) |
1064 | bd.last = true; | 1130 | bd.last = true; |
1065 | else { | 1131 | else { |
1066 | struct request *nxt; | ||
1067 | |||
1068 | nxt = list_first_entry(list, struct request, queuelist); | 1132 | nxt = list_first_entry(list, struct request, queuelist); |
1069 | bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); | 1133 | bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); |
1070 | } | 1134 | } |
1071 | 1135 | ||
1072 | ret = q->mq_ops->queue_rq(hctx, &bd); | 1136 | ret = q->mq_ops->queue_rq(hctx, &bd); |
1073 | if (ret == BLK_STS_RESOURCE) { | 1137 | if (ret == BLK_STS_RESOURCE) { |
1074 | blk_mq_put_driver_tag_hctx(hctx, rq); | 1138 | /* |
1139 | * If an I/O scheduler has been configured and we got a | ||
1140 | * driver tag for the next request already, free it | ||
1141 | * again. | ||
1142 | */ | ||
1143 | if (!list_empty(list)) { | ||
1144 | nxt = list_first_entry(list, struct request, queuelist); | ||
1145 | blk_mq_put_driver_tag(nxt); | ||
1146 | } | ||
1075 | list_add(&rq->queuelist, list); | 1147 | list_add(&rq->queuelist, list); |
1076 | __blk_mq_requeue_request(rq); | 1148 | __blk_mq_requeue_request(rq); |
1077 | break; | 1149 | break; |
@@ -1093,13 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | |||
1093 | * that is where we will continue on next queue run. | 1165 | * that is where we will continue on next queue run. |
1094 | */ | 1166 | */ |
1095 | if (!list_empty(list)) { | 1167 | if (!list_empty(list)) { |
1096 | /* | ||
1097 | * If an I/O scheduler has been configured and we got a driver | ||
1098 | * tag for the next request already, free it again. | ||
1099 | */ | ||
1100 | rq = list_first_entry(list, struct request, queuelist); | ||
1101 | blk_mq_put_driver_tag(rq); | ||
1102 | |||
1103 | spin_lock(&hctx->lock); | 1168 | spin_lock(&hctx->lock); |
1104 | list_splice_init(list, &hctx->dispatch); | 1169 | list_splice_init(list, &hctx->dispatch); |
1105 | spin_unlock(&hctx->lock); | 1170 | spin_unlock(&hctx->lock); |
@@ -1109,10 +1174,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | |||
1109 | * it is no longer set that means that it was cleared by another | 1174 | * it is no longer set that means that it was cleared by another |
1110 | * thread and hence that a queue rerun is needed. | 1175 | * thread and hence that a queue rerun is needed. |
1111 | * | 1176 | * |
1112 | * If TAG_WAITING is set that means that an I/O scheduler has | 1177 | * If 'no_tag' is set, that means that we failed getting |
1113 | * been configured and another thread is waiting for a driver | 1178 | * a driver tag with an I/O scheduler attached. If our dispatch |
1114 | * tag. To guarantee fairness, do not rerun this hardware queue | 1179 | * waitqueue is no longer active, ensure that we run the queue |
1115 | * but let the other thread grab the driver tag. | 1180 | * AFTER adding our entries back to the list. |
1116 | * | 1181 | * |
1117 | * If no I/O scheduler has been configured it is possible that | 1182 | * If no I/O scheduler has been configured it is possible that |
1118 | * the hardware queue got stopped and restarted before requests | 1183 | * the hardware queue got stopped and restarted before requests |
@@ -1124,8 +1189,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) | |||
1124 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq | 1189 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq |
1125 | * and dm-rq. | 1190 | * and dm-rq. |
1126 | */ | 1191 | */ |
1127 | if (!blk_mq_sched_needs_restart(hctx) && | 1192 | if (!blk_mq_sched_needs_restart(hctx) || |
1128 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1193 | (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) |
1129 | blk_mq_run_hw_queue(hctx, true); | 1194 | blk_mq_run_hw_queue(hctx, true); |
1130 | } | 1195 | } |
1131 | 1196 | ||
@@ -1218,9 +1283,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | |||
1218 | } | 1283 | } |
1219 | EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); | 1284 | EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); |
1220 | 1285 | ||
1221 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | 1286 | bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) |
1222 | { | 1287 | { |
1223 | __blk_mq_delay_run_hw_queue(hctx, async, 0); | 1288 | if (blk_mq_hctx_has_pending(hctx)) { |
1289 | __blk_mq_delay_run_hw_queue(hctx, async, 0); | ||
1290 | return true; | ||
1291 | } | ||
1292 | |||
1293 | return false; | ||
1224 | } | 1294 | } |
1225 | EXPORT_SYMBOL(blk_mq_run_hw_queue); | 1295 | EXPORT_SYMBOL(blk_mq_run_hw_queue); |
1226 | 1296 | ||
@@ -1230,8 +1300,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) | |||
1230 | int i; | 1300 | int i; |
1231 | 1301 | ||
1232 | queue_for_each_hw_ctx(q, hctx, i) { | 1302 | queue_for_each_hw_ctx(q, hctx, i) { |
1233 | if (!blk_mq_hctx_has_pending(hctx) || | 1303 | if (blk_mq_hctx_stopped(hctx)) |
1234 | blk_mq_hctx_stopped(hctx)) | ||
1235 | continue; | 1304 | continue; |
1236 | 1305 | ||
1237 | blk_mq_run_hw_queue(hctx, async); | 1306 | blk_mq_run_hw_queue(hctx, async); |
@@ -1405,7 +1474,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
1405 | * Should only be used carefully, when the caller knows we want to | 1474 | * Should only be used carefully, when the caller knows we want to |
1406 | * bypass a potential IO scheduler on the target device. | 1475 | * bypass a potential IO scheduler on the target device. |
1407 | */ | 1476 | */ |
1408 | void blk_mq_request_bypass_insert(struct request *rq) | 1477 | void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) |
1409 | { | 1478 | { |
1410 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 1479 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
1411 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); | 1480 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); |
@@ -1414,7 +1483,8 @@ void blk_mq_request_bypass_insert(struct request *rq) | |||
1414 | list_add_tail(&rq->queuelist, &hctx->dispatch); | 1483 | list_add_tail(&rq->queuelist, &hctx->dispatch); |
1415 | spin_unlock(&hctx->lock); | 1484 | spin_unlock(&hctx->lock); |
1416 | 1485 | ||
1417 | blk_mq_run_hw_queue(hctx, false); | 1486 | if (run_queue) |
1487 | blk_mq_run_hw_queue(hctx, false); | ||
1418 | } | 1488 | } |
1419 | 1489 | ||
1420 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | 1490 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, |
@@ -1501,13 +1571,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | |||
1501 | { | 1571 | { |
1502 | blk_init_request_from_bio(rq, bio); | 1572 | blk_init_request_from_bio(rq, bio); |
1503 | 1573 | ||
1504 | blk_account_io_start(rq, true); | 1574 | blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); |
1505 | } | ||
1506 | 1575 | ||
1507 | static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) | 1576 | blk_account_io_start(rq, true); |
1508 | { | ||
1509 | return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | ||
1510 | !blk_queue_nomerges(hctx->queue); | ||
1511 | } | 1577 | } |
1512 | 1578 | ||
1513 | static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, | 1579 | static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, |
@@ -1552,6 +1618,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
1552 | if (!blk_mq_get_driver_tag(rq, NULL, false)) | 1618 | if (!blk_mq_get_driver_tag(rq, NULL, false)) |
1553 | goto insert; | 1619 | goto insert; |
1554 | 1620 | ||
1621 | if (!blk_mq_get_dispatch_budget(hctx)) { | ||
1622 | blk_mq_put_driver_tag(rq); | ||
1623 | goto insert; | ||
1624 | } | ||
1625 | |||
1555 | new_cookie = request_to_qc_t(hctx, rq); | 1626 | new_cookie = request_to_qc_t(hctx, rq); |
1556 | 1627 | ||
1557 | /* | 1628 | /* |
@@ -1641,13 +1712,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1641 | if (unlikely(is_flush_fua)) { | 1712 | if (unlikely(is_flush_fua)) { |
1642 | blk_mq_put_ctx(data.ctx); | 1713 | blk_mq_put_ctx(data.ctx); |
1643 | blk_mq_bio_to_request(rq, bio); | 1714 | blk_mq_bio_to_request(rq, bio); |
1644 | if (q->elevator) { | 1715 | |
1645 | blk_mq_sched_insert_request(rq, false, true, true, | 1716 | /* bypass scheduler for flush rq */ |
1646 | true); | 1717 | blk_insert_flush(rq); |
1647 | } else { | 1718 | blk_mq_run_hw_queue(data.hctx, true); |
1648 | blk_insert_flush(rq); | ||
1649 | blk_mq_run_hw_queue(data.hctx, true); | ||
1650 | } | ||
1651 | } else if (plug && q->nr_hw_queues == 1) { | 1719 | } else if (plug && q->nr_hw_queues == 1) { |
1652 | struct request *last = NULL; | 1720 | struct request *last = NULL; |
1653 | 1721 | ||
@@ -1990,6 +2058,9 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
1990 | 2058 | ||
1991 | hctx->nr_ctx = 0; | 2059 | hctx->nr_ctx = 0; |
1992 | 2060 | ||
2061 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
2062 | INIT_LIST_HEAD(&hctx->dispatch_wait.entry); | ||
2063 | |||
1993 | if (set->ops->init_hctx && | 2064 | if (set->ops->init_hctx && |
1994 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) | 2065 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) |
1995 | goto free_bitmap; | 2066 | goto free_bitmap; |
@@ -2229,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | |||
2229 | 2300 | ||
2230 | mutex_lock(&set->tag_list_lock); | 2301 | mutex_lock(&set->tag_list_lock); |
2231 | 2302 | ||
2232 | /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ | 2303 | /* |
2233 | if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { | 2304 | * Check to see if we're transitioning to shared (from 1 to 2 queues). |
2305 | */ | ||
2306 | if (!list_empty(&set->tag_list) && | ||
2307 | !(set->flags & BLK_MQ_F_TAG_SHARED)) { | ||
2234 | set->flags |= BLK_MQ_F_TAG_SHARED; | 2308 | set->flags |= BLK_MQ_F_TAG_SHARED; |
2235 | /* update existing queue */ | 2309 | /* update existing queue */ |
2236 | blk_mq_update_tag_set_depth(set, true); | 2310 | blk_mq_update_tag_set_depth(set, true); |
@@ -2404,6 +2478,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2404 | spin_lock_init(&q->requeue_lock); | 2478 | spin_lock_init(&q->requeue_lock); |
2405 | 2479 | ||
2406 | blk_queue_make_request(q, blk_mq_make_request); | 2480 | blk_queue_make_request(q, blk_mq_make_request); |
2481 | if (q->mq_ops->poll) | ||
2482 | q->poll_fn = blk_mq_poll; | ||
2407 | 2483 | ||
2408 | /* | 2484 | /* |
2409 | * Do this after blk_queue_make_request() overrides it... | 2485 | * Do this after blk_queue_make_request() overrides it... |
@@ -2460,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q) | |||
2460 | 2536 | ||
2461 | /* | 2537 | /* |
2462 | * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe | 2538 | * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe |
2463 | * we should change hctx numa_node according to new topology (this | 2539 | * we should change hctx numa_node according to the new topology (this |
2464 | * involves free and re-allocate memory, worthy doing?) | 2540 | * involves freeing and re-allocating memory, worth doing?) |
2465 | */ | 2541 | */ |
2466 | |||
2467 | blk_mq_map_swqueue(q); | 2542 | blk_mq_map_swqueue(q); |
2468 | 2543 | ||
2469 | blk_mq_sysfs_register(q); | 2544 | blk_mq_sysfs_register(q); |
@@ -2552,6 +2627,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
2552 | if (!set->ops->queue_rq) | 2627 | if (!set->ops->queue_rq) |
2553 | return -EINVAL; | 2628 | return -EINVAL; |
2554 | 2629 | ||
2630 | if (!set->ops->get_budget ^ !set->ops->put_budget) | ||
2631 | return -EINVAL; | ||
2632 | |||
2555 | if (set->queue_depth > BLK_MQ_MAX_DEPTH) { | 2633 | if (set->queue_depth > BLK_MQ_MAX_DEPTH) { |
2556 | pr_info("blk-mq: reduced tag depth to %u\n", | 2634 | pr_info("blk-mq: reduced tag depth to %u\n", |
2557 | BLK_MQ_MAX_DEPTH); | 2635 | BLK_MQ_MAX_DEPTH); |
@@ -2642,8 +2720,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) | |||
2642 | * queue depth. This is similar to what the old code would do. | 2720 | * queue depth. This is similar to what the old code would do. |
2643 | */ | 2721 | */ |
2644 | if (!hctx->sched_tags) { | 2722 | if (!hctx->sched_tags) { |
2645 | ret = blk_mq_tag_update_depth(hctx, &hctx->tags, | 2723 | ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, |
2646 | min(nr, set->queue_depth), | ||
2647 | false); | 2724 | false); |
2648 | } else { | 2725 | } else { |
2649 | ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, | 2726 | ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, |
@@ -2863,20 +2940,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) | |||
2863 | return false; | 2940 | return false; |
2864 | } | 2941 | } |
2865 | 2942 | ||
2866 | bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) | 2943 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) |
2867 | { | 2944 | { |
2868 | struct blk_mq_hw_ctx *hctx; | 2945 | struct blk_mq_hw_ctx *hctx; |
2869 | struct blk_plug *plug; | ||
2870 | struct request *rq; | 2946 | struct request *rq; |
2871 | 2947 | ||
2872 | if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || | 2948 | if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) |
2873 | !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) | ||
2874 | return false; | 2949 | return false; |
2875 | 2950 | ||
2876 | plug = current->plug; | ||
2877 | if (plug) | ||
2878 | blk_flush_plug_list(plug, false); | ||
2879 | |||
2880 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; | 2951 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; |
2881 | if (!blk_qc_t_is_internal(cookie)) | 2952 | if (!blk_qc_t_is_internal(cookie)) |
2882 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); | 2953 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); |
@@ -2894,10 +2965,15 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) | |||
2894 | 2965 | ||
2895 | return __blk_mq_poll(hctx, rq); | 2966 | return __blk_mq_poll(hctx, rq); |
2896 | } | 2967 | } |
2897 | EXPORT_SYMBOL_GPL(blk_mq_poll); | ||
2898 | 2968 | ||
2899 | static int __init blk_mq_init(void) | 2969 | static int __init blk_mq_init(void) |
2900 | { | 2970 | { |
2971 | /* | ||
2972 | * See comment in block/blk.h rq_atomic_flags enum | ||
2973 | */ | ||
2974 | BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != | ||
2975 | (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); | ||
2976 | |||
2901 | cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, | 2977 | cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, |
2902 | blk_mq_hctx_notify_dead); | 2978 | blk_mq_hctx_notify_dead); |
2903 | return 0; | 2979 | return 0; |
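The blk-mq.c hunks above surface two new driver-facing hooks: the ->get_budget/->put_budget pair, which blk_mq_alloc_tag_set() now insists is supplied either completely or not at all (the XOR check returns -EINVAL for half a pair), and ->poll, whose presence is what gets q->poll_fn pointed at the now-static blk_mq_poll(). A minimal sketch of a tag set under these rules; my_queue_rq, my_get_budget, my_put_budget and my_poll are hypothetical driver callbacks, not anything defined by this series:

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,		/* mandatory, as before */
	.get_budget	= my_get_budget,	/* optional, but only ... */
	.put_budget	= my_put_budget,	/* ... together with this one */
	.poll		= my_poll,		/* enables q->poll_fn = blk_mq_poll */
};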
diff --git a/block/blk-mq.h b/block/blk-mq.h index 4933af9d61f7..6c7c3ff5bf62 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
@@ -3,6 +3,7 @@ | |||
3 | #define INT_BLK_MQ_H | 3 | #define INT_BLK_MQ_H |
4 | 4 | ||
5 | #include "blk-stat.h" | 5 | #include "blk-stat.h" |
6 | #include "blk-mq-tag.h" | ||
6 | 7 | ||
7 | struct blk_mq_tag_set; | 8 | struct blk_mq_tag_set; |
8 | 9 | ||
@@ -26,16 +27,16 @@ struct blk_mq_ctx { | |||
26 | struct kobject kobj; | 27 | struct kobject kobj; |
27 | } ____cacheline_aligned_in_smp; | 28 | } ____cacheline_aligned_in_smp; |
28 | 29 | ||
29 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); | ||
30 | void blk_mq_freeze_queue(struct request_queue *q); | 30 | void blk_mq_freeze_queue(struct request_queue *q); |
31 | void blk_mq_free_queue(struct request_queue *q); | 31 | void blk_mq_free_queue(struct request_queue *q); |
32 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); | 32 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); |
33 | void blk_mq_wake_waiters(struct request_queue *q); | 33 | void blk_mq_wake_waiters(struct request_queue *q); |
34 | bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); | 34 | bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); |
35 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); | 35 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); |
36 | bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); | ||
37 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | 36 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, |
38 | bool wait); | 37 | bool wait); |
38 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, | ||
39 | struct blk_mq_ctx *start); | ||
39 | 40 | ||
40 | /* | 41 | /* |
41 | * Internal helpers for allocating/freeing the request map | 42 | * Internal helpers for allocating/freeing the request map |
@@ -55,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, | |||
55 | */ | 56 | */ |
56 | void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | 57 | void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
57 | bool at_head); | 58 | bool at_head); |
58 | void blk_mq_request_bypass_insert(struct request *rq); | 59 | void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); |
59 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | 60 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, |
60 | struct list_head *list); | 61 | struct list_head *list); |
61 | 62 | ||
@@ -109,7 +110,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) | |||
109 | struct blk_mq_alloc_data { | 110 | struct blk_mq_alloc_data { |
110 | /* input parameter */ | 111 | /* input parameter */ |
111 | struct request_queue *q; | 112 | struct request_queue *q; |
112 | unsigned int flags; | 113 | blk_mq_req_flags_t flags; |
113 | unsigned int shallow_depth; | 114 | unsigned int shallow_depth; |
114 | 115 | ||
115 | /* input & output parameter */ | 116 | /* input & output parameter */ |
@@ -138,4 +139,53 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) | |||
138 | void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, | 139 | void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, |
139 | unsigned int inflight[2]); | 140 | unsigned int inflight[2]); |
140 | 141 | ||
142 | static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) | ||
143 | { | ||
144 | struct request_queue *q = hctx->queue; | ||
145 | |||
146 | if (q->mq_ops->put_budget) | ||
147 | q->mq_ops->put_budget(hctx); | ||
148 | } | ||
149 | |||
150 | static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) | ||
151 | { | ||
152 | struct request_queue *q = hctx->queue; | ||
153 | |||
154 | if (q->mq_ops->get_budget) | ||
155 | return q->mq_ops->get_budget(hctx); | ||
156 | return true; | ||
157 | } | ||
158 | |||
159 | static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, | ||
160 | struct request *rq) | ||
161 | { | ||
162 | blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); | ||
163 | rq->tag = -1; | ||
164 | |||
165 | if (rq->rq_flags & RQF_MQ_INFLIGHT) { | ||
166 | rq->rq_flags &= ~RQF_MQ_INFLIGHT; | ||
167 | atomic_dec(&hctx->nr_active); | ||
168 | } | ||
169 | } | ||
170 | |||
171 | static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, | ||
172 | struct request *rq) | ||
173 | { | ||
174 | if (rq->tag == -1 || rq->internal_tag == -1) | ||
175 | return; | ||
176 | |||
177 | __blk_mq_put_driver_tag(hctx, rq); | ||
178 | } | ||
179 | |||
180 | static inline void blk_mq_put_driver_tag(struct request *rq) | ||
181 | { | ||
182 | struct blk_mq_hw_ctx *hctx; | ||
183 | |||
184 | if (rq->tag == -1 || rq->internal_tag == -1) | ||
185 | return; | ||
186 | |||
187 | hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); | ||
188 | __blk_mq_put_driver_tag(hctx, rq); | ||
189 | } | ||
190 | |||
141 | #endif | 191 | #endif |
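blk_mq_get_dispatch_budget()/blk_mq_put_dispatch_budget() above quietly grant an unlimited budget when the driver leaves both hooks NULL. For a driver that does implement them, a minimal sketch of the intended shape, loosely modelled on the per-device busy accounting scsi-mq uses; struct my_dev and its fields are assumptions made up for illustration:

struct my_dev {				/* hypothetical per-device driver data */
	atomic_t	inflight;
	int		queue_depth;
};

static bool my_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct my_dev *dev = hctx->queue->queuedata;

	if (atomic_inc_return(&dev->inflight) > dev->queue_depth) {
		atomic_dec(&dev->inflight);
		return false;		/* dispatch will be retried later */
	}
	return true;
}

static void my_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct my_dev *dev = hctx->queue->queuedata;

	atomic_dec(&dev->inflight);
}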
diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c52..48ebe6be07b7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); | |||
157 | * Caveat: | 157 | * Caveat: |
158 | * The driver that does this *must* be able to deal appropriately | 158 | * The driver that does this *must* be able to deal appropriately |
159 | * with buffers in "highmemory". This can be accomplished by either calling | 159 | * with buffers in "highmemory". This can be accomplished by either calling |
160 | * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling | 160 | * kmap_atomic() to get a temporary kernel mapping, or by calling |
161 | * blk_queue_bounce() to create a buffer in normal memory. | 161 | * blk_queue_bounce() to create a buffer in normal memory. |
162 | **/ | 162 | **/ |
163 | void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | 163 | void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) |
diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe3..3a2f3c96f367 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c | |||
@@ -11,8 +11,6 @@ | |||
11 | #include "blk-mq.h" | 11 | #include "blk-mq.h" |
12 | #include "blk.h" | 12 | #include "blk.h" |
13 | 13 | ||
14 | #define BLK_RQ_STAT_BATCH 64 | ||
15 | |||
16 | struct blk_queue_stats { | 14 | struct blk_queue_stats { |
17 | struct list_head callbacks; | 15 | struct list_head callbacks; |
18 | spinlock_t lock; | 16 | spinlock_t lock; |
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) | |||
23 | { | 21 | { |
24 | stat->min = -1ULL; | 22 | stat->min = -1ULL; |
25 | stat->max = stat->nr_samples = stat->mean = 0; | 23 | stat->max = stat->nr_samples = stat->mean = 0; |
26 | stat->batch = stat->nr_batch = 0; | 24 | stat->batch = 0; |
27 | } | ||
28 | |||
29 | static void blk_stat_flush_batch(struct blk_rq_stat *stat) | ||
30 | { | ||
31 | const s32 nr_batch = READ_ONCE(stat->nr_batch); | ||
32 | const s32 nr_samples = READ_ONCE(stat->nr_samples); | ||
33 | |||
34 | if (!nr_batch) | ||
35 | return; | ||
36 | if (!nr_samples) | ||
37 | stat->mean = div64_s64(stat->batch, nr_batch); | ||
38 | else { | ||
39 | stat->mean = div64_s64((stat->mean * nr_samples) + | ||
40 | stat->batch, | ||
41 | nr_batch + nr_samples); | ||
42 | } | ||
43 | |||
44 | stat->nr_samples += nr_batch; | ||
45 | stat->nr_batch = stat->batch = 0; | ||
46 | } | 25 | } |
47 | 26 | ||
27 | /* src is a per-cpu stat, mean isn't initialized */ | ||
48 | static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) | 28 | static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) |
49 | { | 29 | { |
50 | blk_stat_flush_batch(src); | ||
51 | |||
52 | if (!src->nr_samples) | 30 | if (!src->nr_samples) |
53 | return; | 31 | return; |
54 | 32 | ||
55 | dst->min = min(dst->min, src->min); | 33 | dst->min = min(dst->min, src->min); |
56 | dst->max = max(dst->max, src->max); | 34 | dst->max = max(dst->max, src->max); |
57 | 35 | ||
58 | if (!dst->nr_samples) | 36 | dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, |
59 | dst->mean = src->mean; | 37 | dst->nr_samples + src->nr_samples); |
60 | else { | 38 | |
61 | dst->mean = div64_s64((src->mean * src->nr_samples) + | ||
62 | (dst->mean * dst->nr_samples), | ||
63 | dst->nr_samples + src->nr_samples); | ||
64 | } | ||
65 | dst->nr_samples += src->nr_samples; | 39 | dst->nr_samples += src->nr_samples; |
66 | } | 40 | } |
67 | 41 | ||
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) | |||
69 | { | 43 | { |
70 | stat->min = min(stat->min, value); | 44 | stat->min = min(stat->min, value); |
71 | stat->max = max(stat->max, value); | 45 | stat->max = max(stat->max, value); |
72 | |||
73 | if (stat->batch + value < stat->batch || | ||
74 | stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) | ||
75 | blk_stat_flush_batch(stat); | ||
76 | |||
77 | stat->batch += value; | 46 | stat->batch += value; |
78 | stat->nr_batch++; | 47 | stat->nr_samples++; |
79 | } | 48 | } |
80 | 49 | ||
81 | void blk_stat_add(struct request *rq) | 50 | void blk_stat_add(struct request *rq) |
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) | |||
84 | struct blk_stat_callback *cb; | 53 | struct blk_stat_callback *cb; |
85 | struct blk_rq_stat *stat; | 54 | struct blk_rq_stat *stat; |
86 | int bucket; | 55 | int bucket; |
87 | s64 now, value; | 56 | u64 now, value; |
88 | 57 | ||
89 | now = __blk_stat_time(ktime_to_ns(ktime_get())); | 58 | now = __blk_stat_time(ktime_to_ns(ktime_get())); |
90 | if (now < blk_stat_time(&rq->issue_stat)) | 59 | if (now < blk_stat_time(&rq->issue_stat)) |
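With batching gone, a per-cpu blk_rq_stat only carries a raw sum in ->batch plus a sample count, and blk_stat_sum() folds it into the aggregate with a single division. A worked example of that merge, with numbers invented for illustration — dst holding mean=10 over 2 samples, src holding batch=40 over 2 samples:

dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,	/* 40 + 10*2 */
		    dst->nr_samples + src->nr_samples);		/* / (2+2) = 15 */
dst->nr_samples += src->nr_samples;				/* now 4 */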
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8631763866c6..96ad32623427 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -2113,8 +2113,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) | |||
2113 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) | 2113 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) |
2114 | { | 2114 | { |
2115 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | 2115 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW |
2116 | if (bio->bi_css) | 2116 | if (bio->bi_css) { |
2117 | if (bio->bi_cg_private) | ||
2118 | blkg_put(tg_to_blkg(bio->bi_cg_private)); | ||
2117 | bio->bi_cg_private = tg; | 2119 | bio->bi_cg_private = tg; |
2120 | blkg_get(tg_to_blkg(tg)); | ||
2121 | } | ||
2118 | blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); | 2122 | blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); |
2119 | #endif | 2123 | #endif |
2120 | } | 2124 | } |
@@ -2284,8 +2288,10 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
2284 | 2288 | ||
2285 | start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; | 2289 | start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; |
2286 | finish_time = __blk_stat_time(finish_time_ns) >> 10; | 2290 | finish_time = __blk_stat_time(finish_time_ns) >> 10; |
2287 | if (!start_time || finish_time <= start_time) | 2291 | if (!start_time || finish_time <= start_time) { |
2292 | blkg_put(tg_to_blkg(tg)); | ||
2288 | return; | 2293 | return; |
2294 | } | ||
2289 | 2295 | ||
2290 | lat = finish_time - start_time; | 2296 | lat = finish_time - start_time; |
2291 | /* this is only for bio based driver */ | 2297 | /* this is only for bio based driver */ |
@@ -2315,6 +2321,8 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
2315 | tg->bio_cnt /= 2; | 2321 | tg->bio_cnt /= 2; |
2316 | tg->bad_bio_cnt /= 2; | 2322 | tg->bad_bio_cnt /= 2; |
2317 | } | 2323 | } |
2324 | |||
2325 | blkg_put(tg_to_blkg(tg)); | ||
2318 | } | 2326 | } |
2319 | #endif | 2327 | #endif |
2320 | 2328 | ||
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83bb0900..764ecf9aeb30 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) | |||
134 | struct request *rq, *tmp; | 134 | struct request *rq, *tmp; |
135 | int next_set = 0; | 135 | int next_set = 0; |
136 | 136 | ||
137 | if (blk_queue_enter(q, true)) | ||
138 | return; | ||
139 | spin_lock_irqsave(q->queue_lock, flags); | 137 | spin_lock_irqsave(q->queue_lock, flags); |
140 | 138 | ||
141 | list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) | 139 | list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) |
@@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) | |||
145 | mod_timer(&q->timeout, round_jiffies_up(next)); | 143 | mod_timer(&q->timeout, round_jiffies_up(next)); |
146 | 144 | ||
147 | spin_unlock_irqrestore(q->queue_lock, flags); | 145 | spin_unlock_irqrestore(q->queue_lock, flags); |
148 | blk_queue_exit(q); | ||
149 | } | 146 | } |
150 | 147 | ||
151 | /** | 148 | /** |
@@ -211,7 +208,7 @@ void blk_add_timer(struct request *req) | |||
211 | if (!req->timeout) | 208 | if (!req->timeout) |
212 | req->timeout = q->rq_timeout; | 209 | req->timeout = q->rq_timeout; |
213 | 210 | ||
214 | req->deadline = jiffies + req->timeout; | 211 | WRITE_ONCE(req->deadline, jiffies + req->timeout); |
215 | 212 | ||
216 | /* | 213 | /* |
217 | * Only the non-mq case needs to add the request to a protected list. | 214 | * Only the non-mq case needs to add the request to a protected list. |
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index d822530e6aea..b252da0e4c11 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c | |||
@@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) | |||
654 | } | 654 | } |
655 | 655 | ||
656 | /* | 656 | /* |
657 | * Disable wbt, if enabled by default. Only called from CFQ. | 657 | * Disable wbt, if enabled by default. |
658 | */ | 658 | */ |
659 | void wbt_disable_default(struct request_queue *q) | 659 | void wbt_disable_default(struct request_queue *q) |
660 | { | 660 | { |
diff --git a/block/blk.h b/block/blk.h index 85be8b232b37..3f1446937aec 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -123,8 +123,15 @@ void blk_account_io_done(struct request *req); | |||
123 | * Internal atomic flags for request handling | 123 | * Internal atomic flags for request handling |
124 | */ | 124 | */ |
125 | enum rq_atomic_flags { | 125 | enum rq_atomic_flags { |
126 | /* | ||
127 | * Keep these two bits first - not because we depend on the | ||
128 | * value of them, but we do depend on them being in the same | ||
129 | * byte of storage to ensure ordering on writes. Keeping them | ||
130 | * first will achieve that nicely. | ||
131 | */ | ||
126 | REQ_ATOM_COMPLETE = 0, | 132 | REQ_ATOM_COMPLETE = 0, |
127 | REQ_ATOM_STARTED, | 133 | REQ_ATOM_STARTED, |
134 | |||
128 | REQ_ATOM_POLL_SLEPT, | 135 | REQ_ATOM_POLL_SLEPT, |
129 | }; | 136 | }; |
130 | 137 | ||
@@ -149,45 +156,6 @@ static inline void blk_clear_rq_complete(struct request *rq) | |||
149 | 156 | ||
150 | void blk_insert_flush(struct request *rq); | 157 | void blk_insert_flush(struct request *rq); |
151 | 158 | ||
152 | static inline struct request *__elv_next_request(struct request_queue *q) | ||
153 | { | ||
154 | struct request *rq; | ||
155 | struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); | ||
156 | |||
157 | WARN_ON_ONCE(q->mq_ops); | ||
158 | |||
159 | while (1) { | ||
160 | if (!list_empty(&q->queue_head)) { | ||
161 | rq = list_entry_rq(q->queue_head.next); | ||
162 | return rq; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Flush request is running and flush request isn't queueable | ||
167 | * in the drive, we can hold the queue till flush request is | ||
168 | * finished. Even we don't do this, driver can't dispatch next | ||
169 | * requests and will requeue them. And this can improve | ||
170 | * throughput too. For example, we have request flush1, write1, | ||
171 | * flush 2. flush1 is dispatched, then queue is hold, write1 | ||
172 | * isn't inserted to queue. After flush1 is finished, flush2 | ||
173 | * will be dispatched. Since disk cache is already clean, | ||
174 | * flush2 will be finished very soon, so looks like flush2 is | ||
175 | * folded to flush1. | ||
176 | * Since the queue is hold, a flag is set to indicate the queue | ||
177 | * should be restarted later. Please see flush_end_io() for | ||
178 | * details. | ||
179 | */ | ||
180 | if (fq->flush_pending_idx != fq->flush_running_idx && | ||
181 | !queue_flush_queueable(q)) { | ||
182 | fq->flush_queue_delayed = 1; | ||
183 | return NULL; | ||
184 | } | ||
185 | if (unlikely(blk_queue_bypass(q)) || | ||
186 | !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) | ||
187 | return NULL; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | static inline void elv_activate_rq(struct request_queue *q, struct request *rq) | 159 | static inline void elv_activate_rq(struct request_queue *q, struct request *rq) |
192 | { | 160 | { |
193 | struct elevator_queue *e = q->elevator; | 161 | struct elevator_queue *e = q->elevator; |
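The new comment in rq_atomic_flags above is what the BUILD_BUG_ON() added to blk_mq_init() earlier in this diff pins down at compile time: with REQ_ATOM_COMPLETE = 0 and REQ_ATOM_STARTED = 1, both flags live in byte 0 of rq->atomic_flags.

/* REQ_ATOM_COMPLETE / BITS_PER_BYTE == 0 / 8 == 0
 * REQ_ATOM_STARTED  / BITS_PER_BYTE == 1 / 8 == 0  -> same byte of storage,
 * so writes to the two bits are ordered as the comment requires; reordering
 * the enum so the two end up in different bytes would now break the build
 * rather than the ordering assumption. */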
diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7..452f94f1c5d4 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) | |||
137 | 137 | ||
138 | static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, | 138 | static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, |
139 | struct sg_io_v4 *hdr, struct bsg_device *bd, | 139 | struct sg_io_v4 *hdr, struct bsg_device *bd, |
140 | fmode_t has_write_perm) | 140 | fmode_t mode) |
141 | { | 141 | { |
142 | struct scsi_request *req = scsi_req(rq); | 142 | struct scsi_request *req = scsi_req(rq); |
143 | 143 | ||
@@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, | |||
152 | return -EFAULT; | 152 | return -EFAULT; |
153 | 153 | ||
154 | if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { | 154 | if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { |
155 | if (blk_verify_command(req->cmd, has_write_perm)) | 155 | if (blk_verify_command(req->cmd, mode)) |
156 | return -EPERM; | 156 | return -EPERM; |
157 | } else if (!capable(CAP_SYS_RAWIO)) | 157 | } else if (!capable(CAP_SYS_RAWIO)) |
158 | return -EPERM; | 158 | return -EPERM; |
@@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) | |||
206 | * map sg_io_v4 to a request. | 206 | * map sg_io_v4 to a request. |
207 | */ | 207 | */ |
208 | static struct request * | 208 | static struct request * |
209 | bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) | 209 | bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) |
210 | { | 210 | { |
211 | struct request_queue *q = bd->queue; | 211 | struct request_queue *q = bd->queue; |
212 | struct request *rq, *next_rq = NULL; | 212 | struct request *rq, *next_rq = NULL; |
@@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) | |||
237 | if (IS_ERR(rq)) | 237 | if (IS_ERR(rq)) |
238 | return rq; | 238 | return rq; |
239 | 239 | ||
240 | ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); | 240 | ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); |
241 | if (ret) | 241 | if (ret) |
242 | goto out; | 242 | goto out; |
243 | 243 | ||
@@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
587 | } | 587 | } |
588 | 588 | ||
589 | static int __bsg_write(struct bsg_device *bd, const char __user *buf, | 589 | static int __bsg_write(struct bsg_device *bd, const char __user *buf, |
590 | size_t count, ssize_t *bytes_written, | 590 | size_t count, ssize_t *bytes_written, fmode_t mode) |
591 | fmode_t has_write_perm) | ||
592 | { | 591 | { |
593 | struct bsg_command *bc; | 592 | struct bsg_command *bc; |
594 | struct request *rq; | 593 | struct request *rq; |
@@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, | |||
619 | /* | 618 | /* |
620 | * get a request, fill in the blanks, and add to request queue | 619 | * get a request, fill in the blanks, and add to request queue |
621 | */ | 620 | */ |
622 | rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); | 621 | rq = bsg_map_hdr(bd, &bc->hdr, mode); |
623 | if (IS_ERR(rq)) { | 622 | if (IS_ERR(rq)) { |
624 | ret = PTR_ERR(rq); | 623 | ret = PTR_ERR(rq); |
625 | rq = NULL; | 624 | rq = NULL; |
@@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | |||
655 | bsg_set_block(bd, file); | 654 | bsg_set_block(bd, file); |
656 | 655 | ||
657 | bytes_written = 0; | 656 | bytes_written = 0; |
658 | ret = __bsg_write(bd, buf, count, &bytes_written, | 657 | ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); |
659 | file->f_mode & FMODE_WRITE); | ||
660 | 658 | ||
661 | *ppos = bytes_written; | 659 | *ppos = bytes_written; |
662 | 660 | ||
@@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
915 | if (copy_from_user(&hdr, uarg, sizeof(hdr))) | 913 | if (copy_from_user(&hdr, uarg, sizeof(hdr))) |
916 | return -EFAULT; | 914 | return -EFAULT; |
917 | 915 | ||
918 | rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); | 916 | rq = bsg_map_hdr(bd, &hdr, file->f_mode); |
919 | if (IS_ERR(rq)) | 917 | if (IS_ERR(rq)) |
920 | return PTR_ERR(rq); | 918 | return PTR_ERR(rq); |
921 | 919 | ||
diff --git a/block/elevator.c b/block/elevator.c index 153926a90901..7bda083d5968 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -83,12 +83,25 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) | |||
83 | } | 83 | } |
84 | EXPORT_SYMBOL(elv_bio_merge_ok); | 84 | EXPORT_SYMBOL(elv_bio_merge_ok); |
85 | 85 | ||
86 | static struct elevator_type *elevator_find(const char *name) | 86 | static bool elevator_match(const struct elevator_type *e, const char *name) |
87 | { | ||
88 | if (!strcmp(e->elevator_name, name)) | ||
89 | return true; | ||
90 | if (e->elevator_alias && !strcmp(e->elevator_alias, name)) | ||
91 | return true; | ||
92 | |||
93 | return false; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Return scheduler with name 'name' and with matching 'mq' capability | ||
98 | */ | ||
99 | static struct elevator_type *elevator_find(const char *name, bool mq) | ||
87 | { | 100 | { |
88 | struct elevator_type *e; | 101 | struct elevator_type *e; |
89 | 102 | ||
90 | list_for_each_entry(e, &elv_list, list) { | 103 | list_for_each_entry(e, &elv_list, list) { |
91 | if (!strcmp(e->elevator_name, name)) | 104 | if (elevator_match(e, name) && (mq == e->uses_mq)) |
92 | return e; | 105 | return e; |
93 | } | 106 | } |
94 | 107 | ||
@@ -100,25 +113,25 @@ static void elevator_put(struct elevator_type *e) | |||
100 | module_put(e->elevator_owner); | 113 | module_put(e->elevator_owner); |
101 | } | 114 | } |
102 | 115 | ||
103 | static struct elevator_type *elevator_get(const char *name, bool try_loading) | 116 | static struct elevator_type *elevator_get(struct request_queue *q, |
117 | const char *name, bool try_loading) | ||
104 | { | 118 | { |
105 | struct elevator_type *e; | 119 | struct elevator_type *e; |
106 | 120 | ||
107 | spin_lock(&elv_list_lock); | 121 | spin_lock(&elv_list_lock); |
108 | 122 | ||
109 | e = elevator_find(name); | 123 | e = elevator_find(name, q->mq_ops != NULL); |
110 | if (!e && try_loading) { | 124 | if (!e && try_loading) { |
111 | spin_unlock(&elv_list_lock); | 125 | spin_unlock(&elv_list_lock); |
112 | request_module("%s-iosched", name); | 126 | request_module("%s-iosched", name); |
113 | spin_lock(&elv_list_lock); | 127 | spin_lock(&elv_list_lock); |
114 | e = elevator_find(name); | 128 | e = elevator_find(name, q->mq_ops != NULL); |
115 | } | 129 | } |
116 | 130 | ||
117 | if (e && !try_module_get(e->elevator_owner)) | 131 | if (e && !try_module_get(e->elevator_owner)) |
118 | e = NULL; | 132 | e = NULL; |
119 | 133 | ||
120 | spin_unlock(&elv_list_lock); | 134 | spin_unlock(&elv_list_lock); |
121 | |||
122 | return e; | 135 | return e; |
123 | } | 136 | } |
124 | 137 | ||
@@ -144,8 +157,12 @@ void __init load_default_elevator_module(void) | |||
144 | if (!chosen_elevator[0]) | 157 | if (!chosen_elevator[0]) |
145 | return; | 158 | return; |
146 | 159 | ||
160 | /* | ||
161 | * Boot parameter is deprecated, we haven't supported that for MQ. | ||
162 | * Only look for non-mq schedulers from here. | ||
163 | */ | ||
147 | spin_lock(&elv_list_lock); | 164 | spin_lock(&elv_list_lock); |
148 | e = elevator_find(chosen_elevator); | 165 | e = elevator_find(chosen_elevator, false); |
149 | spin_unlock(&elv_list_lock); | 166 | spin_unlock(&elv_list_lock); |
150 | 167 | ||
151 | if (!e) | 168 | if (!e) |
@@ -202,7 +219,7 @@ int elevator_init(struct request_queue *q, char *name) | |||
202 | q->boundary_rq = NULL; | 219 | q->boundary_rq = NULL; |
203 | 220 | ||
204 | if (name) { | 221 | if (name) { |
205 | e = elevator_get(name, true); | 222 | e = elevator_get(q, name, true); |
206 | if (!e) | 223 | if (!e) |
207 | return -EINVAL; | 224 | return -EINVAL; |
208 | } | 225 | } |
@@ -214,7 +231,7 @@ int elevator_init(struct request_queue *q, char *name) | |||
214 | * allowed from async. | 231 | * allowed from async. |
215 | */ | 232 | */ |
216 | if (!e && !q->mq_ops && *chosen_elevator) { | 233 | if (!e && !q->mq_ops && *chosen_elevator) { |
217 | e = elevator_get(chosen_elevator, false); | 234 | e = elevator_get(q, chosen_elevator, false); |
218 | if (!e) | 235 | if (!e) |
219 | printk(KERN_ERR "I/O scheduler %s not found\n", | 236 | printk(KERN_ERR "I/O scheduler %s not found\n", |
220 | chosen_elevator); | 237 | chosen_elevator); |
@@ -229,17 +246,17 @@ int elevator_init(struct request_queue *q, char *name) | |||
229 | */ | 246 | */ |
230 | if (q->mq_ops) { | 247 | if (q->mq_ops) { |
231 | if (q->nr_hw_queues == 1) | 248 | if (q->nr_hw_queues == 1) |
232 | e = elevator_get("mq-deadline", false); | 249 | e = elevator_get(q, "mq-deadline", false); |
233 | if (!e) | 250 | if (!e) |
234 | return 0; | 251 | return 0; |
235 | } else | 252 | } else |
236 | e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); | 253 | e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); |
237 | 254 | ||
238 | if (!e) { | 255 | if (!e) { |
239 | printk(KERN_ERR | 256 | printk(KERN_ERR |
240 | "Default I/O scheduler not found. " \ | 257 | "Default I/O scheduler not found. " \ |
241 | "Using noop.\n"); | 258 | "Using noop.\n"); |
242 | e = elevator_get("noop", false); | 259 | e = elevator_get(q, "noop", false); |
243 | } | 260 | } |
244 | } | 261 | } |
245 | 262 | ||
@@ -905,7 +922,7 @@ int elv_register(struct elevator_type *e) | |||
905 | 922 | ||
906 | /* register, don't allow duplicate names */ | 923 | /* register, don't allow duplicate names */ |
907 | spin_lock(&elv_list_lock); | 924 | spin_lock(&elv_list_lock); |
908 | if (elevator_find(e->elevator_name)) { | 925 | if (elevator_find(e->elevator_name, e->uses_mq)) { |
909 | spin_unlock(&elv_list_lock); | 926 | spin_unlock(&elv_list_lock); |
910 | if (e->icq_cache) | 927 | if (e->icq_cache) |
911 | kmem_cache_destroy(e->icq_cache); | 928 | kmem_cache_destroy(e->icq_cache); |
@@ -915,9 +932,9 @@ int elv_register(struct elevator_type *e) | |||
915 | spin_unlock(&elv_list_lock); | 932 | spin_unlock(&elv_list_lock); |
916 | 933 | ||
917 | /* print pretty message */ | 934 | /* print pretty message */ |
918 | if (!strcmp(e->elevator_name, chosen_elevator) || | 935 | if (elevator_match(e, chosen_elevator) || |
919 | (!*chosen_elevator && | 936 | (!*chosen_elevator && |
920 | !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) | 937 | elevator_match(e, CONFIG_DEFAULT_IOSCHED))) |
921 | def = " (default)"; | 938 | def = " (default)"; |
922 | 939 | ||
923 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, | 940 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, |
@@ -1066,25 +1083,15 @@ static int __elevator_change(struct request_queue *q, const char *name) | |||
1066 | return elevator_switch(q, NULL); | 1083 | return elevator_switch(q, NULL); |
1067 | 1084 | ||
1068 | strlcpy(elevator_name, name, sizeof(elevator_name)); | 1085 | strlcpy(elevator_name, name, sizeof(elevator_name)); |
1069 | e = elevator_get(strstrip(elevator_name), true); | 1086 | e = elevator_get(q, strstrip(elevator_name), true); |
1070 | if (!e) | 1087 | if (!e) |
1071 | return -EINVAL; | 1088 | return -EINVAL; |
1072 | 1089 | ||
1073 | if (q->elevator && | 1090 | if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { |
1074 | !strcmp(elevator_name, q->elevator->type->elevator_name)) { | ||
1075 | elevator_put(e); | 1091 | elevator_put(e); |
1076 | return 0; | 1092 | return 0; |
1077 | } | 1093 | } |
1078 | 1094 | ||
1079 | if (!e->uses_mq && q->mq_ops) { | ||
1080 | elevator_put(e); | ||
1081 | return -EINVAL; | ||
1082 | } | ||
1083 | if (e->uses_mq && !q->mq_ops) { | ||
1084 | elevator_put(e); | ||
1085 | return -EINVAL; | ||
1086 | } | ||
1087 | |||
1088 | return elevator_switch(q, e); | 1095 | return elevator_switch(q, e); |
1089 | } | 1096 | } |
1090 | 1097 | ||
@@ -1116,9 +1123,10 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
1116 | struct elevator_queue *e = q->elevator; | 1123 | struct elevator_queue *e = q->elevator; |
1117 | struct elevator_type *elv = NULL; | 1124 | struct elevator_type *elv = NULL; |
1118 | struct elevator_type *__e; | 1125 | struct elevator_type *__e; |
1126 | bool uses_mq = q->mq_ops != NULL; | ||
1119 | int len = 0; | 1127 | int len = 0; |
1120 | 1128 | ||
1121 | if (!blk_queue_stackable(q)) | 1129 | if (!queue_is_rq_based(q)) |
1122 | return sprintf(name, "none\n"); | 1130 | return sprintf(name, "none\n"); |
1123 | 1131 | ||
1124 | if (!q->elevator) | 1132 | if (!q->elevator) |
@@ -1128,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
1128 | 1136 | ||
1129 | spin_lock(&elv_list_lock); | 1137 | spin_lock(&elv_list_lock); |
1130 | list_for_each_entry(__e, &elv_list, list) { | 1138 | list_for_each_entry(__e, &elv_list, list) { |
1131 | if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { | 1139 | if (elv && elevator_match(elv, __e->elevator_name) && |
1140 | (__e->uses_mq == uses_mq)) { | ||
1132 | len += sprintf(name+len, "[%s] ", elv->elevator_name); | 1141 | len += sprintf(name+len, "[%s] ", elv->elevator_name); |
1133 | continue; | 1142 | continue; |
1134 | } | 1143 | } |
diff --git a/block/genhd.c b/block/genhd.c index 630c0da6cfcf..c2223f12a805 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -588,6 +588,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) | |||
588 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | 588 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); |
589 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | 589 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); |
590 | 590 | ||
591 | if (disk->flags & GENHD_FL_HIDDEN) { | ||
592 | dev_set_uevent_suppress(ddev, 0); | ||
593 | return; | ||
594 | } | ||
595 | |||
591 | /* No minors to use for partitions */ | 596 | /* No minors to use for partitions */ |
592 | if (!disk_part_scan_enabled(disk)) | 597 | if (!disk_part_scan_enabled(disk)) |
593 | goto exit; | 598 | goto exit; |
@@ -616,6 +621,11 @@ exit: | |||
616 | while ((part = disk_part_iter_next(&piter))) | 621 | while ((part = disk_part_iter_next(&piter))) |
617 | kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); | 622 | kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); |
618 | disk_part_iter_exit(&piter); | 623 | disk_part_iter_exit(&piter); |
624 | |||
625 | err = sysfs_create_link(&ddev->kobj, | ||
626 | &disk->queue->backing_dev_info->dev->kobj, | ||
627 | "bdi"); | ||
628 | WARN_ON(err); | ||
619 | } | 629 | } |
620 | 630 | ||
621 | /** | 631 | /** |
@@ -630,7 +640,6 @@ exit: | |||
630 | */ | 640 | */ |
631 | void device_add_disk(struct device *parent, struct gendisk *disk) | 641 | void device_add_disk(struct device *parent, struct gendisk *disk) |
632 | { | 642 | { |
633 | struct backing_dev_info *bdi; | ||
634 | dev_t devt; | 643 | dev_t devt; |
635 | int retval; | 644 | int retval; |
636 | 645 | ||
@@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
639 | * parameters make sense. | 648 | * parameters make sense. |
640 | */ | 649 | */ |
641 | WARN_ON(disk->minors && !(disk->major || disk->first_minor)); | 650 | WARN_ON(disk->minors && !(disk->major || disk->first_minor)); |
642 | WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); | 651 | WARN_ON(!disk->minors && |
652 | !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); | ||
643 | 653 | ||
644 | disk->flags |= GENHD_FL_UP; | 654 | disk->flags |= GENHD_FL_UP; |
645 | 655 | ||
@@ -648,22 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
648 | WARN_ON(1); | 658 | WARN_ON(1); |
649 | return; | 659 | return; |
650 | } | 660 | } |
651 | disk_to_dev(disk)->devt = devt; | ||
652 | |||
653 | /* ->major and ->first_minor aren't supposed to be | ||
654 | * dereferenced from here on, but set them just in case. | ||
655 | */ | ||
656 | disk->major = MAJOR(devt); | 661 | disk->major = MAJOR(devt); |
657 | disk->first_minor = MINOR(devt); | 662 | disk->first_minor = MINOR(devt); |
658 | 663 | ||
659 | disk_alloc_events(disk); | 664 | disk_alloc_events(disk); |
660 | 665 | ||
661 | /* Register BDI before referencing it from bdev */ | 666 | if (disk->flags & GENHD_FL_HIDDEN) { |
662 | bdi = disk->queue->backing_dev_info; | 667 | /* |
663 | bdi_register_owner(bdi, disk_to_dev(disk)); | 668 | * Don't let hidden disks show up in /proc/partitions, |
664 | 669 | * and don't bother scanning for partitions either. | |
665 | blk_register_region(disk_devt(disk), disk->minors, NULL, | 670 | */ |
666 | exact_match, exact_lock, disk); | 671 | disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; |
672 | disk->flags |= GENHD_FL_NO_PART_SCAN; | ||
673 | } else { | ||
674 | /* Register BDI before referencing it from bdev */ | ||
675 | disk_to_dev(disk)->devt = devt; | ||
676 | bdi_register_owner(disk->queue->backing_dev_info, | ||
677 | disk_to_dev(disk)); | ||
678 | blk_register_region(disk_devt(disk), disk->minors, NULL, | ||
679 | exact_match, exact_lock, disk); | ||
680 | } | ||
667 | register_disk(parent, disk); | 681 | register_disk(parent, disk); |
668 | blk_register_queue(disk); | 682 | blk_register_queue(disk); |
669 | 683 | ||
@@ -673,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
673 | */ | 687 | */ |
674 | WARN_ON_ONCE(!blk_get_queue(disk->queue)); | 688 | WARN_ON_ONCE(!blk_get_queue(disk->queue)); |
675 | 689 | ||
676 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | ||
677 | "bdi"); | ||
678 | WARN_ON(retval); | ||
679 | |||
680 | disk_add_events(disk); | 690 | disk_add_events(disk); |
681 | blk_integrity_add(disk); | 691 | blk_integrity_add(disk); |
682 | } | 692 | } |
@@ -705,7 +715,8 @@ void del_gendisk(struct gendisk *disk) | |||
705 | set_capacity(disk, 0); | 715 | set_capacity(disk, 0); |
706 | disk->flags &= ~GENHD_FL_UP; | 716 | disk->flags &= ~GENHD_FL_UP; |
707 | 717 | ||
708 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | 718 | if (!(disk->flags & GENHD_FL_HIDDEN)) |
719 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | ||
709 | if (disk->queue) { | 720 | if (disk->queue) { |
710 | /* | 721 | /* |
711 | * Unregister bdi before releasing device numbers (as they can | 722 | * Unregister bdi before releasing device numbers (as they can |
@@ -716,13 +727,15 @@ void del_gendisk(struct gendisk *disk) | |||
716 | } else { | 727 | } else { |
717 | WARN_ON(1); | 728 | WARN_ON(1); |
718 | } | 729 | } |
719 | blk_unregister_region(disk_devt(disk), disk->minors); | ||
720 | 730 | ||
721 | part_stat_set_all(&disk->part0, 0); | 731 | if (!(disk->flags & GENHD_FL_HIDDEN)) |
722 | disk->part0.stamp = 0; | 732 | blk_unregister_region(disk_devt(disk), disk->minors); |
723 | 733 | ||
724 | kobject_put(disk->part0.holder_dir); | 734 | kobject_put(disk->part0.holder_dir); |
725 | kobject_put(disk->slave_dir); | 735 | kobject_put(disk->slave_dir); |
736 | |||
737 | part_stat_set_all(&disk->part0, 0); | ||
738 | disk->part0.stamp = 0; | ||
726 | if (!sysfs_deprecated) | 739 | if (!sysfs_deprecated) |
727 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | 740 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); |
728 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); | 741 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); |
@@ -785,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) | |||
785 | spin_unlock_bh(&ext_devt_lock); | 798 | spin_unlock_bh(&ext_devt_lock); |
786 | } | 799 | } |
787 | 800 | ||
801 | if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { | ||
802 | put_disk(disk); | ||
803 | disk = NULL; | ||
804 | } | ||
788 | return disk; | 805 | return disk; |
789 | } | 806 | } |
790 | EXPORT_SYMBOL(get_gendisk); | 807 | EXPORT_SYMBOL(get_gendisk); |
@@ -1028,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, | |||
1028 | (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); | 1045 | (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); |
1029 | } | 1046 | } |
1030 | 1047 | ||
1048 | static ssize_t disk_hidden_show(struct device *dev, | ||
1049 | struct device_attribute *attr, char *buf) | ||
1050 | { | ||
1051 | struct gendisk *disk = dev_to_disk(dev); | ||
1052 | |||
1053 | return sprintf(buf, "%d\n", | ||
1054 | (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); | ||
1055 | } | ||
1056 | |||
1031 | static ssize_t disk_ro_show(struct device *dev, | 1057 | static ssize_t disk_ro_show(struct device *dev, |
1032 | struct device_attribute *attr, char *buf) | 1058 | struct device_attribute *attr, char *buf) |
1033 | { | 1059 | { |
@@ -1065,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, | |||
1065 | static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); | 1091 | static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); |
1066 | static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); | 1092 | static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); |
1067 | static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); | 1093 | static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); |
1094 | static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); | ||
1068 | static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); | 1095 | static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); |
1069 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); | 1096 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); |
1070 | static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); | 1097 | static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); |
@@ -1089,6 +1116,7 @@ static struct attribute *disk_attrs[] = { | |||
1089 | &dev_attr_range.attr, | 1116 | &dev_attr_range.attr, |
1090 | &dev_attr_ext_range.attr, | 1117 | &dev_attr_ext_range.attr, |
1091 | &dev_attr_removable.attr, | 1118 | &dev_attr_removable.attr, |
1119 | &dev_attr_hidden.attr, | ||
1092 | &dev_attr_ro.attr, | 1120 | &dev_attr_ro.attr, |
1093 | &dev_attr_size.attr, | 1121 | &dev_attr_size.attr, |
1094 | &dev_attr_alignment_offset.attr, | 1122 | &dev_attr_alignment_offset.attr, |
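For hidden gendisks the registration path above skips the device-number region, /proc/partitions visibility, partition scanning and the bdi symlink, and get_gendisk() refuses to hand such a disk out by devt, so it stays reachable only through whoever created it. A minimal sketch of how a driver might register one (roughly what the NVMe per-path device ends up doing; parent_dev, my_fops and my_queue are hypothetical driver objects):

static int my_add_hidden_disk(struct device *parent_dev,
			      const struct block_device_operations *my_fops,
			      struct request_queue *my_queue)
{
	struct gendisk *disk = alloc_disk_node(0, NUMA_NO_NODE);

	if (!disk)
		return -ENOMEM;

	disk->fops = my_fops;
	disk->queue = my_queue;
	disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "myhidden0");
	device_add_disk(parent_dev, disk);	/* no /dev node, no partitions */
	return 0;
}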
diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee67eed..1668506d8ed8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, | |||
202 | { | 202 | { |
203 | uint64_t range[2]; | 203 | uint64_t range[2]; |
204 | uint64_t start, len; | 204 | uint64_t start, len; |
205 | struct request_queue *q = bdev_get_queue(bdev); | ||
206 | struct address_space *mapping = bdev->bd_inode->i_mapping; | ||
207 | |||
205 | 208 | ||
206 | if (!(mode & FMODE_WRITE)) | 209 | if (!(mode & FMODE_WRITE)) |
207 | return -EBADF; | 210 | return -EBADF; |
208 | 211 | ||
212 | if (!blk_queue_discard(q)) | ||
213 | return -EOPNOTSUPP; | ||
214 | |||
209 | if (copy_from_user(range, (void __user *)arg, sizeof(range))) | 215 | if (copy_from_user(range, (void __user *)arg, sizeof(range))) |
210 | return -EFAULT; | 216 | return -EFAULT; |
211 | 217 | ||
@@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, | |||
216 | return -EINVAL; | 222 | return -EINVAL; |
217 | if (len & 511) | 223 | if (len & 511) |
218 | return -EINVAL; | 224 | return -EINVAL; |
219 | start >>= 9; | ||
220 | len >>= 9; | ||
221 | 225 | ||
222 | if (start + len > (i_size_read(bdev->bd_inode) >> 9)) | 226 | if (start + len > i_size_read(bdev->bd_inode)) |
223 | return -EINVAL; | 227 | return -EINVAL; |
224 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); | 228 | truncate_inode_pages_range(mapping, start, start + len); |
229 | return blkdev_issue_discard(bdev, start >> 9, len >> 9, | ||
230 | GFP_KERNEL, flags); | ||
225 | } | 231 | } |
226 | 232 | ||
227 | static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, | 233 | static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, |
@@ -437,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, | |||
437 | { | 443 | { |
438 | int ret, n; | 444 | int ret, n; |
439 | 445 | ||
446 | if (!capable(CAP_SYS_ADMIN)) | ||
447 | return -EACCES; | ||
448 | |||
440 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 449 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
441 | if (!is_unrecognized_ioctl(ret)) | 450 | if (!is_unrecognized_ioctl(ret)) |
442 | return ret; | 451 | return ret; |
443 | if (!capable(CAP_SYS_ADMIN)) | ||
444 | return -EACCES; | ||
445 | if (get_user(n, (int __user *)arg)) | 452 | if (get_user(n, (int __user *)arg)) |
446 | return -EFAULT; | 453 | return -EFAULT; |
447 | set_device_ro(bdev, n); | 454 | set_device_ro(bdev, n); |
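BLKDISCARD keeps its user-visible contract — a byte offset/length pair, both 512-byte aligned — but the covered page-cache range is now truncated before the discard is issued, and a queue without discard support is rejected up front with -EOPNOTSUPP rather than deep inside blkdev_issue_discard(). For reference, a minimal userspace sketch of the call; fd, start and len are assumed to come from the caller:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKDISCARD */

/* fd: open block device; start/len: byte range, multiples of 512 */
static int discard_range(int fd, uint64_t start, uint64_t len)
{
	uint64_t range[2] = { start, len };

	if (ioctl(fd, BLKDISCARD, range) < 0) {
		perror("BLKDISCARD");	/* e.g. EOPNOTSUPP, EINVAL */
		return -1;
	}
	return 0;
}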
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab82105b..b4df317c2916 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c | |||
@@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, | |||
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Try again in case a token was freed before we got on the wait | 543 | * Try again in case a token was freed before we got on the wait |
544 | * queue. | 544 | * queue. The waker may have already removed the entry from the |
545 | * wait queue, but list_del_init() is okay with that. | ||
545 | */ | 546 | */ |
546 | nr = __sbitmap_queue_get(domain_tokens); | 547 | nr = __sbitmap_queue_get(domain_tokens); |
548 | if (nr >= 0) { | ||
549 | unsigned long flags; | ||
550 | |||
551 | spin_lock_irqsave(&ws->wait.lock, flags); | ||
552 | list_del_init(&wait->entry); | ||
553 | spin_unlock_irqrestore(&ws->wait.lock, flags); | ||
554 | } | ||
547 | } | 555 | } |
548 | return nr; | 556 | return nr; |
549 | } | 557 | } |
@@ -641,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) | |||
641 | if (!list_empty_careful(&khd->rqs[i])) | 649 | if (!list_empty_careful(&khd->rqs[i])) |
642 | return true; | 650 | return true; |
643 | } | 651 | } |
644 | return false; | 652 | return sbitmap_any_bit_set(&hctx->ctx_map); |
645 | } | 653 | } |
646 | 654 | ||
647 | #define KYBER_LAT_SHOW_STORE(op) \ | 655 | #define KYBER_LAT_SHOW_STORE(op) \ |
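The kyber fix above leans on list_del_init() being idempotent: if the waker already pulled the entry off the wait queue, the entry points back at itself, so removing it a second time under ws->wait.lock is a harmless no-op instead of a list corruption (plain list_del() would poison the pointers and make the second removal blow up). A tiny standalone sketch of that property, outside kyber:

static void list_del_init_is_idempotent(void)
{
	LIST_HEAD(list);
	struct list_head entry;

	list_add(&entry, &list);
	list_del_init(&entry);	/* unlinked; entry now points at itself */
	list_del_init(&entry);	/* second call is a no-op, not a corruption */
}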
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad4331edd..0179e484ec98 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c | |||
@@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { | |||
657 | #endif | 657 | #endif |
658 | .elevator_attrs = deadline_attrs, | 658 | .elevator_attrs = deadline_attrs, |
659 | .elevator_name = "mq-deadline", | 659 | .elevator_name = "mq-deadline", |
660 | .elevator_alias = "deadline", | ||
660 | .elevator_owner = THIS_MODULE, | 661 | .elevator_owner = THIS_MODULE, |
661 | }; | 662 | }; |
662 | MODULE_ALIAS("mq-deadline-iosched"); | 663 | MODULE_ALIAS("mq-deadline-iosched"); |
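The elevator_alias added here is what the {name, mq} lookup rework in elevator.c keys on: elevator_match() accepts either the canonical name or the alias, and elevator_find() additionally requires uses_mq to match the queue, so writing "deadline" to /sys/block/<dev>/queue/scheduler now picks mq-deadline on a blk-mq queue and the legacy deadline elevator on a !mq one. A hypothetical scheduler exposing a legacy-compatible alias would follow the same pattern (all names below are made up):

static struct elevator_type my_mq_sched = {
	.elevator_name	= "mq-foo",
	.elevator_alias	= "foo",	/* the short name users already know */
	.uses_mq	= true,
	.elevator_owner	= THIS_MODULE,
	/* .ops.mq, .elevator_attrs etc. omitted from this sketch */
};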
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd85..edcfff974527 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) | |||
207 | __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); | 207 | __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); |
208 | } | 208 | } |
209 | 209 | ||
210 | int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) | 210 | int blk_verify_command(unsigned char *cmd, fmode_t mode) |
211 | { | 211 | { |
212 | struct blk_cmd_filter *filter = &blk_default_cmd_filter; | 212 | struct blk_cmd_filter *filter = &blk_default_cmd_filter; |
213 | 213 | ||
@@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) | |||
220 | return 0; | 220 | return 0; |
221 | 221 | ||
222 | /* Write-safe commands require a writable open */ | 222 | /* Write-safe commands require a writable open */ |
223 | if (test_bit(cmd[0], filter->write_ok) && has_write_perm) | 223 | if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) |
224 | return 0; | 224 | return 0; |
225 | 225 | ||
226 | return -EPERM; | 226 | return -EPERM; |
@@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, | |||
234 | 234 | ||
235 | if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) | 235 | if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) |
236 | return -EFAULT; | 236 | return -EFAULT; |
237 | if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) | 237 | if (blk_verify_command(req->cmd, mode)) |
238 | return -EPERM; | 238 | return -EPERM; |
239 | 239 | ||
240 | /* | 240 | /* |
@@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | |||
469 | if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) | 469 | if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) |
470 | goto error; | 470 | goto error; |
471 | 471 | ||
472 | err = blk_verify_command(req->cmd, mode & FMODE_WRITE); | 472 | err = blk_verify_command(req->cmd, mode); |
473 | if (err) | 473 | if (err) |
474 | goto error; | 474 | goto error; |
475 | 475 | ||
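blk_verify_command() now receives the whole fmode_t and tests FMODE_WRITE itself instead of a pre-digested boolean. A compact, self-contained sketch of that check; the flag values and bitmap layout are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

#define FMODE_READ  0x1u
#define FMODE_WRITE 0x2u

struct cmd_filter {
	bool read_ok[256];
	bool write_ok[256];
};

static int verify_command(const struct cmd_filter *f,
			  const unsigned char *cmd, unsigned int mode)
{
	if (f->read_ok[cmd[0]])
		return 0;
	/* write-safe commands additionally require a writable open */
	if (f->write_ok[cmd[0]] && (mode & FMODE_WRITE))
		return 0;
	return -1;	/* -EPERM in the real code */
}

int main(void)
{
	struct cmd_filter f = { 0 };
	unsigned char cdb[6] = { 0x15 };	/* e.g. MODE SELECT(6) */

	f.write_ok[0x15] = true;
	printf("read-only open:  %d\n", verify_command(&f, cdb, FMODE_READ));
	printf("read-write open: %d\n",
	       verify_command(&f, cdb, FMODE_READ | FMODE_WRITE));
	return 0;
}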
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 7b2df7a54d87..923b417eaf4c 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -68,9 +68,13 @@ config AMIGA_Z2RAM | |||
68 | To compile this driver as a module, choose M here: the | 68 | To compile this driver as a module, choose M here: the |
69 | module will be called z2ram. | 69 | module will be called z2ram. |
70 | 70 | ||
71 | config CDROM | ||
72 | tristate | ||
73 | |||
71 | config GDROM | 74 | config GDROM |
72 | tristate "SEGA Dreamcast GD-ROM drive" | 75 | tristate "SEGA Dreamcast GD-ROM drive" |
73 | depends on SH_DREAMCAST | 76 | depends on SH_DREAMCAST |
77 | select CDROM | ||
74 | select BLK_SCSI_REQUEST # only for the generic cdrom code | 78 | select BLK_SCSI_REQUEST # only for the generic cdrom code |
75 | help | 79 | help |
76 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a | 80 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a |
@@ -348,6 +352,7 @@ config BLK_DEV_RAM_DAX | |||
348 | config CDROM_PKTCDVD | 352 | config CDROM_PKTCDVD |
349 | tristate "Packet writing on CD/DVD media (DEPRECATED)" | 353 | tristate "Packet writing on CD/DVD media (DEPRECATED)" |
350 | depends on !UML | 354 | depends on !UML |
355 | select CDROM | ||
351 | select BLK_SCSI_REQUEST | 356 | select BLK_SCSI_REQUEST |
352 | help | 357 | help |
353 | Note: This driver is deprecated and will be removed from the | 358 | Note: This driver is deprecated and will be removed from the |
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2d7178f7754e..c1cf87718c2e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c | |||
@@ -60,7 +60,6 @@ struct brd_device { | |||
60 | /* | 60 | /* |
61 | * Look up and return a brd's page for a given sector. | 61 | * Look up and return a brd's page for a given sector. |
62 | */ | 62 | */ |
63 | static DEFINE_MUTEX(brd_mutex); | ||
64 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) | 63 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) |
65 | { | 64 | { |
66 | pgoff_t idx; | 65 | pgoff_t idx; |
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 74e03aa537ad..7033a4beda66 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c | |||
@@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) | |||
43 | int cipher_len; | 43 | int cipher_len; |
44 | int mode_len; | 44 | int mode_len; |
45 | char cms[LO_NAME_SIZE]; /* cipher-mode string */ | 45 | char cms[LO_NAME_SIZE]; /* cipher-mode string */ |
46 | char *cipher; | ||
47 | char *mode; | 46 | char *mode; |
48 | char *cmsp = cms; /* c-m string pointer */ | 47 | char *cmsp = cms; /* c-m string pointer */ |
49 | struct crypto_skcipher *tfm; | 48 | struct crypto_skcipher *tfm; |
@@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) | |||
56 | strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); | 55 | strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); |
57 | cms[LO_NAME_SIZE - 1] = 0; | 56 | cms[LO_NAME_SIZE - 1] = 0; |
58 | 57 | ||
59 | cipher = cmsp; | ||
60 | cipher_len = strcspn(cmsp, "-"); | 58 | cipher_len = strcspn(cmsp, "-"); |
61 | 59 | ||
62 | mode = cmsp + cipher_len; | 60 | mode = cmsp + cipher_len; |
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 85de67334695..bc8e61506968 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c | |||
@@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) | |||
476 | { | 476 | { |
477 | struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); | 477 | struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); |
478 | 478 | ||
479 | if (cmd->css) | ||
480 | css_put(cmd->css); | ||
479 | cmd->ret = ret; | 481 | cmd->ret = ret; |
480 | lo_rw_aio_do_completion(cmd); | 482 | lo_rw_aio_do_completion(cmd); |
481 | } | 483 | } |
@@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, | |||
535 | cmd->iocb.ki_filp = file; | 537 | cmd->iocb.ki_filp = file; |
536 | cmd->iocb.ki_complete = lo_rw_aio_complete; | 538 | cmd->iocb.ki_complete = lo_rw_aio_complete; |
537 | cmd->iocb.ki_flags = IOCB_DIRECT; | 539 | cmd->iocb.ki_flags = IOCB_DIRECT; |
540 | if (cmd->css) | ||
541 | kthread_associate_blkcg(cmd->css); | ||
538 | 542 | ||
539 | if (rw == WRITE) | 543 | if (rw == WRITE) |
540 | ret = call_write_iter(file, &cmd->iocb, &iter); | 544 | ret = call_write_iter(file, &cmd->iocb, &iter); |
@@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, | |||
542 | ret = call_read_iter(file, &cmd->iocb, &iter); | 546 | ret = call_read_iter(file, &cmd->iocb, &iter); |
543 | 547 | ||
544 | lo_rw_aio_do_completion(cmd); | 548 | lo_rw_aio_do_completion(cmd); |
549 | kthread_associate_blkcg(NULL); | ||
545 | 550 | ||
546 | if (ret != -EIOCBQUEUED) | 551 | if (ret != -EIOCBQUEUED) |
547 | cmd->iocb.ki_complete(&cmd->iocb, ret, 0); | 552 | cmd->iocb.ki_complete(&cmd->iocb, ret, 0); |
@@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1686 | break; | 1691 | break; |
1687 | } | 1692 | } |
1688 | 1693 | ||
1694 | /* always use the first bio's css */ | ||
1695 | #ifdef CONFIG_BLK_CGROUP | ||
1696 | if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { | ||
1697 | cmd->css = cmd->rq->bio->bi_css; | ||
1698 | css_get(cmd->css); | ||
1699 | } else | ||
1700 | #endif | ||
1701 | cmd->css = NULL; | ||
1689 | kthread_queue_work(&lo->worker, &cmd->work); | 1702 | kthread_queue_work(&lo->worker, &cmd->work); |
1690 | 1703 | ||
1691 | return BLK_STS_OK; | 1704 | return BLK_STS_OK; |
diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 1f3956702993..0f45416e4fcf 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h | |||
@@ -72,6 +72,7 @@ struct loop_cmd { | |||
72 | long ret; | 72 | long ret; |
73 | struct kiocb iocb; | 73 | struct kiocb iocb; |
74 | struct bio_vec *bvec; | 74 | struct bio_vec *bvec; |
75 | struct cgroup_subsys_state *css; | ||
75 | }; | 76 | }; |
76 | 77 | ||
77 | /* Support for loadable transfer modules */ | 78 | /* Support for loadable transfer modules */ |
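The loop changes above pin the submitting bio's blkcg css across the async I/O: a reference is taken in loop_queue_rq(), the worker is associated with the css around the read/write, and the reference is dropped on completion. A rough userspace model of that get/associate/put lifetime, with css_get()/css_put()/kthread_associate_blkcg() reduced to stand-in helpers.

#include <stdio.h>

struct css { int refcount; };

static void css_get(struct css *c) { c->refcount++; }
static void css_put(struct css *c) { c->refcount--; }

static struct css *current_blkcg;	/* per-worker association, illustrative */

static void associate_blkcg(struct css *c) { current_blkcg = c; }

struct cmd {
	struct css *css;	/* NULL when no cgroup is attached */
};

static void complete_cmd(struct cmd *cmd)
{
	if (cmd->css)
		css_put(cmd->css);	/* reference taken at queue time */
}

static void do_aio(struct cmd *cmd)
{
	if (cmd->css)
		associate_blkcg(cmd->css);	/* charge I/O to the right cgroup */
	/* ... submit the read/write here ... */
	complete_cmd(cmd);
	associate_blkcg(NULL);
}

int main(void)
{
	struct css css = { .refcount = 1 };
	struct cmd cmd = { .css = &css };

	css_get(cmd.css);	/* what loop_queue_rq() does before queueing */
	do_aio(&cmd);
	printf("refcount back to %d\n", css.refcount);
	return 0;
}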
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4a3cfc7940de..b8af7352a18f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
@@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) | |||
887 | static bool mtip_pause_ncq(struct mtip_port *port, | 887 | static bool mtip_pause_ncq(struct mtip_port *port, |
888 | struct host_to_dev_fis *fis) | 888 | struct host_to_dev_fis *fis) |
889 | { | 889 | { |
890 | struct host_to_dev_fis *reply; | ||
891 | unsigned long task_file_data; | 890 | unsigned long task_file_data; |
892 | 891 | ||
893 | reply = port->rxfis + RX_FIS_D2H_REG; | ||
894 | task_file_data = readl(port->mmio+PORT_TFDATA); | 892 | task_file_data = readl(port->mmio+PORT_TFDATA); |
895 | |||
896 | if ((task_file_data & 1)) | 893 | if ((task_file_data & 1)) |
897 | return false; | 894 | return false; |
898 | 895 | ||
@@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, | |||
1020 | .opts = opts | 1017 | .opts = opts |
1021 | }; | 1018 | }; |
1022 | int rv = 0; | 1019 | int rv = 0; |
1023 | unsigned long start; | ||
1024 | 1020 | ||
1025 | /* Make sure the buffer is 8 byte aligned. This is asic specific. */ | 1021 | /* Make sure the buffer is 8 byte aligned. This is asic specific. */ |
1026 | if (buffer & 0x00000007) { | 1022 | if (buffer & 0x00000007) { |
@@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, | |||
1057 | /* Copy the command to the command table */ | 1053 | /* Copy the command to the command table */ |
1058 | memcpy(int_cmd->command, fis, fis_len*4); | 1054 | memcpy(int_cmd->command, fis, fis_len*4); |
1059 | 1055 | ||
1060 | start = jiffies; | ||
1061 | rq->timeout = timeout; | 1056 | rq->timeout = timeout; |
1062 | 1057 | ||
1063 | /* insert request and run queue */ | 1058 | /* insert request and run queue */ |
@@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd) | |||
3015 | { | 3010 | { |
3016 | int i; | 3011 | int i; |
3017 | int rv; | 3012 | int rv; |
3018 | unsigned int num_command_slots; | ||
3019 | unsigned long timeout, timetaken; | 3013 | unsigned long timeout, timetaken; |
3020 | 3014 | ||
3021 | dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; | 3015 | dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; |
@@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd) | |||
3025 | rv = -EIO; | 3019 | rv = -EIO; |
3026 | goto out1; | 3020 | goto out1; |
3027 | } | 3021 | } |
3028 | num_command_slots = dd->slot_groups * 32; | ||
3029 | 3022 | ||
3030 | hba_setup(dd); | 3023 | hba_setup(dd); |
3031 | 3024 | ||
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 9adfb5445f8d..5f2a4240a204 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c | |||
@@ -288,15 +288,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, | |||
288 | cmd->status = BLK_STS_TIMEOUT; | 288 | cmd->status = BLK_STS_TIMEOUT; |
289 | return BLK_EH_HANDLED; | 289 | return BLK_EH_HANDLED; |
290 | } | 290 | } |
291 | |||
292 | /* If we are waiting on our dead timer then we could get timeout | ||
293 | * callbacks for our request. For this we just want to reset the timer | ||
294 | * and let the queue side take care of everything. | ||
295 | */ | ||
296 | if (!completion_done(&cmd->send_complete)) { | ||
297 | nbd_config_put(nbd); | ||
298 | return BLK_EH_RESET_TIMER; | ||
299 | } | ||
300 | config = nbd->config; | 291 | config = nbd->config; |
301 | 292 | ||
302 | if (config->num_connections > 1) { | 293 | if (config->num_connections > 1) { |
@@ -723,9 +714,9 @@ static int wait_for_reconnect(struct nbd_device *nbd) | |||
723 | return 0; | 714 | return 0; |
724 | if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) | 715 | if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) |
725 | return 0; | 716 | return 0; |
726 | wait_event_interruptible_timeout(config->conn_wait, | 717 | wait_event_timeout(config->conn_wait, |
727 | atomic_read(&config->live_connections), | 718 | atomic_read(&config->live_connections), |
728 | config->dead_conn_timeout); | 719 | config->dead_conn_timeout); |
729 | return atomic_read(&config->live_connections); | 720 | return atomic_read(&config->live_connections); |
730 | } | 721 | } |
731 | 722 | ||
@@ -740,6 +731,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) | |||
740 | if (!refcount_inc_not_zero(&nbd->config_refs)) { | 731 | if (!refcount_inc_not_zero(&nbd->config_refs)) { |
741 | dev_err_ratelimited(disk_to_dev(nbd->disk), | 732 | dev_err_ratelimited(disk_to_dev(nbd->disk), |
742 | "Socks array is empty\n"); | 733 | "Socks array is empty\n"); |
734 | blk_mq_start_request(req); | ||
743 | return -EINVAL; | 735 | return -EINVAL; |
744 | } | 736 | } |
745 | config = nbd->config; | 737 | config = nbd->config; |
@@ -748,6 +740,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) | |||
748 | dev_err_ratelimited(disk_to_dev(nbd->disk), | 740 | dev_err_ratelimited(disk_to_dev(nbd->disk), |
749 | "Attempted send on invalid socket\n"); | 741 | "Attempted send on invalid socket\n"); |
750 | nbd_config_put(nbd); | 742 | nbd_config_put(nbd); |
743 | blk_mq_start_request(req); | ||
751 | return -EINVAL; | 744 | return -EINVAL; |
752 | } | 745 | } |
753 | cmd->status = BLK_STS_OK; | 746 | cmd->status = BLK_STS_OK; |
@@ -771,6 +764,7 @@ again: | |||
771 | */ | 764 | */ |
772 | sock_shutdown(nbd); | 765 | sock_shutdown(nbd); |
773 | nbd_config_put(nbd); | 766 | nbd_config_put(nbd); |
767 | blk_mq_start_request(req); | ||
774 | return -EIO; | 768 | return -EIO; |
775 | } | 769 | } |
776 | goto again; | 770 | goto again; |
@@ -781,6 +775,7 @@ again: | |||
781 | * here so that it gets put _after_ the request that is already on the | 775 | * here so that it gets put _after_ the request that is already on the |
782 | * dispatch list. | 776 | * dispatch list. |
783 | */ | 777 | */ |
778 | blk_mq_start_request(req); | ||
784 | if (unlikely(nsock->pending && nsock->pending != req)) { | 779 | if (unlikely(nsock->pending && nsock->pending != req)) { |
785 | blk_mq_requeue_request(req, true); | 780 | blk_mq_requeue_request(req, true); |
786 | ret = 0; | 781 | ret = 0; |
@@ -793,10 +788,10 @@ again: | |||
793 | ret = nbd_send_cmd(nbd, cmd, index); | 788 | ret = nbd_send_cmd(nbd, cmd, index); |
794 | if (ret == -EAGAIN) { | 789 | if (ret == -EAGAIN) { |
795 | dev_err_ratelimited(disk_to_dev(nbd->disk), | 790 | dev_err_ratelimited(disk_to_dev(nbd->disk), |
796 | "Request send failed trying another connection\n"); | 791 | "Request send failed, requeueing\n"); |
797 | nbd_mark_nsock_dead(nbd, nsock, 1); | 792 | nbd_mark_nsock_dead(nbd, nsock, 1); |
798 | mutex_unlock(&nsock->tx_lock); | 793 | blk_mq_requeue_request(req, true); |
799 | goto again; | 794 | ret = 0; |
800 | } | 795 | } |
801 | out: | 796 | out: |
802 | mutex_unlock(&nsock->tx_lock); | 797 | mutex_unlock(&nsock->tx_lock); |
@@ -820,7 +815,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
820 | * done sending everything over the wire. | 815 | * done sending everything over the wire. |
821 | */ | 816 | */ |
822 | init_completion(&cmd->send_complete); | 817 | init_completion(&cmd->send_complete); |
823 | blk_mq_start_request(bd->rq); | ||
824 | 818 | ||
825 | /* We can be called directly from the user space process, which means we | 819 | /* We can be called directly from the user space process, which means we |
826 | * could possibly have signals pending so our sendmsg will fail. In | 820 | * could possibly have signals pending so our sendmsg will fail. In |
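The nbd hunks above move blk_mq_start_request() out of ->queue_rq() and into nbd_handle_cmd(), calling it on every early-exit path and turning -EAGAIN into a requeue on the same queue rather than a retry on another socket. A stripped-down sketch of that ordering; the request type and helpers are stand-ins for the blk-mq calls.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct request { bool started; bool requeued; };

static void start_request(struct request *rq)   { rq->started = true; }
static void requeue_request(struct request *rq) { rq->requeued = true; }

/* send_ret mimics the result of pushing the command over the socket */
static int handle_cmd(struct request *rq, bool sock_valid, int send_ret)
{
	if (!sock_valid) {
		start_request(rq);	/* request is still accounted, then failed */
		return -EINVAL;
	}

	start_request(rq);		/* before any requeue can happen */
	if (send_ret == -EAGAIN) {
		requeue_request(rq);	/* retried later instead of hopping sockets */
		return 0;
	}
	return send_ret;
}

int main(void)
{
	struct request rq = { 0 };

	printf("ret=%d started=%d requeued=%d\n",
	       handle_cmd(&rq, true, -EAGAIN), rq.started, rq.requeued);
	return 0;
}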
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cda69dbefe3b..c61960deb74a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c | |||
@@ -154,6 +154,10 @@ enum { | |||
154 | NULL_Q_MQ = 2, | 154 | NULL_Q_MQ = 2, |
155 | }; | 155 | }; |
156 | 156 | ||
157 | static int g_no_sched; | ||
158 | module_param_named(no_sched, g_no_sched, int, S_IRUGO); | ||
159 | MODULE_PARM_DESC(no_sched, "No io scheduler"); | ||
160 | |||
157 | static int g_submit_queues = 1; | 161 | static int g_submit_queues = 1; |
158 | module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); | 162 | module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); |
159 | MODULE_PARM_DESC(submit_queues, "Number of submission queues"); | 163 | MODULE_PARM_DESC(submit_queues, "Number of submission queues"); |
@@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) | |||
1754 | set->numa_node = nullb ? nullb->dev->home_node : g_home_node; | 1758 | set->numa_node = nullb ? nullb->dev->home_node : g_home_node; |
1755 | set->cmd_size = sizeof(struct nullb_cmd); | 1759 | set->cmd_size = sizeof(struct nullb_cmd); |
1756 | set->flags = BLK_MQ_F_SHOULD_MERGE; | 1760 | set->flags = BLK_MQ_F_SHOULD_MERGE; |
1761 | if (g_no_sched) | ||
1762 | set->flags |= BLK_MQ_F_NO_SCHED; | ||
1757 | set->driver_data = NULL; | 1763 | set->driver_data = NULL; |
1758 | 1764 | ||
1759 | if ((nullb && nullb->dev->blocking) || g_blocking) | 1765 | if ((nullb && nullb->dev->blocking) || g_blocking) |
@@ -1985,8 +1991,10 @@ static int __init null_init(void) | |||
1985 | 1991 | ||
1986 | for (i = 0; i < nr_devices; i++) { | 1992 | for (i = 0; i < nr_devices; i++) { |
1987 | dev = null_alloc_dev(); | 1993 | dev = null_alloc_dev(); |
1988 | if (!dev) | 1994 | if (!dev) { |
1995 | ret = -ENOMEM; | ||
1989 | goto err_dev; | 1996 | goto err_dev; |
1997 | } | ||
1990 | ret = null_add_dev(dev); | 1998 | ret = null_add_dev(dev); |
1991 | if (ret) { | 1999 | if (ret) { |
1992 | null_free_dev(dev); | 2000 | null_free_dev(dev); |
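The null_blk init hunk above sets ret to -ENOMEM before jumping to the unwind label, where it previously fell through with whatever value ret last held. A minimal illustration of that error-path pattern, with invented device helpers (and no device tracking, to keep the sketch short).

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct dev { int id; };

static struct dev *alloc_dev(void) { return calloc(1, sizeof(struct dev)); }
static void free_dev(struct dev *d) { free(d); }
static int add_dev(struct dev *d) { (void)d; return 0; }

static int init_devices(int nr)
{
	int ret = 0;

	for (int i = 0; i < nr; i++) {
		struct dev *d = alloc_dev();

		if (!d) {
			ret = -ENOMEM;	/* previously left at its prior value */
			goto err;
		}
		ret = add_dev(d);
		if (ret) {
			free_dev(d);
			goto err;
		}
	}
	return 0;
err:
	/* unwind previously added devices here */
	return ret;
}

int main(void)
{
	printf("init: %d\n", init_devices(4));
	return 0;
}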
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index b226835a909a..f8bd6ef3605a 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig | |||
@@ -26,6 +26,7 @@ config PARIDE_PD | |||
26 | config PARIDE_PCD | 26 | config PARIDE_PCD |
27 | tristate "Parallel port ATAPI CD-ROMs" | 27 | tristate "Parallel port ATAPI CD-ROMs" |
28 | depends on PARIDE | 28 | depends on PARIDE |
29 | select CDROM | ||
29 | select BLK_SCSI_REQUEST # only for the generic cdrom code | 30 | select BLK_SCSI_REQUEST # only for the generic cdrom code |
30 | ---help--- | 31 | ---help--- |
31 | This option enables the high-level driver for ATAPI CD-ROM devices | 32 | This option enables the high-level driver for ATAPI CD-ROM devices |
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 64d0fc17c174..2819f23e8bf2 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c | |||
@@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) | |||
1967 | break; | 1967 | break; |
1968 | 1968 | ||
1969 | case FIT_MTD_CMD_LOG_HOST_ID: | 1969 | case FIT_MTD_CMD_LOG_HOST_ID: |
1970 | skdev->connect_time_stamp = get_seconds(); | 1970 | /* hardware interface overflows in y2106 */ |
1971 | skdev->connect_time_stamp = (u32)ktime_get_real_seconds(); | ||
1971 | data = skdev->connect_time_stamp & 0xFFFF; | 1972 | data = skdev->connect_time_stamp & 0xFFFF; |
1972 | mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); | 1973 | mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); |
1973 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | 1974 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); |
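The skd change reads a 64-bit wall-clock second counter and truncates it explicitly to the u32 the hardware interface carries, moving the wrap from 2038 to 2106 as the added comment notes. A tiny sketch, with time() standing in for ktime_get_real_seconds().

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	int64_t now = (int64_t)time(NULL);	/* 64-bit seconds */
	uint32_t stamp = (uint32_t)now;		/* hardware field is u32 */

	printf("low 16 bits sent first: 0x%04x\n", stamp & 0xFFFF);
	return 0;
}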
diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile index a95566ff47d3..0f3664b45f48 100644 --- a/drivers/cdrom/Makefile +++ b/drivers/cdrom/Makefile | |||
@@ -1,14 +1,3 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
2 | # Makefile for the kernel cdrom device drivers. | 2 | obj-$(CONFIG_CDROM) += cdrom.o |
3 | # | 3 | obj-$(CONFIG_GDROM) += gdrom.o |
4 | # 30 Jan 1998, Michael Elizabeth Chastain, <mailto:mec@shout.net> | ||
5 | # Rewritten to use lists instead of if-statements. | ||
6 | |||
7 | # Each configuration option enables a list of files. | ||
8 | |||
9 | obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o | ||
10 | obj-$(CONFIG_BLK_DEV_SR) += cdrom.o | ||
11 | obj-$(CONFIG_PARIDE_PCD) += cdrom.o | ||
12 | obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o | ||
13 | |||
14 | obj-$(CONFIG_GDROM) += gdrom.o cdrom.o | ||
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index c99a25c075bc..cf1fb3fb5d26 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig | |||
@@ -117,7 +117,9 @@ config BLK_DEV_DELKIN | |||
117 | 117 | ||
118 | config BLK_DEV_IDECD | 118 | config BLK_DEV_IDECD |
119 | tristate "Include IDE/ATAPI CDROM support" | 119 | tristate "Include IDE/ATAPI CDROM support" |
120 | depends on BLK_DEV | ||
120 | select IDE_ATAPI | 121 | select IDE_ATAPI |
122 | select CDROM | ||
121 | ---help--- | 123 | ---help--- |
122 | If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is | 124 | If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is |
123 | a newer protocol used by IDE CD-ROM and TAPE drives, similar to the | 125 | a newer protocol used by IDE CD-ROM and TAPE drives, similar to the |
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 14d1e7d9a1d6..0e6bc631a1ca 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c | |||
@@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive) | |||
282 | struct request *rq = drive->hwif->rq; | 282 | struct request *rq = drive->hwif->rq; |
283 | unsigned long wait = 0; | 283 | unsigned long wait = 0; |
284 | 284 | ||
285 | debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]); | 285 | debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]); |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Some commands are *slow* and normally take a long time to complete. | 288 | * Some commands are *slow* and normally take a long time to complete. |
@@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) | |||
463 | return ide_do_reset(drive); | 463 | return ide_do_reset(drive); |
464 | } | 464 | } |
465 | 465 | ||
466 | debug_log("[cmd %x]: check condition\n", rq->cmd[0]); | 466 | debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]); |
467 | 467 | ||
468 | /* Retry operation */ | 468 | /* Retry operation */ |
469 | ide_retry_pc(drive); | 469 | ide_retry_pc(drive); |
@@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) | |||
531 | ide_pad_transfer(drive, write, bcount); | 531 | ide_pad_transfer(drive, write, bcount); |
532 | 532 | ||
533 | debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n", | 533 | debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n", |
534 | rq->cmd[0], done, bcount, scsi_req(rq)->resid_len); | 534 | scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len); |
535 | 535 | ||
536 | /* And set the interrupt handler again */ | 536 | /* And set the interrupt handler again */ |
537 | ide_set_handler(drive, ide_pc_intr, timeout); | 537 | ide_set_handler(drive, ide_pc_intr, timeout); |
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index dccdca9eda38..ad8a125defdd 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c | |||
@@ -90,9 +90,9 @@ int generic_ide_resume(struct device *dev) | |||
90 | } | 90 | } |
91 | 91 | ||
92 | memset(&rqpm, 0, sizeof(rqpm)); | 92 | memset(&rqpm, 0, sizeof(rqpm)); |
93 | rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); | 93 | rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN, |
94 | BLK_MQ_REQ_PREEMPT); | ||
94 | ide_req(rq)->type = ATA_PRIV_PM_RESUME; | 95 | ide_req(rq)->type = ATA_PRIV_PM_RESUME; |
95 | rq->rq_flags |= RQF_PREEMPT; | ||
96 | rq->special = &rqpm; | 96 | rq->special = &rqpm; |
97 | rqpm.pm_step = IDE_PM_START_RESUME; | 97 | rqpm.pm_step = IDE_PM_START_RESUME; |
98 | rqpm.pm_state = PM_EVENT_ON; | 98 | rqpm.pm_state = PM_EVENT_ON; |
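In the ide-pm hunk the preempt intent is passed to the allocator (BLK_MQ_REQ_PREEMPT) instead of being set in rq->rq_flags after blk_get_request() returns, presumably so the allocation itself can proceed against a quiesced queue. A hedged model of that idea; the names below are invented and the quiesce behaviour is assumed for illustration, not taken from the kernel.

#include <stdbool.h>
#include <stdio.h>

#define REQ_PREEMPT 0x1u

struct queue { bool quiesced; };
struct request { unsigned int flags; };

static int get_request(struct queue *q, unsigned int flags, struct request *rq)
{
	if (q->quiesced && !(flags & REQ_PREEMPT))
		return -1;	/* would normally block until resume */
	rq->flags = flags;	/* flag present from the very beginning */
	return 0;
}

int main(void)
{
	struct queue q = { .quiesced = true };
	struct request rq;

	printf("plain alloc:   %d\n", get_request(&q, 0, &rq));
	printf("preempt alloc: %d\n", get_request(&q, REQ_PREEMPT, &rq));
	return 0;
}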
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index ead61a93cb4e..2a953efec4e1 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig | |||
@@ -4,7 +4,8 @@ | |||
4 | 4 | ||
5 | menuconfig NVM | 5 | menuconfig NVM |
6 | bool "Open-Channel SSD target support" | 6 | bool "Open-Channel SSD target support" |
7 | depends on BLOCK && HAS_DMA | 7 | depends on BLOCK && HAS_DMA && PCI |
8 | select BLK_DEV_NVME | ||
8 | help | 9 | help |
9 | Say Y here to get to enable Open-channel SSDs. | 10 | Say Y here to get to enable Open-channel SSDs. |
10 | 11 | ||
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ddae430b6eae..83249b43dd06 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/types.h> | 22 | #include <linux/types.h> |
23 | #include <linux/sem.h> | 23 | #include <linux/sem.h> |
24 | #include <linux/bitmap.h> | 24 | #include <linux/bitmap.h> |
25 | #include <linux/module.h> | ||
25 | #include <linux/moduleparam.h> | 26 | #include <linux/moduleparam.h> |
26 | #include <linux/miscdevice.h> | 27 | #include <linux/miscdevice.h> |
27 | #include <linux/lightnvm.h> | 28 | #include <linux/lightnvm.h> |
@@ -138,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, | |||
138 | int prev_nr_luns; | 139 | int prev_nr_luns; |
139 | int i, j; | 140 | int i, j; |
140 | 141 | ||
141 | nr_chnls = nr_luns / dev->geo.luns_per_chnl; | ||
142 | nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; | 142 | nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; |
143 | 143 | ||
144 | dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); | 144 | dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); |
@@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = { | |||
226 | .owner = THIS_MODULE, | 226 | .owner = THIS_MODULE, |
227 | }; | 227 | }; |
228 | 228 | ||
229 | static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) | ||
230 | { | ||
231 | struct nvm_tgt_type *tmp, *tt = NULL; | ||
232 | |||
233 | if (lock) | ||
234 | down_write(&nvm_tgtt_lock); | ||
235 | |||
236 | list_for_each_entry(tmp, &nvm_tgt_types, list) | ||
237 | if (!strcmp(name, tmp->name)) { | ||
238 | tt = tmp; | ||
239 | break; | ||
240 | } | ||
241 | |||
242 | if (lock) | ||
243 | up_write(&nvm_tgtt_lock); | ||
244 | return tt; | ||
245 | } | ||
246 | |||
229 | static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) | 247 | static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) |
230 | { | 248 | { |
231 | struct nvm_ioctl_create_simple *s = &create->conf.s; | 249 | struct nvm_ioctl_create_simple *s = &create->conf.s; |
@@ -316,6 +334,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) | |||
316 | list_add_tail(&t->list, &dev->targets); | 334 | list_add_tail(&t->list, &dev->targets); |
317 | mutex_unlock(&dev->mlock); | 335 | mutex_unlock(&dev->mlock); |
318 | 336 | ||
337 | __module_get(tt->owner); | ||
338 | |||
319 | return 0; | 339 | return 0; |
320 | err_sysfs: | 340 | err_sysfs: |
321 | if (tt->exit) | 341 | if (tt->exit) |
@@ -351,6 +371,7 @@ static void __nvm_remove_target(struct nvm_target *t) | |||
351 | 371 | ||
352 | nvm_remove_tgt_dev(t->dev, 1); | 372 | nvm_remove_tgt_dev(t->dev, 1); |
353 | put_disk(tdisk); | 373 | put_disk(tdisk); |
374 | module_put(t->type->owner); | ||
354 | 375 | ||
355 | list_del(&t->list); | 376 | list_del(&t->list); |
356 | kfree(t); | 377 | kfree(t); |
@@ -532,25 +553,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, | |||
532 | } | 553 | } |
533 | EXPORT_SYMBOL(nvm_part_to_tgt); | 554 | EXPORT_SYMBOL(nvm_part_to_tgt); |
534 | 555 | ||
535 | struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) | ||
536 | { | ||
537 | struct nvm_tgt_type *tmp, *tt = NULL; | ||
538 | |||
539 | if (lock) | ||
540 | down_write(&nvm_tgtt_lock); | ||
541 | |||
542 | list_for_each_entry(tmp, &nvm_tgt_types, list) | ||
543 | if (!strcmp(name, tmp->name)) { | ||
544 | tt = tmp; | ||
545 | break; | ||
546 | } | ||
547 | |||
548 | if (lock) | ||
549 | up_write(&nvm_tgtt_lock); | ||
550 | return tt; | ||
551 | } | ||
552 | EXPORT_SYMBOL(nvm_find_target_type); | ||
553 | |||
554 | int nvm_register_tgt_type(struct nvm_tgt_type *tt) | 556 | int nvm_register_tgt_type(struct nvm_tgt_type *tt) |
555 | { | 557 | { |
556 | int ret = 0; | 558 | int ret = 0; |
@@ -571,9 +573,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) | |||
571 | if (!tt) | 573 | if (!tt) |
572 | return; | 574 | return; |
573 | 575 | ||
574 | down_write(&nvm_lock); | 576 | down_write(&nvm_tgtt_lock); |
575 | list_del(&tt->list); | 577 | list_del(&tt->list); |
576 | up_write(&nvm_lock); | 578 | up_write(&nvm_tgtt_lock); |
577 | } | 579 | } |
578 | EXPORT_SYMBOL(nvm_unregister_tgt_type); | 580 | EXPORT_SYMBOL(nvm_unregister_tgt_type); |
579 | 581 | ||
@@ -602,6 +604,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) | |||
602 | return NULL; | 604 | return NULL; |
603 | } | 605 | } |
604 | 606 | ||
607 | static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, | ||
608 | const struct ppa_addr *ppas, int nr_ppas) | ||
609 | { | ||
610 | struct nvm_dev *dev = tgt_dev->parent; | ||
611 | struct nvm_geo *geo = &tgt_dev->geo; | ||
612 | int i, plane_cnt, pl_idx; | ||
613 | struct ppa_addr ppa; | ||
614 | |||
615 | if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { | ||
616 | rqd->nr_ppas = nr_ppas; | ||
617 | rqd->ppa_addr = ppas[0]; | ||
618 | |||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | rqd->nr_ppas = nr_ppas; | ||
623 | rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); | ||
624 | if (!rqd->ppa_list) { | ||
625 | pr_err("nvm: failed to allocate dma memory\n"); | ||
626 | return -ENOMEM; | ||
627 | } | ||
628 | |||
629 | plane_cnt = geo->plane_mode; | ||
630 | rqd->nr_ppas *= plane_cnt; | ||
631 | |||
632 | for (i = 0; i < nr_ppas; i++) { | ||
633 | for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { | ||
634 | ppa = ppas[i]; | ||
635 | ppa.g.pl = pl_idx; | ||
636 | rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; | ||
637 | } | ||
638 | } | ||
639 | |||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, | ||
644 | struct nvm_rq *rqd) | ||
645 | { | ||
646 | if (!rqd->ppa_list) | ||
647 | return; | ||
648 | |||
649 | nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); | ||
650 | } | ||
651 | |||
652 | |||
605 | int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, | 653 | int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, |
606 | int nr_ppas, int type) | 654 | int nr_ppas, int type) |
607 | { | 655 | { |
@@ -616,7 +664,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, | |||
616 | 664 | ||
617 | memset(&rqd, 0, sizeof(struct nvm_rq)); | 665 | memset(&rqd, 0, sizeof(struct nvm_rq)); |
618 | 666 | ||
619 | nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); | 667 | nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); |
620 | nvm_rq_tgt_to_dev(tgt_dev, &rqd); | 668 | nvm_rq_tgt_to_dev(tgt_dev, &rqd); |
621 | 669 | ||
622 | ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); | 670 | ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); |
@@ -658,12 +706,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) | |||
658 | } | 706 | } |
659 | EXPORT_SYMBOL(nvm_submit_io); | 707 | EXPORT_SYMBOL(nvm_submit_io); |
660 | 708 | ||
661 | static void nvm_end_io_sync(struct nvm_rq *rqd) | 709 | int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) |
662 | { | 710 | { |
663 | struct completion *waiting = rqd->private; | 711 | struct nvm_dev *dev = tgt_dev->parent; |
712 | int ret; | ||
664 | 713 | ||
665 | complete(waiting); | 714 | if (!dev->ops->submit_io_sync) |
715 | return -ENODEV; | ||
716 | |||
717 | nvm_rq_tgt_to_dev(tgt_dev, rqd); | ||
718 | |||
719 | rqd->dev = tgt_dev; | ||
720 | |||
721 | /* In case of error, fail with right address format */ | ||
722 | ret = dev->ops->submit_io_sync(dev, rqd); | ||
723 | nvm_rq_dev_to_tgt(tgt_dev, rqd); | ||
724 | |||
725 | return ret; | ||
666 | } | 726 | } |
727 | EXPORT_SYMBOL(nvm_submit_io_sync); | ||
667 | 728 | ||
668 | int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, | 729 | int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, |
669 | int nr_ppas) | 730 | int nr_ppas) |
@@ -671,25 +732,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, | |||
671 | struct nvm_geo *geo = &tgt_dev->geo; | 732 | struct nvm_geo *geo = &tgt_dev->geo; |
672 | struct nvm_rq rqd; | 733 | struct nvm_rq rqd; |
673 | int ret; | 734 | int ret; |
674 | DECLARE_COMPLETION_ONSTACK(wait); | ||
675 | 735 | ||
676 | memset(&rqd, 0, sizeof(struct nvm_rq)); | 736 | memset(&rqd, 0, sizeof(struct nvm_rq)); |
677 | 737 | ||
678 | rqd.opcode = NVM_OP_ERASE; | 738 | rqd.opcode = NVM_OP_ERASE; |
679 | rqd.end_io = nvm_end_io_sync; | ||
680 | rqd.private = &wait; | ||
681 | rqd.flags = geo->plane_mode >> 1; | 739 | rqd.flags = geo->plane_mode >> 1; |
682 | 740 | ||
683 | ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); | 741 | ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); |
684 | if (ret) | 742 | if (ret) |
685 | return ret; | 743 | return ret; |
686 | 744 | ||
687 | ret = nvm_submit_io(tgt_dev, &rqd); | 745 | ret = nvm_submit_io_sync(tgt_dev, &rqd); |
688 | if (ret) { | 746 | if (ret) { |
689 | pr_err("rrpr: erase I/O submission failed: %d\n", ret); | 747 | pr_err("rrpr: erase I/O submission failed: %d\n", ret); |
690 | goto free_ppa_list; | 748 | goto free_ppa_list; |
691 | } | 749 | } |
692 | wait_for_completion_io(&wait); | ||
693 | 750 | ||
694 | free_ppa_list: | 751 | free_ppa_list: |
695 | nvm_free_rqd_ppalist(tgt_dev, &rqd); | 752 | nvm_free_rqd_ppalist(tgt_dev, &rqd); |
@@ -775,57 +832,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) | |||
775 | } | 832 | } |
776 | EXPORT_SYMBOL(nvm_put_area); | 833 | EXPORT_SYMBOL(nvm_put_area); |
777 | 834 | ||
778 | int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, | ||
779 | const struct ppa_addr *ppas, int nr_ppas, int vblk) | ||
780 | { | ||
781 | struct nvm_dev *dev = tgt_dev->parent; | ||
782 | struct nvm_geo *geo = &tgt_dev->geo; | ||
783 | int i, plane_cnt, pl_idx; | ||
784 | struct ppa_addr ppa; | ||
785 | |||
786 | if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { | ||
787 | rqd->nr_ppas = nr_ppas; | ||
788 | rqd->ppa_addr = ppas[0]; | ||
789 | |||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | rqd->nr_ppas = nr_ppas; | ||
794 | rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); | ||
795 | if (!rqd->ppa_list) { | ||
796 | pr_err("nvm: failed to allocate dma memory\n"); | ||
797 | return -ENOMEM; | ||
798 | } | ||
799 | |||
800 | if (!vblk) { | ||
801 | for (i = 0; i < nr_ppas; i++) | ||
802 | rqd->ppa_list[i] = ppas[i]; | ||
803 | } else { | ||
804 | plane_cnt = geo->plane_mode; | ||
805 | rqd->nr_ppas *= plane_cnt; | ||
806 | |||
807 | for (i = 0; i < nr_ppas; i++) { | ||
808 | for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { | ||
809 | ppa = ppas[i]; | ||
810 | ppa.g.pl = pl_idx; | ||
811 | rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; | ||
812 | } | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return 0; | ||
817 | } | ||
818 | EXPORT_SYMBOL(nvm_set_rqd_ppalist); | ||
819 | |||
820 | void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) | ||
821 | { | ||
822 | if (!rqd->ppa_list) | ||
823 | return; | ||
824 | |||
825 | nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); | ||
826 | } | ||
827 | EXPORT_SYMBOL(nvm_free_rqd_ppalist); | ||
828 | |||
829 | void nvm_end_io(struct nvm_rq *rqd) | 835 | void nvm_end_io(struct nvm_rq *rqd) |
830 | { | 836 | { |
831 | struct nvm_tgt_dev *tgt_dev = rqd->dev; | 837 | struct nvm_tgt_dev *tgt_dev = rqd->dev; |
@@ -1177,7 +1183,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) | |||
1177 | info->version[1] = NVM_VERSION_MINOR; | 1183 | info->version[1] = NVM_VERSION_MINOR; |
1178 | info->version[2] = NVM_VERSION_PATCH; | 1184 | info->version[2] = NVM_VERSION_PATCH; |
1179 | 1185 | ||
1180 | down_write(&nvm_lock); | 1186 | down_write(&nvm_tgtt_lock); |
1181 | list_for_each_entry(tt, &nvm_tgt_types, list) { | 1187 | list_for_each_entry(tt, &nvm_tgt_types, list) { |
1182 | struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; | 1188 | struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; |
1183 | 1189 | ||
@@ -1190,7 +1196,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) | |||
1190 | } | 1196 | } |
1191 | 1197 | ||
1192 | info->tgtsize = tgt_iter; | 1198 | info->tgtsize = tgt_iter; |
1193 | up_write(&nvm_lock); | 1199 | up_write(&nvm_tgtt_lock); |
1194 | 1200 | ||
1195 | if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { | 1201 | if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { |
1196 | kfree(info); | 1202 | kfree(info); |
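nvm_set_rqd_ppalist(), made static in the hunks above, duplicates each caller-supplied address once per plane and lays the result out plane-major. A self-contained sketch of just that expansion, with the address structure reduced to two fields.

#include <stdio.h>

struct ppa { int blk; int pl; };

static void expand_planes(const struct ppa *in, int nr, int plane_cnt,
			  struct ppa *out /* nr * plane_cnt entries */)
{
	for (int i = 0; i < nr; i++) {
		for (int pl = 0; pl < plane_cnt; pl++) {
			struct ppa p = in[i];

			p.pl = pl;
			out[pl * nr + i] = p;	/* plane-major layout */
		}
	}
}

int main(void)
{
	struct ppa in[2] = { { .blk = 7 }, { .blk = 9 } };
	struct ppa out[4];

	expand_planes(in, 2, 2, out);
	for (int i = 0; i < 4; i++)
		printf("out[%d] = blk %d, plane %d\n", i, out[i].blk, out[i].pl);
	return 0;
}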
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 024a8fc93069..0d227ef7d1b9 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c | |||
@@ -43,8 +43,10 @@ retry: | |||
43 | if (unlikely(!bio_has_data(bio))) | 43 | if (unlikely(!bio_has_data(bio))) |
44 | goto out; | 44 | goto out; |
45 | 45 | ||
46 | w_ctx.flags = flags; | ||
47 | pblk_ppa_set_empty(&w_ctx.ppa); | 46 | pblk_ppa_set_empty(&w_ctx.ppa); |
47 | w_ctx.flags = flags; | ||
48 | if (bio->bi_opf & REQ_PREFLUSH) | ||
49 | w_ctx.flags |= PBLK_FLUSH_ENTRY; | ||
48 | 50 | ||
49 | for (i = 0; i < nr_entries; i++) { | 51 | for (i = 0; i < nr_entries; i++) { |
50 | void *data = bio_data(bio); | 52 | void *data = bio_data(bio); |
@@ -73,12 +75,11 @@ out: | |||
73 | * On GC the incoming lbas are not necessarily sequential. Also, some of the | 75 | * On GC the incoming lbas are not necessarily sequential. Also, some of the |
74 | * lbas might not be valid entries, which are marked as empty by the GC thread | 76 | * lbas might not be valid entries, which are marked as empty by the GC thread |
75 | */ | 77 | */ |
76 | int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, | 78 | int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) |
77 | unsigned int nr_entries, unsigned int nr_rec_entries, | ||
78 | struct pblk_line *gc_line, unsigned long flags) | ||
79 | { | 79 | { |
80 | struct pblk_w_ctx w_ctx; | 80 | struct pblk_w_ctx w_ctx; |
81 | unsigned int bpos, pos; | 81 | unsigned int bpos, pos; |
82 | void *data = gc_rq->data; | ||
82 | int i, valid_entries; | 83 | int i, valid_entries; |
83 | 84 | ||
84 | /* Update the write buffer head (mem) with the entries that we can | 85 | /* Update the write buffer head (mem) with the entries that we can |
@@ -86,28 +87,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, | |||
86 | * rollback from here on. | 87 | * rollback from here on. |
87 | */ | 88 | */ |
88 | retry: | 89 | retry: |
89 | if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { | 90 | if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { |
90 | io_schedule(); | 91 | io_schedule(); |
91 | goto retry; | 92 | goto retry; |
92 | } | 93 | } |
93 | 94 | ||
94 | w_ctx.flags = flags; | 95 | w_ctx.flags = PBLK_IOTYPE_GC; |
95 | pblk_ppa_set_empty(&w_ctx.ppa); | 96 | pblk_ppa_set_empty(&w_ctx.ppa); |
96 | 97 | ||
97 | for (i = 0, valid_entries = 0; i < nr_entries; i++) { | 98 | for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { |
98 | if (lba_list[i] == ADDR_EMPTY) | 99 | if (gc_rq->lba_list[i] == ADDR_EMPTY) |
99 | continue; | 100 | continue; |
100 | 101 | ||
101 | w_ctx.lba = lba_list[i]; | 102 | w_ctx.lba = gc_rq->lba_list[i]; |
102 | 103 | ||
103 | pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); | 104 | pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); |
104 | pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); | 105 | pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, |
106 | gc_rq->paddr_list[i], pos); | ||
105 | 107 | ||
106 | data += PBLK_EXPOSED_PAGE_SIZE; | 108 | data += PBLK_EXPOSED_PAGE_SIZE; |
107 | valid_entries++; | 109 | valid_entries++; |
108 | } | 110 | } |
109 | 111 | ||
110 | WARN_ONCE(nr_rec_entries != valid_entries, | 112 | WARN_ONCE(gc_rq->secs_to_gc != valid_entries, |
111 | "pblk: inconsistent GC write\n"); | 113 | "pblk: inconsistent GC write\n"); |
112 | 114 | ||
113 | #ifdef CONFIG_NVM_DEBUG | 115 | #ifdef CONFIG_NVM_DEBUG |
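pblk_write_gc_to_cache() now takes a single pblk_gc_rq descriptor instead of five separate arguments. A condensed illustration of the same parameter-object shape; the field names below are invented, not pblk's.

#include <stdio.h>

#define ADDR_EMPTY (~0ULL)

struct gc_rq {
	unsigned long long *lba_list;
	int nr_secs;		/* entries in lba_list */
	int secs_to_gc;		/* valid (non-empty) entries */
};

static int write_gc_to_cache(const struct gc_rq *gc_rq)
{
	int valid = 0;

	for (int i = 0; i < gc_rq->nr_secs; i++) {
		if (gc_rq->lba_list[i] == ADDR_EMPTY)
			continue;	/* skipped, as in the real code */
		valid++;
	}
	if (valid != gc_rq->secs_to_gc)
		fprintf(stderr, "inconsistent GC write\n");
	return valid;
}

int main(void)
{
	unsigned long long lbas[4] = { 10, ADDR_EMPTY, 12, 13 };
	struct gc_rq rq = { .lba_list = lbas, .nr_secs = 4, .secs_to_gc = 3 };

	printf("cached %d entries\n", write_gc_to_cache(&rq));
	return 0;
}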
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 81501644fb15..ce90213a42fa 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c | |||
@@ -18,6 +18,31 @@ | |||
18 | 18 | ||
19 | #include "pblk.h" | 19 | #include "pblk.h" |
20 | 20 | ||
21 | static void pblk_line_mark_bb(struct work_struct *work) | ||
22 | { | ||
23 | struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, | ||
24 | ws); | ||
25 | struct pblk *pblk = line_ws->pblk; | ||
26 | struct nvm_tgt_dev *dev = pblk->dev; | ||
27 | struct ppa_addr *ppa = line_ws->priv; | ||
28 | int ret; | ||
29 | |||
30 | ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); | ||
31 | if (ret) { | ||
32 | struct pblk_line *line; | ||
33 | int pos; | ||
34 | |||
35 | line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; | ||
36 | pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); | ||
37 | |||
38 | pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", | ||
39 | line->id, pos); | ||
40 | } | ||
41 | |||
42 | kfree(ppa); | ||
43 | mempool_free(line_ws, pblk->gen_ws_pool); | ||
44 | } | ||
45 | |||
21 | static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, | 46 | static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, |
22 | struct ppa_addr *ppa) | 47 | struct ppa_addr *ppa) |
23 | { | 48 | { |
@@ -33,7 +58,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, | |||
33 | pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", | 58 | pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", |
34 | line->id, pos); | 59 | line->id, pos); |
35 | 60 | ||
36 | pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); | 61 | pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, |
62 | GFP_ATOMIC, pblk->bb_wq); | ||
37 | } | 63 | } |
38 | 64 | ||
39 | static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) | 65 | static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) |
@@ -63,7 +89,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) | |||
63 | struct pblk *pblk = rqd->private; | 89 | struct pblk *pblk = rqd->private; |
64 | 90 | ||
65 | __pblk_end_io_erase(pblk, rqd); | 91 | __pblk_end_io_erase(pblk, rqd); |
66 | mempool_free(rqd, pblk->g_rq_pool); | 92 | mempool_free(rqd, pblk->e_rq_pool); |
67 | } | 93 | } |
68 | 94 | ||
69 | void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, | 95 | void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, |
@@ -77,11 +103,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, | |||
77 | * that newer updates are not overwritten. | 103 | * that newer updates are not overwritten. |
78 | */ | 104 | */ |
79 | spin_lock(&line->lock); | 105 | spin_lock(&line->lock); |
80 | if (line->state == PBLK_LINESTATE_GC || | 106 | WARN_ON(line->state == PBLK_LINESTATE_FREE); |
81 | line->state == PBLK_LINESTATE_FREE) { | ||
82 | spin_unlock(&line->lock); | ||
83 | return; | ||
84 | } | ||
85 | 107 | ||
86 | if (test_and_set_bit(paddr, line->invalid_bitmap)) { | 108 | if (test_and_set_bit(paddr, line->invalid_bitmap)) { |
87 | WARN_ONCE(1, "pblk: double invalidate\n"); | 109 | WARN_ONCE(1, "pblk: double invalidate\n"); |
@@ -98,8 +120,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, | |||
98 | spin_lock(&l_mg->gc_lock); | 120 | spin_lock(&l_mg->gc_lock); |
99 | spin_lock(&line->lock); | 121 | spin_lock(&line->lock); |
100 | /* Prevent moving a line that has just been chosen for GC */ | 122 | /* Prevent moving a line that has just been chosen for GC */ |
101 | if (line->state == PBLK_LINESTATE_GC || | 123 | if (line->state == PBLK_LINESTATE_GC) { |
102 | line->state == PBLK_LINESTATE_FREE) { | ||
103 | spin_unlock(&line->lock); | 124 | spin_unlock(&line->lock); |
104 | spin_unlock(&l_mg->gc_lock); | 125 | spin_unlock(&l_mg->gc_lock); |
105 | return; | 126 | return; |
@@ -150,17 +171,25 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, | |||
150 | spin_unlock(&pblk->trans_lock); | 171 | spin_unlock(&pblk->trans_lock); |
151 | } | 172 | } |
152 | 173 | ||
153 | struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) | 174 | /* Caller must guarantee that the request is a valid type */ |
175 | struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) | ||
154 | { | 176 | { |
155 | mempool_t *pool; | 177 | mempool_t *pool; |
156 | struct nvm_rq *rqd; | 178 | struct nvm_rq *rqd; |
157 | int rq_size; | 179 | int rq_size; |
158 | 180 | ||
159 | if (rw == WRITE) { | 181 | switch (type) { |
182 | case PBLK_WRITE: | ||
183 | case PBLK_WRITE_INT: | ||
160 | pool = pblk->w_rq_pool; | 184 | pool = pblk->w_rq_pool; |
161 | rq_size = pblk_w_rq_size; | 185 | rq_size = pblk_w_rq_size; |
162 | } else { | 186 | break; |
163 | pool = pblk->g_rq_pool; | 187 | case PBLK_READ: |
188 | pool = pblk->r_rq_pool; | ||
189 | rq_size = pblk_g_rq_size; | ||
190 | break; | ||
191 | default: | ||
192 | pool = pblk->e_rq_pool; | ||
164 | rq_size = pblk_g_rq_size; | 193 | rq_size = pblk_g_rq_size; |
165 | } | 194 | } |
166 | 195 | ||
@@ -170,15 +199,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) | |||
170 | return rqd; | 199 | return rqd; |
171 | } | 200 | } |
172 | 201 | ||
173 | void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) | 202 | /* Typically used on completion path. Cannot guarantee request consistency */ |
203 | void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) | ||
174 | { | 204 | { |
205 | struct nvm_tgt_dev *dev = pblk->dev; | ||
175 | mempool_t *pool; | 206 | mempool_t *pool; |
176 | 207 | ||
177 | if (rw == WRITE) | 208 | switch (type) { |
209 | case PBLK_WRITE: | ||
210 | kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); | ||
211 | case PBLK_WRITE_INT: | ||
178 | pool = pblk->w_rq_pool; | 212 | pool = pblk->w_rq_pool; |
179 | else | 213 | break; |
180 | pool = pblk->g_rq_pool; | 214 | case PBLK_READ: |
215 | pool = pblk->r_rq_pool; | ||
216 | break; | ||
217 | case PBLK_ERASE: | ||
218 | pool = pblk->e_rq_pool; | ||
219 | break; | ||
220 | default: | ||
221 | pr_err("pblk: trying to free unknown rqd type\n"); | ||
222 | return; | ||
223 | } | ||
181 | 224 | ||
225 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
182 | mempool_free(rqd, pool); | 226 | mempool_free(rqd, pool); |
183 | } | 227 | } |
184 | 228 | ||
@@ -190,10 +234,9 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, | |||
190 | 234 | ||
191 | WARN_ON(off + nr_pages != bio->bi_vcnt); | 235 | WARN_ON(off + nr_pages != bio->bi_vcnt); |
192 | 236 | ||
193 | bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE); | ||
194 | for (i = off; i < nr_pages + off; i++) { | 237 | for (i = off; i < nr_pages + off; i++) { |
195 | bv = bio->bi_io_vec[i]; | 238 | bv = bio->bi_io_vec[i]; |
196 | mempool_free(bv.bv_page, pblk->page_pool); | 239 | mempool_free(bv.bv_page, pblk->page_bio_pool); |
197 | } | 240 | } |
198 | } | 241 | } |
199 | 242 | ||
@@ -205,14 +248,12 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, | |||
205 | int i, ret; | 248 | int i, ret; |
206 | 249 | ||
207 | for (i = 0; i < nr_pages; i++) { | 250 | for (i = 0; i < nr_pages; i++) { |
208 | page = mempool_alloc(pblk->page_pool, flags); | 251 | page = mempool_alloc(pblk->page_bio_pool, flags); |
209 | if (!page) | ||
210 | goto err; | ||
211 | 252 | ||
212 | ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); | 253 | ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); |
213 | if (ret != PBLK_EXPOSED_PAGE_SIZE) { | 254 | if (ret != PBLK_EXPOSED_PAGE_SIZE) { |
214 | pr_err("pblk: could not add page to bio\n"); | 255 | pr_err("pblk: could not add page to bio\n"); |
215 | mempool_free(page, pblk->page_pool); | 256 | mempool_free(page, pblk->page_bio_pool); |
216 | goto err; | 257 | goto err; |
217 | } | 258 | } |
218 | } | 259 | } |
@@ -245,13 +286,6 @@ void pblk_write_should_kick(struct pblk *pblk) | |||
245 | pblk_write_kick(pblk); | 286 | pblk_write_kick(pblk); |
246 | } | 287 | } |
247 | 288 | ||
248 | void pblk_end_bio_sync(struct bio *bio) | ||
249 | { | ||
250 | struct completion *waiting = bio->bi_private; | ||
251 | |||
252 | complete(waiting); | ||
253 | } | ||
254 | |||
255 | void pblk_end_io_sync(struct nvm_rq *rqd) | 289 | void pblk_end_io_sync(struct nvm_rq *rqd) |
256 | { | 290 | { |
257 | struct completion *waiting = rqd->private; | 291 | struct completion *waiting = rqd->private; |
@@ -259,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd) | |||
259 | complete(waiting); | 293 | complete(waiting); |
260 | } | 294 | } |
261 | 295 | ||
262 | void pblk_wait_for_meta(struct pblk *pblk) | 296 | static void pblk_wait_for_meta(struct pblk *pblk) |
263 | { | 297 | { |
264 | do { | 298 | do { |
265 | if (!atomic_read(&pblk->inflight_io)) | 299 | if (!atomic_read(&pblk->inflight_io)) |
@@ -336,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) | |||
336 | pblk_invalidate_range(pblk, slba, nr_secs); | 370 | pblk_invalidate_range(pblk, slba, nr_secs); |
337 | } | 371 | } |
338 | 372 | ||
339 | struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba) | ||
340 | { | ||
341 | struct ppa_addr ppa; | ||
342 | |||
343 | spin_lock(&pblk->trans_lock); | ||
344 | ppa = pblk_trans_map_get(pblk, lba); | ||
345 | spin_unlock(&pblk->trans_lock); | ||
346 | |||
347 | return ppa; | ||
348 | } | ||
349 | |||
350 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) | 373 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) |
351 | { | 374 | { |
352 | atomic_long_inc(&pblk->write_failed); | 375 | atomic_long_inc(&pblk->write_failed); |
@@ -389,39 +412,38 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) | |||
389 | struct nvm_tgt_dev *dev = pblk->dev; | 412 | struct nvm_tgt_dev *dev = pblk->dev; |
390 | 413 | ||
391 | #ifdef CONFIG_NVM_DEBUG | 414 | #ifdef CONFIG_NVM_DEBUG |
392 | struct ppa_addr *ppa_list; | 415 | int ret; |
393 | 416 | ||
394 | ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; | 417 | ret = pblk_check_io(pblk, rqd); |
395 | if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { | 418 | if (ret) |
396 | WARN_ON(1); | 419 | return ret; |
397 | return -EINVAL; | 420 | #endif |
398 | } | ||
399 | 421 | ||
400 | if (rqd->opcode == NVM_OP_PWRITE) { | 422 | atomic_inc(&pblk->inflight_io); |
401 | struct pblk_line *line; | ||
402 | struct ppa_addr ppa; | ||
403 | int i; | ||
404 | 423 | ||
405 | for (i = 0; i < rqd->nr_ppas; i++) { | 424 | return nvm_submit_io(dev, rqd); |
406 | ppa = ppa_list[i]; | 425 | } |
407 | line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; | ||
408 | 426 | ||
409 | spin_lock(&line->lock); | 427 | int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) |
410 | if (line->state != PBLK_LINESTATE_OPEN) { | 428 | { |
411 | pr_err("pblk: bad ppa: line:%d,state:%d\n", | 429 | struct nvm_tgt_dev *dev = pblk->dev; |
412 | line->id, line->state); | 430 | |
413 | WARN_ON(1); | 431 | #ifdef CONFIG_NVM_DEBUG |
414 | spin_unlock(&line->lock); | 432 | int ret; |
415 | return -EINVAL; | 433 | |
416 | } | 434 | ret = pblk_check_io(pblk, rqd); |
417 | spin_unlock(&line->lock); | 435 | if (ret) |
418 | } | 436 | return ret; |
419 | } | ||
420 | #endif | 437 | #endif |
421 | 438 | ||
422 | atomic_inc(&pblk->inflight_io); | 439 | atomic_inc(&pblk->inflight_io); |
423 | 440 | ||
424 | return nvm_submit_io(dev, rqd); | 441 | return nvm_submit_io_sync(dev, rqd); |
442 | } | ||
443 | |||
444 | static void pblk_bio_map_addr_endio(struct bio *bio) | ||
445 | { | ||
446 | bio_put(bio); | ||
425 | } | 447 | } |
426 | 448 | ||
427 | struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, | 449 | struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, |
@@ -460,6 +482,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, | |||
460 | 482 | ||
461 | kaddr += PAGE_SIZE; | 483 | kaddr += PAGE_SIZE; |
462 | } | 484 | } |
485 | |||
486 | bio->bi_end_io = pblk_bio_map_addr_endio; | ||
463 | out: | 487 | out: |
464 | return bio; | 488 | return bio; |
465 | } | 489 | } |
@@ -486,12 +510,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) | |||
486 | u64 addr; | 510 | u64 addr; |
487 | int i; | 511 | int i; |
488 | 512 | ||
513 | spin_lock(&line->lock); | ||
489 | addr = find_next_zero_bit(line->map_bitmap, | 514 | addr = find_next_zero_bit(line->map_bitmap, |
490 | pblk->lm.sec_per_line, line->cur_sec); | 515 | pblk->lm.sec_per_line, line->cur_sec); |
491 | line->cur_sec = addr - nr_secs; | 516 | line->cur_sec = addr - nr_secs; |
492 | 517 | ||
493 | for (i = 0; i < nr_secs; i++, line->cur_sec--) | 518 | for (i = 0; i < nr_secs; i++, line->cur_sec--) |
494 | WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); | 519 | WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); |
520 | spin_unlock(&line->lock); | ||
495 | } | 521 | } |
496 | 522 | ||
497 | u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) | 523 | u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) |
@@ -565,12 +591,11 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, | |||
565 | int cmd_op, bio_op; | 591 | int cmd_op, bio_op; |
566 | int i, j; | 592 | int i, j; |
567 | int ret; | 593 | int ret; |
568 | DECLARE_COMPLETION_ONSTACK(wait); | ||
569 | 594 | ||
570 | if (dir == WRITE) { | 595 | if (dir == PBLK_WRITE) { |
571 | bio_op = REQ_OP_WRITE; | 596 | bio_op = REQ_OP_WRITE; |
572 | cmd_op = NVM_OP_PWRITE; | 597 | cmd_op = NVM_OP_PWRITE; |
573 | } else if (dir == READ) { | 598 | } else if (dir == PBLK_READ) { |
574 | bio_op = REQ_OP_READ; | 599 | bio_op = REQ_OP_READ; |
575 | cmd_op = NVM_OP_PREAD; | 600 | cmd_op = NVM_OP_PREAD; |
576 | } else | 601 | } else |
@@ -607,13 +632,11 @@ next_rq: | |||
607 | rqd.dma_ppa_list = dma_ppa_list; | 632 | rqd.dma_ppa_list = dma_ppa_list; |
608 | rqd.opcode = cmd_op; | 633 | rqd.opcode = cmd_op; |
609 | rqd.nr_ppas = rq_ppas; | 634 | rqd.nr_ppas = rq_ppas; |
610 | rqd.end_io = pblk_end_io_sync; | ||
611 | rqd.private = &wait; | ||
612 | 635 | ||
613 | if (dir == WRITE) { | 636 | if (dir == PBLK_WRITE) { |
614 | struct pblk_sec_meta *meta_list = rqd.meta_list; | 637 | struct pblk_sec_meta *meta_list = rqd.meta_list; |
615 | 638 | ||
616 | rqd.flags = pblk_set_progr_mode(pblk, WRITE); | 639 | rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
617 | for (i = 0; i < rqd.nr_ppas; ) { | 640 | for (i = 0; i < rqd.nr_ppas; ) { |
618 | spin_lock(&line->lock); | 641 | spin_lock(&line->lock); |
619 | paddr = __pblk_alloc_page(pblk, line, min); | 642 | paddr = __pblk_alloc_page(pblk, line, min); |
@@ -662,25 +685,17 @@ next_rq: | |||
662 | } | 685 | } |
663 | } | 686 | } |
664 | 687 | ||
665 | ret = pblk_submit_io(pblk, &rqd); | 688 | ret = pblk_submit_io_sync(pblk, &rqd); |
666 | if (ret) { | 689 | if (ret) { |
667 | pr_err("pblk: emeta I/O submission failed: %d\n", ret); | 690 | pr_err("pblk: emeta I/O submission failed: %d\n", ret); |
668 | bio_put(bio); | 691 | bio_put(bio); |
669 | goto free_rqd_dma; | 692 | goto free_rqd_dma; |
670 | } | 693 | } |
671 | 694 | ||
672 | if (!wait_for_completion_io_timeout(&wait, | ||
673 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
674 | pr_err("pblk: emeta I/O timed out\n"); | ||
675 | } | ||
676 | atomic_dec(&pblk->inflight_io); | 695 | atomic_dec(&pblk->inflight_io); |
677 | reinit_completion(&wait); | ||
678 | |||
679 | if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) | ||
680 | bio_put(bio); | ||
681 | 696 | ||
682 | if (rqd.error) { | 697 | if (rqd.error) { |
683 | if (dir == WRITE) | 698 | if (dir == PBLK_WRITE) |
684 | pblk_log_write_err(pblk, &rqd); | 699 | pblk_log_write_err(pblk, &rqd); |
685 | else | 700 | else |
686 | pblk_log_read_err(pblk, &rqd); | 701 | pblk_log_read_err(pblk, &rqd); |
@@ -721,14 +736,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
721 | int i, ret; | 736 | int i, ret; |
722 | int cmd_op, bio_op; | 737 | int cmd_op, bio_op; |
723 | int flags; | 738 | int flags; |
724 | DECLARE_COMPLETION_ONSTACK(wait); | ||
725 | 739 | ||
726 | if (dir == WRITE) { | 740 | if (dir == PBLK_WRITE) { |
727 | bio_op = REQ_OP_WRITE; | 741 | bio_op = REQ_OP_WRITE; |
728 | cmd_op = NVM_OP_PWRITE; | 742 | cmd_op = NVM_OP_PWRITE; |
729 | flags = pblk_set_progr_mode(pblk, WRITE); | 743 | flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
730 | lba_list = emeta_to_lbas(pblk, line->emeta->buf); | 744 | lba_list = emeta_to_lbas(pblk, line->emeta->buf); |
731 | } else if (dir == READ) { | 745 | } else if (dir == PBLK_READ) { |
732 | bio_op = REQ_OP_READ; | 746 | bio_op = REQ_OP_READ; |
733 | cmd_op = NVM_OP_PREAD; | 747 | cmd_op = NVM_OP_PREAD; |
734 | flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); | 748 | flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); |
@@ -758,15 +772,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
758 | rqd.opcode = cmd_op; | 772 | rqd.opcode = cmd_op; |
759 | rqd.flags = flags; | 773 | rqd.flags = flags; |
760 | rqd.nr_ppas = lm->smeta_sec; | 774 | rqd.nr_ppas = lm->smeta_sec; |
761 | rqd.end_io = pblk_end_io_sync; | ||
762 | rqd.private = &wait; | ||
763 | 775 | ||
764 | for (i = 0; i < lm->smeta_sec; i++, paddr++) { | 776 | for (i = 0; i < lm->smeta_sec; i++, paddr++) { |
765 | struct pblk_sec_meta *meta_list = rqd.meta_list; | 777 | struct pblk_sec_meta *meta_list = rqd.meta_list; |
766 | 778 | ||
767 | rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); | 779 | rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); |
768 | 780 | ||
769 | if (dir == WRITE) { | 781 | if (dir == PBLK_WRITE) { |
770 | __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); | 782 | __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); |
771 | 783 | ||
772 | meta_list[i].lba = lba_list[paddr] = addr_empty; | 784 | meta_list[i].lba = lba_list[paddr] = addr_empty; |
@@ -778,21 +790,17 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
778 | * the write thread is the only one sending write and erase commands, | 790 | * the write thread is the only one sending write and erase commands, |
779 | * there is no need to take the LUN semaphore. | 791 | * there is no need to take the LUN semaphore. |
780 | */ | 792 | */ |
781 | ret = pblk_submit_io(pblk, &rqd); | 793 | ret = pblk_submit_io_sync(pblk, &rqd); |
782 | if (ret) { | 794 | if (ret) { |
783 | pr_err("pblk: smeta I/O submission failed: %d\n", ret); | 795 | pr_err("pblk: smeta I/O submission failed: %d\n", ret); |
784 | bio_put(bio); | 796 | bio_put(bio); |
785 | goto free_ppa_list; | 797 | goto free_ppa_list; |
786 | } | 798 | } |
787 | 799 | ||
788 | if (!wait_for_completion_io_timeout(&wait, | ||
789 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
790 | pr_err("pblk: smeta I/O timed out\n"); | ||
791 | } | ||
792 | atomic_dec(&pblk->inflight_io); | 800 | atomic_dec(&pblk->inflight_io); |
793 | 801 | ||
794 | if (rqd.error) { | 802 | if (rqd.error) { |
795 | if (dir == WRITE) | 803 | if (dir == PBLK_WRITE) |
796 | pblk_log_write_err(pblk, &rqd); | 804 | pblk_log_write_err(pblk, &rqd); |
797 | else | 805 | else |
798 | pblk_log_read_err(pblk, &rqd); | 806 | pblk_log_read_err(pblk, &rqd); |
@@ -808,14 +816,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) | |||
808 | { | 816 | { |
809 | u64 bpaddr = pblk_line_smeta_start(pblk, line); | 817 | u64 bpaddr = pblk_line_smeta_start(pblk, line); |
810 | 818 | ||
811 | return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); | 819 | return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); |
812 | } | 820 | } |
813 | 821 | ||
814 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, | 822 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, |
815 | void *emeta_buf) | 823 | void *emeta_buf) |
816 | { | 824 | { |
817 | return pblk_line_submit_emeta_io(pblk, line, emeta_buf, | 825 | return pblk_line_submit_emeta_io(pblk, line, emeta_buf, |
818 | line->emeta_ssec, READ); | 826 | line->emeta_ssec, PBLK_READ); |
819 | } | 827 | } |
820 | 828 | ||
821 | static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, | 829 | static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, |
@@ -824,7 +832,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
824 | rqd->opcode = NVM_OP_ERASE; | 832 | rqd->opcode = NVM_OP_ERASE; |
825 | rqd->ppa_addr = ppa; | 833 | rqd->ppa_addr = ppa; |
826 | rqd->nr_ppas = 1; | 834 | rqd->nr_ppas = 1; |
827 | rqd->flags = pblk_set_progr_mode(pblk, ERASE); | 835 | rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE); |
828 | rqd->bio = NULL; | 836 | rqd->bio = NULL; |
829 | } | 837 | } |
830 | 838 | ||
@@ -832,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) | |||
832 | { | 840 | { |
833 | struct nvm_rq rqd; | 841 | struct nvm_rq rqd; |
834 | int ret = 0; | 842 | int ret = 0; |
835 | DECLARE_COMPLETION_ONSTACK(wait); | ||
836 | 843 | ||
837 | memset(&rqd, 0, sizeof(struct nvm_rq)); | 844 | memset(&rqd, 0, sizeof(struct nvm_rq)); |
838 | 845 | ||
839 | pblk_setup_e_rq(pblk, &rqd, ppa); | 846 | pblk_setup_e_rq(pblk, &rqd, ppa); |
840 | 847 | ||
841 | rqd.end_io = pblk_end_io_sync; | ||
842 | rqd.private = &wait; | ||
843 | |||
844 | /* The write thread schedules erases so that it minimizes disturbances | 848 | /* The write thread schedules erases so that it minimizes disturbances |
845 | * with writes. Thus, there is no need to take the LUN semaphore. | 849 | * with writes. Thus, there is no need to take the LUN semaphore. |
846 | */ | 850 | */ |
847 | ret = pblk_submit_io(pblk, &rqd); | 851 | ret = pblk_submit_io_sync(pblk, &rqd); |
848 | if (ret) { | 852 | if (ret) { |
849 | struct nvm_tgt_dev *dev = pblk->dev; | 853 | struct nvm_tgt_dev *dev = pblk->dev; |
850 | struct nvm_geo *geo = &dev->geo; | 854 | struct nvm_geo *geo = &dev->geo; |
@@ -857,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) | |||
857 | goto out; | 861 | goto out; |
858 | } | 862 | } |
859 | 863 | ||
860 | if (!wait_for_completion_io_timeout(&wait, | ||
861 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
862 | pr_err("pblk: sync erase timed out\n"); | ||
863 | } | ||
864 | |||
865 | out: | 864 | out: |
866 | rqd.private = pblk; | 865 | rqd.private = pblk; |
867 | __pblk_end_io_erase(pblk, &rqd); | 866 | __pblk_end_io_erase(pblk, &rqd); |
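[Editor's note] Several hunks above drop the same on-stack completion plumbing (DECLARE_COMPLETION_ONSTACK, pblk_end_io_sync as end_io, wait_for_completion_io_timeout) because the waiting now happens inside pblk_submit_io_sync(). That helper is introduced elsewhere in the series, so the sketch below is only an assumption about its general shape: an asynchronous submit wrapped with a completion. All demo_* names are hypothetical and not pblk symbols.

#include <linux/completion.h>

struct demo_req {
        void (*end_io)(struct demo_req *rqd);
        void *private;
        int error;
};

/* stand-in for the driver's asynchronous submission path */
int demo_submit_async(struct demo_req *rqd);

static void demo_end_io_sync(struct demo_req *rqd)
{
        complete((struct completion *)rqd->private);
}

static int demo_submit_sync(struct demo_req *rqd)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        int ret;

        rqd->end_io = demo_end_io_sync;
        rqd->private = &wait;

        ret = demo_submit_async(rqd);
        if (ret)
                return ret;

        /* block until the end_io callback fires, then report the result */
        wait_for_completion_io(&wait);
        return rqd->error;
}

With the wait folded into one helper, the per-caller timeout handling deleted in the hunks above no longer has to be repeated at every call site.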
@@ -976,7 +975,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, | |||
976 | memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); | 975 | memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); |
977 | smeta_buf->header.id = cpu_to_le32(line->id); | 976 | smeta_buf->header.id = cpu_to_le32(line->id); |
978 | smeta_buf->header.type = cpu_to_le16(line->type); | 977 | smeta_buf->header.type = cpu_to_le16(line->type); |
979 | smeta_buf->header.version = cpu_to_le16(1); | 978 | smeta_buf->header.version = SMETA_VERSION; |
980 | 979 | ||
981 | /* Start metadata */ | 980 | /* Start metadata */ |
982 | smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); | 981 | smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); |
@@ -1046,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, | |||
1046 | line->smeta_ssec = off; | 1045 | line->smeta_ssec = off; |
1047 | line->cur_sec = off + lm->smeta_sec; | 1046 | line->cur_sec = off + lm->smeta_sec; |
1048 | 1047 | ||
1049 | if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { | 1048 | if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { |
1050 | pr_debug("pblk: line smeta I/O failed. Retry\n"); | 1049 | pr_debug("pblk: line smeta I/O failed. Retry\n"); |
1051 | return 1; | 1050 | return 1; |
1052 | } | 1051 | } |
@@ -1056,7 +1055,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, | |||
1056 | /* Mark emeta metadata sectors as bad sectors. We need to consider bad | 1055 | /* Mark emeta metadata sectors as bad sectors. We need to consider bad |
1057 | * blocks to make sure that there are enough sectors to store emeta | 1056 | * blocks to make sure that there are enough sectors to store emeta |
1058 | */ | 1057 | */ |
1059 | bit = lm->sec_per_line; | ||
1060 | off = lm->sec_per_line - lm->emeta_sec[0]; | 1058 | off = lm->sec_per_line - lm->emeta_sec[0]; |
1061 | bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); | 1059 | bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); |
1062 | while (nr_bb) { | 1060 | while (nr_bb) { |
@@ -1093,25 +1091,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) | |||
1093 | struct pblk_line_meta *lm = &pblk->lm; | 1091 | struct pblk_line_meta *lm = &pblk->lm; |
1094 | int blk_in_line = atomic_read(&line->blk_in_line); | 1092 | int blk_in_line = atomic_read(&line->blk_in_line); |
1095 | 1093 | ||
1096 | line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); | 1094 | line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC); |
1097 | if (!line->map_bitmap) | 1095 | if (!line->map_bitmap) |
1098 | return -ENOMEM; | 1096 | return -ENOMEM; |
1099 | memset(line->map_bitmap, 0, lm->sec_bitmap_len); | ||
1100 | 1097 | ||
1101 | /* invalid_bitmap is special since it is used when line is closed. No | 1098 | /* will be initialized using bb info from map_bitmap */ |
1102 | * need to zeroized; it will be initialized using bb info form | 1099 | line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC); |
1103 | * map_bitmap | ||
1104 | */ | ||
1105 | line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); | ||
1106 | if (!line->invalid_bitmap) { | 1100 | if (!line->invalid_bitmap) { |
1107 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 1101 | kfree(line->map_bitmap); |
1108 | return -ENOMEM; | 1102 | return -ENOMEM; |
1109 | } | 1103 | } |
1110 | 1104 | ||
1111 | spin_lock(&line->lock); | 1105 | spin_lock(&line->lock); |
1112 | if (line->state != PBLK_LINESTATE_FREE) { | 1106 | if (line->state != PBLK_LINESTATE_FREE) { |
1113 | mempool_free(line->invalid_bitmap, pblk->line_meta_pool); | 1107 | kfree(line->map_bitmap); |
1114 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 1108 | kfree(line->invalid_bitmap); |
1115 | spin_unlock(&line->lock); | 1109 | spin_unlock(&line->lock); |
1116 | WARN(1, "pblk: corrupted line %d, state %d\n", | 1110 | WARN(1, "pblk: corrupted line %d, state %d\n", |
1117 | line->id, line->state); | 1111 | line->id, line->state); |
@@ -1163,7 +1157,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) | |||
1163 | 1157 | ||
1164 | void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) | 1158 | void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) |
1165 | { | 1159 | { |
1166 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 1160 | kfree(line->map_bitmap); |
1167 | line->map_bitmap = NULL; | 1161 | line->map_bitmap = NULL; |
1168 | line->smeta = NULL; | 1162 | line->smeta = NULL; |
1169 | line->emeta = NULL; | 1163 | line->emeta = NULL; |
@@ -1328,6 +1322,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) | |||
1328 | pblk->state = PBLK_STATE_STOPPING; | 1322 | pblk->state = PBLK_STATE_STOPPING; |
1329 | } | 1323 | } |
1330 | 1324 | ||
1325 | static void pblk_line_close_meta_sync(struct pblk *pblk) | ||
1326 | { | ||
1327 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
1328 | struct pblk_line_meta *lm = &pblk->lm; | ||
1329 | struct pblk_line *line, *tline; | ||
1330 | LIST_HEAD(list); | ||
1331 | |||
1332 | spin_lock(&l_mg->close_lock); | ||
1333 | if (list_empty(&l_mg->emeta_list)) { | ||
1334 | spin_unlock(&l_mg->close_lock); | ||
1335 | return; | ||
1336 | } | ||
1337 | |||
1338 | list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); | ||
1339 | spin_unlock(&l_mg->close_lock); | ||
1340 | |||
1341 | list_for_each_entry_safe(line, tline, &list, list) { | ||
1342 | struct pblk_emeta *emeta = line->emeta; | ||
1343 | |||
1344 | while (emeta->mem < lm->emeta_len[0]) { | ||
1345 | int ret; | ||
1346 | |||
1347 | ret = pblk_submit_meta_io(pblk, line); | ||
1348 | if (ret) { | ||
1349 | pr_err("pblk: sync meta line %d failed (%d)\n", | ||
1350 | line->id, ret); | ||
1351 | return; | ||
1352 | } | ||
1353 | } | ||
1354 | } | ||
1355 | |||
1356 | pblk_wait_for_meta(pblk); | ||
1357 | flush_workqueue(pblk->close_wq); | ||
1358 | } | ||
1359 | |||
1331 | void pblk_pipeline_stop(struct pblk *pblk) | 1360 | void pblk_pipeline_stop(struct pblk *pblk) |
1332 | { | 1361 | { |
1333 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 1362 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
@@ -1361,17 +1390,17 @@ void pblk_pipeline_stop(struct pblk *pblk) | |||
1361 | spin_unlock(&l_mg->free_lock); | 1390 | spin_unlock(&l_mg->free_lock); |
1362 | } | 1391 | } |
1363 | 1392 | ||
1364 | void pblk_line_replace_data(struct pblk *pblk) | 1393 | struct pblk_line *pblk_line_replace_data(struct pblk *pblk) |
1365 | { | 1394 | { |
1366 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 1395 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
1367 | struct pblk_line *cur, *new; | 1396 | struct pblk_line *cur, *new = NULL; |
1368 | unsigned int left_seblks; | 1397 | unsigned int left_seblks; |
1369 | int is_next = 0; | 1398 | int is_next = 0; |
1370 | 1399 | ||
1371 | cur = l_mg->data_line; | 1400 | cur = l_mg->data_line; |
1372 | new = l_mg->data_next; | 1401 | new = l_mg->data_next; |
1373 | if (!new) | 1402 | if (!new) |
1374 | return; | 1403 | goto out; |
1375 | l_mg->data_line = new; | 1404 | l_mg->data_line = new; |
1376 | 1405 | ||
1377 | spin_lock(&l_mg->free_lock); | 1406 | spin_lock(&l_mg->free_lock); |
@@ -1379,7 +1408,7 @@ void pblk_line_replace_data(struct pblk *pblk) | |||
1379 | l_mg->data_line = NULL; | 1408 | l_mg->data_line = NULL; |
1380 | l_mg->data_next = NULL; | 1409 | l_mg->data_next = NULL; |
1381 | spin_unlock(&l_mg->free_lock); | 1410 | spin_unlock(&l_mg->free_lock); |
1382 | return; | 1411 | goto out; |
1383 | } | 1412 | } |
1384 | 1413 | ||
1385 | pblk_line_setup_metadata(new, l_mg, &pblk->lm); | 1414 | pblk_line_setup_metadata(new, l_mg, &pblk->lm); |
@@ -1391,7 +1420,7 @@ retry_erase: | |||
1391 | /* If line is not fully erased, erase it */ | 1420 | /* If line is not fully erased, erase it */ |
1392 | if (atomic_read(&new->left_eblks)) { | 1421 | if (atomic_read(&new->left_eblks)) { |
1393 | if (pblk_line_erase(pblk, new)) | 1422 | if (pblk_line_erase(pblk, new)) |
1394 | return; | 1423 | goto out; |
1395 | } else { | 1424 | } else { |
1396 | io_schedule(); | 1425 | io_schedule(); |
1397 | } | 1426 | } |
@@ -1402,7 +1431,7 @@ retry_setup: | |||
1402 | if (!pblk_line_init_metadata(pblk, new, cur)) { | 1431 | if (!pblk_line_init_metadata(pblk, new, cur)) { |
1403 | new = pblk_line_retry(pblk, new); | 1432 | new = pblk_line_retry(pblk, new); |
1404 | if (!new) | 1433 | if (!new) |
1405 | return; | 1434 | goto out; |
1406 | 1435 | ||
1407 | goto retry_setup; | 1436 | goto retry_setup; |
1408 | } | 1437 | } |
@@ -1410,7 +1439,7 @@ retry_setup: | |||
1410 | if (!pblk_line_init_bb(pblk, new, 1)) { | 1439 | if (!pblk_line_init_bb(pblk, new, 1)) { |
1411 | new = pblk_line_retry(pblk, new); | 1440 | new = pblk_line_retry(pblk, new); |
1412 | if (!new) | 1441 | if (!new) |
1413 | return; | 1442 | goto out; |
1414 | 1443 | ||
1415 | goto retry_setup; | 1444 | goto retry_setup; |
1416 | } | 1445 | } |
@@ -1434,14 +1463,15 @@ retry_setup: | |||
1434 | 1463 | ||
1435 | if (is_next) | 1464 | if (is_next) |
1436 | pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); | 1465 | pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); |
1466 | |||
1467 | out: | ||
1468 | return new; | ||
1437 | } | 1469 | } |
1438 | 1470 | ||
1439 | void pblk_line_free(struct pblk *pblk, struct pblk_line *line) | 1471 | void pblk_line_free(struct pblk *pblk, struct pblk_line *line) |
1440 | { | 1472 | { |
1441 | if (line->map_bitmap) | 1473 | kfree(line->map_bitmap); |
1442 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 1474 | kfree(line->invalid_bitmap); |
1443 | if (line->invalid_bitmap) | ||
1444 | mempool_free(line->invalid_bitmap, pblk->line_meta_pool); | ||
1445 | 1475 | ||
1446 | *line->vsc = cpu_to_le32(EMPTY_ENTRY); | 1476 | *line->vsc = cpu_to_le32(EMPTY_ENTRY); |
1447 | 1477 | ||
@@ -1451,11 +1481,10 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) | |||
1451 | line->emeta = NULL; | 1481 | line->emeta = NULL; |
1452 | } | 1482 | } |
1453 | 1483 | ||
1454 | void pblk_line_put(struct kref *ref) | 1484 | static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) |
1455 | { | 1485 | { |
1456 | struct pblk_line *line = container_of(ref, struct pblk_line, ref); | ||
1457 | struct pblk *pblk = line->pblk; | ||
1458 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 1486 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
1487 | struct pblk_gc *gc = &pblk->gc; | ||
1459 | 1488 | ||
1460 | spin_lock(&line->lock); | 1489 | spin_lock(&line->lock); |
1461 | WARN_ON(line->state != PBLK_LINESTATE_GC); | 1490 | WARN_ON(line->state != PBLK_LINESTATE_GC); |
@@ -1464,6 +1493,8 @@ void pblk_line_put(struct kref *ref) | |||
1464 | pblk_line_free(pblk, line); | 1493 | pblk_line_free(pblk, line); |
1465 | spin_unlock(&line->lock); | 1494 | spin_unlock(&line->lock); |
1466 | 1495 | ||
1496 | atomic_dec(&gc->pipeline_gc); | ||
1497 | |||
1467 | spin_lock(&l_mg->free_lock); | 1498 | spin_lock(&l_mg->free_lock); |
1468 | list_add_tail(&line->list, &l_mg->free_list); | 1499 | list_add_tail(&line->list, &l_mg->free_list); |
1469 | l_mg->nr_free_lines++; | 1500 | l_mg->nr_free_lines++; |
@@ -1472,13 +1503,49 @@ void pblk_line_put(struct kref *ref) | |||
1472 | pblk_rl_free_lines_inc(&pblk->rl, line); | 1503 | pblk_rl_free_lines_inc(&pblk->rl, line); |
1473 | } | 1504 | } |
1474 | 1505 | ||
1506 | static void pblk_line_put_ws(struct work_struct *work) | ||
1507 | { | ||
1508 | struct pblk_line_ws *line_put_ws = container_of(work, | ||
1509 | struct pblk_line_ws, ws); | ||
1510 | struct pblk *pblk = line_put_ws->pblk; | ||
1511 | struct pblk_line *line = line_put_ws->line; | ||
1512 | |||
1513 | __pblk_line_put(pblk, line); | ||
1514 | mempool_free(line_put_ws, pblk->gen_ws_pool); | ||
1515 | } | ||
1516 | |||
1517 | void pblk_line_put(struct kref *ref) | ||
1518 | { | ||
1519 | struct pblk_line *line = container_of(ref, struct pblk_line, ref); | ||
1520 | struct pblk *pblk = line->pblk; | ||
1521 | |||
1522 | __pblk_line_put(pblk, line); | ||
1523 | } | ||
1524 | |||
1525 | void pblk_line_put_wq(struct kref *ref) | ||
1526 | { | ||
1527 | struct pblk_line *line = container_of(ref, struct pblk_line, ref); | ||
1528 | struct pblk *pblk = line->pblk; | ||
1529 | struct pblk_line_ws *line_put_ws; | ||
1530 | |||
1531 | line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC); | ||
1532 | if (!line_put_ws) | ||
1533 | return; | ||
1534 | |||
1535 | line_put_ws->pblk = pblk; | ||
1536 | line_put_ws->line = line; | ||
1537 | line_put_ws->priv = NULL; | ||
1538 | |||
1539 | INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); | ||
1540 | queue_work(pblk->r_end_wq, &line_put_ws->ws); | ||
1541 | } | ||
1542 | |||
1475 | int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) | 1543 | int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) |
1476 | { | 1544 | { |
1477 | struct nvm_rq *rqd; | 1545 | struct nvm_rq *rqd; |
1478 | int err; | 1546 | int err; |
1479 | 1547 | ||
1480 | rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); | 1548 | rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); |
1481 | memset(rqd, 0, pblk_g_rq_size); | ||
1482 | 1549 | ||
1483 | pblk_setup_e_rq(pblk, rqd, ppa); | 1550 | pblk_setup_e_rq(pblk, rqd, ppa); |
1484 | 1551 | ||
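[Editor's note] The new pblk_line_put_wq() above exists so that contexts which must not sleep (such as read completion) can still drop a line reference: the kref release only queues a work item, and __pblk_line_put() runs later from r_end_wq. The matching kref_get() appears further down in pblk_lookup_l2p_seq(). A generic, self-contained sketch of the same pattern follows; it is not pblk's exact code (pblk allocates the work item from gen_ws_pool, as shown above), and every demo_* name is made up.

#include <linux/kref.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_obj {
        struct kref ref;
        struct work_struct free_ws;
};

static void demo_free_work(struct work_struct *work)
{
        struct demo_obj *obj = container_of(work, struct demo_obj, free_ws);

        /* teardown that may sleep or take sleeping locks is safe here */
        kfree(obj);
}

static void demo_release_wq(struct kref *ref)
{
        struct demo_obj *obj = container_of(ref, struct demo_obj, ref);

        INIT_WORK(&obj->free_ws, demo_free_work);
        schedule_work(&obj->free_ws);
}

/* Atomic-context callers simply do: kref_put(&obj->ref, demo_release_wq); */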
@@ -1517,41 +1584,6 @@ int pblk_line_is_full(struct pblk_line *line) | |||
1517 | return (line->left_msecs == 0); | 1584 | return (line->left_msecs == 0); |
1518 | } | 1585 | } |
1519 | 1586 | ||
1520 | void pblk_line_close_meta_sync(struct pblk *pblk) | ||
1521 | { | ||
1522 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
1523 | struct pblk_line_meta *lm = &pblk->lm; | ||
1524 | struct pblk_line *line, *tline; | ||
1525 | LIST_HEAD(list); | ||
1526 | |||
1527 | spin_lock(&l_mg->close_lock); | ||
1528 | if (list_empty(&l_mg->emeta_list)) { | ||
1529 | spin_unlock(&l_mg->close_lock); | ||
1530 | return; | ||
1531 | } | ||
1532 | |||
1533 | list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); | ||
1534 | spin_unlock(&l_mg->close_lock); | ||
1535 | |||
1536 | list_for_each_entry_safe(line, tline, &list, list) { | ||
1537 | struct pblk_emeta *emeta = line->emeta; | ||
1538 | |||
1539 | while (emeta->mem < lm->emeta_len[0]) { | ||
1540 | int ret; | ||
1541 | |||
1542 | ret = pblk_submit_meta_io(pblk, line); | ||
1543 | if (ret) { | ||
1544 | pr_err("pblk: sync meta line %d failed (%d)\n", | ||
1545 | line->id, ret); | ||
1546 | return; | ||
1547 | } | ||
1548 | } | ||
1549 | } | ||
1550 | |||
1551 | pblk_wait_for_meta(pblk); | ||
1552 | flush_workqueue(pblk->close_wq); | ||
1553 | } | ||
1554 | |||
1555 | static void pblk_line_should_sync_meta(struct pblk *pblk) | 1587 | static void pblk_line_should_sync_meta(struct pblk *pblk) |
1556 | { | 1588 | { |
1557 | if (pblk_rl_is_limit(&pblk->rl)) | 1589 | if (pblk_rl_is_limit(&pblk->rl)) |
@@ -1582,15 +1614,13 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) | |||
1582 | 1614 | ||
1583 | list_add_tail(&line->list, move_list); | 1615 | list_add_tail(&line->list, move_list); |
1584 | 1616 | ||
1585 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 1617 | kfree(line->map_bitmap); |
1586 | line->map_bitmap = NULL; | 1618 | line->map_bitmap = NULL; |
1587 | line->smeta = NULL; | 1619 | line->smeta = NULL; |
1588 | line->emeta = NULL; | 1620 | line->emeta = NULL; |
1589 | 1621 | ||
1590 | spin_unlock(&line->lock); | 1622 | spin_unlock(&line->lock); |
1591 | spin_unlock(&l_mg->gc_lock); | 1623 | spin_unlock(&l_mg->gc_lock); |
1592 | |||
1593 | pblk_gc_should_kick(pblk); | ||
1594 | } | 1624 | } |
1595 | 1625 | ||
1596 | void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) | 1626 | void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) |
@@ -1624,43 +1654,16 @@ void pblk_line_close_ws(struct work_struct *work) | |||
1624 | struct pblk_line *line = line_ws->line; | 1654 | struct pblk_line *line = line_ws->line; |
1625 | 1655 | ||
1626 | pblk_line_close(pblk, line); | 1656 | pblk_line_close(pblk, line); |
1627 | mempool_free(line_ws, pblk->line_ws_pool); | 1657 | mempool_free(line_ws, pblk->gen_ws_pool); |
1628 | } | ||
1629 | |||
1630 | void pblk_line_mark_bb(struct work_struct *work) | ||
1631 | { | ||
1632 | struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, | ||
1633 | ws); | ||
1634 | struct pblk *pblk = line_ws->pblk; | ||
1635 | struct nvm_tgt_dev *dev = pblk->dev; | ||
1636 | struct ppa_addr *ppa = line_ws->priv; | ||
1637 | int ret; | ||
1638 | |||
1639 | ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); | ||
1640 | if (ret) { | ||
1641 | struct pblk_line *line; | ||
1642 | int pos; | ||
1643 | |||
1644 | line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; | ||
1645 | pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); | ||
1646 | |||
1647 | pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", | ||
1648 | line->id, pos); | ||
1649 | } | ||
1650 | |||
1651 | kfree(ppa); | ||
1652 | mempool_free(line_ws, pblk->line_ws_pool); | ||
1653 | } | 1658 | } |
1654 | 1659 | ||
1655 | void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, | 1660 | void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, |
1656 | void (*work)(struct work_struct *), | 1661 | void (*work)(struct work_struct *), gfp_t gfp_mask, |
1657 | struct workqueue_struct *wq) | 1662 | struct workqueue_struct *wq) |
1658 | { | 1663 | { |
1659 | struct pblk_line_ws *line_ws; | 1664 | struct pblk_line_ws *line_ws; |
1660 | 1665 | ||
1661 | line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); | 1666 | line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); |
1662 | if (!line_ws) | ||
1663 | return; | ||
1664 | 1667 | ||
1665 | line_ws->pblk = pblk; | 1668 | line_ws->pblk = pblk; |
1666 | line_ws->line = line; | 1669 | line_ws->line = line; |
@@ -1689,16 +1692,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, | |||
1689 | #endif | 1692 | #endif |
1690 | 1693 | ||
1691 | ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); | 1694 | ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); |
1692 | if (ret) { | 1695 | if (ret == -ETIME || ret == -EINTR) |
1693 | switch (ret) { | 1696 | pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); |
1694 | case -ETIME: | ||
1695 | pr_err("pblk: lun semaphore timed out\n"); | ||
1696 | break; | ||
1697 | case -EINTR: | ||
1698 | pr_err("pblk: lun semaphore timed out\n"); | ||
1699 | break; | ||
1700 | } | ||
1701 | } | ||
1702 | } | 1697 | } |
1703 | 1698 | ||
1704 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) | 1699 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) |
@@ -1758,13 +1753,11 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, | |||
1758 | rlun = &pblk->luns[bit]; | 1753 | rlun = &pblk->luns[bit]; |
1759 | up(&rlun->wr_sem); | 1754 | up(&rlun->wr_sem); |
1760 | } | 1755 | } |
1761 | |||
1762 | kfree(lun_bitmap); | ||
1763 | } | 1756 | } |
1764 | 1757 | ||
1765 | void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | 1758 | void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) |
1766 | { | 1759 | { |
1767 | struct ppa_addr l2p_ppa; | 1760 | struct ppa_addr ppa_l2p; |
1768 | 1761 | ||
1769 | /* logic error: lba out-of-bounds. Ignore update */ | 1762 | /* logic error: lba out-of-bounds. Ignore update */ |
1770 | if (!(lba < pblk->rl.nr_secs)) { | 1763 | if (!(lba < pblk->rl.nr_secs)) { |
@@ -1773,10 +1766,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | |||
1773 | } | 1766 | } |
1774 | 1767 | ||
1775 | spin_lock(&pblk->trans_lock); | 1768 | spin_lock(&pblk->trans_lock); |
1776 | l2p_ppa = pblk_trans_map_get(pblk, lba); | 1769 | ppa_l2p = pblk_trans_map_get(pblk, lba); |
1777 | 1770 | ||
1778 | if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa)) | 1771 | if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) |
1779 | pblk_map_invalidate(pblk, l2p_ppa); | 1772 | pblk_map_invalidate(pblk, ppa_l2p); |
1780 | 1773 | ||
1781 | pblk_trans_map_set(pblk, lba, ppa); | 1774 | pblk_trans_map_set(pblk, lba, ppa); |
1782 | spin_unlock(&pblk->trans_lock); | 1775 | spin_unlock(&pblk->trans_lock); |
@@ -1784,6 +1777,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | |||
1784 | 1777 | ||
1785 | void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | 1778 | void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) |
1786 | { | 1779 | { |
1780 | |||
1787 | #ifdef CONFIG_NVM_DEBUG | 1781 | #ifdef CONFIG_NVM_DEBUG |
1788 | /* Callers must ensure that the ppa points to a cache address */ | 1782 | /* Callers must ensure that the ppa points to a cache address */ |
1789 | BUG_ON(!pblk_addr_in_cache(ppa)); | 1783 | BUG_ON(!pblk_addr_in_cache(ppa)); |
@@ -1793,16 +1787,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | |||
1793 | pblk_update_map(pblk, lba, ppa); | 1787 | pblk_update_map(pblk, lba, ppa); |
1794 | } | 1788 | } |
1795 | 1789 | ||
1796 | int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, | 1790 | int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, |
1797 | struct pblk_line *gc_line) | 1791 | struct pblk_line *gc_line, u64 paddr_gc) |
1798 | { | 1792 | { |
1799 | struct ppa_addr l2p_ppa; | 1793 | struct ppa_addr ppa_l2p, ppa_gc; |
1800 | int ret = 1; | 1794 | int ret = 1; |
1801 | 1795 | ||
1802 | #ifdef CONFIG_NVM_DEBUG | 1796 | #ifdef CONFIG_NVM_DEBUG |
1803 | /* Callers must ensure that the ppa points to a cache address */ | 1797 | /* Callers must ensure that the ppa points to a cache address */ |
1804 | BUG_ON(!pblk_addr_in_cache(ppa)); | 1798 | BUG_ON(!pblk_addr_in_cache(ppa_new)); |
1805 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); | 1799 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); |
1806 | #endif | 1800 | #endif |
1807 | 1801 | ||
1808 | /* logic error: lba out-of-bounds. Ignore update */ | 1802 | /* logic error: lba out-of-bounds. Ignore update */ |
@@ -1812,36 +1806,41 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, | |||
1812 | } | 1806 | } |
1813 | 1807 | ||
1814 | spin_lock(&pblk->trans_lock); | 1808 | spin_lock(&pblk->trans_lock); |
1815 | l2p_ppa = pblk_trans_map_get(pblk, lba); | 1809 | ppa_l2p = pblk_trans_map_get(pblk, lba); |
1810 | ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); | ||
1811 | |||
1812 | if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { | ||
1813 | spin_lock(&gc_line->lock); | ||
1814 | WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), | ||
1815 | "pblk: corrupted GC update"); | ||
1816 | spin_unlock(&gc_line->lock); | ||
1816 | 1817 | ||
1817 | /* Prevent updated entries to be overwritten by GC */ | ||
1818 | if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) || | ||
1819 | pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) { | ||
1820 | ret = 0; | 1818 | ret = 0; |
1821 | goto out; | 1819 | goto out; |
1822 | } | 1820 | } |
1823 | 1821 | ||
1824 | pblk_trans_map_set(pblk, lba, ppa); | 1822 | pblk_trans_map_set(pblk, lba, ppa_new); |
1825 | out: | 1823 | out: |
1826 | spin_unlock(&pblk->trans_lock); | 1824 | spin_unlock(&pblk->trans_lock); |
1827 | return ret; | 1825 | return ret; |
1828 | } | 1826 | } |
1829 | 1827 | ||
1830 | void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, | 1828 | void pblk_update_map_dev(struct pblk *pblk, sector_t lba, |
1831 | struct ppa_addr entry_line) | 1829 | struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) |
1832 | { | 1830 | { |
1833 | struct ppa_addr l2p_line; | 1831 | struct ppa_addr ppa_l2p; |
1834 | 1832 | ||
1835 | #ifdef CONFIG_NVM_DEBUG | 1833 | #ifdef CONFIG_NVM_DEBUG |
1836 | /* Callers must ensure that the ppa points to a device address */ | 1834 | /* Callers must ensure that the ppa points to a device address */ |
1837 | BUG_ON(pblk_addr_in_cache(ppa)); | 1835 | BUG_ON(pblk_addr_in_cache(ppa_mapped)); |
1838 | #endif | 1836 | #endif |
1839 | /* Invalidate and discard padded entries */ | 1837 | /* Invalidate and discard padded entries */ |
1840 | if (lba == ADDR_EMPTY) { | 1838 | if (lba == ADDR_EMPTY) { |
1841 | #ifdef CONFIG_NVM_DEBUG | 1839 | #ifdef CONFIG_NVM_DEBUG |
1842 | atomic_long_inc(&pblk->padded_wb); | 1840 | atomic_long_inc(&pblk->padded_wb); |
1843 | #endif | 1841 | #endif |
1844 | pblk_map_invalidate(pblk, ppa); | 1842 | if (!pblk_ppa_empty(ppa_mapped)) |
1843 | pblk_map_invalidate(pblk, ppa_mapped); | ||
1845 | return; | 1844 | return; |
1846 | } | 1845 | } |
1847 | 1846 | ||
@@ -1852,22 +1851,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, | |||
1852 | } | 1851 | } |
1853 | 1852 | ||
1854 | spin_lock(&pblk->trans_lock); | 1853 | spin_lock(&pblk->trans_lock); |
1855 | l2p_line = pblk_trans_map_get(pblk, lba); | 1854 | ppa_l2p = pblk_trans_map_get(pblk, lba); |
1856 | 1855 | ||
1857 | /* Do not update L2P if the cacheline has been updated. In this case, | 1856 | /* Do not update L2P if the cacheline has been updated. In this case, |
1858 | * the mapped ppa must be invalidated | 1857 | * the mapped ppa must be invalidated |
1859 | */ | 1858 | */ |
1860 | if (l2p_line.ppa != entry_line.ppa) { | 1859 | if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { |
1861 | if (!pblk_ppa_empty(ppa)) | 1860 | if (!pblk_ppa_empty(ppa_mapped)) |
1862 | pblk_map_invalidate(pblk, ppa); | 1861 | pblk_map_invalidate(pblk, ppa_mapped); |
1863 | goto out; | 1862 | goto out; |
1864 | } | 1863 | } |
1865 | 1864 | ||
1866 | #ifdef CONFIG_NVM_DEBUG | 1865 | #ifdef CONFIG_NVM_DEBUG |
1867 | WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); | 1866 | WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); |
1868 | #endif | 1867 | #endif |
1869 | 1868 | ||
1870 | pblk_trans_map_set(pblk, lba, ppa); | 1869 | pblk_trans_map_set(pblk, lba, ppa_mapped); |
1871 | out: | 1870 | out: |
1872 | spin_unlock(&pblk->trans_lock); | 1871 | spin_unlock(&pblk->trans_lock); |
1873 | } | 1872 | } |
@@ -1878,23 +1877,32 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, | |||
1878 | int i; | 1877 | int i; |
1879 | 1878 | ||
1880 | spin_lock(&pblk->trans_lock); | 1879 | spin_lock(&pblk->trans_lock); |
1881 | for (i = 0; i < nr_secs; i++) | 1880 | for (i = 0; i < nr_secs; i++) { |
1882 | ppas[i] = pblk_trans_map_get(pblk, blba + i); | 1881 | struct ppa_addr ppa; |
1882 | |||
1883 | ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); | ||
1884 | |||
1885 | /* If the L2P entry maps to a line, the reference is valid */ | ||
1886 | if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { | ||
1887 | int line_id = pblk_dev_ppa_to_line(ppa); | ||
1888 | struct pblk_line *line = &pblk->lines[line_id]; | ||
1889 | |||
1890 | kref_get(&line->ref); | ||
1891 | } | ||
1892 | } | ||
1883 | spin_unlock(&pblk->trans_lock); | 1893 | spin_unlock(&pblk->trans_lock); |
1884 | } | 1894 | } |
1885 | 1895 | ||
1886 | void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, | 1896 | void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, |
1887 | u64 *lba_list, int nr_secs) | 1897 | u64 *lba_list, int nr_secs) |
1888 | { | 1898 | { |
1889 | sector_t lba; | 1899 | u64 lba; |
1890 | int i; | 1900 | int i; |
1891 | 1901 | ||
1892 | spin_lock(&pblk->trans_lock); | 1902 | spin_lock(&pblk->trans_lock); |
1893 | for (i = 0; i < nr_secs; i++) { | 1903 | for (i = 0; i < nr_secs; i++) { |
1894 | lba = lba_list[i]; | 1904 | lba = lba_list[i]; |
1895 | if (lba == ADDR_EMPTY) { | 1905 | if (lba != ADDR_EMPTY) { |
1896 | ppas[i].ppa = ADDR_EMPTY; | ||
1897 | } else { | ||
1898 | /* logic error: lba out-of-bounds. Ignore update */ | 1906 | /* logic error: lba out-of-bounds. Ignore update */ |
1899 | if (!(lba < pblk->rl.nr_secs)) { | 1907 | if (!(lba < pblk->rl.nr_secs)) { |
1900 | WARN(1, "pblk: corrupted L2P map request\n"); | 1908 | WARN(1, "pblk: corrupted L2P map request\n"); |
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 6090d28f7995..00d5698d64a9 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -20,7 +20,8 @@ | |||
20 | 20 | ||
21 | static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) | 21 | static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) |
22 | { | 22 | { |
23 | vfree(gc_rq->data); | 23 | if (gc_rq->data) |
24 | vfree(gc_rq->data); | ||
24 | kfree(gc_rq); | 25 | kfree(gc_rq); |
25 | } | 26 | } |
26 | 27 | ||
@@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk) | |||
41 | spin_unlock(&gc->w_lock); | 42 | spin_unlock(&gc->w_lock); |
42 | 43 | ||
43 | list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { | 44 | list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { |
44 | pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, | 45 | pblk_write_gc_to_cache(pblk, gc_rq); |
45 | gc_rq->nr_secs, gc_rq->secs_to_gc, | ||
46 | gc_rq->line, PBLK_IOTYPE_GC); | ||
47 | |||
48 | list_del(&gc_rq->list); | 46 | list_del(&gc_rq->list); |
49 | kref_put(&gc_rq->line->ref, pblk_line_put); | 47 | kref_put(&gc_rq->line->ref, pblk_line_put); |
50 | pblk_gc_free_gc_rq(gc_rq); | 48 | pblk_gc_free_gc_rq(gc_rq); |
@@ -58,42 +56,59 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) | |||
58 | wake_up_process(gc->gc_writer_ts); | 56 | wake_up_process(gc->gc_writer_ts); |
59 | } | 57 | } |
60 | 58 | ||
61 | /* | 59 | static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) |
62 | * Responsible for managing all memory related to a gc request. Also in case of | 60 | { |
63 | * failure | 61 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
64 | */ | 62 | struct list_head *move_list; |
65 | static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) | 63 | |
64 | spin_lock(&line->lock); | ||
65 | WARN_ON(line->state != PBLK_LINESTATE_GC); | ||
66 | line->state = PBLK_LINESTATE_CLOSED; | ||
67 | move_list = pblk_line_gc_list(pblk, line); | ||
68 | spin_unlock(&line->lock); | ||
69 | |||
70 | if (move_list) { | ||
71 | spin_lock(&l_mg->gc_lock); | ||
72 | list_add_tail(&line->list, move_list); | ||
73 | spin_unlock(&l_mg->gc_lock); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | static void pblk_gc_line_ws(struct work_struct *work) | ||
66 | { | 78 | { |
79 | struct pblk_line_ws *gc_rq_ws = container_of(work, | ||
80 | struct pblk_line_ws, ws); | ||
81 | struct pblk *pblk = gc_rq_ws->pblk; | ||
67 | struct nvm_tgt_dev *dev = pblk->dev; | 82 | struct nvm_tgt_dev *dev = pblk->dev; |
68 | struct nvm_geo *geo = &dev->geo; | 83 | struct nvm_geo *geo = &dev->geo; |
69 | struct pblk_gc *gc = &pblk->gc; | 84 | struct pblk_gc *gc = &pblk->gc; |
70 | struct pblk_line *line = gc_rq->line; | 85 | struct pblk_line *line = gc_rq_ws->line; |
71 | void *data; | 86 | struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; |
72 | unsigned int secs_to_gc; | 87 | int ret; |
73 | int ret = 0; | ||
74 | 88 | ||
75 | data = vmalloc(gc_rq->nr_secs * geo->sec_size); | 89 | up(&gc->gc_sem); |
76 | if (!data) { | 90 | |
77 | ret = -ENOMEM; | 91 | gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size); |
92 | if (!gc_rq->data) { | ||
93 | pr_err("pblk: could not GC line:%d (%d/%d)\n", | ||
94 | line->id, *line->vsc, gc_rq->nr_secs); | ||
78 | goto out; | 95 | goto out; |
79 | } | 96 | } |
80 | 97 | ||
81 | /* Read from GC victim block */ | 98 | /* Read from GC victim block */ |
82 | if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, | 99 | ret = pblk_submit_read_gc(pblk, gc_rq); |
83 | &secs_to_gc, line)) { | 100 | if (ret) { |
84 | ret = -EFAULT; | 101 | pr_err("pblk: failed GC read in line:%d (err:%d)\n", |
85 | goto free_data; | 102 | line->id, ret); |
103 | goto out; | ||
86 | } | 104 | } |
87 | 105 | ||
88 | if (!secs_to_gc) | 106 | if (!gc_rq->secs_to_gc) |
89 | goto free_rq; | 107 | goto out; |
90 | |||
91 | gc_rq->data = data; | ||
92 | gc_rq->secs_to_gc = secs_to_gc; | ||
93 | 108 | ||
94 | retry: | 109 | retry: |
95 | spin_lock(&gc->w_lock); | 110 | spin_lock(&gc->w_lock); |
96 | if (gc->w_entries >= PBLK_GC_W_QD) { | 111 | if (gc->w_entries >= PBLK_GC_RQ_QD) { |
97 | spin_unlock(&gc->w_lock); | 112 | spin_unlock(&gc->w_lock); |
98 | pblk_gc_writer_kick(&pblk->gc); | 113 | pblk_gc_writer_kick(&pblk->gc); |
99 | usleep_range(128, 256); | 114 | usleep_range(128, 256); |
@@ -105,53 +120,13 @@ retry: | |||
105 | 120 | ||
106 | pblk_gc_writer_kick(&pblk->gc); | 121 | pblk_gc_writer_kick(&pblk->gc); |
107 | 122 | ||
108 | return 0; | 123 | kfree(gc_rq_ws); |
124 | return; | ||
109 | 125 | ||
110 | free_rq: | ||
111 | kfree(gc_rq); | ||
112 | free_data: | ||
113 | vfree(data); | ||
114 | out: | 126 | out: |
127 | pblk_gc_free_gc_rq(gc_rq); | ||
115 | kref_put(&line->ref, pblk_line_put); | 128 | kref_put(&line->ref, pblk_line_put); |
116 | return ret; | 129 | kfree(gc_rq_ws); |
117 | } | ||
118 | |||
119 | static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) | ||
120 | { | ||
121 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
122 | struct list_head *move_list; | ||
123 | |||
124 | spin_lock(&line->lock); | ||
125 | WARN_ON(line->state != PBLK_LINESTATE_GC); | ||
126 | line->state = PBLK_LINESTATE_CLOSED; | ||
127 | move_list = pblk_line_gc_list(pblk, line); | ||
128 | spin_unlock(&line->lock); | ||
129 | |||
130 | if (move_list) { | ||
131 | spin_lock(&l_mg->gc_lock); | ||
132 | list_add_tail(&line->list, move_list); | ||
133 | spin_unlock(&l_mg->gc_lock); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static void pblk_gc_line_ws(struct work_struct *work) | ||
138 | { | ||
139 | struct pblk_line_ws *line_rq_ws = container_of(work, | ||
140 | struct pblk_line_ws, ws); | ||
141 | struct pblk *pblk = line_rq_ws->pblk; | ||
142 | struct pblk_gc *gc = &pblk->gc; | ||
143 | struct pblk_line *line = line_rq_ws->line; | ||
144 | struct pblk_gc_rq *gc_rq = line_rq_ws->priv; | ||
145 | |||
146 | up(&gc->gc_sem); | ||
147 | |||
148 | if (pblk_gc_move_valid_secs(pblk, gc_rq)) { | ||
149 | pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", | ||
150 | line->id, *line->vsc, | ||
151 | gc_rq->nr_secs); | ||
152 | } | ||
153 | |||
154 | mempool_free(line_rq_ws, pblk->line_ws_pool); | ||
155 | } | 130 | } |
156 | 131 | ||
157 | static void pblk_gc_line_prepare_ws(struct work_struct *work) | 132 | static void pblk_gc_line_prepare_ws(struct work_struct *work) |
@@ -164,17 +139,24 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) | |||
164 | struct pblk_line_meta *lm = &pblk->lm; | 139 | struct pblk_line_meta *lm = &pblk->lm; |
165 | struct pblk_gc *gc = &pblk->gc; | 140 | struct pblk_gc *gc = &pblk->gc; |
166 | struct line_emeta *emeta_buf; | 141 | struct line_emeta *emeta_buf; |
167 | struct pblk_line_ws *line_rq_ws; | 142 | struct pblk_line_ws *gc_rq_ws; |
168 | struct pblk_gc_rq *gc_rq; | 143 | struct pblk_gc_rq *gc_rq; |
169 | __le64 *lba_list; | 144 | __le64 *lba_list; |
145 | unsigned long *invalid_bitmap; | ||
170 | int sec_left, nr_secs, bit; | 146 | int sec_left, nr_secs, bit; |
171 | int ret; | 147 | int ret; |
172 | 148 | ||
149 | invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); | ||
150 | if (!invalid_bitmap) { | ||
151 | pr_err("pblk: could not allocate GC invalid bitmap\n"); | ||
152 | goto fail_free_ws; | ||
153 | } | ||
154 | |||
173 | emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, | 155 | emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, |
174 | GFP_KERNEL); | 156 | GFP_KERNEL); |
175 | if (!emeta_buf) { | 157 | if (!emeta_buf) { |
176 | pr_err("pblk: cannot use GC emeta\n"); | 158 | pr_err("pblk: cannot use GC emeta\n"); |
177 | return; | 159 | goto fail_free_bitmap; |
178 | } | 160 | } |
179 | 161 | ||
180 | ret = pblk_line_read_emeta(pblk, line, emeta_buf); | 162 | ret = pblk_line_read_emeta(pblk, line, emeta_buf); |
@@ -193,7 +175,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) | |||
193 | goto fail_free_emeta; | 175 | goto fail_free_emeta; |
194 | } | 176 | } |
195 | 177 | ||
178 | spin_lock(&line->lock); | ||
179 | bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); | ||
196 | sec_left = pblk_line_vsc(line); | 180 | sec_left = pblk_line_vsc(line); |
181 | spin_unlock(&line->lock); | ||
182 | |||
197 | if (sec_left < 0) { | 183 | if (sec_left < 0) { |
198 | pr_err("pblk: corrupted GC line (%d)\n", line->id); | 184 | pr_err("pblk: corrupted GC line (%d)\n", line->id); |
199 | goto fail_free_emeta; | 185 | goto fail_free_emeta; |
@@ -207,11 +193,12 @@ next_rq: | |||
207 | 193 | ||
208 | nr_secs = 0; | 194 | nr_secs = 0; |
209 | do { | 195 | do { |
210 | bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, | 196 | bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, |
211 | bit + 1); | 197 | bit + 1); |
212 | if (bit > line->emeta_ssec) | 198 | if (bit > line->emeta_ssec) |
213 | break; | 199 | break; |
214 | 200 | ||
201 | gc_rq->paddr_list[nr_secs] = bit; | ||
215 | gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); | 202 | gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); |
216 | } while (nr_secs < pblk->max_write_pgs); | 203 | } while (nr_secs < pblk->max_write_pgs); |
217 | 204 | ||
@@ -223,19 +210,25 @@ next_rq: | |||
223 | gc_rq->nr_secs = nr_secs; | 210 | gc_rq->nr_secs = nr_secs; |
224 | gc_rq->line = line; | 211 | gc_rq->line = line; |
225 | 212 | ||
226 | line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); | 213 | gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); |
227 | if (!line_rq_ws) | 214 | if (!gc_rq_ws) |
228 | goto fail_free_gc_rq; | 215 | goto fail_free_gc_rq; |
229 | 216 | ||
230 | line_rq_ws->pblk = pblk; | 217 | gc_rq_ws->pblk = pblk; |
231 | line_rq_ws->line = line; | 218 | gc_rq_ws->line = line; |
232 | line_rq_ws->priv = gc_rq; | 219 | gc_rq_ws->priv = gc_rq; |
220 | |||
221 | /* The write GC path can be much slower than the read GC one due to | ||
222 | * the budget imposed by the rate-limiter. Balance in case that we get | ||
223 | * back pressure from the write GC path. | ||
224 | */ | ||
225 | while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) | ||
226 | io_schedule(); | ||
233 | 227 | ||
234 | down(&gc->gc_sem); | ||
235 | kref_get(&line->ref); | 228 | kref_get(&line->ref); |
236 | 229 | ||
237 | INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); | 230 | INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); |
238 | queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); | 231 | queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); |
239 | 232 | ||
240 | sec_left -= nr_secs; | 233 | sec_left -= nr_secs; |
241 | if (sec_left > 0) | 234 | if (sec_left > 0) |
@@ -243,10 +236,11 @@ next_rq: | |||
243 | 236 | ||
244 | out: | 237 | out: |
245 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); | 238 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); |
246 | mempool_free(line_ws, pblk->line_ws_pool); | 239 | kfree(line_ws); |
240 | kfree(invalid_bitmap); | ||
247 | 241 | ||
248 | kref_put(&line->ref, pblk_line_put); | 242 | kref_put(&line->ref, pblk_line_put); |
249 | atomic_dec(&gc->inflight_gc); | 243 | atomic_dec(&gc->read_inflight_gc); |
250 | 244 | ||
251 | return; | 245 | return; |
252 | 246 | ||
@@ -254,10 +248,14 @@ fail_free_gc_rq: | |||
254 | kfree(gc_rq); | 248 | kfree(gc_rq); |
255 | fail_free_emeta: | 249 | fail_free_emeta: |
256 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); | 250 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); |
251 | fail_free_bitmap: | ||
252 | kfree(invalid_bitmap); | ||
253 | fail_free_ws: | ||
254 | kfree(line_ws); | ||
255 | |||
257 | pblk_put_line_back(pblk, line); | 256 | pblk_put_line_back(pblk, line); |
258 | kref_put(&line->ref, pblk_line_put); | 257 | kref_put(&line->ref, pblk_line_put); |
259 | mempool_free(line_ws, pblk->line_ws_pool); | 258 | atomic_dec(&gc->read_inflight_gc); |
260 | atomic_dec(&gc->inflight_gc); | ||
261 | 259 | ||
262 | pr_err("pblk: Failed to GC line %d\n", line->id); | 260 | pr_err("pblk: Failed to GC line %d\n", line->id); |
263 | } | 261 | } |
@@ -269,19 +267,40 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) | |||
269 | 267 | ||
270 | pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); | 268 | pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); |
271 | 269 | ||
272 | line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); | 270 | line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); |
273 | if (!line_ws) | 271 | if (!line_ws) |
274 | return -ENOMEM; | 272 | return -ENOMEM; |
275 | 273 | ||
276 | line_ws->pblk = pblk; | 274 | line_ws->pblk = pblk; |
277 | line_ws->line = line; | 275 | line_ws->line = line; |
278 | 276 | ||
277 | atomic_inc(&gc->pipeline_gc); | ||
279 | INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); | 278 | INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); |
280 | queue_work(gc->gc_reader_wq, &line_ws->ws); | 279 | queue_work(gc->gc_reader_wq, &line_ws->ws); |
281 | 280 | ||
282 | return 0; | 281 | return 0; |
283 | } | 282 | } |
284 | 283 | ||
284 | static void pblk_gc_reader_kick(struct pblk_gc *gc) | ||
285 | { | ||
286 | wake_up_process(gc->gc_reader_ts); | ||
287 | } | ||
288 | |||
289 | static void pblk_gc_kick(struct pblk *pblk) | ||
290 | { | ||
291 | struct pblk_gc *gc = &pblk->gc; | ||
292 | |||
293 | pblk_gc_writer_kick(gc); | ||
294 | pblk_gc_reader_kick(gc); | ||
295 | |||
296 | /* If we're shutting down GC, let's not start it up again */ | ||
297 | if (gc->gc_enabled) { | ||
298 | wake_up_process(gc->gc_ts); | ||
299 | mod_timer(&gc->gc_timer, | ||
300 | jiffies + msecs_to_jiffies(GC_TIME_MSECS)); | ||
301 | } | ||
302 | } | ||
303 | |||
285 | static int pblk_gc_read(struct pblk *pblk) | 304 | static int pblk_gc_read(struct pblk *pblk) |
286 | { | 305 | { |
287 | struct pblk_gc *gc = &pblk->gc; | 306 | struct pblk_gc *gc = &pblk->gc; |
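[Editor's note] pblk_gc_kick() above now re-arms the GC timer only while gc_enabled is set, which lets pblk_gc_exit() (further down) clear the flag and call del_timer_sync() without racing against a fresh mod_timer(). Below is a minimal, stand-alone sketch of that shutdown-safe self-rearming timer, using hypothetical demo_* names and the pre-4.15 timer API seen in this file.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;
static bool demo_enabled = true;

static void demo_timer_fn(unsigned long data)
{
        /* ... wake the worker threads here ... */

        /* only re-arm while the subsystem is still enabled */
        if (demo_enabled)
                mod_timer(&demo_timer, jiffies + msecs_to_jiffies(5000));
}

static void demo_start(void)
{
        setup_timer(&demo_timer, demo_timer_fn, 0);
        mod_timer(&demo_timer, jiffies + msecs_to_jiffies(5000));
}

static void demo_stop(void)
{
        demo_enabled = false;           /* stop future re-arms first */
        del_timer_sync(&demo_timer);    /* then wait out any running callback */
}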
@@ -305,11 +324,6 @@ static int pblk_gc_read(struct pblk *pblk) | |||
305 | return 0; | 324 | return 0; |
306 | } | 325 | } |
307 | 326 | ||
308 | static void pblk_gc_reader_kick(struct pblk_gc *gc) | ||
309 | { | ||
310 | wake_up_process(gc->gc_reader_ts); | ||
311 | } | ||
312 | |||
313 | static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, | 327 | static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, |
314 | struct list_head *group_list) | 328 | struct list_head *group_list) |
315 | { | 329 | { |
@@ -338,26 +352,17 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) | |||
338 | return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); | 352 | return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); |
339 | } | 353 | } |
340 | 354 | ||
341 | /* | 355 | void pblk_gc_free_full_lines(struct pblk *pblk) |
342 | * Lines with no valid sectors will be returned to the free list immediately. If | ||
343 | * GC is activated - either because the free block count is under the determined | ||
344 | * threshold, or because it is being forced from user space - only lines with a | ||
345 | * high count of invalid sectors will be recycled. | ||
346 | */ | ||
347 | static void pblk_gc_run(struct pblk *pblk) | ||
348 | { | 356 | { |
349 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 357 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
350 | struct pblk_gc *gc = &pblk->gc; | 358 | struct pblk_gc *gc = &pblk->gc; |
351 | struct pblk_line *line; | 359 | struct pblk_line *line; |
352 | struct list_head *group_list; | ||
353 | bool run_gc; | ||
354 | int inflight_gc, gc_group = 0, prev_group = 0; | ||
355 | 360 | ||
356 | do { | 361 | do { |
357 | spin_lock(&l_mg->gc_lock); | 362 | spin_lock(&l_mg->gc_lock); |
358 | if (list_empty(&l_mg->gc_full_list)) { | 363 | if (list_empty(&l_mg->gc_full_list)) { |
359 | spin_unlock(&l_mg->gc_lock); | 364 | spin_unlock(&l_mg->gc_lock); |
360 | break; | 365 | return; |
361 | } | 366 | } |
362 | 367 | ||
363 | line = list_first_entry(&l_mg->gc_full_list, | 368 | line = list_first_entry(&l_mg->gc_full_list, |
@@ -371,11 +376,30 @@ static void pblk_gc_run(struct pblk *pblk) | |||
371 | list_del(&line->list); | 376 | list_del(&line->list); |
372 | spin_unlock(&l_mg->gc_lock); | 377 | spin_unlock(&l_mg->gc_lock); |
373 | 378 | ||
379 | atomic_inc(&gc->pipeline_gc); | ||
374 | kref_put(&line->ref, pblk_line_put); | 380 | kref_put(&line->ref, pblk_line_put); |
375 | } while (1); | 381 | } while (1); |
382 | } | ||
383 | |||
384 | /* | ||
385 | * Lines with no valid sectors will be returned to the free list immediately. If | ||
386 | * GC is activated - either because the free block count is under the determined | ||
387 | * threshold, or because it is being forced from user space - only lines with a | ||
388 | * high count of invalid sectors will be recycled. | ||
389 | */ | ||
390 | static void pblk_gc_run(struct pblk *pblk) | ||
391 | { | ||
392 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
393 | struct pblk_gc *gc = &pblk->gc; | ||
394 | struct pblk_line *line; | ||
395 | struct list_head *group_list; | ||
396 | bool run_gc; | ||
397 | int read_inflight_gc, gc_group = 0, prev_group = 0; | ||
398 | |||
399 | pblk_gc_free_full_lines(pblk); | ||
376 | 400 | ||
377 | run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); | 401 | run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); |
378 | if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) | 402 | if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) |
379 | return; | 403 | return; |
380 | 404 | ||
381 | next_gc_group: | 405 | next_gc_group: |
@@ -402,14 +426,14 @@ next_gc_group: | |||
402 | list_add_tail(&line->list, &gc->r_list); | 426 | list_add_tail(&line->list, &gc->r_list); |
403 | spin_unlock(&gc->r_lock); | 427 | spin_unlock(&gc->r_lock); |
404 | 428 | ||
405 | inflight_gc = atomic_inc_return(&gc->inflight_gc); | 429 | read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); |
406 | pblk_gc_reader_kick(gc); | 430 | pblk_gc_reader_kick(gc); |
407 | 431 | ||
408 | prev_group = 1; | 432 | prev_group = 1; |
409 | 433 | ||
410 | /* No need to queue up more GC lines than we can handle */ | 434 | /* No need to queue up more GC lines than we can handle */ |
411 | run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); | 435 | run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); |
412 | if (!run_gc || inflight_gc >= PBLK_GC_L_QD) | 436 | if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) |
413 | break; | 437 | break; |
414 | } while (1); | 438 | } while (1); |
415 | 439 | ||
@@ -418,16 +442,6 @@ next_gc_group: | |||
418 | goto next_gc_group; | 442 | goto next_gc_group; |
419 | } | 443 | } |
420 | 444 | ||
421 | void pblk_gc_kick(struct pblk *pblk) | ||
422 | { | ||
423 | struct pblk_gc *gc = &pblk->gc; | ||
424 | |||
425 | wake_up_process(gc->gc_ts); | ||
426 | pblk_gc_writer_kick(gc); | ||
427 | pblk_gc_reader_kick(gc); | ||
428 | mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); | ||
429 | } | ||
430 | |||
431 | static void pblk_gc_timer(unsigned long data) | 445 | static void pblk_gc_timer(unsigned long data) |
432 | { | 446 | { |
433 | struct pblk *pblk = (struct pblk *)data; | 447 | struct pblk *pblk = (struct pblk *)data; |
@@ -465,6 +479,7 @@ static int pblk_gc_writer_ts(void *data) | |||
465 | static int pblk_gc_reader_ts(void *data) | 479 | static int pblk_gc_reader_ts(void *data) |
466 | { | 480 | { |
467 | struct pblk *pblk = data; | 481 | struct pblk *pblk = data; |
482 | struct pblk_gc *gc = &pblk->gc; | ||
468 | 483 | ||
469 | while (!kthread_should_stop()) { | 484 | while (!kthread_should_stop()) { |
470 | if (!pblk_gc_read(pblk)) | 485 | if (!pblk_gc_read(pblk)) |
@@ -473,6 +488,18 @@ static int pblk_gc_reader_ts(void *data) | |||
473 | io_schedule(); | 488 | io_schedule(); |
474 | } | 489 | } |
475 | 490 | ||
491 | #ifdef CONFIG_NVM_DEBUG | ||
492 | pr_info("pblk: flushing gc pipeline, %d lines left\n", | ||
493 | atomic_read(&gc->pipeline_gc)); | ||
494 | #endif | ||
495 | |||
496 | do { | ||
497 | if (!atomic_read(&gc->pipeline_gc)) | ||
498 | break; | ||
499 | |||
500 | schedule(); | ||
501 | } while (1); | ||
502 | |||
476 | return 0; | 503 | return 0; |
477 | } | 504 | } |
478 | 505 | ||
@@ -486,10 +513,10 @@ void pblk_gc_should_start(struct pblk *pblk) | |||
486 | { | 513 | { |
487 | struct pblk_gc *gc = &pblk->gc; | 514 | struct pblk_gc *gc = &pblk->gc; |
488 | 515 | ||
489 | if (gc->gc_enabled && !gc->gc_active) | 516 | if (gc->gc_enabled && !gc->gc_active) { |
490 | pblk_gc_start(pblk); | 517 | pblk_gc_start(pblk); |
491 | 518 | pblk_gc_kick(pblk); | |
492 | pblk_gc_kick(pblk); | 519 | } |
493 | } | 520 | } |
494 | 521 | ||
495 | /* | 522 | /* |
@@ -510,6 +537,11 @@ void pblk_gc_should_stop(struct pblk *pblk) | |||
510 | pblk_gc_stop(pblk, 0); | 537 | pblk_gc_stop(pblk, 0); |
511 | } | 538 | } |
512 | 539 | ||
540 | void pblk_gc_should_kick(struct pblk *pblk) | ||
541 | { | ||
542 | pblk_rl_update_rates(&pblk->rl); | ||
543 | } | ||
544 | |||
513 | void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, | 545 | void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, |
514 | int *gc_active) | 546 | int *gc_active) |
515 | { | 547 | { |
@@ -576,7 +608,8 @@ int pblk_gc_init(struct pblk *pblk) | |||
576 | gc->gc_forced = 0; | 608 | gc->gc_forced = 0; |
577 | gc->gc_enabled = 1; | 609 | gc->gc_enabled = 1; |
578 | gc->w_entries = 0; | 610 | gc->w_entries = 0; |
579 | atomic_set(&gc->inflight_gc, 0); | 611 | atomic_set(&gc->read_inflight_gc, 0); |
612 | atomic_set(&gc->pipeline_gc, 0); | ||
580 | 613 | ||
581 | /* Workqueue that reads valid sectors from a line and submit them to the | 614 | /* Workqueue that reads valid sectors from a line and submit them to the |
582 | * GC writer to be recycled. | 615 | * GC writer to be recycled. |
@@ -602,7 +635,7 @@ int pblk_gc_init(struct pblk *pblk) | |||
602 | spin_lock_init(&gc->w_lock); | 635 | spin_lock_init(&gc->w_lock); |
603 | spin_lock_init(&gc->r_lock); | 636 | spin_lock_init(&gc->r_lock); |
604 | 637 | ||
605 | sema_init(&gc->gc_sem, 128); | 638 | sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); |
606 | 639 | ||
607 | INIT_LIST_HEAD(&gc->w_list); | 640 | INIT_LIST_HEAD(&gc->w_list); |
608 | INIT_LIST_HEAD(&gc->r_list); | 641 | INIT_LIST_HEAD(&gc->r_list); |
@@ -625,24 +658,24 @@ void pblk_gc_exit(struct pblk *pblk) | |||
625 | { | 658 | { |
626 | struct pblk_gc *gc = &pblk->gc; | 659 | struct pblk_gc *gc = &pblk->gc; |
627 | 660 | ||
628 | flush_workqueue(gc->gc_reader_wq); | 661 | gc->gc_enabled = 0; |
629 | flush_workqueue(gc->gc_line_reader_wq); | 662 | del_timer_sync(&gc->gc_timer); |
630 | |||
631 | del_timer(&gc->gc_timer); | ||
632 | pblk_gc_stop(pblk, 1); | 663 | pblk_gc_stop(pblk, 1); |
633 | 664 | ||
634 | if (gc->gc_ts) | 665 | if (gc->gc_ts) |
635 | kthread_stop(gc->gc_ts); | 666 | kthread_stop(gc->gc_ts); |
636 | 667 | ||
668 | if (gc->gc_reader_ts) | ||
669 | kthread_stop(gc->gc_reader_ts); | ||
670 | |||
671 | flush_workqueue(gc->gc_reader_wq); | ||
637 | if (gc->gc_reader_wq) | 672 | if (gc->gc_reader_wq) |
638 | destroy_workqueue(gc->gc_reader_wq); | 673 | destroy_workqueue(gc->gc_reader_wq); |
639 | 674 | ||
675 | flush_workqueue(gc->gc_line_reader_wq); | ||
640 | if (gc->gc_line_reader_wq) | 676 | if (gc->gc_line_reader_wq) |
641 | destroy_workqueue(gc->gc_line_reader_wq); | 677 | destroy_workqueue(gc->gc_line_reader_wq); |
642 | 678 | ||
643 | if (gc->gc_writer_ts) | 679 | if (gc->gc_writer_ts) |
644 | kthread_stop(gc->gc_writer_ts); | 680 | kthread_stop(gc->gc_writer_ts); |
645 | |||
646 | if (gc->gc_reader_ts) | ||
647 | kthread_stop(gc->gc_reader_ts); | ||
648 | } | 681 | } |
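[Editor's note] The reader-side changes above throttle GC with gc_sem, now sized to PBLK_GC_RQ_QD and taken via down_timeout() in a retry loop, so the slower GC write path can apply back pressure without wedging the reader. Stripped of the pblk specifics, the idea reduces to a counting semaphore used as a bounded in-flight budget; the demo_* names and the queue-depth value below are illustrative assumptions only.

#include <linux/semaphore.h>
#include <linux/jiffies.h>
#include <linux/sched.h>

#define DEMO_GC_RQ_QD   128     /* assumed depth, mirroring PBLK_GC_RQ_QD's role */

static struct semaphore demo_gc_sem;

static void demo_gc_init(void)
{
        /* allow at most DEMO_GC_RQ_QD requests in flight at once */
        sema_init(&demo_gc_sem, DEMO_GC_RQ_QD);
}

static void demo_gc_acquire_slot(void)
{
        /* producer: keep retrying with a timeout instead of blocking forever */
        while (down_timeout(&demo_gc_sem, msecs_to_jiffies(30000)))
                io_schedule();
}

static void demo_gc_release_slot(void)
{
        /* consumer: free a slot once the request has been handed off */
        up(&demo_gc_sem);
}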
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 1b0f61233c21..f62112ba5482 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -20,8 +20,8 @@ | |||
20 | 20 | ||
21 | #include "pblk.h" | 21 | #include "pblk.h" |
22 | 22 | ||
23 | static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, | 23 | static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, |
24 | *pblk_w_rq_cache, *pblk_line_meta_cache; | 24 | *pblk_w_rq_cache; |
25 | static DECLARE_RWSEM(pblk_lock); | 25 | static DECLARE_RWSEM(pblk_lock); |
26 | struct bio_set *pblk_bio_set; | 26 | struct bio_set *pblk_bio_set; |
27 | 27 | ||
@@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, | |||
46 | * user I/Os. Unless stalled, the rate limiter leaves at least 256KB | 46 | * user I/Os. Unless stalled, the rate limiter leaves at least 256KB |
47 | * available for user I/O. | 47 | * available for user I/O. |
48 | */ | 48 | */ |
49 | if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) | 49 | if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) |
50 | blk_queue_split(q, &bio); | 50 | blk_queue_split(q, &bio); |
51 | 51 | ||
52 | return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); | 52 | return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); |
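The write path now splits a bio only when it is strictly larger than the maximum the rate limiter can ever grant (pblk_rl_max_io()), rather than comparing against the current rate. A loose userspace sketch of that split rule; the 64-sector limit is an assumption, and the real splitting is done by blk_queue_split() against the queue limits:

#include <stdio.h>

#define MAX_IO_SECS 64                  /* assumed pblk_rl_max_io() result */

static void submit(unsigned int secs)
{
        while (secs > MAX_IO_SECS) {    /* strictly larger -> split */
                printf("split off %u sectors\n", MAX_IO_SECS);
                secs -= MAX_IO_SECS;
        }
        printf("submit %u sectors\n", secs);
}

int main(void)
{
        submit(150);    /* split: 64 + 64 + 22 */
        submit(64);     /* equal to the limit: no split under the new check */
        return 0;
}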
@@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) | |||
76 | return BLK_QC_T_NONE; | 76 | return BLK_QC_T_NONE; |
77 | } | 77 | } |
78 | 78 | ||
79 | static size_t pblk_trans_map_size(struct pblk *pblk) | ||
80 | { | ||
81 | int entry_size = 8; | ||
82 | |||
83 | if (pblk->ppaf_bitsize < 32) | ||
84 | entry_size = 4; | ||
85 | |||
86 | return entry_size * pblk->rl.nr_secs; | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_NVM_DEBUG | ||
90 | static u32 pblk_l2p_crc(struct pblk *pblk) | ||
91 | { | ||
92 | size_t map_size; | ||
93 | u32 crc = ~(u32)0; | ||
94 | |||
95 | map_size = pblk_trans_map_size(pblk); | ||
96 | crc = crc32_le(crc, pblk->trans_map, map_size); | ||
97 | return crc; | ||
98 | } | ||
99 | #endif | ||
100 | |||
79 | static void pblk_l2p_free(struct pblk *pblk) | 101 | static void pblk_l2p_free(struct pblk *pblk) |
80 | { | 102 | { |
81 | vfree(pblk->trans_map); | 103 | vfree(pblk->trans_map); |
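pblk_trans_map_size() centralises the 4-byte vs 8-byte L2P entry choice, and the new debug-only pblk_l2p_crc() checksums the whole table so the values printed at init and exit can be compared. A userspace sketch of both ideas, using zlib's crc32() in place of the kernel's crc32_le() and an invented geometry:

#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>                       /* link with -lz */

static size_t trans_map_size(int ppaf_bitsize, size_t nr_secs)
{
        size_t entry_size = (ppaf_bitsize < 32) ? 4 : 8;

        return entry_size * nr_secs;
}

int main(void)
{
        size_t size = trans_map_size(28, 1024);         /* invented geometry */
        unsigned char *map = calloc(1, size);
        unsigned long crc;

        if (!map)
                return 1;
        crc = crc32(crc32(0L, Z_NULL, 0), map, (unsigned int)size);
        printf("L2P map: %zu bytes, crc %lx\n", size, crc);
        free(map);
        return 0;
}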
@@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk) | |||
85 | { | 107 | { |
86 | sector_t i; | 108 | sector_t i; |
87 | struct ppa_addr ppa; | 109 | struct ppa_addr ppa; |
88 | int entry_size = 8; | 110 | size_t map_size; |
89 | 111 | ||
90 | if (pblk->ppaf_bitsize < 32) | 112 | map_size = pblk_trans_map_size(pblk); |
91 | entry_size = 4; | 113 | pblk->trans_map = vmalloc(map_size); |
92 | |||
93 | pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs); | ||
94 | if (!pblk->trans_map) | 114 | if (!pblk->trans_map) |
95 | return -ENOMEM; | 115 | return -ENOMEM; |
96 | 116 | ||
@@ -132,7 +152,6 @@ static int pblk_rwb_init(struct pblk *pblk) | |||
132 | } | 152 | } |
133 | 153 | ||
134 | /* Minimum pages needed within a lun */ | 154 | /* Minimum pages needed within a lun */ |
135 | #define PAGE_POOL_SIZE 16 | ||
136 | #define ADDR_POOL_SIZE 64 | 155 | #define ADDR_POOL_SIZE 64 |
137 | 156 | ||
138 | static int pblk_set_ppaf(struct pblk *pblk) | 157 | static int pblk_set_ppaf(struct pblk *pblk) |
@@ -182,12 +201,10 @@ static int pblk_set_ppaf(struct pblk *pblk) | |||
182 | 201 | ||
183 | static int pblk_init_global_caches(struct pblk *pblk) | 202 | static int pblk_init_global_caches(struct pblk *pblk) |
184 | { | 203 | { |
185 | char cache_name[PBLK_CACHE_NAME_LEN]; | ||
186 | |||
187 | down_write(&pblk_lock); | 204 | down_write(&pblk_lock); |
188 | pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", | 205 | pblk_ws_cache = kmem_cache_create("pblk_blk_ws", |
189 | sizeof(struct pblk_line_ws), 0, 0, NULL); | 206 | sizeof(struct pblk_line_ws), 0, 0, NULL); |
190 | if (!pblk_blk_ws_cache) { | 207 | if (!pblk_ws_cache) { |
191 | up_write(&pblk_lock); | 208 | up_write(&pblk_lock); |
192 | return -ENOMEM; | 209 | return -ENOMEM; |
193 | } | 210 | } |
@@ -195,7 +212,7 @@ static int pblk_init_global_caches(struct pblk *pblk) | |||
195 | pblk_rec_cache = kmem_cache_create("pblk_rec", | 212 | pblk_rec_cache = kmem_cache_create("pblk_rec", |
196 | sizeof(struct pblk_rec_ctx), 0, 0, NULL); | 213 | sizeof(struct pblk_rec_ctx), 0, 0, NULL); |
197 | if (!pblk_rec_cache) { | 214 | if (!pblk_rec_cache) { |
198 | kmem_cache_destroy(pblk_blk_ws_cache); | 215 | kmem_cache_destroy(pblk_ws_cache); |
199 | up_write(&pblk_lock); | 216 | up_write(&pblk_lock); |
200 | return -ENOMEM; | 217 | return -ENOMEM; |
201 | } | 218 | } |
@@ -203,7 +220,7 @@ static int pblk_init_global_caches(struct pblk *pblk) | |||
203 | pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, | 220 | pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, |
204 | 0, 0, NULL); | 221 | 0, 0, NULL); |
205 | if (!pblk_g_rq_cache) { | 222 | if (!pblk_g_rq_cache) { |
206 | kmem_cache_destroy(pblk_blk_ws_cache); | 223 | kmem_cache_destroy(pblk_ws_cache); |
207 | kmem_cache_destroy(pblk_rec_cache); | 224 | kmem_cache_destroy(pblk_rec_cache); |
208 | up_write(&pblk_lock); | 225 | up_write(&pblk_lock); |
209 | return -ENOMEM; | 226 | return -ENOMEM; |
@@ -212,30 +229,25 @@ static int pblk_init_global_caches(struct pblk *pblk) | |||
212 | pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, | 229 | pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, |
213 | 0, 0, NULL); | 230 | 0, 0, NULL); |
214 | if (!pblk_w_rq_cache) { | 231 | if (!pblk_w_rq_cache) { |
215 | kmem_cache_destroy(pblk_blk_ws_cache); | 232 | kmem_cache_destroy(pblk_ws_cache); |
216 | kmem_cache_destroy(pblk_rec_cache); | 233 | kmem_cache_destroy(pblk_rec_cache); |
217 | kmem_cache_destroy(pblk_g_rq_cache); | 234 | kmem_cache_destroy(pblk_g_rq_cache); |
218 | up_write(&pblk_lock); | 235 | up_write(&pblk_lock); |
219 | return -ENOMEM; | 236 | return -ENOMEM; |
220 | } | 237 | } |
221 | |||
222 | snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s", | ||
223 | pblk->disk->disk_name); | ||
224 | pblk_line_meta_cache = kmem_cache_create(cache_name, | ||
225 | pblk->lm.sec_bitmap_len, 0, 0, NULL); | ||
226 | if (!pblk_line_meta_cache) { | ||
227 | kmem_cache_destroy(pblk_blk_ws_cache); | ||
228 | kmem_cache_destroy(pblk_rec_cache); | ||
229 | kmem_cache_destroy(pblk_g_rq_cache); | ||
230 | kmem_cache_destroy(pblk_w_rq_cache); | ||
231 | up_write(&pblk_lock); | ||
232 | return -ENOMEM; | ||
233 | } | ||
234 | up_write(&pblk_lock); | 238 | up_write(&pblk_lock); |
235 | 239 | ||
236 | return 0; | 240 | return 0; |
237 | } | 241 | } |
238 | 242 | ||
243 | static void pblk_free_global_caches(struct pblk *pblk) | ||
244 | { | ||
245 | kmem_cache_destroy(pblk_ws_cache); | ||
246 | kmem_cache_destroy(pblk_rec_cache); | ||
247 | kmem_cache_destroy(pblk_g_rq_cache); | ||
248 | kmem_cache_destroy(pblk_w_rq_cache); | ||
249 | } | ||
250 | |||
239 | static int pblk_core_init(struct pblk *pblk) | 251 | static int pblk_core_init(struct pblk *pblk) |
240 | { | 252 | { |
241 | struct nvm_tgt_dev *dev = pblk->dev; | 253 | struct nvm_tgt_dev *dev = pblk->dev; |
@@ -247,70 +259,80 @@ static int pblk_core_init(struct pblk *pblk) | |||
247 | if (pblk_init_global_caches(pblk)) | 259 | if (pblk_init_global_caches(pblk)) |
248 | return -ENOMEM; | 260 | return -ENOMEM; |
249 | 261 | ||
250 | pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); | 262 | /* Internal bios can be at most the sectors signaled by the device. */ |
251 | if (!pblk->page_pool) | 263 | pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), |
252 | return -ENOMEM; | 264 | 0); |
265 | if (!pblk->page_bio_pool) | ||
266 | goto free_global_caches; | ||
253 | 267 | ||
254 | pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, | 268 | pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, |
255 | pblk_blk_ws_cache); | 269 | pblk_ws_cache); |
256 | if (!pblk->line_ws_pool) | 270 | if (!pblk->gen_ws_pool) |
257 | goto free_page_pool; | 271 | goto free_page_bio_pool; |
258 | 272 | ||
259 | pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); | 273 | pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); |
260 | if (!pblk->rec_pool) | 274 | if (!pblk->rec_pool) |
261 | goto free_blk_ws_pool; | 275 | goto free_gen_ws_pool; |
262 | 276 | ||
263 | pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, | 277 | pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, |
264 | pblk_g_rq_cache); | 278 | pblk_g_rq_cache); |
265 | if (!pblk->g_rq_pool) | 279 | if (!pblk->r_rq_pool) |
266 | goto free_rec_pool; | 280 | goto free_rec_pool; |
267 | 281 | ||
268 | pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, | 282 | pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, |
283 | pblk_g_rq_cache); | ||
284 | if (!pblk->e_rq_pool) | ||
285 | goto free_r_rq_pool; | ||
286 | |||
287 | pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, | ||
269 | pblk_w_rq_cache); | 288 | pblk_w_rq_cache); |
270 | if (!pblk->w_rq_pool) | 289 | if (!pblk->w_rq_pool) |
271 | goto free_g_rq_pool; | 290 | goto free_e_rq_pool; |
272 | |||
273 | pblk->line_meta_pool = | ||
274 | mempool_create_slab_pool(PBLK_META_POOL_SIZE, | ||
275 | pblk_line_meta_cache); | ||
276 | if (!pblk->line_meta_pool) | ||
277 | goto free_w_rq_pool; | ||
278 | 291 | ||
279 | pblk->close_wq = alloc_workqueue("pblk-close-wq", | 292 | pblk->close_wq = alloc_workqueue("pblk-close-wq", |
280 | WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); | 293 | WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); |
281 | if (!pblk->close_wq) | 294 | if (!pblk->close_wq) |
282 | goto free_line_meta_pool; | 295 | goto free_w_rq_pool; |
283 | 296 | ||
284 | pblk->bb_wq = alloc_workqueue("pblk-bb-wq", | 297 | pblk->bb_wq = alloc_workqueue("pblk-bb-wq", |
285 | WQ_MEM_RECLAIM | WQ_UNBOUND, 0); | 298 | WQ_MEM_RECLAIM | WQ_UNBOUND, 0); |
286 | if (!pblk->bb_wq) | 299 | if (!pblk->bb_wq) |
287 | goto free_close_wq; | 300 | goto free_close_wq; |
288 | 301 | ||
289 | if (pblk_set_ppaf(pblk)) | 302 | pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", |
303 | WQ_MEM_RECLAIM | WQ_UNBOUND, 0); | ||
304 | if (!pblk->r_end_wq) | ||
290 | goto free_bb_wq; | 305 | goto free_bb_wq; |
291 | 306 | ||
307 | if (pblk_set_ppaf(pblk)) | ||
308 | goto free_r_end_wq; | ||
309 | |||
292 | if (pblk_rwb_init(pblk)) | 310 | if (pblk_rwb_init(pblk)) |
293 | goto free_bb_wq; | 311 | goto free_r_end_wq; |
294 | 312 | ||
295 | INIT_LIST_HEAD(&pblk->compl_list); | 313 | INIT_LIST_HEAD(&pblk->compl_list); |
296 | return 0; | 314 | return 0; |
297 | 315 | ||
316 | free_r_end_wq: | ||
317 | destroy_workqueue(pblk->r_end_wq); | ||
298 | free_bb_wq: | 318 | free_bb_wq: |
299 | destroy_workqueue(pblk->bb_wq); | 319 | destroy_workqueue(pblk->bb_wq); |
300 | free_close_wq: | 320 | free_close_wq: |
301 | destroy_workqueue(pblk->close_wq); | 321 | destroy_workqueue(pblk->close_wq); |
302 | free_line_meta_pool: | ||
303 | mempool_destroy(pblk->line_meta_pool); | ||
304 | free_w_rq_pool: | 322 | free_w_rq_pool: |
305 | mempool_destroy(pblk->w_rq_pool); | 323 | mempool_destroy(pblk->w_rq_pool); |
306 | free_g_rq_pool: | 324 | free_e_rq_pool: |
307 | mempool_destroy(pblk->g_rq_pool); | 325 | mempool_destroy(pblk->e_rq_pool); |
326 | free_r_rq_pool: | ||
327 | mempool_destroy(pblk->r_rq_pool); | ||
308 | free_rec_pool: | 328 | free_rec_pool: |
309 | mempool_destroy(pblk->rec_pool); | 329 | mempool_destroy(pblk->rec_pool); |
310 | free_blk_ws_pool: | 330 | free_gen_ws_pool: |
311 | mempool_destroy(pblk->line_ws_pool); | 331 | mempool_destroy(pblk->gen_ws_pool); |
312 | free_page_pool: | 332 | free_page_bio_pool: |
313 | mempool_destroy(pblk->page_pool); | 333 | mempool_destroy(pblk->page_bio_pool); |
334 | free_global_caches: | ||
335 | pblk_free_global_caches(pblk); | ||
314 | return -ENOMEM; | 336 | return -ENOMEM; |
315 | } | 337 | } |
316 | 338 | ||
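pblk_core_init() keeps one error label per resource so a failure unwinds exactly what was allocated, now including the global caches via pblk_free_global_caches(). The same goto-unwind pattern in a stripped-down, standalone form (structure and pool names are only loosely modelled on the driver):

#include <stdlib.h>

struct core {
        void *page_bio_pool;
        void *gen_ws_pool;
        void *rec_pool;
};

static int core_init(struct core *c)
{
        c->page_bio_pool = malloc(64);
        if (!c->page_bio_pool)
                goto fail;
        c->gen_ws_pool = malloc(64);
        if (!c->gen_ws_pool)
                goto free_page_bio_pool;
        c->rec_pool = malloc(64);
        if (!c->rec_pool)
                goto free_gen_ws_pool;
        return 0;

free_gen_ws_pool:
        free(c->gen_ws_pool);
free_page_bio_pool:
        free(c->page_bio_pool);
fail:
        return -1;
}

int main(void)
{
        struct core c;

        if (core_init(&c))
                return 1;
        free(c.rec_pool);
        free(c.gen_ws_pool);
        free(c.page_bio_pool);
        return 0;
}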
@@ -319,21 +341,20 @@ static void pblk_core_free(struct pblk *pblk) | |||
319 | if (pblk->close_wq) | 341 | if (pblk->close_wq) |
320 | destroy_workqueue(pblk->close_wq); | 342 | destroy_workqueue(pblk->close_wq); |
321 | 343 | ||
344 | if (pblk->r_end_wq) | ||
345 | destroy_workqueue(pblk->r_end_wq); | ||
346 | |||
322 | if (pblk->bb_wq) | 347 | if (pblk->bb_wq) |
323 | destroy_workqueue(pblk->bb_wq); | 348 | destroy_workqueue(pblk->bb_wq); |
324 | 349 | ||
325 | mempool_destroy(pblk->page_pool); | 350 | mempool_destroy(pblk->page_bio_pool); |
326 | mempool_destroy(pblk->line_ws_pool); | 351 | mempool_destroy(pblk->gen_ws_pool); |
327 | mempool_destroy(pblk->rec_pool); | 352 | mempool_destroy(pblk->rec_pool); |
328 | mempool_destroy(pblk->g_rq_pool); | 353 | mempool_destroy(pblk->r_rq_pool); |
354 | mempool_destroy(pblk->e_rq_pool); | ||
329 | mempool_destroy(pblk->w_rq_pool); | 355 | mempool_destroy(pblk->w_rq_pool); |
330 | mempool_destroy(pblk->line_meta_pool); | ||
331 | 356 | ||
332 | kmem_cache_destroy(pblk_blk_ws_cache); | 357 | pblk_free_global_caches(pblk); |
333 | kmem_cache_destroy(pblk_rec_cache); | ||
334 | kmem_cache_destroy(pblk_g_rq_cache); | ||
335 | kmem_cache_destroy(pblk_w_rq_cache); | ||
336 | kmem_cache_destroy(pblk_line_meta_cache); | ||
337 | } | 358 | } |
338 | 359 | ||
339 | static void pblk_luns_free(struct pblk *pblk) | 360 | static void pblk_luns_free(struct pblk *pblk) |
@@ -372,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk) | |||
372 | kfree(l_mg->bb_aux); | 393 | kfree(l_mg->bb_aux); |
373 | kfree(l_mg->vsc_list); | 394 | kfree(l_mg->vsc_list); |
374 | 395 | ||
375 | spin_lock(&l_mg->free_lock); | ||
376 | for (i = 0; i < PBLK_DATA_LINES; i++) { | 396 | for (i = 0; i < PBLK_DATA_LINES; i++) { |
377 | kfree(l_mg->sline_meta[i]); | 397 | kfree(l_mg->sline_meta[i]); |
378 | pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); | 398 | pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); |
379 | kfree(l_mg->eline_meta[i]); | 399 | kfree(l_mg->eline_meta[i]); |
380 | } | 400 | } |
381 | spin_unlock(&l_mg->free_lock); | ||
382 | 401 | ||
383 | kfree(pblk->lines); | 402 | kfree(pblk->lines); |
384 | } | 403 | } |
@@ -507,6 +526,13 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) | |||
507 | } | 526 | } |
508 | } | 527 | } |
509 | 528 | ||
529 | #ifdef CONFIG_NVM_DEBUG | ||
530 | pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); | ||
531 | #endif | ||
532 | |||
533 | /* Free full lines directly as GC has not been started yet */ | ||
534 | pblk_gc_free_full_lines(pblk); | ||
535 | |||
510 | if (!line) { | 536 | if (!line) { |
511 | /* Configure next line for user data */ | 537 | /* Configure next line for user data */ |
512 | line = pblk_line_get_first_data(pblk); | 538 | line = pblk_line_get_first_data(pblk); |
@@ -630,7 +656,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk) | |||
630 | 656 | ||
631 | fail_free_emeta: | 657 | fail_free_emeta: |
632 | while (--i >= 0) { | 658 | while (--i >= 0) { |
633 | vfree(l_mg->eline_meta[i]->buf); | 659 | if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) |
660 | vfree(l_mg->eline_meta[i]->buf); | ||
661 | else | ||
662 | kfree(l_mg->eline_meta[i]->buf); | ||
634 | kfree(l_mg->eline_meta[i]); | 663 | kfree(l_mg->eline_meta[i]); |
635 | } | 664 | } |
636 | 665 | ||
@@ -681,8 +710,8 @@ static int pblk_lines_init(struct pblk *pblk) | |||
681 | lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); | 710 | lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); |
682 | lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); | 711 | lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); |
683 | lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); | 712 | lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); |
684 | lm->high_thrs = lm->sec_per_line / 2; | 713 | lm->mid_thrs = lm->sec_per_line / 2; |
685 | lm->mid_thrs = lm->sec_per_line / 4; | 714 | lm->high_thrs = lm->sec_per_line / 4; |
686 | lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; | 715 | lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; |
687 | 716 | ||
688 | /* Calculate necessary pages for smeta. See comment over struct | 717 | /* Calculate necessary pages for smeta. See comment over struct |
@@ -713,9 +742,13 @@ add_emeta_page: | |||
713 | goto add_emeta_page; | 742 | goto add_emeta_page; |
714 | } | 743 | } |
715 | 744 | ||
716 | lm->emeta_bb = geo->nr_luns - i; | 745 | lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; |
717 | lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], | 746 | |
718 | geo->sec_per_blk); | 747 | lm->min_blk_line = 1; |
748 | if (geo->nr_luns > 1) | ||
749 | lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + | ||
750 | lm->emeta_sec[0], geo->sec_per_blk); | ||
751 | |||
719 | if (lm->min_blk_line > lm->blk_per_line) { | 752 | if (lm->min_blk_line > lm->blk_per_line) { |
720 | pr_err("pblk: config. not supported. Min. LUN in line:%d\n", | 753 | pr_err("pblk: config. not supported. Min. LUN in line:%d\n", |
721 | lm->blk_per_line); | 754 | lm->blk_per_line); |
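The metadata sizing now clamps emeta_bb at zero and only adds the smeta/emeta rounding term to min_blk_line when there is more than one LUN. A worked example with invented numbers; the old formula would have produced emeta_bb = -3 here:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        int nr_luns = 1, i = 4;         /* i = LUNs touched while sizing emeta */
        int smeta_sec = 8, emeta_sec0 = 64, sec_per_blk = 256;
        int emeta_bb, min_blk_line;

        emeta_bb = nr_luns > i ? nr_luns - i : 0;       /* 0, not -3 */

        min_blk_line = 1;
        if (nr_luns > 1)
                min_blk_line += DIV_ROUND_UP(smeta_sec + emeta_sec0,
                                             sec_per_blk);

        printf("emeta_bb=%d min_blk_line=%d\n", emeta_bb, min_blk_line);
        return 0;
}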
@@ -890,6 +923,11 @@ static void pblk_exit(void *private) | |||
890 | down_write(&pblk_lock); | 923 | down_write(&pblk_lock); |
891 | pblk_gc_exit(pblk); | 924 | pblk_gc_exit(pblk); |
892 | pblk_tear_down(pblk); | 925 | pblk_tear_down(pblk); |
926 | |||
927 | #ifdef CONFIG_NVM_DEBUG | ||
928 | pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); | ||
929 | #endif | ||
930 | |||
893 | pblk_free(pblk); | 931 | pblk_free(pblk); |
894 | up_write(&pblk_lock); | 932 | up_write(&pblk_lock); |
895 | } | 933 | } |
@@ -911,7 +949,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
911 | int ret; | 949 | int ret; |
912 | 950 | ||
913 | if (dev->identity.dom & NVM_RSP_L2P) { | 951 | if (dev->identity.dom & NVM_RSP_L2P) { |
914 | pr_err("pblk: device-side L2P table not supported. (%x)\n", | 952 | pr_err("pblk: host-side L2P table not supported. (%x)\n", |
915 | dev->identity.dom); | 953 | dev->identity.dom); |
916 | return ERR_PTR(-EINVAL); | 954 | return ERR_PTR(-EINVAL); |
917 | } | 955 | } |
@@ -923,6 +961,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
923 | pblk->dev = dev; | 961 | pblk->dev = dev; |
924 | pblk->disk = tdisk; | 962 | pblk->disk = tdisk; |
925 | pblk->state = PBLK_STATE_RUNNING; | 963 | pblk->state = PBLK_STATE_RUNNING; |
964 | pblk->gc.gc_enabled = 0; | ||
926 | 965 | ||
927 | spin_lock_init(&pblk->trans_lock); | 966 | spin_lock_init(&pblk->trans_lock); |
928 | spin_lock_init(&pblk->lock); | 967 | spin_lock_init(&pblk->lock); |
@@ -944,6 +983,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
944 | atomic_long_set(&pblk->recov_writes, 0); | 983 | atomic_long_set(&pblk->recov_writes, 0); |
945 | atomic_long_set(&pblk->recov_writes, 0); | 984 | atomic_long_set(&pblk->recov_writes, 0); |
946 | atomic_long_set(&pblk->recov_gc_writes, 0); | 985 | atomic_long_set(&pblk->recov_gc_writes, 0); |
986 | atomic_long_set(&pblk->recov_gc_reads, 0); | ||
947 | #endif | 987 | #endif |
948 | 988 | ||
949 | atomic_long_set(&pblk->read_failed, 0); | 989 | atomic_long_set(&pblk->read_failed, 0); |
@@ -1012,6 +1052,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
1012 | pblk->rwb.nr_entries); | 1052 | pblk->rwb.nr_entries); |
1013 | 1053 | ||
1014 | wake_up_process(pblk->writer_ts); | 1054 | wake_up_process(pblk->writer_ts); |
1055 | |||
1056 | /* Check if we need to start GC */ | ||
1057 | pblk_gc_should_kick(pblk); | ||
1058 | |||
1015 | return pblk; | 1059 | return pblk; |
1016 | 1060 | ||
1017 | fail_stop_writer: | 1061 | fail_stop_writer: |
@@ -1044,6 +1088,7 @@ static struct nvm_tgt_type tt_pblk = { | |||
1044 | 1088 | ||
1045 | .sysfs_init = pblk_sysfs_init, | 1089 | .sysfs_init = pblk_sysfs_init, |
1046 | .sysfs_exit = pblk_sysfs_exit, | 1090 | .sysfs_exit = pblk_sysfs_exit, |
1091 | .owner = THIS_MODULE, | ||
1047 | }; | 1092 | }; |
1048 | 1093 | ||
1049 | static int __init pblk_module_init(void) | 1094 | static int __init pblk_module_init(void) |
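Setting .owner = THIS_MODULE lets the LightNVM core pin the pblk module while a target instance exists. The underlying idea is plain reference counting, sketched here in userspace (this is not the kernel's try_module_get()/module_put()):

#include <stdbool.h>
#include <stdio.h>

static int module_refs;

static void module_get(void) { module_refs++; }
static void module_put(void) { module_refs--; }
static bool can_unload(void) { return module_refs == 0; }

int main(void)
{
        module_get();                                   /* pblk target created */
        printf("unload allowed: %d\n", can_unload());   /* 0 */
        module_put();                                   /* target removed      */
        printf("unload allowed: %d\n", can_unload());   /* 1 */
        return 0;
}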
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index fddb924f6dde..6f3ecde2140f 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -25,16 +25,28 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, | |||
25 | unsigned int valid_secs) | 25 | unsigned int valid_secs) |
26 | { | 26 | { |
27 | struct pblk_line *line = pblk_line_get_data(pblk); | 27 | struct pblk_line *line = pblk_line_get_data(pblk); |
28 | struct pblk_emeta *emeta = line->emeta; | 28 | struct pblk_emeta *emeta; |
29 | struct pblk_w_ctx *w_ctx; | 29 | struct pblk_w_ctx *w_ctx; |
30 | __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); | 30 | __le64 *lba_list; |
31 | u64 paddr; | 31 | u64 paddr; |
32 | int nr_secs = pblk->min_write_pgs; | 32 | int nr_secs = pblk->min_write_pgs; |
33 | int i; | 33 | int i; |
34 | 34 | ||
35 | if (pblk_line_is_full(line)) { | ||
36 | struct pblk_line *prev_line = line; | ||
37 | |||
38 | line = pblk_line_replace_data(pblk); | ||
39 | pblk_line_close_meta(pblk, prev_line); | ||
40 | } | ||
41 | |||
42 | emeta = line->emeta; | ||
43 | lba_list = emeta_to_lbas(pblk, emeta->buf); | ||
44 | |||
35 | paddr = pblk_alloc_page(pblk, line, nr_secs); | 45 | paddr = pblk_alloc_page(pblk, line, nr_secs); |
36 | 46 | ||
37 | for (i = 0; i < nr_secs; i++, paddr++) { | 47 | for (i = 0; i < nr_secs; i++, paddr++) { |
48 | __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); | ||
49 | |||
38 | /* ppa to be sent to the device */ | 50 | /* ppa to be sent to the device */ |
39 | ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); | 51 | ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); |
40 | 52 | ||
@@ -51,22 +63,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, | |||
51 | w_ctx->ppa = ppa_list[i]; | 63 | w_ctx->ppa = ppa_list[i]; |
52 | meta_list[i].lba = cpu_to_le64(w_ctx->lba); | 64 | meta_list[i].lba = cpu_to_le64(w_ctx->lba); |
53 | lba_list[paddr] = cpu_to_le64(w_ctx->lba); | 65 | lba_list[paddr] = cpu_to_le64(w_ctx->lba); |
54 | line->nr_valid_lbas++; | 66 | if (lba_list[paddr] != addr_empty) |
67 | line->nr_valid_lbas++; | ||
55 | } else { | 68 | } else { |
56 | __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); | ||
57 | |||
58 | lba_list[paddr] = meta_list[i].lba = addr_empty; | 69 | lba_list[paddr] = meta_list[i].lba = addr_empty; |
59 | __pblk_map_invalidate(pblk, line, paddr); | 70 | __pblk_map_invalidate(pblk, line, paddr); |
60 | } | 71 | } |
61 | } | 72 | } |
62 | 73 | ||
63 | if (pblk_line_is_full(line)) { | ||
64 | struct pblk_line *prev_line = line; | ||
65 | |||
66 | pblk_line_replace_data(pblk); | ||
67 | pblk_line_close_meta(pblk, prev_line); | ||
68 | } | ||
69 | |||
70 | pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); | 74 | pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); |
71 | } | 75 | } |
72 | 76 | ||
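pblk_map_page_data() now replaces a full line before allocating pages, so the emeta LBA list being written always belongs to the line the sectors are mapped into, and only non-empty LBAs bump nr_valid_lbas. A toy illustration of why the check has to come first (all names hypothetical):

#include <stdio.h>

#define LINE_SECS 4

struct line { int id; int used; };

static struct line *replace_if_full(struct line *cur, struct line *next)
{
        return (cur->used == LINE_SECS) ? next : cur;
}

int main(void)
{
        struct line a = { .id = 0, .used = LINE_SECS }; /* already full */
        struct line b = { .id = 1, .used = 0 };
        struct line *line = replace_if_full(&a, &b);

        /* checking after the allocation instead would have written this
         * sector's metadata into full line 0 */
        printf("mapping sector %d of line %d\n", line->used++, line->id);
        return 0;
}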
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 9bc32578a766..b8f78e401482 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) | |||
201 | return subm; | 201 | return subm; |
202 | } | 202 | } |
203 | 203 | ||
204 | static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, | 204 | static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) |
205 | unsigned int to_update) | ||
206 | { | 205 | { |
207 | struct pblk *pblk = container_of(rb, struct pblk, rwb); | 206 | struct pblk *pblk = container_of(rb, struct pblk, rwb); |
208 | struct pblk_line *line; | 207 | struct pblk_line *line; |
@@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, | |||
213 | int flags; | 212 | int flags; |
214 | 213 | ||
215 | for (i = 0; i < to_update; i++) { | 214 | for (i = 0; i < to_update; i++) { |
216 | entry = &rb->entries[*l2p_upd]; | 215 | entry = &rb->entries[rb->l2p_update]; |
217 | w_ctx = &entry->w_ctx; | 216 | w_ctx = &entry->w_ctx; |
218 | 217 | ||
219 | flags = READ_ONCE(entry->w_ctx.flags); | 218 | flags = READ_ONCE(entry->w_ctx.flags); |
@@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, | |||
230 | line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; | 229 | line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; |
231 | kref_put(&line->ref, pblk_line_put); | 230 | kref_put(&line->ref, pblk_line_put); |
232 | clean_wctx(w_ctx); | 231 | clean_wctx(w_ctx); |
233 | *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); | 232 | rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); |
234 | } | 233 | } |
235 | 234 | ||
236 | pblk_rl_out(&pblk->rl, user_io, gc_io); | 235 | pblk_rl_out(&pblk->rl, user_io, gc_io); |
@@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, | |||
258 | 257 | ||
259 | count = nr_entries - space; | 258 | count = nr_entries - space; |
260 | /* l2p_update used exclusively under rb->w_lock */ | 259 | /* l2p_update used exclusively under rb->w_lock */ |
261 | ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); | 260 | ret = __pblk_rb_update_l2p(rb, count); |
262 | 261 | ||
263 | out: | 262 | out: |
264 | return ret; | 263 | return ret; |
@@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb) | |||
280 | sync = smp_load_acquire(&rb->sync); | 279 | sync = smp_load_acquire(&rb->sync); |
281 | 280 | ||
282 | to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); | 281 | to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); |
283 | __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); | 282 | __pblk_rb_update_l2p(rb, to_update); |
284 | 283 | ||
285 | spin_unlock(&rb->w_lock); | 284 | spin_unlock(&rb->w_lock); |
286 | } | 285 | } |
@@ -325,8 +324,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, | |||
325 | } | 324 | } |
326 | 325 | ||
327 | void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, | 326 | void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, |
328 | struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, | 327 | struct pblk_w_ctx w_ctx, struct pblk_line *line, |
329 | unsigned int ring_pos) | 328 | u64 paddr, unsigned int ring_pos) |
330 | { | 329 | { |
331 | struct pblk *pblk = container_of(rb, struct pblk, rwb); | 330 | struct pblk *pblk = container_of(rb, struct pblk, rwb); |
332 | struct pblk_rb_entry *entry; | 331 | struct pblk_rb_entry *entry; |
@@ -341,7 +340,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, | |||
341 | 340 | ||
342 | __pblk_rb_write_entry(rb, data, w_ctx, entry); | 341 | __pblk_rb_write_entry(rb, data, w_ctx, entry); |
343 | 342 | ||
344 | if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) | 343 | if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) |
345 | entry->w_ctx.lba = ADDR_EMPTY; | 344 | entry->w_ctx.lba = ADDR_EMPTY; |
346 | 345 | ||
347 | flags = w_ctx.flags | PBLK_WRITTEN_DATA; | 346 | flags = w_ctx.flags | PBLK_WRITTEN_DATA; |
@@ -355,7 +354,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, | |||
355 | { | 354 | { |
356 | struct pblk_rb_entry *entry; | 355 | struct pblk_rb_entry *entry; |
357 | unsigned int subm, sync_point; | 356 | unsigned int subm, sync_point; |
358 | int flags; | ||
359 | 357 | ||
360 | subm = READ_ONCE(rb->subm); | 358 | subm = READ_ONCE(rb->subm); |
361 | 359 | ||
@@ -369,12 +367,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, | |||
369 | sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); | 367 | sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); |
370 | entry = &rb->entries[sync_point]; | 368 | entry = &rb->entries[sync_point]; |
371 | 369 | ||
372 | flags = READ_ONCE(entry->w_ctx.flags); | ||
373 | flags |= PBLK_FLUSH_ENTRY; | ||
374 | |||
375 | /* Release flags on context. Protect from writes */ | ||
376 | smp_store_release(&entry->w_ctx.flags, flags); | ||
377 | |||
378 | /* Protect syncs */ | 370 | /* Protect syncs */ |
379 | smp_store_release(&rb->sync_point, sync_point); | 371 | smp_store_release(&rb->sync_point, sync_point); |
380 | 372 | ||
@@ -454,6 +446,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, | |||
454 | 446 | ||
455 | /* Protect from read count */ | 447 | /* Protect from read count */ |
456 | smp_store_release(&rb->mem, mem); | 448 | smp_store_release(&rb->mem, mem); |
449 | |||
457 | return 1; | 450 | return 1; |
458 | } | 451 | } |
459 | 452 | ||
@@ -558,12 +551,13 @@ out: | |||
558 | * persist data on the write buffer to the media. | 551 | * persist data on the write buffer to the media. |
559 | */ | 552 | */ |
560 | unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, | 553 | unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, |
561 | struct bio *bio, unsigned int pos, | 554 | unsigned int pos, unsigned int nr_entries, |
562 | unsigned int nr_entries, unsigned int count) | 555 | unsigned int count) |
563 | { | 556 | { |
564 | struct pblk *pblk = container_of(rb, struct pblk, rwb); | 557 | struct pblk *pblk = container_of(rb, struct pblk, rwb); |
565 | struct request_queue *q = pblk->dev->q; | 558 | struct request_queue *q = pblk->dev->q; |
566 | struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); | 559 | struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); |
560 | struct bio *bio = rqd->bio; | ||
567 | struct pblk_rb_entry *entry; | 561 | struct pblk_rb_entry *entry; |
568 | struct page *page; | 562 | struct page *page; |
569 | unsigned int pad = 0, to_read = nr_entries; | 563 | unsigned int pad = 0, to_read = nr_entries; |
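__pblk_rb_update_l2p() now advances rb->l2p_update itself instead of taking a pointer to it; the ring index wraps with the usual power-of-two mask. For example:

#include <stdio.h>

int main(void)
{
        unsigned int nr_entries = 8;    /* ring size, must be a power of two */
        unsigned int l2p_update = 6;
        int i;

        for (i = 0; i < 4; i++) {
                printf("update L2P for entry %u\n", l2p_update);
                l2p_update = (l2p_update + 1) & (nr_entries - 1);
        }
        return 0;                       /* visits 6, 7, 0, 1 */
}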
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d682e89e6493..ca79d8fb3e60 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,21 +39,15 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, | |||
39 | } | 39 | } |
40 | 40 | ||
41 | static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, | 41 | static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, |
42 | unsigned long *read_bitmap) | 42 | sector_t blba, unsigned long *read_bitmap) |
43 | { | 43 | { |
44 | struct pblk_sec_meta *meta_list = rqd->meta_list; | ||
44 | struct bio *bio = rqd->bio; | 45 | struct bio *bio = rqd->bio; |
45 | struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; | 46 | struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; |
46 | sector_t blba = pblk_get_lba(bio); | ||
47 | int nr_secs = rqd->nr_ppas; | 47 | int nr_secs = rqd->nr_ppas; |
48 | bool advanced_bio = false; | 48 | bool advanced_bio = false; |
49 | int i, j = 0; | 49 | int i, j = 0; |
50 | 50 | ||
51 | /* logic error: lba out-of-bounds. Ignore read request */ | ||
52 | if (blba + nr_secs >= pblk->rl.nr_secs) { | ||
53 | WARN(1, "pblk: read lbas out of bounds\n"); | ||
54 | return; | ||
55 | } | ||
56 | |||
57 | pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); | 51 | pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); |
58 | 52 | ||
59 | for (i = 0; i < nr_secs; i++) { | 53 | for (i = 0; i < nr_secs; i++) { |
@@ -63,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
63 | retry: | 57 | retry: |
64 | if (pblk_ppa_empty(p)) { | 58 | if (pblk_ppa_empty(p)) { |
65 | WARN_ON(test_and_set_bit(i, read_bitmap)); | 59 | WARN_ON(test_and_set_bit(i, read_bitmap)); |
60 | meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); | ||
66 | 61 | ||
67 | if (unlikely(!advanced_bio)) { | 62 | if (unlikely(!advanced_bio)) { |
68 | bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); | 63 | bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); |
@@ -82,6 +77,7 @@ retry: | |||
82 | goto retry; | 77 | goto retry; |
83 | } | 78 | } |
84 | WARN_ON(test_and_set_bit(i, read_bitmap)); | 79 | WARN_ON(test_and_set_bit(i, read_bitmap)); |
80 | meta_list[i].lba = cpu_to_le64(lba); | ||
85 | advanced_bio = true; | 81 | advanced_bio = true; |
86 | #ifdef CONFIG_NVM_DEBUG | 82 | #ifdef CONFIG_NVM_DEBUG |
87 | atomic_long_inc(&pblk->cache_reads); | 83 | atomic_long_inc(&pblk->cache_reads); |
@@ -117,10 +113,51 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) | |||
117 | return NVM_IO_OK; | 113 | return NVM_IO_OK; |
118 | } | 114 | } |
119 | 115 | ||
120 | static void pblk_end_io_read(struct nvm_rq *rqd) | 116 | static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, |
117 | sector_t blba) | ||
118 | { | ||
119 | struct pblk_sec_meta *meta_list = rqd->meta_list; | ||
120 | int nr_lbas = rqd->nr_ppas; | ||
121 | int i; | ||
122 | |||
123 | for (i = 0; i < nr_lbas; i++) { | ||
124 | u64 lba = le64_to_cpu(meta_list[i].lba); | ||
125 | |||
126 | if (lba == ADDR_EMPTY) | ||
127 | continue; | ||
128 | |||
129 | WARN(lba != blba + i, "pblk: corrupted read LBA\n"); | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) | ||
134 | { | ||
135 | struct ppa_addr *ppa_list; | ||
136 | int i; | ||
137 | |||
138 | ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; | ||
139 | |||
140 | for (i = 0; i < rqd->nr_ppas; i++) { | ||
141 | struct ppa_addr ppa = ppa_list[i]; | ||
142 | struct pblk_line *line; | ||
143 | |||
144 | line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; | ||
145 | kref_put(&line->ref, pblk_line_put_wq); | ||
146 | } | ||
147 | } | ||
148 | |||
149 | static void pblk_end_user_read(struct bio *bio) | ||
150 | { | ||
151 | #ifdef CONFIG_NVM_DEBUG | ||
152 | WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); | ||
153 | #endif | ||
154 | bio_endio(bio); | ||
155 | bio_put(bio); | ||
156 | } | ||
157 | |||
158 | static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, | ||
159 | bool put_line) | ||
121 | { | 160 | { |
122 | struct pblk *pblk = rqd->private; | ||
123 | struct nvm_tgt_dev *dev = pblk->dev; | ||
124 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); | 161 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); |
125 | struct bio *bio = rqd->bio; | 162 | struct bio *bio = rqd->bio; |
126 | 163 | ||
@@ -131,47 +168,51 @@ static void pblk_end_io_read(struct nvm_rq *rqd) | |||
131 | WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); | 168 | WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); |
132 | #endif | 169 | #endif |
133 | 170 | ||
134 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | 171 | pblk_read_check(pblk, rqd, r_ctx->lba); |
135 | 172 | ||
136 | bio_put(bio); | 173 | bio_put(bio); |
137 | if (r_ctx->private) { | 174 | if (r_ctx->private) |
138 | struct bio *orig_bio = r_ctx->private; | 175 | pblk_end_user_read((struct bio *)r_ctx->private); |
139 | 176 | ||
140 | #ifdef CONFIG_NVM_DEBUG | 177 | if (put_line) |
141 | WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); | 178 | pblk_read_put_rqd_kref(pblk, rqd); |
142 | #endif | ||
143 | bio_endio(orig_bio); | ||
144 | bio_put(orig_bio); | ||
145 | } | ||
146 | 179 | ||
147 | #ifdef CONFIG_NVM_DEBUG | 180 | #ifdef CONFIG_NVM_DEBUG |
148 | atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); | 181 | atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); |
149 | atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); | 182 | atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); |
150 | #endif | 183 | #endif |
151 | 184 | ||
152 | pblk_free_rqd(pblk, rqd, READ); | 185 | pblk_free_rqd(pblk, rqd, PBLK_READ); |
153 | atomic_dec(&pblk->inflight_io); | 186 | atomic_dec(&pblk->inflight_io); |
154 | } | 187 | } |
155 | 188 | ||
189 | static void pblk_end_io_read(struct nvm_rq *rqd) | ||
190 | { | ||
191 | struct pblk *pblk = rqd->private; | ||
192 | |||
193 | __pblk_end_io_read(pblk, rqd, true); | ||
194 | } | ||
195 | |||
156 | static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | 196 | static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, |
157 | unsigned int bio_init_idx, | 197 | unsigned int bio_init_idx, |
158 | unsigned long *read_bitmap) | 198 | unsigned long *read_bitmap) |
159 | { | 199 | { |
160 | struct bio *new_bio, *bio = rqd->bio; | 200 | struct bio *new_bio, *bio = rqd->bio; |
201 | struct pblk_sec_meta *meta_list = rqd->meta_list; | ||
161 | struct bio_vec src_bv, dst_bv; | 202 | struct bio_vec src_bv, dst_bv; |
162 | void *ppa_ptr = NULL; | 203 | void *ppa_ptr = NULL; |
163 | void *src_p, *dst_p; | 204 | void *src_p, *dst_p; |
164 | dma_addr_t dma_ppa_list = 0; | 205 | dma_addr_t dma_ppa_list = 0; |
206 | __le64 *lba_list_mem, *lba_list_media; | ||
165 | int nr_secs = rqd->nr_ppas; | 207 | int nr_secs = rqd->nr_ppas; |
166 | int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); | 208 | int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); |
167 | int i, ret, hole; | 209 | int i, ret, hole; |
168 | DECLARE_COMPLETION_ONSTACK(wait); | 210 | |
211 | /* Re-use allocated memory for intermediate lbas */ | ||
212 | lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); | ||
213 | lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); | ||
169 | 214 | ||
170 | new_bio = bio_alloc(GFP_KERNEL, nr_holes); | 215 | new_bio = bio_alloc(GFP_KERNEL, nr_holes); |
171 | if (!new_bio) { | ||
172 | pr_err("pblk: could not alloc read bio\n"); | ||
173 | return NVM_IO_ERR; | ||
174 | } | ||
175 | 216 | ||
176 | if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) | 217 | if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) |
177 | goto err; | 218 | goto err; |
@@ -181,34 +222,29 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
181 | goto err; | 222 | goto err; |
182 | } | 223 | } |
183 | 224 | ||
225 | for (i = 0; i < nr_secs; i++) | ||
226 | lba_list_mem[i] = meta_list[i].lba; | ||
227 | |||
184 | new_bio->bi_iter.bi_sector = 0; /* internal bio */ | 228 | new_bio->bi_iter.bi_sector = 0; /* internal bio */ |
185 | bio_set_op_attrs(new_bio, REQ_OP_READ, 0); | 229 | bio_set_op_attrs(new_bio, REQ_OP_READ, 0); |
186 | new_bio->bi_private = &wait; | ||
187 | new_bio->bi_end_io = pblk_end_bio_sync; | ||
188 | 230 | ||
189 | rqd->bio = new_bio; | 231 | rqd->bio = new_bio; |
190 | rqd->nr_ppas = nr_holes; | 232 | rqd->nr_ppas = nr_holes; |
191 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); | 233 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); |
192 | rqd->end_io = NULL; | ||
193 | 234 | ||
194 | if (unlikely(nr_secs > 1 && nr_holes == 1)) { | 235 | if (unlikely(nr_holes == 1)) { |
195 | ppa_ptr = rqd->ppa_list; | 236 | ppa_ptr = rqd->ppa_list; |
196 | dma_ppa_list = rqd->dma_ppa_list; | 237 | dma_ppa_list = rqd->dma_ppa_list; |
197 | rqd->ppa_addr = rqd->ppa_list[0]; | 238 | rqd->ppa_addr = rqd->ppa_list[0]; |
198 | } | 239 | } |
199 | 240 | ||
200 | ret = pblk_submit_read_io(pblk, rqd); | 241 | ret = pblk_submit_io_sync(pblk, rqd); |
201 | if (ret) { | 242 | if (ret) { |
202 | bio_put(rqd->bio); | 243 | bio_put(rqd->bio); |
203 | pr_err("pblk: read IO submission failed\n"); | 244 | pr_err("pblk: sync read IO submission failed\n"); |
204 | goto err; | 245 | goto err; |
205 | } | 246 | } |
206 | 247 | ||
207 | if (!wait_for_completion_io_timeout(&wait, | ||
208 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
209 | pr_err("pblk: partial read I/O timed out\n"); | ||
210 | } | ||
211 | |||
212 | if (rqd->error) { | 248 | if (rqd->error) { |
213 | atomic_long_inc(&pblk->read_failed); | 249 | atomic_long_inc(&pblk->read_failed); |
214 | #ifdef CONFIG_NVM_DEBUG | 250 | #ifdef CONFIG_NVM_DEBUG |
@@ -216,15 +252,31 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
216 | #endif | 252 | #endif |
217 | } | 253 | } |
218 | 254 | ||
219 | if (unlikely(nr_secs > 1 && nr_holes == 1)) { | 255 | if (unlikely(nr_holes == 1)) { |
256 | struct ppa_addr ppa; | ||
257 | |||
258 | ppa = rqd->ppa_addr; | ||
220 | rqd->ppa_list = ppa_ptr; | 259 | rqd->ppa_list = ppa_ptr; |
221 | rqd->dma_ppa_list = dma_ppa_list; | 260 | rqd->dma_ppa_list = dma_ppa_list; |
261 | rqd->ppa_list[0] = ppa; | ||
262 | } | ||
263 | |||
264 | for (i = 0; i < nr_secs; i++) { | ||
265 | lba_list_media[i] = meta_list[i].lba; | ||
266 | meta_list[i].lba = lba_list_mem[i]; | ||
222 | } | 267 | } |
223 | 268 | ||
224 | /* Fill the holes in the original bio */ | 269 | /* Fill the holes in the original bio */ |
225 | i = 0; | 270 | i = 0; |
226 | hole = find_first_zero_bit(read_bitmap, nr_secs); | 271 | hole = find_first_zero_bit(read_bitmap, nr_secs); |
227 | do { | 272 | do { |
273 | int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); | ||
274 | struct pblk_line *line = &pblk->lines[line_id]; | ||
275 | |||
276 | kref_put(&line->ref, pblk_line_put); | ||
277 | |||
278 | meta_list[hole].lba = lba_list_media[i]; | ||
279 | |||
228 | src_bv = new_bio->bi_io_vec[i++]; | 280 | src_bv = new_bio->bi_io_vec[i++]; |
229 | dst_bv = bio->bi_io_vec[bio_init_idx + hole]; | 281 | dst_bv = bio->bi_io_vec[bio_init_idx + hole]; |
230 | 282 | ||
@@ -238,7 +290,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
238 | kunmap_atomic(src_p); | 290 | kunmap_atomic(src_p); |
239 | kunmap_atomic(dst_p); | 291 | kunmap_atomic(dst_p); |
240 | 292 | ||
241 | mempool_free(src_bv.bv_page, pblk->page_pool); | 293 | mempool_free(src_bv.bv_page, pblk->page_bio_pool); |
242 | 294 | ||
243 | hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); | 295 | hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); |
244 | } while (hole < nr_secs); | 296 | } while (hole < nr_secs); |
@@ -246,34 +298,26 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
246 | bio_put(new_bio); | 298 | bio_put(new_bio); |
247 | 299 | ||
248 | /* Complete the original bio and associated request */ | 300 | /* Complete the original bio and associated request */ |
301 | bio_endio(bio); | ||
249 | rqd->bio = bio; | 302 | rqd->bio = bio; |
250 | rqd->nr_ppas = nr_secs; | 303 | rqd->nr_ppas = nr_secs; |
251 | rqd->private = pblk; | ||
252 | 304 | ||
253 | bio_endio(bio); | 305 | __pblk_end_io_read(pblk, rqd, false); |
254 | pblk_end_io_read(rqd); | ||
255 | return NVM_IO_OK; | 306 | return NVM_IO_OK; |
256 | 307 | ||
257 | err: | 308 | err: |
258 | /* Free allocated pages in new bio */ | 309 | /* Free allocated pages in new bio */ |
259 | pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); | 310 | pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); |
260 | rqd->private = pblk; | 311 | __pblk_end_io_read(pblk, rqd, false); |
261 | pblk_end_io_read(rqd); | ||
262 | return NVM_IO_ERR; | 312 | return NVM_IO_ERR; |
263 | } | 313 | } |
264 | 314 | ||
265 | static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, | 315 | static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, |
266 | unsigned long *read_bitmap) | 316 | sector_t lba, unsigned long *read_bitmap) |
267 | { | 317 | { |
318 | struct pblk_sec_meta *meta_list = rqd->meta_list; | ||
268 | struct bio *bio = rqd->bio; | 319 | struct bio *bio = rqd->bio; |
269 | struct ppa_addr ppa; | 320 | struct ppa_addr ppa; |
270 | sector_t lba = pblk_get_lba(bio); | ||
271 | |||
272 | /* logic error: lba out-of-bounds. Ignore read request */ | ||
273 | if (lba >= pblk->rl.nr_secs) { | ||
274 | WARN(1, "pblk: read lba out of bounds\n"); | ||
275 | return; | ||
276 | } | ||
277 | 321 | ||
278 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); | 322 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); |
279 | 323 | ||
@@ -284,6 +328,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
284 | retry: | 328 | retry: |
285 | if (pblk_ppa_empty(ppa)) { | 329 | if (pblk_ppa_empty(ppa)) { |
286 | WARN_ON(test_and_set_bit(0, read_bitmap)); | 330 | WARN_ON(test_and_set_bit(0, read_bitmap)); |
331 | meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); | ||
287 | return; | 332 | return; |
288 | } | 333 | } |
289 | 334 | ||
@@ -295,9 +340,12 @@ retry: | |||
295 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); | 340 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); |
296 | goto retry; | 341 | goto retry; |
297 | } | 342 | } |
343 | |||
298 | WARN_ON(test_and_set_bit(0, read_bitmap)); | 344 | WARN_ON(test_and_set_bit(0, read_bitmap)); |
345 | meta_list[0].lba = cpu_to_le64(lba); | ||
346 | |||
299 | #ifdef CONFIG_NVM_DEBUG | 347 | #ifdef CONFIG_NVM_DEBUG |
300 | atomic_long_inc(&pblk->cache_reads); | 348 | atomic_long_inc(&pblk->cache_reads); |
301 | #endif | 349 | #endif |
302 | } else { | 350 | } else { |
303 | rqd->ppa_addr = ppa; | 351 | rqd->ppa_addr = ppa; |
@@ -309,22 +357,24 @@ retry: | |||
309 | int pblk_submit_read(struct pblk *pblk, struct bio *bio) | 357 | int pblk_submit_read(struct pblk *pblk, struct bio *bio) |
310 | { | 358 | { |
311 | struct nvm_tgt_dev *dev = pblk->dev; | 359 | struct nvm_tgt_dev *dev = pblk->dev; |
360 | sector_t blba = pblk_get_lba(bio); | ||
312 | unsigned int nr_secs = pblk_get_secs(bio); | 361 | unsigned int nr_secs = pblk_get_secs(bio); |
362 | struct pblk_g_ctx *r_ctx; | ||
313 | struct nvm_rq *rqd; | 363 | struct nvm_rq *rqd; |
314 | unsigned long read_bitmap; /* Max 64 ppas per request */ | ||
315 | unsigned int bio_init_idx; | 364 | unsigned int bio_init_idx; |
365 | unsigned long read_bitmap; /* Max 64 ppas per request */ | ||
316 | int ret = NVM_IO_ERR; | 366 | int ret = NVM_IO_ERR; |
317 | 367 | ||
318 | if (nr_secs > PBLK_MAX_REQ_ADDRS) | 368 | /* logic error: lba out-of-bounds. Ignore read request */ |
369 | if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) { | ||
370 | WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n", | ||
371 | (unsigned long long)blba, nr_secs); | ||
319 | return NVM_IO_ERR; | 372 | return NVM_IO_ERR; |
373 | } | ||
320 | 374 | ||
321 | bitmap_zero(&read_bitmap, nr_secs); | 375 | bitmap_zero(&read_bitmap, nr_secs); |
322 | 376 | ||
323 | rqd = pblk_alloc_rqd(pblk, READ); | 377 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); |
324 | if (IS_ERR(rqd)) { | ||
325 | pr_err_ratelimited("pblk: not able to alloc rqd"); | ||
326 | return NVM_IO_ERR; | ||
327 | } | ||
328 | 378 | ||
329 | rqd->opcode = NVM_OP_PREAD; | 379 | rqd->opcode = NVM_OP_PREAD; |
330 | rqd->bio = bio; | 380 | rqd->bio = bio; |
@@ -332,6 +382,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
332 | rqd->private = pblk; | 382 | rqd->private = pblk; |
333 | rqd->end_io = pblk_end_io_read; | 383 | rqd->end_io = pblk_end_io_read; |
334 | 384 | ||
385 | r_ctx = nvm_rq_to_pdu(rqd); | ||
386 | r_ctx->lba = blba; | ||
387 | |||
335 | /* Save the index for this bio's start. This is needed in case | 388 | /* Save the index for this bio's start. This is needed in case |
336 | * we need to fill a partial read. | 389 | * we need to fill a partial read. |
337 | */ | 390 | */ |
@@ -348,23 +401,22 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
348 | rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; | 401 | rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; |
349 | rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; | 402 | rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; |
350 | 403 | ||
351 | pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); | 404 | pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap); |
352 | } else { | 405 | } else { |
353 | pblk_read_rq(pblk, rqd, &read_bitmap); | 406 | pblk_read_rq(pblk, rqd, blba, &read_bitmap); |
354 | } | 407 | } |
355 | 408 | ||
356 | bio_get(bio); | 409 | bio_get(bio); |
357 | if (bitmap_full(&read_bitmap, nr_secs)) { | 410 | if (bitmap_full(&read_bitmap, nr_secs)) { |
358 | bio_endio(bio); | 411 | bio_endio(bio); |
359 | atomic_inc(&pblk->inflight_io); | 412 | atomic_inc(&pblk->inflight_io); |
360 | pblk_end_io_read(rqd); | 413 | __pblk_end_io_read(pblk, rqd, false); |
361 | return NVM_IO_OK; | 414 | return NVM_IO_OK; |
362 | } | 415 | } |
363 | 416 | ||
364 | /* All sectors are to be read from the device */ | 417 | /* All sectors are to be read from the device */ |
365 | if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { | 418 | if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { |
366 | struct bio *int_bio = NULL; | 419 | struct bio *int_bio = NULL; |
367 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); | ||
368 | 420 | ||
369 | /* Clone read bio to deal with read errors internally */ | 421 | /* Clone read bio to deal with read errors internally */ |
370 | int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); | 422 | int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); |
@@ -399,40 +451,46 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
399 | return NVM_IO_OK; | 451 | return NVM_IO_OK; |
400 | 452 | ||
401 | fail_rqd_free: | 453 | fail_rqd_free: |
402 | pblk_free_rqd(pblk, rqd, READ); | 454 | pblk_free_rqd(pblk, rqd, PBLK_READ); |
403 | return ret; | 455 | return ret; |
404 | } | 456 | } |
405 | 457 | ||
406 | static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | 458 | static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, |
407 | struct pblk_line *line, u64 *lba_list, | 459 | struct pblk_line *line, u64 *lba_list, |
408 | unsigned int nr_secs) | 460 | u64 *paddr_list_gc, unsigned int nr_secs) |
409 | { | 461 | { |
410 | struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; | 462 | struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS]; |
463 | struct ppa_addr ppa_gc; | ||
411 | int valid_secs = 0; | 464 | int valid_secs = 0; |
412 | int i; | 465 | int i; |
413 | 466 | ||
414 | pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); | 467 | pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); |
415 | 468 | ||
416 | for (i = 0; i < nr_secs; i++) { | 469 | for (i = 0; i < nr_secs; i++) { |
417 | if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || | 470 | if (lba_list[i] == ADDR_EMPTY) |
418 | pblk_ppa_empty(ppas[i])) { | 471 | continue; |
419 | lba_list[i] = ADDR_EMPTY; | 472 | |
473 | ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); | ||
474 | if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { | ||
475 | paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; | ||
420 | continue; | 476 | continue; |
421 | } | 477 | } |
422 | 478 | ||
423 | rqd->ppa_list[valid_secs++] = ppas[i]; | 479 | rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; |
424 | } | 480 | } |
425 | 481 | ||
426 | #ifdef CONFIG_NVM_DEBUG | 482 | #ifdef CONFIG_NVM_DEBUG |
427 | atomic_long_add(valid_secs, &pblk->inflight_reads); | 483 | atomic_long_add(valid_secs, &pblk->inflight_reads); |
428 | #endif | 484 | #endif |
485 | |||
429 | return valid_secs; | 486 | return valid_secs; |
430 | } | 487 | } |
431 | 488 | ||
432 | static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | 489 | static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, |
433 | struct pblk_line *line, sector_t lba) | 490 | struct pblk_line *line, sector_t lba, |
491 | u64 paddr_gc) | ||
434 | { | 492 | { |
435 | struct ppa_addr ppa; | 493 | struct ppa_addr ppa_l2p, ppa_gc; |
436 | int valid_secs = 0; | 494 | int valid_secs = 0; |
437 | 495 | ||
438 | if (lba == ADDR_EMPTY) | 496 | if (lba == ADDR_EMPTY) |
@@ -445,15 +503,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | |||
445 | } | 503 | } |
446 | 504 | ||
447 | spin_lock(&pblk->trans_lock); | 505 | spin_lock(&pblk->trans_lock); |
448 | ppa = pblk_trans_map_get(pblk, lba); | 506 | ppa_l2p = pblk_trans_map_get(pblk, lba); |
449 | spin_unlock(&pblk->trans_lock); | 507 | spin_unlock(&pblk->trans_lock); |
450 | 508 | ||
451 | /* Ignore updated values until the moment */ | 509 | ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); |
452 | if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || | 510 | if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) |
453 | pblk_ppa_empty(ppa)) | ||
454 | goto out; | 511 | goto out; |
455 | 512 | ||
456 | rqd->ppa_addr = ppa; | 513 | rqd->ppa_addr = ppa_l2p; |
457 | valid_secs = 1; | 514 | valid_secs = 1; |
458 | 515 | ||
459 | #ifdef CONFIG_NVM_DEBUG | 516 | #ifdef CONFIG_NVM_DEBUG |
@@ -464,42 +521,44 @@ out: | |||
464 | return valid_secs; | 521 | return valid_secs; |
465 | } | 522 | } |
466 | 523 | ||
467 | int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, | 524 | int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) |
468 | unsigned int nr_secs, unsigned int *secs_to_gc, | ||
469 | struct pblk_line *line) | ||
470 | { | 525 | { |
471 | struct nvm_tgt_dev *dev = pblk->dev; | 526 | struct nvm_tgt_dev *dev = pblk->dev; |
472 | struct nvm_geo *geo = &dev->geo; | 527 | struct nvm_geo *geo = &dev->geo; |
473 | struct bio *bio; | 528 | struct bio *bio; |
474 | struct nvm_rq rqd; | 529 | struct nvm_rq rqd; |
475 | int ret, data_len; | 530 | int data_len; |
476 | DECLARE_COMPLETION_ONSTACK(wait); | 531 | int ret = NVM_IO_OK; |
477 | 532 | ||
478 | memset(&rqd, 0, sizeof(struct nvm_rq)); | 533 | memset(&rqd, 0, sizeof(struct nvm_rq)); |
479 | 534 | ||
480 | rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, | 535 | rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, |
481 | &rqd.dma_meta_list); | 536 | &rqd.dma_meta_list); |
482 | if (!rqd.meta_list) | 537 | if (!rqd.meta_list) |
483 | return NVM_IO_ERR; | 538 | return -ENOMEM; |
484 | 539 | ||
485 | if (nr_secs > 1) { | 540 | if (gc_rq->nr_secs > 1) { |
486 | rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; | 541 | rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; |
487 | rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; | 542 | rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; |
488 | 543 | ||
489 | *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, | 544 | gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, |
490 | nr_secs); | 545 | gc_rq->lba_list, |
491 | if (*secs_to_gc == 1) | 546 | gc_rq->paddr_list, |
547 | gc_rq->nr_secs); | ||
548 | if (gc_rq->secs_to_gc == 1) | ||
492 | rqd.ppa_addr = rqd.ppa_list[0]; | 549 | rqd.ppa_addr = rqd.ppa_list[0]; |
493 | } else { | 550 | } else { |
494 | *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); | 551 | gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, |
552 | gc_rq->lba_list[0], | ||
553 | gc_rq->paddr_list[0]); | ||
495 | } | 554 | } |
496 | 555 | ||
497 | if (!(*secs_to_gc)) | 556 | if (!(gc_rq->secs_to_gc)) |
498 | goto out; | 557 | goto out; |
499 | 558 | ||
500 | data_len = (*secs_to_gc) * geo->sec_size; | 559 | data_len = (gc_rq->secs_to_gc) * geo->sec_size; |
501 | bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, | 560 | bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, |
502 | PBLK_KMALLOC_META, GFP_KERNEL); | 561 | PBLK_VMALLOC_META, GFP_KERNEL); |
503 | if (IS_ERR(bio)) { | 562 | if (IS_ERR(bio)) { |
504 | pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); | 563 | pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); |
505 | goto err_free_dma; | 564 | goto err_free_dma; |
@@ -509,23 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, | |||
509 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 568 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
510 | 569 | ||
511 | rqd.opcode = NVM_OP_PREAD; | 570 | rqd.opcode = NVM_OP_PREAD; |
512 | rqd.end_io = pblk_end_io_sync; | 571 | rqd.nr_ppas = gc_rq->secs_to_gc; |
513 | rqd.private = &wait; | ||
514 | rqd.nr_ppas = *secs_to_gc; | ||
515 | rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); | 572 | rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); |
516 | rqd.bio = bio; | 573 | rqd.bio = bio; |
517 | 574 | ||
518 | ret = pblk_submit_read_io(pblk, &rqd); | 575 | if (pblk_submit_io_sync(pblk, &rqd)) { |
519 | if (ret) { | 576 | ret = -EIO; |
520 | bio_endio(bio); | ||
521 | pr_err("pblk: GC read request failed\n"); | 577 | pr_err("pblk: GC read request failed\n"); |
522 | goto err_free_dma; | 578 | goto err_free_bio; |
523 | } | 579 | } |
524 | 580 | ||
525 | if (!wait_for_completion_io_timeout(&wait, | ||
526 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
527 | pr_err("pblk: GC read I/O timed out\n"); | ||
528 | } | ||
529 | atomic_dec(&pblk->inflight_io); | 581 | atomic_dec(&pblk->inflight_io); |
530 | 582 | ||
531 | if (rqd.error) { | 583 | if (rqd.error) { |
@@ -536,16 +588,18 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, | |||
536 | } | 588 | } |
537 | 589 | ||
538 | #ifdef CONFIG_NVM_DEBUG | 590 | #ifdef CONFIG_NVM_DEBUG |
539 | atomic_long_add(*secs_to_gc, &pblk->sync_reads); | 591 | atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); |
540 | atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); | 592 | atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); |
541 | atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); | 593 | atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); |
542 | #endif | 594 | #endif |
543 | 595 | ||
544 | out: | 596 | out: |
545 | nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); | 597 | nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); |
546 | return NVM_IO_OK; | 598 | return ret; |
547 | 599 | ||
600 | err_free_bio: | ||
601 | bio_put(bio); | ||
548 | err_free_dma: | 602 | err_free_dma: |
549 | nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); | 603 | nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); |
550 | return NVM_IO_ERR; | 604 | return ret; |
551 | } | 605 | } |
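The GC read path now revalidates every candidate sector by comparing its current L2P mapping against the physical address GC expects inside the victim line (addr_to_gen_ppa() of the stored paddr); anything that moved or was cached since the line was selected is dropped. Schematically, with a made-up address layout rather than struct ppa_addr:

#include <stdbool.h>
#include <stdio.h>

struct ppa { int line; int sec; };      /* made-up layout for illustration */

static bool ppa_comp(struct ppa a, struct ppa b)
{
        return a.line == b.line && a.sec == b.sec;
}

int main(void)
{
        struct ppa l2p = { .line = 3, .sec = 17 };      /* current mapping */
        struct ppa gc  = { .line = 3, .sec = 17 };      /* expected by GC  */

        if (ppa_comp(l2p, gc))
                puts("still on the victim line: read and rewrite");
        else
                puts("moved or cached since selection: skip");
        return 0;
}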
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index cb556e06673e..eadb3eb5d4dc 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work) | |||
34 | max_secs); | 34 | max_secs); |
35 | 35 | ||
36 | bio = bio_alloc(GFP_KERNEL, nr_rec_secs); | 36 | bio = bio_alloc(GFP_KERNEL, nr_rec_secs); |
37 | if (!bio) { | ||
38 | pr_err("pblk: not able to create recovery bio\n"); | ||
39 | return; | ||
40 | } | ||
41 | 37 | ||
42 | bio->bi_iter.bi_sector = 0; | 38 | bio->bi_iter.bi_sector = 0; |
43 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | 39 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
@@ -71,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work) | |||
71 | 67 | ||
72 | err: | 68 | err: |
73 | bio_put(bio); | 69 | bio_put(bio); |
74 | pblk_free_rqd(pblk, rqd, WRITE); | 70 | pblk_free_rqd(pblk, rqd, PBLK_WRITE); |
75 | } | 71 | } |
76 | 72 | ||
77 | int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, | 73 | int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, |
@@ -84,12 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, | |||
84 | struct pblk_c_ctx *rec_ctx; | 80 | struct pblk_c_ctx *rec_ctx; |
85 | int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; | 81 | int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; |
86 | 82 | ||
87 | rec_rqd = pblk_alloc_rqd(pblk, WRITE); | 83 | rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); |
88 | if (IS_ERR(rec_rqd)) { | ||
89 | pr_err("pblk: could not create recovery req.\n"); | ||
90 | return -ENOMEM; | ||
91 | } | ||
92 | |||
93 | rec_ctx = nvm_rq_to_pdu(rec_rqd); | 84 | rec_ctx = nvm_rq_to_pdu(rec_rqd); |
94 | 85 | ||
95 | /* Copy completion bitmap, but exclude the first X completed entries */ | 86 | /* Copy completion bitmap, but exclude the first X completed entries */ |
@@ -142,19 +133,19 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | |||
142 | struct pblk_emeta *emeta = line->emeta; | 133 | struct pblk_emeta *emeta = line->emeta; |
143 | struct line_emeta *emeta_buf = emeta->buf; | 134 | struct line_emeta *emeta_buf = emeta->buf; |
144 | __le64 *lba_list; | 135 | __le64 *lba_list; |
145 | int data_start; | 136 | u64 data_start, data_end; |
146 | int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; | 137 | u64 nr_valid_lbas, nr_lbas = 0; |
147 | int i; | 138 | u64 i; |
148 | 139 | ||
149 | lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); | 140 | lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); |
150 | if (!lba_list) | 141 | if (!lba_list) |
151 | return 1; | 142 | return 1; |
152 | 143 | ||
153 | data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; | 144 | data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; |
154 | nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; | 145 | data_end = line->emeta_ssec; |
155 | nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); | 146 | nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); |
156 | 147 | ||
157 | for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { | 148 | for (i = data_start; i < data_end; i++) { |
158 | struct ppa_addr ppa; | 149 | struct ppa_addr ppa; |
159 | int pos; | 150 | int pos; |
160 | 151 | ||
@@ -181,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | |||
181 | } | 172 | } |
182 | 173 | ||
183 | if (nr_valid_lbas != nr_lbas) | 174 | if (nr_valid_lbas != nr_lbas) |
184 | pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", | 175 | pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", |
185 | line->id, emeta_buf->nr_valid_lbas, nr_lbas); | 176 | line->id, nr_valid_lbas, nr_lbas); |
186 | 177 | ||
187 | line->left_msecs = 0; | 178 | line->left_msecs = 0; |
188 | 179 | ||
@@ -225,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, | |||
225 | int rq_ppas, rq_len; | 216 | int rq_ppas, rq_len; |
226 | int i, j; | 217 | int i, j; |
227 | int ret = 0; | 218 | int ret = 0; |
228 | DECLARE_COMPLETION_ONSTACK(wait); | ||
229 | 219 | ||
230 | ppa_list = p.ppa_list; | 220 | ppa_list = p.ppa_list; |
231 | meta_list = p.meta_list; | 221 | meta_list = p.meta_list; |
@@ -262,8 +252,6 @@ next_read_rq: | |||
262 | rqd->ppa_list = ppa_list; | 252 | rqd->ppa_list = ppa_list; |
263 | rqd->dma_ppa_list = dma_ppa_list; | 253 | rqd->dma_ppa_list = dma_ppa_list; |
264 | rqd->dma_meta_list = dma_meta_list; | 254 | rqd->dma_meta_list = dma_meta_list; |
265 | rqd->end_io = pblk_end_io_sync; | ||
266 | rqd->private = &wait; | ||
267 | 255 | ||
268 | if (pblk_io_aligned(pblk, rq_ppas)) | 256 | if (pblk_io_aligned(pblk, rq_ppas)) |
269 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); | 257 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); |
@@ -289,19 +277,13 @@ next_read_rq: | |||
289 | } | 277 | } |
290 | 278 | ||
291 | /* If read fails, more padding is needed */ | 279 | /* If read fails, more padding is needed */ |
292 | ret = pblk_submit_io(pblk, rqd); | 280 | ret = pblk_submit_io_sync(pblk, rqd); |
293 | if (ret) { | 281 | if (ret) { |
294 | pr_err("pblk: I/O submission failed: %d\n", ret); | 282 | pr_err("pblk: I/O submission failed: %d\n", ret); |
295 | return ret; | 283 | return ret; |
296 | } | 284 | } |
297 | 285 | ||
298 | if (!wait_for_completion_io_timeout(&wait, | ||
299 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
300 | pr_err("pblk: L2P recovery read timed out\n"); | ||
301 | return -EINTR; | ||
302 | } | ||
303 | atomic_dec(&pblk->inflight_io); | 286 | atomic_dec(&pblk->inflight_io); |
304 | reinit_completion(&wait); | ||
305 | 287 | ||
306 | /* At this point, the read should not fail. If it does, it is a problem | 288 | /* At this point, the read should not fail. If it does, it is a problem |
307 | * we cannot recover from here. Need FTL log. | 289 | * we cannot recover from here. Need FTL log. |
@@ -338,13 +320,10 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) | |||
338 | { | 320 | { |
339 | struct pblk_pad_rq *pad_rq = rqd->private; | 321 | struct pblk_pad_rq *pad_rq = rqd->private; |
340 | struct pblk *pblk = pad_rq->pblk; | 322 | struct pblk *pblk = pad_rq->pblk; |
341 | struct nvm_tgt_dev *dev = pblk->dev; | ||
342 | 323 | ||
343 | pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); | 324 | pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); |
344 | 325 | ||
345 | bio_put(rqd->bio); | 326 | pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); |
346 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
347 | pblk_free_rqd(pblk, rqd, WRITE); | ||
348 | 327 | ||
349 | atomic_dec(&pblk->inflight_io); | 328 | atomic_dec(&pblk->inflight_io); |
350 | kref_put(&pad_rq->ref, pblk_recov_complete); | 329 | kref_put(&pad_rq->ref, pblk_recov_complete); |
@@ -404,25 +383,21 @@ next_pad_rq: | |||
404 | ppa_list = (void *)(meta_list) + pblk_dma_meta_size; | 383 | ppa_list = (void *)(meta_list) + pblk_dma_meta_size; |
405 | dma_ppa_list = dma_meta_list + pblk_dma_meta_size; | 384 | dma_ppa_list = dma_meta_list + pblk_dma_meta_size; |
406 | 385 | ||
407 | rqd = pblk_alloc_rqd(pblk, WRITE); | ||
408 | if (IS_ERR(rqd)) { | ||
409 | ret = PTR_ERR(rqd); | ||
410 | goto fail_free_meta; | ||
411 | } | ||
412 | |||
413 | bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, | 386 | bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, |
414 | PBLK_VMALLOC_META, GFP_KERNEL); | 387 | PBLK_VMALLOC_META, GFP_KERNEL); |
415 | if (IS_ERR(bio)) { | 388 | if (IS_ERR(bio)) { |
416 | ret = PTR_ERR(bio); | 389 | ret = PTR_ERR(bio); |
417 | goto fail_free_rqd; | 390 | goto fail_free_meta; |
418 | } | 391 | } |
419 | 392 | ||
420 | bio->bi_iter.bi_sector = 0; /* internal bio */ | 393 | bio->bi_iter.bi_sector = 0; /* internal bio */ |
421 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | 394 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
422 | 395 | ||
396 | rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); | ||
397 | |||
423 | rqd->bio = bio; | 398 | rqd->bio = bio; |
424 | rqd->opcode = NVM_OP_PWRITE; | 399 | rqd->opcode = NVM_OP_PWRITE; |
425 | rqd->flags = pblk_set_progr_mode(pblk, WRITE); | 400 | rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
426 | rqd->meta_list = meta_list; | 401 | rqd->meta_list = meta_list; |
427 | rqd->nr_ppas = rq_ppas; | 402 | rqd->nr_ppas = rq_ppas; |
428 | rqd->ppa_list = ppa_list; | 403 | rqd->ppa_list = ppa_list; |
@@ -490,8 +465,6 @@ free_rq: | |||
490 | 465 | ||
491 | fail_free_bio: | 466 | fail_free_bio: |
492 | bio_put(bio); | 467 | bio_put(bio); |
493 | fail_free_rqd: | ||
494 | pblk_free_rqd(pblk, rqd, WRITE); | ||
495 | fail_free_meta: | 468 | fail_free_meta: |
496 | nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); | 469 | nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); |
497 | fail_free_pad: | 470 | fail_free_pad: |
@@ -522,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, | |||
522 | int ret = 0; | 495 | int ret = 0; |
523 | int rec_round; | 496 | int rec_round; |
524 | int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; | 497 | int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; |
525 | DECLARE_COMPLETION_ONSTACK(wait); | ||
526 | 498 | ||
527 | ppa_list = p.ppa_list; | 499 | ppa_list = p.ppa_list; |
528 | meta_list = p.meta_list; | 500 | meta_list = p.meta_list; |
@@ -557,8 +529,6 @@ next_rq: | |||
557 | rqd->ppa_list = ppa_list; | 529 | rqd->ppa_list = ppa_list; |
558 | rqd->dma_ppa_list = dma_ppa_list; | 530 | rqd->dma_ppa_list = dma_ppa_list; |
559 | rqd->dma_meta_list = dma_meta_list; | 531 | rqd->dma_meta_list = dma_meta_list; |
560 | rqd->end_io = pblk_end_io_sync; | ||
561 | rqd->private = &wait; | ||
562 | 532 | ||
563 | if (pblk_io_aligned(pblk, rq_ppas)) | 533 | if (pblk_io_aligned(pblk, rq_ppas)) |
564 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); | 534 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); |
@@ -584,18 +554,13 @@ next_rq: | |||
584 | addr_to_gen_ppa(pblk, w_ptr, line->id); | 554 | addr_to_gen_ppa(pblk, w_ptr, line->id); |
585 | } | 555 | } |
586 | 556 | ||
587 | ret = pblk_submit_io(pblk, rqd); | 557 | ret = pblk_submit_io_sync(pblk, rqd); |
588 | if (ret) { | 558 | if (ret) { |
589 | pr_err("pblk: I/O submission failed: %d\n", ret); | 559 | pr_err("pblk: I/O submission failed: %d\n", ret); |
590 | return ret; | 560 | return ret; |
591 | } | 561 | } |
592 | 562 | ||
593 | if (!wait_for_completion_io_timeout(&wait, | ||
594 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
595 | pr_err("pblk: L2P recovery read timed out\n"); | ||
596 | } | ||
597 | atomic_dec(&pblk->inflight_io); | 563 | atomic_dec(&pblk->inflight_io); |
598 | reinit_completion(&wait); | ||
599 | 564 | ||
600 | /* This should not happen since the read failed during normal recovery, | 565 | /* This should not happen since the read failed during normal recovery, |
601 | * but the media works funny sometimes... | 566 | * but the media works funny sometimes... |
@@ -663,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, | |||
663 | int i, j; | 628 | int i, j; |
664 | int ret = 0; | 629 | int ret = 0; |
665 | int left_ppas = pblk_calc_sec_in_line(pblk, line); | 630 | int left_ppas = pblk_calc_sec_in_line(pblk, line); |
666 | DECLARE_COMPLETION_ONSTACK(wait); | ||
667 | 631 | ||
668 | ppa_list = p.ppa_list; | 632 | ppa_list = p.ppa_list; |
669 | meta_list = p.meta_list; | 633 | meta_list = p.meta_list; |
@@ -696,8 +660,6 @@ next_rq: | |||
696 | rqd->ppa_list = ppa_list; | 660 | rqd->ppa_list = ppa_list; |
697 | rqd->dma_ppa_list = dma_ppa_list; | 661 | rqd->dma_ppa_list = dma_ppa_list; |
698 | rqd->dma_meta_list = dma_meta_list; | 662 | rqd->dma_meta_list = dma_meta_list; |
699 | rqd->end_io = pblk_end_io_sync; | ||
700 | rqd->private = &wait; | ||
701 | 663 | ||
702 | if (pblk_io_aligned(pblk, rq_ppas)) | 664 | if (pblk_io_aligned(pblk, rq_ppas)) |
703 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); | 665 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); |
@@ -723,19 +685,14 @@ next_rq: | |||
723 | addr_to_gen_ppa(pblk, paddr, line->id); | 685 | addr_to_gen_ppa(pblk, paddr, line->id); |
724 | } | 686 | } |
725 | 687 | ||
726 | ret = pblk_submit_io(pblk, rqd); | 688 | ret = pblk_submit_io_sync(pblk, rqd); |
727 | if (ret) { | 689 | if (ret) { |
728 | pr_err("pblk: I/O submission failed: %d\n", ret); | 690 | pr_err("pblk: I/O submission failed: %d\n", ret); |
729 | bio_put(bio); | 691 | bio_put(bio); |
730 | return ret; | 692 | return ret; |
731 | } | 693 | } |
732 | 694 | ||
733 | if (!wait_for_completion_io_timeout(&wait, | ||
734 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | ||
735 | pr_err("pblk: L2P recovery read timed out\n"); | ||
736 | } | ||
737 | atomic_dec(&pblk->inflight_io); | 695 | atomic_dec(&pblk->inflight_io); |
738 | reinit_completion(&wait); | ||
739 | 696 | ||
740 | /* Reached the end of the written line */ | 697 | /* Reached the end of the written line */ |
741 | if (rqd->error) { | 698 | if (rqd->error) { |
@@ -785,15 +742,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) | |||
785 | dma_addr_t dma_ppa_list, dma_meta_list; | 742 | dma_addr_t dma_ppa_list, dma_meta_list; |
786 | int done, ret = 0; | 743 | int done, ret = 0; |
787 | 744 | ||
788 | rqd = pblk_alloc_rqd(pblk, READ); | ||
789 | if (IS_ERR(rqd)) | ||
790 | return PTR_ERR(rqd); | ||
791 | |||
792 | meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); | 745 | meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); |
793 | if (!meta_list) { | 746 | if (!meta_list) |
794 | ret = -ENOMEM; | 747 | return -ENOMEM; |
795 | goto free_rqd; | ||
796 | } | ||
797 | 748 | ||
798 | ppa_list = (void *)(meta_list) + pblk_dma_meta_size; | 749 | ppa_list = (void *)(meta_list) + pblk_dma_meta_size; |
799 | dma_ppa_list = dma_meta_list + pblk_dma_meta_size; | 750 | dma_ppa_list = dma_meta_list + pblk_dma_meta_size; |
@@ -804,6 +755,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) | |||
804 | goto free_meta_list; | 755 | goto free_meta_list; |
805 | } | 756 | } |
806 | 757 | ||
758 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); | ||
759 | |||
807 | p.ppa_list = ppa_list; | 760 | p.ppa_list = ppa_list; |
808 | p.meta_list = meta_list; | 761 | p.meta_list = meta_list; |
809 | p.rqd = rqd; | 762 | p.rqd = rqd; |
@@ -832,8 +785,6 @@ out: | |||
832 | kfree(data); | 785 | kfree(data); |
833 | free_meta_list: | 786 | free_meta_list: |
834 | nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); | 787 | nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); |
835 | free_rqd: | ||
836 | pblk_free_rqd(pblk, rqd, READ); | ||
837 | 788 | ||
838 | return ret; | 789 | return ret; |
839 | } | 790 | } |
@@ -851,11 +802,33 @@ static void pblk_recov_line_add_ordered(struct list_head *head, | |||
851 | __list_add(&line->list, t->list.prev, &t->list); | 802 | __list_add(&line->list, t->list.prev, &t->list); |
852 | } | 803 | } |
853 | 804 | ||
854 | struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | 805 | static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) |
855 | { | 806 | { |
856 | struct nvm_tgt_dev *dev = pblk->dev; | 807 | struct nvm_tgt_dev *dev = pblk->dev; |
857 | struct nvm_geo *geo = &dev->geo; | 808 | struct nvm_geo *geo = &dev->geo; |
858 | struct pblk_line_meta *lm = &pblk->lm; | 809 | struct pblk_line_meta *lm = &pblk->lm; |
810 | unsigned int emeta_secs; | ||
811 | u64 emeta_start; | ||
812 | struct ppa_addr ppa; | ||
813 | int pos; | ||
814 | |||
815 | emeta_secs = lm->emeta_sec[0]; | ||
816 | emeta_start = lm->sec_per_line; | ||
817 | |||
818 | while (emeta_secs) { | ||
819 | emeta_start--; | ||
820 | ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); | ||
821 | pos = pblk_ppa_to_pos(geo, ppa); | ||
822 | if (!test_bit(pos, line->blk_bitmap)) | ||
823 | emeta_secs--; | ||
824 | } | ||
825 | |||
826 | return emeta_start; | ||
827 | } | ||
828 | |||
829 | struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | ||
830 | { | ||
831 | struct pblk_line_meta *lm = &pblk->lm; | ||
859 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 832 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
860 | struct pblk_line *line, *tline, *data_line = NULL; | 833 | struct pblk_line *line, *tline, *data_line = NULL; |
861 | struct pblk_smeta *smeta; | 834 | struct pblk_smeta *smeta; |
@@ -900,9 +873,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
900 | if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) | 873 | if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) |
901 | continue; | 874 | continue; |
902 | 875 | ||
903 | if (le16_to_cpu(smeta_buf->header.version) != 1) { | 876 | if (smeta_buf->header.version != SMETA_VERSION) { |
904 | pr_err("pblk: found incompatible line version %u\n", | 877 | pr_err("pblk: found incompatible line version %u\n", |
905 | smeta_buf->header.version); | 878 | le16_to_cpu(smeta_buf->header.version)); |
906 | return ERR_PTR(-EINVAL); | 879 | return ERR_PTR(-EINVAL); |
907 | } | 880 | } |
908 | 881 | ||
@@ -954,15 +927,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
954 | 927 | ||
955 | /* Verify closed blocks and recover this portion of L2P table*/ | 928 | /* Verify closed blocks and recover this portion of L2P table*/ |
956 | list_for_each_entry_safe(line, tline, &recov_list, list) { | 929 | list_for_each_entry_safe(line, tline, &recov_list, list) { |
957 | int off, nr_bb; | ||
958 | |||
959 | recovered_lines++; | 930 | recovered_lines++; |
960 | /* Calculate where emeta starts based on the line bb */ | ||
961 | off = lm->sec_per_line - lm->emeta_sec[0]; | ||
962 | nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); | ||
963 | off -= nr_bb * geo->sec_per_pl; | ||
964 | 931 | ||
965 | line->emeta_ssec = off; | 932 | line->emeta_ssec = pblk_line_emeta_start(pblk, line); |
966 | line->emeta = emeta; | 933 | line->emeta = emeta; |
967 | memset(line->emeta->buf, 0, lm->emeta_len[0]); | 934 | memset(line->emeta->buf, 0, lm->emeta_len[0]); |
968 | 935 | ||
@@ -987,7 +954,7 @@ next: | |||
987 | list_move_tail(&line->list, move_list); | 954 | list_move_tail(&line->list, move_list); |
988 | spin_unlock(&l_mg->gc_lock); | 955 | spin_unlock(&l_mg->gc_lock); |
989 | 956 | ||
990 | mempool_free(line->map_bitmap, pblk->line_meta_pool); | 957 | kfree(line->map_bitmap); |
991 | line->map_bitmap = NULL; | 958 | line->map_bitmap = NULL; |
992 | line->smeta = NULL; | 959 | line->smeta = NULL; |
993 | line->emeta = NULL; | 960 | line->emeta = NULL; |
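One behavioural detail in the recovery hunks above: line->emeta_ssec used to be computed as sec_per_line - emeta_sec[0] - nr_bb * sec_per_pl, so every bad block in the line pushed the emeta start forward, whether or not it overlapped the emeta region. The new pblk_line_emeta_start() walks backwards from the end of the line and only skips sectors that actually land on a bad block. A toy, userspace-compilable model of the two calculations (linearised layout and made-up geometry, so the numbers are purely illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	#define SEC_PER_LINE	1024	/* hypothetical geometry */
	#define EMETA_SEC	64
	#define SEC_PER_PL	8

	/* One bad block sitting in the data area, well before the emeta tail. */
	static bool sector_is_bad(int sec)
	{
		return sec >= 100 && sec < 108;
	}

	int main(void)
	{
		/* Old scheme: discount a plane's worth of sectors per bad block,
		 * no matter where the bad block sits in the line.
		 */
		int old_start = SEC_PER_LINE - EMETA_SEC - 1 * SEC_PER_PL;

		/* New scheme (shape of pblk_line_emeta_start): walk back from
		 * the line end and only count good sectors towards the budget.
		 */
		int emeta_secs = EMETA_SEC;
		int new_start = SEC_PER_LINE;

		while (emeta_secs) {
			new_start--;
			if (!sector_is_bad(new_start))
				emeta_secs--;
		}

		printf("old emeta_ssec = %d, new emeta_ssec = %d\n",
		       old_start, new_start);	/* 952 vs 960 here */
		return 0;
	}

In the driver the walk goes through addr_to_pblk_ppa()/pblk_ppa_to_pos() and line->blk_bitmap rather than a linear predicate, but the accounting difference is the same.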
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index 2e6a5361baf0..abae31fd434e 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) | |||
96 | * | 96 | * |
97 | * Only the total number of free blocks is used to configure the rate limiter. | 97 | * Only the total number of free blocks is used to configure the rate limiter. |
98 | */ | 98 | */ |
99 | static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) | 99 | void pblk_rl_update_rates(struct pblk_rl *rl) |
100 | { | 100 | { |
101 | struct pblk *pblk = container_of(rl, struct pblk, rl); | ||
101 | unsigned long free_blocks = pblk_rl_nr_free_blks(rl); | 102 | unsigned long free_blocks = pblk_rl_nr_free_blks(rl); |
103 | int max = rl->rb_budget; | ||
102 | 104 | ||
103 | if (free_blocks >= rl->high) { | 105 | if (free_blocks >= rl->high) { |
104 | rl->rb_user_max = max; | 106 | rl->rb_user_max = max; |
@@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) | |||
124 | rl->rb_state = PBLK_RL_LOW; | 126 | rl->rb_state = PBLK_RL_LOW; |
125 | } | 127 | } |
126 | 128 | ||
127 | return rl->rb_state; | 129 | if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW)) |
130 | pblk_gc_should_start(pblk); | ||
131 | else | ||
132 | pblk_gc_should_stop(pblk); | ||
128 | } | 133 | } |
129 | 134 | ||
130 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) | 135 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) |
131 | { | 136 | { |
132 | struct pblk *pblk = container_of(rl, struct pblk, rl); | ||
133 | int blk_in_line = atomic_read(&line->blk_in_line); | 137 | int blk_in_line = atomic_read(&line->blk_in_line); |
134 | int ret; | ||
135 | 138 | ||
136 | atomic_add(blk_in_line, &rl->free_blocks); | 139 | atomic_add(blk_in_line, &rl->free_blocks); |
137 | /* Rates will not change that often - no need to lock update */ | 140 | pblk_rl_update_rates(rl); |
138 | ret = pblk_rl_update_rates(rl, rl->rb_budget); | ||
139 | |||
140 | if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) | ||
141 | pblk_gc_should_start(pblk); | ||
142 | else | ||
143 | pblk_gc_should_stop(pblk); | ||
144 | } | 141 | } |
145 | 142 | ||
146 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) | 143 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) |
@@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) | |||
148 | int blk_in_line = atomic_read(&line->blk_in_line); | 145 | int blk_in_line = atomic_read(&line->blk_in_line); |
149 | 146 | ||
150 | atomic_sub(blk_in_line, &rl->free_blocks); | 147 | atomic_sub(blk_in_line, &rl->free_blocks); |
151 | } | 148 | pblk_rl_update_rates(rl); |
152 | |||
153 | void pblk_gc_should_kick(struct pblk *pblk) | ||
154 | { | ||
155 | struct pblk_rl *rl = &pblk->rl; | ||
156 | int ret; | ||
157 | |||
158 | /* Rates will not change that often - no need to lock update */ | ||
159 | ret = pblk_rl_update_rates(rl, rl->rb_budget); | ||
160 | if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) | ||
161 | pblk_gc_should_start(pblk); | ||
162 | else | ||
163 | pblk_gc_should_stop(pblk); | ||
164 | } | 149 | } |
165 | 150 | ||
166 | int pblk_rl_high_thrs(struct pblk_rl *rl) | 151 | int pblk_rl_high_thrs(struct pblk_rl *rl) |
@@ -168,14 +153,9 @@ int pblk_rl_high_thrs(struct pblk_rl *rl) | |||
168 | return rl->high; | 153 | return rl->high; |
169 | } | 154 | } |
170 | 155 | ||
171 | int pblk_rl_low_thrs(struct pblk_rl *rl) | 156 | int pblk_rl_max_io(struct pblk_rl *rl) |
172 | { | ||
173 | return rl->low; | ||
174 | } | ||
175 | |||
176 | int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) | ||
177 | { | 157 | { |
178 | return rl->rb_user_max; | 158 | return rl->rb_max_io; |
179 | } | 159 | } |
180 | 160 | ||
181 | static void pblk_rl_u_timer(unsigned long data) | 161 | static void pblk_rl_u_timer(unsigned long data) |
@@ -214,6 +194,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) | |||
214 | /* To start with, all buffer is available to user I/O writers */ | 194 | /* To start with, all buffer is available to user I/O writers */ |
215 | rl->rb_budget = budget; | 195 | rl->rb_budget = budget; |
216 | rl->rb_user_max = budget; | 196 | rl->rb_user_max = budget; |
197 | rl->rb_max_io = budget >> 1; | ||
217 | rl->rb_gc_max = 0; | 198 | rl->rb_gc_max = 0; |
218 | rl->rb_state = PBLK_RL_HIGH; | 199 | rl->rb_state = PBLK_RL_HIGH; |
219 | 200 | ||
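The net effect of the rate-limiter hunks above is that the GC start/stop decision now lives in one place: pblk_rl_update_rates() recomputes the budgets and kicks or stops GC itself, so callers (and the deleted pblk_gc_should_kick()) only adjust the free-block count and re-run it. A small sketch of the resulting caller pattern; pblk_rl_account_line() is a hypothetical name, not part of the patch:

	/* Hypothetical helper to show the pattern: adjust the accounting,
	 * then let the rate limiter decide about GC.
	 */
	static void pblk_rl_account_line(struct pblk_rl *rl, struct pblk_line *line,
					 bool freed)
	{
		int blk_in_line = atomic_read(&line->blk_in_line);

		if (freed)
			atomic_add(blk_in_line, &rl->free_blocks);
		else
			atomic_sub(blk_in_line, &rl->free_blocks);

		/* Recomputes rb_user_max/rb_gc_max and calls
		 * pblk_gc_should_start() or pblk_gc_should_stop() internally.
		 */
		pblk_rl_update_rates(rl);
	}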
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 95fb434e2f01..cd49e8875d4e 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) | |||
253 | sz += snprintf(page + sz, PAGE_SIZE - sz, | 253 | sz += snprintf(page + sz, PAGE_SIZE - sz, |
254 | "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", | 254 | "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", |
255 | gc_full, gc_high, gc_mid, gc_low, gc_empty, | 255 | gc_full, gc_high, gc_mid, gc_low, gc_empty, |
256 | atomic_read(&pblk->gc.inflight_gc)); | 256 | atomic_read(&pblk->gc.read_inflight_gc)); |
257 | 257 | ||
258 | sz += snprintf(page + sz, PAGE_SIZE - sz, | 258 | sz += snprintf(page + sz, PAGE_SIZE - sz, |
259 | "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", | 259 | "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", |
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3ad9e56d2473..6c1cafafef53 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -20,7 +20,6 @@ | |||
20 | static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, | 20 | static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, |
21 | struct pblk_c_ctx *c_ctx) | 21 | struct pblk_c_ctx *c_ctx) |
22 | { | 22 | { |
23 | struct nvm_tgt_dev *dev = pblk->dev; | ||
24 | struct bio *original_bio; | 23 | struct bio *original_bio; |
25 | unsigned long ret; | 24 | unsigned long ret; |
26 | int i; | 25 | int i; |
@@ -33,16 +32,18 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
33 | bio_endio(original_bio); | 32 | bio_endio(original_bio); |
34 | } | 33 | } |
35 | 34 | ||
35 | if (c_ctx->nr_padded) | ||
36 | pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, | ||
37 | c_ctx->nr_padded); | ||
38 | |||
36 | #ifdef CONFIG_NVM_DEBUG | 39 | #ifdef CONFIG_NVM_DEBUG |
37 | atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); | 40 | atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); |
38 | #endif | 41 | #endif |
39 | 42 | ||
40 | ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); | 43 | ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); |
41 | 44 | ||
42 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
43 | |||
44 | bio_put(rqd->bio); | 45 | bio_put(rqd->bio); |
45 | pblk_free_rqd(pblk, rqd, WRITE); | 46 | pblk_free_rqd(pblk, rqd, PBLK_WRITE); |
46 | 47 | ||
47 | return ret; | 48 | return ret; |
48 | } | 49 | } |
@@ -107,10 +108,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) | |||
107 | ppa_list = &rqd->ppa_addr; | 108 | ppa_list = &rqd->ppa_addr; |
108 | 109 | ||
109 | recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); | 110 | recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); |
110 | if (!recovery) { | 111 | |
111 | pr_err("pblk: could not allocate recovery context\n"); | ||
112 | return; | ||
113 | } | ||
114 | INIT_LIST_HEAD(&recovery->failed); | 112 | INIT_LIST_HEAD(&recovery->failed); |
115 | 113 | ||
116 | bit = -1; | 114 | bit = -1; |
@@ -175,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd) | |||
175 | static void pblk_end_io_write_meta(struct nvm_rq *rqd) | 173 | static void pblk_end_io_write_meta(struct nvm_rq *rqd) |
176 | { | 174 | { |
177 | struct pblk *pblk = rqd->private; | 175 | struct pblk *pblk = rqd->private; |
178 | struct nvm_tgt_dev *dev = pblk->dev; | ||
179 | struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); | 176 | struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); |
180 | struct pblk_line *line = m_ctx->private; | 177 | struct pblk_line *line = m_ctx->private; |
181 | struct pblk_emeta *emeta = line->emeta; | 178 | struct pblk_emeta *emeta = line->emeta; |
@@ -187,19 +184,13 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) | |||
187 | pblk_log_write_err(pblk, rqd); | 184 | pblk_log_write_err(pblk, rqd); |
188 | pr_err("pblk: metadata I/O failed. Line %d\n", line->id); | 185 | pr_err("pblk: metadata I/O failed. Line %d\n", line->id); |
189 | } | 186 | } |
190 | #ifdef CONFIG_NVM_DEBUG | ||
191 | else | ||
192 | WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); | ||
193 | #endif | ||
194 | 187 | ||
195 | sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); | 188 | sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); |
196 | if (sync == emeta->nr_entries) | 189 | if (sync == emeta->nr_entries) |
197 | pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, | 190 | pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, |
198 | pblk->close_wq); | 191 | GFP_ATOMIC, pblk->close_wq); |
199 | 192 | ||
200 | bio_put(rqd->bio); | 193 | pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); |
201 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
202 | pblk_free_rqd(pblk, rqd, READ); | ||
203 | 194 | ||
204 | atomic_dec(&pblk->inflight_io); | 195 | atomic_dec(&pblk->inflight_io); |
205 | } | 196 | } |
@@ -213,7 +204,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
213 | /* Setup write request */ | 204 | /* Setup write request */ |
214 | rqd->opcode = NVM_OP_PWRITE; | 205 | rqd->opcode = NVM_OP_PWRITE; |
215 | rqd->nr_ppas = nr_secs; | 206 | rqd->nr_ppas = nr_secs; |
216 | rqd->flags = pblk_set_progr_mode(pblk, WRITE); | 207 | rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
217 | rqd->private = pblk; | 208 | rqd->private = pblk; |
218 | rqd->end_io = end_io; | 209 | rqd->end_io = end_io; |
219 | 210 | ||
@@ -229,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
229 | } | 220 | } |
230 | 221 | ||
231 | static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, | 222 | static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, |
232 | struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) | 223 | struct ppa_addr *erase_ppa) |
233 | { | 224 | { |
234 | struct pblk_line_meta *lm = &pblk->lm; | 225 | struct pblk_line_meta *lm = &pblk->lm; |
235 | struct pblk_line *e_line = pblk_line_get_erase(pblk); | 226 | struct pblk_line *e_line = pblk_line_get_erase(pblk); |
227 | struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); | ||
236 | unsigned int valid = c_ctx->nr_valid; | 228 | unsigned int valid = c_ctx->nr_valid; |
237 | unsigned int padded = c_ctx->nr_padded; | 229 | unsigned int padded = c_ctx->nr_padded; |
238 | unsigned int nr_secs = valid + padded; | 230 | unsigned int nr_secs = valid + padded; |
239 | unsigned long *lun_bitmap; | 231 | unsigned long *lun_bitmap; |
240 | int ret = 0; | 232 | int ret; |
241 | 233 | ||
242 | lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); | 234 | lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); |
243 | if (!lun_bitmap) | 235 | if (!lun_bitmap) |
@@ -279,7 +271,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
279 | pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); | 271 | pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); |
280 | 272 | ||
281 | rqd->ppa_status = (u64)0; | 273 | rqd->ppa_status = (u64)0; |
282 | rqd->flags = pblk_set_progr_mode(pblk, WRITE); | 274 | rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
283 | 275 | ||
284 | return ret; | 276 | return ret; |
285 | } | 277 | } |
@@ -303,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, | |||
303 | return secs_to_sync; | 295 | return secs_to_sync; |
304 | } | 296 | } |
305 | 297 | ||
306 | static inline int pblk_valid_meta_ppa(struct pblk *pblk, | ||
307 | struct pblk_line *meta_line, | ||
308 | struct ppa_addr *ppa_list, int nr_ppas) | ||
309 | { | ||
310 | struct nvm_tgt_dev *dev = pblk->dev; | ||
311 | struct nvm_geo *geo = &dev->geo; | ||
312 | struct pblk_line *data_line; | ||
313 | struct ppa_addr ppa, ppa_opt; | ||
314 | u64 paddr; | ||
315 | int i; | ||
316 | |||
317 | data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; | ||
318 | paddr = pblk_lookup_page(pblk, meta_line); | ||
319 | ppa = addr_to_gen_ppa(pblk, paddr, 0); | ||
320 | |||
321 | if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) | ||
322 | return 1; | ||
323 | |||
324 | /* Schedule a metadata I/O that is half the distance from the data I/O | ||
325 | * with regards to the number of LUNs forming the pblk instance. This | ||
326 | * balances LUN conflicts across every I/O. | ||
327 | * | ||
328 | * When the LUN configuration changes (e.g., due to GC), this distance | ||
329 | * can align, which would result on a LUN deadlock. In this case, modify | ||
330 | * the distance to not be optimal, but allow metadata I/Os to succeed. | ||
331 | */ | ||
332 | ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); | ||
333 | if (unlikely(ppa_opt.ppa == ppa.ppa)) { | ||
334 | data_line->meta_distance--; | ||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) | ||
339 | if (ppa_list[i].g.ch == ppa_opt.g.ch && | ||
340 | ppa_list[i].g.lun == ppa_opt.g.lun) | ||
341 | return 1; | ||
342 | |||
343 | if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { | ||
344 | for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) | ||
345 | if (ppa_list[i].g.ch == ppa.g.ch && | ||
346 | ppa_list[i].g.lun == ppa.g.lun) | ||
347 | return 0; | ||
348 | |||
349 | return 1; | ||
350 | } | ||
351 | |||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) | 298 | int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) |
356 | { | 299 | { |
357 | struct nvm_tgt_dev *dev = pblk->dev; | 300 | struct nvm_tgt_dev *dev = pblk->dev; |
@@ -370,11 +313,8 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) | |||
370 | int i, j; | 313 | int i, j; |
371 | int ret; | 314 | int ret; |
372 | 315 | ||
373 | rqd = pblk_alloc_rqd(pblk, READ); | 316 | rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); |
374 | if (IS_ERR(rqd)) { | 317 | |
375 | pr_err("pblk: cannot allocate write req.\n"); | ||
376 | return PTR_ERR(rqd); | ||
377 | } | ||
378 | m_ctx = nvm_rq_to_pdu(rqd); | 318 | m_ctx = nvm_rq_to_pdu(rqd); |
379 | m_ctx->private = meta_line; | 319 | m_ctx->private = meta_line; |
380 | 320 | ||
@@ -407,8 +347,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) | |||
407 | if (emeta->mem >= lm->emeta_len[0]) { | 347 | if (emeta->mem >= lm->emeta_len[0]) { |
408 | spin_lock(&l_mg->close_lock); | 348 | spin_lock(&l_mg->close_lock); |
409 | list_del(&meta_line->list); | 349 | list_del(&meta_line->list); |
410 | WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), | ||
411 | "pblk: corrupt meta line %d\n", meta_line->id); | ||
412 | spin_unlock(&l_mg->close_lock); | 350 | spin_unlock(&l_mg->close_lock); |
413 | } | 351 | } |
414 | 352 | ||
@@ -428,18 +366,51 @@ fail_rollback: | |||
428 | pblk_dealloc_page(pblk, meta_line, rq_ppas); | 366 | pblk_dealloc_page(pblk, meta_line, rq_ppas); |
429 | list_add(&meta_line->list, &meta_line->list); | 367 | list_add(&meta_line->list, &meta_line->list); |
430 | spin_unlock(&l_mg->close_lock); | 368 | spin_unlock(&l_mg->close_lock); |
431 | |||
432 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
433 | fail_free_bio: | 369 | fail_free_bio: |
434 | if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) | 370 | bio_put(bio); |
435 | bio_put(bio); | ||
436 | fail_free_rqd: | 371 | fail_free_rqd: |
437 | pblk_free_rqd(pblk, rqd, READ); | 372 | pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); |
438 | return ret; | 373 | return ret; |
439 | } | 374 | } |
440 | 375 | ||
441 | static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, | 376 | static inline bool pblk_valid_meta_ppa(struct pblk *pblk, |
442 | int prev_n) | 377 | struct pblk_line *meta_line, |
378 | struct nvm_rq *data_rqd) | ||
379 | { | ||
380 | struct nvm_tgt_dev *dev = pblk->dev; | ||
381 | struct nvm_geo *geo = &dev->geo; | ||
382 | struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); | ||
383 | struct pblk_line *data_line = pblk_line_get_data(pblk); | ||
384 | struct ppa_addr ppa, ppa_opt; | ||
385 | u64 paddr; | ||
386 | int pos_opt; | ||
387 | |||
388 | /* Schedule a metadata I/O that is half the distance from the data I/O | ||
389 | * with regards to the number of LUNs forming the pblk instance. This | ||
390 | * balances LUN conflicts across every I/O. | ||
391 | * | ||
392 | * When the LUN configuration changes (e.g., due to GC), this distance | ||
393 | * can align, which would result on metadata and data I/Os colliding. In | ||
394 | * this case, modify the distance to not be optimal, but move the | ||
395 | * optimal in the right direction. | ||
396 | */ | ||
397 | paddr = pblk_lookup_page(pblk, meta_line); | ||
398 | ppa = addr_to_gen_ppa(pblk, paddr, 0); | ||
399 | ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); | ||
400 | pos_opt = pblk_ppa_to_pos(geo, ppa_opt); | ||
401 | |||
402 | if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || | ||
403 | test_bit(pos_opt, data_line->blk_bitmap)) | ||
404 | return true; | ||
405 | |||
406 | if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) | ||
407 | data_line->meta_distance--; | ||
408 | |||
409 | return false; | ||
410 | } | ||
411 | |||
412 | static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, | ||
413 | struct nvm_rq *data_rqd) | ||
443 | { | 414 | { |
444 | struct pblk_line_meta *lm = &pblk->lm; | 415 | struct pblk_line_meta *lm = &pblk->lm; |
445 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 416 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
@@ -449,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, | |||
449 | retry: | 420 | retry: |
450 | if (list_empty(&l_mg->emeta_list)) { | 421 | if (list_empty(&l_mg->emeta_list)) { |
451 | spin_unlock(&l_mg->close_lock); | 422 | spin_unlock(&l_mg->close_lock); |
452 | return 0; | 423 | return NULL; |
453 | } | 424 | } |
454 | meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); | 425 | meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); |
455 | if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) | 426 | if (meta_line->emeta->mem >= lm->emeta_len[0]) |
456 | goto retry; | 427 | goto retry; |
457 | spin_unlock(&l_mg->close_lock); | 428 | spin_unlock(&l_mg->close_lock); |
458 | 429 | ||
459 | if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) | 430 | if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) |
460 | return 0; | 431 | return NULL; |
461 | 432 | ||
462 | return pblk_submit_meta_io(pblk, meta_line); | 433 | return meta_line; |
463 | } | 434 | } |
464 | 435 | ||
465 | static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | 436 | static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) |
466 | { | 437 | { |
467 | struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); | ||
468 | struct ppa_addr erase_ppa; | 438 | struct ppa_addr erase_ppa; |
439 | struct pblk_line *meta_line; | ||
469 | int err; | 440 | int err; |
470 | 441 | ||
471 | ppa_set_empty(&erase_ppa); | 442 | ppa_set_empty(&erase_ppa); |
472 | 443 | ||
473 | /* Assign lbas to ppas and populate request structure */ | 444 | /* Assign lbas to ppas and populate request structure */ |
474 | err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); | 445 | err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); |
475 | if (err) { | 446 | if (err) { |
476 | pr_err("pblk: could not setup write request: %d\n", err); | 447 | pr_err("pblk: could not setup write request: %d\n", err); |
477 | return NVM_IO_ERR; | 448 | return NVM_IO_ERR; |
478 | } | 449 | } |
479 | 450 | ||
480 | if (likely(ppa_empty(erase_ppa))) { | 451 | meta_line = pblk_should_submit_meta_io(pblk, rqd); |
481 | /* Submit metadata write for previous data line */ | ||
482 | err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); | ||
483 | if (err) { | ||
484 | pr_err("pblk: metadata I/O submission failed: %d", err); | ||
485 | return NVM_IO_ERR; | ||
486 | } | ||
487 | 452 | ||
488 | /* Submit data write for current data line */ | 453 | /* Submit data write for current data line */ |
489 | err = pblk_submit_io(pblk, rqd); | 454 | err = pblk_submit_io(pblk, rqd); |
490 | if (err) { | 455 | if (err) { |
491 | pr_err("pblk: data I/O submission failed: %d\n", err); | 456 | pr_err("pblk: data I/O submission failed: %d\n", err); |
492 | return NVM_IO_ERR; | 457 | return NVM_IO_ERR; |
493 | } | 458 | } |
494 | } else { | ||
495 | /* Submit data write for current data line */ | ||
496 | err = pblk_submit_io(pblk, rqd); | ||
497 | if (err) { | ||
498 | pr_err("pblk: data I/O submission failed: %d\n", err); | ||
499 | return NVM_IO_ERR; | ||
500 | } | ||
501 | 459 | ||
502 | /* Submit available erase for next data line */ | 460 | if (!ppa_empty(erase_ppa)) { |
461 | /* Submit erase for next data line */ | ||
503 | if (pblk_blk_erase_async(pblk, erase_ppa)) { | 462 | if (pblk_blk_erase_async(pblk, erase_ppa)) { |
504 | struct pblk_line *e_line = pblk_line_get_erase(pblk); | 463 | struct pblk_line *e_line = pblk_line_get_erase(pblk); |
505 | struct nvm_tgt_dev *dev = pblk->dev; | 464 | struct nvm_tgt_dev *dev = pblk->dev; |
@@ -512,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
512 | } | 471 | } |
513 | } | 472 | } |
514 | 473 | ||
474 | if (meta_line) { | ||
475 | /* Submit metadata write for previous data line */ | ||
476 | err = pblk_submit_meta_io(pblk, meta_line); | ||
477 | if (err) { | ||
478 | pr_err("pblk: metadata I/O submission failed: %d", err); | ||
479 | return NVM_IO_ERR; | ||
480 | } | ||
481 | } | ||
482 | |||
515 | return NVM_IO_OK; | 483 | return NVM_IO_OK; |
516 | } | 484 | } |
517 | 485 | ||
@@ -521,7 +489,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) | |||
521 | struct bio *bio = rqd->bio; | 489 | struct bio *bio = rqd->bio; |
522 | 490 | ||
523 | if (c_ctx->nr_padded) | 491 | if (c_ctx->nr_padded) |
524 | pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); | 492 | pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, |
493 | c_ctx->nr_padded); | ||
525 | } | 494 | } |
526 | 495 | ||
527 | static int pblk_submit_write(struct pblk *pblk) | 496 | static int pblk_submit_write(struct pblk *pblk) |
@@ -543,31 +512,24 @@ static int pblk_submit_write(struct pblk *pblk) | |||
543 | if (!secs_to_flush && secs_avail < pblk->min_write_pgs) | 512 | if (!secs_to_flush && secs_avail < pblk->min_write_pgs) |
544 | return 1; | 513 | return 1; |
545 | 514 | ||
546 | rqd = pblk_alloc_rqd(pblk, WRITE); | ||
547 | if (IS_ERR(rqd)) { | ||
548 | pr_err("pblk: cannot allocate write req.\n"); | ||
549 | return 1; | ||
550 | } | ||
551 | |||
552 | bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); | ||
553 | if (!bio) { | ||
554 | pr_err("pblk: cannot allocate write bio\n"); | ||
555 | goto fail_free_rqd; | ||
556 | } | ||
557 | bio->bi_iter.bi_sector = 0; /* internal bio */ | ||
558 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | ||
559 | rqd->bio = bio; | ||
560 | |||
561 | secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); | 515 | secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); |
562 | if (secs_to_sync > pblk->max_write_pgs) { | 516 | if (secs_to_sync > pblk->max_write_pgs) { |
563 | pr_err("pblk: bad buffer sync calculation\n"); | 517 | pr_err("pblk: bad buffer sync calculation\n"); |
564 | goto fail_put_bio; | 518 | return 1; |
565 | } | 519 | } |
566 | 520 | ||
567 | secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; | 521 | secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; |
568 | pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); | 522 | pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); |
569 | 523 | ||
570 | if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, | 524 | bio = bio_alloc(GFP_KERNEL, secs_to_sync); |
525 | |||
526 | bio->bi_iter.bi_sector = 0; /* internal bio */ | ||
527 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | ||
528 | |||
529 | rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); | ||
530 | rqd->bio = bio; | ||
531 | |||
532 | if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, | ||
571 | secs_avail)) { | 533 | secs_avail)) { |
572 | pr_err("pblk: corrupted write bio\n"); | 534 | pr_err("pblk: corrupted write bio\n"); |
573 | goto fail_put_bio; | 535 | goto fail_put_bio; |
@@ -586,8 +548,7 @@ fail_free_bio: | |||
586 | pblk_free_write_rqd(pblk, rqd); | 548 | pblk_free_write_rqd(pblk, rqd); |
587 | fail_put_bio: | 549 | fail_put_bio: |
588 | bio_put(bio); | 550 | bio_put(bio); |
589 | fail_free_rqd: | 551 | pblk_free_rqd(pblk, rqd, PBLK_WRITE); |
590 | pblk_free_rqd(pblk, rqd, WRITE); | ||
591 | 552 | ||
592 | return 1; | 553 | return 1; |
593 | } | 554 | } |
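A pattern worth calling out across the write and recovery hunks above: every IS_ERR()/NULL check after pblk_alloc_rqd() and bio_alloc() disappears. Both allocations are mempool-backed and are made with GFP_KERNEL on these paths, so they sleep rather than fail, which is what lets the error branches and their pr_err() messages go away. A hedged sketch of what a type-keyed, cannot-fail pblk_alloc_rqd() presumably looks like, using the request types and pools introduced in the pblk.h hunks below (the actual body is not part of this section):

	/* Sketch only: the real pblk_alloc_rqd() body is not in these hunks.
	 * It assumes the r_rq/w_rq/e_rq mempools named in struct pblk below.
	 */
	static struct nvm_rq *pblk_alloc_rqd_sketch(struct pblk *pblk, int type)
	{
		mempool_t *pool;
		struct nvm_rq *rqd;
		int rq_size;

		switch (type) {
		case PBLK_WRITE:
		case PBLK_WRITE_INT:
			pool = pblk->w_rq_pool;
			rq_size = sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx);
			break;
		case PBLK_READ:
			pool = pblk->r_rq_pool;
			rq_size = sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx);
			break;
		default:	/* PBLK_ERASE */
			pool = pblk->e_rq_pool;
			rq_size = sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx);
			break;
		}

		/* mempool_alloc() with GFP_KERNEL blocks until an element is
		 * available, so the callers above no longer need an error path.
		 */
		rqd = mempool_alloc(pool, GFP_KERNEL);
		memset(rqd, 0, rq_size);	/* clear nvm_rq plus its per-request ctx */

		return rqd;
	}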
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 67e623bd5c2d..90961033a79f 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -40,10 +40,6 @@ | |||
40 | #define PBLK_MAX_REQ_ADDRS (64) | 40 | #define PBLK_MAX_REQ_ADDRS (64) |
41 | #define PBLK_MAX_REQ_ADDRS_PW (6) | 41 | #define PBLK_MAX_REQ_ADDRS_PW (6) |
42 | 42 | ||
43 | #define PBLK_WS_POOL_SIZE (128) | ||
44 | #define PBLK_META_POOL_SIZE (128) | ||
45 | #define PBLK_READ_REQ_POOL_SIZE (1024) | ||
46 | |||
47 | #define PBLK_NR_CLOSE_JOBS (4) | 43 | #define PBLK_NR_CLOSE_JOBS (4) |
48 | 44 | ||
49 | #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) | 45 | #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) |
@@ -59,7 +55,15 @@ | |||
59 | for ((i) = 0, rlun = &(pblk)->luns[0]; \ | 55 | for ((i) = 0, rlun = &(pblk)->luns[0]; \ |
60 | (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) | 56 | (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) |
61 | 57 | ||
62 | #define ERASE 2 /* READ = 0, WRITE = 1 */ | 58 | /* Static pool sizes */ |
59 | #define PBLK_GEN_WS_POOL_SIZE (2) | ||
60 | |||
61 | enum { | ||
62 | PBLK_READ = READ, | ||
63 | PBLK_WRITE = WRITE,/* Write from write buffer */ | ||
64 | PBLK_WRITE_INT, /* Internal write - no write buffer */ | ||
65 | PBLK_ERASE, | ||
66 | }; | ||
63 | 67 | ||
64 | enum { | 68 | enum { |
65 | /* IO Types */ | 69 | /* IO Types */ |
@@ -95,6 +99,7 @@ enum { | |||
95 | }; | 99 | }; |
96 | 100 | ||
97 | #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) | 101 | #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) |
102 | #define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS) | ||
98 | 103 | ||
99 | /* write buffer completion context */ | 104 | /* write buffer completion context */ |
100 | struct pblk_c_ctx { | 105 | struct pblk_c_ctx { |
@@ -106,9 +111,10 @@ struct pblk_c_ctx { | |||
106 | unsigned int nr_padded; | 111 | unsigned int nr_padded; |
107 | }; | 112 | }; |
108 | 113 | ||
109 | /* generic context */ | 114 | /* read context */ |
110 | struct pblk_g_ctx { | 115 | struct pblk_g_ctx { |
111 | void *private; | 116 | void *private; |
117 | u64 lba; | ||
112 | }; | 118 | }; |
113 | 119 | ||
114 | /* Pad context */ | 120 | /* Pad context */ |
@@ -207,6 +213,7 @@ struct pblk_lun { | |||
207 | struct pblk_gc_rq { | 213 | struct pblk_gc_rq { |
208 | struct pblk_line *line; | 214 | struct pblk_line *line; |
209 | void *data; | 215 | void *data; |
216 | u64 paddr_list[PBLK_MAX_REQ_ADDRS]; | ||
210 | u64 lba_list[PBLK_MAX_REQ_ADDRS]; | 217 | u64 lba_list[PBLK_MAX_REQ_ADDRS]; |
211 | int nr_secs; | 218 | int nr_secs; |
212 | int secs_to_gc; | 219 | int secs_to_gc; |
@@ -231,7 +238,10 @@ struct pblk_gc { | |||
231 | struct timer_list gc_timer; | 238 | struct timer_list gc_timer; |
232 | 239 | ||
233 | struct semaphore gc_sem; | 240 | struct semaphore gc_sem; |
234 | atomic_t inflight_gc; | 241 | atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ |
242 | atomic_t pipeline_gc; /* Number of lines in the GC pipeline - | ||
243 | * started reads to finished writes | ||
244 | */ | ||
235 | int w_entries; | 245 | int w_entries; |
236 | 246 | ||
237 | struct list_head w_list; | 247 | struct list_head w_list; |
@@ -267,6 +277,7 @@ struct pblk_rl { | |||
267 | int rb_gc_max; /* Max buffer entries available for GC I/O */ | 277 | int rb_gc_max; /* Max buffer entries available for GC I/O */ |
268 | int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ | 278 | int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ |
269 | int rb_state; /* Rate-limiter current state */ | 279 | int rb_state; /* Rate-limiter current state */ |
280 | int rb_max_io; /* Maximum size for an I/O giving the config */ | ||
270 | 281 | ||
271 | atomic_t rb_user_cnt; /* User I/O buffer counter */ | 282 | atomic_t rb_user_cnt; /* User I/O buffer counter */ |
272 | atomic_t rb_gc_cnt; /* GC I/O buffer counter */ | 283 | atomic_t rb_gc_cnt; /* GC I/O buffer counter */ |
@@ -310,6 +321,7 @@ enum { | |||
310 | }; | 321 | }; |
311 | 322 | ||
312 | #define PBLK_MAGIC 0x70626c6b /*pblk*/ | 323 | #define PBLK_MAGIC 0x70626c6b /*pblk*/ |
324 | #define SMETA_VERSION cpu_to_le16(1) | ||
313 | 325 | ||
314 | struct line_header { | 326 | struct line_header { |
315 | __le32 crc; | 327 | __le32 crc; |
@@ -618,15 +630,16 @@ struct pblk { | |||
618 | 630 | ||
619 | struct list_head compl_list; | 631 | struct list_head compl_list; |
620 | 632 | ||
621 | mempool_t *page_pool; | 633 | mempool_t *page_bio_pool; |
622 | mempool_t *line_ws_pool; | 634 | mempool_t *gen_ws_pool; |
623 | mempool_t *rec_pool; | 635 | mempool_t *rec_pool; |
624 | mempool_t *g_rq_pool; | 636 | mempool_t *r_rq_pool; |
625 | mempool_t *w_rq_pool; | 637 | mempool_t *w_rq_pool; |
626 | mempool_t *line_meta_pool; | 638 | mempool_t *e_rq_pool; |
627 | 639 | ||
628 | struct workqueue_struct *close_wq; | 640 | struct workqueue_struct *close_wq; |
629 | struct workqueue_struct *bb_wq; | 641 | struct workqueue_struct *bb_wq; |
642 | struct workqueue_struct *r_end_wq; | ||
630 | 643 | ||
631 | struct timer_list wtimer; | 644 | struct timer_list wtimer; |
632 | 645 | ||
@@ -657,15 +670,15 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, | |||
657 | void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, | 670 | void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, |
658 | struct pblk_w_ctx w_ctx, unsigned int pos); | 671 | struct pblk_w_ctx w_ctx, unsigned int pos); |
659 | void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, | 672 | void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, |
660 | struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, | 673 | struct pblk_w_ctx w_ctx, struct pblk_line *line, |
661 | unsigned int pos); | 674 | u64 paddr, unsigned int pos); |
662 | struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); | 675 | struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); |
663 | void pblk_rb_flush(struct pblk_rb *rb); | 676 | void pblk_rb_flush(struct pblk_rb *rb); |
664 | 677 | ||
665 | void pblk_rb_sync_l2p(struct pblk_rb *rb); | 678 | void pblk_rb_sync_l2p(struct pblk_rb *rb); |
666 | unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, | 679 | unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, |
667 | struct bio *bio, unsigned int pos, | 680 | unsigned int pos, unsigned int nr_entries, |
668 | unsigned int nr_entries, unsigned int count); | 681 | unsigned int count); |
669 | unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, | 682 | unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, |
670 | struct list_head *list, | 683 | struct list_head *list, |
671 | unsigned int max); | 684 | unsigned int max); |
@@ -692,24 +705,23 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); | |||
692 | /* | 705 | /* |
693 | * pblk core | 706 | * pblk core |
694 | */ | 707 | */ |
695 | struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); | 708 | struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); |
709 | void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); | ||
696 | void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); | 710 | void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); |
697 | int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, | 711 | int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, |
698 | struct pblk_c_ctx *c_ctx); | 712 | struct pblk_c_ctx *c_ctx); |
699 | void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); | ||
700 | void pblk_wait_for_meta(struct pblk *pblk); | ||
701 | struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); | ||
702 | void pblk_discard(struct pblk *pblk, struct bio *bio); | 713 | void pblk_discard(struct pblk *pblk, struct bio *bio); |
703 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); | 714 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); |
704 | void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); | 715 | void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); |
705 | int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); | 716 | int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); |
717 | int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); | ||
706 | int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); | 718 | int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); |
707 | struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, | 719 | struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, |
708 | unsigned int nr_secs, unsigned int len, | 720 | unsigned int nr_secs, unsigned int len, |
709 | int alloc_type, gfp_t gfp_mask); | 721 | int alloc_type, gfp_t gfp_mask); |
710 | struct pblk_line *pblk_line_get(struct pblk *pblk); | 722 | struct pblk_line *pblk_line_get(struct pblk *pblk); |
711 | struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); | 723 | struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); |
712 | void pblk_line_replace_data(struct pblk *pblk); | 724 | struct pblk_line *pblk_line_replace_data(struct pblk *pblk); |
713 | int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); | 725 | int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); |
714 | void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); | 726 | void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); |
715 | struct pblk_line *pblk_line_get_data(struct pblk *pblk); | 727 | struct pblk_line *pblk_line_get_data(struct pblk *pblk); |
@@ -719,19 +731,18 @@ int pblk_line_is_full(struct pblk_line *line); | |||
719 | void pblk_line_free(struct pblk *pblk, struct pblk_line *line); | 731 | void pblk_line_free(struct pblk *pblk, struct pblk_line *line); |
720 | void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); | 732 | void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); |
721 | void pblk_line_close(struct pblk *pblk, struct pblk_line *line); | 733 | void pblk_line_close(struct pblk *pblk, struct pblk_line *line); |
722 | void pblk_line_close_meta_sync(struct pblk *pblk); | ||
723 | void pblk_line_close_ws(struct work_struct *work); | 734 | void pblk_line_close_ws(struct work_struct *work); |
724 | void pblk_pipeline_stop(struct pblk *pblk); | 735 | void pblk_pipeline_stop(struct pblk *pblk); |
725 | void pblk_line_mark_bb(struct work_struct *work); | 736 | void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, |
726 | void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, | 737 | void (*work)(struct work_struct *), gfp_t gfp_mask, |
727 | void (*work)(struct work_struct *), | 738 | struct workqueue_struct *wq); |
728 | struct workqueue_struct *wq); | ||
729 | u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); | 739 | u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); |
730 | int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); | 740 | int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); |
731 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, | 741 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, |
732 | void *emeta_buf); | 742 | void *emeta_buf); |
733 | int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); | 743 | int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); |
734 | void pblk_line_put(struct kref *ref); | 744 | void pblk_line_put(struct kref *ref); |
745 | void pblk_line_put_wq(struct kref *ref); | ||
735 | struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); | 746 | struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); |
736 | u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); | 747 | u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); |
737 | void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); | 748 | void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); |
@@ -745,7 +756,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, | |||
745 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); | 756 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); |
746 | void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, | 757 | void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, |
747 | unsigned long *lun_bitmap); | 758 | unsigned long *lun_bitmap); |
748 | void pblk_end_bio_sync(struct bio *bio); | ||
749 | void pblk_end_io_sync(struct nvm_rq *rqd); | 759 | void pblk_end_io_sync(struct nvm_rq *rqd); |
750 | int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, | 760 | int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, |
751 | int nr_pages); | 761 | int nr_pages); |
@@ -760,7 +770,7 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, | |||
760 | void pblk_update_map_dev(struct pblk *pblk, sector_t lba, | 770 | void pblk_update_map_dev(struct pblk *pblk, sector_t lba, |
761 | struct ppa_addr ppa, struct ppa_addr entry_line); | 771 | struct ppa_addr ppa, struct ppa_addr entry_line); |
762 | int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, | 772 | int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, |
763 | struct pblk_line *gc_line); | 773 | struct pblk_line *gc_line, u64 paddr); |
764 | void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, | 774 | void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, |
765 | u64 *lba_list, int nr_secs); | 775 | u64 *lba_list, int nr_secs); |
766 | void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, | 776 | void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, |
@@ -771,9 +781,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, | |||
771 | */ | 781 | */ |
772 | int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, | 782 | int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, |
773 | unsigned long flags); | 783 | unsigned long flags); |
774 | int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, | 784 | int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); |
775 | unsigned int nr_entries, unsigned int nr_rec_entries, | ||
776 | struct pblk_line *gc_line, unsigned long flags); | ||
777 | 785 | ||
778 | /* | 786 | /* |
779 | * pblk map | 787 | * pblk map |
@@ -797,9 +805,7 @@ void pblk_write_should_kick(struct pblk *pblk); | |||
797 | */ | 805 | */ |
798 | extern struct bio_set *pblk_bio_set; | 806 | extern struct bio_set *pblk_bio_set; |
799 | int pblk_submit_read(struct pblk *pblk, struct bio *bio); | 807 | int pblk_submit_read(struct pblk *pblk, struct bio *bio); |
800 | int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, | 808 | int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); |
801 | unsigned int nr_secs, unsigned int *secs_to_gc, | ||
802 | struct pblk_line *line); | ||
803 | /* | 809 | /* |
804 | * pblk recovery | 810 | * pblk recovery |
805 | */ | 811 | */ |
@@ -815,7 +821,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, | |||
815 | * pblk gc | 821 | * pblk gc |
816 | */ | 822 | */ |
817 | #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ | 823 | #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ |
818 | #define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ | 824 | #define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ |
819 | #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ | 825 | #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ |
820 | #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ | 826 | #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ |
821 | 827 | ||
@@ -824,7 +830,7 @@ void pblk_gc_exit(struct pblk *pblk); | |||
824 | void pblk_gc_should_start(struct pblk *pblk); | 830 | void pblk_gc_should_start(struct pblk *pblk); |
825 | void pblk_gc_should_stop(struct pblk *pblk); | 831 | void pblk_gc_should_stop(struct pblk *pblk); |
826 | void pblk_gc_should_kick(struct pblk *pblk); | 832 | void pblk_gc_should_kick(struct pblk *pblk); |
827 | void pblk_gc_kick(struct pblk *pblk); | 833 | void pblk_gc_free_full_lines(struct pblk *pblk); |
828 | void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, | 834 | void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, |
829 | int *gc_active); | 835 | int *gc_active); |
830 | int pblk_gc_sysfs_force(struct pblk *pblk, int force); | 836 | int pblk_gc_sysfs_force(struct pblk *pblk, int force); |
@@ -834,8 +840,8 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force); | |||
834 | */ | 840 | */ |
835 | void pblk_rl_init(struct pblk_rl *rl, int budget); | 841 | void pblk_rl_init(struct pblk_rl *rl, int budget); |
836 | void pblk_rl_free(struct pblk_rl *rl); | 842 | void pblk_rl_free(struct pblk_rl *rl); |
843 | void pblk_rl_update_rates(struct pblk_rl *rl); | ||
837 | int pblk_rl_high_thrs(struct pblk_rl *rl); | 844 | int pblk_rl_high_thrs(struct pblk_rl *rl); |
838 | int pblk_rl_low_thrs(struct pblk_rl *rl); | ||
839 | unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); | 845 | unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); |
840 | int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); | 846 | int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); |
841 | void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); | 847 | void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); |
@@ -843,10 +849,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); | |||
843 | int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); | 849 | int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); |
844 | void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); | 850 | void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); |
845 | void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); | 851 | void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); |
846 | int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); | 852 | int pblk_rl_max_io(struct pblk_rl *rl); |
847 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); | 853 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); |
848 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); | 854 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); |
849 | void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); | ||
850 | int pblk_rl_is_limit(struct pblk_rl *rl); | 855 | int pblk_rl_is_limit(struct pblk_rl *rl); |
851 | 856 | ||
852 | /* | 857 | /* |
@@ -892,13 +897,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) | |||
892 | 897 | ||
893 | static inline int pblk_line_vsc(struct pblk_line *line) | 898 | static inline int pblk_line_vsc(struct pblk_line *line) |
894 | { | 899 | { |
895 | int vsc; | 900 | return le32_to_cpu(*line->vsc); |
896 | |||
897 | spin_lock(&line->lock); | ||
898 | vsc = le32_to_cpu(*line->vsc); | ||
899 | spin_unlock(&line->lock); | ||
900 | |||
901 | return vsc; | ||
902 | } | 901 | } |
903 | 902 | ||
904 | #define NVM_MEM_PAGE_WRITE (8) | 903 | #define NVM_MEM_PAGE_WRITE (8) |
@@ -1140,7 +1139,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) | |||
1140 | 1139 | ||
1141 | flags = geo->plane_mode >> 1; | 1140 | flags = geo->plane_mode >> 1; |
1142 | 1141 | ||
1143 | if (type == WRITE) | 1142 | if (type == PBLK_WRITE) |
1144 | flags |= NVM_IO_SCRAMBLE_ENABLE; | 1143 | flags |= NVM_IO_SCRAMBLE_ENABLE; |
1145 | 1144 | ||
1146 | return flags; | 1145 | return flags; |
@@ -1200,7 +1199,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, | |||
1200 | 1199 | ||
1201 | pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); | 1200 | pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); |
1202 | } | 1201 | } |
1203 | #endif | ||
1204 | 1202 | ||
1205 | static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, | 1203 | static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, |
1206 | struct ppa_addr *ppas, int nr_ppas) | 1204 | struct ppa_addr *ppas, int nr_ppas) |
@@ -1221,14 +1219,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, | |||
1221 | ppa->g.sec < geo->sec_per_pg) | 1219 | ppa->g.sec < geo->sec_per_pg) |
1222 | continue; | 1220 | continue; |
1223 | 1221 | ||
1224 | #ifdef CONFIG_NVM_DEBUG | ||
1225 | print_ppa(ppa, "boundary", i); | 1222 | print_ppa(ppa, "boundary", i); |
1226 | #endif | 1223 | |
1227 | return 1; | 1224 | return 1; |
1228 | } | 1225 | } |
1229 | return 0; | 1226 | return 0; |
1230 | } | 1227 | } |
1231 | 1228 | ||
1229 | static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) | ||
1230 | { | ||
1231 | struct nvm_tgt_dev *dev = pblk->dev; | ||
1232 | struct ppa_addr *ppa_list; | ||
1233 | |||
1234 | ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; | ||
1235 | |||
1236 | if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { | ||
1237 | WARN_ON(1); | ||
1238 | return -EINVAL; | ||
1239 | } | ||
1240 | |||
1241 | if (rqd->opcode == NVM_OP_PWRITE) { | ||
1242 | struct pblk_line *line; | ||
1243 | struct ppa_addr ppa; | ||
1244 | int i; | ||
1245 | |||
1246 | for (i = 0; i < rqd->nr_ppas; i++) { | ||
1247 | ppa = ppa_list[i]; | ||
1248 | line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; | ||
1249 | |||
1250 | spin_lock(&line->lock); | ||
1251 | if (line->state != PBLK_LINESTATE_OPEN) { | ||
1252 | pr_err("pblk: bad ppa: line:%d,state:%d\n", | ||
1253 | line->id, line->state); | ||
1254 | WARN_ON(1); | ||
1255 | spin_unlock(&line->lock); | ||
1256 | return -EINVAL; | ||
1257 | } | ||
1258 | spin_unlock(&line->lock); | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | return 0; | ||
1263 | } | ||
1264 | #endif | ||
1265 | |||
1232 | static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) | 1266 | static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) |
1233 | { | 1267 | { |
1234 | struct pblk_line_meta *lm = &pblk->lm; | 1268 | struct pblk_line_meta *lm = &pblk->lm; |
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 08035634795c..a27d85232ce1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c | |||
@@ -407,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) | |||
407 | 407 | ||
408 | finish_wait(&ca->set->bucket_wait, &w); | 408 | finish_wait(&ca->set->bucket_wait, &w); |
409 | out: | 409 | out: |
410 | wake_up_process(ca->alloc_thread); | 410 | if (ca->alloc_thread) |
411 | wake_up_process(ca->alloc_thread); | ||
411 | 412 | ||
412 | trace_bcache_alloc(ca, reserve); | 413 | trace_bcache_alloc(ca, reserve); |
413 | 414 | ||
@@ -442,6 +443,11 @@ out: | |||
442 | b->prio = INITIAL_PRIO; | 443 | b->prio = INITIAL_PRIO; |
443 | } | 444 | } |
444 | 445 | ||
446 | if (ca->set->avail_nbuckets > 0) { | ||
447 | ca->set->avail_nbuckets--; | ||
448 | bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); | ||
449 | } | ||
450 | |||
445 | return r; | 451 | return r; |
446 | } | 452 | } |
447 | 453 | ||
@@ -449,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) | |||
449 | { | 455 | { |
450 | SET_GC_MARK(b, 0); | 456 | SET_GC_MARK(b, 0); |
451 | SET_GC_SECTORS_USED(b, 0); | 457 | SET_GC_SECTORS_USED(b, 0); |
458 | |||
459 | if (ca->set->avail_nbuckets < ca->set->nbuckets) { | ||
460 | ca->set->avail_nbuckets++; | ||
461 | bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); | ||
462 | } | ||
452 | } | 463 | } |
453 | 464 | ||
454 | void bch_bucket_free(struct cache_set *c, struct bkey *k) | 465 | void bch_bucket_free(struct cache_set *c, struct bkey *k) |
@@ -601,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, | |||
601 | 612 | ||
602 | /* | 613 | /* |
603 | * If we had to allocate, we might race and not need to allocate the | 614 | * If we had to allocate, we might race and not need to allocate the |
604 | * second time we call find_data_bucket(). If we allocated a bucket but | 615 | * second time we call pick_data_bucket(). If we allocated a bucket but |
605 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | 616 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: |
606 | */ | 617 | */ |
607 | if (KEY_PTRS(&alloc.key)) | 618 | if (KEY_PTRS(&alloc.key)) |
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index abd31e847f96..843877e017e1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -185,6 +185,7 @@ | |||
185 | #include <linux/mutex.h> | 185 | #include <linux/mutex.h> |
186 | #include <linux/rbtree.h> | 186 | #include <linux/rbtree.h> |
187 | #include <linux/rwsem.h> | 187 | #include <linux/rwsem.h> |
188 | #include <linux/refcount.h> | ||
188 | #include <linux/types.h> | 189 | #include <linux/types.h> |
189 | #include <linux/workqueue.h> | 190 | #include <linux/workqueue.h> |
190 | 191 | ||
@@ -266,9 +267,6 @@ struct bcache_device { | |||
266 | atomic_t *stripe_sectors_dirty; | 267 | atomic_t *stripe_sectors_dirty; |
267 | unsigned long *full_dirty_stripes; | 268 | unsigned long *full_dirty_stripes; |
268 | 269 | ||
269 | unsigned long sectors_dirty_last; | ||
270 | long sectors_dirty_derivative; | ||
271 | |||
272 | struct bio_set *bio_split; | 270 | struct bio_set *bio_split; |
273 | 271 | ||
274 | unsigned data_csum:1; | 272 | unsigned data_csum:1; |
@@ -300,7 +298,7 @@ struct cached_dev { | |||
300 | struct semaphore sb_write_mutex; | 298 | struct semaphore sb_write_mutex; |
301 | 299 | ||
302 | /* Refcount on the cache set. Always nonzero when we're caching. */ | 300 | /* Refcount on the cache set. Always nonzero when we're caching. */ |
303 | atomic_t count; | 301 | refcount_t count; |
304 | struct work_struct detach; | 302 | struct work_struct detach; |
305 | 303 | ||
306 | /* | 304 | /* |
@@ -363,12 +361,14 @@ struct cached_dev { | |||
363 | 361 | ||
364 | uint64_t writeback_rate_target; | 362 | uint64_t writeback_rate_target; |
365 | int64_t writeback_rate_proportional; | 363 | int64_t writeback_rate_proportional; |
366 | int64_t writeback_rate_derivative; | 364 | int64_t writeback_rate_integral; |
367 | int64_t writeback_rate_change; | 365 | int64_t writeback_rate_integral_scaled; |
366 | int32_t writeback_rate_change; | ||
368 | 367 | ||
369 | unsigned writeback_rate_update_seconds; | 368 | unsigned writeback_rate_update_seconds; |
370 | unsigned writeback_rate_d_term; | 369 | unsigned writeback_rate_i_term_inverse; |
371 | unsigned writeback_rate_p_term_inverse; | 370 | unsigned writeback_rate_p_term_inverse; |
371 | unsigned writeback_rate_minimum; | ||
372 | }; | 372 | }; |
373 | 373 | ||
374 | enum alloc_reserve { | 374 | enum alloc_reserve { |
@@ -582,6 +582,7 @@ struct cache_set { | |||
582 | uint8_t need_gc; | 582 | uint8_t need_gc; |
583 | struct gc_stat gc_stats; | 583 | struct gc_stat gc_stats; |
584 | size_t nbuckets; | 584 | size_t nbuckets; |
585 | size_t avail_nbuckets; | ||
585 | 586 | ||
586 | struct task_struct *gc_thread; | 587 | struct task_struct *gc_thread; |
587 | /* Where in the btree gc currently is */ | 588 | /* Where in the btree gc currently is */ |
@@ -807,13 +808,13 @@ do { \ | |||
807 | 808 | ||
808 | static inline void cached_dev_put(struct cached_dev *dc) | 809 | static inline void cached_dev_put(struct cached_dev *dc) |
809 | { | 810 | { |
810 | if (atomic_dec_and_test(&dc->count)) | 811 | if (refcount_dec_and_test(&dc->count)) |
811 | schedule_work(&dc->detach); | 812 | schedule_work(&dc->detach); |
812 | } | 813 | } |
813 | 814 | ||
814 | static inline bool cached_dev_get(struct cached_dev *dc) | 815 | static inline bool cached_dev_get(struct cached_dev *dc) |
815 | { | 816 | { |
816 | if (!atomic_inc_not_zero(&dc->count)) | 817 | if (!refcount_inc_not_zero(&dc->count)) |
817 | return false; | 818 | return false; |
818 | 819 | ||
819 | /* Paired with the mb in cached_dev_attach */ | 820 | /* Paired with the mb in cached_dev_attach */ |
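Converting dc->count to refcount_t means the get/put paths now rely on refcount_inc_not_zero() and refcount_dec_and_test(). Their essential semantics can be modeled with C11 atomics; this is a simplified sketch only, since the real refcount_t also saturates and warns on misuse:

/* Toy model of the refcount_t pattern used for dc->count (hypothetical
 * userspace rendering; the kernel type additionally saturates and warns
 * on overflow/underflow, which this sketch does not reproduce). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool toy_refcount_inc_not_zero(atomic_int *r)
{
	int old = atomic_load(r);

	while (old != 0) {
		if (atomic_compare_exchange_weak(r, &old, old + 1))
			return true;	/* took a reference */
	}
	return false;			/* object already going away */
}

static bool toy_refcount_dec_and_test(atomic_int *r)
{
	return atomic_fetch_sub(r, 1) == 1;	/* true when we dropped the last ref */
}

int main(void)
{
	atomic_int count = 1;	/* set at attach time, like refcount_set(&dc->count, 1) */

	printf("get: %d\n", toy_refcount_inc_not_zero(&count));	/* 1 */
	printf("last put: %d\n", toy_refcount_dec_and_test(&count));	/* 0 */
	printf("last put: %d\n", toy_refcount_dec_and_test(&count));	/* 1: time to schedule detach */
	return 0;
}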
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 658c54b3b07a..11c5503d31dc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -1241,6 +1241,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1241 | __bch_btree_mark_key(c, level, k); | 1241 | __bch_btree_mark_key(c, level, k); |
1242 | } | 1242 | } |
1243 | 1243 | ||
1244 | void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) | ||
1245 | { | ||
1246 | stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; | ||
1247 | } | ||
1248 | |||
1244 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | 1249 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) |
1245 | { | 1250 | { |
1246 | uint8_t stale = 0; | 1251 | uint8_t stale = 0; |
@@ -1652,9 +1657,8 @@ static void btree_gc_start(struct cache_set *c) | |||
1652 | mutex_unlock(&c->bucket_lock); | 1657 | mutex_unlock(&c->bucket_lock); |
1653 | } | 1658 | } |
1654 | 1659 | ||
1655 | static size_t bch_btree_gc_finish(struct cache_set *c) | 1660 | static void bch_btree_gc_finish(struct cache_set *c) |
1656 | { | 1661 | { |
1657 | size_t available = 0; | ||
1658 | struct bucket *b; | 1662 | struct bucket *b; |
1659 | struct cache *ca; | 1663 | struct cache *ca; |
1660 | unsigned i; | 1664 | unsigned i; |
@@ -1691,6 +1695,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) | |||
1691 | } | 1695 | } |
1692 | rcu_read_unlock(); | 1696 | rcu_read_unlock(); |
1693 | 1697 | ||
1698 | c->avail_nbuckets = 0; | ||
1694 | for_each_cache(ca, c, i) { | 1699 | for_each_cache(ca, c, i) { |
1695 | uint64_t *i; | 1700 | uint64_t *i; |
1696 | 1701 | ||
@@ -1712,18 +1717,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) | |||
1712 | BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); | 1717 | BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); |
1713 | 1718 | ||
1714 | if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) | 1719 | if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) |
1715 | available++; | 1720 | c->avail_nbuckets++; |
1716 | } | 1721 | } |
1717 | } | 1722 | } |
1718 | 1723 | ||
1719 | mutex_unlock(&c->bucket_lock); | 1724 | mutex_unlock(&c->bucket_lock); |
1720 | return available; | ||
1721 | } | 1725 | } |
1722 | 1726 | ||
1723 | static void bch_btree_gc(struct cache_set *c) | 1727 | static void bch_btree_gc(struct cache_set *c) |
1724 | { | 1728 | { |
1725 | int ret; | 1729 | int ret; |
1726 | unsigned long available; | ||
1727 | struct gc_stat stats; | 1730 | struct gc_stat stats; |
1728 | struct closure writes; | 1731 | struct closure writes; |
1729 | struct btree_op op; | 1732 | struct btree_op op; |
@@ -1746,14 +1749,14 @@ static void bch_btree_gc(struct cache_set *c) | |||
1746 | pr_warn("gc failed!"); | 1749 | pr_warn("gc failed!"); |
1747 | } while (ret); | 1750 | } while (ret); |
1748 | 1751 | ||
1749 | available = bch_btree_gc_finish(c); | 1752 | bch_btree_gc_finish(c); |
1750 | wake_up_allocators(c); | 1753 | wake_up_allocators(c); |
1751 | 1754 | ||
1752 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1755 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1753 | 1756 | ||
1754 | stats.key_bytes *= sizeof(uint64_t); | 1757 | stats.key_bytes *= sizeof(uint64_t); |
1755 | stats.data <<= 9; | 1758 | stats.data <<= 9; |
1756 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1759 | bch_update_bucket_in_use(c, &stats); |
1757 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1760 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1758 | 1761 | ||
1759 | trace_bcache_gc_end(c); | 1762 | trace_bcache_gc_end(c); |
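With bch_btree_gc_finish() no longer returning an available-bucket count, the in_use figure is derived from the new avail_nbuckets field, which the allocator and free paths adjust as shown in the alloc.c hunks above. A toy model of that bookkeeping, for illustration only:

/* Toy model of the avail_nbuckets accounting: allocation decrements it,
 * freeing increments it, and in_use is recomputed on each change. */
#include <stdio.h>
#include <stddef.h>

struct toy_cache_set {
	size_t nbuckets;
	size_t avail_nbuckets;
	unsigned in_use;	/* percent, mirrors gc_stats.in_use */
};

static void update_bucket_in_use(struct toy_cache_set *c)
{
	c->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
}

static void alloc_bucket(struct toy_cache_set *c)
{
	if (c->avail_nbuckets > 0) {
		c->avail_nbuckets--;
		update_bucket_in_use(c);
	}
}

static void free_bucket(struct toy_cache_set *c)
{
	if (c->avail_nbuckets < c->nbuckets) {
		c->avail_nbuckets++;
		update_bucket_in_use(c);
	}
}

int main(void)
{
	struct toy_cache_set c = { .nbuckets = 1000, .avail_nbuckets = 1000 };

	for (int i = 0; i < 250; i++)
		alloc_bucket(&c);
	printf("in_use after 250 allocs: %u%%\n", c.in_use);	/* 25% */
	free_bucket(&c);
	printf("in_use after one free:   %u%%\n", c.in_use);	/* 24%, rounded down */
	return 0;
}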
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 42204d61bc95..d211e2c25b6b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
@@ -306,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | |||
306 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | 306 | struct keybuf_key *bch_keybuf_next(struct keybuf *); |
307 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, | 307 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, |
308 | struct bkey *, keybuf_pred_fn *); | 308 | struct bkey *, keybuf_pred_fn *); |
309 | 309 | void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); | |
310 | #endif | 310 | #endif |
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 965907ce1e20..ccfbea6f9f6b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h | |||
@@ -252,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | |||
252 | static inline void closure_queue(struct closure *cl) | 252 | static inline void closure_queue(struct closure *cl) |
253 | { | 253 | { |
254 | struct workqueue_struct *wq = cl->wq; | 254 | struct workqueue_struct *wq = cl->wq; |
255 | /** | ||
256 | * Changes made to closure, work_struct, or a couple of other structs | ||
257 | * may leave work.func pointing at the wrong location. | ||
258 | */ | ||
259 | BUILD_BUG_ON(offsetof(struct closure, fn) | ||
260 | != offsetof(struct work_struct, func)); | ||
255 | if (wq) { | 261 | if (wq) { |
256 | INIT_WORK(&cl->work, cl->work.func); | 262 | INIT_WORK(&cl->work, cl->work.func); |
257 | BUG_ON(!queue_work(wq, &cl->work)); | 263 | BUG_ON(!queue_work(wq, &cl->work)); |
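The BUILD_BUG_ON() added above pins a layout invariant: closure_queue() hands cl->work to the workqueue, so closure's fn member must sit at the same offset as work_struct's func. A self-contained illustration of that style of compile-time check, using stand-in struct layouts rather than the real ones:

/* Stand-in structs illustrating the offset invariant enforced by the
 * new BUILD_BUG_ON(); the real closure/work_struct layouts differ. */
#include <assert.h>
#include <stddef.h>

struct toy_work_struct {
	unsigned long data;
	void (*func)(struct toy_work_struct *);
};

struct toy_closure {
	unsigned long remaining;
	void (*fn)(struct toy_closure *);
};

static_assert(offsetof(struct toy_closure, fn) ==
	      offsetof(struct toy_work_struct, func),
	      "fn must alias work_struct.func for closure_queue()");

int main(void)
{
	return 0;
}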
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3475d6628e21..3a7aed7282b2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -27,12 +27,12 @@ struct kmem_cache *bch_search_cache; | |||
27 | 27 | ||
28 | static void bch_data_insert_start(struct closure *); | 28 | static void bch_data_insert_start(struct closure *); |
29 | 29 | ||
30 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | 30 | static unsigned cache_mode(struct cached_dev *dc) |
31 | { | 31 | { |
32 | return BDEV_CACHE_MODE(&dc->sb); | 32 | return BDEV_CACHE_MODE(&dc->sb); |
33 | } | 33 | } |
34 | 34 | ||
35 | static bool verify(struct cached_dev *dc, struct bio *bio) | 35 | static bool verify(struct cached_dev *dc) |
36 | { | 36 | { |
37 | return dc->verify; | 37 | return dc->verify; |
38 | } | 38 | } |
@@ -370,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | |||
370 | static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) | 370 | static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) |
371 | { | 371 | { |
372 | struct cache_set *c = dc->disk.c; | 372 | struct cache_set *c = dc->disk.c; |
373 | unsigned mode = cache_mode(dc, bio); | 373 | unsigned mode = cache_mode(dc); |
374 | unsigned sectors, congested = bch_get_congested(c); | 374 | unsigned sectors, congested = bch_get_congested(c); |
375 | struct task_struct *task = current; | 375 | struct task_struct *task = current; |
376 | struct io *i; | 376 | struct io *i; |
@@ -385,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) | |||
385 | op_is_write(bio_op(bio)))) | 385 | op_is_write(bio_op(bio)))) |
386 | goto skip; | 386 | goto skip; |
387 | 387 | ||
388 | /* | ||
389 | * Flag for bypass if the IO is for read-ahead or background, | ||
390 | * unless the read-ahead request is for metadata (eg, for gfs2). | ||
391 | */ | ||
392 | if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && | ||
393 | !(bio->bi_opf & REQ_META)) | ||
394 | goto skip; | ||
395 | |||
388 | if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || | 396 | if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || |
389 | bio_sectors(bio) & (c->sb.block_size - 1)) { | 397 | bio_sectors(bio) & (c->sb.block_size - 1)) { |
390 | pr_debug("skipping unaligned io"); | 398 | pr_debug("skipping unaligned io"); |
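The new early bypass above skips caching for read-ahead and background bios unless they carry REQ_META (the gfs2 case called out in the comment). Reduced to a standalone predicate with placeholder flag bits, it looks roughly like:

/* Minimal model of the new bypass test; the REQ_* bit positions here
 * are placeholders, not the real blk_types.h values. */
#include <stdbool.h>
#include <stdio.h>

#define REQ_RAHEAD	(1u << 0)
#define REQ_BACKGROUND	(1u << 1)
#define REQ_META	(1u << 2)

static bool bypass_readahead(unsigned int bi_opf)
{
	return (bi_opf & (REQ_RAHEAD | REQ_BACKGROUND)) &&
	       !(bi_opf & REQ_META);
}

int main(void)
{
	printf("%d\n", bypass_readahead(REQ_RAHEAD));			/* 1: skip the cache */
	printf("%d\n", bypass_readahead(REQ_RAHEAD | REQ_META));	/* 0: metadata read-ahead is cached */
	return 0;
}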
@@ -463,6 +471,7 @@ struct search { | |||
463 | unsigned recoverable:1; | 471 | unsigned recoverable:1; |
464 | unsigned write:1; | 472 | unsigned write:1; |
465 | unsigned read_dirty_data:1; | 473 | unsigned read_dirty_data:1; |
474 | unsigned cache_missed:1; | ||
466 | 475 | ||
467 | unsigned long start_time; | 476 | unsigned long start_time; |
468 | 477 | ||
@@ -649,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio, | |||
649 | 658 | ||
650 | s->orig_bio = bio; | 659 | s->orig_bio = bio; |
651 | s->cache_miss = NULL; | 660 | s->cache_miss = NULL; |
661 | s->cache_missed = 0; | ||
652 | s->d = d; | 662 | s->d = d; |
653 | s->recoverable = 1; | 663 | s->recoverable = 1; |
654 | s->write = op_is_write(bio_op(bio)); | 664 | s->write = op_is_write(bio_op(bio)); |
@@ -698,8 +708,16 @@ static void cached_dev_read_error(struct closure *cl) | |||
698 | { | 708 | { |
699 | struct search *s = container_of(cl, struct search, cl); | 709 | struct search *s = container_of(cl, struct search, cl); |
700 | struct bio *bio = &s->bio.bio; | 710 | struct bio *bio = &s->bio.bio; |
711 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
701 | 712 | ||
702 | if (s->recoverable) { | 713 | /* |
714 | * If the cache device is dirty (dc->has_dirty is non-zero), then | ||
715 | * recovering a failed read request from the cached device may | ||
716 | * return stale data. So read failure recovery is only permitted | ||
717 | * when the cache device is clean. | ||
718 | */ | ||
719 | if (s->recoverable && | ||
720 | (dc && !atomic_read(&dc->has_dirty))) { | ||
703 | /* Retry from the backing device: */ | 721 | /* Retry from the backing device: */ |
704 | trace_bcache_read_retry(s->orig_bio); | 722 | trace_bcache_read_retry(s->orig_bio); |
705 | 723 | ||
@@ -740,7 +758,7 @@ static void cached_dev_read_done(struct closure *cl) | |||
740 | s->cache_miss = NULL; | 758 | s->cache_miss = NULL; |
741 | } | 759 | } |
742 | 760 | ||
743 | if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) | 761 | if (verify(dc) && s->recoverable && !s->read_dirty_data) |
744 | bch_data_verify(dc, s->orig_bio); | 762 | bch_data_verify(dc, s->orig_bio); |
745 | 763 | ||
746 | bio_complete(s); | 764 | bio_complete(s); |
@@ -760,12 +778,12 @@ static void cached_dev_read_done_bh(struct closure *cl) | |||
760 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 778 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
761 | 779 | ||
762 | bch_mark_cache_accounting(s->iop.c, s->d, | 780 | bch_mark_cache_accounting(s->iop.c, s->d, |
763 | !s->cache_miss, s->iop.bypass); | 781 | !s->cache_missed, s->iop.bypass); |
764 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); | 782 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); |
765 | 783 | ||
766 | if (s->iop.status) | 784 | if (s->iop.status) |
767 | continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); | 785 | continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); |
768 | else if (s->iop.bio || verify(dc, &s->bio.bio)) | 786 | else if (s->iop.bio || verify(dc)) |
769 | continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); | 787 | continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); |
770 | else | 788 | else |
771 | continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); | 789 | continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); |
@@ -779,6 +797,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
779 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 797 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
780 | struct bio *miss, *cache_bio; | 798 | struct bio *miss, *cache_bio; |
781 | 799 | ||
800 | s->cache_missed = 1; | ||
801 | |||
782 | if (s->cache_miss || s->iop.bypass) { | 802 | if (s->cache_miss || s->iop.bypass) { |
783 | miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 803 | miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
784 | ret = miss == bio ? MAP_DONE : MAP_CONTINUE; | 804 | ret = miss == bio ? MAP_DONE : MAP_CONTINUE; |
@@ -892,7 +912,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) | |||
892 | s->iop.bypass = true; | 912 | s->iop.bypass = true; |
893 | 913 | ||
894 | if (should_writeback(dc, s->orig_bio, | 914 | if (should_writeback(dc, s->orig_bio, |
895 | cache_mode(dc, bio), | 915 | cache_mode(dc), |
896 | s->iop.bypass)) { | 916 | s->iop.bypass)) { |
897 | s->iop.bypass = false; | 917 | s->iop.bypass = false; |
898 | s->iop.writeback = true; | 918 | s->iop.writeback = true; |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fc0a31b13ac4..b4d28928dec5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); | |||
53 | static LIST_HEAD(uncached_devices); | 53 | static LIST_HEAD(uncached_devices); |
54 | 54 | ||
55 | static int bcache_major; | 55 | static int bcache_major; |
56 | static DEFINE_IDA(bcache_minor); | 56 | static DEFINE_IDA(bcache_device_idx); |
57 | static wait_queue_head_t unregister_wait; | 57 | static wait_queue_head_t unregister_wait; |
58 | struct workqueue_struct *bcache_wq; | 58 | struct workqueue_struct *bcache_wq; |
59 | 59 | ||
60 | #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) | 60 | #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) |
61 | #define BCACHE_MINORS 16 /* partition support */ | 61 | /* maximum number of partitions on a single bcache device */ |
62 | #define BCACHE_MINORS 128 | ||
63 | /* maximum number of bcache devices on a single system */ | ||
64 | #define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) | ||
62 | 65 | ||
63 | /* Superblock */ | 66 | /* Superblock */ |
64 | 67 | ||
@@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, | |||
721 | closure_get(&c->caching); | 724 | closure_get(&c->caching); |
722 | } | 725 | } |
723 | 726 | ||
727 | static inline int first_minor_to_idx(int first_minor) | ||
728 | { | ||
729 | return (first_minor/BCACHE_MINORS); | ||
730 | } | ||
731 | |||
732 | static inline int idx_to_first_minor(int idx) | ||
733 | { | ||
734 | return (idx * BCACHE_MINORS); | ||
735 | } | ||
736 | |||
724 | static void bcache_device_free(struct bcache_device *d) | 737 | static void bcache_device_free(struct bcache_device *d) |
725 | { | 738 | { |
726 | lockdep_assert_held(&bch_register_lock); | 739 | lockdep_assert_held(&bch_register_lock); |
@@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) | |||
734 | if (d->disk && d->disk->queue) | 747 | if (d->disk && d->disk->queue) |
735 | blk_cleanup_queue(d->disk->queue); | 748 | blk_cleanup_queue(d->disk->queue); |
736 | if (d->disk) { | 749 | if (d->disk) { |
737 | ida_simple_remove(&bcache_minor, d->disk->first_minor); | 750 | ida_simple_remove(&bcache_device_idx, |
751 | first_minor_to_idx(d->disk->first_minor)); | ||
738 | put_disk(d->disk); | 752 | put_disk(d->disk); |
739 | } | 753 | } |
740 | 754 | ||
@@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
751 | { | 765 | { |
752 | struct request_queue *q; | 766 | struct request_queue *q; |
753 | size_t n; | 767 | size_t n; |
754 | int minor; | 768 | int idx; |
755 | 769 | ||
756 | if (!d->stripe_size) | 770 | if (!d->stripe_size) |
757 | d->stripe_size = 1 << 31; | 771 | d->stripe_size = 1 << 31; |
@@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
776 | if (!d->full_dirty_stripes) | 790 | if (!d->full_dirty_stripes) |
777 | return -ENOMEM; | 791 | return -ENOMEM; |
778 | 792 | ||
779 | minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); | 793 | idx = ida_simple_get(&bcache_device_idx, 0, |
780 | if (minor < 0) | 794 | BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); |
781 | return minor; | 795 | if (idx < 0) |
782 | 796 | return idx; | |
783 | minor *= BCACHE_MINORS; | ||
784 | 797 | ||
785 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), | 798 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), |
786 | BIOSET_NEED_BVECS | | 799 | BIOSET_NEED_BVECS | |
787 | BIOSET_NEED_RESCUER)) || | 800 | BIOSET_NEED_RESCUER)) || |
788 | !(d->disk = alloc_disk(BCACHE_MINORS))) { | 801 | !(d->disk = alloc_disk(BCACHE_MINORS))) { |
789 | ida_simple_remove(&bcache_minor, minor); | 802 | ida_simple_remove(&bcache_device_idx, idx); |
790 | return -ENOMEM; | 803 | return -ENOMEM; |
791 | } | 804 | } |
792 | 805 | ||
793 | set_capacity(d->disk, sectors); | 806 | set_capacity(d->disk, sectors); |
794 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); | 807 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); |
795 | 808 | ||
796 | d->disk->major = bcache_major; | 809 | d->disk->major = bcache_major; |
797 | d->disk->first_minor = minor; | 810 | d->disk->first_minor = idx_to_first_minor(idx); |
798 | d->disk->fops = &bcache_ops; | 811 | d->disk->fops = &bcache_ops; |
799 | d->disk->private_data = d; | 812 | d->disk->private_data = d; |
800 | 813 | ||
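Since BCACHE_MINORS now covers 128 minors per device and the ida allocates device indexes instead of raw minors, each index maps to a block of minors. A quick standalone sketch of that mapping, assuming the usual MINORBITS value of 20:

/* Sketch of the idx <-> first_minor mapping introduced above; MINORBITS
 * is assumed to be 20, as in include/linux/kdev_t.h. */
#include <stdio.h>

#define MINORBITS		20
#define BCACHE_MINORS		128
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS) / BCACHE_MINORS)

static int idx_to_first_minor(int idx)   { return idx * BCACHE_MINORS; }
static int first_minor_to_idx(int minor) { return minor / BCACHE_MINORS; }

int main(void)
{
	printf("max devices: %u\n", BCACHE_DEVICE_IDX_MAX);	/* 8192 */
	/* bcache3 starts at minor 384 and owns minors 384..511 */
	printf("bcache3 first minor: %d\n", idx_to_first_minor(3));
	printf("minor 400 belongs to idx: %d\n", first_minor_to_idx(400));
	return 0;
}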
@@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) | |||
889 | closure_init_stack(&cl); | 902 | closure_init_stack(&cl); |
890 | 903 | ||
891 | BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); | 904 | BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); |
892 | BUG_ON(atomic_read(&dc->count)); | 905 | BUG_ON(refcount_read(&dc->count)); |
893 | 906 | ||
894 | mutex_lock(&bch_register_lock); | 907 | mutex_lock(&bch_register_lock); |
895 | 908 | ||
@@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) | |||
1016 | * dc->c must be set before dc->count != 0 - paired with the mb in | 1029 | * dc->c must be set before dc->count != 0 - paired with the mb in |
1017 | * cached_dev_get() | 1030 | * cached_dev_get() |
1018 | */ | 1031 | */ |
1019 | atomic_set(&dc->count, 1); | 1032 | refcount_set(&dc->count, 1); |
1020 | 1033 | ||
1021 | /* Block writeback thread, but spawn it */ | 1034 | /* Block writeback thread, but spawn it */ |
1022 | down_write(&dc->writeback_lock); | 1035 | down_write(&dc->writeback_lock); |
@@ -1028,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) | |||
1028 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { | 1041 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { |
1029 | bch_sectors_dirty_init(&dc->disk); | 1042 | bch_sectors_dirty_init(&dc->disk); |
1030 | atomic_set(&dc->has_dirty, 1); | 1043 | atomic_set(&dc->has_dirty, 1); |
1031 | atomic_inc(&dc->count); | 1044 | refcount_inc(&dc->count); |
1032 | bch_writeback_queue(dc); | 1045 | bch_writeback_queue(dc); |
1033 | } | 1046 | } |
1034 | 1047 | ||
@@ -1129,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1129 | if (ret) | 1142 | if (ret) |
1130 | return ret; | 1143 | return ret; |
1131 | 1144 | ||
1132 | set_capacity(dc->disk.disk, | ||
1133 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | ||
1134 | |||
1135 | dc->disk.disk->queue->backing_dev_info->ra_pages = | 1145 | dc->disk.disk->queue->backing_dev_info->ra_pages = |
1136 | max(dc->disk.disk->queue->backing_dev_info->ra_pages, | 1146 | max(dc->disk.disk->queue->backing_dev_info->ra_pages, |
1137 | q->backing_dev_info->ra_pages); | 1147 | q->backing_dev_info->ra_pages); |
@@ -2085,6 +2095,7 @@ static void bcache_exit(void) | |||
2085 | if (bcache_major) | 2095 | if (bcache_major) |
2086 | unregister_blkdev(bcache_major, "bcache"); | 2096 | unregister_blkdev(bcache_major, "bcache"); |
2087 | unregister_reboot_notifier(&reboot); | 2097 | unregister_reboot_notifier(&reboot); |
2098 | mutex_destroy(&bch_register_lock); | ||
2088 | } | 2099 | } |
2089 | 2100 | ||
2090 | static int __init bcache_init(void) | 2101 | static int __init bcache_init(void) |
@@ -2103,14 +2114,15 @@ static int __init bcache_init(void) | |||
2103 | bcache_major = register_blkdev(0, "bcache"); | 2114 | bcache_major = register_blkdev(0, "bcache"); |
2104 | if (bcache_major < 0) { | 2115 | if (bcache_major < 0) { |
2105 | unregister_reboot_notifier(&reboot); | 2116 | unregister_reboot_notifier(&reboot); |
2117 | mutex_destroy(&bch_register_lock); | ||
2106 | return bcache_major; | 2118 | return bcache_major; |
2107 | } | 2119 | } |
2108 | 2120 | ||
2109 | if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || | 2121 | if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || |
2110 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || | 2122 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || |
2111 | sysfs_create_files(bcache_kobj, files) || | ||
2112 | bch_request_init() || | 2123 | bch_request_init() || |
2113 | bch_debug_init(bcache_kobj)) | 2124 | bch_debug_init(bcache_kobj) || |
2125 | sysfs_create_files(bcache_kobj, files)) | ||
2114 | goto err; | 2126 | goto err; |
2115 | 2127 | ||
2116 | return 0; | 2128 | return 0; |
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 234b2f5b286d..b4184092c727 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -82,8 +82,9 @@ rw_attribute(writeback_delay); | |||
82 | rw_attribute(writeback_rate); | 82 | rw_attribute(writeback_rate); |
83 | 83 | ||
84 | rw_attribute(writeback_rate_update_seconds); | 84 | rw_attribute(writeback_rate_update_seconds); |
85 | rw_attribute(writeback_rate_d_term); | 85 | rw_attribute(writeback_rate_i_term_inverse); |
86 | rw_attribute(writeback_rate_p_term_inverse); | 86 | rw_attribute(writeback_rate_p_term_inverse); |
87 | rw_attribute(writeback_rate_minimum); | ||
87 | read_attribute(writeback_rate_debug); | 88 | read_attribute(writeback_rate_debug); |
88 | 89 | ||
89 | read_attribute(stripe_size); | 90 | read_attribute(stripe_size); |
@@ -131,15 +132,16 @@ SHOW(__bch_cached_dev) | |||
131 | sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); | 132 | sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); |
132 | 133 | ||
133 | var_print(writeback_rate_update_seconds); | 134 | var_print(writeback_rate_update_seconds); |
134 | var_print(writeback_rate_d_term); | 135 | var_print(writeback_rate_i_term_inverse); |
135 | var_print(writeback_rate_p_term_inverse); | 136 | var_print(writeback_rate_p_term_inverse); |
137 | var_print(writeback_rate_minimum); | ||
136 | 138 | ||
137 | if (attr == &sysfs_writeback_rate_debug) { | 139 | if (attr == &sysfs_writeback_rate_debug) { |
138 | char rate[20]; | 140 | char rate[20]; |
139 | char dirty[20]; | 141 | char dirty[20]; |
140 | char target[20]; | 142 | char target[20]; |
141 | char proportional[20]; | 143 | char proportional[20]; |
142 | char derivative[20]; | 144 | char integral[20]; |
143 | char change[20]; | 145 | char change[20]; |
144 | s64 next_io; | 146 | s64 next_io; |
145 | 147 | ||
@@ -147,7 +149,7 @@ SHOW(__bch_cached_dev) | |||
147 | bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); | 149 | bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); |
148 | bch_hprint(target, dc->writeback_rate_target << 9); | 150 | bch_hprint(target, dc->writeback_rate_target << 9); |
149 | bch_hprint(proportional,dc->writeback_rate_proportional << 9); | 151 | bch_hprint(proportional,dc->writeback_rate_proportional << 9); |
150 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); | 152 | bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); |
151 | bch_hprint(change, dc->writeback_rate_change << 9); | 153 | bch_hprint(change, dc->writeback_rate_change << 9); |
152 | 154 | ||
153 | next_io = div64_s64(dc->writeback_rate.next - local_clock(), | 155 | next_io = div64_s64(dc->writeback_rate.next - local_clock(), |
@@ -158,11 +160,11 @@ SHOW(__bch_cached_dev) | |||
158 | "dirty:\t\t%s\n" | 160 | "dirty:\t\t%s\n" |
159 | "target:\t\t%s\n" | 161 | "target:\t\t%s\n" |
160 | "proportional:\t%s\n" | 162 | "proportional:\t%s\n" |
161 | "derivative:\t%s\n" | 163 | "integral:\t%s\n" |
162 | "change:\t\t%s/sec\n" | 164 | "change:\t\t%s/sec\n" |
163 | "next io:\t%llims\n", | 165 | "next io:\t%llims\n", |
164 | rate, dirty, target, proportional, | 166 | rate, dirty, target, proportional, |
165 | derivative, change, next_io); | 167 | integral, change, next_io); |
166 | } | 168 | } |
167 | 169 | ||
168 | sysfs_hprint(dirty_data, | 170 | sysfs_hprint(dirty_data, |
@@ -214,7 +216,7 @@ STORE(__cached_dev) | |||
214 | dc->writeback_rate.rate, 1, INT_MAX); | 216 | dc->writeback_rate.rate, 1, INT_MAX); |
215 | 217 | ||
216 | d_strtoul_nonzero(writeback_rate_update_seconds); | 218 | d_strtoul_nonzero(writeback_rate_update_seconds); |
217 | d_strtoul(writeback_rate_d_term); | 219 | d_strtoul(writeback_rate_i_term_inverse); |
218 | d_strtoul_nonzero(writeback_rate_p_term_inverse); | 220 | d_strtoul_nonzero(writeback_rate_p_term_inverse); |
219 | 221 | ||
220 | d_strtoi_h(sequential_cutoff); | 222 | d_strtoi_h(sequential_cutoff); |
@@ -320,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = { | |||
320 | &sysfs_writeback_percent, | 322 | &sysfs_writeback_percent, |
321 | &sysfs_writeback_rate, | 323 | &sysfs_writeback_rate, |
322 | &sysfs_writeback_rate_update_seconds, | 324 | &sysfs_writeback_rate_update_seconds, |
323 | &sysfs_writeback_rate_d_term, | 325 | &sysfs_writeback_rate_i_term_inverse, |
324 | &sysfs_writeback_rate_p_term_inverse, | 326 | &sysfs_writeback_rate_p_term_inverse, |
325 | &sysfs_writeback_rate_debug, | 327 | &sysfs_writeback_rate_debug, |
326 | &sysfs_dirty_data, | 328 | &sysfs_dirty_data, |
@@ -746,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = { | |||
746 | }; | 748 | }; |
747 | KTYPE(bch_cache_set_internal); | 749 | KTYPE(bch_cache_set_internal); |
748 | 750 | ||
751 | static int __bch_cache_cmp(const void *l, const void *r) | ||
752 | { | ||
753 | return *((uint16_t *)r) - *((uint16_t *)l); | ||
754 | } | ||
755 | |||
749 | SHOW(__bch_cache) | 756 | SHOW(__bch_cache) |
750 | { | 757 | { |
751 | struct cache *ca = container_of(kobj, struct cache, kobj); | 758 | struct cache *ca = container_of(kobj, struct cache, kobj); |
@@ -770,9 +777,6 @@ SHOW(__bch_cache) | |||
770 | CACHE_REPLACEMENT(&ca->sb)); | 777 | CACHE_REPLACEMENT(&ca->sb)); |
771 | 778 | ||
772 | if (attr == &sysfs_priority_stats) { | 779 | if (attr == &sysfs_priority_stats) { |
773 | int cmp(const void *l, const void *r) | ||
774 | { return *((uint16_t *) r) - *((uint16_t *) l); } | ||
775 | |||
776 | struct bucket *b; | 780 | struct bucket *b; |
777 | size_t n = ca->sb.nbuckets, i; | 781 | size_t n = ca->sb.nbuckets, i; |
778 | size_t unused = 0, available = 0, dirty = 0, meta = 0; | 782 | size_t unused = 0, available = 0, dirty = 0, meta = 0; |
@@ -801,7 +805,7 @@ SHOW(__bch_cache) | |||
801 | p[i] = ca->buckets[i].prio; | 805 | p[i] = ca->buckets[i].prio; |
802 | mutex_unlock(&ca->set->bucket_lock); | 806 | mutex_unlock(&ca->set->bucket_lock); |
803 | 807 | ||
804 | sort(p, n, sizeof(uint16_t), cmp, NULL); | 808 | sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); |
805 | 809 | ||
806 | while (n && | 810 | while (n && |
807 | !cached[n - 1]) | 811 | !cached[n - 1]) |
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5..e548b8b51322 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) | |||
232 | 232 | ||
233 | d->next += div_u64(done * NSEC_PER_SEC, d->rate); | 233 | d->next += div_u64(done * NSEC_PER_SEC, d->rate); |
234 | 234 | ||
235 | if (time_before64(now + NSEC_PER_SEC, d->next)) | 235 | /* Bound the time. Don't let us fall further than 2 seconds behind |
236 | d->next = now + NSEC_PER_SEC; | 236 | * (this prevents unnecessary backlog that would make it impossible |
237 | * to catch up). If we're ahead of the desired writeback rate, | ||
238 | * don't let us sleep more than 2.5 seconds (so we can notice/respond | ||
239 | * if the control system tells us to speed up!). | ||
240 | */ | ||
241 | if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) | ||
242 | d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; | ||
237 | 243 | ||
238 | if (time_after64(now - NSEC_PER_SEC * 2, d->next)) | 244 | if (time_after64(now - NSEC_PER_SEC * 2, d->next)) |
239 | d->next = now - NSEC_PER_SEC * 2; | 245 | d->next = now - NSEC_PER_SEC * 2; |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index f54b58282f77..ed5e8a412eb8 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -442,10 +442,10 @@ struct bch_ratelimit { | |||
442 | uint64_t next; | 442 | uint64_t next; |
443 | 443 | ||
444 | /* | 444 | /* |
445 | * Rate at which we want to do work, in units per nanosecond | 445 | * Rate at which we want to do work, in units per second |
446 | * The units here correspond to the units passed to bch_next_delay() | 446 | * The units here correspond to the units passed to bch_next_delay() |
447 | */ | 447 | */ |
448 | unsigned rate; | 448 | uint32_t rate; |
449 | }; | 449 | }; |
450 | 450 | ||
451 | static inline void bch_ratelimit_reset(struct bch_ratelimit *d) | 451 | static inline void bch_ratelimit_reset(struct bch_ratelimit *d) |
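The widened clamp above lets the ratelimiter run at most 2.5 s ahead while still capping the backlog at 2 s, and the rate is now documented as units per second. A hedged userspace sketch of the same bounds, with the wraparound-safe time_before64()/time_after64() helpers replaced by plain comparisons:

/* Userspace rendering of the bch_next_delay() bounds in the patch:
 * never schedule more than 2.5 s into the future, never fall more
 * than 2 s behind "now". Illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t next_deadline(uint64_t now, uint64_t next,
			      uint64_t done, uint32_t rate)
{
	next += done * NSEC_PER_SEC / rate;	/* credit the work just done */

	if (next > now + NSEC_PER_SEC * 5 / 2)	/* cap how far ahead we run */
		next = now + NSEC_PER_SEC * 5 / 2;
	if (next + NSEC_PER_SEC * 2 < now)	/* cap the backlog at 2 s */
		next = now - NSEC_PER_SEC * 2;

	return next;
}

int main(void)
{
	uint64_t now = 100ULL * NSEC_PER_SEC;

	/* a large burst at a low rate gets clamped to now + 2.5 s */
	uint64_t next = next_deadline(now, now, 1024, 8);

	printf("delay: %.2f s\n", (double)(next - now) / NSEC_PER_SEC);
	return 0;
}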
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 70454f2ad2fa..56a37884ca8b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
26 | bcache_flash_devs_sectors_dirty(c); | 26 | bcache_flash_devs_sectors_dirty(c); |
27 | uint64_t cache_dirty_target = | 27 | uint64_t cache_dirty_target = |
28 | div_u64(cache_sectors * dc->writeback_percent, 100); | 28 | div_u64(cache_sectors * dc->writeback_percent, 100); |
29 | |||
30 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | 29 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), |
31 | c->cached_dev_sectors); | 30 | c->cached_dev_sectors); |
32 | 31 | ||
33 | /* PD controller */ | 32 | /* |
34 | 33 | * PI controller: | |
34 | * Figures out the amount that should be written per second. | ||
35 | * | ||
36 | * First, the error (number of sectors that are dirty beyond our | ||
37 | * target) is calculated. The error is accumulated (numerically | ||
38 | * integrated). | ||
39 | * | ||
40 | * Then, the proportional value and integral value are scaled | ||
41 | * based on configured values. These are stored as inverses to | ||
42 | * avoid fixed point math and to make configuration easy-- e.g. | ||
43 | * the default value of 40 for writeback_rate_p_term_inverse | ||
44 | * attempts to write at a rate that would retire all the dirty | ||
45 | * blocks in 40 seconds. | ||
46 | * | ||
47 | * The writeback_rate_i_term_inverse value of 10000 means that 1/10000th | ||
48 | * of the error is accumulated in the integral term per second. | ||
49 | * This acts as a slow, long-term average that is not subject to | ||
50 | * variations in usage like the p term. | ||
51 | */ | ||
35 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); | 52 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
36 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 53 | int64_t error = dirty - target; |
37 | int64_t proportional = dirty - target; | 54 | int64_t proportional_scaled = |
38 | int64_t change; | 55 | div_s64(error, dc->writeback_rate_p_term_inverse); |
39 | 56 | int64_t integral_scaled; | |
40 | dc->disk.sectors_dirty_last = dirty; | 57 | uint32_t new_rate; |
41 | 58 | ||
42 | /* Scale to sectors per second */ | 59 | if ((error < 0 && dc->writeback_rate_integral > 0) || |
43 | 60 | (error > 0 && time_before64(local_clock(), | |
44 | proportional *= dc->writeback_rate_update_seconds; | 61 | dc->writeback_rate.next + NSEC_PER_MSEC))) { |
45 | proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); | 62 | /* |
46 | 63 | * Only decrease the integral term if it's more than | |
47 | derivative = div_s64(derivative, dc->writeback_rate_update_seconds); | 64 | * zero. Only increase the integral term if the device |
48 | 65 | * is keeping up. (Don't wind up the integral | |
49 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | 66 | * ineffectively in either case). |
50 | (dc->writeback_rate_d_term / | 67 | * |
51 | dc->writeback_rate_update_seconds) ?: 1, 0); | 68 | * It's necessary to scale this by |
52 | 69 | * writeback_rate_update_seconds to keep the integral | |
53 | derivative *= dc->writeback_rate_d_term; | 70 | * term dimensioned properly. |
54 | derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); | 71 | */ |
55 | 72 | dc->writeback_rate_integral += error * | |
56 | change = proportional + derivative; | 73 | dc->writeback_rate_update_seconds; |
74 | } | ||
57 | 75 | ||
58 | /* Don't increase writeback rate if the device isn't keeping up */ | 76 | integral_scaled = div_s64(dc->writeback_rate_integral, |
59 | if (change > 0 && | 77 | dc->writeback_rate_i_term_inverse); |
60 | time_after64(local_clock(), | ||
61 | dc->writeback_rate.next + NSEC_PER_MSEC)) | ||
62 | change = 0; | ||
63 | 78 | ||
64 | dc->writeback_rate.rate = | 79 | new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), |
65 | clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, | 80 | dc->writeback_rate_minimum, NSEC_PER_SEC); |
66 | 1, NSEC_PER_MSEC); | ||
67 | 81 | ||
68 | dc->writeback_rate_proportional = proportional; | 82 | dc->writeback_rate_proportional = proportional_scaled; |
69 | dc->writeback_rate_derivative = derivative; | 83 | dc->writeback_rate_integral_scaled = integral_scaled; |
70 | dc->writeback_rate_change = change; | 84 | dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; |
85 | dc->writeback_rate.rate = new_rate; | ||
71 | dc->writeback_rate_target = target; | 86 | dc->writeback_rate_target = target; |
72 | } | 87 | } |
73 | 88 | ||
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl) | |||
180 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 195 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
181 | struct keybuf_key *w = io->bio.bi_private; | 196 | struct keybuf_key *w = io->bio.bi_private; |
182 | 197 | ||
183 | dirty_init(w); | 198 | /* |
184 | bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); | 199 | * IO errors are signalled using the dirty bit on the key. |
185 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); | 200 | * If we failed to read, we should not attempt to write to the |
186 | bio_set_dev(&io->bio, io->dc->bdev); | 201 | * backing device. Instead, immediately go to write_dirty_finish |
187 | io->bio.bi_end_io = dirty_endio; | 202 | * to clean up. |
203 | */ | ||
204 | if (KEY_DIRTY(&w->key)) { | ||
205 | dirty_init(w); | ||
206 | bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); | ||
207 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); | ||
208 | bio_set_dev(&io->bio, io->dc->bdev); | ||
209 | io->bio.bi_end_io = dirty_endio; | ||
188 | 210 | ||
189 | closure_bio_submit(&io->bio, cl); | 211 | closure_bio_submit(&io->bio, cl); |
212 | } | ||
190 | 213 | ||
191 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); | 214 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
192 | } | 215 | } |
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg) | |||
418 | struct cached_dev *dc = arg; | 441 | struct cached_dev *dc = arg; |
419 | bool searched_full_index; | 442 | bool searched_full_index; |
420 | 443 | ||
444 | bch_ratelimit_reset(&dc->writeback_rate); | ||
445 | |||
421 | while (!kthread_should_stop()) { | 446 | while (!kthread_should_stop()) { |
422 | down_write(&dc->writeback_lock); | 447 | down_write(&dc->writeback_lock); |
423 | if (!atomic_read(&dc->has_dirty) || | 448 | if (!atomic_read(&dc->has_dirty) || |
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg) | |||
445 | 470 | ||
446 | up_write(&dc->writeback_lock); | 471 | up_write(&dc->writeback_lock); |
447 | 472 | ||
448 | bch_ratelimit_reset(&dc->writeback_rate); | ||
449 | read_dirty(dc); | 473 | read_dirty(dc); |
450 | 474 | ||
451 | if (searched_full_index) { | 475 | if (searched_full_index) { |
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg) | |||
455 | !kthread_should_stop() && | 479 | !kthread_should_stop() && |
456 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) | 480 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
457 | delay = schedule_timeout_interruptible(delay); | 481 | delay = schedule_timeout_interruptible(delay); |
482 | |||
483 | bch_ratelimit_reset(&dc->writeback_rate); | ||
458 | } | 484 | } |
459 | } | 485 | } |
460 | 486 | ||
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) | |||
492 | 518 | ||
493 | bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), | 519 | bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), |
494 | sectors_dirty_init_fn, 0); | 520 | sectors_dirty_init_fn, 0); |
495 | |||
496 | d->sectors_dirty_last = bcache_dev_sectors_dirty(d); | ||
497 | } | 521 | } |
498 | 522 | ||
499 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 523 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) | |||
507 | dc->writeback_percent = 10; | 531 | dc->writeback_percent = 10; |
508 | dc->writeback_delay = 30; | 532 | dc->writeback_delay = 30; |
509 | dc->writeback_rate.rate = 1024; | 533 | dc->writeback_rate.rate = 1024; |
534 | dc->writeback_rate_minimum = 8; | ||
510 | 535 | ||
511 | dc->writeback_rate_update_seconds = 5; | 536 | dc->writeback_rate_update_seconds = 5; |
512 | dc->writeback_rate_d_term = 30; | 537 | dc->writeback_rate_p_term_inverse = 40; |
513 | dc->writeback_rate_p_term_inverse = 6000; | 538 | dc->writeback_rate_i_term_inverse = 10000; |
514 | 539 | ||
515 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 540 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
516 | } | 541 | } |
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 151544740148..a9e3ffb4b03c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h | |||
@@ -77,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | |||
77 | if (would_skip) | 77 | if (would_skip) |
78 | return false; | 78 | return false; |
79 | 79 | ||
80 | return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; | 80 | return (op_is_sync(bio->bi_opf) || |
81 | bio->bi_opf & (REQ_META|REQ_PRIO) || | ||
82 | in_use <= CUTOFF_WRITEBACK); | ||
81 | } | 83 | } |
82 | 84 | ||
83 | static inline void bch_writeback_queue(struct cached_dev *dc) | 85 | static inline void bch_writeback_queue(struct cached_dev *dc) |
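The extra REQ_META|REQ_PRIO test above forces metadata and priority bios into writeback even when the cache is fairly full. A predicate-level sketch of the tail of should_writeback(), with CUTOFF_WRITEBACK assumed to be 70 and the earlier bypass/mode checks omitted:

/* Simplified model of the new should_writeback() tail condition; the
 * earlier bypass, WRITEBACK-mode and dirty-stripe checks are left out
 * and CUTOFF_WRITEBACK is assumed to be 70 (percent). */
#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK 70

static bool should_writeback_tail(bool op_is_sync, bool meta_or_prio,
				  unsigned in_use_percent)
{
	return op_is_sync || meta_or_prio ||
	       in_use_percent <= CUTOFF_WRITEBACK;
}

int main(void)
{
	/* a REQ_META write now goes to writeback even at 85% cache use */
	printf("%d\n", should_writeback_tail(false, true, 85));		/* 1 */
	/* a plain async write at 85% still bypasses writeback */
	printf("%d\n", should_writeback_tail(false, false, 85));	/* 0 */
	return 0;
}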
@@ -90,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) | |||
90 | { | 92 | { |
91 | if (!atomic_read(&dc->has_dirty) && | 93 | if (!atomic_read(&dc->has_dirty) && |
92 | !atomic_xchg(&dc->has_dirty, 1)) { | 94 | !atomic_xchg(&dc->has_dirty, 1)) { |
93 | atomic_inc(&dc->count); | 95 | refcount_inc(&dc->count); |
94 | 96 | ||
95 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | 97 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { |
96 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | 98 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4ab..4d8ed74efadf 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, | |||
368 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, | 368 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, |
369 | (unsigned long long)index << PAGE_SHIFT); | 369 | (unsigned long long)index << PAGE_SHIFT); |
370 | 370 | ||
371 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); | 371 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); |
372 | if (!bh) { | 372 | if (!bh) { |
373 | ret = -ENOMEM; | 373 | ret = -ENOMEM; |
374 | goto out; | 374 | goto out; |
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c | |||
@@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) | |||
56 | 56 | ||
57 | int dm_request_based(struct mapped_device *md) | 57 | int dm_request_based(struct mapped_device *md) |
58 | { | 58 | { |
59 | return blk_queue_stackable(md->queue); | 59 | return queue_is_rq_based(md->queue); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void dm_old_start_queue(struct request_queue *q) | 62 | static void dm_old_start_queue(struct request_queue *q) |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..75281828f2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -1000,7 +1000,7 @@ verify_rq_based: | |||
1000 | list_for_each_entry(dd, devices, list) { | 1000 | list_for_each_entry(dd, devices, list) { |
1001 | struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); | 1001 | struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); |
1002 | 1002 | ||
1003 | if (!blk_queue_stackable(q)) { | 1003 | if (!queue_is_rq_based(q)) { |
1004 | DMERR("table load rejected: including" | 1004 | DMERR("table load rejected: including" |
1005 | " non-request-stackable devices"); | 1005 | " non-request-stackable devices"); |
1006 | return -EINVAL; | 1006 | return -EINVAL; |
@@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1847 | */ | 1847 | */ |
1848 | if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) | 1848 | if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) |
1849 | queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); | 1849 | queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); |
1850 | |||
1851 | /* | ||
1852 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
1853 | * visible to other CPUs because, once the flag is set, incoming bios | ||
1854 | * are processed by request-based dm, which refers to the queue | ||
1855 | * settings. | ||
1856 | * Until the flag set, bios are passed to bio-based dm and queued to | ||
1857 | * md->deferred where queue settings are not needed yet. | ||
1858 | * Those bios are passed to request-based dm at the resume time. | ||
1859 | */ | ||
1860 | smp_mb(); | ||
1861 | if (dm_table_request_based(t)) | ||
1862 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
1863 | } | 1850 | } |
1864 | 1851 | ||
1865 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1852 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8aaffa19b29a..a3f8cbb98dd5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -1619,17 +1619,6 @@ static void dm_wq_work(struct work_struct *work); | |||
1619 | void dm_init_md_queue(struct mapped_device *md) | 1619 | void dm_init_md_queue(struct mapped_device *md) |
1620 | { | 1620 | { |
1621 | /* | 1621 | /* |
1622 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
1623 | * devices. The type of this dm device may not have been decided yet. | ||
1624 | * The type is decided at the first table loading time. | ||
1625 | * To prevent problematic device stacking, clear the queue flag | ||
1626 | * for request stacking support until then. | ||
1627 | * | ||
1628 | * This queue is new, so no concurrency on the queue_flags. | ||
1629 | */ | ||
1630 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
1631 | |||
1632 | /* | ||
1633 | * Initialize data that will only be used by a non-blk-mq DM queue | 1622 | * Initialize data that will only be used by a non-blk-mq DM queue |
1634 | * - must do so here (in alloc_dev callchain) before queue is used | 1623 | * - must do so here (in alloc_dev callchain) before queue is used |
1635 | */ | 1624 | */ |
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index b7c78a5b1f7a..04008e0bbe81 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig | |||
@@ -1,2 +1,6 @@ | |||
1 | menu "NVME Support" | ||
2 | |||
1 | source "drivers/nvme/host/Kconfig" | 3 | source "drivers/nvme/host/Kconfig" |
2 | source "drivers/nvme/target/Kconfig" | 4 | source "drivers/nvme/target/Kconfig" |
5 | |||
6 | endmenu | ||
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 46d6cb1e03bd..b979cf3bce65 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig | |||
@@ -13,6 +13,15 @@ config BLK_DEV_NVME | |||
13 | To compile this driver as a module, choose M here: the | 13 | To compile this driver as a module, choose M here: the |
14 | module will be called nvme. | 14 | module will be called nvme. |
15 | 15 | ||
16 | config NVME_MULTIPATH | ||
17 | bool "NVMe multipath support" | ||
18 | depends on NVME_CORE | ||
19 | ---help--- | ||
20 | This option enables support for multipath access to NVMe | ||
21 | subsystems. If this option is enabled only a single | ||
22 | /dev/nvmeXnY device will show up for each NVMe namespaces, | ||
23 | even if it is accessible through multiple controllers. | ||
24 | |||
16 | config NVME_FABRICS | 25 | config NVME_FABRICS |
17 | tristate | 26 | tristate |
18 | 27 | ||
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 7b96e4588a12..a25fd43650ad 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile | |||
@@ -6,6 +6,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o | |||
6 | obj-$(CONFIG_NVME_FC) += nvme-fc.o | 6 | obj-$(CONFIG_NVME_FC) += nvme-fc.o |
7 | 7 | ||
8 | nvme-core-y := core.o | 8 | nvme-core-y := core.o |
9 | nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o | ||
9 | nvme-core-$(CONFIG_NVM) += lightnvm.o | 10 | nvme-core-$(CONFIG_NVM) += lightnvm.o |
10 | 11 | ||
11 | nvme-y += pci.o | 12 | nvme-y += pci.o |
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 37f9039bb9ca..25da74d310d1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c | |||
@@ -34,13 +34,13 @@ | |||
34 | 34 | ||
35 | #define NVME_MINORS (1U << MINORBITS) | 35 | #define NVME_MINORS (1U << MINORBITS) |
36 | 36 | ||
37 | unsigned char admin_timeout = 60; | 37 | unsigned int admin_timeout = 60; |
38 | module_param(admin_timeout, byte, 0644); | 38 | module_param(admin_timeout, uint, 0644); |
39 | MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); | 39 | MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); |
40 | EXPORT_SYMBOL_GPL(admin_timeout); | 40 | EXPORT_SYMBOL_GPL(admin_timeout); |
41 | 41 | ||
42 | unsigned char nvme_io_timeout = 30; | 42 | unsigned int nvme_io_timeout = 30; |
43 | module_param_named(io_timeout, nvme_io_timeout, byte, 0644); | 43 | module_param_named(io_timeout, nvme_io_timeout, uint, 0644); |
44 | MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); | 44 | MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); |
45 | EXPORT_SYMBOL_GPL(nvme_io_timeout); | 45 | EXPORT_SYMBOL_GPL(nvme_io_timeout); |
46 | 46 | ||
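Widening admin_timeout and nvme_io_timeout from byte to uint means values above 255 seconds are no longer silently truncated. Both stay ordinary 0644 module parameters, so they can be inspected from userspace; a minimal sketch, assuming the nvme_core module is loaded and the standard /sys/module layout:

        #include <stdio.h>

        static long read_param(const char *name)
        {
                char path[96];
                long val = -1;
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/module/nvme_core/parameters/%s", name);
                f = fopen(path, "r");
                if (f) {
                        if (fscanf(f, "%ld", &val) != 1)
                                val = -1;
                        fclose(f);
                }
                return val;
        }

        int main(void)
        {
                printf("admin_timeout: %ld s\n", read_param("admin_timeout"));
                printf("io_timeout:    %ld s\n", read_param("io_timeout"));
                return 0;
        }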
@@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5; | |||
52 | module_param_named(max_retries, nvme_max_retries, byte, 0644); | 52 | module_param_named(max_retries, nvme_max_retries, byte, 0644); |
53 | MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); | 53 | MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); |
54 | 54 | ||
55 | static int nvme_char_major; | ||
56 | module_param(nvme_char_major, int, 0); | ||
57 | |||
58 | static unsigned long default_ps_max_latency_us = 100000; | 55 | static unsigned long default_ps_max_latency_us = 100000; |
59 | module_param(default_ps_max_latency_us, ulong, 0644); | 56 | module_param(default_ps_max_latency_us, ulong, 0644); |
60 | MODULE_PARM_DESC(default_ps_max_latency_us, | 57 | MODULE_PARM_DESC(default_ps_max_latency_us, |
@@ -71,10 +68,17 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); | |||
71 | struct workqueue_struct *nvme_wq; | 68 | struct workqueue_struct *nvme_wq; |
72 | EXPORT_SYMBOL_GPL(nvme_wq); | 69 | EXPORT_SYMBOL_GPL(nvme_wq); |
73 | 70 | ||
74 | static LIST_HEAD(nvme_ctrl_list); | 71 | static DEFINE_IDA(nvme_subsystems_ida); |
75 | static DEFINE_SPINLOCK(dev_list_lock); | 72 | static LIST_HEAD(nvme_subsystems); |
73 | static DEFINE_MUTEX(nvme_subsystems_lock); | ||
76 | 74 | ||
75 | static DEFINE_IDA(nvme_instance_ida); | ||
76 | static dev_t nvme_chr_devt; | ||
77 | static struct class *nvme_class; | 77 | static struct class *nvme_class; |
78 | static struct class *nvme_subsys_class; | ||
79 | |||
80 | static void nvme_ns_remove(struct nvme_ns *ns); | ||
81 | static int nvme_revalidate_disk(struct gendisk *disk); | ||
78 | 82 | ||
79 | static __le32 nvme_get_log_dw10(u8 lid, size_t size) | 83 | static __le32 nvme_get_log_dw10(u8 lid, size_t size) |
80 | { | 84 | { |
@@ -101,6 +105,51 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) | |||
101 | return ret; | 105 | return ret; |
102 | } | 106 | } |
103 | 107 | ||
108 | static void nvme_delete_ctrl_work(struct work_struct *work) | ||
109 | { | ||
110 | struct nvme_ctrl *ctrl = | ||
111 | container_of(work, struct nvme_ctrl, delete_work); | ||
112 | |||
113 | flush_work(&ctrl->reset_work); | ||
114 | nvme_stop_ctrl(ctrl); | ||
115 | nvme_remove_namespaces(ctrl); | ||
116 | ctrl->ops->delete_ctrl(ctrl); | ||
117 | nvme_uninit_ctrl(ctrl); | ||
118 | nvme_put_ctrl(ctrl); | ||
119 | } | ||
120 | |||
121 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl) | ||
122 | { | ||
123 | if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) | ||
124 | return -EBUSY; | ||
125 | if (!queue_work(nvme_wq, &ctrl->delete_work)) | ||
126 | return -EBUSY; | ||
127 | return 0; | ||
128 | } | ||
129 | EXPORT_SYMBOL_GPL(nvme_delete_ctrl); | ||
130 | |||
131 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) | ||
132 | { | ||
133 | int ret = 0; | ||
134 | |||
135 | /* | ||
136 | * Keep a reference until the work is flushed since ->delete_ctrl | ||
137 | * can free the controller. | ||
138 | */ | ||
139 | nvme_get_ctrl(ctrl); | ||
140 | ret = nvme_delete_ctrl(ctrl); | ||
141 | if (!ret) | ||
142 | flush_work(&ctrl->delete_work); | ||
143 | nvme_put_ctrl(ctrl); | ||
144 | return ret; | ||
145 | } | ||
146 | EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); | ||
147 | |||
148 | static inline bool nvme_ns_has_pi(struct nvme_ns *ns) | ||
149 | { | ||
150 | return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); | ||
151 | } | ||
152 | |||
104 | static blk_status_t nvme_error_status(struct request *req) | 153 | static blk_status_t nvme_error_status(struct request *req) |
105 | { | 154 | { |
106 | switch (nvme_req(req)->status & 0x7ff) { | 155 | switch (nvme_req(req)->status & 0x7ff) { |
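The new delete path centralizes controller teardown: nvme_delete_ctrl() only moves the controller to NVME_CTRL_DELETING and queues delete_work, while nvme_delete_ctrl_sync() additionally pins the controller so flushing the work cannot race with ->delete_ctrl() freeing it. A hedged sketch of how a transport removal path might use it; the surrounding function name is illustrative, not taken from this series:

        static void example_transport_remove(struct nvme_ctrl *ctrl)
        {
                /*
                 * Returns -EBUSY if the controller is already being torn down
                 * (state change or queue_work() failed); otherwise it does not
                 * return until namespaces are removed and the controller is
                 * uninitialized and unreferenced.
                 */
                if (nvme_delete_ctrl_sync(ctrl))
                        dev_warn(ctrl->device, "controller already being deleted\n");
        }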
@@ -142,9 +191,16 @@ static inline bool nvme_req_needs_retry(struct request *req) | |||
142 | void nvme_complete_rq(struct request *req) | 191 | void nvme_complete_rq(struct request *req) |
143 | { | 192 | { |
144 | if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { | 193 | if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { |
145 | nvme_req(req)->retries++; | 194 | if (nvme_req_needs_failover(req)) { |
146 | blk_mq_requeue_request(req, true); | 195 | nvme_failover_req(req); |
147 | return; | 196 | return; |
197 | } | ||
198 | |||
199 | if (!blk_queue_dying(req->q)) { | ||
200 | nvme_req(req)->retries++; | ||
201 | blk_mq_requeue_request(req, true); | ||
202 | return; | ||
203 | } | ||
148 | } | 204 | } |
149 | 205 | ||
150 | blk_mq_end_request(req, nvme_error_status(req)); | 206 | blk_mq_end_request(req, nvme_error_status(req)); |
@@ -153,18 +209,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); | |||
153 | 209 | ||
154 | void nvme_cancel_request(struct request *req, void *data, bool reserved) | 210 | void nvme_cancel_request(struct request *req, void *data, bool reserved) |
155 | { | 211 | { |
156 | int status; | ||
157 | |||
158 | if (!blk_mq_request_started(req)) | 212 | if (!blk_mq_request_started(req)) |
159 | return; | 213 | return; |
160 | 214 | ||
161 | dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, | 215 | dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, |
162 | "Cancelling I/O %d", req->tag); | 216 | "Cancelling I/O %d", req->tag); |
163 | 217 | ||
164 | status = NVME_SC_ABORT_REQ; | 218 | nvme_req(req)->status = NVME_SC_ABORT_REQ; |
165 | if (blk_queue_dying(req->q)) | ||
166 | status |= NVME_SC_DNR; | ||
167 | nvme_req(req)->status = status; | ||
168 | blk_mq_complete_request(req); | 219 | blk_mq_complete_request(req); |
169 | 220 | ||
170 | } | 221 | } |
@@ -205,6 +256,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | |||
205 | case NVME_CTRL_RECONNECTING: | 256 | case NVME_CTRL_RECONNECTING: |
206 | switch (old_state) { | 257 | switch (old_state) { |
207 | case NVME_CTRL_LIVE: | 258 | case NVME_CTRL_LIVE: |
259 | case NVME_CTRL_RESETTING: | ||
208 | changed = true; | 260 | changed = true; |
209 | /* FALLTHRU */ | 261 | /* FALLTHRU */ |
210 | default: | 262 | default: |
@@ -239,11 +291,29 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | |||
239 | ctrl->state = new_state; | 291 | ctrl->state = new_state; |
240 | 292 | ||
241 | spin_unlock_irqrestore(&ctrl->lock, flags); | 293 | spin_unlock_irqrestore(&ctrl->lock, flags); |
242 | 294 | if (changed && ctrl->state == NVME_CTRL_LIVE) | |
295 | nvme_kick_requeue_lists(ctrl); | ||
243 | return changed; | 296 | return changed; |
244 | } | 297 | } |
245 | EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); | 298 | EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); |
246 | 299 | ||
300 | static void nvme_free_ns_head(struct kref *ref) | ||
301 | { | ||
302 | struct nvme_ns_head *head = | ||
303 | container_of(ref, struct nvme_ns_head, ref); | ||
304 | |||
305 | nvme_mpath_remove_disk(head); | ||
306 | ida_simple_remove(&head->subsys->ns_ida, head->instance); | ||
307 | list_del_init(&head->entry); | ||
308 | cleanup_srcu_struct(&head->srcu); | ||
309 | kfree(head); | ||
310 | } | ||
311 | |||
312 | static void nvme_put_ns_head(struct nvme_ns_head *head) | ||
313 | { | ||
314 | kref_put(&head->ref, nvme_free_ns_head); | ||
315 | } | ||
316 | |||
247 | static void nvme_free_ns(struct kref *kref) | 317 | static void nvme_free_ns(struct kref *kref) |
248 | { | 318 | { |
249 | struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); | 319 | struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); |
@@ -251,14 +321,8 @@ static void nvme_free_ns(struct kref *kref) | |||
251 | if (ns->ndev) | 321 | if (ns->ndev) |
252 | nvme_nvm_unregister(ns); | 322 | nvme_nvm_unregister(ns); |
253 | 323 | ||
254 | if (ns->disk) { | ||
255 | spin_lock(&dev_list_lock); | ||
256 | ns->disk->private_data = NULL; | ||
257 | spin_unlock(&dev_list_lock); | ||
258 | } | ||
259 | |||
260 | put_disk(ns->disk); | 324 | put_disk(ns->disk); |
261 | ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); | 325 | nvme_put_ns_head(ns->head); |
262 | nvme_put_ctrl(ns->ctrl); | 326 | nvme_put_ctrl(ns->ctrl); |
263 | kfree(ns); | 327 | kfree(ns); |
264 | } | 328 | } |
@@ -268,31 +332,8 @@ static void nvme_put_ns(struct nvme_ns *ns) | |||
268 | kref_put(&ns->kref, nvme_free_ns); | 332 | kref_put(&ns->kref, nvme_free_ns); |
269 | } | 333 | } |
270 | 334 | ||
271 | static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) | ||
272 | { | ||
273 | struct nvme_ns *ns; | ||
274 | |||
275 | spin_lock(&dev_list_lock); | ||
276 | ns = disk->private_data; | ||
277 | if (ns) { | ||
278 | if (!kref_get_unless_zero(&ns->kref)) | ||
279 | goto fail; | ||
280 | if (!try_module_get(ns->ctrl->ops->module)) | ||
281 | goto fail_put_ns; | ||
282 | } | ||
283 | spin_unlock(&dev_list_lock); | ||
284 | |||
285 | return ns; | ||
286 | |||
287 | fail_put_ns: | ||
288 | kref_put(&ns->kref, nvme_free_ns); | ||
289 | fail: | ||
290 | spin_unlock(&dev_list_lock); | ||
291 | return NULL; | ||
292 | } | ||
293 | |||
294 | struct request *nvme_alloc_request(struct request_queue *q, | 335 | struct request *nvme_alloc_request(struct request_queue *q, |
295 | struct nvme_command *cmd, unsigned int flags, int qid) | 336 | struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) |
296 | { | 337 | { |
297 | unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; | 338 | unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; |
298 | struct request *req; | 339 | struct request *req; |
@@ -417,7 +458,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, | |||
417 | { | 458 | { |
418 | memset(cmnd, 0, sizeof(*cmnd)); | 459 | memset(cmnd, 0, sizeof(*cmnd)); |
419 | cmnd->common.opcode = nvme_cmd_flush; | 460 | cmnd->common.opcode = nvme_cmd_flush; |
420 | cmnd->common.nsid = cpu_to_le32(ns->ns_id); | 461 | cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); |
421 | } | 462 | } |
422 | 463 | ||
423 | static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, | 464 | static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, |
@@ -448,7 +489,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, | |||
448 | 489 | ||
449 | memset(cmnd, 0, sizeof(*cmnd)); | 490 | memset(cmnd, 0, sizeof(*cmnd)); |
450 | cmnd->dsm.opcode = nvme_cmd_dsm; | 491 | cmnd->dsm.opcode = nvme_cmd_dsm; |
451 | cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); | 492 | cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); |
452 | cmnd->dsm.nr = cpu_to_le32(segments - 1); | 493 | cmnd->dsm.nr = cpu_to_le32(segments - 1); |
453 | cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); | 494 | cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); |
454 | 495 | ||
@@ -467,16 +508,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
467 | u16 control = 0; | 508 | u16 control = 0; |
468 | u32 dsmgmt = 0; | 509 | u32 dsmgmt = 0; |
469 | 510 | ||
470 | /* | ||
471 | * If formated with metadata, require the block layer provide a buffer | ||
472 | * unless this namespace is formated such that the metadata can be | ||
473 | * stripped/generated by the controller with PRACT=1. | ||
474 | */ | ||
475 | if (ns && ns->ms && | ||
476 | (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && | ||
477 | !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) | ||
478 | return BLK_STS_NOTSUPP; | ||
479 | |||
480 | if (req->cmd_flags & REQ_FUA) | 511 | if (req->cmd_flags & REQ_FUA) |
481 | control |= NVME_RW_FUA; | 512 | control |= NVME_RW_FUA; |
482 | if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) | 513 | if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) |
@@ -487,7 +518,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
487 | 518 | ||
488 | memset(cmnd, 0, sizeof(*cmnd)); | 519 | memset(cmnd, 0, sizeof(*cmnd)); |
489 | cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); | 520 | cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); |
490 | cmnd->rw.nsid = cpu_to_le32(ns->ns_id); | 521 | cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); |
491 | cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); | 522 | cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); |
492 | cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); | 523 | cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); |
493 | 524 | ||
@@ -495,6 +526,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
495 | nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); | 526 | nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); |
496 | 527 | ||
497 | if (ns->ms) { | 528 | if (ns->ms) { |
529 | /* | ||
530 | * If formated with metadata, the block layer always provides a | ||
531 | * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else | ||
532 | * we enable the PRACT bit for protection information or set the | ||
533 | * namespace capacity to zero to prevent any I/O. | ||
534 | */ | ||
535 | if (!blk_integrity_rq(req)) { | ||
536 | if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) | ||
537 | return BLK_STS_NOTSUPP; | ||
538 | control |= NVME_RW_PRINFO_PRACT; | ||
539 | } | ||
540 | |||
498 | switch (ns->pi_type) { | 541 | switch (ns->pi_type) { |
499 | case NVME_NS_DPS_PI_TYPE3: | 542 | case NVME_NS_DPS_PI_TYPE3: |
500 | control |= NVME_RW_PRINFO_PRCHK_GUARD; | 543 | control |= NVME_RW_PRINFO_PRCHK_GUARD; |
@@ -507,8 +550,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
507 | nvme_block_nr(ns, blk_rq_pos(req))); | 550 | nvme_block_nr(ns, blk_rq_pos(req))); |
508 | break; | 551 | break; |
509 | } | 552 | } |
510 | if (!blk_integrity_rq(req)) | ||
511 | control |= NVME_RW_PRINFO_PRACT; | ||
512 | } | 553 | } |
513 | 554 | ||
514 | cmnd->rw.control = cpu_to_le16(control); | 555 | cmnd->rw.control = cpu_to_le16(control); |
@@ -560,7 +601,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); | |||
560 | */ | 601 | */ |
561 | int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, | 602 | int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, |
562 | union nvme_result *result, void *buffer, unsigned bufflen, | 603 | union nvme_result *result, void *buffer, unsigned bufflen, |
563 | unsigned timeout, int qid, int at_head, int flags) | 604 | unsigned timeout, int qid, int at_head, |
605 | blk_mq_req_flags_t flags) | ||
564 | { | 606 | { |
565 | struct request *req; | 607 | struct request *req; |
566 | int ret; | 608 | int ret; |
@@ -778,7 +820,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) | |||
778 | } | 820 | } |
779 | 821 | ||
780 | static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, | 822 | static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, |
781 | u8 *eui64, u8 *nguid, uuid_t *uuid) | 823 | struct nvme_ns_ids *ids) |
782 | { | 824 | { |
783 | struct nvme_command c = { }; | 825 | struct nvme_command c = { }; |
784 | int status; | 826 | int status; |
@@ -814,7 +856,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, | |||
814 | goto free_data; | 856 | goto free_data; |
815 | } | 857 | } |
816 | len = NVME_NIDT_EUI64_LEN; | 858 | len = NVME_NIDT_EUI64_LEN; |
817 | memcpy(eui64, data + pos + sizeof(*cur), len); | 859 | memcpy(ids->eui64, data + pos + sizeof(*cur), len); |
818 | break; | 860 | break; |
819 | case NVME_NIDT_NGUID: | 861 | case NVME_NIDT_NGUID: |
820 | if (cur->nidl != NVME_NIDT_NGUID_LEN) { | 862 | if (cur->nidl != NVME_NIDT_NGUID_LEN) { |
@@ -824,7 +866,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, | |||
824 | goto free_data; | 866 | goto free_data; |
825 | } | 867 | } |
826 | len = NVME_NIDT_NGUID_LEN; | 868 | len = NVME_NIDT_NGUID_LEN; |
827 | memcpy(nguid, data + pos + sizeof(*cur), len); | 869 | memcpy(ids->nguid, data + pos + sizeof(*cur), len); |
828 | break; | 870 | break; |
829 | case NVME_NIDT_UUID: | 871 | case NVME_NIDT_UUID: |
830 | if (cur->nidl != NVME_NIDT_UUID_LEN) { | 872 | if (cur->nidl != NVME_NIDT_UUID_LEN) { |
@@ -834,7 +876,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, | |||
834 | goto free_data; | 876 | goto free_data; |
835 | } | 877 | } |
836 | len = NVME_NIDT_UUID_LEN; | 878 | len = NVME_NIDT_UUID_LEN; |
837 | uuid_copy(uuid, data + pos + sizeof(*cur)); | 879 | uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); |
838 | break; | 880 | break; |
839 | default: | 881 | default: |
840 | /* Skip unnkown types */ | 882 | /* Skip unnkown types */ |
@@ -968,7 +1010,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) | |||
968 | memset(&c, 0, sizeof(c)); | 1010 | memset(&c, 0, sizeof(c)); |
969 | c.rw.opcode = io.opcode; | 1011 | c.rw.opcode = io.opcode; |
970 | c.rw.flags = io.flags; | 1012 | c.rw.flags = io.flags; |
971 | c.rw.nsid = cpu_to_le32(ns->ns_id); | 1013 | c.rw.nsid = cpu_to_le32(ns->head->ns_id); |
972 | c.rw.slba = cpu_to_le64(io.slba); | 1014 | c.rw.slba = cpu_to_le64(io.slba); |
973 | c.rw.length = cpu_to_le16(io.nblocks); | 1015 | c.rw.length = cpu_to_le16(io.nblocks); |
974 | c.rw.control = cpu_to_le16(io.control); | 1016 | c.rw.control = cpu_to_le16(io.control); |
@@ -982,12 +1024,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) | |||
982 | metadata, meta_len, io.slba, NULL, 0); | 1024 | metadata, meta_len, io.slba, NULL, 0); |
983 | } | 1025 | } |
984 | 1026 | ||
1027 | static u32 nvme_known_admin_effects(u8 opcode) | ||
1028 | { | ||
1029 | switch (opcode) { | ||
1030 | case nvme_admin_format_nvm: | ||
1031 | return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | | ||
1032 | NVME_CMD_EFFECTS_CSE_MASK; | ||
1033 | case nvme_admin_sanitize_nvm: | ||
1034 | return NVME_CMD_EFFECTS_CSE_MASK; | ||
1035 | default: | ||
1036 | break; | ||
1037 | } | ||
1038 | return 0; | ||
1039 | } | ||
1040 | |||
1041 | static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | ||
1042 | u8 opcode) | ||
1043 | { | ||
1044 | u32 effects = 0; | ||
1045 | |||
1046 | if (ns) { | ||
1047 | if (ctrl->effects) | ||
1048 | effects = le32_to_cpu(ctrl->effects->iocs[opcode]); | ||
1049 | if (effects & ~NVME_CMD_EFFECTS_CSUPP) | ||
1050 | dev_warn(ctrl->device, | ||
1051 | "IO command:%02x has unhandled effects:%08x\n", | ||
1052 | opcode, effects); | ||
1053 | return 0; | ||
1054 | } | ||
1055 | |||
1056 | if (ctrl->effects) | ||
1057 | effects = le32_to_cpu(ctrl->effects->iocs[opcode]); | ||
1058 | else | ||
1059 | effects = nvme_known_admin_effects(opcode); | ||
1060 | |||
1061 | /* | ||
1062 | * For simplicity, IO to all namespaces is quiesced even if the command | ||
1063 | * effects say only one namespace is affected. | ||
1064 | */ | ||
1065 | if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { | ||
1066 | nvme_start_freeze(ctrl); | ||
1067 | nvme_wait_freeze(ctrl); | ||
1068 | } | ||
1069 | return effects; | ||
1070 | } | ||
1071 | |||
1072 | static void nvme_update_formats(struct nvme_ctrl *ctrl) | ||
1073 | { | ||
1074 | struct nvme_ns *ns; | ||
1075 | |||
1076 | mutex_lock(&ctrl->namespaces_mutex); | ||
1077 | list_for_each_entry(ns, &ctrl->namespaces, list) { | ||
1078 | if (ns->disk && nvme_revalidate_disk(ns->disk)) | ||
1079 | nvme_ns_remove(ns); | ||
1080 | } | ||
1081 | mutex_unlock(&ctrl->namespaces_mutex); | ||
1082 | } | ||
1083 | |||
1084 | static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) | ||
1085 | { | ||
1086 | /* | ||
1087 | * Revalidate LBA changes prior to unfreezing. This is necessary to | ||
1088 | * prevent memory corruption if a logical block size was changed by | ||
1089 | * this command. | ||
1090 | */ | ||
1091 | if (effects & NVME_CMD_EFFECTS_LBCC) | ||
1092 | nvme_update_formats(ctrl); | ||
1093 | if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) | ||
1094 | nvme_unfreeze(ctrl); | ||
1095 | if (effects & NVME_CMD_EFFECTS_CCC) | ||
1096 | nvme_init_identify(ctrl); | ||
1097 | if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) | ||
1098 | nvme_queue_scan(ctrl); | ||
1099 | } | ||
1100 | |||
985 | static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | 1101 | static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, |
986 | struct nvme_passthru_cmd __user *ucmd) | 1102 | struct nvme_passthru_cmd __user *ucmd) |
987 | { | 1103 | { |
988 | struct nvme_passthru_cmd cmd; | 1104 | struct nvme_passthru_cmd cmd; |
989 | struct nvme_command c; | 1105 | struct nvme_command c; |
990 | unsigned timeout = 0; | 1106 | unsigned timeout = 0; |
1107 | u32 effects; | ||
991 | int status; | 1108 | int status; |
992 | 1109 | ||
993 | if (!capable(CAP_SYS_ADMIN)) | 1110 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1013,10 +1130,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | |||
1013 | if (cmd.timeout_ms) | 1130 | if (cmd.timeout_ms) |
1014 | timeout = msecs_to_jiffies(cmd.timeout_ms); | 1131 | timeout = msecs_to_jiffies(cmd.timeout_ms); |
1015 | 1132 | ||
1133 | effects = nvme_passthru_start(ctrl, ns, cmd.opcode); | ||
1016 | status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, | 1134 | status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, |
1017 | (void __user *)(uintptr_t)cmd.addr, cmd.data_len, | 1135 | (void __user *)(uintptr_t)cmd.addr, cmd.data_len, |
1018 | (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, | 1136 | (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, |
1019 | 0, &cmd.result, timeout); | 1137 | 0, &cmd.result, timeout); |
1138 | nvme_passthru_end(ctrl, effects); | ||
1139 | |||
1020 | if (status >= 0) { | 1140 | if (status >= 0) { |
1021 | if (put_user(cmd.result, &ucmd->result)) | 1141 | if (put_user(cmd.result, &ucmd->result)) |
1022 | return -EFAULT; | 1142 | return -EFAULT; |
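nvme_passthru_start()/nvme_passthru_end() wrap user passthrough commands so that anything the Commands Supported and Effects log (or the built-in table for Format/Sanitize) marks as changing logical block content or needing controller-exclusive access freezes I/O first and revalidates namespaces afterwards. Nothing changes in how userspace issues such commands; a minimal sketch using the existing NVME_IOCTL_ADMIN_CMD interface, here with a side-effect-free Identify Controller and /dev/nvme0 as an illustrative controller node:

        #include <stdio.h>
        #include <stdlib.h>
        #include <stdint.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/nvme_ioctl.h>

        int main(void)
        {
                struct nvme_admin_cmd cmd;
                void *data = calloc(1, 4096);
                int fd = open("/dev/nvme0", O_RDONLY);
                int ret;

                if (fd < 0 || !data) {
                        perror("setup");
                        return 1;
                }
                memset(&cmd, 0, sizeof(cmd));
                cmd.opcode = 0x06;              /* Identify */
                cmd.addr = (uintptr_t)data;
                cmd.data_len = 4096;
                cmd.cdw10 = 1;                  /* CNS=1: Identify Controller */

                ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
                if (ret)
                        perror("NVME_IOCTL_ADMIN_CMD");
                else
                        printf("model: %.40s\n", (char *)data + 24);    /* MN field */
                free(data);
                close(fd);
                return ret != 0;
        }

A Format NVM (opcode 0x80) issued the same way is what now triggers the freeze/revalidate sequence above, since its known effects include NVME_CMD_EFFECTS_LBCC and NVME_CMD_EFFECTS_CSE_MASK.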
@@ -1025,15 +1145,37 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | |||
1025 | return status; | 1145 | return status; |
1026 | } | 1146 | } |
1027 | 1147 | ||
1028 | static int nvme_ioctl(struct block_device *bdev, fmode_t mode, | 1148 | /* |
1029 | unsigned int cmd, unsigned long arg) | 1149 | * Issue ioctl requests on the first available path. Note that unlike normal |
1150 | * block layer requests we will not retry failed request on another controller. | ||
1151 | */ | ||
1152 | static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, | ||
1153 | struct nvme_ns_head **head, int *srcu_idx) | ||
1030 | { | 1154 | { |
1031 | struct nvme_ns *ns = bdev->bd_disk->private_data; | 1155 | #ifdef CONFIG_NVME_MULTIPATH |
1156 | if (disk->fops == &nvme_ns_head_ops) { | ||
1157 | *head = disk->private_data; | ||
1158 | *srcu_idx = srcu_read_lock(&(*head)->srcu); | ||
1159 | return nvme_find_path(*head); | ||
1160 | } | ||
1161 | #endif | ||
1162 | *head = NULL; | ||
1163 | *srcu_idx = -1; | ||
1164 | return disk->private_data; | ||
1165 | } | ||
1032 | 1166 | ||
1167 | static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) | ||
1168 | { | ||
1169 | if (head) | ||
1170 | srcu_read_unlock(&head->srcu, idx); | ||
1171 | } | ||
1172 | |||
1173 | static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) | ||
1174 | { | ||
1033 | switch (cmd) { | 1175 | switch (cmd) { |
1034 | case NVME_IOCTL_ID: | 1176 | case NVME_IOCTL_ID: |
1035 | force_successful_syscall_return(); | 1177 | force_successful_syscall_return(); |
1036 | return ns->ns_id; | 1178 | return ns->head->ns_id; |
1037 | case NVME_IOCTL_ADMIN_CMD: | 1179 | case NVME_IOCTL_ADMIN_CMD: |
1038 | return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); | 1180 | return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); |
1039 | case NVME_IOCTL_IO_CMD: | 1181 | case NVME_IOCTL_IO_CMD: |
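NVME_IOCTL_ID keeps its semantics but now reports ns->head->ns_id, so the same NSID comes back no matter which path currently backs the node. A small userspace check, with /dev/nvme0n1 as an illustrative namespace node:

        #include <stdio.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/nvme_ioctl.h>

        int main(void)
        {
                int fd = open("/dev/nvme0n1", O_RDONLY);
                int nsid;

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* the NSID is returned as the ioctl return value itself */
                nsid = ioctl(fd, NVME_IOCTL_ID);
                if (nsid < 0)
                        perror("NVME_IOCTL_ID");
                else
                        printf("namespace id: %d\n", nsid);
                close(fd);
                return nsid < 0;
        }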
@@ -1052,27 +1194,39 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, | |||
1052 | } | 1194 | } |
1053 | } | 1195 | } |
1054 | 1196 | ||
1055 | #ifdef CONFIG_COMPAT | 1197 | static int nvme_ioctl(struct block_device *bdev, fmode_t mode, |
1056 | static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, | 1198 | unsigned int cmd, unsigned long arg) |
1057 | unsigned int cmd, unsigned long arg) | ||
1058 | { | 1199 | { |
1059 | return nvme_ioctl(bdev, mode, cmd, arg); | 1200 | struct nvme_ns_head *head = NULL; |
1201 | struct nvme_ns *ns; | ||
1202 | int srcu_idx, ret; | ||
1203 | |||
1204 | ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); | ||
1205 | if (unlikely(!ns)) | ||
1206 | ret = -EWOULDBLOCK; | ||
1207 | else | ||
1208 | ret = nvme_ns_ioctl(ns, cmd, arg); | ||
1209 | nvme_put_ns_from_disk(head, srcu_idx); | ||
1210 | return ret; | ||
1060 | } | 1211 | } |
1061 | #else | ||
1062 | #define nvme_compat_ioctl NULL | ||
1063 | #endif | ||
1064 | 1212 | ||
1065 | static int nvme_open(struct block_device *bdev, fmode_t mode) | 1213 | static int nvme_open(struct block_device *bdev, fmode_t mode) |
1066 | { | 1214 | { |
1067 | return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; | 1215 | struct nvme_ns *ns = bdev->bd_disk->private_data; |
1216 | |||
1217 | #ifdef CONFIG_NVME_MULTIPATH | ||
1218 | /* should never be called due to GENHD_FL_HIDDEN */ | ||
1219 | if (WARN_ON_ONCE(ns->head->disk)) | ||
1220 | return -ENXIO; | ||
1221 | #endif | ||
1222 | if (!kref_get_unless_zero(&ns->kref)) | ||
1223 | return -ENXIO; | ||
1224 | return 0; | ||
1068 | } | 1225 | } |
1069 | 1226 | ||
1070 | static void nvme_release(struct gendisk *disk, fmode_t mode) | 1227 | static void nvme_release(struct gendisk *disk, fmode_t mode) |
1071 | { | 1228 | { |
1072 | struct nvme_ns *ns = disk->private_data; | 1229 | nvme_put_ns(disk->private_data); |
1073 | |||
1074 | module_put(ns->ctrl->ops->module); | ||
1075 | nvme_put_ns(ns); | ||
1076 | } | 1230 | } |
1077 | 1231 | ||
1078 | static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) | 1232 | static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
@@ -1085,35 +1239,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |||
1085 | } | 1239 | } |
1086 | 1240 | ||
1087 | #ifdef CONFIG_BLK_DEV_INTEGRITY | 1241 | #ifdef CONFIG_BLK_DEV_INTEGRITY |
1088 | static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, | 1242 | static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) |
1089 | u16 bs) | ||
1090 | { | ||
1091 | struct nvme_ns *ns = disk->private_data; | ||
1092 | u16 old_ms = ns->ms; | ||
1093 | u8 pi_type = 0; | ||
1094 | |||
1095 | ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); | ||
1096 | ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); | ||
1097 | |||
1098 | /* PI implementation requires metadata equal t10 pi tuple size */ | ||
1099 | if (ns->ms == sizeof(struct t10_pi_tuple)) | ||
1100 | pi_type = id->dps & NVME_NS_DPS_PI_MASK; | ||
1101 | |||
1102 | if (blk_get_integrity(disk) && | ||
1103 | (ns->pi_type != pi_type || ns->ms != old_ms || | ||
1104 | bs != queue_logical_block_size(disk->queue) || | ||
1105 | (ns->ms && ns->ext))) | ||
1106 | blk_integrity_unregister(disk); | ||
1107 | |||
1108 | ns->pi_type = pi_type; | ||
1109 | } | ||
1110 | |||
1111 | static void nvme_init_integrity(struct nvme_ns *ns) | ||
1112 | { | 1243 | { |
1113 | struct blk_integrity integrity; | 1244 | struct blk_integrity integrity; |
1114 | 1245 | ||
1115 | memset(&integrity, 0, sizeof(integrity)); | 1246 | memset(&integrity, 0, sizeof(integrity)); |
1116 | switch (ns->pi_type) { | 1247 | switch (pi_type) { |
1117 | case NVME_NS_DPS_PI_TYPE3: | 1248 | case NVME_NS_DPS_PI_TYPE3: |
1118 | integrity.profile = &t10_pi_type3_crc; | 1249 | integrity.profile = &t10_pi_type3_crc; |
1119 | integrity.tag_size = sizeof(u16) + sizeof(u32); | 1250 | integrity.tag_size = sizeof(u16) + sizeof(u32); |
@@ -1129,16 +1260,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) | |||
1129 | integrity.profile = NULL; | 1260 | integrity.profile = NULL; |
1130 | break; | 1261 | break; |
1131 | } | 1262 | } |
1132 | integrity.tuple_size = ns->ms; | 1263 | integrity.tuple_size = ms; |
1133 | blk_integrity_register(ns->disk, &integrity); | 1264 | blk_integrity_register(disk, &integrity); |
1134 | blk_queue_max_integrity_segments(ns->queue, 1); | 1265 | blk_queue_max_integrity_segments(disk->queue, 1); |
1135 | } | 1266 | } |
1136 | #else | 1267 | #else |
1137 | static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, | 1268 | static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) |
1138 | u16 bs) | ||
1139 | { | ||
1140 | } | ||
1141 | static void nvme_init_integrity(struct nvme_ns *ns) | ||
1142 | { | 1269 | { |
1143 | } | 1270 | } |
1144 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 1271 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
@@ -1149,53 +1276,89 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) | |||
1149 | blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); | 1276 | blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); |
1150 | } | 1277 | } |
1151 | 1278 | ||
1152 | static void nvme_config_discard(struct nvme_ns *ns) | 1279 | static void nvme_config_discard(struct nvme_ctrl *ctrl, |
1280 | unsigned stream_alignment, struct request_queue *queue) | ||
1153 | { | 1281 | { |
1154 | struct nvme_ctrl *ctrl = ns->ctrl; | 1282 | u32 size = queue_logical_block_size(queue); |
1155 | u32 logical_block_size = queue_logical_block_size(ns->queue); | 1283 | |
1284 | if (stream_alignment) | ||
1285 | size *= stream_alignment; | ||
1156 | 1286 | ||
1157 | BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < | 1287 | BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < |
1158 | NVME_DSM_MAX_RANGES); | 1288 | NVME_DSM_MAX_RANGES); |
1159 | 1289 | ||
1160 | if (ctrl->nr_streams && ns->sws && ns->sgs) { | 1290 | queue->limits.discard_alignment = size; |
1161 | unsigned int sz = logical_block_size * ns->sws * ns->sgs; | 1291 | queue->limits.discard_granularity = size; |
1162 | 1292 | ||
1163 | ns->queue->limits.discard_alignment = sz; | 1293 | blk_queue_max_discard_sectors(queue, UINT_MAX); |
1164 | ns->queue->limits.discard_granularity = sz; | 1294 | blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); |
1165 | } else { | 1295 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue); |
1166 | ns->queue->limits.discard_alignment = logical_block_size; | ||
1167 | ns->queue->limits.discard_granularity = logical_block_size; | ||
1168 | } | ||
1169 | blk_queue_max_discard_sectors(ns->queue, UINT_MAX); | ||
1170 | blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); | ||
1171 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); | ||
1172 | 1296 | ||
1173 | if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) | 1297 | if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) |
1174 | blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); | 1298 | blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); |
1175 | } | 1299 | } |
1176 | 1300 | ||
1177 | static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, | 1301 | static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, |
1178 | struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) | 1302 | struct nvme_id_ns *id, struct nvme_ns_ids *ids) |
1179 | { | 1303 | { |
1304 | memset(ids, 0, sizeof(*ids)); | ||
1305 | |||
1180 | if (ctrl->vs >= NVME_VS(1, 1, 0)) | 1306 | if (ctrl->vs >= NVME_VS(1, 1, 0)) |
1181 | memcpy(eui64, id->eui64, sizeof(id->eui64)); | 1307 | memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); |
1182 | if (ctrl->vs >= NVME_VS(1, 2, 0)) | 1308 | if (ctrl->vs >= NVME_VS(1, 2, 0)) |
1183 | memcpy(nguid, id->nguid, sizeof(id->nguid)); | 1309 | memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); |
1184 | if (ctrl->vs >= NVME_VS(1, 3, 0)) { | 1310 | if (ctrl->vs >= NVME_VS(1, 3, 0)) { |
1185 | /* Don't treat error as fatal we potentially | 1311 | /* Don't treat error as fatal we potentially |
1186 | * already have a NGUID or EUI-64 | 1312 | * already have a NGUID or EUI-64 |
1187 | */ | 1313 | */ |
1188 | if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) | 1314 | if (nvme_identify_ns_descs(ctrl, nsid, ids)) |
1189 | dev_warn(ctrl->device, | 1315 | dev_warn(ctrl->device, |
1190 | "%s: Identify Descriptors failed\n", __func__); | 1316 | "%s: Identify Descriptors failed\n", __func__); |
1191 | } | 1317 | } |
1192 | } | 1318 | } |
1193 | 1319 | ||
1320 | static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) | ||
1321 | { | ||
1322 | return !uuid_is_null(&ids->uuid) || | ||
1323 | memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || | ||
1324 | memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); | ||
1325 | } | ||
1326 | |||
1327 | static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) | ||
1328 | { | ||
1329 | return uuid_equal(&a->uuid, &b->uuid) && | ||
1330 | memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && | ||
1331 | memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; | ||
1332 | } | ||
1333 | |||
1334 | static void nvme_update_disk_info(struct gendisk *disk, | ||
1335 | struct nvme_ns *ns, struct nvme_id_ns *id) | ||
1336 | { | ||
1337 | sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); | ||
1338 | unsigned stream_alignment = 0; | ||
1339 | |||
1340 | if (ns->ctrl->nr_streams && ns->sws && ns->sgs) | ||
1341 | stream_alignment = ns->sws * ns->sgs; | ||
1342 | |||
1343 | blk_mq_freeze_queue(disk->queue); | ||
1344 | blk_integrity_unregister(disk); | ||
1345 | |||
1346 | blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); | ||
1347 | if (ns->ms && !ns->ext && | ||
1348 | (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) | ||
1349 | nvme_init_integrity(disk, ns->ms, ns->pi_type); | ||
1350 | if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) | ||
1351 | capacity = 0; | ||
1352 | set_capacity(disk, capacity); | ||
1353 | |||
1354 | if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) | ||
1355 | nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); | ||
1356 | blk_mq_unfreeze_queue(disk->queue); | ||
1357 | } | ||
1358 | |||
1194 | static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) | 1359 | static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) |
1195 | { | 1360 | { |
1196 | struct nvme_ns *ns = disk->private_data; | 1361 | struct nvme_ns *ns = disk->private_data; |
1197 | struct nvme_ctrl *ctrl = ns->ctrl; | ||
1198 | u16 bs; | ||
1199 | 1362 | ||
1200 | /* | 1363 | /* |
1201 | * If identify namespace failed, use default 512 byte block size so | 1364 | * If identify namespace failed, use default 512 byte block size so |
@@ -1204,26 +1367,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) | |||
1204 | ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; | 1367 | ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; |
1205 | if (ns->lba_shift == 0) | 1368 | if (ns->lba_shift == 0) |
1206 | ns->lba_shift = 9; | 1369 | ns->lba_shift = 9; |
1207 | bs = 1 << ns->lba_shift; | ||
1208 | ns->noiob = le16_to_cpu(id->noiob); | 1370 | ns->noiob = le16_to_cpu(id->noiob); |
1371 | ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); | ||
1372 | ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); | ||
1373 | /* the PI implementation requires metadata equal t10 pi tuple size */ | ||
1374 | if (ns->ms == sizeof(struct t10_pi_tuple)) | ||
1375 | ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; | ||
1376 | else | ||
1377 | ns->pi_type = 0; | ||
1209 | 1378 | ||
1210 | blk_mq_freeze_queue(disk->queue); | ||
1211 | |||
1212 | if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) | ||
1213 | nvme_prep_integrity(disk, id, bs); | ||
1214 | blk_queue_logical_block_size(ns->queue, bs); | ||
1215 | if (ns->noiob) | 1379 | if (ns->noiob) |
1216 | nvme_set_chunk_size(ns); | 1380 | nvme_set_chunk_size(ns); |
1217 | if (ns->ms && !blk_get_integrity(disk) && !ns->ext) | 1381 | nvme_update_disk_info(disk, ns, id); |
1218 | nvme_init_integrity(ns); | 1382 | #ifdef CONFIG_NVME_MULTIPATH |
1219 | if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) | 1383 | if (ns->head->disk) |
1220 | set_capacity(disk, 0); | 1384 | nvme_update_disk_info(ns->head->disk, ns, id); |
1221 | else | 1385 | #endif |
1222 | set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); | ||
1223 | |||
1224 | if (ctrl->oncs & NVME_CTRL_ONCS_DSM) | ||
1225 | nvme_config_discard(ns); | ||
1226 | blk_mq_unfreeze_queue(disk->queue); | ||
1227 | } | 1386 | } |
1228 | 1387 | ||
1229 | static int nvme_revalidate_disk(struct gendisk *disk) | 1388 | static int nvme_revalidate_disk(struct gendisk *disk) |
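The eui64/nguid/uuid trio is folded into struct nvme_ns_ids, and revalidation now fails with -ENODEV when nvme_ns_ids_equal() says the identity behind an NSID changed. These are the same identifiers already exported per namespace in sysfs; a hedged userspace sketch that dumps them, assuming a namespace block node named nvme0n1 (identifiers a device does not implement are simply absent or unreadable):

        #include <stdio.h>

        int main(void)
        {
                const char *attrs[] = { "wwid", "nsid", "uuid", "nguid", "eui" };
                char path[96], buf[128];

                for (int i = 0; i < 5; i++) {
                        snprintf(path, sizeof(path),
                                 "/sys/block/nvme0n1/%s", attrs[i]);
                        FILE *f = fopen(path, "r");
                        if (!f)
                                continue;
                        if (fgets(buf, sizeof(buf), f))
                                printf("%-6s %s", attrs[i], buf);
                        fclose(f);
                }
                return 0;
        }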
@@ -1231,8 +1390,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) | |||
1231 | struct nvme_ns *ns = disk->private_data; | 1390 | struct nvme_ns *ns = disk->private_data; |
1232 | struct nvme_ctrl *ctrl = ns->ctrl; | 1391 | struct nvme_ctrl *ctrl = ns->ctrl; |
1233 | struct nvme_id_ns *id; | 1392 | struct nvme_id_ns *id; |
1234 | u8 eui64[8] = { 0 }, nguid[16] = { 0 }; | 1393 | struct nvme_ns_ids ids; |
1235 | uuid_t uuid = uuid_null; | ||
1236 | int ret = 0; | 1394 | int ret = 0; |
1237 | 1395 | ||
1238 | if (test_bit(NVME_NS_DEAD, &ns->flags)) { | 1396 | if (test_bit(NVME_NS_DEAD, &ns->flags)) { |
@@ -1240,7 +1398,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) | |||
1240 | return -ENODEV; | 1398 | return -ENODEV; |
1241 | } | 1399 | } |
1242 | 1400 | ||
1243 | id = nvme_identify_ns(ctrl, ns->ns_id); | 1401 | id = nvme_identify_ns(ctrl, ns->head->ns_id); |
1244 | if (!id) | 1402 | if (!id) |
1245 | return -ENODEV; | 1403 | return -ENODEV; |
1246 | 1404 | ||
@@ -1250,12 +1408,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) | |||
1250 | } | 1408 | } |
1251 | 1409 | ||
1252 | __nvme_revalidate_disk(disk, id); | 1410 | __nvme_revalidate_disk(disk, id); |
1253 | nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); | 1411 | nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); |
1254 | if (!uuid_equal(&ns->uuid, &uuid) || | 1412 | if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { |
1255 | memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || | ||
1256 | memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { | ||
1257 | dev_err(ctrl->device, | 1413 | dev_err(ctrl->device, |
1258 | "identifiers changed for nsid %d\n", ns->ns_id); | 1414 | "identifiers changed for nsid %d\n", ns->head->ns_id); |
1259 | ret = -ENODEV; | 1415 | ret = -ENODEV; |
1260 | } | 1416 | } |
1261 | 1417 | ||
@@ -1287,8 +1443,10 @@ static char nvme_pr_type(enum pr_type type) | |||
1287 | static int nvme_pr_command(struct block_device *bdev, u32 cdw10, | 1443 | static int nvme_pr_command(struct block_device *bdev, u32 cdw10, |
1288 | u64 key, u64 sa_key, u8 op) | 1444 | u64 key, u64 sa_key, u8 op) |
1289 | { | 1445 | { |
1290 | struct nvme_ns *ns = bdev->bd_disk->private_data; | 1446 | struct nvme_ns_head *head = NULL; |
1447 | struct nvme_ns *ns; | ||
1291 | struct nvme_command c; | 1448 | struct nvme_command c; |
1449 | int srcu_idx, ret; | ||
1292 | u8 data[16] = { 0, }; | 1450 | u8 data[16] = { 0, }; |
1293 | 1451 | ||
1294 | put_unaligned_le64(key, &data[0]); | 1452 | put_unaligned_le64(key, &data[0]); |
@@ -1296,10 +1454,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, | |||
1296 | 1454 | ||
1297 | memset(&c, 0, sizeof(c)); | 1455 | memset(&c, 0, sizeof(c)); |
1298 | c.common.opcode = op; | 1456 | c.common.opcode = op; |
1299 | c.common.nsid = cpu_to_le32(ns->ns_id); | 1457 | c.common.nsid = cpu_to_le32(head->ns_id); |
1300 | c.common.cdw10[0] = cpu_to_le32(cdw10); | 1458 | c.common.cdw10[0] = cpu_to_le32(cdw10); |
1301 | 1459 | ||
1302 | return nvme_submit_sync_cmd(ns->queue, &c, data, 16); | 1460 | ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); |
1461 | if (unlikely(!ns)) | ||
1462 | ret = -EWOULDBLOCK; | ||
1463 | else | ||
1464 | ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); | ||
1465 | nvme_put_ns_from_disk(head, srcu_idx); | ||
1466 | return ret; | ||
1303 | } | 1467 | } |
1304 | 1468 | ||
1305 | static int nvme_pr_register(struct block_device *bdev, u64 old, | 1469 | static int nvme_pr_register(struct block_device *bdev, u64 old, |
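nvme_pr_command() is reached through the generic block-layer persistent-reservation ops, and with multipath it now picks a path via nvme_get_ns_from_disk() under SRCU instead of trusting disk->private_data. Userspace keeps using the ordinary pr ioctls; a minimal sketch registering a reservation key, with the key value and device path chosen only for illustration:

        #include <stdio.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/pr.h>

        int main(void)
        {
                struct pr_registration reg;
                int fd = open("/dev/nvme0n1", O_RDWR);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                memset(&reg, 0, sizeof(reg));
                reg.new_key = 0x123456789abcdef0ULL;    /* our reservation key */
                if (ioctl(fd, IOC_PR_REGISTER, &reg))
                        perror("IOC_PR_REGISTER");
                close(fd);
                return 0;
        }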
@@ -1381,7 +1545,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); | |||
1381 | static const struct block_device_operations nvme_fops = { | 1545 | static const struct block_device_operations nvme_fops = { |
1382 | .owner = THIS_MODULE, | 1546 | .owner = THIS_MODULE, |
1383 | .ioctl = nvme_ioctl, | 1547 | .ioctl = nvme_ioctl, |
1384 | .compat_ioctl = nvme_compat_ioctl, | 1548 | .compat_ioctl = nvme_ioctl, |
1385 | .open = nvme_open, | 1549 | .open = nvme_open, |
1386 | .release = nvme_release, | 1550 | .release = nvme_release, |
1387 | .getgeo = nvme_getgeo, | 1551 | .getgeo = nvme_getgeo, |
@@ -1389,6 +1553,32 @@ static const struct block_device_operations nvme_fops = { | |||
1389 | .pr_ops = &nvme_pr_ops, | 1553 | .pr_ops = &nvme_pr_ops, |
1390 | }; | 1554 | }; |
1391 | 1555 | ||
1556 | #ifdef CONFIG_NVME_MULTIPATH | ||
1557 | static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) | ||
1558 | { | ||
1559 | struct nvme_ns_head *head = bdev->bd_disk->private_data; | ||
1560 | |||
1561 | if (!kref_get_unless_zero(&head->ref)) | ||
1562 | return -ENXIO; | ||
1563 | return 0; | ||
1564 | } | ||
1565 | |||
1566 | static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) | ||
1567 | { | ||
1568 | nvme_put_ns_head(disk->private_data); | ||
1569 | } | ||
1570 | |||
1571 | const struct block_device_operations nvme_ns_head_ops = { | ||
1572 | .owner = THIS_MODULE, | ||
1573 | .open = nvme_ns_head_open, | ||
1574 | .release = nvme_ns_head_release, | ||
1575 | .ioctl = nvme_ioctl, | ||
1576 | .compat_ioctl = nvme_ioctl, | ||
1577 | .getgeo = nvme_getgeo, | ||
1578 | .pr_ops = &nvme_pr_ops, | ||
1579 | }; | ||
1580 | #endif /* CONFIG_NVME_MULTIPATH */ | ||
1581 | |||
1392 | static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) | 1582 | static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) |
1393 | { | 1583 | { |
1394 | unsigned long timeout = | 1584 | unsigned long timeout = |
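With CONFIG_NVME_MULTIPATH the nvme_ns_head_ops disk is the only node userspace sees for a shared namespace; the per-controller gendisks stay hidden (GENHD_FL_HIDDEN). On kernels of this vintage where those hidden disks are linked into sysfs as slaves of the head node, they can be enumerated from userspace. A hedged sketch; the nvme0n1 path is illustrative, and the directory is empty or absent on non-multipath setups:

        #include <stdio.h>
        #include <dirent.h>

        int main(void)
        {
                DIR *d = opendir("/sys/block/nvme0n1/slaves");
                struct dirent *de;

                if (!d) {
                        perror("opendir");
                        return 1;
                }
                while ((de = readdir(d)))
                        if (de->d_name[0] != '.')
                                printf("path device: %s\n", de->d_name);
                closedir(d);
                return 0;
        }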
@@ -1737,14 +1927,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, | |||
1737 | string_matches(id->fr, q->fr, sizeof(id->fr)); | 1927 | string_matches(id->fr, q->fr, sizeof(id->fr)); |
1738 | } | 1928 | } |
1739 | 1929 | ||
1740 | static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | 1930 | static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, |
1931 | struct nvme_id_ctrl *id) | ||
1741 | { | 1932 | { |
1742 | size_t nqnlen; | 1933 | size_t nqnlen; |
1743 | int off; | 1934 | int off; |
1744 | 1935 | ||
1745 | nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); | 1936 | nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); |
1746 | if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { | 1937 | if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { |
1747 | strcpy(ctrl->subnqn, id->subnqn); | 1938 | strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); |
1748 | return; | 1939 | return; |
1749 | } | 1940 | } |
1750 | 1941 | ||
@@ -1752,14 +1943,222 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | |||
1752 | dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); | 1943 | dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); |
1753 | 1944 | ||
1754 | /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ | 1945 | /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ |
1755 | off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, | 1946 | off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, |
1756 | "nqn.2014.08.org.nvmexpress:%4x%4x", | 1947 | "nqn.2014.08.org.nvmexpress:%4x%4x", |
1757 | le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); | 1948 | le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); |
1758 | memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); | 1949 | memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); |
1759 | off += sizeof(id->sn); | 1950 | off += sizeof(id->sn); |
1760 | memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); | 1951 | memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); |
1761 | off += sizeof(id->mn); | 1952 | off += sizeof(id->mn); |
1762 | memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); | 1953 | memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); |
1954 | } | ||
1955 | |||
1956 | static void __nvme_release_subsystem(struct nvme_subsystem *subsys) | ||
1957 | { | ||
1958 | ida_simple_remove(&nvme_subsystems_ida, subsys->instance); | ||
1959 | kfree(subsys); | ||
1960 | } | ||
1961 | |||
1962 | static void nvme_release_subsystem(struct device *dev) | ||
1963 | { | ||
1964 | __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); | ||
1965 | } | ||
1966 | |||
1967 | static void nvme_destroy_subsystem(struct kref *ref) | ||
1968 | { | ||
1969 | struct nvme_subsystem *subsys = | ||
1970 | container_of(ref, struct nvme_subsystem, ref); | ||
1971 | |||
1972 | mutex_lock(&nvme_subsystems_lock); | ||
1973 | list_del(&subsys->entry); | ||
1974 | mutex_unlock(&nvme_subsystems_lock); | ||
1975 | |||
1976 | ida_destroy(&subsys->ns_ida); | ||
1977 | device_del(&subsys->dev); | ||
1978 | put_device(&subsys->dev); | ||
1979 | } | ||
1980 | |||
1981 | static void nvme_put_subsystem(struct nvme_subsystem *subsys) | ||
1982 | { | ||
1983 | kref_put(&subsys->ref, nvme_destroy_subsystem); | ||
1984 | } | ||
1985 | |||
1986 | static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) | ||
1987 | { | ||
1988 | struct nvme_subsystem *subsys; | ||
1989 | |||
1990 | lockdep_assert_held(&nvme_subsystems_lock); | ||
1991 | |||
1992 | list_for_each_entry(subsys, &nvme_subsystems, entry) { | ||
1993 | if (strcmp(subsys->subnqn, subsysnqn)) | ||
1994 | continue; | ||
1995 | if (!kref_get_unless_zero(&subsys->ref)) | ||
1996 | continue; | ||
1997 | return subsys; | ||
1998 | } | ||
1999 | |||
2000 | return NULL; | ||
2001 | } | ||
2002 | |||
2003 | #define SUBSYS_ATTR_RO(_name, _mode, _show) \ | ||
2004 | struct device_attribute subsys_attr_##_name = \ | ||
2005 | __ATTR(_name, _mode, _show, NULL) | ||
2006 | |||
2007 | static ssize_t nvme_subsys_show_nqn(struct device *dev, | ||
2008 | struct device_attribute *attr, | ||
2009 | char *buf) | ||
2010 | { | ||
2011 | struct nvme_subsystem *subsys = | ||
2012 | container_of(dev, struct nvme_subsystem, dev); | ||
2013 | |||
2014 | return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); | ||
2015 | } | ||
2016 | static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); | ||
2017 | |||
2018 | #define nvme_subsys_show_str_function(field) \ | ||
2019 | static ssize_t subsys_##field##_show(struct device *dev, \ | ||
2020 | struct device_attribute *attr, char *buf) \ | ||
2021 | { \ | ||
2022 | struct nvme_subsystem *subsys = \ | ||
2023 | container_of(dev, struct nvme_subsystem, dev); \ | ||
2024 | return sprintf(buf, "%.*s\n", \ | ||
2025 | (int)sizeof(subsys->field), subsys->field); \ | ||
2026 | } \ | ||
2027 | static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); | ||
2028 | |||
2029 | nvme_subsys_show_str_function(model); | ||
2030 | nvme_subsys_show_str_function(serial); | ||
2031 | nvme_subsys_show_str_function(firmware_rev); | ||
2032 | |||
2033 | static struct attribute *nvme_subsys_attrs[] = { | ||
2034 | &subsys_attr_model.attr, | ||
2035 | &subsys_attr_serial.attr, | ||
2036 | &subsys_attr_firmware_rev.attr, | ||
2037 | &subsys_attr_subsysnqn.attr, | ||
2038 | NULL, | ||
2039 | }; | ||
2040 | |||
2041 | static struct attribute_group nvme_subsys_attrs_group = { | ||
2042 | .attrs = nvme_subsys_attrs, | ||
2043 | }; | ||
2044 | |||
2045 | static const struct attribute_group *nvme_subsys_attrs_groups[] = { | ||
2046 | &nvme_subsys_attrs_group, | ||
2047 | NULL, | ||
2048 | }; | ||
2049 | |||
2050 | static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | ||
2051 | { | ||
2052 | struct nvme_subsystem *subsys, *found; | ||
2053 | int ret; | ||
2054 | |||
2055 | subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); | ||
2056 | if (!subsys) | ||
2057 | return -ENOMEM; | ||
2058 | ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); | ||
2059 | if (ret < 0) { | ||
2060 | kfree(subsys); | ||
2061 | return ret; | ||
2062 | } | ||
2063 | subsys->instance = ret; | ||
2064 | mutex_init(&subsys->lock); | ||
2065 | kref_init(&subsys->ref); | ||
2066 | INIT_LIST_HEAD(&subsys->ctrls); | ||
2067 | INIT_LIST_HEAD(&subsys->nsheads); | ||
2068 | nvme_init_subnqn(subsys, ctrl, id); | ||
2069 | memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); | ||
2070 | memcpy(subsys->model, id->mn, sizeof(subsys->model)); | ||
2071 | memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); | ||
2072 | subsys->vendor_id = le16_to_cpu(id->vid); | ||
2073 | subsys->cmic = id->cmic; | ||
2074 | |||
2075 | subsys->dev.class = nvme_subsys_class; | ||
2076 | subsys->dev.release = nvme_release_subsystem; | ||
2077 | subsys->dev.groups = nvme_subsys_attrs_groups; | ||
2078 | dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); | ||
2079 | device_initialize(&subsys->dev); | ||
2080 | |||
2081 | mutex_lock(&nvme_subsystems_lock); | ||
2082 | found = __nvme_find_get_subsystem(subsys->subnqn); | ||
2083 | if (found) { | ||
2084 | /* | ||
2085 | * Verify that the subsystem actually supports multiple | ||
2086 | * controllers, else bail out. | ||
2087 | */ | ||
2088 | if (!(id->cmic & (1 << 1))) { | ||
2089 | dev_err(ctrl->device, | ||
2090 | "ignoring ctrl due to duplicate subnqn (%s).\n", | ||
2091 | found->subnqn); | ||
2092 | nvme_put_subsystem(found); | ||
2093 | ret = -EINVAL; | ||
2094 | goto out_unlock; | ||
2095 | } | ||
2096 | |||
2097 | __nvme_release_subsystem(subsys); | ||
2098 | subsys = found; | ||
2099 | } else { | ||
2100 | ret = device_add(&subsys->dev); | ||
2101 | if (ret) { | ||
2102 | dev_err(ctrl->device, | ||
2103 | "failed to register subsystem device.\n"); | ||
2104 | goto out_unlock; | ||
2105 | } | ||
2106 | ida_init(&subsys->ns_ida); | ||
2107 | list_add_tail(&subsys->entry, &nvme_subsystems); | ||
2108 | } | ||
2109 | |||
2110 | ctrl->subsys = subsys; | ||
2111 | mutex_unlock(&nvme_subsystems_lock); | ||
2112 | |||
2113 | if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, | ||
2114 | dev_name(ctrl->device))) { | ||
2115 | dev_err(ctrl->device, | ||
2116 | "failed to create sysfs link from subsystem.\n"); | ||
2117 | /* the transport driver will eventually put the subsystem */ | ||
2118 | return -EINVAL; | ||
2119 | } | ||
2120 | |||
2121 | mutex_lock(&subsys->lock); | ||
2122 | list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); | ||
2123 | mutex_unlock(&subsys->lock); | ||
2124 | |||
2125 | return 0; | ||
2126 | |||
2127 | out_unlock: | ||
2128 | mutex_unlock(&nvme_subsystems_lock); | ||
2129 | put_device(&subsys->dev); | ||
2130 | return ret; | ||
2131 | } | ||
2132 | |||
2133 | static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, | ||
2134 | size_t size) | ||
2135 | { | ||
2136 | struct nvme_command c = { }; | ||
2137 | |||
2138 | c.common.opcode = nvme_admin_get_log_page; | ||
2139 | c.common.nsid = cpu_to_le32(NVME_NSID_ALL); | ||
2140 | c.common.cdw10[0] = nvme_get_log_dw10(log_page, size); | ||
2141 | |||
2142 | return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); | ||
2143 | } | ||
2144 | |||
2145 | static int nvme_get_effects_log(struct nvme_ctrl *ctrl) | ||
2146 | { | ||
2147 | int ret; | ||
2148 | |||
2149 | if (!ctrl->effects) | ||
2150 | ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); | ||
2151 | |||
2152 | if (!ctrl->effects) | ||
2153 | return 0; | ||
2154 | |||
2155 | ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, | ||
2156 | sizeof(*ctrl->effects)); | ||
2157 | if (ret) { | ||
2158 | kfree(ctrl->effects); | ||
2159 | ctrl->effects = NULL; | ||
2160 | } | ||
2161 | return ret; | ||
1763 | } | 2162 | } |
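The new nvme_get_log() helper above packs the log identifier and a zero-based dword count into CDW10 of the Get Log Page command. The field layout below follows the NVMe specification; the helper name and the exact kernel macro are assumptions, shown only to make the encoding concrete.

    #include <stdint.h>
    #include <stddef.h>

    /* Get Log Page, CDW10: bits 7:0 = Log Page Identifier,
     * bits 31:16 = number of dwords to transfer, zero-based
     * (very large transfers also use CDW11, not shown here). */
    static inline uint32_t demo_get_log_cdw10(uint8_t lid, size_t bytes)
    {
            uint32_t numd = (uint32_t)(bytes / 4) - 1;

            return (numd << 16) | lid;
    }

    /* Example: the 4096-byte Commands Supported and Effects log
     * (LID 0x05) gives numd = 1023, so CDW10 = 0x03ff0005. */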
1764 | 2163 | ||
1765 | /* | 2164 | /* |
@@ -1797,9 +2196,19 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
1797 | return -EIO; | 2196 | return -EIO; |
1798 | } | 2197 | } |
1799 | 2198 | ||
1800 | nvme_init_subnqn(ctrl, id); | 2199 | if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { |
2200 | ret = nvme_get_effects_log(ctrl); | ||
2201 | if (ret < 0) | ||
2202 | return ret; | ||
2203 | } | ||
1801 | 2204 | ||
1802 | if (!ctrl->identified) { | 2205 | if (!ctrl->identified) { |
2206 | int i; | ||
2207 | |||
2208 | ret = nvme_init_subsystem(ctrl, id); | ||
2209 | if (ret) | ||
2210 | goto out_free; | ||
2211 | |||
1803 | /* | 2212 | /* |
1804 | * Check for quirks. Quirk can depend on firmware version, | 2213 | * Check for quirks. Quirk can depend on firmware version, |
1805 | * so, in principle, the set of quirks present can change | 2214 | * so, in principle, the set of quirks present can change |
@@ -1808,9 +2217,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
1808 | * the device, but we'd have to make sure that the driver | 2217 | * the device, but we'd have to make sure that the driver |
1809 | * behaves intelligently if the quirks change. | 2218 | * behaves intelligently if the quirks change. |
1810 | */ | 2219 | */ |
1811 | |||
1812 | int i; | ||
1813 | |||
1814 | for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { | 2220 | for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { |
1815 | if (quirk_matches(id, &core_quirks[i])) | 2221 | if (quirk_matches(id, &core_quirks[i])) |
1816 | ctrl->quirks |= core_quirks[i].quirks; | 2222 | ctrl->quirks |= core_quirks[i].quirks; |
@@ -1823,14 +2229,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
1823 | } | 2229 | } |
1824 | 2230 | ||
1825 | ctrl->oacs = le16_to_cpu(id->oacs); | 2231 | ctrl->oacs = le16_to_cpu(id->oacs); |
1826 | ctrl->vid = le16_to_cpu(id->vid); | ||
1827 | ctrl->oncs = le16_to_cpup(&id->oncs); | 2232 | ctrl->oncs = le16_to_cpup(&id->oncs); |
1828 | atomic_set(&ctrl->abort_limit, id->acl + 1); | 2233 | atomic_set(&ctrl->abort_limit, id->acl + 1); |
1829 | ctrl->vwc = id->vwc; | 2234 | ctrl->vwc = id->vwc; |
1830 | ctrl->cntlid = le16_to_cpup(&id->cntlid); | 2235 | ctrl->cntlid = le16_to_cpup(&id->cntlid); |
1831 | memcpy(ctrl->serial, id->sn, sizeof(id->sn)); | ||
1832 | memcpy(ctrl->model, id->mn, sizeof(id->mn)); | ||
1833 | memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); | ||
1834 | if (id->mdts) | 2236 | if (id->mdts) |
1835 | max_hw_sectors = 1 << (id->mdts + page_shift - 9); | 2237 | max_hw_sectors = 1 << (id->mdts + page_shift - 9); |
1836 | else | 2238 | else |
@@ -1931,33 +2333,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify); | |||
1931 | 2333 | ||
1932 | static int nvme_dev_open(struct inode *inode, struct file *file) | 2334 | static int nvme_dev_open(struct inode *inode, struct file *file) |
1933 | { | 2335 | { |
1934 | struct nvme_ctrl *ctrl; | 2336 | struct nvme_ctrl *ctrl = |
1935 | int instance = iminor(inode); | 2337 | container_of(inode->i_cdev, struct nvme_ctrl, cdev); |
1936 | int ret = -ENODEV; | ||
1937 | |||
1938 | spin_lock(&dev_list_lock); | ||
1939 | list_for_each_entry(ctrl, &nvme_ctrl_list, node) { | ||
1940 | if (ctrl->instance != instance) | ||
1941 | continue; | ||
1942 | |||
1943 | if (!ctrl->admin_q) { | ||
1944 | ret = -EWOULDBLOCK; | ||
1945 | break; | ||
1946 | } | ||
1947 | if (!kref_get_unless_zero(&ctrl->kref)) | ||
1948 | break; | ||
1949 | file->private_data = ctrl; | ||
1950 | ret = 0; | ||
1951 | break; | ||
1952 | } | ||
1953 | spin_unlock(&dev_list_lock); | ||
1954 | |||
1955 | return ret; | ||
1956 | } | ||
1957 | 2338 | ||
1958 | static int nvme_dev_release(struct inode *inode, struct file *file) | 2339 | if (ctrl->state != NVME_CTRL_LIVE) |
1959 | { | 2340 | return -EWOULDBLOCK; |
1960 | nvme_put_ctrl(file->private_data); | 2341 | file->private_data = ctrl; |
1961 | return 0; | 2342 | return 0; |
1962 | } | 2343 | } |
1963 | 2344 | ||
@@ -2021,7 +2402,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, | |||
2021 | static const struct file_operations nvme_dev_fops = { | 2402 | static const struct file_operations nvme_dev_fops = { |
2022 | .owner = THIS_MODULE, | 2403 | .owner = THIS_MODULE, |
2023 | .open = nvme_dev_open, | 2404 | .open = nvme_dev_open, |
2024 | .release = nvme_dev_release, | ||
2025 | .unlocked_ioctl = nvme_dev_ioctl, | 2405 | .unlocked_ioctl = nvme_dev_ioctl, |
2026 | .compat_ioctl = nvme_dev_ioctl, | 2406 | .compat_ioctl = nvme_dev_ioctl, |
2027 | }; | 2407 | }; |
@@ -2051,77 +2431,86 @@ static ssize_t nvme_sysfs_rescan(struct device *dev, | |||
2051 | } | 2431 | } |
2052 | static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); | 2432 | static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); |
2053 | 2433 | ||
2434 | static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) | ||
2435 | { | ||
2436 | struct gendisk *disk = dev_to_disk(dev); | ||
2437 | |||
2438 | if (disk->fops == &nvme_fops) | ||
2439 | return nvme_get_ns_from_dev(dev)->head; | ||
2440 | else | ||
2441 | return disk->private_data; | ||
2442 | } | ||
2443 | |||
2054 | static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, | 2444 | static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, |
2055 | char *buf) | 2445 | char *buf) |
2056 | { | 2446 | { |
2057 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2447 | struct nvme_ns_head *head = dev_to_ns_head(dev); |
2058 | struct nvme_ctrl *ctrl = ns->ctrl; | 2448 | struct nvme_ns_ids *ids = &head->ids; |
2059 | int serial_len = sizeof(ctrl->serial); | 2449 | struct nvme_subsystem *subsys = head->subsys; |
2060 | int model_len = sizeof(ctrl->model); | 2450 | int serial_len = sizeof(subsys->serial); |
2451 | int model_len = sizeof(subsys->model); | ||
2061 | 2452 | ||
2062 | if (!uuid_is_null(&ns->uuid)) | 2453 | if (!uuid_is_null(&ids->uuid)) |
2063 | return sprintf(buf, "uuid.%pU\n", &ns->uuid); | 2454 | return sprintf(buf, "uuid.%pU\n", &ids->uuid); |
2064 | 2455 | ||
2065 | if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) | 2456 | if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) |
2066 | return sprintf(buf, "eui.%16phN\n", ns->nguid); | 2457 | return sprintf(buf, "eui.%16phN\n", ids->nguid); |
2067 | 2458 | ||
2068 | if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) | 2459 | if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) |
2069 | return sprintf(buf, "eui.%8phN\n", ns->eui); | 2460 | return sprintf(buf, "eui.%8phN\n", ids->eui64); |
2070 | 2461 | ||
2071 | while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' || | 2462 | while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || |
2072 | ctrl->serial[serial_len - 1] == '\0')) | 2463 | subsys->serial[serial_len - 1] == '\0')) |
2073 | serial_len--; | 2464 | serial_len--; |
2074 | while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' || | 2465 | while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || |
2075 | ctrl->model[model_len - 1] == '\0')) | 2466 | subsys->model[model_len - 1] == '\0')) |
2076 | model_len--; | 2467 | model_len--; |
2077 | 2468 | ||
2078 | return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, | 2469 | return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, |
2079 | serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); | 2470 | serial_len, subsys->serial, model_len, subsys->model, |
2471 | head->ns_id); | ||
2080 | } | 2472 | } |
2081 | static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); | 2473 | static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); |
2082 | 2474 | ||
2083 | static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, | 2475 | static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, |
2084 | char *buf) | 2476 | char *buf) |
2085 | { | 2477 | { |
2086 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2478 | return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); |
2087 | return sprintf(buf, "%pU\n", ns->nguid); | ||
2088 | } | 2479 | } |
2089 | static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); | 2480 | static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); |
2090 | 2481 | ||
2091 | static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, | 2482 | static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, |
2092 | char *buf) | 2483 | char *buf) |
2093 | { | 2484 | { |
2094 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2485 | struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; |
2095 | 2486 | ||
2096 | /* For backward compatibility expose the NGUID to userspace if | 2487 | /* For backward compatibility expose the NGUID to userspace if |
2097 | * we have no UUID set | 2488 | * we have no UUID set |
2098 | */ | 2489 | */ |
2099 | if (uuid_is_null(&ns->uuid)) { | 2490 | if (uuid_is_null(&ids->uuid)) { |
2100 | printk_ratelimited(KERN_WARNING | 2491 | printk_ratelimited(KERN_WARNING |
2101 | "No UUID available providing old NGUID\n"); | 2492 | "No UUID available providing old NGUID\n"); |
2102 | return sprintf(buf, "%pU\n", ns->nguid); | 2493 | return sprintf(buf, "%pU\n", ids->nguid); |
2103 | } | 2494 | } |
2104 | return sprintf(buf, "%pU\n", &ns->uuid); | 2495 | return sprintf(buf, "%pU\n", &ids->uuid); |
2105 | } | 2496 | } |
2106 | static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); | 2497 | static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); |
2107 | 2498 | ||
2108 | static ssize_t eui_show(struct device *dev, struct device_attribute *attr, | 2499 | static ssize_t eui_show(struct device *dev, struct device_attribute *attr, |
2109 | char *buf) | 2500 | char *buf) |
2110 | { | 2501 | { |
2111 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2502 | return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); |
2112 | return sprintf(buf, "%8phd\n", ns->eui); | ||
2113 | } | 2503 | } |
2114 | static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); | 2504 | static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); |
2115 | 2505 | ||
2116 | static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, | 2506 | static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, |
2117 | char *buf) | 2507 | char *buf) |
2118 | { | 2508 | { |
2119 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2509 | return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); |
2120 | return sprintf(buf, "%d\n", ns->ns_id); | ||
2121 | } | 2510 | } |
2122 | static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); | 2511 | static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); |
2123 | 2512 | ||
2124 | static struct attribute *nvme_ns_attrs[] = { | 2513 | static struct attribute *nvme_ns_id_attrs[] = { |
2125 | &dev_attr_wwid.attr, | 2514 | &dev_attr_wwid.attr, |
2126 | &dev_attr_uuid.attr, | 2515 | &dev_attr_uuid.attr, |
2127 | &dev_attr_nguid.attr, | 2516 | &dev_attr_nguid.attr, |
@@ -2130,31 +2519,31 @@ static struct attribute *nvme_ns_attrs[] = { | |||
2130 | NULL, | 2519 | NULL, |
2131 | }; | 2520 | }; |
2132 | 2521 | ||
2133 | static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, | 2522 | static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, |
2134 | struct attribute *a, int n) | 2523 | struct attribute *a, int n) |
2135 | { | 2524 | { |
2136 | struct device *dev = container_of(kobj, struct device, kobj); | 2525 | struct device *dev = container_of(kobj, struct device, kobj); |
2137 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | 2526 | struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; |
2138 | 2527 | ||
2139 | if (a == &dev_attr_uuid.attr) { | 2528 | if (a == &dev_attr_uuid.attr) { |
2140 | if (uuid_is_null(&ns->uuid) && | 2529 | if (uuid_is_null(&ids->uuid) && |
2141 | !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) | 2530 | !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) |
2142 | return 0; | 2531 | return 0; |
2143 | } | 2532 | } |
2144 | if (a == &dev_attr_nguid.attr) { | 2533 | if (a == &dev_attr_nguid.attr) { |
2145 | if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) | 2534 | if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) |
2146 | return 0; | 2535 | return 0; |
2147 | } | 2536 | } |
2148 | if (a == &dev_attr_eui.attr) { | 2537 | if (a == &dev_attr_eui.attr) { |
2149 | if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) | 2538 | if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) |
2150 | return 0; | 2539 | return 0; |
2151 | } | 2540 | } |
2152 | return a->mode; | 2541 | return a->mode; |
2153 | } | 2542 | } |
2154 | 2543 | ||
2155 | static const struct attribute_group nvme_ns_attr_group = { | 2544 | const struct attribute_group nvme_ns_id_attr_group = { |
2156 | .attrs = nvme_ns_attrs, | 2545 | .attrs = nvme_ns_id_attrs, |
2157 | .is_visible = nvme_ns_attrs_are_visible, | 2546 | .is_visible = nvme_ns_id_attrs_are_visible, |
2158 | }; | 2547 | }; |
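As a usage note on the group registered above: when sysfs_create_group() installs a group, the .is_visible callback is consulted per attribute and a return of 0 suppresses that file entirely. A minimal, hypothetical group showing the idiom (the uuid attribute and helper are placeholders, not driver code):

    #include <linux/device.h>
    #include <linux/sysfs.h>

    extern struct device_attribute dev_attr_uuid;   /* hypothetical */
    extern struct attribute *demo_attrs[];          /* NULL-terminated */
    extern bool demo_dev_has_uuid(struct device *dev);

    static umode_t demo_attrs_visible(struct kobject *kobj,
                                      struct attribute *a, int n)
    {
            struct device *dev = container_of(kobj, struct device, kobj);

            if (a == &dev_attr_uuid.attr && !demo_dev_has_uuid(dev))
                    return 0;               /* file is not created at all */
            return a->mode;                 /* keep the declared permissions */
    }

    static const struct attribute_group demo_group = {
            .attrs          = demo_attrs,
            .is_visible     = demo_attrs_visible,
    };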
2159 | 2548 | ||
2160 | #define nvme_show_str_function(field) \ | 2549 | #define nvme_show_str_function(field) \ |
@@ -2162,10 +2551,15 @@ static ssize_t field##_show(struct device *dev, \ | |||
2162 | struct device_attribute *attr, char *buf) \ | 2551 | struct device_attribute *attr, char *buf) \ |
2163 | { \ | 2552 | { \ |
2164 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ | 2553 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ |
2165 | return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ | 2554 | return sprintf(buf, "%.*s\n", \ |
2555 | (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ | ||
2166 | } \ | 2556 | } \ |
2167 | static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); | 2557 | static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); |
2168 | 2558 | ||
2559 | nvme_show_str_function(model); | ||
2560 | nvme_show_str_function(serial); | ||
2561 | nvme_show_str_function(firmware_rev); | ||
2562 | |||
2169 | #define nvme_show_int_function(field) \ | 2563 | #define nvme_show_int_function(field) \ |
2170 | static ssize_t field##_show(struct device *dev, \ | 2564 | static ssize_t field##_show(struct device *dev, \ |
2171 | struct device_attribute *attr, char *buf) \ | 2565 | struct device_attribute *attr, char *buf) \ |
@@ -2175,9 +2569,6 @@ static ssize_t field##_show(struct device *dev, \ | |||
2175 | } \ | 2569 | } \ |
2176 | static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); | 2570 | static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); |
2177 | 2571 | ||
2178 | nvme_show_str_function(model); | ||
2179 | nvme_show_str_function(serial); | ||
2180 | nvme_show_str_function(firmware_rev); | ||
2181 | nvme_show_int_function(cntlid); | 2572 | nvme_show_int_function(cntlid); |
2182 | 2573 | ||
2183 | static ssize_t nvme_sysfs_delete(struct device *dev, | 2574 | static ssize_t nvme_sysfs_delete(struct device *dev, |
@@ -2187,7 +2578,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev, | |||
2187 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); | 2578 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); |
2188 | 2579 | ||
2189 | if (device_remove_file_self(dev, attr)) | 2580 | if (device_remove_file_self(dev, attr)) |
2190 | ctrl->ops->delete_ctrl(ctrl); | 2581 | nvme_delete_ctrl_sync(ctrl); |
2191 | return count; | 2582 | return count; |
2192 | } | 2583 | } |
2193 | static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); | 2584 | static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); |
@@ -2231,7 +2622,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, | |||
2231 | { | 2622 | { |
2232 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); | 2623 | struct nvme_ctrl *ctrl = dev_get_drvdata(dev); |
2233 | 2624 | ||
2234 | return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); | 2625 | return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); |
2235 | } | 2626 | } |
2236 | static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); | 2627 | static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); |
2237 | 2628 | ||
@@ -2284,12 +2675,128 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { | |||
2284 | NULL, | 2675 | NULL, |
2285 | }; | 2676 | }; |
2286 | 2677 | ||
2678 | static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, | ||
2679 | unsigned nsid) | ||
2680 | { | ||
2681 | struct nvme_ns_head *h; | ||
2682 | |||
2683 | lockdep_assert_held(&subsys->lock); | ||
2684 | |||
2685 | list_for_each_entry(h, &subsys->nsheads, entry) { | ||
2686 | if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) | ||
2687 | return h; | ||
2688 | } | ||
2689 | |||
2690 | return NULL; | ||
2691 | } | ||
2692 | |||
2693 | static int __nvme_check_ids(struct nvme_subsystem *subsys, | ||
2694 | struct nvme_ns_head *new) | ||
2695 | { | ||
2696 | struct nvme_ns_head *h; | ||
2697 | |||
2698 | lockdep_assert_held(&subsys->lock); | ||
2699 | |||
2700 | list_for_each_entry(h, &subsys->nsheads, entry) { | ||
2701 | if (nvme_ns_ids_valid(&new->ids) && | ||
2702 | nvme_ns_ids_equal(&new->ids, &h->ids)) | ||
2703 | return -EINVAL; | ||
2704 | } | ||
2705 | |||
2706 | return 0; | ||
2707 | } | ||
2708 | |||
2709 | static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, | ||
2710 | unsigned nsid, struct nvme_id_ns *id) | ||
2711 | { | ||
2712 | struct nvme_ns_head *head; | ||
2713 | int ret = -ENOMEM; | ||
2714 | |||
2715 | head = kzalloc(sizeof(*head), GFP_KERNEL); | ||
2716 | if (!head) | ||
2717 | goto out; | ||
2718 | ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); | ||
2719 | if (ret < 0) | ||
2720 | goto out_free_head; | ||
2721 | head->instance = ret; | ||
2722 | INIT_LIST_HEAD(&head->list); | ||
2723 | init_srcu_struct(&head->srcu); | ||
2724 | head->subsys = ctrl->subsys; | ||
2725 | head->ns_id = nsid; | ||
2726 | kref_init(&head->ref); | ||
2727 | |||
2728 | nvme_report_ns_ids(ctrl, nsid, id, &head->ids); | ||
2729 | |||
2730 | ret = __nvme_check_ids(ctrl->subsys, head); | ||
2731 | if (ret) { | ||
2732 | dev_err(ctrl->device, | ||
2733 | "duplicate IDs for nsid %d\n", nsid); | ||
2734 | goto out_cleanup_srcu; | ||
2735 | } | ||
2736 | |||
2737 | ret = nvme_mpath_alloc_disk(ctrl, head); | ||
2738 | if (ret) | ||
2739 | goto out_cleanup_srcu; | ||
2740 | |||
2741 | list_add_tail(&head->entry, &ctrl->subsys->nsheads); | ||
2742 | return head; | ||
2743 | out_cleanup_srcu: | ||
2744 | cleanup_srcu_struct(&head->srcu); | ||
2745 | ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); | ||
2746 | out_free_head: | ||
2747 | kfree(head); | ||
2748 | out: | ||
2749 | return ERR_PTR(ret); | ||
2750 | } | ||
2751 | |||
2752 | static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, | ||
2753 | struct nvme_id_ns *id, bool *new) | ||
2754 | { | ||
2755 | struct nvme_ctrl *ctrl = ns->ctrl; | ||
2756 | bool is_shared = id->nmic & (1 << 0); | ||
2757 | struct nvme_ns_head *head = NULL; | ||
2758 | int ret = 0; | ||
2759 | |||
2760 | mutex_lock(&ctrl->subsys->lock); | ||
2761 | if (is_shared) | ||
2762 | head = __nvme_find_ns_head(ctrl->subsys, nsid); | ||
2763 | if (!head) { | ||
2764 | head = nvme_alloc_ns_head(ctrl, nsid, id); | ||
2765 | if (IS_ERR(head)) { | ||
2766 | ret = PTR_ERR(head); | ||
2767 | goto out_unlock; | ||
2768 | } | ||
2769 | |||
2770 | *new = true; | ||
2771 | } else { | ||
2772 | struct nvme_ns_ids ids; | ||
2773 | |||
2774 | nvme_report_ns_ids(ctrl, nsid, id, &ids); | ||
2775 | if (!nvme_ns_ids_equal(&head->ids, &ids)) { | ||
2776 | dev_err(ctrl->device, | ||
2777 | "IDs don't match for shared namespace %d\n", | ||
2778 | nsid); | ||
2779 | ret = -EINVAL; | ||
2780 | goto out_unlock; | ||
2781 | } | ||
2782 | |||
2783 | *new = false; | ||
2784 | } | ||
2785 | |||
2786 | list_add_tail(&ns->siblings, &head->list); | ||
2787 | ns->head = head; | ||
2788 | |||
2789 | out_unlock: | ||
2790 | mutex_unlock(&ctrl->subsys->lock); | ||
2791 | return ret; | ||
2792 | } | ||
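nvme_init_ns_head() above keys shared namespaces on the (EUI-64, NGUID, UUID) tuple reported by each controller. A hedged sketch of what the nvme_ns_ids_valid()/nvme_ns_ids_equal() helpers it relies on can look like (structure layout and names are assumptions based on this series):

    #include <linux/types.h>
    #include <linux/string.h>
    #include <linux/uuid.h>

    struct demo_ns_ids {
            u8      eui64[8];
            u8      nguid[16];
            uuid_t  uuid;
    };

    /* "Valid" here means the namespace reported at least one identifier. */
    static bool demo_ns_ids_valid(const struct demo_ns_ids *ids)
    {
            return memchr_inv(ids->eui64, 0, sizeof(ids->eui64)) ||
                   memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
                   !uuid_is_null(&ids->uuid);
    }

    /* Two paths refer to the same namespace only if every ID matches. */
    static bool demo_ns_ids_equal(const struct demo_ns_ids *a,
                                  const struct demo_ns_ids *b)
    {
            return uuid_equal(&a->uuid, &b->uuid) &&
                   memcmp(a->nguid, b->nguid, sizeof(a->nguid)) == 0 &&
                   memcmp(a->eui64, b->eui64, sizeof(a->eui64)) == 0;
    }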
2793 | |||
2287 | static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) | 2794 | static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) |
2288 | { | 2795 | { |
2289 | struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); | 2796 | struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); |
2290 | struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); | 2797 | struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); |
2291 | 2798 | ||
2292 | return nsa->ns_id - nsb->ns_id; | 2799 | return nsa->head->ns_id - nsb->head->ns_id; |
2293 | } | 2800 | } |
2294 | 2801 | ||
2295 | static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) | 2802 | static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) |
@@ -2298,12 +2805,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
2298 | 2805 | ||
2299 | mutex_lock(&ctrl->namespaces_mutex); | 2806 | mutex_lock(&ctrl->namespaces_mutex); |
2300 | list_for_each_entry(ns, &ctrl->namespaces, list) { | 2807 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
2301 | if (ns->ns_id == nsid) { | 2808 | if (ns->head->ns_id == nsid) { |
2302 | kref_get(&ns->kref); | 2809 | if (!kref_get_unless_zero(&ns->kref)) |
2810 | continue; | ||
2303 | ret = ns; | 2811 | ret = ns; |
2304 | break; | 2812 | break; |
2305 | } | 2813 | } |
2306 | if (ns->ns_id > nsid) | 2814 | if (ns->head->ns_id > nsid) |
2307 | break; | 2815 | break; |
2308 | } | 2816 | } |
2309 | mutex_unlock(&ctrl->namespaces_mutex); | 2817 | mutex_unlock(&ctrl->namespaces_mutex); |
@@ -2318,7 +2826,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) | |||
2318 | if (!ctrl->nr_streams) | 2826 | if (!ctrl->nr_streams) |
2319 | return 0; | 2827 | return 0; |
2320 | 2828 | ||
2321 | ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); | 2829 | ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); |
2322 | if (ret) | 2830 | if (ret) |
2323 | return ret; | 2831 | return ret; |
2324 | 2832 | ||
@@ -2342,33 +2850,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
2342 | struct gendisk *disk; | 2850 | struct gendisk *disk; |
2343 | struct nvme_id_ns *id; | 2851 | struct nvme_id_ns *id; |
2344 | char disk_name[DISK_NAME_LEN]; | 2852 | char disk_name[DISK_NAME_LEN]; |
2345 | int node = dev_to_node(ctrl->dev); | 2853 | int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; |
2854 | bool new = true; | ||
2346 | 2855 | ||
2347 | ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); | 2856 | ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); |
2348 | if (!ns) | 2857 | if (!ns) |
2349 | return; | 2858 | return; |
2350 | 2859 | ||
2351 | ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); | ||
2352 | if (ns->instance < 0) | ||
2353 | goto out_free_ns; | ||
2354 | |||
2355 | ns->queue = blk_mq_init_queue(ctrl->tagset); | 2860 | ns->queue = blk_mq_init_queue(ctrl->tagset); |
2356 | if (IS_ERR(ns->queue)) | 2861 | if (IS_ERR(ns->queue)) |
2357 | goto out_release_instance; | 2862 | goto out_free_ns; |
2358 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); | 2863 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); |
2359 | ns->queue->queuedata = ns; | 2864 | ns->queue->queuedata = ns; |
2360 | ns->ctrl = ctrl; | 2865 | ns->ctrl = ctrl; |
2361 | 2866 | ||
2362 | kref_init(&ns->kref); | 2867 | kref_init(&ns->kref); |
2363 | ns->ns_id = nsid; | ||
2364 | ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ | 2868 | ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ |
2365 | 2869 | ||
2366 | blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); | 2870 | blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); |
2367 | nvme_set_queue_limits(ctrl, ns->queue); | 2871 | nvme_set_queue_limits(ctrl, ns->queue); |
2368 | nvme_setup_streams_ns(ctrl, ns); | 2872 | nvme_setup_streams_ns(ctrl, ns); |
2369 | 2873 | ||
2370 | sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); | ||
2371 | |||
2372 | id = nvme_identify_ns(ctrl, nsid); | 2874 | id = nvme_identify_ns(ctrl, nsid); |
2373 | if (!id) | 2875 | if (!id) |
2374 | goto out_free_queue; | 2876 | goto out_free_queue; |
@@ -2376,23 +2878,49 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
2376 | if (id->ncap == 0) | 2878 | if (id->ncap == 0) |
2377 | goto out_free_id; | 2879 | goto out_free_id; |
2378 | 2880 | ||
2379 | nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); | 2881 | if (nvme_init_ns_head(ns, nsid, id, &new)) |
2882 | goto out_free_id; | ||
2883 | |||
2884 | #ifdef CONFIG_NVME_MULTIPATH | ||
2885 | /* | ||
2886 | * If multipathing is enabled we need to always use the subsystem | ||
2887 | * instance number for numbering our devices to avoid conflicts | ||
2888 | * between subsystems that have multiple controllers and thus use | ||
2889 | * the multipath-aware subsystem node and those that have a single | ||
2890 | * controller and use the controller node directly. | ||
2891 | */ | ||
2892 | if (ns->head->disk) { | ||
2893 | sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, | ||
2894 | ctrl->cntlid, ns->head->instance); | ||
2895 | flags = GENHD_FL_HIDDEN; | ||
2896 | } else { | ||
2897 | sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, | ||
2898 | ns->head->instance); | ||
2899 | } | ||
2900 | #else | ||
2901 | /* | ||
2902 | * But without the multipath code enabled, multiple controllers per | ||
2903 | * subsystem are visible as devices and thus we cannot use the | ||
2904 | * subsystem instance. | ||
2905 | */ | ||
2906 | sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); | ||
2907 | #endif | ||
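As a concrete example of the naming rules in the comments above: with CONFIG_NVME_MULTIPATH, namespace head 2 of subsystem 0 reached through controllers with cntlid 1 and 3 shows up as the visible node nvme0n2 plus hidden per-path nodes nvme0c1n2 and nvme0c3n2; with multipath disabled, each controller exposes its own visible node instead, e.g. nvme0n2 and nvme1n2. A stand-alone sketch of the same formatting (illustrative only, mirroring the sprintf() calls above):

    #include <stdbool.h>
    #include <stdio.h>

    static void demo_disk_name(char *buf, size_t len, bool hidden_path,
                               int instance, int cntlid, int ns_instance)
    {
            if (hidden_path)        /* per-path node under a multipath head */
                    snprintf(buf, len, "nvme%dc%dn%d",
                             instance, cntlid, ns_instance);
            else                    /* user-visible node */
                    snprintf(buf, len, "nvme%dn%d", instance, ns_instance);
    }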
2380 | 2908 | ||
2381 | if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { | 2909 | if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { |
2382 | if (nvme_nvm_register(ns, disk_name, node)) { | 2910 | if (nvme_nvm_register(ns, disk_name, node)) { |
2383 | dev_warn(ctrl->device, "LightNVM init failure\n"); | 2911 | dev_warn(ctrl->device, "LightNVM init failure\n"); |
2384 | goto out_free_id; | 2912 | goto out_unlink_ns; |
2385 | } | 2913 | } |
2386 | } | 2914 | } |
2387 | 2915 | ||
2388 | disk = alloc_disk_node(0, node); | 2916 | disk = alloc_disk_node(0, node); |
2389 | if (!disk) | 2917 | if (!disk) |
2390 | goto out_free_id; | 2918 | goto out_unlink_ns; |
2391 | 2919 | ||
2392 | disk->fops = &nvme_fops; | 2920 | disk->fops = &nvme_fops; |
2393 | disk->private_data = ns; | 2921 | disk->private_data = ns; |
2394 | disk->queue = ns->queue; | 2922 | disk->queue = ns->queue; |
2395 | disk->flags = GENHD_FL_EXT_DEVT; | 2923 | disk->flags = flags; |
2396 | memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); | 2924 | memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); |
2397 | ns->disk = disk; | 2925 | ns->disk = disk; |
2398 | 2926 | ||
@@ -2402,49 +2930,65 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
2402 | list_add_tail(&ns->list, &ctrl->namespaces); | 2930 | list_add_tail(&ns->list, &ctrl->namespaces); |
2403 | mutex_unlock(&ctrl->namespaces_mutex); | 2931 | mutex_unlock(&ctrl->namespaces_mutex); |
2404 | 2932 | ||
2405 | kref_get(&ctrl->kref); | 2933 | nvme_get_ctrl(ctrl); |
2406 | 2934 | ||
2407 | kfree(id); | 2935 | kfree(id); |
2408 | 2936 | ||
2409 | device_add_disk(ctrl->device, ns->disk); | 2937 | device_add_disk(ctrl->device, ns->disk); |
2410 | if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, | 2938 | if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, |
2411 | &nvme_ns_attr_group)) | 2939 | &nvme_ns_id_attr_group)) |
2412 | pr_warn("%s: failed to create sysfs group for identification\n", | 2940 | pr_warn("%s: failed to create sysfs group for identification\n", |
2413 | ns->disk->disk_name); | 2941 | ns->disk->disk_name); |
2414 | if (ns->ndev && nvme_nvm_register_sysfs(ns)) | 2942 | if (ns->ndev && nvme_nvm_register_sysfs(ns)) |
2415 | pr_warn("%s: failed to register lightnvm sysfs group for identification\n", | 2943 | pr_warn("%s: failed to register lightnvm sysfs group for identification\n", |
2416 | ns->disk->disk_name); | 2944 | ns->disk->disk_name); |
2945 | |||
2946 | if (new) | ||
2947 | nvme_mpath_add_disk(ns->head); | ||
2948 | nvme_mpath_add_disk_links(ns); | ||
2417 | return; | 2949 | return; |
2950 | out_unlink_ns: | ||
2951 | mutex_lock(&ctrl->subsys->lock); | ||
2952 | list_del_rcu(&ns->siblings); | ||
2953 | mutex_unlock(&ctrl->subsys->lock); | ||
2418 | out_free_id: | 2954 | out_free_id: |
2419 | kfree(id); | 2955 | kfree(id); |
2420 | out_free_queue: | 2956 | out_free_queue: |
2421 | blk_cleanup_queue(ns->queue); | 2957 | blk_cleanup_queue(ns->queue); |
2422 | out_release_instance: | ||
2423 | ida_simple_remove(&ctrl->ns_ida, ns->instance); | ||
2424 | out_free_ns: | 2958 | out_free_ns: |
2425 | kfree(ns); | 2959 | kfree(ns); |
2426 | } | 2960 | } |
2427 | 2961 | ||
2428 | static void nvme_ns_remove(struct nvme_ns *ns) | 2962 | static void nvme_ns_remove(struct nvme_ns *ns) |
2429 | { | 2963 | { |
2964 | struct nvme_ns_head *head = ns->head; | ||
2965 | |||
2430 | if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) | 2966 | if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) |
2431 | return; | 2967 | return; |
2432 | 2968 | ||
2433 | if (ns->disk && ns->disk->flags & GENHD_FL_UP) { | 2969 | if (ns->disk && ns->disk->flags & GENHD_FL_UP) { |
2434 | if (blk_get_integrity(ns->disk)) | 2970 | if (blk_get_integrity(ns->disk)) |
2435 | blk_integrity_unregister(ns->disk); | 2971 | blk_integrity_unregister(ns->disk); |
2972 | nvme_mpath_remove_disk_links(ns); | ||
2436 | sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, | 2973 | sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, |
2437 | &nvme_ns_attr_group); | 2974 | &nvme_ns_id_attr_group); |
2438 | if (ns->ndev) | 2975 | if (ns->ndev) |
2439 | nvme_nvm_unregister_sysfs(ns); | 2976 | nvme_nvm_unregister_sysfs(ns); |
2440 | del_gendisk(ns->disk); | 2977 | del_gendisk(ns->disk); |
2441 | blk_cleanup_queue(ns->queue); | 2978 | blk_cleanup_queue(ns->queue); |
2442 | } | 2979 | } |
2443 | 2980 | ||
2981 | mutex_lock(&ns->ctrl->subsys->lock); | ||
2982 | nvme_mpath_clear_current_path(ns); | ||
2983 | if (head) | ||
2984 | list_del_rcu(&ns->siblings); | ||
2985 | mutex_unlock(&ns->ctrl->subsys->lock); | ||
2986 | |||
2444 | mutex_lock(&ns->ctrl->namespaces_mutex); | 2987 | mutex_lock(&ns->ctrl->namespaces_mutex); |
2445 | list_del_init(&ns->list); | 2988 | list_del_init(&ns->list); |
2446 | mutex_unlock(&ns->ctrl->namespaces_mutex); | 2989 | mutex_unlock(&ns->ctrl->namespaces_mutex); |
2447 | 2990 | ||
2991 | synchronize_srcu(&head->srcu); | ||
2448 | nvme_put_ns(ns); | 2992 | nvme_put_ns(ns); |
2449 | } | 2993 | } |
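The list_del_rcu()/synchronize_srcu() sequence in nvme_ns_remove() above pairs with SRCU readers that walk head->list. The reader below is a hedged sketch modelled on the multipath path lookup added in this series; it assumes the driver-internal nvme.h definitions, and both the call and any use of its result must stay inside the SRCU read-side section.

    #include <linux/srcu.h>
    #include <linux/rculist.h>
    #include "nvme.h"              /* driver-internal types (assumption) */

    static struct nvme_ns *demo_find_live_path(struct nvme_ns_head *head)
    {
            struct nvme_ns *ns;

            list_for_each_entry_rcu(ns, &head->list, siblings)
                    if (ns->ctrl->state == NVME_CTRL_LIVE)
                            return ns;
            return NULL;
    }

    /* Caller:
     *      int idx = srcu_read_lock(&head->srcu);
     *      struct nvme_ns *ns = demo_find_live_path(head);
     *      ... issue I/O via ns while still inside the section ...
     *      srcu_read_unlock(&head->srcu, idx);
     *
     * Writer (as above): list_del_rcu() under subsys->lock, then
     * synchronize_srcu() before the final nvme_put_ns(). */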
2450 | 2994 | ||
@@ -2467,7 +3011,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, | |||
2467 | struct nvme_ns *ns, *next; | 3011 | struct nvme_ns *ns, *next; |
2468 | 3012 | ||
2469 | list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { | 3013 | list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { |
2470 | if (ns->ns_id > nsid) | 3014 | if (ns->head->ns_id > nsid) |
2471 | nvme_ns_remove(ns); | 3015 | nvme_ns_remove(ns); |
2472 | } | 3016 | } |
2473 | } | 3017 | } |
@@ -2583,20 +3127,29 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) | |||
2583 | } | 3127 | } |
2584 | EXPORT_SYMBOL_GPL(nvme_remove_namespaces); | 3128 | EXPORT_SYMBOL_GPL(nvme_remove_namespaces); |
2585 | 3129 | ||
3130 | static void nvme_aen_uevent(struct nvme_ctrl *ctrl) | ||
3131 | { | ||
3132 | char *envp[2] = { NULL, NULL }; | ||
3133 | u32 aen_result = ctrl->aen_result; | ||
3134 | |||
3135 | ctrl->aen_result = 0; | ||
3136 | if (!aen_result) | ||
3137 | return; | ||
3138 | |||
3139 | envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); | ||
3140 | if (!envp[0]) | ||
3141 | return; | ||
3142 | kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); | ||
3143 | kfree(envp[0]); | ||
3144 | } | ||
3145 | |||
2586 | static void nvme_async_event_work(struct work_struct *work) | 3146 | static void nvme_async_event_work(struct work_struct *work) |
2587 | { | 3147 | { |
2588 | struct nvme_ctrl *ctrl = | 3148 | struct nvme_ctrl *ctrl = |
2589 | container_of(work, struct nvme_ctrl, async_event_work); | 3149 | container_of(work, struct nvme_ctrl, async_event_work); |
2590 | 3150 | ||
2591 | spin_lock_irq(&ctrl->lock); | 3151 | nvme_aen_uevent(ctrl); |
2592 | while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { | 3152 | ctrl->ops->submit_async_event(ctrl); |
2593 | int aer_idx = --ctrl->event_limit; | ||
2594 | |||
2595 | spin_unlock_irq(&ctrl->lock); | ||
2596 | ctrl->ops->submit_async_event(ctrl, aer_idx); | ||
2597 | spin_lock_irq(&ctrl->lock); | ||
2598 | } | ||
2599 | spin_unlock_irq(&ctrl->lock); | ||
2600 | } | 3153 | } |
2601 | 3154 | ||
2602 | static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) | 3155 | static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) |
@@ -2615,18 +3168,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) | |||
2615 | 3168 | ||
2616 | static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) | 3169 | static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) |
2617 | { | 3170 | { |
2618 | struct nvme_command c = { }; | ||
2619 | struct nvme_fw_slot_info_log *log; | 3171 | struct nvme_fw_slot_info_log *log; |
2620 | 3172 | ||
2621 | log = kmalloc(sizeof(*log), GFP_KERNEL); | 3173 | log = kmalloc(sizeof(*log), GFP_KERNEL); |
2622 | if (!log) | 3174 | if (!log) |
2623 | return; | 3175 | return; |
2624 | 3176 | ||
2625 | c.common.opcode = nvme_admin_get_log_page; | 3177 | if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) |
2626 | c.common.nsid = cpu_to_le32(NVME_NSID_ALL); | ||
2627 | c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); | ||
2628 | |||
2629 | if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) | ||
2630 | dev_warn(ctrl->device, | 3178 | dev_warn(ctrl->device, |
2631 | "Get FW SLOT INFO log error\n"); | 3179 | "Get FW SLOT INFO log error\n"); |
2632 | kfree(log); | 3180 | kfree(log); |
@@ -2660,7 +3208,7 @@ static void nvme_fw_act_work(struct work_struct *work) | |||
2660 | return; | 3208 | return; |
2661 | 3209 | ||
2662 | nvme_start_queues(ctrl); | 3210 | nvme_start_queues(ctrl); |
2663 | /* read FW slot informationi to clear the AER*/ | 3211 | /* read FW slot information to clear the AER */ |
2664 | nvme_get_fw_slot_info(ctrl); | 3212 | nvme_get_fw_slot_info(ctrl); |
2665 | } | 3213 | } |
2666 | 3214 | ||
@@ -2668,24 +3216,21 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, | |||
2668 | union nvme_result *res) | 3216 | union nvme_result *res) |
2669 | { | 3217 | { |
2670 | u32 result = le32_to_cpu(res->u32); | 3218 | u32 result = le32_to_cpu(res->u32); |
2671 | bool done = true; | ||
2672 | 3219 | ||
2673 | switch (le16_to_cpu(status) >> 1) { | 3220 | if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) |
2674 | case NVME_SC_SUCCESS: | 3221 | return; |
2675 | done = false; | 3222 | |
2676 | /*FALLTHRU*/ | 3223 | switch (result & 0x7) { |
2677 | case NVME_SC_ABORT_REQ: | 3224 | case NVME_AER_ERROR: |
2678 | ++ctrl->event_limit; | 3225 | case NVME_AER_SMART: |
2679 | if (ctrl->state == NVME_CTRL_LIVE) | 3226 | case NVME_AER_CSS: |
2680 | queue_work(nvme_wq, &ctrl->async_event_work); | 3227 | case NVME_AER_VS: |
3228 | ctrl->aen_result = result; | ||
2681 | break; | 3229 | break; |
2682 | default: | 3230 | default: |
2683 | break; | 3231 | break; |
2684 | } | 3232 | } |
2685 | 3233 | ||
2686 | if (done) | ||
2687 | return; | ||
2688 | |||
2689 | switch (result & 0xff07) { | 3234 | switch (result & 0xff07) { |
2690 | case NVME_AER_NOTICE_NS_CHANGED: | 3235 | case NVME_AER_NOTICE_NS_CHANGED: |
2691 | dev_info(ctrl->device, "rescanning\n"); | 3236 | dev_info(ctrl->device, "rescanning\n"); |
@@ -2697,44 +3242,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, | |||
2697 | default: | 3242 | default: |
2698 | dev_warn(ctrl->device, "async event result %08x\n", result); | 3243 | dev_warn(ctrl->device, "async event result %08x\n", result); |
2699 | } | 3244 | } |
2700 | } | ||
2701 | EXPORT_SYMBOL_GPL(nvme_complete_async_event); | ||
2702 | |||
2703 | void nvme_queue_async_events(struct nvme_ctrl *ctrl) | ||
2704 | { | ||
2705 | ctrl->event_limit = NVME_NR_AERS; | ||
2706 | queue_work(nvme_wq, &ctrl->async_event_work); | 3245 | queue_work(nvme_wq, &ctrl->async_event_work); |
2707 | } | 3246 | } |
2708 | EXPORT_SYMBOL_GPL(nvme_queue_async_events); | 3247 | EXPORT_SYMBOL_GPL(nvme_complete_async_event); |
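The masks in nvme_complete_async_event() above follow the completion-dword layout of an Asynchronous Event Request; the decode below is a sketch based on the NVMe specification (helper name hypothetical).

    #include <linux/types.h>
    #include <linux/printk.h>

    /* AER completion dword 0:
     *   bits  2:0  event type   (0 = error, 1 = SMART, 2 = notice, ...)
     *   bits 15:8  event information
     *   bits 23:16 log page to read to clear the event
     */
    static void demo_decode_aen(u32 result)
    {
            u32 type = result & 0x7;
            u32 info = (result >> 8) & 0xff;
            u32 log_page = (result >> 16) & 0xff;

            /* The (result & 0xff07) test above therefore matches the
             * (type, info) pair in one go, e.g. "namespace attribute
             * changed" is type 2, info 0 -> 0x0002. */
            pr_debug("AEN type %u info %u log page %#x\n",
                     type, info, log_page);
    }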
2709 | |||
2710 | static DEFINE_IDA(nvme_instance_ida); | ||
2711 | |||
2712 | static int nvme_set_instance(struct nvme_ctrl *ctrl) | ||
2713 | { | ||
2714 | int instance, error; | ||
2715 | |||
2716 | do { | ||
2717 | if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) | ||
2718 | return -ENODEV; | ||
2719 | |||
2720 | spin_lock(&dev_list_lock); | ||
2721 | error = ida_get_new(&nvme_instance_ida, &instance); | ||
2722 | spin_unlock(&dev_list_lock); | ||
2723 | } while (error == -EAGAIN); | ||
2724 | |||
2725 | if (error) | ||
2726 | return -ENODEV; | ||
2727 | |||
2728 | ctrl->instance = instance; | ||
2729 | return 0; | ||
2730 | } | ||
2731 | |||
2732 | static void nvme_release_instance(struct nvme_ctrl *ctrl) | ||
2733 | { | ||
2734 | spin_lock(&dev_list_lock); | ||
2735 | ida_remove(&nvme_instance_ida, ctrl->instance); | ||
2736 | spin_unlock(&dev_list_lock); | ||
2737 | } | ||
2738 | 3248 | ||
2739 | void nvme_stop_ctrl(struct nvme_ctrl *ctrl) | 3249 | void nvme_stop_ctrl(struct nvme_ctrl *ctrl) |
2740 | { | 3250 | { |
@@ -2752,7 +3262,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) | |||
2752 | 3262 | ||
2753 | if (ctrl->queue_count > 1) { | 3263 | if (ctrl->queue_count > 1) { |
2754 | nvme_queue_scan(ctrl); | 3264 | nvme_queue_scan(ctrl); |
2755 | nvme_queue_async_events(ctrl); | 3265 | queue_work(nvme_wq, &ctrl->async_event_work); |
2756 | nvme_start_queues(ctrl); | 3266 | nvme_start_queues(ctrl); |
2757 | } | 3267 | } |
2758 | } | 3268 | } |
@@ -2760,30 +3270,31 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); | |||
2760 | 3270 | ||
2761 | void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) | 3271 | void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) |
2762 | { | 3272 | { |
2763 | device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); | 3273 | cdev_device_del(&ctrl->cdev, ctrl->device); |
2764 | |||
2765 | spin_lock(&dev_list_lock); | ||
2766 | list_del(&ctrl->node); | ||
2767 | spin_unlock(&dev_list_lock); | ||
2768 | } | 3274 | } |
2769 | EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); | 3275 | EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); |
2770 | 3276 | ||
2771 | static void nvme_free_ctrl(struct kref *kref) | 3277 | static void nvme_free_ctrl(struct device *dev) |
2772 | { | 3278 | { |
2773 | struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); | 3279 | struct nvme_ctrl *ctrl = |
3280 | container_of(dev, struct nvme_ctrl, ctrl_device); | ||
3281 | struct nvme_subsystem *subsys = ctrl->subsys; | ||
2774 | 3282 | ||
2775 | put_device(ctrl->device); | 3283 | ida_simple_remove(&nvme_instance_ida, ctrl->instance); |
2776 | nvme_release_instance(ctrl); | 3284 | kfree(ctrl->effects); |
2777 | ida_destroy(&ctrl->ns_ida); | 3285 | |
3286 | if (subsys) { | ||
3287 | mutex_lock(&subsys->lock); | ||
3288 | list_del(&ctrl->subsys_entry); | ||
3289 | mutex_unlock(&subsys->lock); | ||
3290 | sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); | ||
3291 | } | ||
2778 | 3292 | ||
2779 | ctrl->ops->free_ctrl(ctrl); | 3293 | ctrl->ops->free_ctrl(ctrl); |
2780 | } | ||
2781 | 3294 | ||
2782 | void nvme_put_ctrl(struct nvme_ctrl *ctrl) | 3295 | if (subsys) |
2783 | { | 3296 | nvme_put_subsystem(subsys); |
2784 | kref_put(&ctrl->kref, nvme_free_ctrl); | ||
2785 | } | 3297 | } |
2786 | EXPORT_SYMBOL_GPL(nvme_put_ctrl); | ||
2787 | 3298 | ||
2788 | /* | 3299 | /* |
2789 | * Initialize a NVMe controller structures. This needs to be called during | 3300 | * Initialize a NVMe controller structures. This needs to be called during |
@@ -2799,32 +3310,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, | |||
2799 | spin_lock_init(&ctrl->lock); | 3310 | spin_lock_init(&ctrl->lock); |
2800 | INIT_LIST_HEAD(&ctrl->namespaces); | 3311 | INIT_LIST_HEAD(&ctrl->namespaces); |
2801 | mutex_init(&ctrl->namespaces_mutex); | 3312 | mutex_init(&ctrl->namespaces_mutex); |
2802 | kref_init(&ctrl->kref); | ||
2803 | ctrl->dev = dev; | 3313 | ctrl->dev = dev; |
2804 | ctrl->ops = ops; | 3314 | ctrl->ops = ops; |
2805 | ctrl->quirks = quirks; | 3315 | ctrl->quirks = quirks; |
2806 | INIT_WORK(&ctrl->scan_work, nvme_scan_work); | 3316 | INIT_WORK(&ctrl->scan_work, nvme_scan_work); |
2807 | INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); | 3317 | INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); |
2808 | INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); | 3318 | INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); |
3319 | INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); | ||
2809 | 3320 | ||
2810 | ret = nvme_set_instance(ctrl); | 3321 | ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); |
2811 | if (ret) | 3322 | if (ret < 0) |
2812 | goto out; | 3323 | goto out; |
2813 | 3324 | ctrl->instance = ret; | |
2814 | ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, | 3325 | |
2815 | MKDEV(nvme_char_major, ctrl->instance), | 3326 | device_initialize(&ctrl->ctrl_device); |
2816 | ctrl, nvme_dev_attr_groups, | 3327 | ctrl->device = &ctrl->ctrl_device; |
2817 | "nvme%d", ctrl->instance); | 3328 | ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); |
2818 | if (IS_ERR(ctrl->device)) { | 3329 | ctrl->device->class = nvme_class; |
2819 | ret = PTR_ERR(ctrl->device); | 3330 | ctrl->device->parent = ctrl->dev; |
3331 | ctrl->device->groups = nvme_dev_attr_groups; | ||
3332 | ctrl->device->release = nvme_free_ctrl; | ||
3333 | dev_set_drvdata(ctrl->device, ctrl); | ||
3334 | ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); | ||
3335 | if (ret) | ||
2820 | goto out_release_instance; | 3336 | goto out_release_instance; |
2821 | } | ||
2822 | get_device(ctrl->device); | ||
2823 | ida_init(&ctrl->ns_ida); | ||
2824 | 3337 | ||
2825 | spin_lock(&dev_list_lock); | 3338 | cdev_init(&ctrl->cdev, &nvme_dev_fops); |
2826 | list_add_tail(&ctrl->node, &nvme_ctrl_list); | 3339 | ctrl->cdev.owner = ops->module; |
2827 | spin_unlock(&dev_list_lock); | 3340 | ret = cdev_device_add(&ctrl->cdev, ctrl->device); |
3341 | if (ret) | ||
3342 | goto out_free_name; | ||
2828 | 3343 | ||
2829 | /* | 3344 | /* |
2830 | * Initialize latency tolerance controls. The sysfs files won't | 3345 | * Initialize latency tolerance controls. The sysfs files won't |
@@ -2835,8 +3350,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, | |||
2835 | min(default_ps_max_latency_us, (unsigned long)S32_MAX)); | 3350 | min(default_ps_max_latency_us, (unsigned long)S32_MAX)); |
2836 | 3351 | ||
2837 | return 0; | 3352 | return 0; |
3353 | out_free_name: | ||
3354 | kfree_const(dev->kobj.name); | ||
2838 | out_release_instance: | 3355 | out_release_instance: |
2839 | nvme_release_instance(ctrl); | 3356 | ida_simple_remove(&nvme_instance_ida, ctrl->instance); |
2840 | out: | 3357 | out: |
2841 | return ret; | 3358 | return ret; |
2842 | } | 3359 | } |
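nvme_init_ctrl() above moves the controller character device to the embedded device + cdev pattern, which is what lets nvme_dev_open() use container_of(inode->i_cdev, ...) earlier in the diff. A minimal generic sketch of that pattern (hypothetical driver, error handling trimmed):

    #include <linux/cdev.h>
    #include <linux/device.h>
    #include <linux/fs.h>
    #include <linux/module.h>
    #include <linux/slab.h>

    struct demo_ctrl {
            struct device   dev;    /* owns the lifetime */
            struct cdev     cdev;   /* same lifetime as dev */
    };

    static void demo_free_ctrl(struct device *dev)
    {
            kfree(container_of(dev, struct demo_ctrl, dev));
    }

    static int demo_add_ctrl(struct demo_ctrl *ctrl, struct class *cls,
                             dev_t devt, const struct file_operations *fops)
    {
            device_initialize(&ctrl->dev);          /* refcount starts at 1 */
            ctrl->dev.devt = devt;
            ctrl->dev.class = cls;
            ctrl->dev.release = demo_free_ctrl;     /* runs on final put_device() */
            dev_set_name(&ctrl->dev, "demo%d", MINOR(devt));

            cdev_init(&ctrl->cdev, fops);
            ctrl->cdev.owner = THIS_MODULE;
            /* registers cdev and device together; open() can then do
             * container_of(inode->i_cdev, struct demo_ctrl, cdev) */
            return cdev_device_add(&ctrl->cdev, &ctrl->dev);
    }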
@@ -2945,6 +3462,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) | |||
2945 | } | 3462 | } |
2946 | EXPORT_SYMBOL_GPL(nvme_start_queues); | 3463 | EXPORT_SYMBOL_GPL(nvme_start_queues); |
2947 | 3464 | ||
3465 | int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) | ||
3466 | { | ||
3467 | if (!ctrl->ops->reinit_request) | ||
3468 | return 0; | ||
3469 | |||
3470 | return blk_mq_tagset_iter(set, set->driver_data, | ||
3471 | ctrl->ops->reinit_request); | ||
3472 | } | ||
3473 | EXPORT_SYMBOL_GPL(nvme_reinit_tagset); | ||
3474 | |||
2948 | int __init nvme_core_init(void) | 3475 | int __init nvme_core_init(void) |
2949 | { | 3476 | { |
2950 | int result; | 3477 | int result; |
@@ -2954,12 +3481,9 @@ int __init nvme_core_init(void) | |||
2954 | if (!nvme_wq) | 3481 | if (!nvme_wq) |
2955 | return -ENOMEM; | 3482 | return -ENOMEM; |
2956 | 3483 | ||
2957 | result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", | 3484 | result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); |
2958 | &nvme_dev_fops); | ||
2959 | if (result < 0) | 3485 | if (result < 0) |
2960 | goto destroy_wq; | 3486 | goto destroy_wq; |
2961 | else if (result > 0) | ||
2962 | nvme_char_major = result; | ||
2963 | 3487 | ||
2964 | nvme_class = class_create(THIS_MODULE, "nvme"); | 3488 | nvme_class = class_create(THIS_MODULE, "nvme"); |
2965 | if (IS_ERR(nvme_class)) { | 3489 | if (IS_ERR(nvme_class)) { |
@@ -2967,10 +3491,17 @@ int __init nvme_core_init(void) | |||
2967 | goto unregister_chrdev; | 3491 | goto unregister_chrdev; |
2968 | } | 3492 | } |
2969 | 3493 | ||
3494 | nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); | ||
3495 | if (IS_ERR(nvme_subsys_class)) { | ||
3496 | result = PTR_ERR(nvme_subsys_class); | ||
3497 | goto destroy_class; | ||
3498 | } | ||
2970 | return 0; | 3499 | return 0; |
2971 | 3500 | ||
3501 | destroy_class: | ||
3502 | class_destroy(nvme_class); | ||
2972 | unregister_chrdev: | 3503 | unregister_chrdev: |
2973 | __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); | 3504 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); |
2974 | destroy_wq: | 3505 | destroy_wq: |
2975 | destroy_workqueue(nvme_wq); | 3506 | destroy_workqueue(nvme_wq); |
2976 | return result; | 3507 | return result; |
@@ -2978,8 +3509,10 @@ destroy_wq: | |||
2978 | 3509 | ||
2979 | void nvme_core_exit(void) | 3510 | void nvme_core_exit(void) |
2980 | { | 3511 | { |
3512 | ida_destroy(&nvme_subsystems_ida); | ||
3513 | class_destroy(nvme_subsys_class); | ||
2981 | class_destroy(nvme_class); | 3514 | class_destroy(nvme_class); |
2982 | __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); | 3515 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); |
2983 | destroy_workqueue(nvme_wq); | 3516 | destroy_workqueue(nvme_wq); |
2984 | } | 3517 | } |
2985 | 3518 | ||
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 555c976cc2ee..76b4fe6816a0 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -548,6 +548,7 @@ static const match_table_t opt_tokens = { | |||
548 | { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, | 548 | { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, |
549 | { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, | 549 | { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, |
550 | { NVMF_OPT_HOST_ID, "hostid=%s" }, | 550 | { NVMF_OPT_HOST_ID, "hostid=%s" }, |
551 | { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, | ||
551 | { NVMF_OPT_ERR, NULL } | 552 | { NVMF_OPT_ERR, NULL } |
552 | }; | 553 | }; |
553 | 554 | ||
@@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, | |||
566 | opts->nr_io_queues = num_online_cpus(); | 567 | opts->nr_io_queues = num_online_cpus(); |
567 | opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; | 568 | opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; |
568 | opts->kato = NVME_DEFAULT_KATO; | 569 | opts->kato = NVME_DEFAULT_KATO; |
570 | opts->duplicate_connect = false; | ||
569 | 571 | ||
570 | options = o = kstrdup(buf, GFP_KERNEL); | 572 | options = o = kstrdup(buf, GFP_KERNEL); |
571 | if (!options) | 573 | if (!options) |
@@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, | |||
742 | goto out; | 744 | goto out; |
743 | } | 745 | } |
744 | break; | 746 | break; |
747 | case NVMF_OPT_DUP_CONNECT: | ||
748 | opts->duplicate_connect = true; | ||
749 | break; | ||
745 | default: | 750 | default: |
746 | pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", | 751 | pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", |
747 | p); | 752 | p); |
@@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); | |||
823 | #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) | 828 | #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) |
824 | #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ | 829 | #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ |
825 | NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ | 830 | NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ |
826 | NVMF_OPT_HOST_ID) | 831 | NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) |
827 | 832 | ||
828 | static struct nvme_ctrl * | 833 | static struct nvme_ctrl * |
829 | nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) | 834 | nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) |
@@ -841,6 +846,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) | |||
841 | if (ret) | 846 | if (ret) |
842 | goto out_free_opts; | 847 | goto out_free_opts; |
843 | 848 | ||
849 | |||
850 | request_module("nvme-%s", opts->transport); | ||
851 | |||
844 | /* | 852 | /* |
845 | * Check the generic options first as we need a valid transport for | 853 | * Check the generic options first as we need a valid transport for |
846 | * the lookup below. Then clear the generic flags so that transport | 854 | * the lookup below. Then clear the generic flags so that transport |
@@ -874,12 +882,12 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) | |||
874 | goto out_unlock; | 882 | goto out_unlock; |
875 | } | 883 | } |
876 | 884 | ||
877 | if (strcmp(ctrl->subnqn, opts->subsysnqn)) { | 885 | if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { |
878 | dev_warn(ctrl->device, | 886 | dev_warn(ctrl->device, |
879 | "controller returned incorrect NQN: \"%s\".\n", | 887 | "controller returned incorrect NQN: \"%s\".\n", |
880 | ctrl->subnqn); | 888 | ctrl->subsys->subnqn); |
881 | up_read(&nvmf_transports_rwsem); | 889 | up_read(&nvmf_transports_rwsem); |
882 | ctrl->ops->delete_ctrl(ctrl); | 890 | nvme_delete_ctrl_sync(ctrl); |
883 | return ERR_PTR(-EINVAL); | 891 | return ERR_PTR(-EINVAL); |
884 | } | 892 | } |
885 | 893 | ||
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index bf33663218cd..42232e731f19 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -57,6 +57,7 @@ enum { | |||
57 | NVMF_OPT_HOST_TRADDR = 1 << 10, | 57 | NVMF_OPT_HOST_TRADDR = 1 << 10, |
58 | NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, | 58 | NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, |
59 | NVMF_OPT_HOST_ID = 1 << 12, | 59 | NVMF_OPT_HOST_ID = 1 << 12, |
60 | NVMF_OPT_DUP_CONNECT = 1 << 13, | ||
60 | }; | 61 | }; |
61 | 62 | ||
62 | /** | 63 | /** |
@@ -96,6 +97,7 @@ struct nvmf_ctrl_options { | |||
96 | unsigned int nr_io_queues; | 97 | unsigned int nr_io_queues; |
97 | unsigned int reconnect_delay; | 98 | unsigned int reconnect_delay; |
98 | bool discovery_nqn; | 99 | bool discovery_nqn; |
100 | bool duplicate_connect; | ||
99 | unsigned int kato; | 101 | unsigned int kato; |
100 | struct nvmf_host *host; | 102 | struct nvmf_host *host; |
101 | int max_reconnects; | 103 | int max_reconnects; |
@@ -131,6 +133,18 @@ struct nvmf_transport_ops { | |||
131 | struct nvmf_ctrl_options *opts); | 133 | struct nvmf_ctrl_options *opts); |
132 | }; | 134 | }; |
133 | 135 | ||
136 | static inline bool | ||
137 | nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, | ||
138 | struct nvmf_ctrl_options *opts) | ||
139 | { | ||
140 | if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || | ||
141 | strcmp(opts->host->nqn, ctrl->opts->host->nqn) || | ||
142 | memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) | ||
143 | return false; | ||
144 | |||
145 | return true; | ||
146 | } | ||
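nvmf_ctlr_matches_baseopts() backs the new duplicate_connect option; a hedged sketch of how a transport could use it before creating another association (names hypothetical; the real transports walk their per-port controller lists):

    /* Allow a second connection to the same subsystem/host identity
     * only when the user passed "duplicate_connect". */
    static bool demo_connect_allowed(struct nvme_ctrl *existing,
                                     struct nvmf_ctrl_options *opts)
    {
            if (opts->duplicate_connect)
                    return true;
            return !nvmf_ctlr_matches_baseopts(existing, opts);
    }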
147 | |||
134 | int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); | 148 | int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); |
135 | int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); | 149 | int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); |
136 | int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); | 150 | int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); |
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index be49d0f79381..7ab0be55c7d0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -30,27 +30,19 @@ | |||
30 | /* *************************** Data Structures/Defines ****************** */ | 30 | /* *************************** Data Structures/Defines ****************** */ |
31 | 31 | ||
32 | 32 | ||
33 | /* | ||
34 | * We handle AEN commands ourselves and don't even let the | ||
35 | * block layer know about them. | ||
36 | */ | ||
37 | #define NVME_FC_NR_AEN_COMMANDS 1 | ||
38 | #define NVME_FC_AQ_BLKMQ_DEPTH \ | ||
39 | (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) | ||
40 | #define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) | ||
41 | |||
42 | enum nvme_fc_queue_flags { | 33 | enum nvme_fc_queue_flags { |
43 | NVME_FC_Q_CONNECTED = (1 << 0), | 34 | NVME_FC_Q_CONNECTED = (1 << 0), |
44 | }; | 35 | }; |
45 | 36 | ||
46 | #define NVMEFC_QUEUE_DELAY 3 /* ms units */ | 37 | #define NVMEFC_QUEUE_DELAY 3 /* ms units */ |
47 | 38 | ||
39 | #define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ | ||
40 | |||
48 | struct nvme_fc_queue { | 41 | struct nvme_fc_queue { |
49 | struct nvme_fc_ctrl *ctrl; | 42 | struct nvme_fc_ctrl *ctrl; |
50 | struct device *dev; | 43 | struct device *dev; |
51 | struct blk_mq_hw_ctx *hctx; | 44 | struct blk_mq_hw_ctx *hctx; |
52 | void *lldd_handle; | 45 | void *lldd_handle; |
53 | int queue_size; | ||
54 | size_t cmnd_capsule_len; | 46 | size_t cmnd_capsule_len; |
55 | u32 qnum; | 47 | u32 qnum; |
56 | u32 rqcnt; | 48 | u32 rqcnt; |
@@ -124,6 +116,7 @@ struct nvme_fc_lport { | |||
124 | struct device *dev; /* physical device for dma */ | 116 | struct device *dev; /* physical device for dma */ |
125 | struct nvme_fc_port_template *ops; | 117 | struct nvme_fc_port_template *ops; |
126 | struct kref ref; | 118 | struct kref ref; |
119 | atomic_t act_rport_cnt; | ||
127 | } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ | 120 | } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ |
128 | 121 | ||
129 | struct nvme_fc_rport { | 122 | struct nvme_fc_rport { |
@@ -136,6 +129,8 @@ struct nvme_fc_rport { | |||
136 | struct nvme_fc_lport *lport; | 129 | struct nvme_fc_lport *lport; |
137 | spinlock_t lock; | 130 | spinlock_t lock; |
138 | struct kref ref; | 131 | struct kref ref; |
132 | atomic_t act_ctrl_cnt; | ||
133 | unsigned long dev_loss_end; | ||
139 | } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ | 134 | } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ |
140 | 135 | ||
141 | enum nvme_fcctrl_flags { | 136 | enum nvme_fcctrl_flags { |
@@ -150,6 +145,7 @@ struct nvme_fc_ctrl { | |||
150 | struct nvme_fc_rport *rport; | 145 | struct nvme_fc_rport *rport; |
151 | u32 cnum; | 146 | u32 cnum; |
152 | 147 | ||
148 | bool assoc_active; | ||
153 | u64 association_id; | 149 | u64 association_id; |
154 | 150 | ||
155 | struct list_head ctrl_list; /* rport->ctrl_list */ | 151 | struct list_head ctrl_list; /* rport->ctrl_list */ |
@@ -157,7 +153,6 @@ struct nvme_fc_ctrl { | |||
157 | struct blk_mq_tag_set admin_tag_set; | 153 | struct blk_mq_tag_set admin_tag_set; |
158 | struct blk_mq_tag_set tag_set; | 154 | struct blk_mq_tag_set tag_set; |
159 | 155 | ||
160 | struct work_struct delete_work; | ||
161 | struct delayed_work connect_work; | 156 | struct delayed_work connect_work; |
162 | 157 | ||
163 | struct kref ref; | 158 | struct kref ref; |
@@ -165,7 +160,7 @@ struct nvme_fc_ctrl { | |||
165 | u32 iocnt; | 160 | u32 iocnt; |
166 | wait_queue_head_t ioabort_wait; | 161 | wait_queue_head_t ioabort_wait; |
167 | 162 | ||
168 | struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; | 163 | struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; |
169 | 164 | ||
170 | struct nvme_ctrl ctrl; | 165 | struct nvme_ctrl ctrl; |
171 | }; | 166 | }; |
@@ -213,10 +208,16 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); | |||
213 | 208 | ||
214 | 209 | ||
215 | 210 | ||
211 | /* | ||
212 | * These items are short-term. They will eventually be moved into | ||
213 | * a generic FC class. See comments in module init. | ||
214 | */ | ||
215 | static struct class *fc_class; | ||
216 | static struct device *fc_udev_device; | ||
217 | |||
216 | 218 | ||
217 | /* *********************** FC-NVME Port Management ************************ */ | 219 | /* *********************** FC-NVME Port Management ************************ */ |
218 | 220 | ||
219 | static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); | ||
220 | static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, | 221 | static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, |
221 | struct nvme_fc_queue *, unsigned int); | 222 | struct nvme_fc_queue *, unsigned int); |
222 | 223 | ||
@@ -235,9 +236,6 @@ nvme_fc_free_lport(struct kref *ref) | |||
235 | list_del(&lport->port_list); | 236 | list_del(&lport->port_list); |
236 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | 237 | spin_unlock_irqrestore(&nvme_fc_lock, flags); |
237 | 238 | ||
238 | /* let the LLDD know we've finished tearing it down */ | ||
239 | lport->ops->localport_delete(&lport->localport); | ||
240 | |||
241 | ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); | 239 | ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); |
242 | ida_destroy(&lport->endp_cnt); | 240 | ida_destroy(&lport->endp_cnt); |
243 | 241 | ||
@@ -260,7 +258,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport) | |||
260 | 258 | ||
261 | 259 | ||
262 | static struct nvme_fc_lport * | 260 | static struct nvme_fc_lport * |
263 | nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) | 261 | nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, |
262 | struct nvme_fc_port_template *ops, | ||
263 | struct device *dev) | ||
264 | { | 264 | { |
265 | struct nvme_fc_lport *lport; | 265 | struct nvme_fc_lport *lport; |
266 | unsigned long flags; | 266 | unsigned long flags; |
@@ -272,6 +272,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) | |||
272 | lport->localport.port_name != pinfo->port_name) | 272 | lport->localport.port_name != pinfo->port_name) |
273 | continue; | 273 | continue; |
274 | 274 | ||
275 | if (lport->dev != dev) { | ||
276 | lport = ERR_PTR(-EXDEV); | ||
277 | goto out_done; | ||
278 | } | ||
279 | |||
275 | if (lport->localport.port_state != FC_OBJSTATE_DELETED) { | 280 | if (lport->localport.port_state != FC_OBJSTATE_DELETED) { |
276 | lport = ERR_PTR(-EEXIST); | 281 | lport = ERR_PTR(-EEXIST); |
277 | goto out_done; | 282 | goto out_done; |
@@ -288,6 +293,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) | |||
288 | 293 | ||
289 | /* resume the lport */ | 294 | /* resume the lport */ |
290 | 295 | ||
296 | lport->ops = ops; | ||
291 | lport->localport.port_role = pinfo->port_role; | 297 | lport->localport.port_role = pinfo->port_role; |
292 | lport->localport.port_id = pinfo->port_id; | 298 | lport->localport.port_id = pinfo->port_id; |
293 | lport->localport.port_state = FC_OBJSTATE_ONLINE; | 299 | lport->localport.port_state = FC_OBJSTATE_ONLINE; |
@@ -348,7 +354,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, | |||
348 | * expired, we can simply re-enable the localport. Remoteports | 354 | * expired, we can simply re-enable the localport. Remoteports |
349 | * and controller reconnections should resume naturally. | 355 | * and controller reconnections should resume naturally. |
350 | */ | 356 | */ |
351 | newrec = nvme_fc_attach_to_unreg_lport(pinfo); | 357 | newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); |
352 | 358 | ||
353 | /* found an lport, but something about its state is bad */ | 359 | /* found an lport, but something about its state is bad */ |
354 | if (IS_ERR(newrec)) { | 360 | if (IS_ERR(newrec)) { |
@@ -384,6 +390,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, | |||
384 | INIT_LIST_HEAD(&newrec->port_list); | 390 | INIT_LIST_HEAD(&newrec->port_list); |
385 | INIT_LIST_HEAD(&newrec->endp_list); | 391 | INIT_LIST_HEAD(&newrec->endp_list); |
386 | kref_init(&newrec->ref); | 392 | kref_init(&newrec->ref); |
393 | atomic_set(&newrec->act_rport_cnt, 0); | ||
387 | newrec->ops = template; | 394 | newrec->ops = template; |
388 | newrec->dev = dev; | 395 | newrec->dev = dev; |
389 | ida_init(&newrec->endp_cnt); | 396 | ida_init(&newrec->endp_cnt); |
@@ -446,12 +453,177 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) | |||
446 | 453 | ||
447 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | 454 | spin_unlock_irqrestore(&nvme_fc_lock, flags); |
448 | 455 | ||
456 | if (atomic_read(&lport->act_rport_cnt) == 0) | ||
457 | lport->ops->localport_delete(&lport->localport); | ||
458 | |||
449 | nvme_fc_lport_put(lport); | 459 | nvme_fc_lport_put(lport); |
450 | 460 | ||
451 | return 0; | 461 | return 0; |
452 | } | 462 | } |
453 | EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); | 463 | EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); |
454 | 464 | ||
465 | /* | ||
466 | * TRADDR strings, per FC-NVME are fixed format: | ||
467 | * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters | ||
468 | * udev event will only differ by prefix of what field is | ||
469 | * being specified: | ||
470 | * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters | ||
471 | * 19 + 43 + null_fudge = 64 characters | ||
472 | */ | ||
473 | #define FCNVME_TRADDR_LENGTH 64 | ||
474 | |||
475 | static void | ||
476 | nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, | ||
477 | struct nvme_fc_rport *rport) | ||
478 | { | ||
479 | char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ | ||
480 | char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ | ||
481 | char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; | ||
482 | |||
483 | if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) | ||
484 | return; | ||
485 | |||
486 | snprintf(hostaddr, sizeof(hostaddr), | ||
487 | "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", | ||
488 | lport->localport.node_name, lport->localport.port_name); | ||
489 | snprintf(tgtaddr, sizeof(tgtaddr), | ||
490 | "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", | ||
491 | rport->remoteport.node_name, rport->remoteport.port_name); | ||
492 | kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); | ||
493 | } | ||
494 | |||
495 | static void | ||
496 | nvme_fc_free_rport(struct kref *ref) | ||
497 | { | ||
498 | struct nvme_fc_rport *rport = | ||
499 | container_of(ref, struct nvme_fc_rport, ref); | ||
500 | struct nvme_fc_lport *lport = | ||
501 | localport_to_lport(rport->remoteport.localport); | ||
502 | unsigned long flags; | ||
503 | |||
504 | WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); | ||
505 | WARN_ON(!list_empty(&rport->ctrl_list)); | ||
506 | |||
507 | /* remove from lport list */ | ||
508 | spin_lock_irqsave(&nvme_fc_lock, flags); | ||
509 | list_del(&rport->endp_list); | ||
510 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | ||
511 | |||
512 | ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); | ||
513 | |||
514 | kfree(rport); | ||
515 | |||
516 | nvme_fc_lport_put(lport); | ||
517 | } | ||
518 | |||
519 | static void | ||
520 | nvme_fc_rport_put(struct nvme_fc_rport *rport) | ||
521 | { | ||
522 | kref_put(&rport->ref, nvme_fc_free_rport); | ||
523 | } | ||
524 | |||
525 | static int | ||
526 | nvme_fc_rport_get(struct nvme_fc_rport *rport) | ||
527 | { | ||
528 | return kref_get_unless_zero(&rport->ref); | ||
529 | } | ||
530 | |||
531 | static void | ||
532 | nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) | ||
533 | { | ||
534 | switch (ctrl->ctrl.state) { | ||
535 | case NVME_CTRL_NEW: | ||
536 | case NVME_CTRL_RECONNECTING: | ||
537 | /* | ||
538 | * As all reconnects were suppressed, schedule a | ||
539 | * connect. | ||
540 | */ | ||
541 | dev_info(ctrl->ctrl.device, | ||
542 | "NVME-FC{%d}: connectivity re-established. " | ||
543 | "Attempting reconnect\n", ctrl->cnum); | ||
544 | |||
545 | queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); | ||
546 | break; | ||
547 | |||
548 | case NVME_CTRL_RESETTING: | ||
549 | /* | ||
550 | * Controller is already in the process of terminating the | ||
551 | * association. No need to do anything further. The reconnect | ||
552 | * step will naturally occur after the reset completes. | ||
553 | */ | ||
554 | break; | ||
555 | |||
556 | default: | ||
557 | /* no action to take - let it delete */ | ||
558 | break; | ||
559 | } | ||
560 | } | ||
561 | |||
562 | static struct nvme_fc_rport * | ||
563 | nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, | ||
564 | struct nvme_fc_port_info *pinfo) | ||
565 | { | ||
566 | struct nvme_fc_rport *rport; | ||
567 | struct nvme_fc_ctrl *ctrl; | ||
568 | unsigned long flags; | ||
569 | |||
570 | spin_lock_irqsave(&nvme_fc_lock, flags); | ||
571 | |||
572 | list_for_each_entry(rport, &lport->endp_list, endp_list) { | ||
573 | if (rport->remoteport.node_name != pinfo->node_name || | ||
574 | rport->remoteport.port_name != pinfo->port_name) | ||
575 | continue; | ||
576 | |||
577 | if (!nvme_fc_rport_get(rport)) { | ||
578 | rport = ERR_PTR(-ENOLCK); | ||
579 | goto out_done; | ||
580 | } | ||
581 | |||
582 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | ||
583 | |||
584 | spin_lock_irqsave(&rport->lock, flags); | ||
585 | |||
586 | /* has it been unregistered */ | ||
587 | if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { | ||
588 | /* means lldd called us twice */ | ||
589 | spin_unlock_irqrestore(&rport->lock, flags); | ||
590 | nvme_fc_rport_put(rport); | ||
591 | return ERR_PTR(-ESTALE); | ||
592 | } | ||
593 | |||
594 | rport->remoteport.port_state = FC_OBJSTATE_ONLINE; | ||
595 | rport->dev_loss_end = 0; | ||
596 | |||
597 | /* | ||
598 | * kick off a reconnect attempt on all associations to the | ||
599 | * remote port. A successful reconnect will resume i/o. | ||
600 | */ | ||
601 | list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) | ||
602 | nvme_fc_resume_controller(ctrl); | ||
603 | |||
604 | spin_unlock_irqrestore(&rport->lock, flags); | ||
605 | |||
606 | return rport; | ||
607 | } | ||
608 | |||
609 | rport = NULL; | ||
610 | |||
611 | out_done: | ||
612 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | ||
613 | |||
614 | return rport; | ||
615 | } | ||
616 | |||
617 | static inline void | ||
618 | __nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, | ||
619 | struct nvme_fc_port_info *pinfo) | ||
620 | { | ||
621 | if (pinfo->dev_loss_tmo) | ||
622 | rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; | ||
623 | else | ||
624 | rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; | ||
625 | } | ||
626 | |||
455 | /** | 627 | /** |
456 | * nvme_fc_register_remoteport - transport entry point called by an | 628 | * nvme_fc_register_remoteport - transport entry point called by an |
457 | * LLDD to register the existence of a NVME | 629 | * LLDD to register the existence of a NVME |
@@ -478,28 +650,52 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, | |||
478 | unsigned long flags; | 650 | unsigned long flags; |
479 | int ret, idx; | 651 | int ret, idx; |
480 | 652 | ||
653 | if (!nvme_fc_lport_get(lport)) { | ||
654 | ret = -ESHUTDOWN; | ||
655 | goto out_reghost_failed; | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * look to see if there is already a remoteport that is waiting | ||
660 | * for a reconnect (within dev_loss_tmo) with the same WWNs. | ||
661 | * If so, transition to it and reconnect. | ||
662 | */ | ||
663 | newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); | ||
664 | |||
665 | /* found an rport, but something about its state is bad */ | ||
666 | if (IS_ERR(newrec)) { | ||
667 | ret = PTR_ERR(newrec); | ||
668 | goto out_lport_put; | ||
669 | |||
670 | /* found existing rport, which was resumed */ | ||
671 | } else if (newrec) { | ||
672 | nvme_fc_lport_put(lport); | ||
673 | __nvme_fc_set_dev_loss_tmo(newrec, pinfo); | ||
674 | nvme_fc_signal_discovery_scan(lport, newrec); | ||
675 | *portptr = &newrec->remoteport; | ||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | /* nothing found - allocate a new remoteport struct */ | ||
680 | |||
481 | newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), | 681 | newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), |
482 | GFP_KERNEL); | 682 | GFP_KERNEL); |
483 | if (!newrec) { | 683 | if (!newrec) { |
484 | ret = -ENOMEM; | 684 | ret = -ENOMEM; |
485 | goto out_reghost_failed; | 685 | goto out_lport_put; |
486 | } | ||
487 | |||
488 | if (!nvme_fc_lport_get(lport)) { | ||
489 | ret = -ESHUTDOWN; | ||
490 | goto out_kfree_rport; | ||
491 | } | 686 | } |
492 | 687 | ||
493 | idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); | 688 | idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); |
494 | if (idx < 0) { | 689 | if (idx < 0) { |
495 | ret = -ENOSPC; | 690 | ret = -ENOSPC; |
496 | goto out_lport_put; | 691 | goto out_kfree_rport; |
497 | } | 692 | } |
498 | 693 | ||
499 | INIT_LIST_HEAD(&newrec->endp_list); | 694 | INIT_LIST_HEAD(&newrec->endp_list); |
500 | INIT_LIST_HEAD(&newrec->ctrl_list); | 695 | INIT_LIST_HEAD(&newrec->ctrl_list); |
501 | INIT_LIST_HEAD(&newrec->ls_req_list); | 696 | INIT_LIST_HEAD(&newrec->ls_req_list); |
502 | kref_init(&newrec->ref); | 697 | kref_init(&newrec->ref); |
698 | atomic_set(&newrec->act_ctrl_cnt, 0); | ||
503 | spin_lock_init(&newrec->lock); | 699 | spin_lock_init(&newrec->lock); |
504 | newrec->remoteport.localport = &lport->localport; | 700 | newrec->remoteport.localport = &lport->localport; |
505 | newrec->dev = lport->dev; | 701 | newrec->dev = lport->dev; |
@@ -511,63 +707,27 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, | |||
511 | newrec->remoteport.port_id = pinfo->port_id; | 707 | newrec->remoteport.port_id = pinfo->port_id; |
512 | newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; | 708 | newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; |
513 | newrec->remoteport.port_num = idx; | 709 | newrec->remoteport.port_num = idx; |
710 | __nvme_fc_set_dev_loss_tmo(newrec, pinfo); | ||
514 | 711 | ||
515 | spin_lock_irqsave(&nvme_fc_lock, flags); | 712 | spin_lock_irqsave(&nvme_fc_lock, flags); |
516 | list_add_tail(&newrec->endp_list, &lport->endp_list); | 713 | list_add_tail(&newrec->endp_list, &lport->endp_list); |
517 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | 714 | spin_unlock_irqrestore(&nvme_fc_lock, flags); |
518 | 715 | ||
716 | nvme_fc_signal_discovery_scan(lport, newrec); | ||
717 | |||
519 | *portptr = &newrec->remoteport; | 718 | *portptr = &newrec->remoteport; |
520 | return 0; | 719 | return 0; |
521 | 720 | ||
522 | out_lport_put: | ||
523 | nvme_fc_lport_put(lport); | ||
524 | out_kfree_rport: | 721 | out_kfree_rport: |
525 | kfree(newrec); | 722 | kfree(newrec); |
723 | out_lport_put: | ||
724 | nvme_fc_lport_put(lport); | ||
526 | out_reghost_failed: | 725 | out_reghost_failed: |
527 | *portptr = NULL; | 726 | *portptr = NULL; |
528 | return ret; | 727 | return ret; |
529 | } | 728 | } |
530 | EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); | 729 | EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); |
531 | 730 | ||
532 | static void | ||
533 | nvme_fc_free_rport(struct kref *ref) | ||
534 | { | ||
535 | struct nvme_fc_rport *rport = | ||
536 | container_of(ref, struct nvme_fc_rport, ref); | ||
537 | struct nvme_fc_lport *lport = | ||
538 | localport_to_lport(rport->remoteport.localport); | ||
539 | unsigned long flags; | ||
540 | |||
541 | WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); | ||
542 | WARN_ON(!list_empty(&rport->ctrl_list)); | ||
543 | |||
544 | /* remove from lport list */ | ||
545 | spin_lock_irqsave(&nvme_fc_lock, flags); | ||
546 | list_del(&rport->endp_list); | ||
547 | spin_unlock_irqrestore(&nvme_fc_lock, flags); | ||
548 | |||
549 | /* let the LLDD know we've finished tearing it down */ | ||
550 | lport->ops->remoteport_delete(&rport->remoteport); | ||
551 | |||
552 | ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); | ||
553 | |||
554 | kfree(rport); | ||
555 | |||
556 | nvme_fc_lport_put(lport); | ||
557 | } | ||
558 | |||
559 | static void | ||
560 | nvme_fc_rport_put(struct nvme_fc_rport *rport) | ||
561 | { | ||
562 | kref_put(&rport->ref, nvme_fc_free_rport); | ||
563 | } | ||
564 | |||
565 | static int | ||
566 | nvme_fc_rport_get(struct nvme_fc_rport *rport) | ||
567 | { | ||
568 | return kref_get_unless_zero(&rport->ref); | ||
569 | } | ||
570 | |||
571 | static int | 731 | static int |
572 | nvme_fc_abort_lsops(struct nvme_fc_rport *rport) | 732 | nvme_fc_abort_lsops(struct nvme_fc_rport *rport) |
573 | { | 733 | { |
@@ -592,6 +752,58 @@ restart: | |||
592 | return 0; | 752 | return 0; |
593 | } | 753 | } |
594 | 754 | ||
755 | static void | ||
756 | nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) | ||
757 | { | ||
758 | dev_info(ctrl->ctrl.device, | ||
759 | "NVME-FC{%d}: controller connectivity lost. Awaiting " | ||
760 | "Reconnect", ctrl->cnum); | ||
761 | |||
762 | switch (ctrl->ctrl.state) { | ||
763 | case NVME_CTRL_NEW: | ||
764 | case NVME_CTRL_LIVE: | ||
765 | /* | ||
766 | * Schedule a controller reset. The reset will terminate the | ||
767 | * association and schedule the reconnect timer. Reconnects | ||
768 | * will be attempted until either the ctlr_loss_tmo | ||
769 | * (max_retries * connect_delay) expires or the remoteport's | ||
770 | * dev_loss_tmo expires. | ||
771 | */ | ||
772 | if (nvme_reset_ctrl(&ctrl->ctrl)) { | ||
773 | dev_warn(ctrl->ctrl.device, | ||
774 | "NVME-FC{%d}: Couldn't schedule reset. " | ||
775 | "Deleting controller.\n", | ||
776 | ctrl->cnum); | ||
777 | nvme_delete_ctrl(&ctrl->ctrl); | ||
778 | } | ||
779 | break; | ||
780 | |||
781 | case NVME_CTRL_RECONNECTING: | ||
782 | /* | ||
783 | * The association has already been terminated and the | ||
784 | * controller is attempting reconnects. No need to do anything | ||
785 | * further. Reconnects will be attempted until either the | ||
786 | * ctlr_loss_tmo (max_retries * connect_delay) expires or the | ||
787 | * remoteport's dev_loss_tmo expires. | ||
788 | */ | ||
789 | break; | ||
790 | |||
791 | case NVME_CTRL_RESETTING: | ||
792 | /* | ||
793 | * Controller is already in the process of terminating the | ||
794 | * association. No need to do anything further. The reconnect | ||
795 | * step will kick in naturally after the association is | ||
796 | * terminated. | ||
797 | */ | ||
798 | break; | ||
799 | |||
800 | case NVME_CTRL_DELETING: | ||
801 | default: | ||
802 | /* no action to take - let it delete */ | ||
803 | break; | ||
804 | } | ||
805 | } | ||
806 | |||
595 | /** | 807 | /** |
596 | * nvme_fc_unregister_remoteport - transport entry point called by an | 808 | * nvme_fc_unregister_remoteport - transport entry point called by an |
597 | * LLDD to deregister/remove a previously | 809 | * LLDD to deregister/remove a previously |
@@ -621,19 +833,78 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) | |||
621 | } | 833 | } |
622 | portptr->port_state = FC_OBJSTATE_DELETED; | 834 | portptr->port_state = FC_OBJSTATE_DELETED; |
623 | 835 | ||
624 | /* tear down all associations to the remote port */ | 836 | rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); |
625 | list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) | 837 | |
626 | __nvme_fc_del_ctrl(ctrl); | 838 | list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { |
839 | /* if dev_loss_tmo==0, dev loss is immediate */ | ||
840 | if (!portptr->dev_loss_tmo) { | ||
841 | dev_warn(ctrl->ctrl.device, | ||
842 | "NVME-FC{%d}: controller connectivity lost. " | ||
843 | "Deleting controller.\n", | ||
844 | ctrl->cnum); | ||
845 | nvme_delete_ctrl(&ctrl->ctrl); | ||
846 | } else | ||
847 | nvme_fc_ctrl_connectivity_loss(ctrl); | ||
848 | } | ||
627 | 849 | ||
628 | spin_unlock_irqrestore(&rport->lock, flags); | 850 | spin_unlock_irqrestore(&rport->lock, flags); |
629 | 851 | ||
630 | nvme_fc_abort_lsops(rport); | 852 | nvme_fc_abort_lsops(rport); |
631 | 853 | ||
854 | if (atomic_read(&rport->act_ctrl_cnt) == 0) | ||
855 | rport->lport->ops->remoteport_delete(portptr); | ||
856 | |||
857 | /* | ||
858 | * release the reference. Once all controllers are gone (which | ||
859 | * should only occur after dev_loss_tmo expires), the rport can | ||
860 | * be torn down. | ||
861 | */ | ||
632 | nvme_fc_rport_put(rport); | 862 | nvme_fc_rport_put(rport); |
863 | |||
633 | return 0; | 864 | return 0; |
634 | } | 865 | } |
635 | EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); | 866 | EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); |
636 | 867 | ||
868 | /** | ||
869 | * nvme_fc_rescan_remoteport - transport entry point called by an | ||
870 | * LLDD to request a nvme device rescan. | ||
871 | * @remoteport: pointer to the (registered) remote port that is to be | ||
872 | * rescanned. | ||
873 | * | ||
874 | * Returns: N/A | ||
875 | */ | ||
876 | void | ||
877 | nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) | ||
878 | { | ||
879 | struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); | ||
880 | |||
881 | nvme_fc_signal_discovery_scan(rport->lport, rport); | ||
882 | } | ||
883 | EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); | ||
884 | |||
885 | int | ||
886 | nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, | ||
887 | u32 dev_loss_tmo) | ||
888 | { | ||
889 | struct nvme_fc_rport *rport = remoteport_to_rport(portptr); | ||
890 | unsigned long flags; | ||
891 | |||
892 | spin_lock_irqsave(&rport->lock, flags); | ||
893 | |||
894 | if (portptr->port_state != FC_OBJSTATE_ONLINE) { | ||
895 | spin_unlock_irqrestore(&rport->lock, flags); | ||
896 | return -EINVAL; | ||
897 | } | ||
898 | |||
899 | /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ | ||
900 | rport->remoteport.dev_loss_tmo = dev_loss_tmo; | ||
901 | |||
902 | spin_unlock_irqrestore(&rport->lock, flags); | ||
903 | |||
904 | return 0; | ||
905 | } | ||
906 | EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); | ||
907 | |||
637 | 908 | ||
638 | /* *********************** FC-NVME DMA Handling **************************** */ | 909 | /* *********************** FC-NVME DMA Handling **************************** */ |
639 | 910 | ||
@@ -723,7 +994,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, | |||
723 | dma_unmap_sg(dev, sg, nents, dir); | 994 | dma_unmap_sg(dev, sg, nents, dir); |
724 | } | 995 | } |
725 | 996 | ||
726 | |||
727 | /* *********************** FC-NVME LS Handling **************************** */ | 997 | /* *********************** FC-NVME LS Handling **************************** */ |
728 | 998 | ||
729 | static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); | 999 | static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); |
@@ -1266,7 +1536,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) | |||
1266 | unsigned long flags; | 1536 | unsigned long flags; |
1267 | int i, ret; | 1537 | int i, ret; |
1268 | 1538 | ||
1269 | for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { | 1539 | for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { |
1270 | if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) | 1540 | if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) |
1271 | continue; | 1541 | continue; |
1272 | 1542 | ||
@@ -1331,7 +1601,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) | |||
1331 | struct nvme_command *sqe = &op->cmd_iu.sqe; | 1601 | struct nvme_command *sqe = &op->cmd_iu.sqe; |
1332 | __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); | 1602 | __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); |
1333 | union nvme_result result; | 1603 | union nvme_result result; |
1334 | bool complete_rq, terminate_assoc = true; | 1604 | bool terminate_assoc = true; |
1335 | 1605 | ||
1336 | /* | 1606 | /* |
1337 | * WARNING: | 1607 | * WARNING: |
@@ -1373,8 +1643,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) | |||
1373 | fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, | 1643 | fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, |
1374 | sizeof(op->rsp_iu), DMA_FROM_DEVICE); | 1644 | sizeof(op->rsp_iu), DMA_FROM_DEVICE); |
1375 | 1645 | ||
1376 | if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) | 1646 | if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || |
1377 | status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); | 1647 | op->flags & FCOP_FLAGS_TERMIO) |
1648 | status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); | ||
1378 | else if (freq->status) | 1649 | else if (freq->status) |
1379 | status = cpu_to_le16(NVME_SC_INTERNAL << 1); | 1650 | status = cpu_to_le16(NVME_SC_INTERNAL << 1); |
1380 | 1651 | ||
@@ -1438,23 +1709,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) | |||
1438 | done: | 1709 | done: |
1439 | if (op->flags & FCOP_FLAGS_AEN) { | 1710 | if (op->flags & FCOP_FLAGS_AEN) { |
1440 | nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); | 1711 | nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); |
1441 | complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); | 1712 | __nvme_fc_fcpop_chk_teardowns(ctrl, op); |
1442 | atomic_set(&op->state, FCPOP_STATE_IDLE); | 1713 | atomic_set(&op->state, FCPOP_STATE_IDLE); |
1443 | op->flags = FCOP_FLAGS_AEN; /* clear other flags */ | 1714 | op->flags = FCOP_FLAGS_AEN; /* clear other flags */ |
1444 | nvme_fc_ctrl_put(ctrl); | 1715 | nvme_fc_ctrl_put(ctrl); |
1445 | goto check_error; | 1716 | goto check_error; |
1446 | } | 1717 | } |
1447 | 1718 | ||
1448 | complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); | 1719 | /* |
1449 | if (!complete_rq) { | 1720 | * Force failures of commands if we're killing the controller |
1450 | if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { | 1721 | * or have an error on a command used to create an new association |
1451 | status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); | 1722 | */ |
1452 | if (blk_queue_dying(rq->q)) | 1723 | if (status && |
1453 | status |= cpu_to_le16(NVME_SC_DNR << 1); | 1724 | (blk_queue_dying(rq->q) || |
1454 | } | 1725 | ctrl->ctrl.state == NVME_CTRL_NEW || |
1455 | nvme_end_request(rq, status, result); | 1726 | ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) |
1456 | } else | 1727 | status |= cpu_to_le16(NVME_SC_DNR << 1); |
1728 | |||
1729 | if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) | ||
1457 | __nvme_fc_final_op_cleanup(rq); | 1730 | __nvme_fc_final_op_cleanup(rq); |
1731 | else | ||
1732 | nvme_end_request(rq, status, result); | ||
1458 | 1733 | ||
1459 | check_error: | 1734 | check_error: |
1460 | if (terminate_assoc) | 1735 | if (terminate_assoc) |
@@ -1531,7 +1806,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) | |||
1531 | int i, ret; | 1806 | int i, ret; |
1532 | 1807 | ||
1533 | aen_op = ctrl->aen_ops; | 1808 | aen_op = ctrl->aen_ops; |
1534 | for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { | 1809 | for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { |
1535 | private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, | 1810 | private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, |
1536 | GFP_KERNEL); | 1811 | GFP_KERNEL); |
1537 | if (!private) | 1812 | if (!private) |
@@ -1541,7 +1816,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) | |||
1541 | sqe = &cmdiu->sqe; | 1816 | sqe = &cmdiu->sqe; |
1542 | ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], | 1817 | ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], |
1543 | aen_op, (struct request *)NULL, | 1818 | aen_op, (struct request *)NULL, |
1544 | (AEN_CMDID_BASE + i)); | 1819 | (NVME_AQ_BLK_MQ_DEPTH + i)); |
1545 | if (ret) { | 1820 | if (ret) { |
1546 | kfree(private); | 1821 | kfree(private); |
1547 | return ret; | 1822 | return ret; |
@@ -1554,7 +1829,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) | |||
1554 | memset(sqe, 0, sizeof(*sqe)); | 1829 | memset(sqe, 0, sizeof(*sqe)); |
1555 | sqe->common.opcode = nvme_admin_async_event; | 1830 | sqe->common.opcode = nvme_admin_async_event; |
1556 | /* Note: core layer may overwrite the sqe.command_id value */ | 1831 | /* Note: core layer may overwrite the sqe.command_id value */ |
1557 | sqe->common.command_id = AEN_CMDID_BASE + i; | 1832 | sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; |
1558 | } | 1833 | } |
1559 | return 0; | 1834 | return 0; |
1560 | } | 1835 | } |
@@ -1566,7 +1841,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) | |||
1566 | int i; | 1841 | int i; |
1567 | 1842 | ||
1568 | aen_op = ctrl->aen_ops; | 1843 | aen_op = ctrl->aen_ops; |
1569 | for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { | 1844 | for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { |
1570 | if (!aen_op->fcp_req.private) | 1845 | if (!aen_op->fcp_req.private) |
1571 | continue; | 1846 | continue; |
1572 | 1847 | ||
@@ -1610,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, | |||
1610 | } | 1885 | } |
1611 | 1886 | ||
1612 | static void | 1887 | static void |
1613 | nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) | 1888 | nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) |
1614 | { | 1889 | { |
1615 | struct nvme_fc_queue *queue; | 1890 | struct nvme_fc_queue *queue; |
1616 | 1891 | ||
@@ -1626,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) | |||
1626 | else | 1901 | else |
1627 | queue->cmnd_capsule_len = sizeof(struct nvme_command); | 1902 | queue->cmnd_capsule_len = sizeof(struct nvme_command); |
1628 | 1903 | ||
1629 | queue->queue_size = queue_size; | ||
1630 | |||
1631 | /* | 1904 | /* |
1632 | * Considered whether we should allocate buffers for all SQEs | 1905 | * Considered whether we should allocate buffers for all SQEs |
1633 | * and CQEs and dma map them - mapping their respective entries | 1906 | * and CQEs and dma map them - mapping their respective entries |
@@ -1751,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) | |||
1751 | int i; | 2024 | int i; |
1752 | 2025 | ||
1753 | for (i = 1; i < ctrl->ctrl.queue_count; i++) | 2026 | for (i = 1; i < ctrl->ctrl.queue_count; i++) |
1754 | nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); | 2027 | nvme_fc_init_queue(ctrl, i); |
1755 | } | 2028 | } |
1756 | 2029 | ||
1757 | static void | 2030 | static void |
@@ -1825,13 +2098,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) | |||
1825 | dev_warn(ctrl->ctrl.device, | 2098 | dev_warn(ctrl->ctrl.device, |
1826 | "NVME-FC{%d}: resetting controller\n", ctrl->cnum); | 2099 | "NVME-FC{%d}: resetting controller\n", ctrl->cnum); |
1827 | 2100 | ||
1828 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { | ||
1829 | dev_err(ctrl->ctrl.device, | ||
1830 | "NVME-FC{%d}: error_recovery: Couldn't change state " | ||
1831 | "to RECONNECTING\n", ctrl->cnum); | ||
1832 | return; | ||
1833 | } | ||
1834 | |||
1835 | nvme_reset_ctrl(&ctrl->ctrl); | 2101 | nvme_reset_ctrl(&ctrl->ctrl); |
1836 | } | 2102 | } |
1837 | 2103 | ||
@@ -1842,13 +2108,14 @@ nvme_fc_timeout(struct request *rq, bool reserved) | |||
1842 | struct nvme_fc_ctrl *ctrl = op->ctrl; | 2108 | struct nvme_fc_ctrl *ctrl = op->ctrl; |
1843 | int ret; | 2109 | int ret; |
1844 | 2110 | ||
1845 | if (reserved) | 2111 | if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || |
2112 | atomic_read(&op->state) == FCPOP_STATE_ABORTED) | ||
1846 | return BLK_EH_RESET_TIMER; | 2113 | return BLK_EH_RESET_TIMER; |
1847 | 2114 | ||
1848 | ret = __nvme_fc_abort_op(ctrl, op); | 2115 | ret = __nvme_fc_abort_op(ctrl, op); |
1849 | if (ret) | 2116 | if (ret) |
1850 | /* io wasn't active to abort consider it done */ | 2117 | /* io wasn't active to abort */ |
1851 | return BLK_EH_HANDLED; | 2118 | return BLK_EH_NOT_HANDLED; |
1852 | 2119 | ||
1853 | /* | 2120 | /* |
1854 | * we can't individually ABTS an io without affecting the queue, | 2121 | * we can't individually ABTS an io without affecting the queue, |
@@ -1859,7 +2126,12 @@ nvme_fc_timeout(struct request *rq, bool reserved) | |||
1859 | */ | 2126 | */ |
1860 | nvme_fc_error_recovery(ctrl, "io timeout error"); | 2127 | nvme_fc_error_recovery(ctrl, "io timeout error"); |
1861 | 2128 | ||
1862 | return BLK_EH_HANDLED; | 2129 | /* |
2130 | * the io abort has been initiated. Restart the timer; the abort | ||
2131 | * completion will complete the io shortly. This avoids a | ||
2132 | * synchronous wait while the abort finishes. | ||
2133 | */ | ||
2134 | return BLK_EH_RESET_TIMER; | ||
1863 | } | 2135 | } |
1864 | 2136 | ||
1865 | static int | 2137 | static int |
@@ -2110,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) | |||
2110 | } | 2382 | } |
2111 | 2383 | ||
2112 | static void | 2384 | static void |
2113 | nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | 2385 | nvme_fc_submit_async_event(struct nvme_ctrl *arg) |
2114 | { | 2386 | { |
2115 | struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); | 2387 | struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); |
2116 | struct nvme_fc_fcp_op *aen_op; | 2388 | struct nvme_fc_fcp_op *aen_op; |
@@ -2118,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | |||
2118 | bool terminating = false; | 2390 | bool terminating = false; |
2119 | blk_status_t ret; | 2391 | blk_status_t ret; |
2120 | 2392 | ||
2121 | if (aer_idx > NVME_FC_NR_AEN_COMMANDS) | ||
2122 | return; | ||
2123 | |||
2124 | spin_lock_irqsave(&ctrl->lock, flags); | 2393 | spin_lock_irqsave(&ctrl->lock, flags); |
2125 | if (ctrl->flags & FCCTRL_TERMIO) | 2394 | if (ctrl->flags & FCCTRL_TERMIO) |
2126 | terminating = true; | 2395 | terminating = true; |
@@ -2129,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | |||
2129 | if (terminating) | 2398 | if (terminating) |
2130 | return; | 2399 | return; |
2131 | 2400 | ||
2132 | aen_op = &ctrl->aen_ops[aer_idx]; | 2401 | aen_op = &ctrl->aen_ops[0]; |
2133 | 2402 | ||
2134 | ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, | 2403 | ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, |
2135 | NVMEFC_FCP_NODATA); | 2404 | NVMEFC_FCP_NODATA); |
2136 | if (ret) | 2405 | if (ret) |
2137 | dev_err(ctrl->ctrl.device, | 2406 | dev_err(ctrl->ctrl.device, |
2138 | "failed async event work [%d]\n", aer_idx); | 2407 | "failed async event work\n"); |
2139 | } | 2408 | } |
2140 | 2409 | ||
2141 | static void | 2410 | static void |
@@ -2337,7 +2606,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) | |||
2337 | 2606 | ||
2338 | nvme_fc_init_io_queues(ctrl); | 2607 | nvme_fc_init_io_queues(ctrl); |
2339 | 2608 | ||
2340 | ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); | 2609 | ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); |
2341 | if (ret) | 2610 | if (ret) |
2342 | goto out_free_io_queues; | 2611 | goto out_free_io_queues; |
2343 | 2612 | ||
@@ -2360,6 +2629,61 @@ out_free_io_queues: | |||
2360 | return ret; | 2629 | return ret; |
2361 | } | 2630 | } |
2362 | 2631 | ||
2632 | static void | ||
2633 | nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) | ||
2634 | { | ||
2635 | struct nvme_fc_lport *lport = rport->lport; | ||
2636 | |||
2637 | atomic_inc(&lport->act_rport_cnt); | ||
2638 | } | ||
2639 | |||
2640 | static void | ||
2641 | nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) | ||
2642 | { | ||
2643 | struct nvme_fc_lport *lport = rport->lport; | ||
2644 | u32 cnt; | ||
2645 | |||
2646 | cnt = atomic_dec_return(&lport->act_rport_cnt); | ||
2647 | if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) | ||
2648 | lport->ops->localport_delete(&lport->localport); | ||
2649 | } | ||
2650 | |||
2651 | static int | ||
2652 | nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) | ||
2653 | { | ||
2654 | struct nvme_fc_rport *rport = ctrl->rport; | ||
2655 | u32 cnt; | ||
2656 | |||
2657 | if (ctrl->assoc_active) | ||
2658 | return 1; | ||
2659 | |||
2660 | ctrl->assoc_active = true; | ||
2661 | cnt = atomic_inc_return(&rport->act_ctrl_cnt); | ||
2662 | if (cnt == 1) | ||
2663 | nvme_fc_rport_active_on_lport(rport); | ||
2664 | |||
2665 | return 0; | ||
2666 | } | ||
2667 | |||
2668 | static int | ||
2669 | nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) | ||
2670 | { | ||
2671 | struct nvme_fc_rport *rport = ctrl->rport; | ||
2672 | struct nvme_fc_lport *lport = rport->lport; | ||
2673 | u32 cnt; | ||
2674 | |||
2675 | /* ctrl->assoc_active=false will be set independently */ | ||
2676 | |||
2677 | cnt = atomic_dec_return(&rport->act_ctrl_cnt); | ||
2678 | if (cnt == 0) { | ||
2679 | if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) | ||
2680 | lport->ops->remoteport_delete(&rport->remoteport); | ||
2681 | nvme_fc_rport_inactive_on_lport(rport); | ||
2682 | } | ||
2683 | |||
2684 | return 0; | ||
2685 | } | ||
2686 | |||
2363 | /* | 2687 | /* |
2364 | * This routine restarts the controller on the host side, and | 2688 | * This routine restarts the controller on the host side, and |
2365 | * on the link side, recreates the controller association. | 2689 | * on the link side, recreates the controller association. |
@@ -2368,26 +2692,31 @@ static int | |||
2368 | nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) | 2692 | nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) |
2369 | { | 2693 | { |
2370 | struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; | 2694 | struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; |
2371 | u32 segs; | ||
2372 | int ret; | 2695 | int ret; |
2373 | bool changed; | 2696 | bool changed; |
2374 | 2697 | ||
2375 | ++ctrl->ctrl.nr_reconnects; | 2698 | ++ctrl->ctrl.nr_reconnects; |
2376 | 2699 | ||
2700 | if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) | ||
2701 | return -ENODEV; | ||
2702 | |||
2703 | if (nvme_fc_ctlr_active_on_rport(ctrl)) | ||
2704 | return -ENOTUNIQ; | ||
2705 | |||
2377 | /* | 2706 | /* |
2378 | * Create the admin queue | 2707 | * Create the admin queue |
2379 | */ | 2708 | */ |
2380 | 2709 | ||
2381 | nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); | 2710 | nvme_fc_init_queue(ctrl, 0); |
2382 | 2711 | ||
2383 | ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, | 2712 | ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, |
2384 | NVME_FC_AQ_BLKMQ_DEPTH); | 2713 | NVME_AQ_BLK_MQ_DEPTH); |
2385 | if (ret) | 2714 | if (ret) |
2386 | goto out_free_queue; | 2715 | goto out_free_queue; |
2387 | 2716 | ||
2388 | ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], | 2717 | ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], |
2389 | NVME_FC_AQ_BLKMQ_DEPTH, | 2718 | NVME_AQ_BLK_MQ_DEPTH, |
2390 | (NVME_FC_AQ_BLKMQ_DEPTH / 4)); | 2719 | (NVME_AQ_BLK_MQ_DEPTH / 4)); |
2391 | if (ret) | 2720 | if (ret) |
2392 | goto out_delete_hw_queue; | 2721 | goto out_delete_hw_queue; |
2393 | 2722 | ||
@@ -2419,9 +2748,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) | |||
2419 | if (ret) | 2748 | if (ret) |
2420 | goto out_disconnect_admin_queue; | 2749 | goto out_disconnect_admin_queue; |
2421 | 2750 | ||
2422 | segs = min_t(u32, NVME_FC_MAX_SEGMENTS, | 2751 | ctrl->ctrl.max_hw_sectors = |
2423 | ctrl->lport->ops->max_sgl_segments); | 2752 | (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); |
2424 | ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9); | ||
2425 | 2753 | ||
2426 | ret = nvme_init_identify(&ctrl->ctrl); | 2754 | ret = nvme_init_identify(&ctrl->ctrl); |
2427 | if (ret) | 2755 | if (ret) |
@@ -2465,11 +2793,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) | |||
2465 | } | 2793 | } |
2466 | 2794 | ||
2467 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | 2795 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); |
2468 | WARN_ON_ONCE(!changed); | ||
2469 | 2796 | ||
2470 | ctrl->ctrl.nr_reconnects = 0; | 2797 | ctrl->ctrl.nr_reconnects = 0; |
2471 | 2798 | ||
2472 | nvme_start_ctrl(&ctrl->ctrl); | 2799 | if (changed) |
2800 | nvme_start_ctrl(&ctrl->ctrl); | ||
2473 | 2801 | ||
2474 | return 0; /* Success */ | 2802 | return 0; /* Success */ |
2475 | 2803 | ||
@@ -2482,6 +2810,8 @@ out_delete_hw_queue: | |||
2482 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); | 2810 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); |
2483 | out_free_queue: | 2811 | out_free_queue: |
2484 | nvme_fc_free_queue(&ctrl->queues[0]); | 2812 | nvme_fc_free_queue(&ctrl->queues[0]); |
2813 | ctrl->assoc_active = false; | ||
2814 | nvme_fc_ctlr_inactive_on_rport(ctrl); | ||
2485 | 2815 | ||
2486 | return ret; | 2816 | return ret; |
2487 | } | 2817 | } |
@@ -2497,6 +2827,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) | |||
2497 | { | 2827 | { |
2498 | unsigned long flags; | 2828 | unsigned long flags; |
2499 | 2829 | ||
2830 | if (!ctrl->assoc_active) | ||
2831 | return; | ||
2832 | ctrl->assoc_active = false; | ||
2833 | |||
2500 | spin_lock_irqsave(&ctrl->lock, flags); | 2834 | spin_lock_irqsave(&ctrl->lock, flags); |
2501 | ctrl->flags |= FCCTRL_TERMIO; | 2835 | ctrl->flags |= FCCTRL_TERMIO; |
2502 | ctrl->iocnt = 0; | 2836 | ctrl->iocnt = 0; |
@@ -2537,7 +2871,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) | |||
2537 | * use blk_mq_tagset_busy_itr() and the transport routine to | 2871 | * use blk_mq_tagset_busy_itr() and the transport routine to |
2538 | * terminate the exchanges. | 2872 | * terminate the exchanges. |
2539 | */ | 2873 | */ |
2540 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | 2874 | if (ctrl->ctrl.state != NVME_CTRL_NEW) |
2875 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
2541 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, | 2876 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, |
2542 | nvme_fc_terminate_exchange, &ctrl->ctrl); | 2877 | nvme_fc_terminate_exchange, &ctrl->ctrl); |
2543 | 2878 | ||
@@ -2568,102 +2903,64 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) | |||
2568 | 2903 | ||
2569 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); | 2904 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); |
2570 | nvme_fc_free_queue(&ctrl->queues[0]); | 2905 | nvme_fc_free_queue(&ctrl->queues[0]); |
2906 | |||
2907 | nvme_fc_ctlr_inactive_on_rport(ctrl); | ||
2571 | } | 2908 | } |
2572 | 2909 | ||
2573 | static void | 2910 | static void |
2574 | nvme_fc_delete_ctrl_work(struct work_struct *work) | 2911 | nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) |
2575 | { | 2912 | { |
2576 | struct nvme_fc_ctrl *ctrl = | 2913 | struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); |
2577 | container_of(work, struct nvme_fc_ctrl, delete_work); | ||
2578 | 2914 | ||
2579 | cancel_work_sync(&ctrl->ctrl.reset_work); | ||
2580 | cancel_delayed_work_sync(&ctrl->connect_work); | 2915 | cancel_delayed_work_sync(&ctrl->connect_work); |
2581 | nvme_stop_ctrl(&ctrl->ctrl); | ||
2582 | nvme_remove_namespaces(&ctrl->ctrl); | ||
2583 | /* | 2916 | /* |
2584 | * kill the association on the link side. this will block | 2917 | * kill the association on the link side. this will block |
2585 | * waiting for io to terminate | 2918 | * waiting for io to terminate |
2586 | */ | 2919 | */ |
2587 | nvme_fc_delete_association(ctrl); | 2920 | nvme_fc_delete_association(ctrl); |
2588 | |||
2589 | /* | ||
2590 | * tear down the controller | ||
2591 | * After the last reference on the nvme ctrl is removed, | ||
2592 | * the transport nvme_fc_nvme_ctrl_freed() callback will be | ||
2593 | * invoked. From there, the transport will tear down it's | ||
2594 | * logical queues and association. | ||
2595 | */ | ||
2596 | nvme_uninit_ctrl(&ctrl->ctrl); | ||
2597 | |||
2598 | nvme_put_ctrl(&ctrl->ctrl); | ||
2599 | } | ||
2600 | |||
2601 | static bool | ||
2602 | __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) | ||
2603 | { | ||
2604 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) | ||
2605 | return true; | ||
2606 | |||
2607 | if (!queue_work(nvme_wq, &ctrl->delete_work)) | ||
2608 | return true; | ||
2609 | |||
2610 | return false; | ||
2611 | } | ||
2612 | |||
2613 | static int | ||
2614 | __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) | ||
2615 | { | ||
2616 | return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; | ||
2617 | } | ||
2618 | |||
2619 | /* | ||
2620 | * Request from nvme core layer to delete the controller | ||
2621 | */ | ||
2622 | static int | ||
2623 | nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) | ||
2624 | { | ||
2625 | struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); | ||
2626 | int ret; | ||
2627 | |||
2628 | if (!kref_get_unless_zero(&ctrl->ctrl.kref)) | ||
2629 | return -EBUSY; | ||
2630 | |||
2631 | ret = __nvme_fc_del_ctrl(ctrl); | ||
2632 | |||
2633 | if (!ret) | ||
2634 | flush_workqueue(nvme_wq); | ||
2635 | |||
2636 | nvme_put_ctrl(&ctrl->ctrl); | ||
2637 | |||
2638 | return ret; | ||
2639 | } | 2921 | } |
2640 | 2922 | ||
2641 | static void | 2923 | static void |
2642 | nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) | 2924 | nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) |
2643 | { | 2925 | { |
2644 | /* If we are resetting/deleting then do nothing */ | 2926 | struct nvme_fc_rport *rport = ctrl->rport; |
2645 | if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { | 2927 | struct nvme_fc_remote_port *portptr = &rport->remoteport; |
2646 | WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || | 2928 | unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; |
2647 | ctrl->ctrl.state == NVME_CTRL_LIVE); | 2929 | bool recon = true; |
2648 | return; | ||
2649 | } | ||
2650 | 2930 | ||
2651 | dev_info(ctrl->ctrl.device, | 2931 | if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) |
2652 | "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", | 2932 | return; |
2653 | ctrl->cnum, status); | ||
2654 | 2933 | ||
2655 | if (nvmf_should_reconnect(&ctrl->ctrl)) { | 2934 | if (portptr->port_state == FC_OBJSTATE_ONLINE) |
2656 | dev_info(ctrl->ctrl.device, | 2935 | dev_info(ctrl->ctrl.device, |
2657 | "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", | 2936 | "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", |
2658 | ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); | 2937 | ctrl->cnum, status); |
2659 | queue_delayed_work(nvme_wq, &ctrl->connect_work, | 2938 | else if (time_after_eq(jiffies, rport->dev_loss_end)) |
2660 | ctrl->ctrl.opts->reconnect_delay * HZ); | 2939 | recon = false; |
2940 | |||
2941 | if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { | ||
2942 | if (portptr->port_state == FC_OBJSTATE_ONLINE) | ||
2943 | dev_info(ctrl->ctrl.device, | ||
2944 | "NVME-FC{%d}: Reconnect attempt in %ld " | ||
2945 | "seconds\n", | ||
2946 | ctrl->cnum, recon_delay / HZ); | ||
2947 | else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) | ||
2948 | recon_delay = rport->dev_loss_end - jiffies; | ||
2949 | |||
2950 | queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); | ||
2661 | } else { | 2951 | } else { |
2662 | dev_warn(ctrl->ctrl.device, | 2952 | if (portptr->port_state == FC_OBJSTATE_ONLINE) |
2953 | dev_warn(ctrl->ctrl.device, | ||
2663 | "NVME-FC{%d}: Max reconnect attempts (%d) " | 2954 | "NVME-FC{%d}: Max reconnect attempts (%d) " |
2664 | "reached. Removing controller\n", | 2955 | "reached. Removing controller\n", |
2665 | ctrl->cnum, ctrl->ctrl.nr_reconnects); | 2956 | ctrl->cnum, ctrl->ctrl.nr_reconnects); |
2666 | WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); | 2957 | else |
2958 | dev_warn(ctrl->ctrl.device, | ||
2959 | "NVME-FC{%d}: dev_loss_tmo (%d) expired " | ||
2960 | "while waiting for remoteport connectivity. " | ||
2961 | "Removing controller\n", ctrl->cnum, | ||
2962 | portptr->dev_loss_tmo); | ||
2963 | WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); | ||
2667 | } | 2964 | } |
2668 | } | 2965 | } |
2669 | 2966 | ||
@@ -2675,15 +2972,28 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) | |||
2675 | int ret; | 2972 | int ret; |
2676 | 2973 | ||
2677 | nvme_stop_ctrl(&ctrl->ctrl); | 2974 | nvme_stop_ctrl(&ctrl->ctrl); |
2975 | |||
2678 | /* will block while waiting for io to terminate */ | 2976 | /* will block while waiting for io to terminate */ |
2679 | nvme_fc_delete_association(ctrl); | 2977 | nvme_fc_delete_association(ctrl); |
2680 | 2978 | ||
2681 | ret = nvme_fc_create_association(ctrl); | 2979 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { |
2980 | dev_err(ctrl->ctrl.device, | ||
2981 | "NVME-FC{%d}: error_recovery: Couldn't change state " | ||
2982 | "to RECONNECTING\n", ctrl->cnum); | ||
2983 | return; | ||
2984 | } | ||
2985 | |||
2986 | if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) | ||
2987 | ret = nvme_fc_create_association(ctrl); | ||
2988 | else | ||
2989 | ret = -ENOTCONN; | ||
2990 | |||
2682 | if (ret) | 2991 | if (ret) |
2683 | nvme_fc_reconnect_or_delete(ctrl, ret); | 2992 | nvme_fc_reconnect_or_delete(ctrl, ret); |
2684 | else | 2993 | else |
2685 | dev_info(ctrl->ctrl.device, | 2994 | dev_info(ctrl->ctrl.device, |
2686 | "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); | 2995 | "NVME-FC{%d}: controller reset complete\n", |
2996 | ctrl->cnum); | ||
2687 | } | 2997 | } |
2688 | 2998 | ||
2689 | static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { | 2999 | static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { |
@@ -2695,8 +3005,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { | |||
2695 | .reg_write32 = nvmf_reg_write32, | 3005 | .reg_write32 = nvmf_reg_write32, |
2696 | .free_ctrl = nvme_fc_nvme_ctrl_freed, | 3006 | .free_ctrl = nvme_fc_nvme_ctrl_freed, |
2697 | .submit_async_event = nvme_fc_submit_async_event, | 3007 | .submit_async_event = nvme_fc_submit_async_event, |
2698 | .delete_ctrl = nvme_fc_del_nvme_ctrl, | 3008 | .delete_ctrl = nvme_fc_delete_ctrl, |
2699 | .get_address = nvmf_get_address, | 3009 | .get_address = nvmf_get_address, |
3010 | .reinit_request = nvme_fc_reinit_request, | ||
2700 | }; | 3011 | }; |
2701 | 3012 | ||
2702 | static void | 3013 | static void |
@@ -2728,6 +3039,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { | |||
2728 | }; | 3039 | }; |
2729 | 3040 | ||
2730 | 3041 | ||
3042 | /* | ||
3043 | * Fails a controller request if it matches an existing controller | ||
3044 | * (association) with the same tuple: | ||
3045 | * <Host NQN, Host ID, local FC port, remote FC port, SUBSYS NQN> | ||
3046 | * | ||
3047 | * The ports don't need to be compared as they are intrinsically | ||
3048 | * already matched by the port pointers supplied. | ||
3049 | */ | ||
3050 | static bool | ||
3051 | nvme_fc_existing_controller(struct nvme_fc_rport *rport, | ||
3052 | struct nvmf_ctrl_options *opts) | ||
3053 | { | ||
3054 | struct nvme_fc_ctrl *ctrl; | ||
3055 | unsigned long flags; | ||
3056 | bool found = false; | ||
3057 | |||
3058 | spin_lock_irqsave(&rport->lock, flags); | ||
3059 | list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { | ||
3060 | found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); | ||
3061 | if (found) | ||
3062 | break; | ||
3063 | } | ||
3064 | spin_unlock_irqrestore(&rport->lock, flags); | ||
3065 | |||
3066 | return found; | ||
3067 | } | ||
3068 | |||
2731 | static struct nvme_ctrl * | 3069 | static struct nvme_ctrl * |
2732 | nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | 3070 | nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, |
2733 | struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) | 3071 | struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) |
@@ -2742,6 +3080,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | |||
2742 | goto out_fail; | 3080 | goto out_fail; |
2743 | } | 3081 | } |
2744 | 3082 | ||
3083 | if (!opts->duplicate_connect && | ||
3084 | nvme_fc_existing_controller(rport, opts)) { | ||
3085 | ret = -EALREADY; | ||
3086 | goto out_fail; | ||
3087 | } | ||
3088 | |||
2745 | ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); | 3089 | ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); |
2746 | if (!ctrl) { | 3090 | if (!ctrl) { |
2747 | ret = -ENOMEM; | 3091 | ret = -ENOMEM; |
@@ -2760,12 +3104,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | |||
2760 | ctrl->rport = rport; | 3104 | ctrl->rport = rport; |
2761 | ctrl->dev = lport->dev; | 3105 | ctrl->dev = lport->dev; |
2762 | ctrl->cnum = idx; | 3106 | ctrl->cnum = idx; |
3107 | ctrl->assoc_active = false; | ||
2763 | init_waitqueue_head(&ctrl->ioabort_wait); | 3108 | init_waitqueue_head(&ctrl->ioabort_wait); |
2764 | 3109 | ||
2765 | get_device(ctrl->dev); | 3110 | get_device(ctrl->dev); |
2766 | kref_init(&ctrl->ref); | 3111 | kref_init(&ctrl->ref); |
2767 | 3112 | ||
2768 | INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); | ||
2769 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); | 3113 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); |
2770 | INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); | 3114 | INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); |
2771 | spin_lock_init(&ctrl->lock); | 3115 | spin_lock_init(&ctrl->lock); |
@@ -2787,7 +3131,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | |||
2787 | 3131 | ||
2788 | memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); | 3132 | memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); |
2789 | ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; | 3133 | ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; |
2790 | ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; | 3134 | ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; |
2791 | ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ | 3135 | ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ |
2792 | ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; | 3136 | ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; |
2793 | ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + | 3137 | ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + |
@@ -2797,6 +3141,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | |||
2797 | ctrl->admin_tag_set.driver_data = ctrl; | 3141 | ctrl->admin_tag_set.driver_data = ctrl; |
2798 | ctrl->admin_tag_set.nr_hw_queues = 1; | 3142 | ctrl->admin_tag_set.nr_hw_queues = 1; |
2799 | ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; | 3143 | ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; |
3144 | ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; | ||
2800 | 3145 | ||
2801 | ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); | 3146 | ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); |
2802 | if (ret) | 3147 | if (ret) |
@@ -2878,7 +3223,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, | |||
2878 | return ERR_PTR(ret); | 3223 | return ERR_PTR(ret); |
2879 | } | 3224 | } |
2880 | 3225 | ||
2881 | kref_get(&ctrl->ctrl.kref); | 3226 | nvme_get_ctrl(&ctrl->ctrl); |
2882 | 3227 | ||
2883 | dev_info(ctrl->ctrl.device, | 3228 | dev_info(ctrl->ctrl.device, |
2884 | "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", | 3229 | "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", |
@@ -3026,7 +3371,50 @@ static struct nvmf_transport_ops nvme_fc_transport = { | |||
3026 | 3371 | ||
3027 | static int __init nvme_fc_init_module(void) | 3372 | static int __init nvme_fc_init_module(void) |
3028 | { | 3373 | { |
3029 | return nvmf_register_transport(&nvme_fc_transport); | 3374 | int ret; |
3375 | |||
3376 | /* | ||
3377 | * NOTE: | ||
3378 | * It is expected that in the future the kernel will combine | ||
3379 | * the FC-isms that are currently under scsi and now being | ||
3380 | * added to by NVME into a new standalone FC class. The SCSI | ||
3381 | * and NVME protocols and their devices would be under this | ||
3382 | * new FC class. | ||
3383 | * | ||
3384 | * As we need something to post FC-specific udev events to, | ||
3385 | * specifically for nvme probe events, start by creating the | ||
3386 | * new device class. When the new standalone FC class is | ||
3387 | * put in place, this code will move to a more generic | ||
3388 | * location for the class. | ||
3389 | */ | ||
3390 | fc_class = class_create(THIS_MODULE, "fc"); | ||
3391 | if (IS_ERR(fc_class)) { | ||
3392 | pr_err("couldn't register class fc\n"); | ||
3393 | return PTR_ERR(fc_class); | ||
3394 | } | ||
3395 | |||
3396 | /* | ||
3397 | * Create a device for the FC-centric udev events | ||
3398 | */ | ||
3399 | fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL, | ||
3400 | "fc_udev_device"); | ||
3401 | if (IS_ERR(fc_udev_device)) { | ||
3402 | pr_err("couldn't create fc_udev device!\n"); | ||
3403 | ret = PTR_ERR(fc_udev_device); | ||
3404 | goto out_destroy_class; | ||
3405 | } | ||
3406 | |||
3407 | ret = nvmf_register_transport(&nvme_fc_transport); | ||
3408 | if (ret) | ||
3409 | goto out_destroy_device; | ||
3410 | |||
3411 | return 0; | ||
3412 | |||
3413 | out_destroy_device: | ||
3414 | device_destroy(fc_class, MKDEV(0, 0)); | ||
3415 | out_destroy_class: | ||
3416 | class_destroy(fc_class); | ||
3417 | return ret; | ||
3030 | } | 3418 | } |
3031 | 3419 | ||
3032 | static void __exit nvme_fc_exit_module(void) | 3420 | static void __exit nvme_fc_exit_module(void) |
@@ -3039,6 +3427,9 @@ static void __exit nvme_fc_exit_module(void) | |||
3039 | 3427 | ||
3040 | ida_destroy(&nvme_fc_local_port_cnt); | 3428 | ida_destroy(&nvme_fc_local_port_cnt); |
3041 | ida_destroy(&nvme_fc_ctrl_cnt); | 3429 | ida_destroy(&nvme_fc_ctrl_cnt); |
3430 | |||
3431 | device_destroy(fc_class, MKDEV(0, 0)); | ||
3432 | class_destroy(fc_class); | ||
3042 | } | 3433 | } |
3043 | 3434 | ||
3044 | module_init(nvme_fc_init_module); | 3435 | module_init(nvme_fc_init_module); |
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1f79e3f141e6..ba3d7f3349e5 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c | |||
@@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) | |||
305 | int ret; | 305 | int ret; |
306 | 306 | ||
307 | c.identity.opcode = nvme_nvm_admin_identity; | 307 | c.identity.opcode = nvme_nvm_admin_identity; |
308 | c.identity.nsid = cpu_to_le32(ns->ns_id); | 308 | c.identity.nsid = cpu_to_le32(ns->head->ns_id); |
309 | c.identity.chnl_off = 0; | 309 | c.identity.chnl_off = 0; |
310 | 310 | ||
311 | nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); | 311 | nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); |
@@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, | |||
344 | int ret = 0; | 344 | int ret = 0; |
345 | 345 | ||
346 | c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; | 346 | c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; |
347 | c.l2p.nsid = cpu_to_le32(ns->ns_id); | 347 | c.l2p.nsid = cpu_to_le32(ns->head->ns_id); |
348 | entries = kmalloc(len, GFP_KERNEL); | 348 | entries = kmalloc(len, GFP_KERNEL); |
349 | if (!entries) | 349 | if (!entries) |
350 | return -ENOMEM; | 350 | return -ENOMEM; |
@@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, | |||
402 | int ret = 0; | 402 | int ret = 0; |
403 | 403 | ||
404 | c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; | 404 | c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; |
405 | c.get_bb.nsid = cpu_to_le32(ns->ns_id); | 405 | c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); |
406 | c.get_bb.spba = cpu_to_le64(ppa.ppa); | 406 | c.get_bb.spba = cpu_to_le64(ppa.ppa); |
407 | 407 | ||
408 | bb_tbl = kzalloc(tblsz, GFP_KERNEL); | 408 | bb_tbl = kzalloc(tblsz, GFP_KERNEL); |
@@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, | |||
452 | int ret = 0; | 452 | int ret = 0; |
453 | 453 | ||
454 | c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; | 454 | c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; |
455 | c.set_bb.nsid = cpu_to_le32(ns->ns_id); | 455 | c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); |
456 | c.set_bb.spba = cpu_to_le64(ppas->ppa); | 456 | c.set_bb.spba = cpu_to_le64(ppas->ppa); |
457 | c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); | 457 | c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); |
458 | c.set_bb.value = type; | 458 | c.set_bb.value = type; |
@@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, | |||
469 | struct nvme_nvm_command *c) | 469 | struct nvme_nvm_command *c) |
470 | { | 470 | { |
471 | c->ph_rw.opcode = rqd->opcode; | 471 | c->ph_rw.opcode = rqd->opcode; |
472 | c->ph_rw.nsid = cpu_to_le32(ns->ns_id); | 472 | c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); |
473 | c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); | 473 | c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); |
474 | c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); | 474 | c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); |
475 | c->ph_rw.control = cpu_to_le16(rqd->flags); | 475 | c->ph_rw.control = cpu_to_le16(rqd->flags); |
@@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status) | |||
492 | blk_mq_free_request(rq); | 492 | blk_mq_free_request(rq); |
493 | } | 493 | } |
494 | 494 | ||
495 | static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) | 495 | static struct request *nvme_nvm_alloc_request(struct request_queue *q, |
496 | struct nvm_rq *rqd, | ||
497 | struct nvme_nvm_command *cmd) | ||
496 | { | 498 | { |
497 | struct request_queue *q = dev->q; | ||
498 | struct nvme_ns *ns = q->queuedata; | 499 | struct nvme_ns *ns = q->queuedata; |
499 | struct request *rq; | 500 | struct request *rq; |
500 | struct bio *bio = rqd->bio; | ||
501 | struct nvme_nvm_command *cmd; | ||
502 | |||
503 | cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); | ||
504 | if (!cmd) | ||
505 | return -ENOMEM; | ||
506 | 501 | ||
507 | nvme_nvm_rqtocmd(rqd, ns, cmd); | 502 | nvme_nvm_rqtocmd(rqd, ns, cmd); |
508 | 503 | ||
509 | rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); | 504 | rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); |
510 | if (IS_ERR(rq)) { | 505 | if (IS_ERR(rq)) |
511 | kfree(cmd); | 506 | return rq; |
512 | return PTR_ERR(rq); | 507 | |
513 | } | ||
514 | rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; | 508 | rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; |
515 | 509 | ||
516 | if (bio) { | 510 | if (rqd->bio) { |
517 | blk_init_request_from_bio(rq, bio); | 511 | blk_init_request_from_bio(rq, rqd->bio); |
518 | } else { | 512 | } else { |
519 | rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); | 513 | rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
520 | rq->__data_len = 0; | 514 | rq->__data_len = 0; |
521 | } | 515 | } |
522 | 516 | ||
517 | return rq; | ||
518 | } | ||
519 | |||
520 | static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) | ||
521 | { | ||
522 | struct request_queue *q = dev->q; | ||
523 | struct nvme_nvm_command *cmd; | ||
524 | struct request *rq; | ||
525 | |||
526 | cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); | ||
527 | if (!cmd) | ||
528 | return -ENOMEM; | ||
529 | |||
530 | rq = nvme_nvm_alloc_request(q, rqd, cmd); | ||
531 | if (IS_ERR(rq)) { | ||
532 | kfree(cmd); | ||
533 | return PTR_ERR(rq); | ||
534 | } | ||
535 | |||
523 | rq->end_io_data = rqd; | 536 | rq->end_io_data = rqd; |
524 | 537 | ||
525 | blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); | 538 | blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); |
@@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) | |||
527 | return 0; | 540 | return 0; |
528 | } | 541 | } |
529 | 542 | ||
543 | static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) | ||
544 | { | ||
545 | struct request_queue *q = dev->q; | ||
546 | struct request *rq; | ||
547 | struct nvme_nvm_command cmd; | ||
548 | int ret = 0; | ||
549 | |||
550 | memset(&cmd, 0, sizeof(struct nvme_nvm_command)); | ||
551 | |||
552 | rq = nvme_nvm_alloc_request(q, rqd, &cmd); | ||
553 | if (IS_ERR(rq)) | ||
554 | return PTR_ERR(rq); | ||
555 | |||
556 | /* I/Os can fail and the error is signaled through rqd. Callers must | ||
557 | * handle the error accordingly. | ||
558 | */ | ||
559 | blk_execute_rq(q, NULL, rq, 0); | ||
560 | if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) | ||
561 | ret = -EINTR; | ||
562 | |||
563 | rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); | ||
564 | rqd->error = nvme_req(rq)->status; | ||
565 | |||
566 | blk_mq_free_request(rq); | ||
567 | |||
568 | return ret; | ||
569 | } | ||
570 | |||
530 | static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) | 571 | static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) |
531 | { | 572 | { |
532 | struct nvme_ns *ns = nvmdev->q->queuedata; | 573 | struct nvme_ns *ns = nvmdev->q->queuedata; |
@@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { | |||
562 | .set_bb_tbl = nvme_nvm_set_bb_tbl, | 603 | .set_bb_tbl = nvme_nvm_set_bb_tbl, |
563 | 604 | ||
564 | .submit_io = nvme_nvm_submit_io, | 605 | .submit_io = nvme_nvm_submit_io, |
606 | .submit_io_sync = nvme_nvm_submit_io_sync, | ||
565 | 607 | ||
566 | .create_dma_pool = nvme_nvm_create_dma_pool, | 608 | .create_dma_pool = nvme_nvm_create_dma_pool, |
567 | .destroy_dma_pool = nvme_nvm_destroy_dma_pool, | 609 | .destroy_dma_pool = nvme_nvm_destroy_dma_pool, |
@@ -600,8 +642,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, | |||
600 | 642 | ||
601 | rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; | 643 | rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; |
602 | 644 | ||
603 | rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; | ||
604 | |||
605 | if (ppa_buf && ppa_len) { | 645 | if (ppa_buf && ppa_len) { |
606 | ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); | 646 | ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); |
607 | if (!ppa_list) { | 647 | if (!ppa_list) { |
@@ -691,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns, | |||
691 | 731 | ||
692 | memset(&c, 0, sizeof(c)); | 732 | memset(&c, 0, sizeof(c)); |
693 | c.ph_rw.opcode = vio.opcode; | 733 | c.ph_rw.opcode = vio.opcode; |
694 | c.ph_rw.nsid = cpu_to_le32(ns->ns_id); | 734 | c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); |
695 | c.ph_rw.control = cpu_to_le16(vio.control); | 735 | c.ph_rw.control = cpu_to_le16(vio.control); |
696 | c.ph_rw.length = cpu_to_le16(vio.nppas); | 736 | c.ph_rw.length = cpu_to_le16(vio.nppas); |
697 | 737 | ||
@@ -728,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, | |||
728 | 768 | ||
729 | memset(&c, 0, sizeof(c)); | 769 | memset(&c, 0, sizeof(c)); |
730 | c.common.opcode = vcmd.opcode; | 770 | c.common.opcode = vcmd.opcode; |
731 | c.common.nsid = cpu_to_le32(ns->ns_id); | 771 | c.common.nsid = cpu_to_le32(ns->head->ns_id); |
732 | c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); | 772 | c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); |
733 | c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); | 773 | c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); |
734 | /* cdw11-12 */ | 774 | /* cdw11-12 */ |
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c new file mode 100644 index 000000000000..78d92151a904 --- /dev/null +++ b/drivers/nvme/host/multipath.c | |||
@@ -0,0 +1,291 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017 Christoph Hellwig. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | */ | ||
13 | |||
14 | #include <linux/moduleparam.h> | ||
15 | #include "nvme.h" | ||
16 | |||
17 | static bool multipath = true; | ||
18 | module_param(multipath, bool, 0644); | ||
19 | MODULE_PARM_DESC(multipath, | ||
20 | "turn on native support for multiple controllers per subsystem"); | ||
21 | |||
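Because the parameter is registered with mode 0644 it should also be writable at runtime; with multipath.c built into the nvme-core module that would presumably surface as /sys/module/nvme_core/parameters/multipath. Treat the exact path as an assumption based on standard module-parameter behaviour rather than something this file spells out.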
22 | void nvme_failover_req(struct request *req) | ||
23 | { | ||
24 | struct nvme_ns *ns = req->q->queuedata; | ||
25 | unsigned long flags; | ||
26 | |||
27 | spin_lock_irqsave(&ns->head->requeue_lock, flags); | ||
28 | blk_steal_bios(&ns->head->requeue_list, req); | ||
29 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); | ||
30 | blk_mq_end_request(req, 0); | ||
31 | |||
32 | nvme_reset_ctrl(ns->ctrl); | ||
33 | kblockd_schedule_work(&ns->head->requeue_work); | ||
34 | } | ||
35 | |||
36 | bool nvme_req_needs_failover(struct request *req) | ||
37 | { | ||
38 | if (!(req->cmd_flags & REQ_NVME_MPATH)) | ||
39 | return false; | ||
40 | |||
41 | switch (nvme_req(req)->status & 0x7ff) { | ||
42 | /* | ||
43 | * Generic command status: | ||
44 | */ | ||
45 | case NVME_SC_INVALID_OPCODE: | ||
46 | case NVME_SC_INVALID_FIELD: | ||
47 | case NVME_SC_INVALID_NS: | ||
48 | case NVME_SC_LBA_RANGE: | ||
49 | case NVME_SC_CAP_EXCEEDED: | ||
50 | case NVME_SC_RESERVATION_CONFLICT: | ||
51 | return false; | ||
52 | |||
53 | /* | ||
54 | * I/O command set specific error. Unfortunately these values are | ||
55 | * reused for fabrics commands, but those should never get here. | ||
56 | */ | ||
57 | case NVME_SC_BAD_ATTRIBUTES: | ||
58 | case NVME_SC_INVALID_PI: | ||
59 | case NVME_SC_READ_ONLY: | ||
60 | case NVME_SC_ONCS_NOT_SUPPORTED: | ||
61 | WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == | ||
62 | nvme_fabrics_command); | ||
63 | return false; | ||
64 | |||
65 | /* | ||
66 | * Media and Data Integrity Errors: | ||
67 | */ | ||
68 | case NVME_SC_WRITE_FAULT: | ||
69 | case NVME_SC_READ_ERROR: | ||
70 | case NVME_SC_GUARD_CHECK: | ||
71 | case NVME_SC_APPTAG_CHECK: | ||
72 | case NVME_SC_REFTAG_CHECK: | ||
73 | case NVME_SC_COMPARE_FAILED: | ||
74 | case NVME_SC_ACCESS_DENIED: | ||
75 | case NVME_SC_UNWRITTEN_BLOCK: | ||
76 | return false; | ||
77 | } | ||
78 | |||
79 | /* Everything else could be a path failure, so should be retried */ | ||
80 | return true; | ||
81 | } | ||
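For context, an approximate sketch of how these two helpers are consumed from the request completion path; the actual hook lives in core.c outside this file, so take the exact shape as illustrative:

	/* approximate shape of the completion-side check */
	if (unlikely(nvme_req(req)->status && nvme_req_needs_failover(req))) {
		nvme_failover_req(req);	/* steal the bios and requeue them on the head */
		return;
	}
	/* otherwise complete the request normally */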
82 | |||
83 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | ||
84 | { | ||
85 | struct nvme_ns *ns; | ||
86 | |||
87 | mutex_lock(&ctrl->namespaces_mutex); | ||
88 | list_for_each_entry(ns, &ctrl->namespaces, list) { | ||
89 | if (ns->head->disk) | ||
90 | kblockd_schedule_work(&ns->head->requeue_work); | ||
91 | } | ||
92 | mutex_unlock(&ctrl->namespaces_mutex); | ||
93 | } | ||
94 | |||
95 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) | ||
96 | { | ||
97 | struct nvme_ns *ns; | ||
98 | |||
99 | list_for_each_entry_rcu(ns, &head->list, siblings) { | ||
100 | if (ns->ctrl->state == NVME_CTRL_LIVE) { | ||
101 | rcu_assign_pointer(head->current_path, ns); | ||
102 | return ns; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) | ||
110 | { | ||
111 | struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); | ||
112 | |||
113 | if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) | ||
114 | ns = __nvme_find_path(head); | ||
115 | return ns; | ||
116 | } | ||
117 | |||
118 | static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, | ||
119 | struct bio *bio) | ||
120 | { | ||
121 | struct nvme_ns_head *head = q->queuedata; | ||
122 | struct device *dev = disk_to_dev(head->disk); | ||
123 | struct nvme_ns *ns; | ||
124 | blk_qc_t ret = BLK_QC_T_NONE; | ||
125 | int srcu_idx; | ||
126 | |||
127 | srcu_idx = srcu_read_lock(&head->srcu); | ||
128 | ns = nvme_find_path(head); | ||
129 | if (likely(ns)) { | ||
130 | bio->bi_disk = ns->disk; | ||
131 | bio->bi_opf |= REQ_NVME_MPATH; | ||
132 | ret = direct_make_request(bio); | ||
133 | } else if (!list_empty_careful(&head->list)) { | ||
134 | dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); | ||
135 | |||
136 | spin_lock_irq(&head->requeue_lock); | ||
137 | bio_list_add(&head->requeue_list, bio); | ||
138 | spin_unlock_irq(&head->requeue_lock); | ||
139 | } else { | ||
140 | dev_warn_ratelimited(dev, "no path - failing I/O\n"); | ||
141 | |||
142 | bio->bi_status = BLK_STS_IOERR; | ||
143 | bio_endio(bio); | ||
144 | } | ||
145 | |||
146 | srcu_read_unlock(&head->srcu, srcu_idx); | ||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) | ||
151 | { | ||
152 | struct nvme_ns_head *head = q->queuedata; | ||
153 | struct nvme_ns *ns; | ||
154 | bool found = false; | ||
155 | int srcu_idx; | ||
156 | |||
157 | srcu_idx = srcu_read_lock(&head->srcu); | ||
158 | ns = srcu_dereference(head->current_path, &head->srcu); | ||
159 | if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) | ||
160 | found = ns->queue->poll_fn(q, qc); | ||
161 | srcu_read_unlock(&head->srcu, srcu_idx); | ||
162 | return found; | ||
163 | } | ||
164 | |||
165 | static void nvme_requeue_work(struct work_struct *work) | ||
166 | { | ||
167 | struct nvme_ns_head *head = | ||
168 | container_of(work, struct nvme_ns_head, requeue_work); | ||
169 | struct bio *bio, *next; | ||
170 | |||
171 | spin_lock_irq(&head->requeue_lock); | ||
172 | next = bio_list_get(&head->requeue_list); | ||
173 | spin_unlock_irq(&head->requeue_lock); | ||
174 | |||
175 | while ((bio = next) != NULL) { | ||
176 | next = bio->bi_next; | ||
177 | bio->bi_next = NULL; | ||
178 | |||
179 | /* | ||
180 | * Reset disk to the mpath node and resubmit to select a new | ||
181 | * path. | ||
182 | */ | ||
183 | bio->bi_disk = head->disk; | ||
184 | generic_make_request(bio); | ||
185 | } | ||
186 | } | ||
187 | |||
188 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) | ||
189 | { | ||
190 | struct request_queue *q; | ||
191 | bool vwc = false; | ||
192 | |||
193 | bio_list_init(&head->requeue_list); | ||
194 | spin_lock_init(&head->requeue_lock); | ||
195 | INIT_WORK(&head->requeue_work, nvme_requeue_work); | ||
196 | |||
197 | /* | ||
198 | * Add a multipath node if the subsystem supports multiple controllers. | ||
199 | * We also do this for private namespaces as the namespace sharing data could | ||
200 | * change after a rescan. | ||
201 | */ | ||
202 | if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) | ||
203 | return 0; | ||
204 | |||
205 | q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); | ||
206 | if (!q) | ||
207 | goto out; | ||
208 | q->queuedata = head; | ||
209 | blk_queue_make_request(q, nvme_ns_head_make_request); | ||
210 | q->poll_fn = nvme_ns_head_poll; | ||
211 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); | ||
212 | /* set to a default value of 512 until the disk is validated */ | ||
213 | blk_queue_logical_block_size(q, 512); | ||
214 | |||
215 | /* we need to propagate up the VWC settings */ | ||
216 | if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) | ||
217 | vwc = true; | ||
218 | blk_queue_write_cache(q, vwc, vwc); | ||
219 | |||
220 | head->disk = alloc_disk(0); | ||
221 | if (!head->disk) | ||
222 | goto out_cleanup_queue; | ||
223 | head->disk->fops = &nvme_ns_head_ops; | ||
224 | head->disk->private_data = head; | ||
225 | head->disk->queue = q; | ||
226 | head->disk->flags = GENHD_FL_EXT_DEVT; | ||
227 | sprintf(head->disk->disk_name, "nvme%dn%d", | ||
228 | ctrl->subsys->instance, head->instance); | ||
229 | return 0; | ||
230 | |||
231 | out_cleanup_queue: | ||
232 | blk_cleanup_queue(q); | ||
233 | out: | ||
234 | return -ENOMEM; | ||
235 | } | ||
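Worked naming example from the sprintf() above: with ctrl->subsys->instance == 0 and head->instance == 1, the shared multipath node is registered as nvme0n1, i.e. the name is derived from the subsystem rather than from any individual controller.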
236 | |||
237 | void nvme_mpath_add_disk(struct nvme_ns_head *head) | ||
238 | { | ||
239 | if (!head->disk) | ||
240 | return; | ||
241 | device_add_disk(&head->subsys->dev, head->disk); | ||
242 | if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, | ||
243 | &nvme_ns_id_attr_group)) | ||
244 | pr_warn("%s: failed to create sysfs group for identification\n", | ||
245 | head->disk->disk_name); | ||
246 | } | ||
247 | |||
248 | void nvme_mpath_add_disk_links(struct nvme_ns *ns) | ||
249 | { | ||
250 | struct kobject *slave_disk_kobj, *holder_disk_kobj; | ||
251 | |||
252 | if (!ns->head->disk) | ||
253 | return; | ||
254 | |||
255 | slave_disk_kobj = &disk_to_dev(ns->disk)->kobj; | ||
256 | if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj, | ||
257 | kobject_name(slave_disk_kobj))) | ||
258 | return; | ||
259 | |||
260 | holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj; | ||
261 | if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj, | ||
262 | kobject_name(holder_disk_kobj))) | ||
263 | sysfs_remove_link(ns->head->disk->slave_dir, | ||
264 | kobject_name(slave_disk_kobj)); | ||
265 | } | ||
266 | |||
267 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) | ||
268 | { | ||
269 | if (!head->disk) | ||
270 | return; | ||
271 | sysfs_remove_group(&disk_to_dev(head->disk)->kobj, | ||
272 | &nvme_ns_id_attr_group); | ||
273 | del_gendisk(head->disk); | ||
274 | blk_set_queue_dying(head->disk->queue); | ||
275 | /* make sure all pending bios are cleaned up */ | ||
276 | kblockd_schedule_work(&head->requeue_work); | ||
277 | flush_work(&head->requeue_work); | ||
278 | blk_cleanup_queue(head->disk->queue); | ||
279 | put_disk(head->disk); | ||
280 | } | ||
281 | |||
282 | void nvme_mpath_remove_disk_links(struct nvme_ns *ns) | ||
283 | { | ||
284 | if (!ns->head->disk) | ||
285 | return; | ||
286 | |||
287 | sysfs_remove_link(ns->disk->part0.holder_dir, | ||
288 | kobject_name(&disk_to_dev(ns->head->disk)->kobj)); | ||
289 | sysfs_remove_link(ns->head->disk->slave_dir, | ||
290 | kobject_name(&disk_to_dev(ns->disk)->kobj)); | ||
291 | } | ||
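The two link helpers above give the same sysfs picture that dm and md provide: each per-controller namespace shows up under the multipath disk's slaves/ directory, and the multipath disk shows up under the per-controller disk's holders/ directory, roughly /sys/block/<mpath-disk>/slaves/<path-disk> and /sys/block/<path-disk>/holders/<mpath-disk>. The concrete device names depend on the controller and namespace instances.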
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d3f3c4447515..c0873a68872f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h | |||
@@ -15,16 +15,17 @@ | |||
15 | #define _NVME_H | 15 | #define _NVME_H |
16 | 16 | ||
17 | #include <linux/nvme.h> | 17 | #include <linux/nvme.h> |
18 | #include <linux/cdev.h> | ||
18 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
19 | #include <linux/kref.h> | 20 | #include <linux/kref.h> |
20 | #include <linux/blk-mq.h> | 21 | #include <linux/blk-mq.h> |
21 | #include <linux/lightnvm.h> | 22 | #include <linux/lightnvm.h> |
22 | #include <linux/sed-opal.h> | 23 | #include <linux/sed-opal.h> |
23 | 24 | ||
24 | extern unsigned char nvme_io_timeout; | 25 | extern unsigned int nvme_io_timeout; |
25 | #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) | 26 | #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) |
26 | 27 | ||
27 | extern unsigned char admin_timeout; | 28 | extern unsigned int admin_timeout; |
28 | #define ADMIN_TIMEOUT (admin_timeout * HZ) | 29 | #define ADMIN_TIMEOUT (admin_timeout * HZ) |
29 | 30 | ||
30 | #define NVME_DEFAULT_KATO 5 | 31 | #define NVME_DEFAULT_KATO 5 |
@@ -94,6 +95,11 @@ struct nvme_request { | |||
94 | u16 status; | 95 | u16 status; |
95 | }; | 96 | }; |
96 | 97 | ||
98 | /* | ||
99 | * Mark a bio as coming in through the mpath node. | ||
100 | */ | ||
101 | #define REQ_NVME_MPATH REQ_DRV | ||
102 | |||
97 | enum { | 103 | enum { |
98 | NVME_REQ_CANCELLED = (1 << 0), | 104 | NVME_REQ_CANCELLED = (1 << 0), |
99 | }; | 105 | }; |
@@ -127,24 +133,23 @@ struct nvme_ctrl { | |||
127 | struct request_queue *admin_q; | 133 | struct request_queue *admin_q; |
128 | struct request_queue *connect_q; | 134 | struct request_queue *connect_q; |
129 | struct device *dev; | 135 | struct device *dev; |
130 | struct kref kref; | ||
131 | int instance; | 136 | int instance; |
132 | struct blk_mq_tag_set *tagset; | 137 | struct blk_mq_tag_set *tagset; |
133 | struct blk_mq_tag_set *admin_tagset; | 138 | struct blk_mq_tag_set *admin_tagset; |
134 | struct list_head namespaces; | 139 | struct list_head namespaces; |
135 | struct mutex namespaces_mutex; | 140 | struct mutex namespaces_mutex; |
141 | struct device ctrl_device; | ||
136 | struct device *device; /* char device */ | 142 | struct device *device; /* char device */ |
137 | struct list_head node; | 143 | struct cdev cdev; |
138 | struct ida ns_ida; | ||
139 | struct work_struct reset_work; | 144 | struct work_struct reset_work; |
145 | struct work_struct delete_work; | ||
146 | |||
147 | struct nvme_subsystem *subsys; | ||
148 | struct list_head subsys_entry; | ||
140 | 149 | ||
141 | struct opal_dev *opal_dev; | 150 | struct opal_dev *opal_dev; |
142 | 151 | ||
143 | char name[12]; | 152 | char name[12]; |
144 | char serial[20]; | ||
145 | char model[40]; | ||
146 | char firmware_rev[8]; | ||
147 | char subnqn[NVMF_NQN_SIZE]; | ||
148 | u16 cntlid; | 153 | u16 cntlid; |
149 | 154 | ||
150 | u32 ctrl_config; | 155 | u32 ctrl_config; |
@@ -155,23 +160,23 @@ struct nvme_ctrl { | |||
155 | u32 page_size; | 160 | u32 page_size; |
156 | u32 max_hw_sectors; | 161 | u32 max_hw_sectors; |
157 | u16 oncs; | 162 | u16 oncs; |
158 | u16 vid; | ||
159 | u16 oacs; | 163 | u16 oacs; |
160 | u16 nssa; | 164 | u16 nssa; |
161 | u16 nr_streams; | 165 | u16 nr_streams; |
162 | atomic_t abort_limit; | 166 | atomic_t abort_limit; |
163 | u8 event_limit; | ||
164 | u8 vwc; | 167 | u8 vwc; |
165 | u32 vs; | 168 | u32 vs; |
166 | u32 sgls; | 169 | u32 sgls; |
167 | u16 kas; | 170 | u16 kas; |
168 | u8 npss; | 171 | u8 npss; |
169 | u8 apsta; | 172 | u8 apsta; |
173 | u32 aen_result; | ||
170 | unsigned int shutdown_timeout; | 174 | unsigned int shutdown_timeout; |
171 | unsigned int kato; | 175 | unsigned int kato; |
172 | bool subsystem; | 176 | bool subsystem; |
173 | unsigned long quirks; | 177 | unsigned long quirks; |
174 | struct nvme_id_power_state psd[32]; | 178 | struct nvme_id_power_state psd[32]; |
179 | struct nvme_effects_log *effects; | ||
175 | struct work_struct scan_work; | 180 | struct work_struct scan_work; |
176 | struct work_struct async_event_work; | 181 | struct work_struct async_event_work; |
177 | struct delayed_work ka_work; | 182 | struct delayed_work ka_work; |
@@ -197,21 +202,72 @@ struct nvme_ctrl { | |||
197 | struct nvmf_ctrl_options *opts; | 202 | struct nvmf_ctrl_options *opts; |
198 | }; | 203 | }; |
199 | 204 | ||
205 | struct nvme_subsystem { | ||
206 | int instance; | ||
207 | struct device dev; | ||
208 | /* | ||
209 | * Because we unregister the device on the last put we need | ||
210 | * a separate refcount. | ||
211 | */ | ||
212 | struct kref ref; | ||
213 | struct list_head entry; | ||
214 | struct mutex lock; | ||
215 | struct list_head ctrls; | ||
216 | struct list_head nsheads; | ||
217 | char subnqn[NVMF_NQN_SIZE]; | ||
218 | char serial[20]; | ||
219 | char model[40]; | ||
220 | char firmware_rev[8]; | ||
221 | u8 cmic; | ||
222 | u16 vendor_id; | ||
223 | struct ida ns_ida; | ||
224 | }; | ||
225 | |||
226 | /* | ||
227 | * Container structure for unique namespace identifiers. | ||
228 | */ | ||
229 | struct nvme_ns_ids { | ||
230 | u8 eui64[8]; | ||
231 | u8 nguid[16]; | ||
232 | uuid_t uuid; | ||
233 | }; | ||
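A minimal sketch of how two nvme_ns_ids might be compared when deciding whether a freshly scanned namespace belongs to an existing head; the real helper lives in core.c, so this is illustrative rather than the canonical implementation:

	static bool example_ns_ids_equal(const struct nvme_ns_ids *a,
					 const struct nvme_ns_ids *b)
	{
		/* all three identifiers have to match for the namespace to be shared */
		return uuid_equal(&a->uuid, &b->uuid) &&
			memcmp(a->nguid, b->nguid, sizeof(a->nguid)) == 0 &&
			memcmp(a->eui64, b->eui64, sizeof(a->eui64)) == 0;
	}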
234 | |||
235 | /* | ||
236 | * Anchor structure for namespaces. There is one for each namespace in an | ||
237 | * NVMe subsystem that any of our controllers can see, and the namespace | ||
238 | * structure for each controller is chained off it. For private namespaces | ||
239 | * there is a 1:1 relation to our namespace structures, that is ->list | ||
240 | * only ever has a single entry for private namespaces. | ||
241 | */ | ||
242 | struct nvme_ns_head { | ||
243 | #ifdef CONFIG_NVME_MULTIPATH | ||
244 | struct gendisk *disk; | ||
245 | struct nvme_ns __rcu *current_path; | ||
246 | struct bio_list requeue_list; | ||
247 | spinlock_t requeue_lock; | ||
248 | struct work_struct requeue_work; | ||
249 | #endif | ||
250 | struct list_head list; | ||
251 | struct srcu_struct srcu; | ||
252 | struct nvme_subsystem *subsys; | ||
253 | unsigned ns_id; | ||
254 | struct nvme_ns_ids ids; | ||
255 | struct list_head entry; | ||
256 | struct kref ref; | ||
257 | int instance; | ||
258 | }; | ||
259 | |||
200 | struct nvme_ns { | 260 | struct nvme_ns { |
201 | struct list_head list; | 261 | struct list_head list; |
202 | 262 | ||
203 | struct nvme_ctrl *ctrl; | 263 | struct nvme_ctrl *ctrl; |
204 | struct request_queue *queue; | 264 | struct request_queue *queue; |
205 | struct gendisk *disk; | 265 | struct gendisk *disk; |
266 | struct list_head siblings; | ||
206 | struct nvm_dev *ndev; | 267 | struct nvm_dev *ndev; |
207 | struct kref kref; | 268 | struct kref kref; |
208 | int instance; | 269 | struct nvme_ns_head *head; |
209 | 270 | ||
210 | u8 eui[8]; | ||
211 | u8 nguid[16]; | ||
212 | uuid_t uuid; | ||
213 | |||
214 | unsigned ns_id; | ||
215 | int lba_shift; | 271 | int lba_shift; |
216 | u16 ms; | 272 | u16 ms; |
217 | u16 sgs; | 273 | u16 sgs; |
@@ -234,9 +290,10 @@ struct nvme_ctrl_ops { | |||
234 | int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); | 290 | int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); |
235 | int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); | 291 | int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); |
236 | void (*free_ctrl)(struct nvme_ctrl *ctrl); | 292 | void (*free_ctrl)(struct nvme_ctrl *ctrl); |
237 | void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); | 293 | void (*submit_async_event)(struct nvme_ctrl *ctrl); |
238 | int (*delete_ctrl)(struct nvme_ctrl *ctrl); | 294 | void (*delete_ctrl)(struct nvme_ctrl *ctrl); |
239 | int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); | 295 | int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); |
296 | int (*reinit_request)(void *data, struct request *rq); | ||
240 | }; | 297 | }; |
241 | 298 | ||
242 | static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) | 299 | static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) |
@@ -278,6 +335,16 @@ static inline void nvme_end_request(struct request *req, __le16 status, | |||
278 | blk_mq_complete_request(req); | 335 | blk_mq_complete_request(req); |
279 | } | 336 | } |
280 | 337 | ||
338 | static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) | ||
339 | { | ||
340 | get_device(ctrl->device); | ||
341 | } | ||
342 | |||
343 | static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) | ||
344 | { | ||
345 | put_device(ctrl->device); | ||
346 | } | ||
347 | |||
281 | void nvme_complete_rq(struct request *req); | 348 | void nvme_complete_rq(struct request *req); |
282 | void nvme_cancel_request(struct request *req, void *data, bool reserved); | 349 | void nvme_cancel_request(struct request *req, void *data, bool reserved); |
283 | bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | 350 | bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, |
@@ -299,10 +366,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); | |||
299 | int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, | 366 | int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, |
300 | bool send); | 367 | bool send); |
301 | 368 | ||
302 | #define NVME_NR_AERS 1 | ||
303 | void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, | 369 | void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, |
304 | union nvme_result *res); | 370 | union nvme_result *res); |
305 | void nvme_queue_async_events(struct nvme_ctrl *ctrl); | ||
306 | 371 | ||
307 | void nvme_stop_queues(struct nvme_ctrl *ctrl); | 372 | void nvme_stop_queues(struct nvme_ctrl *ctrl); |
308 | void nvme_start_queues(struct nvme_ctrl *ctrl); | 373 | void nvme_start_queues(struct nvme_ctrl *ctrl); |
@@ -311,21 +376,79 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); | |||
311 | void nvme_wait_freeze(struct nvme_ctrl *ctrl); | 376 | void nvme_wait_freeze(struct nvme_ctrl *ctrl); |
312 | void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); | 377 | void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); |
313 | void nvme_start_freeze(struct nvme_ctrl *ctrl); | 378 | void nvme_start_freeze(struct nvme_ctrl *ctrl); |
379 | int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); | ||
314 | 380 | ||
315 | #define NVME_QID_ANY -1 | 381 | #define NVME_QID_ANY -1 |
316 | struct request *nvme_alloc_request(struct request_queue *q, | 382 | struct request *nvme_alloc_request(struct request_queue *q, |
317 | struct nvme_command *cmd, unsigned int flags, int qid); | 383 | struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); |
318 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, | 384 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, |
319 | struct nvme_command *cmd); | 385 | struct nvme_command *cmd); |
320 | int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, | 386 | int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, |
321 | void *buf, unsigned bufflen); | 387 | void *buf, unsigned bufflen); |
322 | int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, | 388 | int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, |
323 | union nvme_result *result, void *buffer, unsigned bufflen, | 389 | union nvme_result *result, void *buffer, unsigned bufflen, |
324 | unsigned timeout, int qid, int at_head, int flags); | 390 | unsigned timeout, int qid, int at_head, |
391 | blk_mq_req_flags_t flags); | ||
325 | int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); | 392 | int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); |
326 | void nvme_start_keep_alive(struct nvme_ctrl *ctrl); | 393 | void nvme_start_keep_alive(struct nvme_ctrl *ctrl); |
327 | void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); | 394 | void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); |
328 | int nvme_reset_ctrl(struct nvme_ctrl *ctrl); | 395 | int nvme_reset_ctrl(struct nvme_ctrl *ctrl); |
396 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl); | ||
397 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); | ||
398 | |||
399 | extern const struct attribute_group nvme_ns_id_attr_group; | ||
400 | extern const struct block_device_operations nvme_ns_head_ops; | ||
401 | |||
402 | #ifdef CONFIG_NVME_MULTIPATH | ||
403 | void nvme_failover_req(struct request *req); | ||
404 | bool nvme_req_needs_failover(struct request *req); | ||
405 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); | ||
406 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); | ||
407 | void nvme_mpath_add_disk(struct nvme_ns_head *head); | ||
408 | void nvme_mpath_add_disk_links(struct nvme_ns *ns); | ||
409 | void nvme_mpath_remove_disk(struct nvme_ns_head *head); | ||
410 | void nvme_mpath_remove_disk_links(struct nvme_ns *ns); | ||
411 | |||
412 | static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) | ||
413 | { | ||
414 | struct nvme_ns_head *head = ns->head; | ||
415 | |||
416 | if (head && ns == srcu_dereference(head->current_path, &head->srcu)) | ||
417 | rcu_assign_pointer(head->current_path, NULL); | ||
418 | } | ||
419 | struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); | ||
420 | #else | ||
421 | static inline void nvme_failover_req(struct request *req) | ||
422 | { | ||
423 | } | ||
424 | static inline bool nvme_req_needs_failover(struct request *req) | ||
425 | { | ||
426 | return false; | ||
427 | } | ||
428 | static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | ||
429 | { | ||
430 | } | ||
431 | static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, | ||
432 | struct nvme_ns_head *head) | ||
433 | { | ||
434 | return 0; | ||
435 | } | ||
436 | static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) | ||
437 | { | ||
438 | } | ||
439 | static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) | ||
440 | { | ||
441 | } | ||
442 | static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns) | ||
443 | { | ||
444 | } | ||
445 | static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) | ||
446 | { | ||
447 | } | ||
448 | static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) | ||
449 | { | ||
450 | } | ||
451 | #endif /* CONFIG_NVME_MULTIPATH */ | ||
329 | 452 | ||
330 | #ifdef CONFIG_NVM | 453 | #ifdef CONFIG_NVM |
331 | int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); | 454 | int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); |
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3f5a04c586ce..a11cfd470089 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c | |||
@@ -13,7 +13,6 @@ | |||
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/aer.h> | 15 | #include <linux/aer.h> |
16 | #include <linux/bitops.h> | ||
17 | #include <linux/blkdev.h> | 16 | #include <linux/blkdev.h> |
18 | #include <linux/blk-mq.h> | 17 | #include <linux/blk-mq.h> |
19 | #include <linux/blk-mq-pci.h> | 18 | #include <linux/blk-mq-pci.h> |
@@ -26,12 +25,9 @@ | |||
26 | #include <linux/mutex.h> | 25 | #include <linux/mutex.h> |
27 | #include <linux/once.h> | 26 | #include <linux/once.h> |
28 | #include <linux/pci.h> | 27 | #include <linux/pci.h> |
29 | #include <linux/poison.h> | ||
30 | #include <linux/t10-pi.h> | 28 | #include <linux/t10-pi.h> |
31 | #include <linux/timer.h> | ||
32 | #include <linux/types.h> | 29 | #include <linux/types.h> |
33 | #include <linux/io-64-nonatomic-lo-hi.h> | 30 | #include <linux/io-64-nonatomic-lo-hi.h> |
34 | #include <asm/unaligned.h> | ||
35 | #include <linux/sed-opal.h> | 31 | #include <linux/sed-opal.h> |
36 | 32 | ||
37 | #include "nvme.h" | 33 | #include "nvme.h" |
@@ -39,11 +35,7 @@ | |||
39 | #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) | 35 | #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) |
40 | #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) | 36 | #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) |
41 | 37 | ||
42 | /* | 38 | #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) |
43 | * We handle AEN commands ourselves and don't even let the | ||
44 | * block layer know about them. | ||
45 | */ | ||
46 | #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) | ||
47 | 39 | ||
48 | static int use_threaded_interrupts; | 40 | static int use_threaded_interrupts; |
49 | module_param(use_threaded_interrupts, int, 0); | 41 | module_param(use_threaded_interrupts, int, 0); |
@@ -57,6 +49,12 @@ module_param(max_host_mem_size_mb, uint, 0444); | |||
57 | MODULE_PARM_DESC(max_host_mem_size_mb, | 49 | MODULE_PARM_DESC(max_host_mem_size_mb, |
58 | "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); | 50 | "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); |
59 | 51 | ||
52 | static unsigned int sgl_threshold = SZ_32K; | ||
53 | module_param(sgl_threshold, uint, 0644); | ||
54 | MODULE_PARM_DESC(sgl_threshold, | ||
55 | "Use SGLs when average request segment size is larger than or equal to " | ||
56 | "this size. Use 0 to disable SGLs."); | ||
57 | |||
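Since sgl_threshold is declared with mode 0644 it should be tunable at runtime, presumably through /sys/module/nvme/parameters/sgl_threshold, and setting it to 0 disables SGLs entirely as the description says. The sysfs path follows from standard module-parameter behaviour and is stated here as an assumption, not something this hunk guarantees.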
60 | static int io_queue_depth_set(const char *val, const struct kernel_param *kp); | 58 | static int io_queue_depth_set(const char *val, const struct kernel_param *kp); |
61 | static const struct kernel_param_ops io_queue_depth_ops = { | 59 | static const struct kernel_param_ops io_queue_depth_ops = { |
62 | .set = io_queue_depth_set, | 60 | .set = io_queue_depth_set, |
@@ -178,6 +176,7 @@ struct nvme_queue { | |||
178 | struct nvme_iod { | 176 | struct nvme_iod { |
179 | struct nvme_request req; | 177 | struct nvme_request req; |
180 | struct nvme_queue *nvmeq; | 178 | struct nvme_queue *nvmeq; |
179 | bool use_sgl; | ||
181 | int aborted; | 180 | int aborted; |
182 | int npages; /* In the PRP list. 0 means small pool in use */ | 181 | int npages; /* In the PRP list. 0 means small pool in use */ |
183 | int nents; /* Used in scatterlist */ | 182 | int nents; /* Used in scatterlist */ |
@@ -331,17 +330,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) | |||
331 | return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); | 330 | return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); |
332 | } | 331 | } |
333 | 332 | ||
334 | static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, | 333 | /* |
335 | unsigned int size, unsigned int nseg) | 334 | * Calculates the number of pages needed for the SGL segments. For example a 4k |
335 | * page can accommodate 256 SGL descriptors. | ||
336 | */ | ||
337 | static int nvme_pci_npages_sgl(unsigned int num_seg) | ||
336 | { | 338 | { |
337 | return sizeof(__le64 *) * nvme_npages(size, dev) + | 339 | return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); |
338 | sizeof(struct scatterlist) * nseg; | ||
339 | } | 340 | } |
340 | 341 | ||
341 | static unsigned int nvme_cmd_size(struct nvme_dev *dev) | 342 | static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, |
343 | unsigned int size, unsigned int nseg, bool use_sgl) | ||
342 | { | 344 | { |
343 | return sizeof(struct nvme_iod) + | 345 | size_t alloc_size; |
344 | nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); | 346 | |
347 | if (use_sgl) | ||
348 | alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); | ||
349 | else | ||
350 | alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); | ||
351 | |||
352 | return alloc_size + sizeof(struct scatterlist) * nseg; | ||
353 | } | ||
354 | |||
355 | static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) | ||
356 | { | ||
357 | unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, | ||
358 | NVME_INT_BYTES(dev), NVME_INT_PAGES, | ||
359 | use_sgl); | ||
360 | |||
361 | return sizeof(struct nvme_iod) + alloc_size; | ||
345 | } | 362 | } |
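A rough worked example of the new sizing, assuming 4 KiB pages and the 16-byte struct nvme_sgl_desc: SGES_PER_PAGE is 4096 / 16 = 256, so nvme_pci_npages_sgl(64) = DIV_ROUND_UP(64 * 16, 4096) = 1. A 64-segment command in SGL mode therefore needs one descriptor page, and nvme_pci_iod_alloc_size() works out to 1 * sizeof(__le64 *) plus 64 scatterlist entries on top of struct nvme_iod.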
346 | 363 | ||
347 | static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, | 364 | static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, |
@@ -425,10 +442,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, | |||
425 | nvmeq->sq_tail = tail; | 442 | nvmeq->sq_tail = tail; |
426 | } | 443 | } |
427 | 444 | ||
428 | static __le64 **iod_list(struct request *req) | 445 | static void **nvme_pci_iod_list(struct request *req) |
429 | { | 446 | { |
430 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | 447 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
431 | return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); | 448 | return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); |
432 | } | 449 | } |
433 | 450 | ||
434 | static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) | 451 | static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) |
@@ -438,7 +455,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) | |||
438 | unsigned int size = blk_rq_payload_bytes(rq); | 455 | unsigned int size = blk_rq_payload_bytes(rq); |
439 | 456 | ||
440 | if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { | 457 | if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { |
441 | iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); | 458 | size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, |
459 | iod->use_sgl); | ||
460 | |||
461 | iod->sg = kmalloc(alloc_size, GFP_ATOMIC); | ||
442 | if (!iod->sg) | 462 | if (!iod->sg) |
443 | return BLK_STS_RESOURCE; | 463 | return BLK_STS_RESOURCE; |
444 | } else { | 464 | } else { |
@@ -456,18 +476,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) | |||
456 | static void nvme_free_iod(struct nvme_dev *dev, struct request *req) | 476 | static void nvme_free_iod(struct nvme_dev *dev, struct request *req) |
457 | { | 477 | { |
458 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | 478 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
459 | const int last_prp = dev->ctrl.page_size / 8 - 1; | 479 | const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; |
480 | dma_addr_t dma_addr = iod->first_dma, next_dma_addr; | ||
481 | |||
460 | int i; | 482 | int i; |
461 | __le64 **list = iod_list(req); | ||
462 | dma_addr_t prp_dma = iod->first_dma; | ||
463 | 483 | ||
464 | if (iod->npages == 0) | 484 | if (iod->npages == 0) |
465 | dma_pool_free(dev->prp_small_pool, list[0], prp_dma); | 485 | dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], |
486 | dma_addr); | ||
487 | |||
466 | for (i = 0; i < iod->npages; i++) { | 488 | for (i = 0; i < iod->npages; i++) { |
467 | __le64 *prp_list = list[i]; | 489 | void *addr = nvme_pci_iod_list(req)[i]; |
468 | dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); | 490 | |
469 | dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); | 491 | if (iod->use_sgl) { |
470 | prp_dma = next_prp_dma; | 492 | struct nvme_sgl_desc *sg_list = addr; |
493 | |||
494 | next_dma_addr = | ||
495 | le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); | ||
496 | } else { | ||
497 | __le64 *prp_list = addr; | ||
498 | |||
499 | next_dma_addr = le64_to_cpu(prp_list[last_prp]); | ||
500 | } | ||
501 | |||
502 | dma_pool_free(dev->prp_page_pool, addr, dma_addr); | ||
503 | dma_addr = next_dma_addr; | ||
471 | } | 504 | } |
472 | 505 | ||
473 | if (iod->sg != iod->inline_sg) | 506 | if (iod->sg != iod->inline_sg) |
@@ -555,7 +588,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) | |||
555 | } | 588 | } |
556 | } | 589 | } |
557 | 590 | ||
558 | static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) | 591 | static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, |
592 | struct request *req, struct nvme_rw_command *cmnd) | ||
559 | { | 593 | { |
560 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | 594 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
561 | struct dma_pool *pool; | 595 | struct dma_pool *pool; |
@@ -566,14 +600,16 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) | |||
566 | u32 page_size = dev->ctrl.page_size; | 600 | u32 page_size = dev->ctrl.page_size; |
567 | int offset = dma_addr & (page_size - 1); | 601 | int offset = dma_addr & (page_size - 1); |
568 | __le64 *prp_list; | 602 | __le64 *prp_list; |
569 | __le64 **list = iod_list(req); | 603 | void **list = nvme_pci_iod_list(req); |
570 | dma_addr_t prp_dma; | 604 | dma_addr_t prp_dma; |
571 | int nprps, i; | 605 | int nprps, i; |
572 | 606 | ||
607 | iod->use_sgl = false; | ||
608 | |||
573 | length -= (page_size - offset); | 609 | length -= (page_size - offset); |
574 | if (length <= 0) { | 610 | if (length <= 0) { |
575 | iod->first_dma = 0; | 611 | iod->first_dma = 0; |
576 | return BLK_STS_OK; | 612 | goto done; |
577 | } | 613 | } |
578 | 614 | ||
579 | dma_len -= (page_size - offset); | 615 | dma_len -= (page_size - offset); |
@@ -587,7 +623,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) | |||
587 | 623 | ||
588 | if (length <= page_size) { | 624 | if (length <= page_size) { |
589 | iod->first_dma = dma_addr; | 625 | iod->first_dma = dma_addr; |
590 | return BLK_STS_OK; | 626 | goto done; |
591 | } | 627 | } |
592 | 628 | ||
593 | nprps = DIV_ROUND_UP(length, page_size); | 629 | nprps = DIV_ROUND_UP(length, page_size); |
@@ -634,6 +670,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) | |||
634 | dma_len = sg_dma_len(sg); | 670 | dma_len = sg_dma_len(sg); |
635 | } | 671 | } |
636 | 672 | ||
673 | done: | ||
674 | cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); | ||
675 | cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); | ||
676 | |||
637 | return BLK_STS_OK; | 677 | return BLK_STS_OK; |
638 | 678 | ||
639 | bad_sgl: | 679 | bad_sgl: |
@@ -643,6 +683,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) | |||
643 | return BLK_STS_IOERR; | 683 | return BLK_STS_IOERR; |
644 | } | 684 | } |
645 | 685 | ||
686 | static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, | ||
687 | struct scatterlist *sg) | ||
688 | { | ||
689 | sge->addr = cpu_to_le64(sg_dma_address(sg)); | ||
690 | sge->length = cpu_to_le32(sg_dma_len(sg)); | ||
691 | sge->type = NVME_SGL_FMT_DATA_DESC << 4; | ||
692 | } | ||
693 | |||
694 | static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, | ||
695 | dma_addr_t dma_addr, int entries) | ||
696 | { | ||
697 | sge->addr = cpu_to_le64(dma_addr); | ||
698 | if (entries < SGES_PER_PAGE) { | ||
699 | sge->length = cpu_to_le32(entries * sizeof(*sge)); | ||
700 | sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; | ||
701 | } else { | ||
702 | sge->length = cpu_to_le32(PAGE_SIZE); | ||
703 | sge->type = NVME_SGL_FMT_SEG_DESC << 4; | ||
704 | } | ||
705 | } | ||
706 | |||
707 | static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, | ||
708 | struct request *req, struct nvme_rw_command *cmd) | ||
709 | { | ||
710 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | ||
711 | int length = blk_rq_payload_bytes(req); | ||
712 | struct dma_pool *pool; | ||
713 | struct nvme_sgl_desc *sg_list; | ||
714 | struct scatterlist *sg = iod->sg; | ||
715 | int entries = iod->nents, i = 0; | ||
716 | dma_addr_t sgl_dma; | ||
717 | |||
718 | iod->use_sgl = true; | ||
719 | |||
720 | /* setting the transfer type as SGL */ | ||
721 | cmd->flags = NVME_CMD_SGL_METABUF; | ||
722 | |||
723 | if (length == sg_dma_len(sg)) { | ||
724 | nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); | ||
725 | return BLK_STS_OK; | ||
726 | } | ||
727 | |||
728 | if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { | ||
729 | pool = dev->prp_small_pool; | ||
730 | iod->npages = 0; | ||
731 | } else { | ||
732 | pool = dev->prp_page_pool; | ||
733 | iod->npages = 1; | ||
734 | } | ||
735 | |||
736 | sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); | ||
737 | if (!sg_list) { | ||
738 | iod->npages = -1; | ||
739 | return BLK_STS_RESOURCE; | ||
740 | } | ||
741 | |||
742 | nvme_pci_iod_list(req)[0] = sg_list; | ||
743 | iod->first_dma = sgl_dma; | ||
744 | |||
745 | nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); | ||
746 | |||
747 | do { | ||
748 | if (i == SGES_PER_PAGE) { | ||
749 | struct nvme_sgl_desc *old_sg_desc = sg_list; | ||
750 | struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; | ||
751 | |||
752 | sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); | ||
753 | if (!sg_list) | ||
754 | return BLK_STS_RESOURCE; | ||
755 | |||
756 | i = 0; | ||
757 | nvme_pci_iod_list(req)[iod->npages++] = sg_list; | ||
758 | sg_list[i++] = *link; | ||
759 | nvme_pci_sgl_set_seg(link, sgl_dma, entries); | ||
760 | } | ||
761 | |||
762 | nvme_pci_sgl_set_data(&sg_list[i++], sg); | ||
763 | |||
764 | length -= sg_dma_len(sg); | ||
765 | sg = sg_next(sg); | ||
766 | entries--; | ||
767 | } while (length > 0); | ||
768 | |||
769 | WARN_ON(entries > 0); | ||
770 | return BLK_STS_OK; | ||
771 | } | ||
772 | |||
773 | static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) | ||
774 | { | ||
775 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | ||
776 | unsigned int avg_seg_size; | ||
777 | |||
778 | avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), | ||
779 | blk_rq_nr_phys_segments(req)); | ||
780 | |||
781 | if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) | ||
782 | return false; | ||
783 | if (!iod->nvmeq->qid) | ||
784 | return false; | ||
785 | if (!sgl_threshold || avg_seg_size < sgl_threshold) | ||
786 | return false; | ||
787 | return true; | ||
788 | } | ||
789 | |||
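Worked example for the heuristic above, assuming the default sgl_threshold of 32 KiB: a 256 KiB transfer described by 4 physical segments averages 64 KiB per segment and takes the SGL path (provided the controller advertises SGL support in ctrl.sgls and the request is on an I/O queue, i.e. qid != 0), while the same 256 KiB split into 32 segments averages 8 KiB and falls back to PRPs.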
646 | static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, | 790 | static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, |
647 | struct nvme_command *cmnd) | 791 | struct nvme_command *cmnd) |
648 | { | 792 | { |
@@ -662,7 +806,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, | |||
662 | DMA_ATTR_NO_WARN)) | 806 | DMA_ATTR_NO_WARN)) |
663 | goto out; | 807 | goto out; |
664 | 808 | ||
665 | ret = nvme_setup_prps(dev, req); | 809 | if (nvme_pci_use_sgls(dev, req)) |
810 | ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); | ||
811 | else | ||
812 | ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); | ||
813 | |||
666 | if (ret != BLK_STS_OK) | 814 | if (ret != BLK_STS_OK) |
667 | goto out_unmap; | 815 | goto out_unmap; |
668 | 816 | ||
@@ -682,8 +830,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, | |||
682 | goto out_unmap; | 830 | goto out_unmap; |
683 | } | 831 | } |
684 | 832 | ||
685 | cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); | ||
686 | cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); | ||
687 | if (blk_integrity_rq(req)) | 833 | if (blk_integrity_rq(req)) |
688 | cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); | 834 | cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); |
689 | return BLK_STS_OK; | 835 | return BLK_STS_OK; |
@@ -804,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, | |||
804 | * for them but rather special case them here. | 950 | * for them but rather special case them here. |
805 | */ | 951 | */ |
806 | if (unlikely(nvmeq->qid == 0 && | 952 | if (unlikely(nvmeq->qid == 0 && |
807 | cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { | 953 | cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { |
808 | nvme_complete_async_event(&nvmeq->dev->ctrl, | 954 | nvme_complete_async_event(&nvmeq->dev->ctrl, |
809 | cqe->status, &cqe->result); | 955 | cqe->status, &cqe->result); |
810 | return; | 956 | return; |
@@ -897,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) | |||
897 | return __nvme_poll(nvmeq, tag); | 1043 | return __nvme_poll(nvmeq, tag); |
898 | } | 1044 | } |
899 | 1045 | ||
900 | static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) | 1046 | static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) |
901 | { | 1047 | { |
902 | struct nvme_dev *dev = to_nvme_dev(ctrl); | 1048 | struct nvme_dev *dev = to_nvme_dev(ctrl); |
903 | struct nvme_queue *nvmeq = dev->queues[0]; | 1049 | struct nvme_queue *nvmeq = dev->queues[0]; |
@@ -905,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) | |||
905 | 1051 | ||
906 | memset(&c, 0, sizeof(c)); | 1052 | memset(&c, 0, sizeof(c)); |
907 | c.common.opcode = nvme_admin_async_event; | 1053 | c.common.opcode = nvme_admin_async_event; |
908 | c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; | 1054 | c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; |
909 | 1055 | ||
910 | spin_lock_irq(&nvmeq->q_lock); | 1056 | spin_lock_irq(&nvmeq->q_lock); |
911 | __nvme_submit_cmd(nvmeq, &c); | 1057 | __nvme_submit_cmd(nvmeq, &c); |
@@ -930,7 +1076,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, | |||
930 | int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; | 1076 | int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; |
931 | 1077 | ||
932 | /* | 1078 | /* |
933 | * Note: we (ab)use the fact the the prp fields survive if no data | 1079 | * Note: we (ab)use the fact that the prp fields survive if no data |
934 | * is attached to the request. | 1080 | * is attached to the request. |
935 | */ | 1081 | */ |
936 | memset(&c, 0, sizeof(c)); | 1082 | memset(&c, 0, sizeof(c)); |
@@ -951,7 +1097,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, | |||
951 | int flags = NVME_QUEUE_PHYS_CONTIG; | 1097 | int flags = NVME_QUEUE_PHYS_CONTIG; |
952 | 1098 | ||
953 | /* | 1099 | /* |
954 | * Note: we (ab)use the fact the the prp fields survive if no data | 1100 | * Note: we (ab)use the fact that the prp fields survive if no data |
955 | * is attached to the request. | 1101 | * is attached to the request. |
956 | */ | 1102 | */ |
957 | memset(&c, 0, sizeof(c)); | 1103 | memset(&c, 0, sizeof(c)); |
@@ -1372,14 +1518,10 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) | |||
1372 | dev->admin_tagset.ops = &nvme_mq_admin_ops; | 1518 | dev->admin_tagset.ops = &nvme_mq_admin_ops; |
1373 | dev->admin_tagset.nr_hw_queues = 1; | 1519 | dev->admin_tagset.nr_hw_queues = 1; |
1374 | 1520 | ||
1375 | /* | 1521 | dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; |
1376 | * Subtract one to leave an empty queue entry for 'Full Queue' | ||
1377 | * condition. See NVM-Express 1.2 specification, section 4.1.2. | ||
1378 | */ | ||
1379 | dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; | ||
1380 | dev->admin_tagset.timeout = ADMIN_TIMEOUT; | 1522 | dev->admin_tagset.timeout = ADMIN_TIMEOUT; |
1381 | dev->admin_tagset.numa_node = dev_to_node(dev->dev); | 1523 | dev->admin_tagset.numa_node = dev_to_node(dev->dev); |
1382 | dev->admin_tagset.cmd_size = nvme_cmd_size(dev); | 1524 | dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); |
1383 | dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; | 1525 | dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; |
1384 | dev->admin_tagset.driver_data = dev; | 1526 | dev->admin_tagset.driver_data = dev; |
1385 | 1527 | ||
@@ -1906,7 +2048,11 @@ static int nvme_dev_add(struct nvme_dev *dev) | |||
1906 | dev->tagset.numa_node = dev_to_node(dev->dev); | 2048 | dev->tagset.numa_node = dev_to_node(dev->dev); |
1907 | dev->tagset.queue_depth = | 2049 | dev->tagset.queue_depth = |
1908 | min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; | 2050 | min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; |
1909 | dev->tagset.cmd_size = nvme_cmd_size(dev); | 2051 | dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); |
2052 | if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { | ||
2053 | dev->tagset.cmd_size = max(dev->tagset.cmd_size, | ||
2054 | nvme_pci_cmd_size(dev, true)); | ||
2055 | } | ||
1910 | dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; | 2056 | dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; |
1911 | dev->tagset.driver_data = dev; | 2057 | dev->tagset.driver_data = dev; |
1912 | 2058 | ||
@@ -2132,9 +2278,9 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) | |||
2132 | { | 2278 | { |
2133 | dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); | 2279 | dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); |
2134 | 2280 | ||
2135 | kref_get(&dev->ctrl.kref); | 2281 | nvme_get_ctrl(&dev->ctrl); |
2136 | nvme_dev_disable(dev, false); | 2282 | nvme_dev_disable(dev, false); |
2137 | if (!schedule_work(&dev->remove_work)) | 2283 | if (!queue_work(nvme_wq, &dev->remove_work)) |
2138 | nvme_put_ctrl(&dev->ctrl); | 2284 | nvme_put_ctrl(&dev->ctrl); |
2139 | } | 2285 | } |
2140 | 2286 | ||
@@ -2557,6 +2703,7 @@ static int __init nvme_init(void) | |||
2557 | static void __exit nvme_exit(void) | 2703 | static void __exit nvme_exit(void) |
2558 | { | 2704 | { |
2559 | pci_unregister_driver(&nvme_driver); | 2705 | pci_unregister_driver(&nvme_driver); |
2706 | flush_workqueue(nvme_wq); | ||
2560 | _nvme_check_size(); | 2707 | _nvme_check_size(); |
2561 | } | 2708 | } |
2562 | 2709 | ||
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0ebb539f3bd3..4f9bf2f815c3 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -41,17 +41,9 @@ | |||
41 | 41 | ||
42 | #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 | 42 | #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 |
43 | 43 | ||
44 | /* | ||
45 | * We handle AEN commands ourselves and don't even let the | ||
46 | * block layer know about them. | ||
47 | */ | ||
48 | #define NVME_RDMA_NR_AEN_COMMANDS 1 | ||
49 | #define NVME_RDMA_AQ_BLKMQ_DEPTH \ | ||
50 | (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) | ||
51 | |||
52 | struct nvme_rdma_device { | 44 | struct nvme_rdma_device { |
53 | struct ib_device *dev; | 45 | struct ib_device *dev; |
54 | struct ib_pd *pd; | 46 | struct ib_pd *pd; |
55 | struct kref ref; | 47 | struct kref ref; |
56 | struct list_head entry; | 48 | struct list_head entry; |
57 | }; | 49 | }; |
@@ -79,8 +71,8 @@ struct nvme_rdma_request { | |||
79 | }; | 71 | }; |
80 | 72 | ||
81 | enum nvme_rdma_queue_flags { | 73 | enum nvme_rdma_queue_flags { |
82 | NVME_RDMA_Q_LIVE = 0, | 74 | NVME_RDMA_Q_ALLOCATED = 0, |
83 | NVME_RDMA_Q_DELETING = 1, | 75 | NVME_RDMA_Q_LIVE = 1, |
84 | }; | 76 | }; |
85 | 77 | ||
86 | struct nvme_rdma_queue { | 78 | struct nvme_rdma_queue { |
@@ -105,7 +97,6 @@ struct nvme_rdma_ctrl { | |||
105 | 97 | ||
106 | /* other member variables */ | 98 | /* other member variables */ |
107 | struct blk_mq_tag_set tag_set; | 99 | struct blk_mq_tag_set tag_set; |
108 | struct work_struct delete_work; | ||
109 | struct work_struct err_work; | 100 | struct work_struct err_work; |
110 | 101 | ||
111 | struct nvme_rdma_qe async_event_sqe; | 102 | struct nvme_rdma_qe async_event_sqe; |
@@ -274,6 +265,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) | |||
274 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); | 265 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); |
275 | int ret = 0; | 266 | int ret = 0; |
276 | 267 | ||
268 | if (WARN_ON_ONCE(!req->mr)) | ||
269 | return 0; | ||
270 | |||
277 | ib_dereg_mr(req->mr); | 271 | ib_dereg_mr(req->mr); |
278 | 272 | ||
279 | req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, | 273 | req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, |
@@ -434,11 +428,9 @@ out_err: | |||
434 | 428 | ||
435 | static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) | 429 | static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) |
436 | { | 430 | { |
437 | struct nvme_rdma_device *dev; | 431 | struct nvme_rdma_device *dev = queue->device; |
438 | struct ib_device *ibdev; | 432 | struct ib_device *ibdev = dev->dev; |
439 | 433 | ||
440 | dev = queue->device; | ||
441 | ibdev = dev->dev; | ||
442 | rdma_destroy_qp(queue->cm_id); | 434 | rdma_destroy_qp(queue->cm_id); |
443 | ib_free_cq(queue->ib_cq); | 435 | ib_free_cq(queue->ib_cq); |
444 | 436 | ||
@@ -493,7 +485,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) | |||
493 | return 0; | 485 | return 0; |
494 | 486 | ||
495 | out_destroy_qp: | 487 | out_destroy_qp: |
496 | ib_destroy_qp(queue->qp); | 488 | rdma_destroy_qp(queue->cm_id); |
497 | out_destroy_ib_cq: | 489 | out_destroy_ib_cq: |
498 | ib_free_cq(queue->ib_cq); | 490 | ib_free_cq(queue->ib_cq); |
499 | out_put_dev: | 491 | out_put_dev: |
@@ -544,11 +536,11 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, | |||
544 | ret = nvme_rdma_wait_for_cm(queue); | 536 | ret = nvme_rdma_wait_for_cm(queue); |
545 | if (ret) { | 537 | if (ret) { |
546 | dev_info(ctrl->ctrl.device, | 538 | dev_info(ctrl->ctrl.device, |
547 | "rdma_resolve_addr wait failed (%d).\n", ret); | 539 | "rdma connection establishment failed (%d)\n", ret); |
548 | goto out_destroy_cm_id; | 540 | goto out_destroy_cm_id; |
549 | } | 541 | } |
550 | 542 | ||
551 | clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); | 543 | set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); |
552 | 544 | ||
553 | return 0; | 545 | return 0; |
554 | 546 | ||
@@ -568,7 +560,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) | |||
568 | 560 | ||
569 | static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) | 561 | static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) |
570 | { | 562 | { |
571 | if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) | 563 | if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) |
572 | return; | 564 | return; |
573 | 565 | ||
574 | if (nvme_rdma_queue_idx(queue) == 0) { | 566 | if (nvme_rdma_queue_idx(queue) == 0) { |
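The two hunks above invert the queue flag: instead of marking a queue DELETING on teardown, allocation sets NVME_RDMA_Q_ALLOCATED and free only proceeds if it can atomically clear it, so a second free is a no-op. A userspace approximation using C11 atomics in place of the kernel's set_bit()/test_and_clear_bit():

#include <stdatomic.h>
#include <stdio.h>

enum { Q_ALLOCATED, Q_LIVE };

struct queue { atomic_ulong flags; };

static void alloc_queue(struct queue *q)
{
	atomic_fetch_or(&q->flags, 1UL << Q_ALLOCATED);
}

static void free_queue(struct queue *q)
{
	unsigned long old = atomic_fetch_and(&q->flags, ~(1UL << Q_ALLOCATED));

	if (!(old & (1UL << Q_ALLOCATED)))
		return;			/* never allocated, or already freed */
	printf("releasing queue resources\n");
}

int main(void)
{
	struct queue q = { 0 };

	alloc_queue(&q);
	free_queue(&q);
	free_queue(&q);			/* second free is a harmless no-op */
	return 0;
}
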
@@ -676,11 +668,10 @@ out_free_queues: | |||
676 | return ret; | 668 | return ret; |
677 | } | 669 | } |
678 | 670 | ||
679 | static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) | 671 | static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, |
672 | struct blk_mq_tag_set *set) | ||
680 | { | 673 | { |
681 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); | 674 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); |
682 | struct blk_mq_tag_set *set = admin ? | ||
683 | &ctrl->admin_tag_set : &ctrl->tag_set; | ||
684 | 675 | ||
685 | blk_mq_free_tag_set(set); | 676 | blk_mq_free_tag_set(set); |
686 | nvme_rdma_dev_put(ctrl->device); | 677 | nvme_rdma_dev_put(ctrl->device); |
@@ -697,7 +688,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, | |||
697 | set = &ctrl->admin_tag_set; | 688 | set = &ctrl->admin_tag_set; |
698 | memset(set, 0, sizeof(*set)); | 689 | memset(set, 0, sizeof(*set)); |
699 | set->ops = &nvme_rdma_admin_mq_ops; | 690 | set->ops = &nvme_rdma_admin_mq_ops; |
700 | set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; | 691 | set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; |
701 | set->reserved_tags = 2; /* connect + keep-alive */ | 692 | set->reserved_tags = 2; /* connect + keep-alive */ |
702 | set->numa_node = NUMA_NO_NODE; | 693 | set->numa_node = NUMA_NO_NODE; |
703 | set->cmd_size = sizeof(struct nvme_rdma_request) + | 694 | set->cmd_size = sizeof(struct nvme_rdma_request) + |
@@ -705,6 +696,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, | |||
705 | set->driver_data = ctrl; | 696 | set->driver_data = ctrl; |
706 | set->nr_hw_queues = 1; | 697 | set->nr_hw_queues = 1; |
707 | set->timeout = ADMIN_TIMEOUT; | 698 | set->timeout = ADMIN_TIMEOUT; |
699 | set->flags = BLK_MQ_F_NO_SCHED; | ||
708 | } else { | 700 | } else { |
709 | set = &ctrl->tag_set; | 701 | set = &ctrl->tag_set; |
710 | memset(set, 0, sizeof(*set)); | 702 | memset(set, 0, sizeof(*set)); |
@@ -748,7 +740,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, | |||
748 | nvme_rdma_stop_queue(&ctrl->queues[0]); | 740 | nvme_rdma_stop_queue(&ctrl->queues[0]); |
749 | if (remove) { | 741 | if (remove) { |
750 | blk_cleanup_queue(ctrl->ctrl.admin_q); | 742 | blk_cleanup_queue(ctrl->ctrl.admin_q); |
751 | nvme_rdma_free_tagset(&ctrl->ctrl, true); | 743 | nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); |
752 | } | 744 | } |
753 | nvme_rdma_free_queue(&ctrl->queues[0]); | 745 | nvme_rdma_free_queue(&ctrl->queues[0]); |
754 | } | 746 | } |
@@ -780,8 +772,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, | |||
780 | goto out_free_tagset; | 772 | goto out_free_tagset; |
781 | } | 773 | } |
782 | } else { | 774 | } else { |
783 | error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, | 775 | error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); |
784 | nvme_rdma_reinit_request); | ||
785 | if (error) | 776 | if (error) |
786 | goto out_free_queue; | 777 | goto out_free_queue; |
787 | } | 778 | } |
@@ -825,7 +816,7 @@ out_cleanup_queue: | |||
825 | blk_cleanup_queue(ctrl->ctrl.admin_q); | 816 | blk_cleanup_queue(ctrl->ctrl.admin_q); |
826 | out_free_tagset: | 817 | out_free_tagset: |
827 | if (new) | 818 | if (new) |
828 | nvme_rdma_free_tagset(&ctrl->ctrl, true); | 819 | nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); |
829 | out_free_queue: | 820 | out_free_queue: |
830 | nvme_rdma_free_queue(&ctrl->queues[0]); | 821 | nvme_rdma_free_queue(&ctrl->queues[0]); |
831 | return error; | 822 | return error; |
@@ -837,7 +828,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, | |||
837 | nvme_rdma_stop_io_queues(ctrl); | 828 | nvme_rdma_stop_io_queues(ctrl); |
838 | if (remove) { | 829 | if (remove) { |
839 | blk_cleanup_queue(ctrl->ctrl.connect_q); | 830 | blk_cleanup_queue(ctrl->ctrl.connect_q); |
840 | nvme_rdma_free_tagset(&ctrl->ctrl, false); | 831 | nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); |
841 | } | 832 | } |
842 | nvme_rdma_free_io_queues(ctrl); | 833 | nvme_rdma_free_io_queues(ctrl); |
843 | } | 834 | } |
@@ -863,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) | |||
863 | goto out_free_tag_set; | 854 | goto out_free_tag_set; |
864 | } | 855 | } |
865 | } else { | 856 | } else { |
866 | ret = blk_mq_reinit_tagset(&ctrl->tag_set, | 857 | ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); |
867 | nvme_rdma_reinit_request); | ||
868 | if (ret) | 858 | if (ret) |
869 | goto out_free_io_queues; | 859 | goto out_free_io_queues; |
870 | 860 | ||
@@ -883,7 +873,7 @@ out_cleanup_connect_q: | |||
883 | blk_cleanup_queue(ctrl->ctrl.connect_q); | 873 | blk_cleanup_queue(ctrl->ctrl.connect_q); |
884 | out_free_tag_set: | 874 | out_free_tag_set: |
885 | if (new) | 875 | if (new) |
886 | nvme_rdma_free_tagset(&ctrl->ctrl, false); | 876 | nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); |
887 | out_free_io_queues: | 877 | out_free_io_queues: |
888 | nvme_rdma_free_io_queues(ctrl); | 878 | nvme_rdma_free_io_queues(ctrl); |
889 | return ret; | 879 | return ret; |
@@ -922,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) | |||
922 | ctrl->ctrl.opts->reconnect_delay * HZ); | 912 | ctrl->ctrl.opts->reconnect_delay * HZ); |
923 | } else { | 913 | } else { |
924 | dev_info(ctrl->ctrl.device, "Removing controller...\n"); | 914 | dev_info(ctrl->ctrl.device, "Removing controller...\n"); |
925 | queue_work(nvme_wq, &ctrl->delete_work); | 915 | nvme_delete_ctrl(&ctrl->ctrl); |
926 | } | 916 | } |
927 | } | 917 | } |
928 | 918 | ||
@@ -935,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | |||
935 | 925 | ||
936 | ++ctrl->ctrl.nr_reconnects; | 926 | ++ctrl->ctrl.nr_reconnects; |
937 | 927 | ||
938 | if (ctrl->ctrl.queue_count > 1) | ||
939 | nvme_rdma_destroy_io_queues(ctrl, false); | ||
940 | |||
941 | nvme_rdma_destroy_admin_queue(ctrl, false); | ||
942 | ret = nvme_rdma_configure_admin_queue(ctrl, false); | 928 | ret = nvme_rdma_configure_admin_queue(ctrl, false); |
943 | if (ret) | 929 | if (ret) |
944 | goto requeue; | 930 | goto requeue; |
@@ -946,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | |||
946 | if (ctrl->ctrl.queue_count > 1) { | 932 | if (ctrl->ctrl.queue_count > 1) { |
947 | ret = nvme_rdma_configure_io_queues(ctrl, false); | 933 | ret = nvme_rdma_configure_io_queues(ctrl, false); |
948 | if (ret) | 934 | if (ret) |
949 | goto requeue; | 935 | goto destroy_admin; |
950 | } | 936 | } |
951 | 937 | ||
952 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | 938 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); |
@@ -956,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | |||
956 | return; | 942 | return; |
957 | } | 943 | } |
958 | 944 | ||
959 | ctrl->ctrl.nr_reconnects = 0; | ||
960 | |||
961 | nvme_start_ctrl(&ctrl->ctrl); | 945 | nvme_start_ctrl(&ctrl->ctrl); |
962 | 946 | ||
963 | dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); | 947 | dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", |
948 | ctrl->ctrl.nr_reconnects); | ||
949 | |||
950 | ctrl->ctrl.nr_reconnects = 0; | ||
964 | 951 | ||
965 | return; | 952 | return; |
966 | 953 | ||
954 | destroy_admin: | ||
955 | nvme_rdma_destroy_admin_queue(ctrl, false); | ||
967 | requeue: | 956 | requeue: |
968 | dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", | 957 | dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", |
969 | ctrl->ctrl.nr_reconnects); | 958 | ctrl->ctrl.nr_reconnects); |
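The reconnect rework above stops tearing queues down at the top of the handler and instead unwinds on failure: if the IO queues cannot be configured, the freshly configured admin queue is destroyed before the attempt is requeued. A stripped-down sketch of that goto ordering, with stubbed placeholder steps:

#include <stdio.h>

static int configure_admin(void)  { return 0; }
static int configure_io(void)     { return -1; }	/* simulate a failure */
static void teardown_admin(void)  { printf("destroy admin queue\n"); }
static void schedule_retry(void)  { printf("requeue reconnect attempt\n"); }

static void reconnect_work(void)
{
	if (configure_admin())
		goto requeue;
	if (configure_io())
		goto destroy_admin;	/* don't leak the admin queue */

	printf("successfully reconnected\n");
	return;

destroy_admin:
	teardown_admin();
requeue:
	schedule_retry();
}

int main(void)
{
	reconnect_work();
	return 0;
}
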
@@ -979,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) | |||
979 | 968 | ||
980 | if (ctrl->ctrl.queue_count > 1) { | 969 | if (ctrl->ctrl.queue_count > 1) { |
981 | nvme_stop_queues(&ctrl->ctrl); | 970 | nvme_stop_queues(&ctrl->ctrl); |
982 | nvme_rdma_stop_io_queues(ctrl); | ||
983 | } | ||
984 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
985 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
986 | |||
987 | /* We must take care of fastfail/requeue all our inflight requests */ | ||
988 | if (ctrl->ctrl.queue_count > 1) | ||
989 | blk_mq_tagset_busy_iter(&ctrl->tag_set, | 971 | blk_mq_tagset_busy_iter(&ctrl->tag_set, |
990 | nvme_cancel_request, &ctrl->ctrl); | 972 | nvme_cancel_request, &ctrl->ctrl); |
973 | nvme_rdma_destroy_io_queues(ctrl, false); | ||
974 | } | ||
975 | |||
976 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
991 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, | 977 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, |
992 | nvme_cancel_request, &ctrl->ctrl); | 978 | nvme_cancel_request, &ctrl->ctrl); |
979 | nvme_rdma_destroy_admin_queue(ctrl, false); | ||
993 | 980 | ||
994 | /* | 981 | /* |
995 | * queues are not a live anymore, so restart the queues to fail fast | 982 | * queues are not a live anymore, so restart the queues to fail fast |
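Error recovery now cancels every in-flight request through blk_mq_tagset_busy_iter() and destroys the queues in place, leaving the reconnect work to rebuild them. A toy version of the "fail everything that is still busy" walk, with an array standing in for the blk-mq tag set:

#include <stdbool.h>
#include <stdio.h>

struct request { int tag; bool in_flight; };

static void cancel_request(struct request *rq)
{
	rq->in_flight = false;
	printf("request %d cancelled\n", rq->tag);
}

static void tagset_busy_iter(struct request *rqs, int nr,
			     void (*fn)(struct request *))
{
	int i;

	for (i = 0; i < nr; i++)
		if (rqs[i].in_flight)
			fn(&rqs[i]);
}

int main(void)
{
	struct request rqs[] = {
		{ .tag = 0, .in_flight = true  },
		{ .tag = 1, .in_flight = false },
		{ .tag = 2, .in_flight = true  },
	};

	tagset_busy_iter(rqs, 3, cancel_request);
	return 0;
}
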
@@ -1065,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, | |||
1065 | if (!blk_rq_bytes(rq)) | 1052 | if (!blk_rq_bytes(rq)) |
1066 | return; | 1053 | return; |
1067 | 1054 | ||
1068 | if (req->mr->need_inval) { | 1055 | if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) { |
1069 | res = nvme_rdma_inv_rkey(queue, req); | 1056 | res = nvme_rdma_inv_rkey(queue, req); |
1070 | if (unlikely(res < 0)) { | 1057 | if (unlikely(res < 0)) { |
1071 | dev_err(ctrl->ctrl.device, | 1058 | dev_err(ctrl->ctrl.device, |
@@ -1314,7 +1301,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) | |||
1314 | return queue->ctrl->tag_set.tags[queue_idx - 1]; | 1301 | return queue->ctrl->tag_set.tags[queue_idx - 1]; |
1315 | } | 1302 | } |
1316 | 1303 | ||
1317 | static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | 1304 | static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) |
1318 | { | 1305 | { |
1319 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); | 1306 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); |
1320 | struct nvme_rdma_queue *queue = &ctrl->queues[0]; | 1307 | struct nvme_rdma_queue *queue = &ctrl->queues[0]; |
@@ -1324,14 +1311,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | |||
1324 | struct ib_sge sge; | 1311 | struct ib_sge sge; |
1325 | int ret; | 1312 | int ret; |
1326 | 1313 | ||
1327 | if (WARN_ON_ONCE(aer_idx != 0)) | ||
1328 | return; | ||
1329 | |||
1330 | ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); | 1314 | ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); |
1331 | 1315 | ||
1332 | memset(cmd, 0, sizeof(*cmd)); | 1316 | memset(cmd, 0, sizeof(*cmd)); |
1333 | cmd->common.opcode = nvme_admin_async_event; | 1317 | cmd->common.opcode = nvme_admin_async_event; |
1334 | cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; | 1318 | cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; |
1335 | cmd->common.flags |= NVME_CMD_SGL_METABUF; | 1319 | cmd->common.flags |= NVME_CMD_SGL_METABUF; |
1336 | nvme_rdma_set_sg_null(cmd); | 1320 | nvme_rdma_set_sg_null(cmd); |
1337 | 1321 | ||
@@ -1393,7 +1377,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) | |||
1393 | * for them but rather special case them here. | 1377 | * for them but rather special case them here. |
1394 | */ | 1378 | */ |
1395 | if (unlikely(nvme_rdma_queue_idx(queue) == 0 && | 1379 | if (unlikely(nvme_rdma_queue_idx(queue) == 0 && |
1396 | cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) | 1380 | cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) |
1397 | nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, | 1381 | nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, |
1398 | &cqe->result); | 1382 | &cqe->result); |
1399 | else | 1383 | else |
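Both hunks above rely on the same convention: the AEN command is given command_id equal to NVME_AQ_BLK_MQ_DEPTH, which no blk-mq request on the admin queue can use, so its completion is recognizable by the id alone. A small sketch of that classification; the depth value 31 is an assumption:

#include <stdbool.h>
#include <stdio.h>

#define AQ_BLK_MQ_DEPTH 31		/* assumed blk-mq admin depth */

static bool is_async_event(int qid, int command_id)
{
	return qid == 0 && command_id >= AQ_BLK_MQ_DEPTH;
}

int main(void)
{
	printf("cid 5  -> %s\n", is_async_event(0, 5)  ? "AEN" : "request");
	printf("cid 31 -> %s\n", is_async_event(0, 31) ? "AEN" : "request");
	return 0;
}
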
@@ -1590,6 +1574,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved) | |||
1590 | { | 1574 | { |
1591 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); | 1575 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); |
1592 | 1576 | ||
1577 | dev_warn(req->queue->ctrl->ctrl.device, | ||
1578 | "I/O %d QID %d timeout, reset controller\n", | ||
1579 | rq->tag, nvme_rdma_queue_idx(req->queue)); | ||
1580 | |||
1593 | /* queue error recovery */ | 1581 | /* queue error recovery */ |
1594 | nvme_rdma_error_recovery(req->queue->ctrl); | 1582 | nvme_rdma_error_recovery(req->queue->ctrl); |
1595 | 1583 | ||
@@ -1767,50 +1755,9 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) | |||
1767 | nvme_rdma_destroy_admin_queue(ctrl, shutdown); | 1755 | nvme_rdma_destroy_admin_queue(ctrl, shutdown); |
1768 | } | 1756 | } |
1769 | 1757 | ||
1770 | static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) | 1758 | static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) |
1771 | { | 1759 | { |
1772 | nvme_remove_namespaces(&ctrl->ctrl); | 1760 | nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); |
1773 | nvme_rdma_shutdown_ctrl(ctrl, true); | ||
1774 | nvme_uninit_ctrl(&ctrl->ctrl); | ||
1775 | nvme_put_ctrl(&ctrl->ctrl); | ||
1776 | } | ||
1777 | |||
1778 | static void nvme_rdma_del_ctrl_work(struct work_struct *work) | ||
1779 | { | ||
1780 | struct nvme_rdma_ctrl *ctrl = container_of(work, | ||
1781 | struct nvme_rdma_ctrl, delete_work); | ||
1782 | |||
1783 | nvme_stop_ctrl(&ctrl->ctrl); | ||
1784 | nvme_rdma_remove_ctrl(ctrl); | ||
1785 | } | ||
1786 | |||
1787 | static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) | ||
1788 | { | ||
1789 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) | ||
1790 | return -EBUSY; | ||
1791 | |||
1792 | if (!queue_work(nvme_wq, &ctrl->delete_work)) | ||
1793 | return -EBUSY; | ||
1794 | |||
1795 | return 0; | ||
1796 | } | ||
1797 | |||
1798 | static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) | ||
1799 | { | ||
1800 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); | ||
1801 | int ret = 0; | ||
1802 | |||
1803 | /* | ||
1804 | * Keep a reference until all work is flushed since | ||
1805 | * __nvme_rdma_del_ctrl can free the ctrl mem | ||
1806 | */ | ||
1807 | if (!kref_get_unless_zero(&ctrl->ctrl.kref)) | ||
1808 | return -EBUSY; | ||
1809 | ret = __nvme_rdma_del_ctrl(ctrl); | ||
1810 | if (!ret) | ||
1811 | flush_work(&ctrl->delete_work); | ||
1812 | nvme_put_ctrl(&ctrl->ctrl); | ||
1813 | return ret; | ||
1814 | } | 1761 | } |
1815 | 1762 | ||
1816 | static void nvme_rdma_reset_ctrl_work(struct work_struct *work) | 1763 | static void nvme_rdma_reset_ctrl_work(struct work_struct *work) |
@@ -1834,7 +1781,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) | |||
1834 | } | 1781 | } |
1835 | 1782 | ||
1836 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | 1783 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); |
1837 | WARN_ON_ONCE(!changed); | 1784 | if (!changed) { |
1785 | /* state change failure is ok if we're in DELETING state */ | ||
1786 | WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); | ||
1787 | return; | ||
1788 | } | ||
1838 | 1789 | ||
1839 | nvme_start_ctrl(&ctrl->ctrl); | 1790 | nvme_start_ctrl(&ctrl->ctrl); |
1840 | 1791 | ||
@@ -1842,7 +1793,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) | |||
1842 | 1793 | ||
1843 | out_fail: | 1794 | out_fail: |
1844 | dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); | 1795 | dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); |
1845 | nvme_rdma_remove_ctrl(ctrl); | 1796 | nvme_remove_namespaces(&ctrl->ctrl); |
1797 | nvme_rdma_shutdown_ctrl(ctrl, true); | ||
1798 | nvme_uninit_ctrl(&ctrl->ctrl); | ||
1799 | nvme_put_ctrl(&ctrl->ctrl); | ||
1846 | } | 1800 | } |
1847 | 1801 | ||
1848 | static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { | 1802 | static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { |
@@ -1854,10 +1808,88 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { | |||
1854 | .reg_write32 = nvmf_reg_write32, | 1808 | .reg_write32 = nvmf_reg_write32, |
1855 | .free_ctrl = nvme_rdma_free_ctrl, | 1809 | .free_ctrl = nvme_rdma_free_ctrl, |
1856 | .submit_async_event = nvme_rdma_submit_async_event, | 1810 | .submit_async_event = nvme_rdma_submit_async_event, |
1857 | .delete_ctrl = nvme_rdma_del_ctrl, | 1811 | .delete_ctrl = nvme_rdma_delete_ctrl, |
1858 | .get_address = nvmf_get_address, | 1812 | .get_address = nvmf_get_address, |
1813 | .reinit_request = nvme_rdma_reinit_request, | ||
1859 | }; | 1814 | }; |
1860 | 1815 | ||
1816 | static inline bool | ||
1817 | __nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, | ||
1818 | struct nvmf_ctrl_options *opts) | ||
1819 | { | ||
1820 | char *stdport = __stringify(NVME_RDMA_IP_PORT); | ||
1821 | |||
1822 | |||
1823 | if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || | ||
1824 | strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) | ||
1825 | return false; | ||
1826 | |||
1827 | if (opts->mask & NVMF_OPT_TRSVCID && | ||
1828 | ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { | ||
1829 | if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) | ||
1830 | return false; | ||
1831 | } else if (opts->mask & NVMF_OPT_TRSVCID) { | ||
1832 | if (strcmp(opts->trsvcid, stdport)) | ||
1833 | return false; | ||
1834 | } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { | ||
1835 | if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) | ||
1836 | return false; | ||
1837 | } | ||
1838 | /* else, it's a match as both have stdport. Fall to next checks */ | ||
1839 | |||
1840 | /* | ||
1841 | * checking the local address is rough. In most cases, one | ||
1842 | * is not specified and the host port is selected by the stack. | ||
1843 | * | ||
1844 | * Assume no match if: | ||
1845 | * local address is specified and address is not the same | ||
1846 | * local address is not specified but remote is, or vice versa | ||
1847 | * (admin using specific host_traddr when it matters). | ||
1848 | */ | ||
1849 | if (opts->mask & NVMF_OPT_HOST_TRADDR && | ||
1850 | ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { | ||
1851 | if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) | ||
1852 | return false; | ||
1853 | } else if (opts->mask & NVMF_OPT_HOST_TRADDR || | ||
1854 | ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) | ||
1855 | return false; | ||
1856 | /* | ||
1857 | * if neither controller had an host port specified, assume it's | ||
1858 | * a match as everything else matched. | ||
1859 | */ | ||
1860 | |||
1861 | return true; | ||
1862 | } | ||
1863 | |||
1864 | /* | ||
1865 | * Fails a connection request if it matches an existing controller | ||
1866 | * (association) with the same tuple: | ||
1867 | * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN> | ||
1868 | * | ||
1869 | * if local address is not specified in the request, it will match an | ||
1870 | * existing controller with all the other parameters the same and no | ||
1871 | * local port address specified as well. | ||
1872 | * | ||
1873 | * The ports don't need to be compared as they are intrinsically | ||
1874 | * already matched by the port pointers supplied. | ||
1875 | */ | ||
1876 | static bool | ||
1877 | nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) | ||
1878 | { | ||
1879 | struct nvme_rdma_ctrl *ctrl; | ||
1880 | bool found = false; | ||
1881 | |||
1882 | mutex_lock(&nvme_rdma_ctrl_mutex); | ||
1883 | list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { | ||
1884 | found = __nvme_rdma_options_match(ctrl, opts); | ||
1885 | if (found) | ||
1886 | break; | ||
1887 | } | ||
1888 | mutex_unlock(&nvme_rdma_ctrl_mutex); | ||
1889 | |||
1890 | return found; | ||
1891 | } | ||
1892 | |||
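A compact userspace restatement of the matching rules implemented above: same base identity, same remote address, same service id (falling back to the standard port when unspecified), and host_traddr either equal or unset on both sides. The field names and the 4420 default below are simplifications, not the driver's exact structures:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define STD_PORT "4420"			/* assumed NVMe/RDMA default port */

struct opts {
	const char *subsysnqn;
	const char *traddr;
	const char *trsvcid;		/* NULL means "use standard port" */
	const char *host_traddr;	/* NULL means "picked by the stack" */
};

static bool opts_match(const struct opts *a, const struct opts *b)
{
	if (strcmp(a->subsysnqn, b->subsysnqn) || strcmp(a->traddr, b->traddr))
		return false;
	if (strcmp(a->trsvcid ? a->trsvcid : STD_PORT,
		   b->trsvcid ? b->trsvcid : STD_PORT))
		return false;
	/* both unset is a match; one set and one unset is not */
	if (!a->host_traddr != !b->host_traddr)
		return false;
	return !a->host_traddr || !strcmp(a->host_traddr, b->host_traddr);
}

int main(void)
{
	struct opts a = { "nqn.test", "192.168.1.10", NULL,   NULL };
	struct opts b = { "nqn.test", "192.168.1.10", "4420", NULL };

	printf("duplicate connect: %s\n", opts_match(&a, &b) ? "yes" : "no");
	return 0;
}
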
1861 | static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | 1893 | static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, |
1862 | struct nvmf_ctrl_options *opts) | 1894 | struct nvmf_ctrl_options *opts) |
1863 | { | 1895 | { |
@@ -1894,6 +1926,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | |||
1894 | } | 1926 | } |
1895 | } | 1927 | } |
1896 | 1928 | ||
1929 | if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { | ||
1930 | ret = -EALREADY; | ||
1931 | goto out_free_ctrl; | ||
1932 | } | ||
1933 | |||
1897 | ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, | 1934 | ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, |
1898 | 0 /* no quirks, we're perfect! */); | 1935 | 0 /* no quirks, we're perfect! */); |
1899 | if (ret) | 1936 | if (ret) |
@@ -1902,7 +1939,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | |||
1902 | INIT_DELAYED_WORK(&ctrl->reconnect_work, | 1939 | INIT_DELAYED_WORK(&ctrl->reconnect_work, |
1903 | nvme_rdma_reconnect_ctrl_work); | 1940 | nvme_rdma_reconnect_ctrl_work); |
1904 | INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); | 1941 | INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); |
1905 | INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); | ||
1906 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); | 1942 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); |
1907 | 1943 | ||
1908 | ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ | 1944 | ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ |
@@ -1961,7 +1997,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | |||
1961 | dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", | 1997 | dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", |
1962 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); | 1998 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); |
1963 | 1999 | ||
1964 | kref_get(&ctrl->ctrl.kref); | 2000 | nvme_get_ctrl(&ctrl->ctrl); |
1965 | 2001 | ||
1966 | mutex_lock(&nvme_rdma_ctrl_mutex); | 2002 | mutex_lock(&nvme_rdma_ctrl_mutex); |
1967 | list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); | 2003 | list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); |
@@ -2006,7 +2042,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) | |||
2006 | dev_info(ctrl->ctrl.device, | 2042 | dev_info(ctrl->ctrl.device, |
2007 | "Removing ctrl: NQN \"%s\", addr %pISp\n", | 2043 | "Removing ctrl: NQN \"%s\", addr %pISp\n", |
2008 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); | 2044 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); |
2009 | __nvme_rdma_del_ctrl(ctrl); | 2045 | nvme_delete_ctrl(&ctrl->ctrl); |
2010 | } | 2046 | } |
2011 | mutex_unlock(&nvme_rdma_ctrl_mutex); | 2047 | mutex_unlock(&nvme_rdma_ctrl_mutex); |
2012 | 2048 | ||
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c4a0bf36e752..90dcdc40ac71 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) | |||
35 | static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, | 35 | static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, |
36 | struct nvme_smart_log *slog) | 36 | struct nvme_smart_log *slog) |
37 | { | 37 | { |
38 | u16 status; | ||
39 | struct nvmet_ns *ns; | 38 | struct nvmet_ns *ns; |
40 | u64 host_reads, host_writes, data_units_read, data_units_written; | 39 | u64 host_reads, host_writes, data_units_read, data_units_written; |
41 | 40 | ||
42 | status = NVME_SC_SUCCESS; | ||
43 | ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); | 41 | ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); |
44 | if (!ns) { | 42 | if (!ns) { |
45 | status = NVME_SC_INVALID_NS; | ||
46 | pr_err("nvmet : Could not find namespace id : %d\n", | 43 | pr_err("nvmet : Could not find namespace id : %d\n", |
47 | le32_to_cpu(req->cmd->get_log_page.nsid)); | 44 | le32_to_cpu(req->cmd->get_log_page.nsid)); |
48 | goto out; | 45 | return NVME_SC_INVALID_NS; |
49 | } | 46 | } |
50 | 47 | ||
51 | host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); | 48 | host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); |
@@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, | |||
58 | put_unaligned_le64(host_writes, &slog->host_writes[0]); | 55 | put_unaligned_le64(host_writes, &slog->host_writes[0]); |
59 | put_unaligned_le64(data_units_written, &slog->data_units_written[0]); | 56 | put_unaligned_le64(data_units_written, &slog->data_units_written[0]); |
60 | nvmet_put_namespace(ns); | 57 | nvmet_put_namespace(ns); |
61 | out: | 58 | |
62 | return status; | 59 | return NVME_SC_SUCCESS; |
63 | } | 60 | } |
64 | 61 | ||
65 | static u16 nvmet_get_smart_log_all(struct nvmet_req *req, | 62 | static u16 nvmet_get_smart_log_all(struct nvmet_req *req, |
66 | struct nvme_smart_log *slog) | 63 | struct nvme_smart_log *slog) |
67 | { | 64 | { |
68 | u16 status; | ||
69 | u64 host_reads = 0, host_writes = 0; | 65 | u64 host_reads = 0, host_writes = 0; |
70 | u64 data_units_read = 0, data_units_written = 0; | 66 | u64 data_units_read = 0, data_units_written = 0; |
71 | struct nvmet_ns *ns; | 67 | struct nvmet_ns *ns; |
72 | struct nvmet_ctrl *ctrl; | 68 | struct nvmet_ctrl *ctrl; |
73 | 69 | ||
74 | status = NVME_SC_SUCCESS; | ||
75 | ctrl = req->sq->ctrl; | 70 | ctrl = req->sq->ctrl; |
76 | 71 | ||
77 | rcu_read_lock(); | 72 | rcu_read_lock(); |
@@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, | |||
91 | put_unaligned_le64(host_writes, &slog->host_writes[0]); | 86 | put_unaligned_le64(host_writes, &slog->host_writes[0]); |
92 | put_unaligned_le64(data_units_written, &slog->data_units_written[0]); | 87 | put_unaligned_le64(data_units_written, &slog->data_units_written[0]); |
93 | 88 | ||
94 | return status; | 89 | return NVME_SC_SUCCESS; |
95 | } | 90 | } |
96 | 91 | ||
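Both smart-log helpers above drop the status local and the "goto out" in favour of returning at the point of failure. The same shape in a self-contained sketch (namespace lookup, counters and status values are faked):

#include <stdio.h>

#define SC_SUCCESS	0x0
#define SC_INVALID_NS	0xb

struct ns { unsigned long long reads, writes; };

static struct ns *find_namespace(unsigned int nsid)
{
	static struct ns ns = { .reads = 100, .writes = 50 };

	return nsid == 1 ? &ns : NULL;
}

static int get_smart_log_nsid(unsigned int nsid,
			      unsigned long long *reads,
			      unsigned long long *writes)
{
	struct ns *ns = find_namespace(nsid);

	if (!ns)
		return SC_INVALID_NS;	/* early return, no status local */

	*reads = ns->reads;
	*writes = ns->writes;
	return SC_SUCCESS;
}

int main(void)
{
	unsigned long long r, w;

	printf("nsid 1: status %#x\n", get_smart_log_nsid(1, &r, &w));
	printf("nsid 9: status %#x\n", get_smart_log_nsid(9, &r, &w));
	return 0;
}
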
97 | static u16 nvmet_get_smart_log(struct nvmet_req *req, | 92 | static u16 nvmet_get_smart_log(struct nvmet_req *req, |
@@ -144,10 +139,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) | |||
144 | } | 139 | } |
145 | smart_log = buf; | 140 | smart_log = buf; |
146 | status = nvmet_get_smart_log(req, smart_log); | 141 | status = nvmet_get_smart_log(req, smart_log); |
147 | if (status) { | 142 | if (status) |
148 | memset(buf, '\0', data_len); | ||
149 | goto err; | 143 | goto err; |
150 | } | ||
151 | break; | 144 | break; |
152 | case NVME_LOG_FW_SLOT: | 145 | case NVME_LOG_FW_SLOT: |
153 | /* | 146 | /* |
@@ -300,7 +293,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) | |||
300 | } | 293 | } |
301 | 294 | ||
302 | /* | 295 | /* |
303 | * nuse = ncap = nsze isn't aways true, but we have no way to find | 296 | * nuse = ncap = nsze isn't always true, but we have no way to find |
304 | * that out from the underlying device. | 297 | * that out from the underlying device. |
305 | */ | 298 | */ |
306 | id->ncap = id->nuse = id->nsze = | 299 | id->ncap = id->nuse = id->nsze = |
@@ -424,7 +417,7 @@ out: | |||
424 | } | 417 | } |
425 | 418 | ||
426 | /* | 419 | /* |
427 | * A "mimimum viable" abort implementation: the command is mandatory in the | 420 | * A "minimum viable" abort implementation: the command is mandatory in the |
428 | * spec, but we are not required to do any useful work. We couldn't really | 421 | * spec, but we are not required to do any useful work. We couldn't really |
429 | * do a useful abort, so don't bother even with waiting for the command | 422 | * do a useful abort, so don't bother even with waiting for the command |
430 | * to be exectuted and return immediately telling the command to abort | 423 | * to be exectuted and return immediately telling the command to abort |
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 645ba7eee35d..b54748ad5f48 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) | |||
57 | return 0; | 57 | return 0; |
58 | } | 58 | } |
59 | 59 | ||
60 | static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) | ||
61 | { | ||
62 | struct nvmet_ns *ns; | ||
63 | |||
64 | if (list_empty(&subsys->namespaces)) | ||
65 | return 0; | ||
66 | |||
67 | ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); | ||
68 | return ns->nsid; | ||
69 | } | ||
70 | |||
60 | static u32 nvmet_async_event_result(struct nvmet_async_event *aen) | 71 | static u32 nvmet_async_event_result(struct nvmet_async_event *aen) |
61 | { | 72 | { |
62 | return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); | 73 | return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); |
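nvmet_max_nsid() above can simply take the last list entry because the subsystem keeps its namespaces sorted by nsid, so the tail is always the maximum. A plain-C stand-in follows; note the kernel's list_last_entry() is O(1) on the doubly linked list, and the walk below is only for illustration:

#include <stdio.h>

struct ns { unsigned int nsid; struct ns *next; };

static unsigned int max_nsid(struct ns *head)
{
	if (!head)
		return 0;		/* no namespaces left */
	while (head->next)
		head = head->next;	/* tail holds the highest nsid */
	return head->nsid;
}

int main(void)
{
	/* list kept sorted by nsid: 1 -> 3 -> 7 */
	struct ns n7 = { 7, NULL };
	struct ns n3 = { 3, &n7 };
	struct ns n1 = { 1, &n3 };

	printf("max nsid: %u\n", max_nsid(&n1));
	return 0;
}
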
@@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns) | |||
334 | 345 | ||
335 | ns->enabled = false; | 346 | ns->enabled = false; |
336 | list_del_rcu(&ns->dev_link); | 347 | list_del_rcu(&ns->dev_link); |
348 | if (ns->nsid == subsys->max_nsid) | ||
349 | subsys->max_nsid = nvmet_max_nsid(subsys); | ||
337 | mutex_unlock(&subsys->lock); | 350 | mutex_unlock(&subsys->lock); |
338 | 351 | ||
339 | /* | 352 | /* |
@@ -497,6 +510,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, | |||
497 | req->ops = ops; | 510 | req->ops = ops; |
498 | req->sg = NULL; | 511 | req->sg = NULL; |
499 | req->sg_cnt = 0; | 512 | req->sg_cnt = 0; |
513 | req->transfer_len = 0; | ||
500 | req->rsp->status = 0; | 514 | req->rsp->status = 0; |
501 | 515 | ||
502 | /* no support for fused commands yet */ | 516 | /* no support for fused commands yet */ |
@@ -546,6 +560,15 @@ void nvmet_req_uninit(struct nvmet_req *req) | |||
546 | } | 560 | } |
547 | EXPORT_SYMBOL_GPL(nvmet_req_uninit); | 561 | EXPORT_SYMBOL_GPL(nvmet_req_uninit); |
548 | 562 | ||
563 | void nvmet_req_execute(struct nvmet_req *req) | ||
564 | { | ||
565 | if (unlikely(req->data_len != req->transfer_len)) | ||
566 | nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); | ||
567 | else | ||
568 | req->execute(req); | ||
569 | } | ||
570 | EXPORT_SYMBOL_GPL(nvmet_req_execute); | ||
571 | |||
549 | static inline bool nvmet_cc_en(u32 cc) | 572 | static inline bool nvmet_cc_en(u32 cc) |
550 | { | 573 | { |
551 | return (cc >> NVME_CC_EN_SHIFT) & 0x1; | 574 | return (cc >> NVME_CC_EN_SHIFT) & 0x1; |
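nvmet_req_execute() above centralizes a sanity check every transport used to skip: the length parsed from the command (data_len) must match what the transport actually mapped (transfer_len), otherwise the request is completed with an SGL error rather than executed. A self-contained sketch with illustrative status values:

#include <stdio.h>

#define SC_SGL_INVALID_DATA	0x105
#define SC_DNR			0x4000

struct req {
	size_t data_len;	/* from the NVMe command          */
	size_t transfer_len;	/* from the transport's SGL/bvecs */
	void (*execute)(struct req *);
};

static void complete_req(struct req *req, int status)
{
	printf("request (%zu bytes) completed with status %#x\n",
	       req->data_len, status);
}

static void req_execute(struct req *req)
{
	if (req->data_len != req->transfer_len)
		complete_req(req, SC_SGL_INVALID_DATA | SC_DNR);
	else
		req->execute(req);
}

static void do_read(struct req *req)
{
	printf("executing read of %zu bytes\n", req->data_len);
}

int main(void)
{
	struct req ok  = { 4096, 4096, do_read };
	struct req bad = { 4096, 2048, do_read };

	req_execute(&ok);
	req_execute(&bad);
	return 0;
}
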
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 58e010bdda3e..739b8feadc7d 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod { | |||
76 | dma_addr_t rspdma; | 76 | dma_addr_t rspdma; |
77 | struct scatterlist *data_sg; | 77 | struct scatterlist *data_sg; |
78 | int data_sg_cnt; | 78 | int data_sg_cnt; |
79 | u32 total_length; | ||
80 | u32 offset; | 79 | u32 offset; |
81 | enum nvmet_fcp_datadir io_dir; | 80 | enum nvmet_fcp_datadir io_dir; |
82 | bool active; | 81 | bool active; |
@@ -150,6 +149,7 @@ struct nvmet_fc_tgt_assoc { | |||
150 | struct list_head a_list; | 149 | struct list_head a_list; |
151 | struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; | 150 | struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; |
152 | struct kref ref; | 151 | struct kref ref; |
152 | struct work_struct del_work; | ||
153 | }; | 153 | }; |
154 | 154 | ||
155 | 155 | ||
@@ -232,6 +232,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); | |||
232 | static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); | 232 | static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); |
233 | static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | 233 | static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, |
234 | struct nvmet_fc_fcp_iod *fod); | 234 | struct nvmet_fc_fcp_iod *fod); |
235 | static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); | ||
235 | 236 | ||
236 | 237 | ||
237 | /* *********************** FC-NVME DMA Handling **************************** */ | 238 | /* *********************** FC-NVME DMA Handling **************************** */ |
@@ -802,6 +803,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, | |||
802 | return NULL; | 803 | return NULL; |
803 | } | 804 | } |
804 | 805 | ||
806 | static void | ||
807 | nvmet_fc_delete_assoc(struct work_struct *work) | ||
808 | { | ||
809 | struct nvmet_fc_tgt_assoc *assoc = | ||
810 | container_of(work, struct nvmet_fc_tgt_assoc, del_work); | ||
811 | |||
812 | nvmet_fc_delete_target_assoc(assoc); | ||
813 | nvmet_fc_tgt_a_put(assoc); | ||
814 | } | ||
815 | |||
805 | static struct nvmet_fc_tgt_assoc * | 816 | static struct nvmet_fc_tgt_assoc * |
806 | nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) | 817 | nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) |
807 | { | 818 | { |
@@ -826,6 +837,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) | |||
826 | assoc->a_id = idx; | 837 | assoc->a_id = idx; |
827 | INIT_LIST_HEAD(&assoc->a_list); | 838 | INIT_LIST_HEAD(&assoc->a_list); |
828 | kref_init(&assoc->ref); | 839 | kref_init(&assoc->ref); |
840 | INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); | ||
829 | 841 | ||
830 | while (needrandom) { | 842 | while (needrandom) { |
831 | get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); | 843 | get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); |
@@ -1118,8 +1130,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) | |||
1118 | nvmet_fc_tgtport_put(tgtport); | 1130 | nvmet_fc_tgtport_put(tgtport); |
1119 | 1131 | ||
1120 | if (found_ctrl) { | 1132 | if (found_ctrl) { |
1121 | nvmet_fc_delete_target_assoc(assoc); | 1133 | schedule_work(&assoc->del_work); |
1122 | nvmet_fc_tgt_a_put(assoc); | ||
1123 | return; | 1134 | return; |
1124 | } | 1135 | } |
1125 | 1136 | ||
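The FC target change above defers association teardown to a per-association work item instead of doing it in nvmet_fc_delete_ctrl()'s context; the work drops the reference it was handed when it finishes. A rough pthread-based sketch of that hand-off (compile with -pthread); the reference counting is deliberately simplified:

#include <pthread.h>
#include <stdio.h>

struct assoc { int refs; };

static void put_assoc(struct assoc *a)
{
	if (--a->refs == 0)
		printf("association freed\n");
}

static void *delete_assoc_work(void *arg)
{
	struct assoc *a = arg;

	printf("tearing down association\n");
	put_assoc(a);			/* drop the ref taken by the scheduler */
	return NULL;
}

int main(void)
{
	struct assoc a = { .refs = 1 };
	pthread_t worker;

	a.refs++;			/* reference handed to the work item */
	pthread_create(&worker, NULL, delete_assoc_work, &a);
	pthread_join(&worker, NULL);
	put_assoc(&a);			/* caller's own reference */
	return 0;
}
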
@@ -1688,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) | |||
1688 | u32 page_len, length; | 1699 | u32 page_len, length; |
1689 | int i = 0; | 1700 | int i = 0; |
1690 | 1701 | ||
1691 | length = fod->total_length; | 1702 | length = fod->req.transfer_len; |
1692 | nent = DIV_ROUND_UP(length, PAGE_SIZE); | 1703 | nent = DIV_ROUND_UP(length, PAGE_SIZE); |
1693 | sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); | 1704 | sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); |
1694 | if (!sg) | 1705 | if (!sg) |
@@ -1777,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, | |||
1777 | u32 rsn, rspcnt, xfr_length; | 1788 | u32 rsn, rspcnt, xfr_length; |
1778 | 1789 | ||
1779 | if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) | 1790 | if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) |
1780 | xfr_length = fod->total_length; | 1791 | xfr_length = fod->req.transfer_len; |
1781 | else | 1792 | else |
1782 | xfr_length = fod->offset; | 1793 | xfr_length = fod->offset; |
1783 | 1794 | ||
@@ -1803,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, | |||
1803 | rspcnt = atomic_inc_return(&fod->queue->zrspcnt); | 1814 | rspcnt = atomic_inc_return(&fod->queue->zrspcnt); |
1804 | if (!(rspcnt % fod->queue->ersp_ratio) || | 1815 | if (!(rspcnt % fod->queue->ersp_ratio) || |
1805 | sqe->opcode == nvme_fabrics_command || | 1816 | sqe->opcode == nvme_fabrics_command || |
1806 | xfr_length != fod->total_length || | 1817 | xfr_length != fod->req.transfer_len || |
1807 | (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || | 1818 | (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || |
1808 | (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || | 1819 | (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || |
1809 | queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) | 1820 | queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) |
@@ -1880,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, | |||
1880 | fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; | 1891 | fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; |
1881 | 1892 | ||
1882 | tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, | 1893 | tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, |
1883 | (fod->total_length - fod->offset)); | 1894 | (fod->req.transfer_len - fod->offset)); |
1884 | fcpreq->transfer_length = tlen; | 1895 | fcpreq->transfer_length = tlen; |
1885 | fcpreq->transferred_length = 0; | 1896 | fcpreq->transferred_length = 0; |
1886 | fcpreq->fcp_error = 0; | 1897 | fcpreq->fcp_error = 0; |
@@ -1894,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, | |||
1894 | * combined xfr with response. | 1905 | * combined xfr with response. |
1895 | */ | 1906 | */ |
1896 | if ((op == NVMET_FCOP_READDATA) && | 1907 | if ((op == NVMET_FCOP_READDATA) && |
1897 | ((fod->offset + fcpreq->transfer_length) == fod->total_length) && | 1908 | ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && |
1898 | (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { | 1909 | (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { |
1899 | fcpreq->op = NVMET_FCOP_READDATA_RSP; | 1910 | fcpreq->op = NVMET_FCOP_READDATA_RSP; |
1900 | nvmet_fc_prep_fcp_rsp(tgtport, fod); | 1911 | nvmet_fc_prep_fcp_rsp(tgtport, fod); |
@@ -1974,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) | |||
1974 | } | 1985 | } |
1975 | 1986 | ||
1976 | fod->offset += fcpreq->transferred_length; | 1987 | fod->offset += fcpreq->transferred_length; |
1977 | if (fod->offset != fod->total_length) { | 1988 | if (fod->offset != fod->req.transfer_len) { |
1978 | spin_lock_irqsave(&fod->flock, flags); | 1989 | spin_lock_irqsave(&fod->flock, flags); |
1979 | fod->writedataactive = true; | 1990 | fod->writedataactive = true; |
1980 | spin_unlock_irqrestore(&fod->flock, flags); | 1991 | spin_unlock_irqrestore(&fod->flock, flags); |
@@ -1986,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) | |||
1986 | } | 1997 | } |
1987 | 1998 | ||
1988 | /* data transfer complete, resume with nvmet layer */ | 1999 | /* data transfer complete, resume with nvmet layer */ |
1989 | 2000 | nvmet_req_execute(&fod->req); | |
1990 | fod->req.execute(&fod->req); | ||
1991 | |||
1992 | break; | 2001 | break; |
1993 | 2002 | ||
1994 | case NVMET_FCOP_READDATA: | 2003 | case NVMET_FCOP_READDATA: |
@@ -2011,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) | |||
2011 | } | 2020 | } |
2012 | 2021 | ||
2013 | fod->offset += fcpreq->transferred_length; | 2022 | fod->offset += fcpreq->transferred_length; |
2014 | if (fod->offset != fod->total_length) { | 2023 | if (fod->offset != fod->req.transfer_len) { |
2015 | /* transfer the next chunk */ | 2024 | /* transfer the next chunk */ |
2016 | nvmet_fc_transfer_fcp_data(tgtport, fod, | 2025 | nvmet_fc_transfer_fcp_data(tgtport, fod, |
2017 | NVMET_FCOP_READDATA); | 2026 | NVMET_FCOP_READDATA); |
@@ -2148,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | |||
2148 | 2157 | ||
2149 | fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; | 2158 | fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; |
2150 | 2159 | ||
2151 | fod->total_length = be32_to_cpu(cmdiu->data_len); | 2160 | fod->req.transfer_len = be32_to_cpu(cmdiu->data_len); |
2152 | if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { | 2161 | if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { |
2153 | fod->io_dir = NVMET_FCP_WRITE; | 2162 | fod->io_dir = NVMET_FCP_WRITE; |
2154 | if (!nvme_is_write(&cmdiu->sqe)) | 2163 | if (!nvme_is_write(&cmdiu->sqe)) |
@@ -2159,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | |||
2159 | goto transport_error; | 2168 | goto transport_error; |
2160 | } else { | 2169 | } else { |
2161 | fod->io_dir = NVMET_FCP_NODATA; | 2170 | fod->io_dir = NVMET_FCP_NODATA; |
2162 | if (fod->total_length) | 2171 | if (fod->req.transfer_len) |
2163 | goto transport_error; | 2172 | goto transport_error; |
2164 | } | 2173 | } |
2165 | 2174 | ||
@@ -2167,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | |||
2167 | fod->req.rsp = &fod->rspiubuf.cqe; | 2176 | fod->req.rsp = &fod->rspiubuf.cqe; |
2168 | fod->req.port = fod->queue->port; | 2177 | fod->req.port = fod->queue->port; |
2169 | 2178 | ||
2170 | /* ensure nvmet handlers will set cmd handler callback */ | ||
2171 | fod->req.execute = NULL; | ||
2172 | |||
2173 | /* clear any response payload */ | 2179 | /* clear any response payload */ |
2174 | memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); | 2180 | memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); |
2175 | 2181 | ||
@@ -2189,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | |||
2189 | /* keep a running counter of tail position */ | 2195 | /* keep a running counter of tail position */ |
2190 | atomic_inc(&fod->queue->sqtail); | 2196 | atomic_inc(&fod->queue->sqtail); |
2191 | 2197 | ||
2192 | if (fod->total_length) { | 2198 | if (fod->req.transfer_len) { |
2193 | ret = nvmet_fc_alloc_tgt_pgs(fod); | 2199 | ret = nvmet_fc_alloc_tgt_pgs(fod); |
2194 | if (ret) { | 2200 | if (ret) { |
2195 | nvmet_req_complete(&fod->req, ret); | 2201 | nvmet_req_complete(&fod->req, ret); |
@@ -2212,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, | |||
2212 | * can invoke the nvmet_layer now. If read data, cmd completion will | 2218 | * can invoke the nvmet_layer now. If read data, cmd completion will |
2213 | * push the data | 2219 | * push the data |
2214 | */ | 2220 | */ |
2215 | 2221 | nvmet_req_execute(&fod->req); | |
2216 | fod->req.execute(&fod->req); | ||
2217 | |||
2218 | return; | 2222 | return; |
2219 | 2223 | ||
2220 | transport_error: | 2224 | transport_error: |
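With fod->total_length gone, the FC target drives its data movement off req.transfer_len, which it fills from the FC command IU and which the nvmet core can now validate. A sketch of the chunking loop that compares the running offset against that length; the chunk sizing (max_sg_cnt pages) uses invented numbers:

#include <stdio.h>

#define PAGE_SIZE	4096u

struct fod { unsigned int offset, transfer_len; };

static unsigned int next_chunk(const struct fod *fod, unsigned int max_sg_cnt)
{
	unsigned int tlen = max_sg_cnt * PAGE_SIZE;
	unsigned int left = fod->transfer_len - fod->offset;

	return tlen < left ? tlen : left;
}

int main(void)
{
	struct fod fod = { .offset = 0, .transfer_len = 20000 };

	while (fod.offset != fod.transfer_len) {
		unsigned int tlen = next_chunk(&fod, 2);

		printf("transfer %u bytes at offset %u\n", tlen, fod.offset);
		fod.offset += tlen;
	}
	printf("data complete, hand the request to the nvmet core\n");
	return 0;
}
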
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 0d4c23dc4532..0a4372a016f2 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) | |||
33 | req->ns->blksize_shift; | 33 | req->ns->blksize_shift; |
34 | } | 34 | } |
35 | 35 | ||
36 | static void nvmet_inline_bio_init(struct nvmet_req *req) | ||
37 | { | ||
38 | struct bio *bio = &req->inline_bio; | ||
39 | |||
40 | bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC); | ||
41 | } | ||
42 | |||
43 | static void nvmet_execute_rw(struct nvmet_req *req) | 36 | static void nvmet_execute_rw(struct nvmet_req *req) |
44 | { | 37 | { |
45 | int sg_cnt = req->sg_cnt; | 38 | int sg_cnt = req->sg_cnt; |
39 | struct bio *bio = &req->inline_bio; | ||
46 | struct scatterlist *sg; | 40 | struct scatterlist *sg; |
47 | struct bio *bio; | ||
48 | sector_t sector; | 41 | sector_t sector; |
49 | blk_qc_t cookie; | 42 | blk_qc_t cookie; |
50 | int op, op_flags = 0, i; | 43 | int op, op_flags = 0, i; |
@@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) | |||
66 | sector = le64_to_cpu(req->cmd->rw.slba); | 59 | sector = le64_to_cpu(req->cmd->rw.slba); |
67 | sector <<= (req->ns->blksize_shift - 9); | 60 | sector <<= (req->ns->blksize_shift - 9); |
68 | 61 | ||
69 | nvmet_inline_bio_init(req); | 62 | bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); |
70 | bio = &req->inline_bio; | ||
71 | bio_set_dev(bio, req->ns->bdev); | 63 | bio_set_dev(bio, req->ns->bdev); |
72 | bio->bi_iter.bi_sector = sector; | 64 | bio->bi_iter.bi_sector = sector; |
73 | bio->bi_private = req; | 65 | bio->bi_private = req; |
@@ -94,16 +86,14 @@ static void nvmet_execute_rw(struct nvmet_req *req) | |||
94 | 86 | ||
95 | cookie = submit_bio(bio); | 87 | cookie = submit_bio(bio); |
96 | 88 | ||
97 | blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); | 89 | blk_poll(bdev_get_queue(req->ns->bdev), cookie); |
98 | } | 90 | } |
99 | 91 | ||
100 | static void nvmet_execute_flush(struct nvmet_req *req) | 92 | static void nvmet_execute_flush(struct nvmet_req *req) |
101 | { | 93 | { |
102 | struct bio *bio; | 94 | struct bio *bio = &req->inline_bio; |
103 | |||
104 | nvmet_inline_bio_init(req); | ||
105 | bio = &req->inline_bio; | ||
106 | 95 | ||
96 | bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); | ||
107 | bio_set_dev(bio, req->ns->bdev); | 97 | bio_set_dev(bio, req->ns->bdev); |
108 | bio->bi_private = req; | 98 | bio->bi_private = req; |
109 | bio->bi_end_io = nvmet_bio_done; | 99 | bio->bi_end_io = nvmet_bio_done; |
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 92628c432926..96d390416789 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -23,14 +23,6 @@ | |||
23 | 23 | ||
24 | #define NVME_LOOP_MAX_SEGMENTS 256 | 24 | #define NVME_LOOP_MAX_SEGMENTS 256 |
25 | 25 | ||
26 | /* | ||
27 | * We handle AEN commands ourselves and don't even let the | ||
28 | * block layer know about them. | ||
29 | */ | ||
30 | #define NVME_LOOP_NR_AEN_COMMANDS 1 | ||
31 | #define NVME_LOOP_AQ_BLKMQ_DEPTH \ | ||
32 | (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) | ||
33 | |||
34 | struct nvme_loop_iod { | 26 | struct nvme_loop_iod { |
35 | struct nvme_request nvme_req; | 27 | struct nvme_request nvme_req; |
36 | struct nvme_command cmd; | 28 | struct nvme_command cmd; |
@@ -53,7 +45,6 @@ struct nvme_loop_ctrl { | |||
53 | struct nvme_ctrl ctrl; | 45 | struct nvme_ctrl ctrl; |
54 | 46 | ||
55 | struct nvmet_ctrl *target_ctrl; | 47 | struct nvmet_ctrl *target_ctrl; |
56 | struct work_struct delete_work; | ||
57 | }; | 48 | }; |
58 | 49 | ||
59 | static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) | 50 | static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) |
@@ -113,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) | |||
113 | * for them but rather special case them here. | 104 | * for them but rather special case them here. |
114 | */ | 105 | */ |
115 | if (unlikely(nvme_loop_queue_idx(queue) == 0 && | 106 | if (unlikely(nvme_loop_queue_idx(queue) == 0 && |
116 | cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { | 107 | cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { |
117 | nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, | 108 | nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, |
118 | &cqe->result); | 109 | &cqe->result); |
119 | } else { | 110 | } else { |
@@ -136,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work) | |||
136 | struct nvme_loop_iod *iod = | 127 | struct nvme_loop_iod *iod = |
137 | container_of(work, struct nvme_loop_iod, work); | 128 | container_of(work, struct nvme_loop_iod, work); |
138 | 129 | ||
139 | iod->req.execute(&iod->req); | 130 | nvmet_req_execute(&iod->req); |
140 | } | 131 | } |
141 | 132 | ||
142 | static enum blk_eh_timer_return | 133 | static enum blk_eh_timer_return |
@@ -185,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
185 | 176 | ||
186 | iod->req.sg = iod->sg_table.sgl; | 177 | iod->req.sg = iod->sg_table.sgl; |
187 | iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); | 178 | iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); |
179 | iod->req.transfer_len = blk_rq_bytes(req); | ||
188 | } | 180 | } |
189 | 181 | ||
190 | blk_mq_start_request(req); | 182 | blk_mq_start_request(req); |
@@ -193,7 +185,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
193 | return BLK_STS_OK; | 185 | return BLK_STS_OK; |
194 | } | 186 | } |
195 | 187 | ||
196 | static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | 188 | static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) |
197 | { | 189 | { |
198 | struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); | 190 | struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); |
199 | struct nvme_loop_queue *queue = &ctrl->queues[0]; | 191 | struct nvme_loop_queue *queue = &ctrl->queues[0]; |
@@ -201,7 +193,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) | |||
201 | 193 | ||
202 | memset(&iod->cmd, 0, sizeof(iod->cmd)); | 194 | memset(&iod->cmd, 0, sizeof(iod->cmd)); |
203 | iod->cmd.common.opcode = nvme_admin_async_event; | 195 | iod->cmd.common.opcode = nvme_admin_async_event; |
204 | iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; | 196 | iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; |
205 | iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; | 197 | iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; |
206 | 198 | ||
207 | if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, | 199 | if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, |
@@ -357,7 +349,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) | |||
357 | 349 | ||
358 | memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); | 350 | memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); |
359 | ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; | 351 | ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; |
360 | ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; | 352 | ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; |
361 | ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ | 353 | ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ |
362 | ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; | 354 | ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; |
363 | ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + | 355 | ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + |
@@ -365,6 +357,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) | |||
365 | ctrl->admin_tag_set.driver_data = ctrl; | 357 | ctrl->admin_tag_set.driver_data = ctrl; |
366 | ctrl->admin_tag_set.nr_hw_queues = 1; | 358 | ctrl->admin_tag_set.nr_hw_queues = 1; |
367 | ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; | 359 | ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; |
360 | ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; | ||
368 | 361 | ||
369 | ctrl->queues[0].ctrl = ctrl; | 362 | ctrl->queues[0].ctrl = ctrl; |
370 | error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); | 363 | error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); |
@@ -438,41 +431,9 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) | |||
438 | nvme_loop_destroy_admin_queue(ctrl); | 431 | nvme_loop_destroy_admin_queue(ctrl); |
439 | } | 432 | } |
440 | 433 | ||
441 | static void nvme_loop_del_ctrl_work(struct work_struct *work) | 434 | static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) |
442 | { | 435 | { |
443 | struct nvme_loop_ctrl *ctrl = container_of(work, | 436 | nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); |
444 | struct nvme_loop_ctrl, delete_work); | ||
445 | |||
446 | nvme_stop_ctrl(&ctrl->ctrl); | ||
447 | nvme_remove_namespaces(&ctrl->ctrl); | ||
448 | nvme_loop_shutdown_ctrl(ctrl); | ||
449 | nvme_uninit_ctrl(&ctrl->ctrl); | ||
450 | nvme_put_ctrl(&ctrl->ctrl); | ||
451 | } | ||
452 | |||
453 | static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) | ||
454 | { | ||
455 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) | ||
456 | return -EBUSY; | ||
457 | |||
458 | if (!queue_work(nvme_wq, &ctrl->delete_work)) | ||
459 | return -EBUSY; | ||
460 | |||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) | ||
465 | { | ||
466 | struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); | ||
467 | int ret; | ||
468 | |||
469 | ret = __nvme_loop_del_ctrl(ctrl); | ||
470 | if (ret) | ||
471 | return ret; | ||
472 | |||
473 | flush_work(&ctrl->delete_work); | ||
474 | |||
475 | return 0; | ||
476 | } | 437 | } |
477 | 438 | ||
478 | static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) | 439 | static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) |
@@ -482,7 +443,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) | |||
482 | mutex_lock(&nvme_loop_ctrl_mutex); | 443 | mutex_lock(&nvme_loop_ctrl_mutex); |
483 | list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { | 444 | list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { |
484 | if (ctrl->ctrl.cntlid == nctrl->cntlid) | 445 | if (ctrl->ctrl.cntlid == nctrl->cntlid) |
485 | __nvme_loop_del_ctrl(ctrl); | 446 | nvme_delete_ctrl(&ctrl->ctrl); |
486 | } | 447 | } |
487 | mutex_unlock(&nvme_loop_ctrl_mutex); | 448 | mutex_unlock(&nvme_loop_ctrl_mutex); |
488 | } | 449 | } |
@@ -538,7 +499,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { | |||
538 | .reg_write32 = nvmf_reg_write32, | 499 | .reg_write32 = nvmf_reg_write32, |
539 | .free_ctrl = nvme_loop_free_ctrl, | 500 | .free_ctrl = nvme_loop_free_ctrl, |
540 | .submit_async_event = nvme_loop_submit_async_event, | 501 | .submit_async_event = nvme_loop_submit_async_event, |
541 | .delete_ctrl = nvme_loop_del_ctrl, | 502 | .delete_ctrl = nvme_loop_delete_ctrl_host, |
542 | }; | 503 | }; |
543 | 504 | ||
544 | static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) | 505 | static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) |
@@ -600,7 +561,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, | |||
600 | ctrl->ctrl.opts = opts; | 561 | ctrl->ctrl.opts = opts; |
601 | INIT_LIST_HEAD(&ctrl->list); | 562 | INIT_LIST_HEAD(&ctrl->list); |
602 | 563 | ||
603 | INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); | ||
604 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); | 564 | INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); |
605 | 565 | ||
606 | ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, | 566 | ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, |
@@ -641,7 +601,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, | |||
641 | dev_info(ctrl->ctrl.device, | 601 | dev_info(ctrl->ctrl.device, |
642 | "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); | 602 | "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); |
643 | 603 | ||
644 | kref_get(&ctrl->ctrl.kref); | 604 | nvme_get_ctrl(&ctrl->ctrl); |
645 | 605 | ||
646 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | 606 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); |
647 | WARN_ON_ONCE(!changed); | 607 | WARN_ON_ONCE(!changed); |
@@ -730,7 +690,7 @@ static void __exit nvme_loop_cleanup_module(void) | |||
730 | 690 | ||
731 | mutex_lock(&nvme_loop_ctrl_mutex); | 691 | mutex_lock(&nvme_loop_ctrl_mutex); |
732 | list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) | 692 | list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) |
733 | __nvme_loop_del_ctrl(ctrl); | 693 | nvme_delete_ctrl(&ctrl->ctrl); |
734 | mutex_unlock(&nvme_loop_ctrl_mutex); | 694 | mutex_unlock(&nvme_loop_ctrl_mutex); |
735 | 695 | ||
736 | flush_workqueue(nvme_wq); | 696 | flush_workqueue(nvme_wq); |
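The loop transport above drops its private delete_work and __nvme_loop_del_ctrl() machinery in favor of the shared nvme_delete_ctrl() helper, keeping only the transport-specific teardown in the new ->delete_ctrl callback. The core helper itself is not part of this hunk; assuming it mirrors the sequence the removed nvme_loop_del_ctrl_work() performed, its shape is roughly the following sketch (illustrative names, not the verbatim core code):

/* Sketch only: assumed core-side replacement for the per-transport
 * delete workers, assuming struct nvme_ctrl now carries the
 * delete_work item.  The teardown order mirrors the removed
 * nvme_loop_del_ctrl_work(). */
static void example_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);	/* e.g. nvme_loop_delete_ctrl_host() */
	nvme_uninit_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

int example_nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		return -EBUSY;
	if (!queue_work(nvme_wq, &ctrl->delete_work))
		return -EBUSY;
	return 0;
}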
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 87e429bfcd8a..417f6c0331cc 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -223,7 +223,10 @@ struct nvmet_req { | |||
223 | struct bio inline_bio; | 223 | struct bio inline_bio; |
224 | struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; | 224 | struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; |
225 | int sg_cnt; | 225 | int sg_cnt; |
226 | /* data length as parsed from the command: */ | ||
226 | size_t data_len; | 227 | size_t data_len; |
228 | /* data length as parsed from the SGL descriptor: */ | ||
229 | size_t transfer_len; | ||
227 | 230 | ||
228 | struct nvmet_port *port; | 231 | struct nvmet_port *port; |
229 | 232 | ||
@@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); | |||
266 | bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, | 269 | bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, |
267 | struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); | 270 | struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); |
268 | void nvmet_req_uninit(struct nvmet_req *req); | 271 | void nvmet_req_uninit(struct nvmet_req *req); |
272 | void nvmet_req_execute(struct nvmet_req *req); | ||
269 | void nvmet_req_complete(struct nvmet_req *req, u16 status); | 273 | void nvmet_req_complete(struct nvmet_req *req, u16 status); |
270 | 274 | ||
271 | void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, | 275 | void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, |
@@ -314,7 +318,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, | |||
314 | u32 nvmet_get_log_page_len(struct nvme_command *cmd); | 318 | u32 nvmet_get_log_page_len(struct nvme_command *cmd); |
315 | 319 | ||
316 | #define NVMET_QUEUE_SIZE 1024 | 320 | #define NVMET_QUEUE_SIZE 1024 |
317 | #define NVMET_NR_QUEUES 64 | 321 | #define NVMET_NR_QUEUES 128 |
318 | #define NVMET_MAX_CMD NVMET_QUEUE_SIZE | 322 | #define NVMET_MAX_CMD NVMET_QUEUE_SIZE |
319 | #define NVMET_KAS 10 | 323 | #define NVMET_KAS 10 |
320 | #define NVMET_DISC_KATO 120 | 324 | #define NVMET_DISC_KATO 120 |
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 76d2bb793afe..49912909c298 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p) | |||
148 | static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) | 148 | static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) |
149 | { | 149 | { |
150 | return nvme_is_write(rsp->req.cmd) && | 150 | return nvme_is_write(rsp->req.cmd) && |
151 | rsp->req.data_len && | 151 | rsp->req.transfer_len && |
152 | !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); | 152 | !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); |
153 | } | 153 | } |
154 | 154 | ||
155 | static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) | 155 | static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) |
156 | { | 156 | { |
157 | return !nvme_is_write(rsp->req.cmd) && | 157 | return !nvme_is_write(rsp->req.cmd) && |
158 | rsp->req.data_len && | 158 | rsp->req.transfer_len && |
159 | !rsp->req.rsp->status && | 159 | !rsp->req.rsp->status && |
160 | !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); | 160 | !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); |
161 | } | 161 | } |
@@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) | |||
577 | return; | 577 | return; |
578 | } | 578 | } |
579 | 579 | ||
580 | rsp->req.execute(&rsp->req); | 580 | nvmet_req_execute(&rsp->req); |
581 | } | 581 | } |
582 | 582 | ||
583 | static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, | 583 | static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, |
@@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) | |||
609 | 609 | ||
610 | nvmet_rdma_use_inline_sg(rsp, len, off); | 610 | nvmet_rdma_use_inline_sg(rsp, len, off); |
611 | rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; | 611 | rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; |
612 | rsp->req.transfer_len += len; | ||
612 | return 0; | 613 | return 0; |
613 | } | 614 | } |
614 | 615 | ||
@@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, | |||
636 | nvmet_data_dir(&rsp->req)); | 637 | nvmet_data_dir(&rsp->req)); |
637 | if (ret < 0) | 638 | if (ret < 0) |
638 | return NVME_SC_INTERNAL; | 639 | return NVME_SC_INTERNAL; |
640 | rsp->req.transfer_len += len; | ||
639 | rsp->n_rdma += ret; | 641 | rsp->n_rdma += ret; |
640 | 642 | ||
641 | if (invalidate) { | 643 | if (invalidate) { |
@@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) | |||
693 | queue->cm_id->port_num, &rsp->read_cqe, NULL)) | 695 | queue->cm_id->port_num, &rsp->read_cqe, NULL)) |
694 | nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); | 696 | nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); |
695 | } else { | 697 | } else { |
696 | rsp->req.execute(&rsp->req); | 698 | nvmet_req_execute(&rsp->req); |
697 | } | 699 | } |
698 | 700 | ||
699 | return true; | 701 | return true; |
@@ -1512,15 +1514,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { | |||
1512 | 1514 | ||
1513 | static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) | 1515 | static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) |
1514 | { | 1516 | { |
1515 | struct nvmet_rdma_queue *queue; | 1517 | struct nvmet_rdma_queue *queue, *tmp; |
1516 | 1518 | ||
1517 | /* Device is being removed, delete all queues using this device */ | 1519 | /* Device is being removed, delete all queues using this device */ |
1518 | mutex_lock(&nvmet_rdma_queue_mutex); | 1520 | mutex_lock(&nvmet_rdma_queue_mutex); |
1519 | list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { | 1521 | list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, |
1522 | queue_list) { | ||
1520 | if (queue->dev->device != ib_device) | 1523 | if (queue->dev->device != ib_device) |
1521 | continue; | 1524 | continue; |
1522 | 1525 | ||
1523 | pr_info("Removing queue %d\n", queue->idx); | 1526 | pr_info("Removing queue %d\n", queue->idx); |
1527 | list_del_init(&queue->queue_list); | ||
1524 | __nvmet_rdma_queue_disconnect(queue); | 1528 | __nvmet_rdma_queue_disconnect(queue); |
1525 | } | 1529 | } |
1526 | mutex_unlock(&nvmet_rdma_queue_mutex); | 1530 | mutex_unlock(&nvmet_rdma_queue_mutex); |
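Both SGL mapping paths in the RDMA target now record how many bytes the descriptors actually map in req.transfer_len, and the direct rsp->req.execute() calls are funneled through the new nvmet_req_execute() declared in nvmet.h above. The helper's body lives in the target core and is not shown here; presumably it cross-checks the command-derived data_len against the SGL-derived transfer_len before dispatching, along these lines (sketch under that assumption):

/* Sketch of the assumed core helper: reject commands whose SGL length
 * disagrees with what the command itself advertised, otherwise run the
 * handler chosen at parse time. */
void example_nvmet_req_execute(struct nvmet_req *req)
{
	if (unlikely(req->data_len != req->transfer_len))
		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
	else
		req->execute(req);
}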
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 41366339b950..766955318005 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -130,7 +130,8 @@ config CHR_DEV_OSST | |||
130 | 130 | ||
131 | config BLK_DEV_SR | 131 | config BLK_DEV_SR |
132 | tristate "SCSI CDROM support" | 132 | tristate "SCSI CDROM support" |
133 | depends on SCSI | 133 | depends on SCSI && BLK_DEV |
134 | select CDROM | ||
134 | ---help--- | 135 | ---help--- |
135 | If you want to use a CD or DVD drive attached to your computer | 136 | If you want to use a CD or DVD drive attached to your computer |
136 | by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO | 137 | by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO |
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index c17677f494af..3e02bc3a7c3f 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) | |||
3246 | continue; | 3246 | continue; |
3247 | if (ndlp->rport) | 3247 | if (ndlp->rport) |
3248 | ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; | 3248 | ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; |
3249 | #if (IS_ENABLED(CONFIG_NVME_FC)) | ||
3250 | if (ndlp->nrport) | ||
3251 | nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport, | ||
3252 | vport->cfg_devloss_tmo); | ||
3253 | #endif | ||
3249 | } | 3254 | } |
3250 | spin_unlock_irq(shost->host_lock); | 3255 | spin_unlock_irq(shost->host_lock); |
3251 | } | 3256 | } |
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index bcc1694cebcd..54de24c785dd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, | |||
252 | struct scsi_request *rq; | 252 | struct scsi_request *rq; |
253 | int ret = DRIVER_ERROR << 24; | 253 | int ret = DRIVER_ERROR << 24; |
254 | 254 | ||
255 | req = blk_get_request(sdev->request_queue, | 255 | req = blk_get_request_flags(sdev->request_queue, |
256 | data_direction == DMA_TO_DEVICE ? | 256 | data_direction == DMA_TO_DEVICE ? |
257 | REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); | 257 | REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); |
258 | if (IS_ERR(req)) | 258 | if (IS_ERR(req)) |
259 | return ret; | 259 | return ret; |
260 | rq = scsi_req(req); | 260 | rq = scsi_req(req); |
@@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, | |||
268 | rq->retries = retries; | 268 | rq->retries = retries; |
269 | req->timeout = timeout; | 269 | req->timeout = timeout; |
270 | req->cmd_flags |= flags; | 270 | req->cmd_flags |= flags; |
271 | req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; | 271 | req->rq_flags |= rq_flags | RQF_QUIET; |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * head injection *required* here otherwise quiesce won't work | 274 | * head injection *required* here otherwise quiesce won't work |
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) | |||
1301 | /* | 1301 | /* |
1302 | * If the devices is blocked we defer normal commands. | 1302 | * If the devices is blocked we defer normal commands. |
1303 | */ | 1303 | */ |
1304 | if (!(req->rq_flags & RQF_PREEMPT)) | 1304 | if (req && !(req->rq_flags & RQF_PREEMPT)) |
1305 | ret = BLKPREP_DEFER; | 1305 | ret = BLKPREP_DEFER; |
1306 | break; | 1306 | break; |
1307 | default: | 1307 | default: |
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) | |||
1310 | * special commands. In particular any user initiated | 1310 | * special commands. In particular any user initiated |
1311 | * command is not allowed. | 1311 | * command is not allowed. |
1312 | */ | 1312 | */ |
1313 | if (!(req->rq_flags & RQF_PREEMPT)) | 1313 | if (req && !(req->rq_flags & RQF_PREEMPT)) |
1314 | ret = BLKPREP_KILL; | 1314 | ret = BLKPREP_KILL; |
1315 | break; | 1315 | break; |
1316 | } | 1316 | } |
@@ -1940,6 +1940,33 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) | |||
1940 | blk_mq_complete_request(cmd->request); | 1940 | blk_mq_complete_request(cmd->request); |
1941 | } | 1941 | } |
1942 | 1942 | ||
1943 | static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) | ||
1944 | { | ||
1945 | struct request_queue *q = hctx->queue; | ||
1946 | struct scsi_device *sdev = q->queuedata; | ||
1947 | |||
1948 | atomic_dec(&sdev->device_busy); | ||
1949 | put_device(&sdev->sdev_gendev); | ||
1950 | } | ||
1951 | |||
1952 | static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) | ||
1953 | { | ||
1954 | struct request_queue *q = hctx->queue; | ||
1955 | struct scsi_device *sdev = q->queuedata; | ||
1956 | |||
1957 | if (!get_device(&sdev->sdev_gendev)) | ||
1958 | goto out; | ||
1959 | if (!scsi_dev_queue_ready(q, sdev)) | ||
1960 | goto out_put_device; | ||
1961 | |||
1962 | return true; | ||
1963 | |||
1964 | out_put_device: | ||
1965 | put_device(&sdev->sdev_gendev); | ||
1966 | out: | ||
1967 | return false; | ||
1968 | } | ||
1969 | |||
1943 | static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, | 1970 | static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, |
1944 | const struct blk_mq_queue_data *bd) | 1971 | const struct blk_mq_queue_data *bd) |
1945 | { | 1972 | { |
@@ -1953,16 +1980,11 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1953 | 1980 | ||
1954 | ret = prep_to_mq(scsi_prep_state_check(sdev, req)); | 1981 | ret = prep_to_mq(scsi_prep_state_check(sdev, req)); |
1955 | if (ret != BLK_STS_OK) | 1982 | if (ret != BLK_STS_OK) |
1956 | goto out; | 1983 | goto out_put_budget; |
1957 | 1984 | ||
1958 | ret = BLK_STS_RESOURCE; | 1985 | ret = BLK_STS_RESOURCE; |
1959 | if (!get_device(&sdev->sdev_gendev)) | ||
1960 | goto out; | ||
1961 | |||
1962 | if (!scsi_dev_queue_ready(q, sdev)) | ||
1963 | goto out_put_device; | ||
1964 | if (!scsi_target_queue_ready(shost, sdev)) | 1986 | if (!scsi_target_queue_ready(shost, sdev)) |
1965 | goto out_dec_device_busy; | 1987 | goto out_put_budget; |
1966 | if (!scsi_host_queue_ready(q, shost, sdev)) | 1988 | if (!scsi_host_queue_ready(q, shost, sdev)) |
1967 | goto out_dec_target_busy; | 1989 | goto out_dec_target_busy; |
1968 | 1990 | ||
@@ -1993,15 +2015,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1993 | return BLK_STS_OK; | 2015 | return BLK_STS_OK; |
1994 | 2016 | ||
1995 | out_dec_host_busy: | 2017 | out_dec_host_busy: |
1996 | atomic_dec(&shost->host_busy); | 2018 | atomic_dec(&shost->host_busy); |
1997 | out_dec_target_busy: | 2019 | out_dec_target_busy: |
1998 | if (scsi_target(sdev)->can_queue > 0) | 2020 | if (scsi_target(sdev)->can_queue > 0) |
1999 | atomic_dec(&scsi_target(sdev)->target_busy); | 2021 | atomic_dec(&scsi_target(sdev)->target_busy); |
2000 | out_dec_device_busy: | 2022 | out_put_budget: |
2001 | atomic_dec(&sdev->device_busy); | 2023 | scsi_mq_put_budget(hctx); |
2002 | out_put_device: | ||
2003 | put_device(&sdev->sdev_gendev); | ||
2004 | out: | ||
2005 | switch (ret) { | 2024 | switch (ret) { |
2006 | case BLK_STS_OK: | 2025 | case BLK_STS_OK: |
2007 | break; | 2026 | break; |
@@ -2205,6 +2224,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) | |||
2205 | } | 2224 | } |
2206 | 2225 | ||
2207 | static const struct blk_mq_ops scsi_mq_ops = { | 2226 | static const struct blk_mq_ops scsi_mq_ops = { |
2227 | .get_budget = scsi_mq_get_budget, | ||
2228 | .put_budget = scsi_mq_put_budget, | ||
2208 | .queue_rq = scsi_queue_rq, | 2229 | .queue_rq = scsi_queue_rq, |
2209 | .complete = scsi_softirq_done, | 2230 | .complete = scsi_softirq_done, |
2210 | .timeout = scsi_timeout, | 2231 | .timeout = scsi_timeout, |
@@ -2919,21 +2940,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) | |||
2919 | int | 2940 | int |
2920 | scsi_device_quiesce(struct scsi_device *sdev) | 2941 | scsi_device_quiesce(struct scsi_device *sdev) |
2921 | { | 2942 | { |
2943 | struct request_queue *q = sdev->request_queue; | ||
2922 | int err; | 2944 | int err; |
2923 | 2945 | ||
2946 | /* | ||
2947 | * It is allowed to call scsi_device_quiesce() multiple times from | ||
2948 | * the same context but concurrent scsi_device_quiesce() calls are | ||
2949 | * not allowed. | ||
2950 | */ | ||
2951 | WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); | ||
2952 | |||
2953 | blk_set_preempt_only(q); | ||
2954 | |||
2955 | blk_mq_freeze_queue(q); | ||
2956 | /* | ||
2957 | * Ensure that the effect of blk_set_preempt_only() will be visible | ||
2958 | * for percpu_ref_tryget() callers that occur after the queue | ||
2959 | * unfreeze even if the queue was already frozen before this function | ||
2960 | * was called. See also https://lwn.net/Articles/573497/. | ||
2961 | */ | ||
2962 | synchronize_rcu(); | ||
2963 | blk_mq_unfreeze_queue(q); | ||
2964 | |||
2924 | mutex_lock(&sdev->state_mutex); | 2965 | mutex_lock(&sdev->state_mutex); |
2925 | err = scsi_device_set_state(sdev, SDEV_QUIESCE); | 2966 | err = scsi_device_set_state(sdev, SDEV_QUIESCE); |
2967 | if (err == 0) | ||
2968 | sdev->quiesced_by = current; | ||
2969 | else | ||
2970 | blk_clear_preempt_only(q); | ||
2926 | mutex_unlock(&sdev->state_mutex); | 2971 | mutex_unlock(&sdev->state_mutex); |
2927 | 2972 | ||
2928 | if (err) | 2973 | return err; |
2929 | return err; | ||
2930 | |||
2931 | scsi_run_queue(sdev->request_queue); | ||
2932 | while (atomic_read(&sdev->device_busy)) { | ||
2933 | msleep_interruptible(200); | ||
2934 | scsi_run_queue(sdev->request_queue); | ||
2935 | } | ||
2936 | return 0; | ||
2937 | } | 2974 | } |
2938 | EXPORT_SYMBOL(scsi_device_quiesce); | 2975 | EXPORT_SYMBOL(scsi_device_quiesce); |
2939 | 2976 | ||
@@ -2953,9 +2990,11 @@ void scsi_device_resume(struct scsi_device *sdev) | |||
2953 | * device deleted during suspend) | 2990 | * device deleted during suspend) |
2954 | */ | 2991 | */ |
2955 | mutex_lock(&sdev->state_mutex); | 2992 | mutex_lock(&sdev->state_mutex); |
2956 | if (sdev->sdev_state == SDEV_QUIESCE && | 2993 | WARN_ON_ONCE(!sdev->quiesced_by); |
2957 | scsi_device_set_state(sdev, SDEV_RUNNING) == 0) | 2994 | sdev->quiesced_by = NULL; |
2958 | scsi_run_queue(sdev->request_queue); | 2995 | blk_clear_preempt_only(sdev->request_queue); |
2996 | if (sdev->sdev_state == SDEV_QUIESCE) | ||
2997 | scsi_device_set_state(sdev, SDEV_RUNNING); | ||
2959 | mutex_unlock(&sdev->state_mutex); | 2998 | mutex_unlock(&sdev->state_mutex); |
2960 | } | 2999 | } |
2961 | EXPORT_SYMBOL(scsi_device_resume); | 3000 | EXPORT_SYMBOL(scsi_device_resume); |
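scsi_device_quiesce() no longer polls device_busy in a msleep loop; it marks the queue preempt-only, cycles a freeze/unfreeze with a synchronize_rcu() in between, and relies on the block core to admit only BLK_MQ_REQ_PREEMPT allocations (such as the scsi_execute() request above) while the device is quiesced. The core-side admission check sits in blk_queue_enter() outside this section, but the intended gating looks roughly like this (sketch, assuming a blk_queue_preempt_only() test helper paired with the blk_set/clear_preempt_only() calls used above):

/* Sketch: assumed admission logic.  While PREEMPT_ONLY is set by
 * scsi_device_quiesce(), only callers passing BLK_MQ_REQ_PREEMPT get a
 * queue reference; normal I/O is deferred until resume. */
static bool example_may_enter_queue(struct request_queue *q,
				    blk_mq_req_flags_t flags)
{
	const bool preempt = flags & BLK_MQ_REQ_PREEMPT;

	if (!percpu_ref_tryget_live(&q->q_usage_counter))
		return false;		/* queue frozen or dying */

	if (preempt || !blk_queue_preempt_only(q))
		return true;		/* admitted */

	percpu_ref_put(&q->q_usage_counter);
	return false;			/* quiesced: caller must wait */
}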
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index aa28874e8fb9..f098877eed4a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) | |||
217 | if (sfp->parentdp->device->type == TYPE_SCANNER) | 217 | if (sfp->parentdp->device->type == TYPE_SCANNER) |
218 | return 0; | 218 | return 0; |
219 | 219 | ||
220 | return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); | 220 | return blk_verify_command(cmd, filp->f_mode); |
221 | } | 221 | } |
222 | 222 | ||
223 | static int | 223 | static int |
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 789f55e851ae..4a181fcb5175 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) | |||
54 | } | 54 | } |
55 | EXPORT_SYMBOL(I_BDEV); | 55 | EXPORT_SYMBOL(I_BDEV); |
56 | 56 | ||
57 | void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) | ||
58 | { | ||
59 | struct va_format vaf; | ||
60 | va_list args; | ||
61 | |||
62 | va_start(args, fmt); | ||
63 | vaf.fmt = fmt; | ||
64 | vaf.va = &args; | ||
65 | printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); | ||
66 | va_end(args); | ||
67 | } | ||
68 | |||
69 | static void bdev_write_inode(struct block_device *bdev) | 57 | static void bdev_write_inode(struct block_device *bdev) |
70 | { | 58 | { |
71 | struct inode *inode = bdev->bd_inode; | 59 | struct inode *inode = bdev->bd_inode; |
@@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, | |||
249 | if (!READ_ONCE(bio.bi_private)) | 237 | if (!READ_ONCE(bio.bi_private)) |
250 | break; | 238 | break; |
251 | if (!(iocb->ki_flags & IOCB_HIPRI) || | 239 | if (!(iocb->ki_flags & IOCB_HIPRI) || |
252 | !blk_mq_poll(bdev_get_queue(bdev), qc)) | 240 | !blk_poll(bdev_get_queue(bdev), qc)) |
253 | io_schedule(); | 241 | io_schedule(); |
254 | } | 242 | } |
255 | __set_current_state(TASK_RUNNING); | 243 | __set_current_state(TASK_RUNNING); |
@@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) | |||
414 | break; | 402 | break; |
415 | 403 | ||
416 | if (!(iocb->ki_flags & IOCB_HIPRI) || | 404 | if (!(iocb->ki_flags & IOCB_HIPRI) || |
417 | !blk_mq_poll(bdev_get_queue(bdev), qc)) | 405 | !blk_poll(bdev_get_queue(bdev), qc)) |
418 | io_schedule(); | 406 | io_schedule(); |
419 | } | 407 | } |
420 | __set_current_state(TASK_RUNNING); | 408 | __set_current_state(TASK_RUNNING); |
@@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, | |||
674 | if (!ops->rw_page || bdev_get_integrity(bdev)) | 662 | if (!ops->rw_page || bdev_get_integrity(bdev)) |
675 | return result; | 663 | return result; |
676 | 664 | ||
677 | result = blk_queue_enter(bdev->bd_queue, false); | 665 | result = blk_queue_enter(bdev->bd_queue, 0); |
678 | if (result) | 666 | if (result) |
679 | return result; | 667 | return result; |
680 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); | 668 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); |
@@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, | |||
710 | 698 | ||
711 | if (!ops->rw_page || bdev_get_integrity(bdev)) | 699 | if (!ops->rw_page || bdev_get_integrity(bdev)) |
712 | return -EOPNOTSUPP; | 700 | return -EOPNOTSUPP; |
713 | result = blk_queue_enter(bdev->bd_queue, false); | 701 | result = blk_queue_enter(bdev->bd_queue, 0); |
714 | if (result) | 702 | if (result) |
715 | return result; | 703 | return result; |
716 | 704 | ||
diff --git a/fs/buffer.c b/fs/buffer.c
index 49b7e9bdcd1d..1c18a22a6013 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -253,27 +253,6 @@ out: | |||
253 | } | 253 | } |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Kick the writeback threads then try to free up some ZONE_NORMAL memory. | ||
257 | */ | ||
258 | static void free_more_memory(void) | ||
259 | { | ||
260 | struct zoneref *z; | ||
261 | int nid; | ||
262 | |||
263 | wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); | ||
264 | yield(); | ||
265 | |||
266 | for_each_online_node(nid) { | ||
267 | |||
268 | z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), | ||
269 | gfp_zone(GFP_NOFS), NULL); | ||
270 | if (z->zone) | ||
271 | try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, | ||
272 | GFP_NOFS, NULL); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * I/O completion handler for block_read_full_page() - pages | 256 | * I/O completion handler for block_read_full_page() - pages |
278 | * which come unlocked at the end of I/O. | 257 | * which come unlocked at the end of I/O. |
279 | */ | 258 | */ |
@@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode) | |||
861 | * which may not fail from ordinary buffer allocations. | 840 | * which may not fail from ordinary buffer allocations. |
862 | */ | 841 | */ |
863 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 842 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
864 | int retry) | 843 | bool retry) |
865 | { | 844 | { |
866 | struct buffer_head *bh, *head; | 845 | struct buffer_head *bh, *head; |
846 | gfp_t gfp = GFP_NOFS; | ||
867 | long offset; | 847 | long offset; |
868 | 848 | ||
869 | try_again: | 849 | if (retry) |
850 | gfp |= __GFP_NOFAIL; | ||
851 | |||
870 | head = NULL; | 852 | head = NULL; |
871 | offset = PAGE_SIZE; | 853 | offset = PAGE_SIZE; |
872 | while ((offset -= size) >= 0) { | 854 | while ((offset -= size) >= 0) { |
873 | bh = alloc_buffer_head(GFP_NOFS); | 855 | bh = alloc_buffer_head(gfp); |
874 | if (!bh) | 856 | if (!bh) |
875 | goto no_grow; | 857 | goto no_grow; |
876 | 858 | ||
@@ -896,23 +878,7 @@ no_grow: | |||
896 | } while (head); | 878 | } while (head); |
897 | } | 879 | } |
898 | 880 | ||
899 | /* | 881 | return NULL; |
900 | * Return failure for non-async IO requests. Async IO requests | ||
901 | * are not allowed to fail, so we have to wait until buffer heads | ||
902 | * become available. But we don't want tasks sleeping with | ||
903 | * partially complete buffers, so all were released above. | ||
904 | */ | ||
905 | if (!retry) | ||
906 | return NULL; | ||
907 | |||
908 | /* We're _really_ low on memory. Now we just | ||
909 | * wait for old buffer heads to become free due to | ||
910 | * finishing IO. Since this is an async request and | ||
911 | * the reserve list is empty, we're sure there are | ||
912 | * async buffer heads in use. | ||
913 | */ | ||
914 | free_more_memory(); | ||
915 | goto try_again; | ||
916 | } | 882 | } |
917 | EXPORT_SYMBOL_GPL(alloc_page_buffers); | 883 | EXPORT_SYMBOL_GPL(alloc_page_buffers); |
918 | 884 | ||
@@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
1001 | gfp_mask |= __GFP_NOFAIL; | 967 | gfp_mask |= __GFP_NOFAIL; |
1002 | 968 | ||
1003 | page = find_or_create_page(inode->i_mapping, index, gfp_mask); | 969 | page = find_or_create_page(inode->i_mapping, index, gfp_mask); |
1004 | if (!page) | ||
1005 | return ret; | ||
1006 | 970 | ||
1007 | BUG_ON(!PageLocked(page)); | 971 | BUG_ON(!PageLocked(page)); |
1008 | 972 | ||
@@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
1021 | /* | 985 | /* |
1022 | * Allocate some buffers for this page | 986 | * Allocate some buffers for this page |
1023 | */ | 987 | */ |
1024 | bh = alloc_page_buffers(page, size, 0); | 988 | bh = alloc_page_buffers(page, size, true); |
1025 | if (!bh) | ||
1026 | goto failed; | ||
1027 | 989 | ||
1028 | /* | 990 | /* |
1029 | * Link the page to the buffers and initialise them. Take the | 991 | * Link the page to the buffers and initialise them. Take the |
@@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, | |||
1103 | ret = grow_buffers(bdev, block, size, gfp); | 1065 | ret = grow_buffers(bdev, block, size, gfp); |
1104 | if (ret < 0) | 1066 | if (ret < 0) |
1105 | return NULL; | 1067 | return NULL; |
1106 | if (ret == 0) | ||
1107 | free_more_memory(); | ||
1108 | } | 1068 | } |
1109 | } | 1069 | } |
1110 | 1070 | ||
@@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page, | |||
1575 | { | 1535 | { |
1576 | struct buffer_head *bh, *head, *tail; | 1536 | struct buffer_head *bh, *head, *tail; |
1577 | 1537 | ||
1578 | head = alloc_page_buffers(page, blocksize, 1); | 1538 | head = alloc_page_buffers(page, blocksize, true); |
1579 | bh = head; | 1539 | bh = head; |
1580 | do { | 1540 | do { |
1581 | bh->b_state |= b_state; | 1541 | bh->b_state |= b_state; |
@@ -2639,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping, | |||
2639 | * Be careful: the buffer linked list is a NULL terminated one, rather | 2599 | * Be careful: the buffer linked list is a NULL terminated one, rather |
2640 | * than the circular one we're used to. | 2600 | * than the circular one we're used to. |
2641 | */ | 2601 | */ |
2642 | head = alloc_page_buffers(page, blocksize, 0); | 2602 | head = alloc_page_buffers(page, blocksize, false); |
2643 | if (!head) { | 2603 | if (!head) { |
2644 | ret = -ENOMEM; | 2604 | ret = -ENOMEM; |
2645 | goto out_release; | 2605 | goto out_release; |
@@ -3056,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio) | |||
3056 | sector_t maxsector; | 3016 | sector_t maxsector; |
3057 | struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; | 3017 | struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; |
3058 | unsigned truncated_bytes; | 3018 | unsigned truncated_bytes; |
3019 | struct hd_struct *part; | ||
3020 | |||
3021 | rcu_read_lock(); | ||
3022 | part = __disk_get_part(bio->bi_disk, bio->bi_partno); | ||
3023 | if (part) | ||
3024 | maxsector = part_nr_sects_read(part); | ||
3025 | else | ||
3026 | maxsector = get_capacity(bio->bi_disk); | ||
3027 | rcu_read_unlock(); | ||
3059 | 3028 | ||
3060 | maxsector = get_capacity(bio->bi_disk); | ||
3061 | if (!maxsector) | 3029 | if (!maxsector) |
3062 | return; | 3030 | return; |
3063 | 3031 | ||
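fs/buffer.c replaces the old free_more_memory()/try_again retry dance with a bool retry argument that simply maps to __GFP_NOFAIL, letting the page allocator own the "must not fail" semantics. The idiom in isolation looks like this (hypothetical wrapper, not part of the patch):

/* Hypothetical helper showing the pattern adopted above: callers that
 * cannot tolerate failure pass retry=true, which becomes __GFP_NOFAIL
 * and makes the allocator retry internally instead of the caller
 * open-coding a reclaim-and-retry loop. */
static struct buffer_head *example_alloc_bh(bool retry)
{
	gfp_t gfp = GFP_NOFS;

	if (retry)
		gfp |= __GFP_NOFAIL;

	return alloc_buffer_head(gfp);	/* NULL only when retry == false */
}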
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe1325da9d..3aafb3343a65 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio) | |||
497 | dio->waiter = current; | 497 | dio->waiter = current; |
498 | spin_unlock_irqrestore(&dio->bio_lock, flags); | 498 | spin_unlock_irqrestore(&dio->bio_lock, flags); |
499 | if (!(dio->iocb->ki_flags & IOCB_HIPRI) || | 499 | if (!(dio->iocb->ki_flags & IOCB_HIPRI) || |
500 | !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) | 500 | !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) |
501 | io_schedule(); | 501 | io_schedule(); |
502 | /* wake up sets us TASK_RUNNING */ | 502 | /* wake up sets us TASK_RUNNING */ |
503 | spin_lock_irqsave(&dio->bio_lock, flags); | 503 | spin_lock_irqsave(&dio->bio_lock, flags); |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 245c430a2e41..08f5debd07d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, | |||
933 | 933 | ||
934 | #endif /* CONFIG_CGROUP_WRITEBACK */ | 934 | #endif /* CONFIG_CGROUP_WRITEBACK */ |
935 | 935 | ||
936 | void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, | 936 | /* |
937 | bool range_cyclic, enum wb_reason reason) | 937 | * Add in the number of potentially dirty inodes, because each inode |
938 | * write can dirty pagecache in the underlying blockdev. | ||
939 | */ | ||
940 | static unsigned long get_nr_dirty_pages(void) | ||
938 | { | 941 | { |
939 | struct wb_writeback_work *work; | 942 | return global_node_page_state(NR_FILE_DIRTY) + |
943 | global_node_page_state(NR_UNSTABLE_NFS) + | ||
944 | get_nr_dirty_inodes(); | ||
945 | } | ||
940 | 946 | ||
947 | static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) | ||
948 | { | ||
941 | if (!wb_has_dirty_io(wb)) | 949 | if (!wb_has_dirty_io(wb)) |
942 | return; | 950 | return; |
943 | 951 | ||
944 | /* | 952 | /* |
945 | * This is WB_SYNC_NONE writeback, so if allocation fails just | 953 | * All callers of this function want to start writeback of all |
946 | * wakeup the thread for old dirty data writeback | 954 | * dirty pages. Places like vmscan can call this at a very |
955 | * high frequency, causing pointless allocations of tons of | ||
956 | * work items and keeping the flusher threads busy retrieving | ||
957 | * that work. Ensure that we only allow one of them pending and | ||
958 | * inflight at the time. | ||
947 | */ | 959 | */ |
948 | work = kzalloc(sizeof(*work), | 960 | if (test_bit(WB_start_all, &wb->state) || |
949 | GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); | 961 | test_and_set_bit(WB_start_all, &wb->state)) |
950 | if (!work) { | ||
951 | trace_writeback_nowork(wb); | ||
952 | wb_wakeup(wb); | ||
953 | return; | 962 | return; |
954 | } | ||
955 | |||
956 | work->sync_mode = WB_SYNC_NONE; | ||
957 | work->nr_pages = nr_pages; | ||
958 | work->range_cyclic = range_cyclic; | ||
959 | work->reason = reason; | ||
960 | work->auto_free = 1; | ||
961 | 963 | ||
962 | wb_queue_work(wb, work); | 964 | wb->start_all_reason = reason; |
965 | wb_wakeup(wb); | ||
963 | } | 966 | } |
964 | 967 | ||
965 | /** | 968 | /** |
@@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) | |||
1814 | return work; | 1817 | return work; |
1815 | } | 1818 | } |
1816 | 1819 | ||
1817 | /* | ||
1818 | * Add in the number of potentially dirty inodes, because each inode | ||
1819 | * write can dirty pagecache in the underlying blockdev. | ||
1820 | */ | ||
1821 | static unsigned long get_nr_dirty_pages(void) | ||
1822 | { | ||
1823 | return global_node_page_state(NR_FILE_DIRTY) + | ||
1824 | global_node_page_state(NR_UNSTABLE_NFS) + | ||
1825 | get_nr_dirty_inodes(); | ||
1826 | } | ||
1827 | |||
1828 | static long wb_check_background_flush(struct bdi_writeback *wb) | 1820 | static long wb_check_background_flush(struct bdi_writeback *wb) |
1829 | { | 1821 | { |
1830 | if (wb_over_bg_thresh(wb)) { | 1822 | if (wb_over_bg_thresh(wb)) { |
@@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) | |||
1877 | return 0; | 1869 | return 0; |
1878 | } | 1870 | } |
1879 | 1871 | ||
1872 | static long wb_check_start_all(struct bdi_writeback *wb) | ||
1873 | { | ||
1874 | long nr_pages; | ||
1875 | |||
1876 | if (!test_bit(WB_start_all, &wb->state)) | ||
1877 | return 0; | ||
1878 | |||
1879 | nr_pages = get_nr_dirty_pages(); | ||
1880 | if (nr_pages) { | ||
1881 | struct wb_writeback_work work = { | ||
1882 | .nr_pages = wb_split_bdi_pages(wb, nr_pages), | ||
1883 | .sync_mode = WB_SYNC_NONE, | ||
1884 | .range_cyclic = 1, | ||
1885 | .reason = wb->start_all_reason, | ||
1886 | }; | ||
1887 | |||
1888 | nr_pages = wb_writeback(wb, &work); | ||
1889 | } | ||
1890 | |||
1891 | clear_bit(WB_start_all, &wb->state); | ||
1892 | return nr_pages; | ||
1893 | } | ||
1894 | |||
1895 | |||
1880 | /* | 1896 | /* |
1881 | * Retrieve work items and do the writeback they describe | 1897 | * Retrieve work items and do the writeback they describe |
1882 | */ | 1898 | */ |
@@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) | |||
1893 | } | 1909 | } |
1894 | 1910 | ||
1895 | /* | 1911 | /* |
1912 | * Check for a flush-everything request | ||
1913 | */ | ||
1914 | wrote += wb_check_start_all(wb); | ||
1915 | |||
1916 | /* | ||
1896 | * Check for periodic writeback, kupdated() style | 1917 | * Check for periodic writeback, kupdated() style |
1897 | */ | 1918 | */ |
1898 | wrote += wb_check_old_data_flush(wb); | 1919 | wrote += wb_check_old_data_flush(wb); |
@@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work) | |||
1947 | } | 1968 | } |
1948 | 1969 | ||
1949 | /* | 1970 | /* |
1950 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | 1971 | * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, |
1951 | * the whole world. | 1972 | * write back the whole world. |
1952 | */ | 1973 | */ |
1953 | void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) | 1974 | static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, |
1975 | enum wb_reason reason) | ||
1976 | { | ||
1977 | struct bdi_writeback *wb; | ||
1978 | |||
1979 | if (!bdi_has_dirty_io(bdi)) | ||
1980 | return; | ||
1981 | |||
1982 | list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) | ||
1983 | wb_start_writeback(wb, reason); | ||
1984 | } | ||
1985 | |||
1986 | void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, | ||
1987 | enum wb_reason reason) | ||
1988 | { | ||
1989 | rcu_read_lock(); | ||
1990 | __wakeup_flusher_threads_bdi(bdi, reason); | ||
1991 | rcu_read_unlock(); | ||
1992 | } | ||
1993 | |||
1994 | /* | ||
1995 | * Wakeup the flusher threads to start writeback of all currently dirty pages | ||
1996 | */ | ||
1997 | void wakeup_flusher_threads(enum wb_reason reason) | ||
1954 | { | 1998 | { |
1955 | struct backing_dev_info *bdi; | 1999 | struct backing_dev_info *bdi; |
1956 | 2000 | ||
@@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) | |||
1960 | if (blk_needs_flush_plug(current)) | 2004 | if (blk_needs_flush_plug(current)) |
1961 | blk_schedule_flush_plug(current); | 2005 | blk_schedule_flush_plug(current); |
1962 | 2006 | ||
1963 | if (!nr_pages) | ||
1964 | nr_pages = get_nr_dirty_pages(); | ||
1965 | |||
1966 | rcu_read_lock(); | 2007 | rcu_read_lock(); |
1967 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { | 2008 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) |
1968 | struct bdi_writeback *wb; | 2009 | __wakeup_flusher_threads_bdi(bdi, reason); |
1969 | |||
1970 | if (!bdi_has_dirty_io(bdi)) | ||
1971 | continue; | ||
1972 | |||
1973 | list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) | ||
1974 | wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), | ||
1975 | false, reason); | ||
1976 | } | ||
1977 | rcu_read_unlock(); | 2010 | rcu_read_unlock(); |
1978 | } | 2011 | } |
1979 | 2012 | ||
@@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) | |||
2343 | EXPORT_SYMBOL(writeback_inodes_sb); | 2376 | EXPORT_SYMBOL(writeback_inodes_sb); |
2344 | 2377 | ||
2345 | /** | 2378 | /** |
2346 | * try_to_writeback_inodes_sb_nr - try to start writeback if none underway | 2379 | * try_to_writeback_inodes_sb - try to start writeback if none underway |
2347 | * @sb: the superblock | 2380 | * @sb: the superblock |
2348 | * @nr: the number of pages to write | 2381 | * @reason: reason why some writeback work was initiated |
2349 | * @reason: the reason of writeback | ||
2350 | * | 2382 | * |
2351 | * Invoke writeback_inodes_sb_nr if no writeback is currently underway. | 2383 | * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. |
2352 | * Returns 1 if writeback was started, 0 if not. | ||
2353 | */ | 2384 | */ |
2354 | bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, | 2385 | void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) |
2355 | enum wb_reason reason) | ||
2356 | { | 2386 | { |
2357 | if (!down_read_trylock(&sb->s_umount)) | 2387 | if (!down_read_trylock(&sb->s_umount)) |
2358 | return false; | 2388 | return; |
2359 | 2389 | ||
2360 | __writeback_inodes_sb_nr(sb, nr, reason, true); | 2390 | __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); |
2361 | up_read(&sb->s_umount); | 2391 | up_read(&sb->s_umount); |
2362 | return true; | ||
2363 | } | ||
2364 | EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); | ||
2365 | |||
2366 | /** | ||
2367 | * try_to_writeback_inodes_sb - try to start writeback if none underway | ||
2368 | * @sb: the superblock | ||
2369 | * @reason: reason why some writeback work was initiated | ||
2370 | * | ||
2371 | * Implement by try_to_writeback_inodes_sb_nr() | ||
2372 | * Returns 1 if writeback was started, 0 if not. | ||
2373 | */ | ||
2374 | bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) | ||
2375 | { | ||
2376 | return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); | ||
2377 | } | 2392 | } |
2378 | EXPORT_SYMBOL(try_to_writeback_inodes_sb); | 2393 | EXPORT_SYMBOL(try_to_writeback_inodes_sb); |
2379 | 2394 | ||
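wb_start_writeback() now collapses all "write back everything" requests into a single pending WB_start_all bit instead of allocating one work item per caller, and wb_check_start_all() later turns that bit into a single wb_writeback_work sized by get_nr_dirty_pages(). The deduplication idiom it relies on is worth spelling out on its own (generic illustration, not from the patch):

/* Generic illustration of the test_bit-before-test_and_set_bit idiom:
 * the cheap read-only check skips the atomic RMW (and the cacheline
 * bounce) in the common case where the work is already pending. */
static bool example_queue_once(unsigned long *state, int pending_bit)
{
	if (test_bit(pending_bit, state) ||
	    test_and_set_bit(pending_bit, state))
		return false;	/* already pending, nothing to do */
	return true;		/* caller owns the newly queued work */
}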
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a964a550..b9f74803e56c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1057,7 +1057,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, | |||
1057 | 1057 | ||
1058 | if (!(iocb->ki_flags & IOCB_HIPRI) || | 1058 | if (!(iocb->ki_flags & IOCB_HIPRI) || |
1059 | !dio->submit.last_queue || | 1059 | !dio->submit.last_queue || |
1060 | !blk_mq_poll(dio->submit.last_queue, | 1060 | !blk_poll(dio->submit.last_queue, |
1061 | dio->submit.cookie)) | 1061 | dio->submit.cookie)) |
1062 | io_schedule(); | 1062 | io_schedule(); |
1063 | } | 1063 | } |
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cc91856b5e2d..3a2e509c77c5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { | |||
1739 | spin_lock(&mapping->private_lock); | 1739 | spin_lock(&mapping->private_lock); |
1740 | if (unlikely(!page_has_buffers(page))) { | 1740 | if (unlikely(!page_has_buffers(page))) { |
1741 | spin_unlock(&mapping->private_lock); | 1741 | spin_unlock(&mapping->private_lock); |
1742 | bh = head = alloc_page_buffers(page, bh_size, 1); | 1742 | bh = head = alloc_page_buffers(page, bh_size, true); |
1743 | spin_lock(&mapping->private_lock); | 1743 | spin_lock(&mapping->private_lock); |
1744 | if (likely(!page_has_buffers(page))) { | 1744 | if (likely(!page_has_buffers(page))) { |
1745 | struct buffer_head *tail; | 1745 | struct buffer_head *tail; |
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b6f402194f02..ee8392aee9f6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, | |||
507 | if (unlikely(!page_has_buffers(page))) { | 507 | if (unlikely(!page_has_buffers(page))) { |
508 | struct buffer_head *tail; | 508 | struct buffer_head *tail; |
509 | 509 | ||
510 | bh = head = alloc_page_buffers(page, blocksize, 1); | 510 | bh = head = alloc_page_buffers(page, blocksize, true); |
511 | do { | 511 | do { |
512 | set_buffer_uptodate(bh); | 512 | set_buffer_uptodate(bh); |
513 | tail = bh; | 513 | tail = bh; |
@@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync) | |||
109 | { | 109 | { |
110 | int nowait = 0, wait = 1; | 110 | int nowait = 0, wait = 1; |
111 | 111 | ||
112 | wakeup_flusher_threads(0, WB_REASON_SYNC); | 112 | wakeup_flusher_threads(WB_REASON_SYNC); |
113 | iterate_supers(sync_inodes_one_sb, NULL); | 113 | iterate_supers(sync_inodes_one_sb, NULL); |
114 | iterate_supers(sync_fs_one_sb, &nowait); | 114 | iterate_supers(sync_fs_one_sb, &nowait); |
115 | iterate_supers(sync_fs_one_sb, &wait); | 115 | iterate_supers(sync_fs_one_sb, &wait); |
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index fff4cfa0c21d..bfe86b54f6c1 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -25,6 +25,7 @@ enum wb_state { | |||
25 | WB_shutting_down, /* wb_shutdown() in progress */ | 25 | WB_shutting_down, /* wb_shutdown() in progress */ |
26 | WB_writeback_running, /* Writeback is in progress */ | 26 | WB_writeback_running, /* Writeback is in progress */ |
27 | WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ | 27 | WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ |
28 | WB_start_all, /* nr_pages == 0 (all) work pending */ | ||
28 | }; | 29 | }; |
29 | 30 | ||
30 | enum wb_congested_state { | 31 | enum wb_congested_state { |
@@ -45,6 +46,28 @@ enum wb_stat_item { | |||
45 | #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) | 46 | #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) |
46 | 47 | ||
47 | /* | 48 | /* |
49 | * why some writeback work was initiated | ||
50 | */ | ||
51 | enum wb_reason { | ||
52 | WB_REASON_BACKGROUND, | ||
53 | WB_REASON_VMSCAN, | ||
54 | WB_REASON_SYNC, | ||
55 | WB_REASON_PERIODIC, | ||
56 | WB_REASON_LAPTOP_TIMER, | ||
57 | WB_REASON_FREE_MORE_MEM, | ||
58 | WB_REASON_FS_FREE_SPACE, | ||
59 | /* | ||
60 | * There is no bdi forker thread any more and works are done | ||
61 | * by emergency worker, however, this is TPs userland visible | ||
62 | * and we'll be exposing exactly the same information, | ||
63 | * so it has a mismatch name. | ||
64 | */ | ||
65 | WB_REASON_FORKER_THREAD, | ||
66 | |||
67 | WB_REASON_MAX, | ||
68 | }; | ||
69 | |||
70 | /* | ||
48 | * For cgroup writeback, multiple wb's may map to the same blkcg. Those | 71 | * For cgroup writeback, multiple wb's may map to the same blkcg. Those |
49 | * wb's can operate mostly independently but should share the congested | 72 | * wb's can operate mostly independently but should share the congested |
50 | * state. To facilitate such sharing, the congested state is tracked using | 73 | * state. To facilitate such sharing, the congested state is tracked using |
@@ -116,6 +139,7 @@ struct bdi_writeback { | |||
116 | 139 | ||
117 | struct fprop_local_percpu completions; | 140 | struct fprop_local_percpu completions; |
118 | int dirty_exceeded; | 141 | int dirty_exceeded; |
142 | enum wb_reason start_all_reason; | ||
119 | 143 | ||
120 | spinlock_t work_lock; /* protects work_list & dwork scheduling */ | 144 | spinlock_t work_lock; /* protects work_list & dwork scheduling */ |
121 | struct list_head work_list; | 145 | struct list_head work_list; |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 16621579a3db..f41ca8486e02 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -39,8 +39,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) | |||
39 | return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); | 39 | return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); |
40 | } | 40 | } |
41 | 41 | ||
42 | void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, | ||
43 | bool range_cyclic, enum wb_reason reason); | ||
44 | void wb_start_background_writeback(struct bdi_writeback *wb); | 42 | void wb_start_background_writeback(struct bdi_writeback *wb); |
45 | void wb_workfn(struct work_struct *work); | 43 | void wb_workfn(struct work_struct *work); |
46 | void wb_wakeup_delayed(struct bdi_writeback *wb); | 44 | void wb_wakeup_delayed(struct bdi_writeback *wb); |
@@ -175,8 +173,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) | |||
175 | 173 | ||
176 | long congestion_wait(int sync, long timeout); | 174 | long congestion_wait(int sync, long timeout); |
177 | long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); | 175 | long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); |
178 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
179 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
180 | 176 | ||
181 | static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) | 177 | static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) |
182 | { | 178 | { |
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 275c91c99516..d4eec19a6d3c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -129,18 +129,6 @@ static inline void *bio_data(struct bio *bio) | |||
129 | #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) | 129 | #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * queues that have highmem support enabled may still need to revert to | ||
133 | * PIO transfers occasionally and thus map high pages temporarily. For | ||
134 | * permanent PIO fall back, user is probably better off disabling highmem | ||
135 | * I/O completely on that queue (see ide-dma for example) | ||
136 | */ | ||
137 | #define __bio_kmap_atomic(bio, iter) \ | ||
138 | (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ | ||
139 | bio_iter_iovec((bio), (iter)).bv_offset) | ||
140 | |||
141 | #define __bio_kunmap_atomic(addr) kunmap_atomic(addr) | ||
142 | |||
143 | /* | ||
144 | * merge helpers etc | 132 | * merge helpers etc |
145 | */ | 133 | */ |
146 | 134 | ||
@@ -522,13 +510,11 @@ do { \ | |||
522 | 510 | ||
523 | #ifdef CONFIG_BLK_CGROUP | 511 | #ifdef CONFIG_BLK_CGROUP |
524 | int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); | 512 | int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); |
525 | int bio_associate_current(struct bio *bio); | ||
526 | void bio_disassociate_task(struct bio *bio); | 513 | void bio_disassociate_task(struct bio *bio); |
527 | void bio_clone_blkcg_association(struct bio *dst, struct bio *src); | 514 | void bio_clone_blkcg_association(struct bio *dst, struct bio *src); |
528 | #else /* CONFIG_BLK_CGROUP */ | 515 | #else /* CONFIG_BLK_CGROUP */ |
529 | static inline int bio_associate_blkcg(struct bio *bio, | 516 | static inline int bio_associate_blkcg(struct bio *bio, |
530 | struct cgroup_subsys_state *blkcg_css) { return 0; } | 517 | struct cgroup_subsys_state *blkcg_css) { return 0; } |
531 | static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } | ||
532 | static inline void bio_disassociate_task(struct bio *bio) { } | 518 | static inline void bio_disassociate_task(struct bio *bio) { } |
533 | static inline void bio_clone_blkcg_association(struct bio *dst, | 519 | static inline void bio_clone_blkcg_association(struct bio *dst, |
534 | struct bio *src) { } | 520 | struct bio *src) { } |
@@ -575,17 +561,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) | |||
575 | } | 561 | } |
576 | #endif | 562 | #endif |
577 | 563 | ||
578 | static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, | ||
579 | unsigned long *flags) | ||
580 | { | ||
581 | return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); | ||
582 | } | ||
583 | #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) | ||
584 | |||
585 | #define bio_kmap_irq(bio, flags) \ | ||
586 | __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) | ||
587 | #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) | ||
588 | |||
589 | /* | 564 | /* |
590 | * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. | 565 | * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. |
591 | * | 566 | * |
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 8bbc3716507a..e9825ff57b15 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/radix-tree.h> | 20 | #include <linux/radix-tree.h> |
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
23 | #include <linux/kthread.h> | ||
23 | 24 | ||
24 | /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ | 25 | /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ |
25 | #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) | 26 | #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) |
@@ -224,22 +225,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) | |||
224 | return css ? container_of(css, struct blkcg, css) : NULL; | 225 | return css ? container_of(css, struct blkcg, css) : NULL; |
225 | } | 226 | } |
226 | 227 | ||
227 | static inline struct blkcg *task_blkcg(struct task_struct *tsk) | ||
228 | { | ||
229 | return css_to_blkcg(task_css(tsk, io_cgrp_id)); | ||
230 | } | ||
231 | |||
232 | static inline struct blkcg *bio_blkcg(struct bio *bio) | 228 | static inline struct blkcg *bio_blkcg(struct bio *bio) |
233 | { | 229 | { |
230 | struct cgroup_subsys_state *css; | ||
231 | |||
234 | if (bio && bio->bi_css) | 232 | if (bio && bio->bi_css) |
235 | return css_to_blkcg(bio->bi_css); | 233 | return css_to_blkcg(bio->bi_css); |
236 | return task_blkcg(current); | 234 | css = kthread_blkcg(); |
237 | } | 235 | if (css) |
238 | 236 | return css_to_blkcg(css); | |
239 | static inline struct cgroup_subsys_state * | 237 | return css_to_blkcg(task_css(current, io_cgrp_id)); |
240 | task_get_blkcg_css(struct task_struct *task) | ||
241 | { | ||
242 | return task_get_css(task, io_cgrp_id); | ||
243 | } | 238 | } |
244 | 239 | ||
245 | /** | 240 | /** |
@@ -736,12 +731,6 @@ struct blkcg_policy { | |||
736 | 731 | ||
737 | #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) | 732 | #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) |
738 | 733 | ||
739 | static inline struct cgroup_subsys_state * | ||
740 | task_get_blkcg_css(struct task_struct *task) | ||
741 | { | ||
742 | return NULL; | ||
743 | } | ||
744 | |||
745 | #ifdef CONFIG_BLOCK | 734 | #ifdef CONFIG_BLOCK |
746 | 735 | ||
747 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } | 736 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 994cbb0f7ffc..95c9a5c862e2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -31,10 +31,12 @@ struct blk_mq_hw_ctx { | |||
31 | 31 | ||
32 | struct sbitmap ctx_map; | 32 | struct sbitmap ctx_map; |
33 | 33 | ||
34 | struct blk_mq_ctx *dispatch_from; | ||
35 | |||
34 | struct blk_mq_ctx **ctxs; | 36 | struct blk_mq_ctx **ctxs; |
35 | unsigned int nr_ctx; | 37 | unsigned int nr_ctx; |
36 | 38 | ||
37 | wait_queue_entry_t dispatch_wait; | 39 | wait_queue_entry_t dispatch_wait; |
38 | atomic_t wait_index; | 40 | atomic_t wait_index; |
39 | 41 | ||
40 | struct blk_mq_tags *tags; | 42 | struct blk_mq_tags *tags; |
@@ -91,6 +93,8 @@ struct blk_mq_queue_data { | |||
91 | 93 | ||
92 | typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, | 94 | typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, |
93 | const struct blk_mq_queue_data *); | 95 | const struct blk_mq_queue_data *); |
96 | typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); | ||
97 | typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); | ||
94 | typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); | 98 | typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); |
95 | typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); | 99 | typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); |
96 | typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); | 100 | typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); |
@@ -113,6 +117,15 @@ struct blk_mq_ops { | |||
113 | queue_rq_fn *queue_rq; | 117 | queue_rq_fn *queue_rq; |
114 | 118 | ||
115 | /* | 119 | /* |
120 | * Reserve budget before queue request, once .queue_rq is | ||
121 | * run, it is driver's responsibility to release the | ||
122 | * reserved budget. Also we have to handle failure case | ||
123 | * of .get_budget for avoiding I/O deadlock. | ||
124 | */ | ||
125 | get_budget_fn *get_budget; | ||
126 | put_budget_fn *put_budget; | ||
127 | |||
128 | /* | ||
116 | * Called on request timeout | 129 | * Called on request timeout |
117 | */ | 130 | */ |
118 | timeout_fn *timeout; | 131 | timeout_fn *timeout; |
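The comment added above defines the contract for the new get_budget/put_budget hooks: dispatch must obtain a budget before ->queue_rq() runs, the driver releases it once it owns the request, and the core must release it itself when a get succeeds but dispatch does not happen, to avoid I/O deadlock. The core presumably wraps the hooks in small helpers along these lines (illustrative names and bodies, paired with the scsi_mq_get_budget()/scsi_mq_put_budget() implementations earlier in this series):

/* Sketch of assumed core-side wrappers around the new hooks; drivers
 * that leave both hooks NULL are never throttled by budget. */
static bool example_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	if (q->mq_ops->get_budget)
		return q->mq_ops->get_budget(hctx);
	return true;
}

static void example_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	if (q->mq_ops->put_budget)
		q->mq_ops->put_budget(hctx);
}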
@@ -169,8 +182,7 @@ enum { | |||
169 | BLK_MQ_S_STOPPED = 0, | 182 | BLK_MQ_S_STOPPED = 0, |
170 | BLK_MQ_S_TAG_ACTIVE = 1, | 183 | BLK_MQ_S_TAG_ACTIVE = 1, |
171 | BLK_MQ_S_SCHED_RESTART = 2, | 184 | BLK_MQ_S_SCHED_RESTART = 2, |
172 | BLK_MQ_S_TAG_WAITING = 3, | 185 | BLK_MQ_S_START_ON_RUN = 3, |
173 | BLK_MQ_S_START_ON_RUN = 4, | ||
174 | 186 | ||
175 | BLK_MQ_MAX_DEPTH = 10240, | 187 | BLK_MQ_MAX_DEPTH = 10240, |
176 | 188 | ||
@@ -198,15 +210,21 @@ void blk_mq_free_request(struct request *rq); | |||
198 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *); | 210 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *); |
199 | 211 | ||
200 | enum { | 212 | enum { |
201 | BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ | 213 | /* return when out of requests */ |
202 | BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ | 214 | BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), |
203 | BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ | 215 | /* allocate from reserved pool */ |
216 | BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), | ||
217 | /* allocate internal/sched tag */ | ||
218 | BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), | ||
219 | /* set RQF_PREEMPT */ | ||
220 | BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), | ||
204 | }; | 221 | }; |
205 | 222 | ||
206 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | 223 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, |
207 | unsigned int flags); | 224 | blk_mq_req_flags_t flags); |
208 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | 225 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, |
209 | unsigned int op, unsigned int flags, unsigned int hctx_idx); | 226 | unsigned int op, blk_mq_req_flags_t flags, |
227 | unsigned int hctx_idx); | ||
210 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); | 228 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); |
211 | 229 | ||
212 | enum { | 230 | enum { |
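The allocation flags now carry their own __bitwise type so sparse can warn when they are mixed with gfp_t or plain integers, while the __force casts keep them usable as ordinary bit masks. Below is a minimal stand-alone reproduction of the pattern, with the annotations stubbed out as they would be in a non-sparse build; the stubs and the alloc_request() stand-in are assumptions for illustration, not the kernel's definitions.

/* Outside a sparse run these annotations expand to nothing; under sparse they
 * carry attributes that make mixing flag types a warning. */
#include <stdio.h>

#define __bitwise
#define __force

typedef unsigned int __bitwise blk_mq_req_flags_t;

#define BLK_MQ_REQ_NOWAIT	((__force blk_mq_req_flags_t)(1 << 0))
#define BLK_MQ_REQ_RESERVED	((__force blk_mq_req_flags_t)(1 << 1))
#define BLK_MQ_REQ_INTERNAL	((__force blk_mq_req_flags_t)(1 << 2))
#define BLK_MQ_REQ_PREEMPT	((__force blk_mq_req_flags_t)(1 << 3))

/* Stand-in for a function that now takes the typed flags. */
static void alloc_request(blk_mq_req_flags_t flags)
{
	if (flags & BLK_MQ_REQ_NOWAIT)
		printf("caller does not want to sleep for a tag\n");
	if (flags & BLK_MQ_REQ_PREEMPT)
		printf("request may enter a PREEMPT_ONLY queue\n");
}

int main(void)
{
	alloc_request(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT);
	return 0;
}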
@@ -249,7 +267,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); | |||
249 | void blk_mq_quiesce_queue(struct request_queue *q); | 267 | void blk_mq_quiesce_queue(struct request_queue *q); |
250 | void blk_mq_unquiesce_queue(struct request_queue *q); | 268 | void blk_mq_unquiesce_queue(struct request_queue *q); |
251 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); | 269 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); |
252 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); | 270 | bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); |
253 | void blk_mq_run_hw_queues(struct request_queue *q, bool async); | 271 | void blk_mq_run_hw_queues(struct request_queue *q, bool async); |
254 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); | 272 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); |
255 | void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, | 273 | void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, |
@@ -260,8 +278,8 @@ void blk_freeze_queue_start(struct request_queue *q); | |||
260 | void blk_mq_freeze_queue_wait(struct request_queue *q); | 278 | void blk_mq_freeze_queue_wait(struct request_queue *q); |
261 | int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, | 279 | int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, |
262 | unsigned long timeout); | 280 | unsigned long timeout); |
263 | int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, | 281 | int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, |
264 | int (reinit_request)(void *, struct request *)); | 282 | int (reinit_request)(void *, struct request *)); |
265 | 283 | ||
266 | int blk_mq_map_queues(struct blk_mq_tag_set *set); | 284 | int blk_mq_map_queues(struct blk_mq_tag_set *set); |
267 | void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); | 285 | void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); |
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 96ac3815542c..a1e628e032da 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -163,6 +163,8 @@ struct bio { | |||
163 | */ | 163 | */ |
164 | #define BIO_RESET_BITS BVEC_POOL_OFFSET | 164 | #define BIO_RESET_BITS BVEC_POOL_OFFSET |
165 | 165 | ||
166 | typedef __u32 __bitwise blk_mq_req_flags_t; | ||
167 | |||
166 | /* | 168 | /* |
167 | * Operations and flags common to the bio and request structures. | 169 | * Operations and flags common to the bio and request structures. |
168 | * We use 8 bits for encoding the operation, and the remaining 24 for flags. | 170 | * We use 8 bits for encoding the operation, and the remaining 24 for flags. |
@@ -225,11 +227,14 @@ enum req_flag_bits { | |||
225 | __REQ_PREFLUSH, /* request for cache flush */ | 227 | __REQ_PREFLUSH, /* request for cache flush */ |
226 | __REQ_RAHEAD, /* read ahead, can fail anytime */ | 228 | __REQ_RAHEAD, /* read ahead, can fail anytime */ |
227 | __REQ_BACKGROUND, /* background IO */ | 229 | __REQ_BACKGROUND, /* background IO */ |
230 | __REQ_NOWAIT, /* Don't wait if request will block */ | ||
228 | 231 | ||
229 | /* command specific flags for REQ_OP_WRITE_ZEROES: */ | 232 | /* command specific flags for REQ_OP_WRITE_ZEROES: */ |
230 | __REQ_NOUNMAP, /* do not free blocks when zeroing */ | 233 | __REQ_NOUNMAP, /* do not free blocks when zeroing */ |
231 | 234 | ||
232 | __REQ_NOWAIT, /* Don't wait if request will block */ | 235 | /* for driver use */ |
236 | __REQ_DRV, | ||
237 | |||
233 | __REQ_NR_BITS, /* stops here */ | 238 | __REQ_NR_BITS, /* stops here */ |
234 | }; | 239 | }; |
235 | 240 | ||
@@ -246,9 +251,11 @@ enum req_flag_bits { | |||
246 | #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) | 251 | #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) |
247 | #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) | 252 | #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) |
248 | #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) | 253 | #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) |
254 | #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) | ||
249 | 255 | ||
250 | #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) | 256 | #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) |
251 | #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) | 257 | |
258 | #define REQ_DRV (1ULL << __REQ_DRV) | ||
252 | 259 | ||
253 | #define REQ_FAILFAST_MASK \ | 260 | #define REQ_FAILFAST_MASK \ |
254 | (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) | 261 | (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) |
@@ -330,11 +337,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) | |||
330 | } | 337 | } |
331 | 338 | ||
332 | struct blk_rq_stat { | 339 | struct blk_rq_stat { |
333 | s64 mean; | 340 | u64 mean; |
334 | u64 min; | 341 | u64 min; |
335 | u64 max; | 342 | u64 max; |
336 | s32 nr_samples; | 343 | u32 nr_samples; |
337 | s32 nr_batch; | ||
338 | u64 batch; | 344 | u64 batch; |
339 | }; | 345 | }; |
340 | 346 | ||
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8da66379f7ea..8089ca17db9a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -267,6 +267,7 @@ struct blk_queue_ctx; | |||
267 | 267 | ||
268 | typedef void (request_fn_proc) (struct request_queue *q); | 268 | typedef void (request_fn_proc) (struct request_queue *q); |
269 | typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); | 269 | typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); |
270 | typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); | ||
270 | typedef int (prep_rq_fn) (struct request_queue *, struct request *); | 271 | typedef int (prep_rq_fn) (struct request_queue *, struct request *); |
271 | typedef void (unprep_rq_fn) (struct request_queue *, struct request *); | 272 | typedef void (unprep_rq_fn) (struct request_queue *, struct request *); |
272 | 273 | ||
@@ -409,6 +410,7 @@ struct request_queue { | |||
409 | 410 | ||
410 | request_fn_proc *request_fn; | 411 | request_fn_proc *request_fn; |
411 | make_request_fn *make_request_fn; | 412 | make_request_fn *make_request_fn; |
413 | poll_q_fn *poll_fn; | ||
412 | prep_rq_fn *prep_rq_fn; | 414 | prep_rq_fn *prep_rq_fn; |
413 | unprep_rq_fn *unprep_rq_fn; | 415 | unprep_rq_fn *unprep_rq_fn; |
414 | softirq_done_fn *softirq_done_fn; | 416 | softirq_done_fn *softirq_done_fn; |
@@ -610,7 +612,6 @@ struct request_queue { | |||
610 | #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ | 612 | #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ |
611 | #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ | 613 | #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ |
612 | #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ | 614 | #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ |
613 | #define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ | ||
614 | #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ | 615 | #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ |
615 | #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ | 616 | #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ |
616 | #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ | 617 | #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ |
@@ -632,14 +633,13 @@ struct request_queue { | |||
632 | #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ | 633 | #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ |
633 | #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ | 634 | #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ |
634 | #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ | 635 | #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ |
636 | #define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ | ||
635 | 637 | ||
636 | #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ | 638 | #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ |
637 | (1 << QUEUE_FLAG_STACKABLE) | \ | ||
638 | (1 << QUEUE_FLAG_SAME_COMP) | \ | 639 | (1 << QUEUE_FLAG_SAME_COMP) | \ |
639 | (1 << QUEUE_FLAG_ADD_RANDOM)) | 640 | (1 << QUEUE_FLAG_ADD_RANDOM)) |
640 | 641 | ||
641 | #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ | 642 | #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ |
642 | (1 << QUEUE_FLAG_STACKABLE) | \ | ||
643 | (1 << QUEUE_FLAG_SAME_COMP) | \ | 643 | (1 << QUEUE_FLAG_SAME_COMP) | \ |
644 | (1 << QUEUE_FLAG_POLL)) | 644 | (1 << QUEUE_FLAG_POLL)) |
645 | 645 | ||
@@ -723,8 +723,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) | |||
723 | #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) | 723 | #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) |
724 | #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) | 724 | #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) |
725 | #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) | 725 | #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) |
726 | #define blk_queue_stackable(q) \ | ||
727 | test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) | ||
728 | #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) | 726 | #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) |
729 | #define blk_queue_secure_erase(q) \ | 727 | #define blk_queue_secure_erase(q) \ |
730 | (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) | 728 | (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) |
@@ -736,6 +734,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) | |||
736 | ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ | 734 | ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ |
737 | REQ_FAILFAST_DRIVER)) | 735 | REQ_FAILFAST_DRIVER)) |
738 | #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) | 736 | #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) |
737 | #define blk_queue_preempt_only(q) \ | ||
738 | test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) | ||
739 | |||
740 | extern int blk_set_preempt_only(struct request_queue *q); | ||
741 | extern void blk_clear_preempt_only(struct request_queue *q); | ||
739 | 742 | ||
740 | static inline bool blk_account_rq(struct request *rq) | 743 | static inline bool blk_account_rq(struct request *rq) |
741 | { | 744 | { |
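QUEUE_FLAG_PREEMPT_ONLY supports the reliable SCSI quiesce work in this series: while the flag is set, only requests allocated with BLK_MQ_REQ_PREEMPT should be admitted into the queue. The exact admission logic lives in blk_queue_enter(); the userspace mock below sketches that gating policy as inferred from the flag names, and is not the kernel code.

/* Userspace sketch of a "preempt only" admission gate. */
#include <stdbool.h>
#include <stdio.h>

#define MOCK_REQ_PREEMPT	(1u << 3)	/* mirrors BLK_MQ_REQ_PREEMPT */

struct mock_queue { bool preempt_only; };

static bool mock_queue_enter(struct mock_queue *q, unsigned int flags)
{
	if (q->preempt_only && !(flags & MOCK_REQ_PREEMPT))
		return false;	/* normal I/O is held off while quiesced */
	return true;		/* power-management style requests still pass */
}

int main(void)
{
	struct mock_queue q = { .preempt_only = true };

	printf("normal rq admitted: %d\n", mock_queue_enter(&q, 0));
	printf("preempt rq admitted: %d\n",
	       mock_queue_enter(&q, MOCK_REQ_PREEMPT));
	return 0;
}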
@@ -923,24 +926,17 @@ static inline void rq_flush_dcache_pages(struct request *rq) | |||
923 | } | 926 | } |
924 | #endif | 927 | #endif |
925 | 928 | ||
926 | #ifdef CONFIG_PRINTK | ||
927 | #define vfs_msg(sb, level, fmt, ...) \ | ||
928 | __vfs_msg(sb, level, fmt, ##__VA_ARGS__) | ||
929 | #else | ||
930 | #define vfs_msg(sb, level, fmt, ...) \ | ||
931 | do { \ | ||
932 | no_printk(fmt, ##__VA_ARGS__); \ | ||
933 | __vfs_msg(sb, "", " "); \ | ||
934 | } while (0) | ||
935 | #endif | ||
936 | |||
937 | extern int blk_register_queue(struct gendisk *disk); | 929 | extern int blk_register_queue(struct gendisk *disk); |
938 | extern void blk_unregister_queue(struct gendisk *disk); | 930 | extern void blk_unregister_queue(struct gendisk *disk); |
939 | extern blk_qc_t generic_make_request(struct bio *bio); | 931 | extern blk_qc_t generic_make_request(struct bio *bio); |
932 | extern blk_qc_t direct_make_request(struct bio *bio); | ||
940 | extern void blk_rq_init(struct request_queue *q, struct request *rq); | 933 | extern void blk_rq_init(struct request_queue *q, struct request *rq); |
941 | extern void blk_init_request_from_bio(struct request *req, struct bio *bio); | 934 | extern void blk_init_request_from_bio(struct request *req, struct bio *bio); |
942 | extern void blk_put_request(struct request *); | 935 | extern void blk_put_request(struct request *); |
943 | extern void __blk_put_request(struct request_queue *, struct request *); | 936 | extern void __blk_put_request(struct request_queue *, struct request *); |
937 | extern struct request *blk_get_request_flags(struct request_queue *, | ||
938 | unsigned int op, | ||
939 | blk_mq_req_flags_t flags); | ||
944 | extern struct request *blk_get_request(struct request_queue *, unsigned int op, | 940 | extern struct request *blk_get_request(struct request_queue *, unsigned int op, |
945 | gfp_t gfp_mask); | 941 | gfp_t gfp_mask); |
946 | extern void blk_requeue_request(struct request_queue *, struct request *); | 942 | extern void blk_requeue_request(struct request_queue *, struct request *); |
@@ -964,7 +960,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, | |||
964 | extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, | 960 | extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, |
965 | struct scsi_ioctl_command __user *); | 961 | struct scsi_ioctl_command __user *); |
966 | 962 | ||
967 | extern int blk_queue_enter(struct request_queue *q, bool nowait); | 963 | extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); |
968 | extern void blk_queue_exit(struct request_queue *q); | 964 | extern void blk_queue_exit(struct request_queue *q); |
969 | extern void blk_start_queue(struct request_queue *q); | 965 | extern void blk_start_queue(struct request_queue *q); |
970 | extern void blk_start_queue_async(struct request_queue *q); | 966 | extern void blk_start_queue_async(struct request_queue *q); |
@@ -991,7 +987,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, | |||
991 | int blk_status_to_errno(blk_status_t status); | 987 | int blk_status_to_errno(blk_status_t status); |
992 | blk_status_t errno_to_blk_status(int errno); | 988 | blk_status_t errno_to_blk_status(int errno); |
993 | 989 | ||
994 | bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); | 990 | bool blk_poll(struct request_queue *q, blk_qc_t cookie); |
995 | 991 | ||
996 | static inline struct request_queue *bdev_get_queue(struct block_device *bdev) | 992 | static inline struct request_queue *bdev_get_queue(struct block_device *bdev) |
997 | { | 993 | { |
@@ -1110,6 +1106,8 @@ extern struct request *blk_peek_request(struct request_queue *q); | |||
1110 | extern void blk_start_request(struct request *rq); | 1106 | extern void blk_start_request(struct request *rq); |
1111 | extern struct request *blk_fetch_request(struct request_queue *q); | 1107 | extern struct request *blk_fetch_request(struct request_queue *q); |
1112 | 1108 | ||
1109 | void blk_steal_bios(struct bio_list *list, struct request *rq); | ||
1110 | |||
1113 | /* | 1111 | /* |
1114 | * Request completion related functions. | 1112 | * Request completion related functions. |
1115 | * | 1113 | * |
@@ -1372,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, | |||
1372 | gfp_mask, 0); | 1370 | gfp_mask, 0); |
1373 | } | 1371 | } |
1374 | 1372 | ||
1375 | extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); | 1373 | extern int blk_verify_command(unsigned char *cmd, fmode_t mode); |
1376 | 1374 | ||
1377 | enum blk_default_limits { | 1375 | enum blk_default_limits { |
1378 | BLK_MAX_SEGMENTS = 128, | 1376 | BLK_MAX_SEGMENTS = 128, |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index afa37f807f12..8b1bf8d3d4a2 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -157,7 +157,7 @@ void set_bh_page(struct buffer_head *bh, | |||
157 | struct page *page, unsigned long offset); | 157 | struct page *page, unsigned long offset); |
158 | int try_to_free_buffers(struct page *); | 158 | int try_to_free_buffers(struct page *); |
159 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 159 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
160 | int retry); | 160 | bool retry); |
161 | void create_empty_buffers(struct page *, unsigned long, | 161 | void create_empty_buffers(struct page *, unsigned long, |
162 | unsigned long b_state); | 162 | unsigned long b_state); |
163 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); | 163 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); |
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ddb7632d73b9..3d794b3dc532 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
@@ -145,6 +145,7 @@ struct elevator_type | |||
145 | size_t icq_align; /* ditto */ | 145 | size_t icq_align; /* ditto */ |
146 | struct elv_fs_entry *elevator_attrs; | 146 | struct elv_fs_entry *elevator_attrs; |
147 | char elevator_name[ELV_NAME_MAX]; | 147 | char elevator_name[ELV_NAME_MAX]; |
148 | const char *elevator_alias; | ||
148 | struct module *elevator_owner; | 149 | struct module *elevator_owner; |
149 | bool uses_mq; | 150 | bool uses_mq; |
150 | #ifdef CONFIG_BLK_DEBUG_FS | 151 | #ifdef CONFIG_BLK_DEBUG_FS |
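The new elevator_alias field is what lets the {mq,name} lookup mentioned in the merge message resolve "deadline" to mq-deadline on blk-mq queues. The stand-alone sketch below shows an alias-aware lookup; the table entries are illustrative rather than copied from the schedulers.

/* Userspace sketch of an alias-aware elevator name lookup. */
#include <stdio.h>
#include <string.h>

struct mock_elevator {
	const char *elevator_name;
	const char *elevator_alias;	/* may be NULL */
};

static const struct mock_elevator elevators[] = {
	{ .elevator_name = "mq-deadline", .elevator_alias = "deadline" },
	{ .elevator_name = "kyber" },
};

static const struct mock_elevator *find_elevator(const char *name)
{
	for (size_t i = 0; i < sizeof(elevators) / sizeof(elevators[0]); i++) {
		const struct mock_elevator *e = &elevators[i];

		if (!strcmp(e->elevator_name, name) ||
		    (e->elevator_alias && !strcmp(e->elevator_alias, name)))
			return e;
	}
	return NULL;
}

int main(void)
{
	const struct mock_elevator *e = find_elevator("deadline");

	printf("'deadline' resolves to %s\n", e ? e->elevator_name : "nothing");
	return 0;
}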
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index eaefb7a62f83..5144ebe046c9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
@@ -141,6 +141,7 @@ struct hd_struct { | |||
141 | #define GENHD_FL_NATIVE_CAPACITY 128 | 141 | #define GENHD_FL_NATIVE_CAPACITY 128 |
142 | #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 | 142 | #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 |
143 | #define GENHD_FL_NO_PART_SCAN 512 | 143 | #define GENHD_FL_NO_PART_SCAN 512 |
144 | #define GENHD_FL_HIDDEN 1024 | ||
144 | 145 | ||
145 | enum { | 146 | enum { |
146 | DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ | 147 | DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ |
@@ -236,7 +237,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) | |||
236 | 237 | ||
237 | static inline dev_t disk_devt(struct gendisk *disk) | 238 | static inline dev_t disk_devt(struct gendisk *disk) |
238 | { | 239 | { |
239 | return disk_to_dev(disk)->devt; | 240 | return MKDEV(disk->major, disk->first_minor); |
240 | } | 241 | } |
241 | 242 | ||
242 | static inline dev_t part_devt(struct hd_struct *part) | 243 | static inline dev_t part_devt(struct hd_struct *part) |
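disk_devt() now derives the dev_t directly from major/first_minor instead of reading it back from the embedded struct device, which matters for GENHD_FL_HIDDEN disks that never register a devt of their own. The packing can be checked in isolation; the 20-bit minor width below follows the kernel's <linux/kdev_t.h> convention, and the sample major/minor pair is made up.

/* Stand-alone check of MKDEV()/MAJOR()/MINOR() packing as used by disk_devt(). */
#include <assert.h>
#include <stdio.h>

#define MINORBITS	20
#define MINORMASK	((1u << MINORBITS) - 1)
#define MKDEV(ma, mi)	(((ma) << MINORBITS) | (mi))
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int)((dev) & MINORMASK))

int main(void)
{
	unsigned int devt = MKDEV(259, 3);	/* hypothetical blkext device */

	assert(MAJOR(devt) == 259 && MINOR(devt) == 3);
	printf("devt=%u major=%u minor=%u\n", devt, MAJOR(devt), MINOR(devt));
	return 0;
}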
@@ -244,6 +245,7 @@ static inline dev_t part_devt(struct hd_struct *part) | |||
244 | return part_to_dev(part)->devt; | 245 | return part_to_dev(part)->devt; |
245 | } | 246 | } |
246 | 247 | ||
248 | extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); | ||
247 | extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); | 249 | extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); |
248 | 250 | ||
249 | static inline void disk_put_part(struct hd_struct *part) | 251 | static inline void disk_put_part(struct hd_struct *part) |
diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 86d53a3cb497..3203e36b2ee8 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h | |||
@@ -4,6 +4,7 @@ | |||
4 | /* Simple interface for creating and stopping kernel threads without mess. */ | 4 | /* Simple interface for creating and stopping kernel threads without mess. */ |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/cgroup.h> | ||
7 | 8 | ||
8 | __printf(4, 5) | 9 | __printf(4, 5) |
9 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 10 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
@@ -199,4 +200,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); | |||
199 | 200 | ||
200 | void kthread_destroy_worker(struct kthread_worker *worker); | 201 | void kthread_destroy_worker(struct kthread_worker *worker); |
201 | 202 | ||
203 | #ifdef CONFIG_BLK_CGROUP | ||
204 | void kthread_associate_blkcg(struct cgroup_subsys_state *css); | ||
205 | struct cgroup_subsys_state *kthread_blkcg(void); | ||
206 | #else | ||
207 | static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } | ||
208 | static inline struct cgroup_subsys_state *kthread_blkcg(void) | ||
209 | { | ||
210 | return NULL; | ||
211 | } | ||
212 | #endif | ||
202 | #endif /* _LINUX_KTHREAD_H */ | 213 | #endif /* _LINUX_KTHREAD_H */ |
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index a29a8db5cc2f..2d1d9de06728 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h | |||
@@ -57,6 +57,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, | |||
57 | typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); | 57 | typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); |
58 | typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); | 58 | typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); |
59 | typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); | 59 | typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); |
60 | typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); | ||
60 | typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); | 61 | typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); |
61 | typedef void (nvm_destroy_dma_pool_fn)(void *); | 62 | typedef void (nvm_destroy_dma_pool_fn)(void *); |
62 | typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, | 63 | typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, |
@@ -70,6 +71,7 @@ struct nvm_dev_ops { | |||
70 | nvm_op_set_bb_fn *set_bb_tbl; | 71 | nvm_op_set_bb_fn *set_bb_tbl; |
71 | 72 | ||
72 | nvm_submit_io_fn *submit_io; | 73 | nvm_submit_io_fn *submit_io; |
74 | nvm_submit_io_sync_fn *submit_io_sync; | ||
73 | 75 | ||
74 | nvm_create_dma_pool_fn *create_dma_pool; | 76 | nvm_create_dma_pool_fn *create_dma_pool; |
75 | nvm_destroy_dma_pool_fn *destroy_dma_pool; | 77 | nvm_destroy_dma_pool_fn *destroy_dma_pool; |
@@ -461,10 +463,9 @@ struct nvm_tgt_type { | |||
461 | 463 | ||
462 | /* For internal use */ | 464 | /* For internal use */ |
463 | struct list_head list; | 465 | struct list_head list; |
466 | struct module *owner; | ||
464 | }; | 467 | }; |
465 | 468 | ||
466 | extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); | ||
467 | |||
468 | extern int nvm_register_tgt_type(struct nvm_tgt_type *); | 469 | extern int nvm_register_tgt_type(struct nvm_tgt_type *); |
469 | extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); | 470 | extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); |
470 | 471 | ||
@@ -479,10 +480,8 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, | |||
479 | int, int); | 480 | int, int); |
480 | extern int nvm_max_phys_sects(struct nvm_tgt_dev *); | 481 | extern int nvm_max_phys_sects(struct nvm_tgt_dev *); |
481 | extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); | 482 | extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); |
483 | extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); | ||
482 | extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); | 484 | extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); |
483 | extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, | ||
484 | const struct ppa_addr *, int, int); | ||
485 | extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); | ||
486 | extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, | 485 | extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, |
487 | void *); | 486 | void *); |
488 | extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); | 487 | extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); |
@@ -491,8 +490,6 @@ extern void nvm_end_io(struct nvm_rq *); | |||
491 | extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); | 490 | extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); |
492 | extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); | 491 | extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); |
493 | 492 | ||
494 | extern int nvm_dev_factory(struct nvm_dev *, int flags); | ||
495 | |||
496 | extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); | 493 | extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); |
497 | 494 | ||
498 | #else /* CONFIG_NVM */ | 495 | #else /* CONFIG_NVM */ |
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d5..496ff759f84c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h | |||
@@ -40,6 +40,8 @@ | |||
40 | * @node_name: FC WWNN for the port | 40 | * @node_name: FC WWNN for the port |
41 | * @port_name: FC WWPN for the port | 41 | * @port_name: FC WWPN for the port |
42 | * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) | 42 | * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) |
43 | * @dev_loss_tmo: maximum delay for reconnects to an association on | ||
44 | * this device. Used only on a remoteport. | ||
43 | * | 45 | * |
44 | * Initialization values for dynamic port fields: | 46 | * Initialization values for dynamic port fields: |
45 | * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must | 47 | * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must |
@@ -50,6 +52,7 @@ struct nvme_fc_port_info { | |||
50 | u64 port_name; | 52 | u64 port_name; |
51 | u32 port_role; | 53 | u32 port_role; |
52 | u32 port_id; | 54 | u32 port_id; |
55 | u32 dev_loss_tmo; | ||
53 | }; | 56 | }; |
54 | 57 | ||
55 | 58 | ||
@@ -102,8 +105,6 @@ enum nvmefc_fcp_datadir { | |||
102 | }; | 105 | }; |
103 | 106 | ||
104 | 107 | ||
105 | #define NVME_FC_MAX_SEGMENTS 256 | ||
106 | |||
107 | /** | 108 | /** |
108 | * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport | 109 | * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport |
109 | * to LLDD in order to perform a NVME FCP IO operation. | 110 | * to LLDD in order to perform a NVME FCP IO operation. |
@@ -202,6 +203,9 @@ enum nvme_fc_obj_state { | |||
202 | * The length of the buffer corresponds to the local_priv_sz | 203 | * The length of the buffer corresponds to the local_priv_sz |
203 | * value specified in the nvme_fc_port_template supplied by | 204 | * value specified in the nvme_fc_port_template supplied by |
204 | * the LLDD. | 205 | * the LLDD. |
206 | * @dev_loss_tmo: maximum delay for reconnects to an association on | ||
207 | * this device. To modify, lldd must call | ||
208 | * nvme_fc_set_remoteport_devloss(). | ||
205 | * | 209 | * |
206 | * Fields with dynamic values. Values may change base on link state. LLDD | 210 | * Fields with dynamic values. Values may change base on link state. LLDD |
207 | * may reference fields directly to change them. Initialized by the | 211 | * may reference fields directly to change them. Initialized by the |
@@ -259,10 +263,9 @@ struct nvme_fc_remote_port { | |||
259 | u32 port_role; | 263 | u32 port_role; |
260 | u64 node_name; | 264 | u64 node_name; |
261 | u64 port_name; | 265 | u64 port_name; |
262 | |||
263 | struct nvme_fc_local_port *localport; | 266 | struct nvme_fc_local_port *localport; |
264 | |||
265 | void *private; | 267 | void *private; |
268 | u32 dev_loss_tmo; | ||
266 | 269 | ||
267 | /* dynamic fields */ | 270 | /* dynamic fields */ |
268 | u32 port_id; | 271 | u32 port_id; |
@@ -446,6 +449,10 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, | |||
446 | 449 | ||
447 | int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); | 450 | int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); |
448 | 451 | ||
452 | void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); | ||
453 | |||
454 | int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, | ||
455 | u32 dev_loss_tmo); | ||
449 | 456 | ||
450 | 457 | ||
451 | /* | 458 | /* |
diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1..aea87f0d917b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h | |||
@@ -90,6 +90,14 @@ enum { | |||
90 | }; | 90 | }; |
91 | 91 | ||
92 | #define NVME_AQ_DEPTH 32 | 92 | #define NVME_AQ_DEPTH 32 |
93 | #define NVME_NR_AEN_COMMANDS 1 | ||
94 | #define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) | ||
95 | |||
96 | /* | ||
97 | * Subtract one to leave an empty queue entry for 'Full Queue' condition. See | ||
98 | * NVM-Express 1.2 specification, section 4.1.2. | ||
99 | */ | ||
100 | #define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) | ||
93 | 101 | ||
94 | enum { | 102 | enum { |
95 | NVME_REG_CAP = 0x0000, /* Controller Capabilities */ | 103 | NVME_REG_CAP = 0x0000, /* Controller Capabilities */ |
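The admin queue keeps one slot reserved for the single outstanding AEN command, and one further entry stays empty to satisfy the spec's full-queue rule, leaving 31 entries visible to blk-mq and 30 usable tags. A quick stand-alone check of that arithmetic:

/* Check of the admin-queue depth arithmetic introduced above. */
#include <assert.h>
#include <stdio.h>

#define NVME_AQ_DEPTH		32
#define NVME_NR_AEN_COMMANDS	1
#define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
/* One entry stays empty so the controller can tell a full queue from an
 * empty one (NVMe 1.2, section 4.1.2). */
#define NVME_AQ_MQ_TAG_DEPTH	(NVME_AQ_BLK_MQ_DEPTH - 1)

int main(void)
{
	assert(NVME_AQ_BLK_MQ_DEPTH == 31);
	assert(NVME_AQ_MQ_TAG_DEPTH == 30);
	printf("blk-mq depth %d, usable tags %d\n",
	       NVME_AQ_BLK_MQ_DEPTH, NVME_AQ_MQ_TAG_DEPTH);
	return 0;
}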
@@ -267,6 +275,7 @@ enum { | |||
267 | NVME_CTRL_OACS_SEC_SUPP = 1 << 0, | 275 | NVME_CTRL_OACS_SEC_SUPP = 1 << 0, |
268 | NVME_CTRL_OACS_DIRECTIVES = 1 << 5, | 276 | NVME_CTRL_OACS_DIRECTIVES = 1 << 5, |
269 | NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, | 277 | NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, |
278 | NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, | ||
270 | }; | 279 | }; |
271 | 280 | ||
272 | struct nvme_lbaf { | 281 | struct nvme_lbaf { |
@@ -396,6 +405,21 @@ struct nvme_fw_slot_info_log { | |||
396 | }; | 405 | }; |
397 | 406 | ||
398 | enum { | 407 | enum { |
408 | NVME_CMD_EFFECTS_CSUPP = 1 << 0, | ||
409 | NVME_CMD_EFFECTS_LBCC = 1 << 1, | ||
410 | NVME_CMD_EFFECTS_NCC = 1 << 2, | ||
411 | NVME_CMD_EFFECTS_NIC = 1 << 3, | ||
412 | NVME_CMD_EFFECTS_CCC = 1 << 4, | ||
413 | NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, | ||
414 | }; | ||
415 | |||
416 | struct nvme_effects_log { | ||
417 | __le32 acs[256]; | ||
418 | __le32 iocs[256]; | ||
419 | __u8 resv[2048]; | ||
420 | }; | ||
421 | |||
422 | enum { | ||
399 | NVME_SMART_CRIT_SPARE = 1 << 0, | 423 | NVME_SMART_CRIT_SPARE = 1 << 0, |
400 | NVME_SMART_CRIT_TEMPERATURE = 1 << 1, | 424 | NVME_SMART_CRIT_TEMPERATURE = 1 << 1, |
401 | NVME_SMART_CRIT_RELIABILITY = 1 << 2, | 425 | NVME_SMART_CRIT_RELIABILITY = 1 << 2, |
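Each 32-bit entry of the new commands-supported-and-effects log describes one opcode. The small decoder below only exercises the bits defined above; the human-readable labels follow the NVMe specification's field names, and the sample entry is hypothetical.

/* Decode one Commands Supported and Effects log entry using the bits above. */
#include <stdint.h>
#include <stdio.h>

#define NVME_CMD_EFFECTS_CSUPP		(1u << 0)
#define NVME_CMD_EFFECTS_LBCC		(1u << 1)
#define NVME_CMD_EFFECTS_NCC		(1u << 2)
#define NVME_CMD_EFFECTS_NIC		(1u << 3)
#define NVME_CMD_EFFECTS_CCC		(1u << 4)
#define NVME_CMD_EFFECTS_CSE_MASK	(3u << 16)

static void describe_effects(uint32_t effects)
{
	printf("command supported:      %s\n", effects & NVME_CMD_EFFECTS_CSUPP ? "yes" : "no");
	printf("changes LBA content:    %s\n", effects & NVME_CMD_EFFECTS_LBCC ? "yes" : "no");
	printf("ns capability change:   %s\n", effects & NVME_CMD_EFFECTS_NCC ? "yes" : "no");
	printf("ns inventory change:    %s\n", effects & NVME_CMD_EFFECTS_NIC ? "yes" : "no");
	printf("ctrl capability change: %s\n", effects & NVME_CMD_EFFECTS_CCC ? "yes" : "no");
	printf("submission/execution:   0x%x\n",
	       (effects & NVME_CMD_EFFECTS_CSE_MASK) >> 16);
}

int main(void)
{
	/* Hypothetical entry: supported, changes the namespace inventory. */
	describe_effects(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_NIC);
	return 0;
}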
@@ -404,6 +428,10 @@ enum { | |||
404 | }; | 428 | }; |
405 | 429 | ||
406 | enum { | 430 | enum { |
431 | NVME_AER_ERROR = 0, | ||
432 | NVME_AER_SMART = 1, | ||
433 | NVME_AER_CSS = 6, | ||
434 | NVME_AER_VS = 7, | ||
407 | NVME_AER_NOTICE_NS_CHANGED = 0x0002, | 435 | NVME_AER_NOTICE_NS_CHANGED = 0x0002, |
408 | NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, | 436 | NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, |
409 | }; | 437 | }; |
@@ -681,6 +709,7 @@ enum nvme_admin_opcode { | |||
681 | nvme_admin_format_nvm = 0x80, | 709 | nvme_admin_format_nvm = 0x80, |
682 | nvme_admin_security_send = 0x81, | 710 | nvme_admin_security_send = 0x81, |
683 | nvme_admin_security_recv = 0x82, | 711 | nvme_admin_security_recv = 0x82, |
712 | nvme_admin_sanitize_nvm = 0x84, | ||
684 | }; | 713 | }; |
685 | 714 | ||
686 | enum { | 715 | enum { |
@@ -712,6 +741,7 @@ enum { | |||
712 | NVME_LOG_ERROR = 0x01, | 741 | NVME_LOG_ERROR = 0x01, |
713 | NVME_LOG_SMART = 0x02, | 742 | NVME_LOG_SMART = 0x02, |
714 | NVME_LOG_FW_SLOT = 0x03, | 743 | NVME_LOG_FW_SLOT = 0x03, |
744 | NVME_LOG_CMD_EFFECTS = 0x05, | ||
715 | NVME_LOG_DISC = 0x70, | 745 | NVME_LOG_DISC = 0x70, |
716 | NVME_LOG_RESERVATION = 0x80, | 746 | NVME_LOG_RESERVATION = 0x80, |
717 | NVME_FWACT_REPL = (0 << 3), | 747 | NVME_FWACT_REPL = (0 << 3), |
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc45..0dcc60e820de 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h | |||
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); | |||
211 | */ | 211 | */ |
212 | bool sbitmap_any_bit_clear(const struct sbitmap *sb); | 212 | bool sbitmap_any_bit_clear(const struct sbitmap *sb); |
213 | 213 | ||
214 | #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) | ||
215 | #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) | ||
216 | |||
214 | typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); | 217 | typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); |
215 | 218 | ||
216 | /** | 219 | /** |
217 | * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. | 220 | * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. |
221 | * @start: Where to start the iteration. | ||
218 | * @sb: Bitmap to iterate over. | 222 | * @sb: Bitmap to iterate over. |
219 | * @fn: Callback. Should return true to continue or false to break early. | 223 | * @fn: Callback. Should return true to continue or false to break early. |
220 | * @data: Pointer to pass to callback. | 224 | * @data: Pointer to pass to callback. |
@@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); | |||
222 | * This is inline even though it's non-trivial so that the function calls to the | 226 | * This is inline even though it's non-trivial so that the function calls to the |
223 | * callback will hopefully get optimized away. | 227 | * callback will hopefully get optimized away. |
224 | */ | 228 | */ |
225 | static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, | 229 | static inline void __sbitmap_for_each_set(struct sbitmap *sb, |
226 | void *data) | 230 | unsigned int start, |
231 | sb_for_each_fn fn, void *data) | ||
227 | { | 232 | { |
228 | unsigned int i; | 233 | unsigned int index; |
234 | unsigned int nr; | ||
235 | unsigned int scanned = 0; | ||
229 | 236 | ||
230 | for (i = 0; i < sb->map_nr; i++) { | 237 | if (start >= sb->depth) |
231 | struct sbitmap_word *word = &sb->map[i]; | 238 | start = 0; |
232 | unsigned int off, nr; | 239 | index = SB_NR_TO_INDEX(sb, start); |
240 | nr = SB_NR_TO_BIT(sb, start); | ||
233 | 241 | ||
234 | if (!word->word) | 242 | while (scanned < sb->depth) { |
235 | continue; | 243 | struct sbitmap_word *word = &sb->map[index]; |
244 | unsigned int depth = min_t(unsigned int, word->depth - nr, | ||
245 | sb->depth - scanned); | ||
236 | 246 | ||
237 | nr = 0; | 247 | scanned += depth; |
238 | off = i << sb->shift; | 248 | if (!word->word) |
249 | goto next; | ||
250 | |||
251 | /* | ||
252 | * On the first iteration of the outer loop, we need to add the | ||
253 | * bit offset back to the size of the word for find_next_bit(). | ||
254 | * On all other iterations, nr is zero, so this is a noop. | ||
255 | */ | ||
256 | depth += nr; | ||
239 | while (1) { | 257 | while (1) { |
240 | nr = find_next_bit(&word->word, word->depth, nr); | 258 | nr = find_next_bit(&word->word, depth, nr); |
241 | if (nr >= word->depth) | 259 | if (nr >= depth) |
242 | break; | 260 | break; |
243 | 261 | if (!fn(sb, (index << sb->shift) + nr, data)) | |
244 | if (!fn(sb, off + nr, data)) | ||
245 | return; | 262 | return; |
246 | 263 | ||
247 | nr++; | 264 | nr++; |
248 | } | 265 | } |
266 | next: | ||
267 | nr = 0; | ||
268 | if (++index >= sb->map_nr) | ||
269 | index = 0; | ||
249 | } | 270 | } |
250 | } | 271 | } |
251 | 272 | ||
252 | #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) | 273 | /** |
253 | #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) | 274 | * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. |
275 | * @sb: Bitmap to iterate over. | ||
276 | * @fn: Callback. Should return true to continue or false to break early. | ||
277 | * @data: Pointer to pass to callback. | ||
278 | */ | ||
279 | static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, | ||
280 | void *data) | ||
281 | { | ||
282 | __sbitmap_for_each_set(sb, 0, fn, data); | ||
283 | } | ||
254 | 284 | ||
255 | static inline unsigned long *__sbitmap_word(struct sbitmap *sb, | 285 | static inline unsigned long *__sbitmap_word(struct sbitmap *sb, |
256 | unsigned int bitnr) | 286 | unsigned int bitnr) |
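The iterator now takes a start bit and wraps around, which is what the new hctx->dispatch_from round-robin pointer feeds it. The same wrap-around scan can be demonstrated on a flat bitmap; the helper below mirrors the SB_NR_TO_INDEX()/SB_NR_TO_BIT() split but scans bit by bit for clarity rather than using find_next_bit() per word as the kernel version does.

/* Wrap-around scan over a word-based bitmap, mirroring the new
 * __sbitmap_for_each_set(): start anywhere, visit every set bit once. */
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_WORD	64
#define NR_WORDS	2
#define DEPTH		(NR_WORDS * BITS_PER_WORD)

static bool visit(unsigned int bit, void *data)
{
	(void)data;
	printf("set bit %u\n", bit);
	return true;		/* returning false would stop the walk early */
}

static void for_each_set_from(unsigned long long *map, unsigned int start,
			      bool (*fn)(unsigned int, void *), void *data)
{
	unsigned int nr = start % DEPTH;

	for (unsigned int scanned = 0; scanned < DEPTH; scanned++) {
		unsigned int index = nr / BITS_PER_WORD;	/* SB_NR_TO_INDEX */
		unsigned int off = nr % BITS_PER_WORD;		/* SB_NR_TO_BIT */

		if (((map[index] >> off) & 1) && !fn(nr, data))
			return;
		nr = (nr + 1) % DEPTH;				/* wrap at depth */
	}
}

int main(void)
{
	unsigned long long map[NR_WORDS] = { 0 };

	map[0] |= 1ULL << 3;
	map[1] |= 1ULL << 5;	/* bit 69 overall */
	for_each_set_from(map, 64, visit, NULL);	/* starts in word 1, wraps */
	return 0;
}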
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e12d92808e98..f42d85631d17 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -43,28 +43,6 @@ enum writeback_sync_modes { | |||
43 | }; | 43 | }; |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * why some writeback work was initiated | ||
47 | */ | ||
48 | enum wb_reason { | ||
49 | WB_REASON_BACKGROUND, | ||
50 | WB_REASON_VMSCAN, | ||
51 | WB_REASON_SYNC, | ||
52 | WB_REASON_PERIODIC, | ||
53 | WB_REASON_LAPTOP_TIMER, | ||
54 | WB_REASON_FREE_MORE_MEM, | ||
55 | WB_REASON_FS_FREE_SPACE, | ||
56 | /* | ||
57 | * There is no bdi forker thread any more and works are done | ||
58 | * by emergency worker, however, this is TPs userland visible | ||
59 | * and we'll be exposing exactly the same information, | ||
60 | * so it has a mismatch name. | ||
61 | */ | ||
62 | WB_REASON_FORKER_THREAD, | ||
63 | |||
64 | WB_REASON_MAX, | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * A control structure which tells the writeback code what to do. These are | 46 | * A control structure which tells the writeback code what to do. These are |
69 | * always on the stack, and hence need no locking. They are always initialised | 47 | * always on the stack, and hence need no locking. They are always initialised |
70 | * in a manner such that unspecified fields are set to zero. | 48 | * in a manner such that unspecified fields are set to zero. |
@@ -186,11 +164,11 @@ struct bdi_writeback; | |||
186 | void writeback_inodes_sb(struct super_block *, enum wb_reason reason); | 164 | void writeback_inodes_sb(struct super_block *, enum wb_reason reason); |
187 | void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, | 165 | void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, |
188 | enum wb_reason reason); | 166 | enum wb_reason reason); |
189 | bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); | 167 | void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); |
190 | bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, | ||
191 | enum wb_reason reason); | ||
192 | void sync_inodes_sb(struct super_block *); | 168 | void sync_inodes_sb(struct super_block *); |
193 | void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); | 169 | void wakeup_flusher_threads(enum wb_reason reason); |
170 | void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, | ||
171 | enum wb_reason reason); | ||
194 | void inode_wait_for_writeback(struct inode *inode); | 172 | void inode_wait_for_writeback(struct inode *inode); |
195 | 173 | ||
196 | /* writeback.h requires fs.h; it, too, is not included from here. */ | 174 | /* writeback.h requires fs.h; it, too, is not included from here. */ |
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 571ddb49b926..73af87dfbff8 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h | |||
@@ -221,6 +221,7 @@ struct scsi_device { | |||
221 | unsigned char access_state; | 221 | unsigned char access_state; |
222 | struct mutex state_mutex; | 222 | struct mutex state_mutex; |
223 | enum scsi_device_state sdev_state; | 223 | enum scsi_device_state sdev_state; |
224 | struct task_struct *quiesced_by; | ||
224 | unsigned long sdev_data[0]; | 225 | unsigned long sdev_data[0]; |
225 | } __attribute__((aligned(sizeof(unsigned long)))); | 226 | } __attribute__((aligned(sizeof(unsigned long)))); |
226 | 227 | ||
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 2e1fa7910306..32db72c7c055 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -287,7 +287,6 @@ DEFINE_EVENT(writeback_class, name, \ | |||
287 | TP_PROTO(struct bdi_writeback *wb), \ | 287 | TP_PROTO(struct bdi_writeback *wb), \ |
288 | TP_ARGS(wb)) | 288 | TP_ARGS(wb)) |
289 | 289 | ||
290 | DEFINE_WRITEBACK_EVENT(writeback_nowork); | ||
291 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); | 290 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); |
292 | 291 | ||
293 | TRACE_EVENT(writeback_bdi_register, | 292 | TRACE_EVENT(writeback_bdi_register, |
diff --git a/kernel/kthread.c b/kernel/kthread.c index ba3992c8c375..8af313081b0d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/freezer.h> | 20 | #include <linux/freezer.h> |
21 | #include <linux/ptrace.h> | 21 | #include <linux/ptrace.h> |
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/cgroup.h> | ||
24 | #include <trace/events/sched.h> | 23 | #include <trace/events/sched.h> |
25 | 24 | ||
26 | static DEFINE_SPINLOCK(kthread_create_lock); | 25 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -47,6 +46,9 @@ struct kthread { | |||
47 | void *data; | 46 | void *data; |
48 | struct completion parked; | 47 | struct completion parked; |
49 | struct completion exited; | 48 | struct completion exited; |
49 | #ifdef CONFIG_BLK_CGROUP | ||
50 | struct cgroup_subsys_state *blkcg_css; | ||
51 | #endif | ||
50 | }; | 52 | }; |
51 | 53 | ||
52 | enum KTHREAD_BITS { | 54 | enum KTHREAD_BITS { |
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) | |||
74 | 76 | ||
75 | void free_kthread_struct(struct task_struct *k) | 77 | void free_kthread_struct(struct task_struct *k) |
76 | { | 78 | { |
79 | struct kthread *kthread; | ||
80 | |||
77 | /* | 81 | /* |
78 | * Can be NULL if this kthread was created by kernel_thread() | 82 | * Can be NULL if this kthread was created by kernel_thread() |
79 | * or if kmalloc() in kthread() failed. | 83 | * or if kmalloc() in kthread() failed. |
80 | */ | 84 | */ |
81 | kfree(to_kthread(k)); | 85 | kthread = to_kthread(k); |
86 | #ifdef CONFIG_BLK_CGROUP | ||
87 | WARN_ON_ONCE(kthread && kthread->blkcg_css); | ||
88 | #endif | ||
89 | kfree(kthread); | ||
82 | } | 90 | } |
83 | 91 | ||
84 | /** | 92 | /** |
@@ -196,7 +204,7 @@ static int kthread(void *_create) | |||
196 | struct kthread *self; | 204 | struct kthread *self; |
197 | int ret; | 205 | int ret; |
198 | 206 | ||
199 | self = kmalloc(sizeof(*self), GFP_KERNEL); | 207 | self = kzalloc(sizeof(*self), GFP_KERNEL); |
200 | set_kthread_struct(self); | 208 | set_kthread_struct(self); |
201 | 209 | ||
202 | /* If user was SIGKILLed, I release the structure. */ | 210 | /* If user was SIGKILLed, I release the structure. */ |
@@ -212,7 +220,6 @@ static int kthread(void *_create) | |||
212 | do_exit(-ENOMEM); | 220 | do_exit(-ENOMEM); |
213 | } | 221 | } |
214 | 222 | ||
215 | self->flags = 0; | ||
216 | self->data = data; | 223 | self->data = data; |
217 | init_completion(&self->exited); | 224 | init_completion(&self->exited); |
218 | init_completion(&self->parked); | 225 | init_completion(&self->parked); |
@@ -1152,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) | |||
1152 | kfree(worker); | 1159 | kfree(worker); |
1153 | } | 1160 | } |
1154 | EXPORT_SYMBOL(kthread_destroy_worker); | 1161 | EXPORT_SYMBOL(kthread_destroy_worker); |
1162 | |||
1163 | #ifdef CONFIG_BLK_CGROUP | ||
1164 | /** | ||
1165 | * kthread_associate_blkcg - associate blkcg to current kthread | ||
1166 | * @css: the cgroup info | ||
1167 | * | ||
1168 | * Current thread must be a kthread. The thread is running jobs on behalf of | ||
1169 | * other threads. In some cases, we expect the jobs to attach the cgroup info | ||
1170 | * of the original threads instead of that of the current thread. This function | ||
1171 | * stores the original thread's cgroup info in the current kthread context for | ||
1172 | * later retrieval. | ||
1173 | */ | ||
1174 | void kthread_associate_blkcg(struct cgroup_subsys_state *css) | ||
1175 | { | ||
1176 | struct kthread *kthread; | ||
1177 | |||
1178 | if (!(current->flags & PF_KTHREAD)) | ||
1179 | return; | ||
1180 | kthread = to_kthread(current); | ||
1181 | if (!kthread) | ||
1182 | return; | ||
1183 | |||
1184 | if (kthread->blkcg_css) { | ||
1185 | css_put(kthread->blkcg_css); | ||
1186 | kthread->blkcg_css = NULL; | ||
1187 | } | ||
1188 | if (css) { | ||
1189 | css_get(css); | ||
1190 | kthread->blkcg_css = css; | ||
1191 | } | ||
1192 | } | ||
1193 | EXPORT_SYMBOL(kthread_associate_blkcg); | ||
1194 | |||
1195 | /** | ||
1196 | * kthread_blkcg - get associated blkcg css of current kthread | ||
1197 | * | ||
1198 | * Current thread must be a kthread. | ||
1199 | */ | ||
1200 | struct cgroup_subsys_state *kthread_blkcg(void) | ||
1201 | { | ||
1202 | struct kthread *kthread; | ||
1203 | |||
1204 | if (current->flags & PF_KTHREAD) { | ||
1205 | kthread = to_kthread(current); | ||
1206 | if (kthread) | ||
1207 | return kthread->blkcg_css; | ||
1208 | } | ||
1209 | return NULL; | ||
1210 | } | ||
1211 | EXPORT_SYMBOL(kthread_blkcg); | ||
1212 | #endif | ||
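kthread_associate_blkcg() lets a kthread doing work on behalf of another task (the loop driver in this series) charge its I/O to the submitter's blkcg. The userspace mock below imitates the associate/lookup/clear pattern and the reference handling; the names and the simple refcount are illustrative only.

/* Userspace mock of the kthread_associate_blkcg()/kthread_blkcg() pattern:
 * a worker stashes the submitter's cgroup reference and drops it when done. */
#include <stdio.h>

struct mock_css { const char *name; int refcnt; };

static struct mock_css *worker_css;	/* per-kthread in the kernel */

static void css_get(struct mock_css *css) { css->refcnt++; }
static void css_put(struct mock_css *css) { css->refcnt--; }

static void mock_associate_blkcg(struct mock_css *css)
{
	if (worker_css) {
		css_put(worker_css);	/* drop the previous association */
		worker_css = NULL;
	}
	if (css) {
		css_get(css);		/* hold it while work is charged to it */
		worker_css = css;
	}
}

static struct mock_css *mock_kthread_blkcg(void)
{
	return worker_css;		/* read back when submitting I/O */
}

int main(void)
{
	struct mock_css submitter = { .name = "blkcg:writer", .refcnt = 1 };

	mock_associate_blkcg(&submitter);
	printf("I/O charged to %s\n", mock_kthread_blkcg()->name);
	mock_associate_blkcg(NULL);	/* work done, drop the reference */
	printf("refcnt back to %d\n", submitter.refcnt);
	return 0;
}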
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d9c31bc2eaea..9576bd582d4a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1342,11 +1342,6 @@ static struct ctl_table vm_table[] = { | |||
1342 | .extra1 = &zero, | 1342 | .extra1 = &zero, |
1343 | }, | 1343 | }, |
1344 | { | 1344 | { |
1345 | .procname = "nr_pdflush_threads", | ||
1346 | .mode = 0444 /* read-only */, | ||
1347 | .proc_handler = pdflush_proc_obsolete, | ||
1348 | }, | ||
1349 | { | ||
1350 | .procname = "swappiness", | 1345 | .procname = "swappiness", |
1351 | .data = &vm_swappiness, | 1346 | .data = &vm_swappiness, |
1352 | .maxlen = sizeof(vm_swappiness), | 1347 | .maxlen = sizeof(vm_swappiness), |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce..206e0e2ace53 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { | |||
66 | }; | 66 | }; |
67 | 67 | ||
68 | /* Global reference count of probes */ | 68 | /* Global reference count of probes */ |
69 | static atomic_t blk_probes_ref = ATOMIC_INIT(0); | 69 | static DEFINE_MUTEX(blk_probe_mutex); |
70 | static int blk_probes_ref; | ||
70 | 71 | ||
71 | static void blk_register_tracepoints(void); | 72 | static void blk_register_tracepoints(void); |
72 | static void blk_unregister_tracepoints(void); | 73 | static void blk_unregister_tracepoints(void); |
@@ -329,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt) | |||
329 | kfree(bt); | 330 | kfree(bt); |
330 | } | 331 | } |
331 | 332 | ||
333 | static void get_probe_ref(void) | ||
334 | { | ||
335 | mutex_lock(&blk_probe_mutex); | ||
336 | if (++blk_probes_ref == 1) | ||
337 | blk_register_tracepoints(); | ||
338 | mutex_unlock(&blk_probe_mutex); | ||
339 | } | ||
340 | |||
341 | static void put_probe_ref(void) | ||
342 | { | ||
343 | mutex_lock(&blk_probe_mutex); | ||
344 | if (!--blk_probes_ref) | ||
345 | blk_unregister_tracepoints(); | ||
346 | mutex_unlock(&blk_probe_mutex); | ||
347 | } | ||
348 | |||
332 | static void blk_trace_cleanup(struct blk_trace *bt) | 349 | static void blk_trace_cleanup(struct blk_trace *bt) |
333 | { | 350 | { |
334 | blk_trace_free(bt); | 351 | blk_trace_free(bt); |
335 | if (atomic_dec_and_test(&blk_probes_ref)) | 352 | put_probe_ref(); |
336 | blk_unregister_tracepoints(); | ||
337 | } | 353 | } |
338 | 354 | ||
339 | int blk_trace_remove(struct request_queue *q) | 355 | static int __blk_trace_remove(struct request_queue *q) |
340 | { | 356 | { |
341 | struct blk_trace *bt; | 357 | struct blk_trace *bt; |
342 | 358 | ||
@@ -349,6 +365,17 @@ int blk_trace_remove(struct request_queue *q) | |||
349 | 365 | ||
350 | return 0; | 366 | return 0; |
351 | } | 367 | } |
368 | |||
369 | int blk_trace_remove(struct request_queue *q) | ||
370 | { | ||
371 | int ret; | ||
372 | |||
373 | mutex_lock(&q->blk_trace_mutex); | ||
374 | ret = __blk_trace_remove(q); | ||
375 | mutex_unlock(&q->blk_trace_mutex); | ||
376 | |||
377 | return ret; | ||
378 | } | ||
352 | EXPORT_SYMBOL_GPL(blk_trace_remove); | 379 | EXPORT_SYMBOL_GPL(blk_trace_remove); |
353 | 380 | ||
354 | static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, | 381 | static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, |
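Moving the probe count from an atomic to a plain integer under blk_probe_mutex means the first-get registration and last-put unregistration of tracepoints can no longer race with a concurrent setup or teardown. The pattern is easy to reproduce on its own; pthreads stand in for the kernel mutex here (compile with -pthread).

/* First get registers, last put unregisters, all under one lock, mirroring
 * get_probe_ref()/put_probe_ref() above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t probe_mutex = PTHREAD_MUTEX_INITIALIZER;
static int probes_ref;

static void register_tracepoints(void)   { puts("register tracepoints"); }
static void unregister_tracepoints(void) { puts("unregister tracepoints"); }

static void get_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (++probes_ref == 1)
		register_tracepoints();		/* only the first user registers */
	pthread_mutex_unlock(&probe_mutex);
}

static void put_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (!--probes_ref)
		unregister_tracepoints();	/* last user tears down */
	pthread_mutex_unlock(&probe_mutex);
}

int main(void)
{
	get_probe_ref();	/* registers */
	get_probe_ref();	/* no-op, just counts */
	put_probe_ref();	/* still one user left */
	put_probe_ref();	/* unregisters */
	return 0;
}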
@@ -538,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
538 | if (cmpxchg(&q->blk_trace, NULL, bt)) | 565 | if (cmpxchg(&q->blk_trace, NULL, bt)) |
539 | goto err; | 566 | goto err; |
540 | 567 | ||
541 | if (atomic_inc_return(&blk_probes_ref) == 1) | 568 | get_probe_ref(); |
542 | blk_register_tracepoints(); | ||
543 | 569 | ||
544 | ret = 0; | 570 | ret = 0; |
545 | err: | 571 | err: |
@@ -550,9 +576,8 @@ err: | |||
550 | return ret; | 576 | return ret; |
551 | } | 577 | } |
552 | 578 | ||
553 | int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | 579 | static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, |
554 | struct block_device *bdev, | 580 | struct block_device *bdev, char __user *arg) |
555 | char __user *arg) | ||
556 | { | 581 | { |
557 | struct blk_user_trace_setup buts; | 582 | struct blk_user_trace_setup buts; |
558 | int ret; | 583 | int ret; |
@@ -571,6 +596,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
571 | } | 596 | } |
572 | return 0; | 597 | return 0; |
573 | } | 598 | } |
599 | |||
600 | int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | ||
601 | struct block_device *bdev, | ||
602 | char __user *arg) | ||
603 | { | ||
604 | int ret; | ||
605 | |||
606 | mutex_lock(&q->blk_trace_mutex); | ||
607 | ret = __blk_trace_setup(q, name, dev, bdev, arg); | ||
608 | mutex_unlock(&q->blk_trace_mutex); | ||
609 | |||
610 | return ret; | ||
611 | } | ||
574 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 612 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
575 | 613 | ||
576 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | 614 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) |
@@ -607,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, | |||
607 | } | 645 | } |
608 | #endif | 646 | #endif |
609 | 647 | ||
610 | int blk_trace_startstop(struct request_queue *q, int start) | 648 | static int __blk_trace_startstop(struct request_queue *q, int start) |
611 | { | 649 | { |
612 | int ret; | 650 | int ret; |
613 | struct blk_trace *bt = q->blk_trace; | 651 | struct blk_trace *bt = q->blk_trace; |
@@ -646,6 +684,17 @@ int blk_trace_startstop(struct request_queue *q, int start) | |||
646 | 684 | ||
647 | return ret; | 685 | return ret; |
648 | } | 686 | } |
687 | |||
688 | int blk_trace_startstop(struct request_queue *q, int start) | ||
689 | { | ||
690 | int ret; | ||
691 | |||
692 | mutex_lock(&q->blk_trace_mutex); | ||
693 | ret = __blk_trace_startstop(q, start); | ||
694 | mutex_unlock(&q->blk_trace_mutex); | ||
695 | |||
696 | return ret; | ||
697 | } | ||
649 | EXPORT_SYMBOL_GPL(blk_trace_startstop); | 698 | EXPORT_SYMBOL_GPL(blk_trace_startstop); |
650 | 699 | ||
651 | /* | 700 | /* |
@@ -676,7 +725,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
676 | switch (cmd) { | 725 | switch (cmd) { |
677 | case BLKTRACESETUP: | 726 | case BLKTRACESETUP: |
678 | bdevname(bdev, b); | 727 | bdevname(bdev, b); |
679 | ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); | 728 | ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); |
680 | break; | 729 | break; |
681 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | 730 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) |
682 | case BLKTRACESETUP32: | 731 | case BLKTRACESETUP32: |
@@ -687,10 +736,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
687 | case BLKTRACESTART: | 736 | case BLKTRACESTART: |
688 | start = 1; | 737 | start = 1; |
689 | case BLKTRACESTOP: | 738 | case BLKTRACESTOP: |
690 | ret = blk_trace_startstop(q, start); | 739 | ret = __blk_trace_startstop(q, start); |
691 | break; | 740 | break; |
692 | case BLKTRACETEARDOWN: | 741 | case BLKTRACETEARDOWN: |
693 | ret = blk_trace_remove(q); | 742 | ret = __blk_trace_remove(q); |
694 | break; | 743 | break; |
695 | default: | 744 | default: |
696 | ret = -ENOTTY; | 745 | ret = -ENOTTY; |
@@ -708,10 +757,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
708 | **/ | 757 | **/ |
709 | void blk_trace_shutdown(struct request_queue *q) | 758 | void blk_trace_shutdown(struct request_queue *q) |
710 | { | 759 | { |
760 | mutex_lock(&q->blk_trace_mutex); | ||
761 | |||
711 | if (q->blk_trace) { | 762 | if (q->blk_trace) { |
712 | blk_trace_startstop(q, 0); | 763 | __blk_trace_startstop(q, 0); |
713 | blk_trace_remove(q); | 764 | __blk_trace_remove(q); |
714 | } | 765 | } |
766 | |||
767 | mutex_unlock(&q->blk_trace_mutex); | ||
715 | } | 768 | } |
716 | 769 | ||
717 | #ifdef CONFIG_BLK_CGROUP | 770 | #ifdef CONFIG_BLK_CGROUP |
@@ -1558,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) | |||
1558 | if (bt == NULL) | 1611 | if (bt == NULL) |
1559 | return -EINVAL; | 1612 | return -EINVAL; |
1560 | 1613 | ||
1561 | if (atomic_dec_and_test(&blk_probes_ref)) | 1614 | put_probe_ref(); |
1562 | blk_unregister_tracepoints(); | ||
1563 | |||
1564 | blk_trace_free(bt); | 1615 | blk_trace_free(bt); |
1565 | return 0; | 1616 | return 0; |
1566 | } | 1617 | } |
@@ -1591,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, | |||
1591 | if (cmpxchg(&q->blk_trace, NULL, bt)) | 1642 | if (cmpxchg(&q->blk_trace, NULL, bt)) |
1592 | goto free_bt; | 1643 | goto free_bt; |
1593 | 1644 | ||
1594 | if (atomic_inc_return(&blk_probes_ref) == 1) | 1645 | get_probe_ref(); |
1595 | blk_register_tracepoints(); | ||
1596 | return 0; | 1646 | return 0; |
1597 | 1647 | ||
1598 | free_bt: | 1648 | free_bt: |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e19606bb41a0..74b52dfd5852 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -1072,23 +1072,3 @@ out: | |||
1072 | return ret; | 1072 | return ret; |
1073 | } | 1073 | } |
1074 | EXPORT_SYMBOL(wait_iff_congested); | 1074 | EXPORT_SYMBOL(wait_iff_congested); |
1075 | |||
1076 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
1077 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1078 | { | ||
1079 | char kbuf[] = "0\n"; | ||
1080 | |||
1081 | if (*ppos || *lenp < sizeof(kbuf)) { | ||
1082 | *lenp = 0; | ||
1083 | return 0; | ||
1084 | } | ||
1085 | |||
1086 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
1087 | return -EFAULT; | ||
1088 | pr_warn_once("%s exported in /proc is scheduled for removal\n", | ||
1089 | table->procname); | ||
1090 | |||
1091 | *lenp = 2; | ||
1092 | *ppos += *lenp; | ||
1093 | return 2; | ||
1094 | } | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b9c5cbe8eba..c518c845f202 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1972,31 +1972,31 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) | |||
1972 | int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, | 1972 | int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, |
1973 | void __user *buffer, size_t *length, loff_t *ppos) | 1973 | void __user *buffer, size_t *length, loff_t *ppos) |
1974 | { | 1974 | { |
1975 | proc_dointvec(table, write, buffer, length, ppos); | 1975 | unsigned int old_interval = dirty_writeback_interval; |
1976 | return 0; | 1976 | int ret; |
1977 | |||
1978 | ret = proc_dointvec(table, write, buffer, length, ppos); | ||
1979 | |||
1980 | /* | ||
1981 | * Writing 0 to dirty_writeback_interval will disable periodic writeback | ||
1982 | * and a different non-zero value will wakeup the writeback threads. | ||
1983 | * wb_wakeup_delayed() would be more appropriate, but it's a pain to | ||
1984 | * iterate over all bdis and wbs. | ||
1985 | * The reason we do this is to make the change take effect immediately. | ||
1986 | */ | ||
1987 | if (!ret && write && dirty_writeback_interval && | ||
1988 | dirty_writeback_interval != old_interval) | ||
1989 | wakeup_flusher_threads(WB_REASON_PERIODIC); | ||
1990 | |||
1991 | return ret; | ||
1977 | } | 1992 | } |
1978 | 1993 | ||
1979 | #ifdef CONFIG_BLOCK | 1994 | #ifdef CONFIG_BLOCK |
1980 | void laptop_mode_timer_fn(unsigned long data) | 1995 | void laptop_mode_timer_fn(unsigned long data) |
1981 | { | 1996 | { |
1982 | struct request_queue *q = (struct request_queue *)data; | 1997 | struct request_queue *q = (struct request_queue *)data; |
1983 | int nr_pages = global_node_page_state(NR_FILE_DIRTY) + | ||
1984 | global_node_page_state(NR_UNSTABLE_NFS); | ||
1985 | struct bdi_writeback *wb; | ||
1986 | 1998 | ||
1987 | /* | 1999 | wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER); |
1988 | * We want to write everything out, not just down to the dirty | ||
1989 | * threshold | ||
1990 | */ | ||
1991 | if (!bdi_has_dirty_io(q->backing_dev_info)) | ||
1992 | return; | ||
1993 | |||
1994 | rcu_read_lock(); | ||
1995 | list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) | ||
1996 | if (wb_has_dirty_io(wb)) | ||
1997 | wb_start_writeback(wb, nr_pages, true, | ||
1998 | WB_REASON_LAPTOP_TIMER); | ||
1999 | rcu_read_unlock(); | ||
2000 | } | 2000 | } |
2001 | 2001 | ||
2002 | /* | 2002 | /* |
diff --git a/mm/page_io.c b/mm/page_io.c index 5d882de3fbfd..cd52b9cc169b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -408,7 +408,7 @@ int swap_readpage(struct page *page, bool do_poll) | |||
408 | if (!READ_ONCE(bio->bi_private)) | 408 | if (!READ_ONCE(bio->bi_private)) |
409 | break; | 409 | break; |
410 | 410 | ||
411 | if (!blk_mq_poll(disk->queue, qc)) | 411 | if (!blk_poll(disk->queue, qc)) |
412 | break; | 412 | break; |
413 | } | 413 | } |
414 | __set_current_state(TASK_RUNNING); | 414 | __set_current_state(TASK_RUNNING); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index eb2f0315b8c0..15b483ef6440 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1868,7 +1868,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1868 | * also allow kswapd to start writing pages during reclaim. | 1868 | * also allow kswapd to start writing pages during reclaim. |
1869 | */ | 1869 | */ |
1870 | if (stat.nr_unqueued_dirty == nr_taken) { | 1870 | if (stat.nr_unqueued_dirty == nr_taken) { |
1871 | wakeup_flusher_threads(0, WB_REASON_VMSCAN); | 1871 | wakeup_flusher_threads(WB_REASON_VMSCAN); |
1872 | set_bit(PGDAT_DIRTY, &pgdat->flags); | 1872 | set_bit(PGDAT_DIRTY, &pgdat->flags); |
1873 | } | 1873 | } |
1874 | 1874 | ||