author    Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
commit    0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0 (patch)
tree      cefccd67dc1f27bb45830f6b8065dd4a1c05e83b
parent    9697e9da84299d0d715d515dd2cc48f1eceb277d (diff)
parent    796baeeef85a40b3495a907fb7425086e7010102 (diff)
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount
  of improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien
     and Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal
     of rrpc 1.2 in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing
     from Johannes.

   - Support for blk-throttle for tracking reads and writes
     separately. From Joseph Qi. A few cleanups/fixes also for
     blk-throttle from Weiping.

   - Series from Mike Snitzer that enables dm to register its queue
     more logically, something that's always been problematic on dm
     since it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping
     and quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The
     previous scheme relied on atomic bits, but it had races where we
     would think a request had timed out if it got reused at the
     wrong time.

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all.
     From me.

   - sgl_alloc/free helpers from Bart.

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
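As a quick illustration of the sgl_alloc/free helpers mentioned above (added to
lib/scatterlist.c and include/linux/scatterlist.h by this pull, behind a new
Kconfig symbol), the sketch below shows how a caller might allocate and free a
scatterlist backed by freshly allocated pages. This is not code from the merge:
the function name, the 1 MiB size, and the error handling are illustrative
assumptions only.

	#include <linux/scatterlist.h>
	#include <linux/sizes.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	static int example_alloc_sgl(void)
	{
		unsigned int nents;
		struct scatterlist *sgl;

		/* Allocate an sg list covering 1 MiB of newly allocated pages. */
		sgl = sgl_alloc(SZ_1M, GFP_KERNEL, &nents);
		if (!sgl)
			return -ENOMEM;

		/* ... dma_map_sg() and use the nents entries here ... */

		/* Frees both the pages and the sg table. */
		sgl_free(sgl);
		return 0;
	}
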
-rw-r--r--  block/bfq-cgroup.c | 7
-rw-r--r--  block/bfq-iosched.c | 529
-rw-r--r--  block/bfq-iosched.h | 19
-rw-r--r--  block/bfq-wf2q.c | 7
-rw-r--r--  block/bio-integrity.c | 1
-rw-r--r--  block/bio.c | 30
-rw-r--r--  block/blk-core.c | 87
-rw-r--r--  block/blk-exec.c | 2
-rw-r--r--  block/blk-lib.c | 12
-rw-r--r--  block/blk-map.c | 4
-rw-r--r--  block/blk-merge.c | 13
-rw-r--r--  block/blk-mq-debugfs.c | 22
-rw-r--r--  block/blk-mq-sched.c | 3
-rw-r--r--  block/blk-mq-sched.h | 2
-rw-r--r--  block/blk-mq-sysfs.c | 9
-rw-r--r--  block/blk-mq-tag.c | 13
-rw-r--r--  block/blk-mq.c | 667
-rw-r--r--  block/blk-mq.h | 52
-rw-r--r--  block/blk-sysfs.c | 47
-rw-r--r--  block/blk-throttle.c | 146
-rw-r--r--  block/blk-timeout.c | 26
-rw-r--r--  block/blk-zoned.c | 42
-rw-r--r--  block/blk.h | 46
-rw-r--r--  block/bounce.c | 33
-rw-r--r--  block/bsg-lib.c | 3
-rw-r--r--  block/bsg.c | 40
-rw-r--r--  block/deadline-iosched.c | 114
-rw-r--r--  block/elevator.c | 12
-rw-r--r--  block/genhd.c | 23
-rw-r--r--  block/mq-deadline.c | 141
-rw-r--r--  block/partitions/msdos.c | 4
-rw-r--r--  block/scsi_ioctl.c | 34
-rw-r--r--  crypto/Kconfig | 1
-rw-r--r--  crypto/scompress.c | 51
-rw-r--r--  drivers/block/DAC960.c | 160
-rw-r--r--  drivers/block/Kconfig | 4
-rw-r--r--  drivers/block/aoe/aoe.h | 3
-rw-r--r--  drivers/block/aoe/aoecmd.c | 48
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 2
-rw-r--r--  drivers/block/null_blk.c | 290
-rw-r--r--  drivers/block/pktcdvd.c | 12
-rw-r--r--  drivers/block/smart1,2.h | 278
-rw-r--r--  drivers/block/zram/zram_drv.c | 2
-rw-r--r--  drivers/lightnvm/Kconfig | 7
-rw-r--r--  drivers/lightnvm/Makefile | 1
-rw-r--r--  drivers/lightnvm/core.c | 462
-rw-r--r--  drivers/lightnvm/pblk-cache.c | 5
-rw-r--r--  drivers/lightnvm/pblk-core.c | 55
-rw-r--r--  drivers/lightnvm/pblk-gc.c | 23
-rw-r--r--  drivers/lightnvm/pblk-init.c | 104
-rw-r--r--  drivers/lightnvm/pblk-map.c | 2
-rw-r--r--  drivers/lightnvm/pblk-rb.c | 111
-rw-r--r--  drivers/lightnvm/pblk-read.c | 35
-rw-r--r--  drivers/lightnvm/pblk-recovery.c | 43
-rw-r--r--  drivers/lightnvm/pblk-rl.c | 54
-rw-r--r--  drivers/lightnvm/pblk-sysfs.c | 15
-rw-r--r--  drivers/lightnvm/pblk-write.c | 23
-rw-r--r--  drivers/lightnvm/pblk.h | 163
-rw-r--r--  drivers/lightnvm/rrpc.c | 1625
-rw-r--r--  drivers/lightnvm/rrpc.h | 290
-rw-r--r--  drivers/md/bcache/alloc.c | 19
-rw-r--r--  drivers/md/bcache/bcache.h | 24
-rw-r--r--  drivers/md/bcache/btree.c | 10
-rw-r--r--  drivers/md/bcache/closure.c | 47
-rw-r--r--  drivers/md/bcache/closure.h | 60
-rw-r--r--  drivers/md/bcache/debug.c | 7
-rw-r--r--  drivers/md/bcache/io.c | 13
-rw-r--r--  drivers/md/bcache/movinggc.c | 2
-rw-r--r--  drivers/md/bcache/request.c | 29
-rw-r--r--  drivers/md/bcache/super.c | 27
-rw-r--r--  drivers/md/bcache/util.c | 34
-rw-r--r--  drivers/md/bcache/util.h | 1
-rw-r--r--  drivers/md/bcache/writeback.c | 203
-rw-r--r--  drivers/md/bcache/writeback.h | 12
-rw-r--r--  drivers/md/dm-crypt.c | 1
-rw-r--r--  drivers/md/dm-mpath.c | 19
-rw-r--r--  drivers/md/dm-rq.c | 28
-rw-r--r--  drivers/md/dm.c | 21
-rw-r--r--  drivers/nvme/host/Makefile | 4
-rw-r--r--  drivers/nvme/host/core.c | 134
-rw-r--r--  drivers/nvme/host/fabrics.c | 22
-rw-r--r--  drivers/nvme/host/fabrics.h | 2
-rw-r--r--  drivers/nvme/host/fc.c | 7
-rw-r--r--  drivers/nvme/host/lightnvm.c | 185
-rw-r--r--  drivers/nvme/host/multipath.c | 44
-rw-r--r--  drivers/nvme/host/nvme.h | 9
-rw-r--r--  drivers/nvme/host/pci.c | 216
-rw-r--r--  drivers/nvme/host/rdma.c | 6
-rw-r--r--  drivers/nvme/host/trace.c | 130
-rw-r--r--  drivers/nvme/host/trace.h | 165
-rw-r--r--  drivers/nvme/target/Kconfig | 2
-rw-r--r--  drivers/nvme/target/core.c | 14
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 2
-rw-r--r--  drivers/nvme/target/fc.c | 60
-rw-r--r--  drivers/nvme/target/fcloop.c | 244
-rw-r--r--  drivers/nvme/target/loop.c | 3
-rw-r--r--  drivers/nvme/target/rdma.c | 83
-rw-r--r--  drivers/target/Kconfig | 1
-rw-r--r--  drivers/target/target_core_transport.c | 46
-rw-r--r--  fs/btrfs/compression.c | 4
-rw-r--r--  fs/btrfs/extent_io.c | 11
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/inode.c | 8
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/f2fs/data.c | 2
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  include/linux/bio.h | 24
-rw-r--r--  include/linux/blk-cgroup.h | 8
-rw-r--r--  include/linux/blk-mq.h | 3
-rw-r--r--  include/linux/blk_types.h | 28
-rw-r--r--  include/linux/blkdev.h | 172
-rw-r--r--  include/linux/bvec.h | 9
-rw-r--r--  include/linux/elevator.h | 2
-rw-r--r--  include/linux/genhd.h | 5
-rw-r--r--  include/linux/lightnvm.h | 125
-rw-r--r--  include/linux/nvme.h | 22
-rw-r--r--  include/linux/scatterlist.h | 11
-rw-r--r--  include/uapi/linux/lightnvm.h | 9
-rw-r--r--  kernel/irq/affinity.c | 30
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  lib/Kconfig | 4
-rw-r--r--  lib/sbitmap.c | 2
-rw-r--r--  lib/scatterlist.c | 127
-rw-r--r--  mm/page_io.c | 4
124 files changed, 3884 insertions, 4729 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index da1525ec4c87..d819dc77fe65 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
775 unsigned long flags; 775 unsigned long flags;
776 int i; 776 int i;
777 777
778 spin_lock_irqsave(&bfqd->lock, flags);
779
778 if (!entity) /* root group */ 780 if (!entity) /* root group */
779 return; 781 goto put_async_queues;
780 782
781 spin_lock_irqsave(&bfqd->lock, flags);
782 /* 783 /*
783 * Empty all service_trees belonging to this group before 784 * Empty all service_trees belonging to this group before
784 * deactivating the group itself. 785 * deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
809 } 810 }
810 811
811 __bfq_deactivate_entity(entity, false); 812 __bfq_deactivate_entity(entity, false);
813
814put_async_queues:
812 bfq_put_async_queues(bfqd, bfqg); 815 bfq_put_async_queues(bfqd, bfqg);
813 816
814 spin_unlock_irqrestore(&bfqd->lock, flags); 817 spin_unlock_irqrestore(&bfqd->lock, flags);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bcb6d21baf12..47e6ec7427c4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
166/* Default timeout values, in jiffies, approximating CFQ defaults. */ 166/* Default timeout values, in jiffies, approximating CFQ defaults. */
167const int bfq_timeout = HZ / 8; 167const int bfq_timeout = HZ / 8;
168 168
169/*
170 * Time limit for merging (see comments in bfq_setup_cooperator). Set
171 * to the slowest value that, in our tests, proved to be effective in
172 * removing false positives, while not causing true positives to miss
173 * queue merging.
174 *
175 * As can be deduced from the low time limit below, queue merging, if
176 * successful, happens at the very beginning of the I/O of the involved
177 * cooperating processes, as a consequence of the arrival of the very
178 * first requests from each cooperator. After that, there is very
179 * little chance to find cooperators.
180 */
181static const unsigned long bfq_merge_time_limit = HZ/10;
182
169static struct kmem_cache *bfq_pool; 183static struct kmem_cache *bfq_pool;
170 184
171/* Below this threshold (in ns), we consider thinktime immediate. */ 185/* Below this threshold (in ns), we consider thinktime immediate. */
@@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;
178#define BFQQ_SEEK_THR (sector_t)(8 * 100) 192#define BFQQ_SEEK_THR (sector_t)(8 * 100)
179#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 193#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
180#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) 194#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
181#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) 195#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
182 196
183/* Min number of samples required to perform peak-rate update */ 197/* Min number of samples required to perform peak-rate update */
184#define BFQ_RATE_MIN_SAMPLES 32 198#define BFQ_RATE_MIN_SAMPLES 32
@@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;
195 * interactive applications automatically, using the following formula: 209 * interactive applications automatically, using the following formula:
196 * duration = (R / r) * T, where r is the peak rate of the device, and 210 * duration = (R / r) * T, where r is the peak rate of the device, and
197 * R and T are two reference parameters. 211 * R and T are two reference parameters.
198 * In particular, R is the peak rate of the reference device (see below), 212 * In particular, R is the peak rate of the reference device (see
199 * and T is a reference time: given the systems that are likely to be 213 * below), and T is a reference time: given the systems that are
200 * installed on the reference device according to its speed class, T is 214 * likely to be installed on the reference device according to its
201 * about the maximum time needed, under BFQ and while reading two files in 215 * speed class, T is about the maximum time needed, under BFQ and
202 * parallel, to load typical large applications on these systems. 216 * while reading two files in parallel, to load typical large
203 * In practice, the slower/faster the device at hand is, the more/less it 217 * applications on these systems (see the comments on
204 * takes to load applications with respect to the reference device. 218 * max_service_from_wr below, for more details on how T is obtained).
205 * Accordingly, the longer/shorter BFQ grants weight raising to interactive 219 * In practice, the slower/faster the device at hand is, the more/less
206 * applications. 220 * it takes to load applications with respect to the reference device.
221 * Accordingly, the longer/shorter BFQ grants weight raising to
222 * interactive applications.
207 * 223 *
208 * BFQ uses four different reference pairs (R, T), depending on: 224 * BFQ uses four different reference pairs (R, T), depending on:
209 * . whether the device is rotational or non-rotational; 225 * . whether the device is rotational or non-rotational;
@@ -240,6 +256,60 @@ static int T_slow[2];
240static int T_fast[2]; 256static int T_fast[2];
241static int device_speed_thresh[2]; 257static int device_speed_thresh[2];
242 258
259/*
260 * BFQ uses the above-detailed, time-based weight-raising mechanism to
261 * privilege interactive tasks. This mechanism is vulnerable to the
262 * following false positives: I/O-bound applications that will go on
263 * doing I/O for much longer than the duration of weight
264 * raising. These applications have basically no benefit from being
265 * weight-raised at the beginning of their I/O. On the opposite end,
266 * while being weight-raised, these applications
267 * a) unjustly steal throughput to applications that may actually need
268 * low latency;
269 * b) make BFQ uselessly perform device idling; device idling results
270 * in loss of device throughput with most flash-based storage, and may
271 * increase latencies when used purposelessly.
272 *
273 * BFQ tries to reduce these problems, by adopting the following
274 * countermeasure. To introduce this countermeasure, we need first to
275 * finish explaining how the duration of weight-raising for
276 * interactive tasks is computed.
277 *
278 * For a bfq_queue deemed as interactive, the duration of weight
279 * raising is dynamically adjusted, as a function of the estimated
280 * peak rate of the device, so as to be equal to the time needed to
281 * execute the 'largest' interactive task we benchmarked so far. By
282 * largest task, we mean the task for which each involved process has
283 * to do more I/O than for any of the other tasks we benchmarked. This
284 * reference interactive task is the start-up of LibreOffice Writer,
285 * and in this task each process/bfq_queue needs to have at most ~110K
286 * sectors transferred.
287 *
288 * This last piece of information enables BFQ to reduce the actual
289 * duration of weight-raising for at least one class of I/O-bound
290 * applications: those doing sequential or quasi-sequential I/O. An
291 * example is file copy. In fact, once started, the main I/O-bound
292 * processes of these applications usually consume the above 110K
293 * sectors in much less time than the processes of an application that
294 * is starting, because these I/O-bound processes will greedily devote
295 * almost all their CPU cycles only to their target,
296 * throughput-friendly I/O operations. This is even more true if BFQ
297 * happens to be underestimating the device peak rate, and thus
298 * overestimating the duration of weight raising. But, according to
299 * our measurements, once transferred 110K sectors, these processes
300 * have no right to be weight-raised any longer.
301 *
302 * Basing on the last consideration, BFQ ends weight-raising for a
303 * bfq_queue if the latter happens to have received an amount of
304 * service at least equal to the following constant. The constant is
305 * set to slightly more than 110K, to have a minimum safety margin.
306 *
307 * This early ending of weight-raising reduces the amount of time
308 * during which interactive false positives cause the two problems
309 * described at the beginning of these comments.
310 */
311static const unsigned long max_service_from_wr = 120000;
312
243#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) 313#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
244#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) 314#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
245 315
@@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
403 } 473 }
404} 474}
405 475
476/*
477 * See the comments on bfq_limit_depth for the purpose of
478 * the depths set in the function.
479 */
480static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
481{
482 bfqd->sb_shift = bt->sb.shift;
483
484 /*
485 * In-word depths if no bfq_queue is being weight-raised:
486 * leaving 25% of tags only for sync reads.
487 *
488 * In next formulas, right-shift the value
489 * (1U<<bfqd->sb_shift), instead of computing directly
490 * (1U<<(bfqd->sb_shift - something)), to be robust against
491 * any possible value of bfqd->sb_shift, without having to
492 * limit 'something'.
493 */
494 /* no more than 50% of tags for async I/O */
495 bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
496 /*
497 * no more than 75% of tags for sync writes (25% extra tags
498 * w.r.t. async I/O, to prevent async I/O from starving sync
499 * writes)
500 */
501 bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
502
503 /*
504 * In-word depths in case some bfq_queue is being weight-
505 * raised: leaving ~63% of tags for sync reads. This is the
506 * highest percentage for which, in our tests, application
507 * start-up times didn't suffer from any regression due to tag
508 * shortage.
509 */
510 /* no more than ~18% of tags for async I/O */
511 bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
512 /* no more than ~37% of tags for sync writes (~20% extra tags) */
513 bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
514}
515
516/*
517 * Async I/O can easily starve sync I/O (both sync reads and sync
518 * writes), by consuming all tags. Similarly, storms of sync writes,
519 * such as those that sync(2) may trigger, can starve sync reads.
520 * Limit depths of async I/O and sync writes so as to counter both
521 * problems.
522 */
523static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
524{
525 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
526 struct bfq_data *bfqd = data->q->elevator->elevator_data;
527 struct sbitmap_queue *bt;
528
529 if (op_is_sync(op) && !op_is_write(op))
530 return;
531
532 if (data->flags & BLK_MQ_REQ_RESERVED) {
533 if (unlikely(!tags->nr_reserved_tags)) {
534 WARN_ON_ONCE(1);
535 return;
536 }
537 bt = &tags->breserved_tags;
538 } else
539 bt = &tags->bitmap_tags;
540
541 if (unlikely(bfqd->sb_shift != bt->sb.shift))
542 bfq_update_depths(bfqd, bt);
543
544 data->shallow_depth =
545 bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
546
547 bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
548 __func__, bfqd->wr_busy_queues, op_is_sync(op),
549 data->shallow_depth);
550}
551
406static struct bfq_queue * 552static struct bfq_queue *
407bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, 553bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
408 sector_t sector, struct rb_node **ret_parent, 554 sector_t sector, struct rb_node **ret_parent,
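[For illustration only; this note is not part of the diff above.] A worked
example of the depth formulas added in bfq_update_depths(), assuming
bfqd->sb_shift = 6, i.e. 64 tags per sbitmap word:

    word_depths[0][0] = max(64 >> 1, 1)        = 32   /* 50% of tags: async I/O, no wr queue  */
    word_depths[0][1] = max((64 * 3) >> 2, 1)  = 48   /* 75%: sync writes, no wr queue        */
    word_depths[1][0] = max((64 * 3) >> 4, 1)  = 12   /* ~18%: async I/O, wr queue active     */
    word_depths[1][1] = max((64 * 6) >> 4, 1)  = 24   /* ~37%: sync writes, wr queue active   */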
@@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
444 return bfqq; 590 return bfqq;
445} 591}
446 592
593static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
594{
595 return bfqq->service_from_backlogged > 0 &&
596 time_is_before_jiffies(bfqq->first_IO_time +
597 bfq_merge_time_limit);
598}
599
447void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) 600void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
448{ 601{
449 struct rb_node **p, *parent; 602 struct rb_node **p, *parent;
@@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
454 bfqq->pos_root = NULL; 607 bfqq->pos_root = NULL;
455 } 608 }
456 609
610 /*
611 * bfqq cannot be merged any longer (see comments in
612 * bfq_setup_cooperator): no point in adding bfqq into the
613 * position tree.
614 */
615 if (bfq_too_late_for_merging(bfqq))
616 return;
617
457 if (bfq_class_idle(bfqq)) 618 if (bfq_class_idle(bfqq))
458 return; 619 return;
459 if (!bfqq->next_rq) 620 if (!bfqq->next_rq)
@@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
1247 if (old_wr_coeff == 1 && wr_or_deserves_wr) { 1408 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
1248 /* start a weight-raising period */ 1409 /* start a weight-raising period */
1249 if (interactive) { 1410 if (interactive) {
1411 bfqq->service_from_wr = 0;
1250 bfqq->wr_coeff = bfqd->bfq_wr_coeff; 1412 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1251 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); 1413 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1252 } else { 1414 } else {
@@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,
1627 rb_erase(&bfqq->pos_node, bfqq->pos_root); 1789 rb_erase(&bfqq->pos_node, bfqq->pos_root);
1628 bfqq->pos_root = NULL; 1790 bfqq->pos_root = NULL;
1629 } 1791 }
1792 } else {
1793 bfq_pos_tree_add_move(bfqd, bfqq);
1630 } 1794 }
1631 1795
1632 if (rq->cmd_flags & REQ_META) 1796 if (rq->cmd_flags & REQ_META)
@@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1933static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, 2097static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
1934 struct bfq_queue *new_bfqq) 2098 struct bfq_queue *new_bfqq)
1935{ 2099{
2100 if (bfq_too_late_for_merging(new_bfqq))
2101 return false;
2102
1936 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || 2103 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
1937 (bfqq->ioprio_class != new_bfqq->ioprio_class)) 2104 (bfqq->ioprio_class != new_bfqq->ioprio_class))
1938 return false; 2105 return false;
@@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
1957} 2124}
1958 2125
1959/* 2126/*
1960 * If this function returns true, then bfqq cannot be merged. The idea
1961 * is that true cooperation happens very early after processes start
1962 * to do I/O. Usually, late cooperations are just accidental false
1963 * positives. In case bfqq is weight-raised, such false positives
1964 * would evidently degrade latency guarantees for bfqq.
1965 */
1966static bool wr_from_too_long(struct bfq_queue *bfqq)
1967{
1968 return bfqq->wr_coeff > 1 &&
1969 time_is_before_jiffies(bfqq->last_wr_start_finish +
1970 msecs_to_jiffies(100));
1971}
1972
1973/*
1974 * Attempt to schedule a merge of bfqq with the currently in-service 2127 * Attempt to schedule a merge of bfqq with the currently in-service
1975 * queue or with a close queue among the scheduled queues. Return 2128 * queue or with a close queue among the scheduled queues. Return
1976 * NULL if no merge was scheduled, a pointer to the shared bfq_queue 2129 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
@@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
1983 * to maintain. Besides, in such a critical condition as an out of memory, 2136 * to maintain. Besides, in such a critical condition as an out of memory,
1984 * the benefits of queue merging may be little relevant, or even negligible. 2137 * the benefits of queue merging may be little relevant, or even negligible.
1985 * 2138 *
1986 * Weight-raised queues can be merged only if their weight-raising
1987 * period has just started. In fact cooperating processes are usually
1988 * started together. Thus, with this filter we avoid false positives
1989 * that would jeopardize low-latency guarantees.
1990 *
1991 * WARNING: queue merging may impair fairness among non-weight raised 2139 * WARNING: queue merging may impair fairness among non-weight raised
1992 * queues, for at least two reasons: 1) the original weight of a 2140 * queues, for at least two reasons: 1) the original weight of a
1993 * merged queue may change during the merged state, 2) even being the 2141 * merged queue may change during the merged state, 2) even being the
@@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2001{ 2149{
2002 struct bfq_queue *in_service_bfqq, *new_bfqq; 2150 struct bfq_queue *in_service_bfqq, *new_bfqq;
2003 2151
2152 /*
2153 * Prevent bfqq from being merged if it has been created too
2154 * long ago. The idea is that true cooperating processes, and
2155 * thus their associated bfq_queues, are supposed to be
2156 * created shortly after each other. This is the case, e.g.,
2157 * for KVM/QEMU and dump I/O threads. Basing on this
2158 * assumption, the following filtering greatly reduces the
2159 * probability that two non-cooperating processes, which just
2160 * happen to do close I/O for some short time interval, have
2161 * their queues merged by mistake.
2162 */
2163 if (bfq_too_late_for_merging(bfqq))
2164 return NULL;
2165
2004 if (bfqq->new_bfqq) 2166 if (bfqq->new_bfqq)
2005 return bfqq->new_bfqq; 2167 return bfqq->new_bfqq;
2006 2168
2007 if (!io_struct || 2169 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
2008 wr_from_too_long(bfqq) ||
2009 unlikely(bfqq == &bfqd->oom_bfqq))
2010 return NULL; 2170 return NULL;
2011 2171
2012 /* If there is only one backlogged queue, don't search. */ 2172 /* If there is only one backlogged queue, don't search. */
@@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2015 2175
2016 in_service_bfqq = bfqd->in_service_queue; 2176 in_service_bfqq = bfqd->in_service_queue;
2017 2177
2018 if (!in_service_bfqq || in_service_bfqq == bfqq 2178 if (in_service_bfqq && in_service_bfqq != bfqq &&
2019 || wr_from_too_long(in_service_bfqq) || 2179 likely(in_service_bfqq != &bfqd->oom_bfqq) &&
2020 unlikely(in_service_bfqq == &bfqd->oom_bfqq)) 2180 bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
2021 goto check_scheduled;
2022
2023 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
2024 bfqq->entity.parent == in_service_bfqq->entity.parent && 2181 bfqq->entity.parent == in_service_bfqq->entity.parent &&
2025 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { 2182 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
2026 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); 2183 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2032 * queues. The only thing we need is that the bio/request is not 2189 * queues. The only thing we need is that the bio/request is not
2033 * NULL, as we need it to establish whether a cooperator exists. 2190 * NULL, as we need it to establish whether a cooperator exists.
2034 */ 2191 */
2035check_scheduled:
2036 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, 2192 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
2037 bfq_io_struct_pos(io_struct, request)); 2193 bfq_io_struct_pos(io_struct, request));
2038 2194
2039 if (new_bfqq && !wr_from_too_long(new_bfqq) && 2195 if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
2040 likely(new_bfqq != &bfqd->oom_bfqq) &&
2041 bfq_may_be_close_cooperator(bfqq, new_bfqq)) 2196 bfq_may_be_close_cooperator(bfqq, new_bfqq))
2042 return bfq_setup_merge(bfqq, new_bfqq); 2197 return bfq_setup_merge(bfqq, new_bfqq);
2043 2198
@@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
2062 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); 2217 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
2063 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); 2218 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
2064 if (unlikely(bfq_bfqq_just_created(bfqq) && 2219 if (unlikely(bfq_bfqq_just_created(bfqq) &&
2065 !bfq_bfqq_in_large_burst(bfqq))) { 2220 !bfq_bfqq_in_large_burst(bfqq) &&
2221 bfqq->bfqd->low_latency)) {
2066 /* 2222 /*
2067 * bfqq being merged right after being created: bfqq 2223 * bfqq being merged right after being created: bfqq
2068 * would have deserved interactive weight raising, but 2224 * would have deserved interactive weight raising, but
@@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2917 * whereas soft_rt_next_start is set to infinity for applications that do 3073 * whereas soft_rt_next_start is set to infinity for applications that do
2918 * not. 3074 * not.
2919 * 3075 *
2920 * Unfortunately, even a greedy application may happen to behave in an 3076 * Unfortunately, even a greedy (i.e., I/O-bound) application may
2921 * isochronous way if the CPU load is high. In fact, the application may 3077 * happen to meet, occasionally or systematically, both the above
2922 * stop issuing requests while the CPUs are busy serving other processes, 3078 * bandwidth and isochrony requirements. This may happen at least in
2923 * then restart, then stop again for a while, and so on. In addition, if 3079 * the following circumstances. First, if the CPU load is high. The
2924 * the disk achieves a low enough throughput with the request pattern 3080 * application may stop issuing requests while the CPUs are busy
2925 * issued by the application (e.g., because the request pattern is random 3081 * serving other processes, then restart, then stop again for a while,
2926 * and/or the device is slow), then the application may meet the above 3082 * and so on. The other circumstances are related to the storage
2927 * bandwidth requirement too. To prevent such a greedy application to be 3083 * device: the storage device is highly loaded or reaches a low-enough
2928 * deemed as soft real-time, a further rule is used in the computation of 3084 * throughput with the I/O of the application (e.g., because the I/O
2929 * soft_rt_next_start: soft_rt_next_start must be higher than the current 3085 * is random and/or the device is slow). In all these cases, the
2930 * time plus the maximum time for which the arrival of a request is waited 3086 * I/O of the application may be simply slowed down enough to meet
2931 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. 3087 * the bandwidth and isochrony requirements. To reduce the probability
2932 * This filters out greedy applications, as the latter issue instead their 3088 * that greedy applications are deemed as soft real-time in these
2933 * next request as soon as possible after the last one has been completed 3089 * corner cases, a further rule is used in the computation of
2934 * (in contrast, when a batch of requests is completed, a soft real-time 3090 * soft_rt_next_start: the return value of this function is forced to
2935 * application spends some time processing data). 3091 * be higher than the maximum between the following two quantities.
3092 *
3093 * (a) Current time plus: (1) the maximum time for which the arrival
3094 * of a request is waited for when a sync queue becomes idle,
3095 * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
3096 * postpone for a moment the reason for adding a few extra
3097 * jiffies; we get back to it after next item (b). Lower-bounding
3098 * the return value of this function with the current time plus
3099 * bfqd->bfq_slice_idle tends to filter out greedy applications,
3100 * because the latter issue their next request as soon as possible
3101 * after the last one has been completed. In contrast, a soft
3102 * real-time application spends some time processing data, after a
3103 * batch of its requests has been completed.
2936 * 3104 *
2937 * Unfortunately, the last filter may easily generate false positives if 3105 * (b) Current value of bfqq->soft_rt_next_start. As pointed out
2938 * only bfqd->bfq_slice_idle is used as a reference time interval and one 3106 * above, greedy applications may happen to meet both the
2939 * or both the following cases occur: 3107 * bandwidth and isochrony requirements under heavy CPU or
2940 * 1) HZ is so low that the duration of a jiffy is comparable to or higher 3108 * storage-device load. In more detail, in these scenarios, these
2941 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with 3109 * applications happen, only for limited time periods, to do I/O
2942 * HZ=100. 3110 * slowly enough to meet all the requirements described so far,
3111 * including the filtering in above item (a). These slow-speed
3112 * time intervals are usually interspersed between other time
3113 * intervals during which these applications do I/O at a very high
3114 * speed. Fortunately, exactly because of the high speed of the
3115 * I/O in the high-speed intervals, the values returned by this
3116 * function happen to be so high, near the end of any such
3117 * high-speed interval, to be likely to fall *after* the end of
3118 * the low-speed time interval that follows. These high values are
3119 * stored in bfqq->soft_rt_next_start after each invocation of
3120 * this function. As a consequence, if the last value of
3121 * bfqq->soft_rt_next_start is constantly used to lower-bound the
3122 * next value that this function may return, then, from the very
3123 * beginning of a low-speed interval, bfqq->soft_rt_next_start is
3124 * likely to be constantly kept so high that any I/O request
3125 * issued during the low-speed interval is considered as arriving
3126 * too soon for the application to be deemed as soft
3127 * real-time. Then, in the high-speed interval that follows, the
3128 * application will not be deemed as soft real-time, just because
3129 * it will do I/O at a high speed. And so on.
3130 *
3131 * Getting back to the filtering in item (a), in the following two
3132 * cases this filtering might be easily passed by a greedy
3133 * application, if the reference quantity was just
3134 * bfqd->bfq_slice_idle:
3135 * 1) HZ is so low that the duration of a jiffy is comparable to or
3136 * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
3137 * devices with HZ=100. The time granularity may be so coarse
3138 * that the approximation, in jiffies, of bfqd->bfq_slice_idle
3139 * is rather lower than the exact value.
2943 * 2) jiffies, instead of increasing at a constant rate, may stop increasing 3140 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2944 * for a while, then suddenly 'jump' by several units to recover the lost 3141 * for a while, then suddenly 'jump' by several units to recover the lost
2945 * increments. This seems to happen, e.g., inside virtual machines. 3142 * increments. This seems to happen, e.g., inside virtual machines.
2946 * To address this issue, we do not use as a reference time interval just 3143 * To address this issue, in the filtering in (a) we do not use as a
2947 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In 3144 * reference time interval just bfqd->bfq_slice_idle, but
2948 * particular we add the minimum number of jiffies for which the filter 3145 * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
2949 * seems to be quite precise also in embedded systems and KVM/QEMU virtual 3146 * minimum number of jiffies for which the filter seems to be quite
2950 * machines. 3147 * precise also in embedded systems and KVM/QEMU virtual machines.
2951 */ 3148 */
2952static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, 3149static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2953 struct bfq_queue *bfqq) 3150 struct bfq_queue *bfqq)
2954{ 3151{
2955 return max(bfqq->last_idle_bklogged + 3152 return max3(bfqq->soft_rt_next_start,
2956 HZ * bfqq->service_from_backlogged / 3153 bfqq->last_idle_bklogged +
2957 bfqd->bfq_wr_max_softrt_rate, 3154 HZ * bfqq->service_from_backlogged /
2958 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); 3155 bfqd->bfq_wr_max_softrt_rate,
3156 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
2959} 3157}
2960 3158
2961/** 3159/**
@@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
3000 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); 3198 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
3001 3199
3002 /* 3200 /*
3003 * Increase service_from_backlogged before next statement,
3004 * because the possible next invocation of
3005 * bfq_bfqq_charge_time would likely inflate
3006 * entity->service. In contrast, service_from_backlogged must
3007 * contain real service, to enable the soft real-time
3008 * heuristic to correctly compute the bandwidth consumed by
3009 * bfqq.
3010 */
3011 bfqq->service_from_backlogged += entity->service;
3012
3013 /*
3014 * As above explained, charge slow (typically seeky) and 3201 * As above explained, charge slow (typically seeky) and
3015 * timed-out queues with the time and not the service 3202 * timed-out queues with the time and not the service
3016 * received, to favor sequential workloads. 3203 * received, to favor sequential workloads.
@@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3535 bfqq->entity.prio_changed = 1; 3722 bfqq->entity.prio_changed = 1;
3536 } 3723 }
3537 } 3724 }
3725 if (bfqq->wr_coeff > 1 &&
3726 bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
3727 bfqq->service_from_wr > max_service_from_wr) {
3728 /* see comments on max_service_from_wr */
3729 bfq_bfqq_end_wr(bfqq);
3730 }
3538 } 3731 }
3539 /* 3732 /*
3540 * To improve latency (for this or other queues), immediately 3733 * To improve latency (for this or other queues), immediately
@@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3630 } 3823 }
3631 3824
3632 /* 3825 /*
3633 * We exploit the put_rq_private hook to decrement 3826 * We exploit the bfq_finish_request hook to decrement
3634 * rq_in_driver, but put_rq_private will not be 3827 * rq_in_driver, but bfq_finish_request will not be
3635 * invoked on this request. So, to avoid unbalance, 3828 * invoked on this request. So, to avoid unbalance,
3636 * just start this request, without incrementing 3829 * just start this request, without incrementing
3637 * rq_in_driver. As a negative consequence, 3830 * rq_in_driver. As a negative consequence,
@@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3640 * bfq_schedule_dispatch to be invoked uselessly. 3833 * bfq_schedule_dispatch to be invoked uselessly.
3641 * 3834 *
3642 * As for implementing an exact solution, the 3835 * As for implementing an exact solution, the
3643 * put_request hook, if defined, is probably invoked 3836 * bfq_finish_request hook, if defined, is probably
3644 * also on this request. So, by exploiting this hook, 3837 * invoked also on this request. So, by exploiting
3645 * we could 1) increment rq_in_driver here, and 2) 3838 * this hook, we could 1) increment rq_in_driver here,
3646 * decrement it in put_request. Such a solution would 3839 * and 2) decrement it in bfq_finish_request. Such a
3647 * let the value of the counter be always accurate, 3840 * solution would let the value of the counter be
3648 * but it would entail using an extra interface 3841 * always accurate, but it would entail using an extra
3649 * function. This cost seems higher than the benefit, 3842 * interface function. This cost seems higher than the
3650 * being the frequency of non-elevator-private 3843 * benefit, being the frequency of non-elevator-private
3651 * requests very low. 3844 * requests very low.
3652 */ 3845 */
3653 goto start_rq; 3846 goto start_rq;
@@ -3689,35 +3882,16 @@ exit:
3689 return rq; 3882 return rq;
3690} 3883}
3691 3884
3692static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3693{
3694 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3695 struct request *rq;
3696#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 3885#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3697 struct bfq_queue *in_serv_queue, *bfqq; 3886static void bfq_update_dispatch_stats(struct request_queue *q,
3698 bool waiting_rq, idle_timer_disabled; 3887 struct request *rq,
3699#endif 3888 struct bfq_queue *in_serv_queue,
3700 3889 bool idle_timer_disabled)
3701 spin_lock_irq(&bfqd->lock); 3890{
3702 3891 struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
3703#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3704 in_serv_queue = bfqd->in_service_queue;
3705 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
3706
3707 rq = __bfq_dispatch_request(hctx);
3708
3709 idle_timer_disabled =
3710 waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
3711
3712#else
3713 rq = __bfq_dispatch_request(hctx);
3714#endif
3715 spin_unlock_irq(&bfqd->lock);
3716 3892
3717#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
3718 bfqq = rq ? RQ_BFQQ(rq) : NULL;
3719 if (!idle_timer_disabled && !bfqq) 3893 if (!idle_timer_disabled && !bfqq)
3720 return rq; 3894 return;
3721 3895
3722 /* 3896 /*
3723 * rq and bfqq are guaranteed to exist until this function 3897 * rq and bfqq are guaranteed to exist until this function
@@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3732 * In addition, the following queue lock guarantees that 3906 * In addition, the following queue lock guarantees that
3733 * bfqq_group(bfqq) exists as well. 3907 * bfqq_group(bfqq) exists as well.
3734 */ 3908 */
3735 spin_lock_irq(hctx->queue->queue_lock); 3909 spin_lock_irq(q->queue_lock);
3736 if (idle_timer_disabled) 3910 if (idle_timer_disabled)
3737 /* 3911 /*
3738 * Since the idle timer has been disabled, 3912 * Since the idle timer has been disabled,
@@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3751 bfqg_stats_set_start_empty_time(bfqg); 3925 bfqg_stats_set_start_empty_time(bfqg);
3752 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); 3926 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
3753 } 3927 }
3754 spin_unlock_irq(hctx->queue->queue_lock); 3928 spin_unlock_irq(q->queue_lock);
3929}
3930#else
3931static inline void bfq_update_dispatch_stats(struct request_queue *q,
3932 struct request *rq,
3933 struct bfq_queue *in_serv_queue,
3934 bool idle_timer_disabled) {}
3755#endif 3935#endif
3756 3936
3937static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3938{
3939 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3940 struct request *rq;
3941 struct bfq_queue *in_serv_queue;
3942 bool waiting_rq, idle_timer_disabled;
3943
3944 spin_lock_irq(&bfqd->lock);
3945
3946 in_serv_queue = bfqd->in_service_queue;
3947 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
3948
3949 rq = __bfq_dispatch_request(hctx);
3950
3951 idle_timer_disabled =
3952 waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
3953
3954 spin_unlock_irq(&bfqd->lock);
3955
3956 bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
3957 idle_timer_disabled);
3958
3757 return rq; 3959 return rq;
3758} 3960}
3759 3961
@@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4002 bfqq->split_time = bfq_smallest_from_now(); 4204 bfqq->split_time = bfq_smallest_from_now();
4003 4205
4004 /* 4206 /*
4005 * Set to the value for which bfqq will not be deemed as 4207 * To not forget the possibly high bandwidth consumed by a
4006 * soft rt when it becomes backlogged. 4208 * process/queue in the recent past,
4209 * bfq_bfqq_softrt_next_start() returns a value at least equal
4210 * to the current value of bfqq->soft_rt_next_start (see
4211 * comments on bfq_bfqq_softrt_next_start). Set
4212 * soft_rt_next_start to now, to mean that bfqq has consumed
4213 * no bandwidth so far.
4007 */ 4214 */
4008 bfqq->soft_rt_next_start = bfq_greatest_from_now(); 4215 bfqq->soft_rt_next_start = jiffies;
4009 4216
4010 /* first request is almost certainly seeky */ 4217 /* first request is almost certainly seeky */
4011 bfqq->seek_history = 1; 4218 bfqq->seek_history = 1;
@@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4276 return idle_timer_disabled; 4483 return idle_timer_disabled;
4277} 4484}
4278 4485
4486#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4487static void bfq_update_insert_stats(struct request_queue *q,
4488 struct bfq_queue *bfqq,
4489 bool idle_timer_disabled,
4490 unsigned int cmd_flags)
4491{
4492 if (!bfqq)
4493 return;
4494
4495 /*
4496 * bfqq still exists, because it can disappear only after
4497 * either it is merged with another queue, or the process it
4498 * is associated with exits. But both actions must be taken by
4499 * the same process currently executing this flow of
4500 * instructions.
4501 *
4502 * In addition, the following queue lock guarantees that
4503 * bfqq_group(bfqq) exists as well.
4504 */
4505 spin_lock_irq(q->queue_lock);
4506 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4507 if (idle_timer_disabled)
4508 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4509 spin_unlock_irq(q->queue_lock);
4510}
4511#else
4512static inline void bfq_update_insert_stats(struct request_queue *q,
4513 struct bfq_queue *bfqq,
4514 bool idle_timer_disabled,
4515 unsigned int cmd_flags) {}
4516#endif
4517
4279static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 4518static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4280 bool at_head) 4519 bool at_head)
4281{ 4520{
4282 struct request_queue *q = hctx->queue; 4521 struct request_queue *q = hctx->queue;
4283 struct bfq_data *bfqd = q->elevator->elevator_data; 4522 struct bfq_data *bfqd = q->elevator->elevator_data;
4284#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4285 struct bfq_queue *bfqq = RQ_BFQQ(rq); 4523 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4286 bool idle_timer_disabled = false; 4524 bool idle_timer_disabled = false;
4287 unsigned int cmd_flags; 4525 unsigned int cmd_flags;
4288#endif
4289 4526
4290 spin_lock_irq(&bfqd->lock); 4527 spin_lock_irq(&bfqd->lock);
4291 if (blk_mq_sched_try_insert_merge(q, rq)) { 4528 if (blk_mq_sched_try_insert_merge(q, rq)) {
@@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4304 else 4541 else
4305 list_add_tail(&rq->queuelist, &bfqd->dispatch); 4542 list_add_tail(&rq->queuelist, &bfqd->dispatch);
4306 } else { 4543 } else {
4307#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4308 idle_timer_disabled = __bfq_insert_request(bfqd, rq); 4544 idle_timer_disabled = __bfq_insert_request(bfqd, rq);
4309 /* 4545 /*
4310 * Update bfqq, because, if a queue merge has occurred 4546 * Update bfqq, because, if a queue merge has occurred
@@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4312 * redirected into a new queue. 4548 * redirected into a new queue.
4313 */ 4549 */
4314 bfqq = RQ_BFQQ(rq); 4550 bfqq = RQ_BFQQ(rq);
4315#else
4316 __bfq_insert_request(bfqd, rq);
4317#endif
4318 4551
4319 if (rq_mergeable(rq)) { 4552 if (rq_mergeable(rq)) {
4320 elv_rqhash_add(q, rq); 4553 elv_rqhash_add(q, rq);
@@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4323 } 4556 }
4324 } 4557 }
4325 4558
4326#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4327 /* 4559 /*
4328 * Cache cmd_flags before releasing scheduler lock, because rq 4560 * Cache cmd_flags before releasing scheduler lock, because rq
4329 * may disappear afterwards (for example, because of a request 4561 * may disappear afterwards (for example, because of a request
4330 * merge). 4562 * merge).
4331 */ 4563 */
4332 cmd_flags = rq->cmd_flags; 4564 cmd_flags = rq->cmd_flags;
4333#endif 4565
4334 spin_unlock_irq(&bfqd->lock); 4566 spin_unlock_irq(&bfqd->lock);
4335 4567
4336#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 4568 bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
4337 if (!bfqq) 4569 cmd_flags);
4338 return;
4339 /*
4340 * bfqq still exists, because it can disappear only after
4341 * either it is merged with another queue, or the process it
4342 * is associated with exits. But both actions must be taken by
4343 * the same process currently executing this flow of
4344 * instruction.
4345 *
4346 * In addition, the following queue lock guarantees that
4347 * bfqq_group(bfqq) exists as well.
4348 */
4349 spin_lock_irq(q->queue_lock);
4350 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4351 if (idle_timer_disabled)
4352 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4353 spin_unlock_irq(q->queue_lock);
4354#endif
4355} 4570}
4356 4571
4357static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, 4572static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
4482 bfq_schedule_dispatch(bfqd); 4697 bfq_schedule_dispatch(bfqd);
4483} 4698}
4484 4699
4485static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) 4700static void bfq_finish_request_body(struct bfq_queue *bfqq)
4486{ 4701{
4487 bfqq->allocated--; 4702 bfqq->allocated--;
4488 4703
@@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)
4512 spin_lock_irqsave(&bfqd->lock, flags); 4727 spin_lock_irqsave(&bfqd->lock, flags);
4513 4728
4514 bfq_completed_request(bfqq, bfqd); 4729 bfq_completed_request(bfqq, bfqd);
4515 bfq_put_rq_priv_body(bfqq); 4730 bfq_finish_request_body(bfqq);
4516 4731
4517 spin_unlock_irqrestore(&bfqd->lock, flags); 4732 spin_unlock_irqrestore(&bfqd->lock, flags);
4518 } else { 4733 } else {
@@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)
4533 bfqg_stats_update_io_remove(bfqq_group(bfqq), 4748 bfqg_stats_update_io_remove(bfqq_group(bfqq),
4534 rq->cmd_flags); 4749 rq->cmd_flags);
4535 } 4750 }
4536 bfq_put_rq_priv_body(bfqq); 4751 bfq_finish_request_body(bfqq);
4537 } 4752 }
4538 4753
4539 rq->elv.priv[0] = NULL; 4754 rq->elv.priv[0] = NULL;
@@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
4818 hrtimer_cancel(&bfqd->idle_slice_timer); 5033 hrtimer_cancel(&bfqd->idle_slice_timer);
4819 5034
4820#ifdef CONFIG_BFQ_GROUP_IOSCHED 5035#ifdef CONFIG_BFQ_GROUP_IOSCHED
5036 /* release oom-queue reference to root group */
5037 bfqg_and_blkg_put(bfqd->root_group);
5038
4821 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); 5039 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
4822#else 5040#else
4823 spin_lock_irq(&bfqd->lock); 5041 spin_lock_irq(&bfqd->lock);
@@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
5206 5424
5207static struct elevator_type iosched_bfq_mq = { 5425static struct elevator_type iosched_bfq_mq = {
5208 .ops.mq = { 5426 .ops.mq = {
5427 .limit_depth = bfq_limit_depth,
5209 .prepare_request = bfq_prepare_request, 5428 .prepare_request = bfq_prepare_request,
5210 .finish_request = bfq_finish_request, 5429 .finish_request = bfq_finish_request,
5211 .exit_icq = bfq_exit_icq, 5430 .exit_icq = bfq_exit_icq,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 91c4390903a1..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
337 * last transition from idle to backlogged. 337 * last transition from idle to backlogged.
338 */ 338 */
339 unsigned long service_from_backlogged; 339 unsigned long service_from_backlogged;
340 /*
341 * Cumulative service received from the @bfq_queue since its
342 * last transition to weight-raised state.
343 */
344 unsigned long service_from_wr;
340 345
341 /* 346 /*
342 * Value of wr start time when switching to soft rt 347 * Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
344 unsigned long wr_start_at_switch_to_srt; 349 unsigned long wr_start_at_switch_to_srt;
345 350
346 unsigned long split_time; /* time of last split */ 351 unsigned long split_time; /* time of last split */
352
353 unsigned long first_IO_time; /* time of first I/O for this queue */
347}; 354};
348 355
349/** 356/**
@@ -627,6 +634,18 @@ struct bfq_data {
627 struct bfq_io_cq *bio_bic; 634 struct bfq_io_cq *bio_bic;
628 /* bfqq associated with the task issuing current bio for merging */ 635 /* bfqq associated with the task issuing current bio for merging */
629 struct bfq_queue *bio_bfqq; 636 struct bfq_queue *bio_bfqq;
637
638 /*
639 * Cached sbitmap shift, used to compute depth limits in
640 * bfq_update_depths.
641 */
642 unsigned int sb_shift;
643
644 /*
645 * Depth limits used in bfq_limit_depth (see comments on the
646 * function)
647 */
648 unsigned int word_depths[2][2];
630}; 649};
631 650
632enum bfqq_state_flags { 651enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index e495d3f9b4b0..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
835 struct bfq_entity *entity = &bfqq->entity; 835 struct bfq_entity *entity = &bfqq->entity;
836 struct bfq_service_tree *st; 836 struct bfq_service_tree *st;
837 837
838 if (!bfqq->service_from_backlogged)
839 bfqq->first_IO_time = jiffies;
840
841 if (bfqq->wr_coeff > 1)
842 bfqq->service_from_wr += served;
843
844 bfqq->service_from_backlogged += served;
838 for_each_entity(entity) { 845 for_each_entity(entity) {
839 st = bfq_entity_service_tree(entity); 846 st = bfq_entity_service_tree(entity);
840 847
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
374/** 374/**
375 * __bio_integrity_endio - Integrity I/O completion function 375 * __bio_integrity_endio - Integrity I/O completion function
376 * @bio: Protected bio 376 * @bio: Protected bio
377 * @error: Pointer to errno
378 * 377 *
379 * Description: Completion for integrity I/O 378 * Description: Completion for integrity I/O
380 * 379 *
diff --git a/block/bio.c b/block/bio.c
index 9ef6cf3addb3..e1708db48258 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -971,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
971EXPORT_SYMBOL(bio_advance); 971EXPORT_SYMBOL(bio_advance);
972 972
973/** 973/**
974 * bio_alloc_pages - allocates a single page for each bvec in a bio
975 * @bio: bio to allocate pages for
976 * @gfp_mask: flags for allocation
977 *
978 * Allocates pages up to @bio->bi_vcnt.
979 *
980 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
981 * freed.
982 */
983int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
984{
985 int i;
986 struct bio_vec *bv;
987
988 bio_for_each_segment_all(bv, bio, i) {
989 bv->bv_page = alloc_page(gfp_mask);
990 if (!bv->bv_page) {
991 while (--bv >= bio->bi_io_vec)
992 __free_page(bv->bv_page);
993 return -ENOMEM;
994 }
995 }
996
997 return 0;
998}
999EXPORT_SYMBOL(bio_alloc_pages);
1000
1001/**
1002 * bio_copy_data - copy contents of data buffers from one chain of bios to 974 * bio_copy_data - copy contents of data buffers from one chain of bios to
1003 * another 975 * another
1004 * @src: source bio list 976 * @src: source bio list
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
1838 bio_advance(bio, split->bi_iter.bi_size); 1810 bio_advance(bio, split->bi_iter.bi_size);
1839 1811
1840 if (bio_flagged(bio, BIO_TRACE_COMPLETION)) 1812 if (bio_flagged(bio, BIO_TRACE_COMPLETION))
1841 bio_set_flag(bio, BIO_TRACE_COMPLETION); 1813 bio_set_flag(split, BIO_TRACE_COMPLETION);
1842 1814
1843 return split; 1815 return split;
1844} 1816}
diff --git a/block/blk-core.c b/block/blk-core.c
index 3ba4326a63b5..a2005a485335 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
126 rq->start_time = jiffies; 126 rq->start_time = jiffies;
127 set_start_time_ns(rq); 127 set_start_time_ns(rq);
128 rq->part = NULL; 128 rq->part = NULL;
129 seqcount_init(&rq->gstate_seq);
130 u64_stats_init(&rq->aborted_gstate_sync);
129} 131}
130EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
131 133
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
699 queue_flag_set(QUEUE_FLAG_DEAD, q); 701 queue_flag_set(QUEUE_FLAG_DEAD, q);
700 spin_unlock_irq(lock); 702 spin_unlock_irq(lock);
701 703
704 /*
705 * make sure all in-progress dispatch are completed because
706 * blk_freeze_queue() can only complete all requests, and
707 * dispatch may still be in-progress since we dispatch requests
708 * from more than one contexts
709 */
710 if (q->mq_ops)
711 blk_mq_quiesce_queue(q);
712
702 /* for synchronous bio-based driver finish in-flight integrity i/o */ 713 /* for synchronous bio-based driver finish in-flight integrity i/o */
703 blk_flush_integrity(); 714 blk_flush_integrity();
704 715
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1646 1657
1647 lockdep_assert_held(q->queue_lock); 1658 lockdep_assert_held(q->queue_lock);
1648 1659
1660 blk_req_zone_write_unlock(req);
1649 blk_pm_put_request(req); 1661 blk_pm_put_request(req);
1650 1662
1651 elv_completed_request(q, req); 1663 elv_completed_request(q, req);
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
2055 2067
2056#endif /* CONFIG_FAIL_MAKE_REQUEST */ 2068#endif /* CONFIG_FAIL_MAKE_REQUEST */
2057 2069
2070static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
2071{
2072 if (part->policy && op_is_write(bio_op(bio))) {
2073 char b[BDEVNAME_SIZE];
2074
2075 printk(KERN_ERR
2076 "generic_make_request: Trying to write "
2077 "to read-only block-device %s (partno %d)\n",
2078 bio_devname(bio, b), part->partno);
2079 return true;
2080 }
2081
2082 return false;
2083}
2084
2058/* 2085/*
2059 * Remap block n of partition p to block n+start(p) of the disk. 2086 * Remap block n of partition p to block n+start(p) of the disk.
2060 */ 2087 */
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
2063 struct hd_struct *p; 2090 struct hd_struct *p;
2064 int ret = 0; 2091 int ret = 0;
2065 2092
2093 rcu_read_lock();
2094 p = __disk_get_part(bio->bi_disk, bio->bi_partno);
2095 if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
2096 bio_check_ro(bio, p))) {
2097 ret = -EIO;
2098 goto out;
2099 }
2100
2066 /* 2101 /*
2067 * Zone reset does not include bi_size so bio_sectors() is always 0. 2102 * Zone reset does not include bi_size so bio_sectors() is always 0.
2068 * Include a test for the reset op code and perform the remap if needed. 2103 * Include a test for the reset op code and perform the remap if needed.
2069 */ 2104 */
2070 if (!bio->bi_partno || 2105 if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
2071 (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) 2106 goto out;
2072 return 0;
2073 2107
2074 rcu_read_lock(); 2108 bio->bi_iter.bi_sector += p->start_sect;
2075 p = __disk_get_part(bio->bi_disk, bio->bi_partno); 2109 bio->bi_partno = 0;
2076 if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { 2110 trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
2077 bio->bi_iter.bi_sector += p->start_sect; 2111 bio->bi_iter.bi_sector - p->start_sect);
2078 bio->bi_partno = 0;
2079 trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
2080 bio->bi_iter.bi_sector - p->start_sect);
2081 } else {
2082 printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
2083 ret = -EIO;
2084 }
2085 rcu_read_unlock();
2086 2112
2113out:
2114 rcu_read_unlock();
2087 return ret; 2115 return ret;
2088} 2116}
2089 2117
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
2142 * For a REQ_NOWAIT based request, return -EOPNOTSUPP 2170 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2143 * if queue is not a request based queue. 2171 * if queue is not a request based queue.
2144 */ 2172 */
2145
2146 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) 2173 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
2147 goto not_supported; 2174 goto not_supported;
2148 2175
2149 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) 2176 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
2150 goto end_io; 2177 goto end_io;
2151 2178
2152 if (blk_partition_remap(bio)) 2179 if (!bio->bi_partno) {
2153 goto end_io; 2180 if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
2181 goto end_io;
2182 } else {
2183 if (blk_partition_remap(bio))
2184 goto end_io;
2185 }
2154 2186
2155 if (bio_check_eod(bio, nr_sectors)) 2187 if (bio_check_eod(bio, nr_sectors))
2156 goto end_io; 2188 goto end_io;
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
2493 * bypass a potential scheduler on the bottom device for 2525 * bypass a potential scheduler on the bottom device for
2494 * insert. 2526 * insert.
2495 */ 2527 */
2496 blk_mq_request_bypass_insert(rq, true); 2528 return blk_mq_request_issue_directly(rq);
2497 return BLK_STS_OK;
2498 } 2529 }
2499 2530
2500 spin_lock_irqsave(q->queue_lock, flags); 2531 spin_lock_irqsave(q->queue_lock, flags);
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
2846 wbt_issue(req->q->rq_wb, &req->issue_stat); 2877 wbt_issue(req->q->rq_wb, &req->issue_stat);
2847 } 2878 }
2848 2879
2849 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); 2880 BUG_ON(blk_rq_is_complete(req));
2850 blk_add_timer(req); 2881 blk_add_timer(req);
2851} 2882}
2852EXPORT_SYMBOL(blk_start_request); 2883EXPORT_SYMBOL(blk_start_request);
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
3415} 3446}
3416EXPORT_SYMBOL(kblockd_mod_delayed_work_on); 3447EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
3417 3448
3418int kblockd_schedule_delayed_work(struct delayed_work *dwork,
3419 unsigned long delay)
3420{
3421 return queue_delayed_work(kblockd_workqueue, dwork, delay);
3422}
3423EXPORT_SYMBOL(kblockd_schedule_delayed_work);
3424
3425int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
3426 unsigned long delay)
3427{
3428 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
3429}
3430EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
3431
3432/** 3449/**
3433 * blk_start_plug - initialize blk_plug and track it inside the task_struct 3450 * blk_start_plug - initialize blk_plug and track it inside the task_struct
3434 * @plug: The &struct blk_plug that needs to be initialized 3451 * @plug: The &struct blk_plug that needs to be initialized
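Several of the blk-core.c hunks above revolve around the new bio_check_ro() helper: a write-class operation aimed at a read-only partition is logged and failed, both on the partition-remap path and for bios that target part0 directly, and the blk-lib.c helpers further down add the matching bdev_read_only() guards that return -EPERM before any bios are built. A rough userspace sketch of the check; the op encoding and field names here are stand-ins, not the kernel's REQ_OP_* or hd_struct definitions:

    #include <stdbool.h>
    #include <stdio.h>

    enum op { OP_READ, OP_WRITE, OP_DISCARD, OP_WRITE_ZEROES };

    static bool op_is_write(enum op op)
    {
        return op != OP_READ;       /* discard/write-zeroes count as writes */
    }

    struct part {
        const char *name;
        int partno;
        bool read_only;             /* "policy" in the kernel's hd_struct */
    };

    static bool check_ro(const struct part *p, enum op op)
    {
        if (p->read_only && op_is_write(op)) {
            fprintf(stderr,
                "trying to write to read-only block device %s (partno %d)\n",
                p->name, p->partno);
            return true;            /* caller fails the request */
        }
        return false;
    }

    int main(void)
    {
        struct part p = { .name = "sda1", .partno = 1, .read_only = true };

        printf("rejected: %d\n", check_ro(&p, OP_WRITE_ZEROES));
        return 0;
    }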
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
61 * be reused after dying flag is set 61 * be reused after dying flag is set
62 */ 62 */
63 if (q->mq_ops) { 63 if (q->mq_ops) {
64 blk_mq_sched_insert_request(rq, at_head, true, false, false); 64 blk_mq_sched_insert_request(rq, at_head, true, false);
65 return; 65 return;
66 } 66 }
67 67
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2bc544ce3d2e..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
37 if (!q) 37 if (!q)
38 return -ENXIO; 38 return -ENXIO;
39 39
40 if (bdev_read_only(bdev))
41 return -EPERM;
42
40 if (flags & BLKDEV_DISCARD_SECURE) { 43 if (flags & BLKDEV_DISCARD_SECURE) {
41 if (!blk_queue_secure_erase(q)) 44 if (!blk_queue_secure_erase(q))
42 return -EOPNOTSUPP; 45 return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
156 if (!q) 159 if (!q)
157 return -ENXIO; 160 return -ENXIO;
158 161
162 if (bdev_read_only(bdev))
163 return -EPERM;
164
159 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; 165 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
160 if ((sector | nr_sects) & bs_mask) 166 if ((sector | nr_sects) & bs_mask)
161 return -EINVAL; 167 return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
233 if (!q) 239 if (!q)
234 return -ENXIO; 240 return -ENXIO;
235 241
242 if (bdev_read_only(bdev))
243 return -EPERM;
244
236 /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ 245 /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
237 max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); 246 max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
238 247
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
287 if (!q) 296 if (!q)
288 return -ENXIO; 297 return -ENXIO;
289 298
299 if (bdev_read_only(bdev))
300 return -EPERM;
301
290 while (nr_sects != 0) { 302 while (nr_sects != 0) {
291 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), 303 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
292 gfp_mask); 304 gfp_mask);
diff --git a/block/blk-map.c b/block/blk-map.c
index d3a94719f03f..db9373bd31ac 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
119 unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); 119 unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
120 struct bio *bio = NULL; 120 struct bio *bio = NULL;
121 struct iov_iter i; 121 struct iov_iter i;
122 int ret; 122 int ret = -EINVAL;
123 123
124 if (!iter_is_iovec(iter)) 124 if (!iter_is_iovec(iter))
125 goto fail; 125 goto fail;
@@ -148,7 +148,7 @@ unmap_rq:
148 __blk_rq_unmap_user(bio); 148 __blk_rq_unmap_user(bio);
149fail: 149fail:
150 rq->bio = NULL; 150 rq->bio = NULL;
151 return -EINVAL; 151 return ret;
152} 152}
153EXPORT_SYMBOL(blk_rq_map_user_iov); 153EXPORT_SYMBOL(blk_rq_map_user_iov);
154 154
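The blk-map.c change above stops blk_rq_map_user_iov() from collapsing every failure into -EINVAL; the error actually hit while mapping is preserved and only the "nothing valid to map" cases keep the -EINVAL default. The pattern in isolation, with copy_chunk() as a made-up stand-in for the failing helper:

    #include <errno.h>
    #include <stdio.h>

    static int copy_chunk(int idx)
    {
        return idx == 2 ? -ENOMEM : 0;      /* pretend the third chunk fails */
    }

    static int map_chunks(int nr)
    {
        int ret = -EINVAL;                  /* default: invalid input */
        int i;

        if (nr <= 0)
            goto fail;

        for (i = 0; i < nr; i++) {
            ret = copy_chunk(i);
            if (ret)
                goto fail;                  /* propagate the real error */
        }
        return 0;

    fail:
        return ret;                         /* -ENOMEM here, not -EINVAL */
    }

    int main(void)
    {
        printf("map_chunks(4) = %d\n", map_chunks(4));
        return 0;
    }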
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5dedd57dff6..8452fc7164cc 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
128 nsegs++; 128 nsegs++;
129 sectors = max_sectors; 129 sectors = max_sectors;
130 } 130 }
131 if (sectors) 131 goto split;
132 goto split;
133 /* Make this single bvec as the 1st segment */
134 } 132 }
135 133
136 if (bvprvp && blk_queue_cluster(q)) { 134 if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
146 bvprvp = &bvprv; 144 bvprvp = &bvprv;
147 sectors += bv.bv_len >> 9; 145 sectors += bv.bv_len >> 9;
148 146
149 if (nsegs == 1 && seg_size > front_seg_size)
150 front_seg_size = seg_size;
151 continue; 147 continue;
152 } 148 }
153new_segment: 149new_segment:
154 if (nsegs == queue_max_segments(q)) 150 if (nsegs == queue_max_segments(q))
155 goto split; 151 goto split;
156 152
153 if (nsegs == 1 && seg_size > front_seg_size)
154 front_seg_size = seg_size;
155
157 nsegs++; 156 nsegs++;
158 bvprv = bv; 157 bvprv = bv;
159 bvprvp = &bvprv; 158 bvprvp = &bvprv;
160 seg_size = bv.bv_len; 159 seg_size = bv.bv_len;
161 sectors += bv.bv_len >> 9; 160 sectors += bv.bv_len >> 9;
162 161
163 if (nsegs == 1 && seg_size > front_seg_size)
164 front_seg_size = seg_size;
165 } 162 }
166 163
167 do_split = false; 164 do_split = false;
@@ -174,6 +171,8 @@ split:
174 bio = new; 171 bio = new;
175 } 172 }
176 173
174 if (nsegs == 1 && seg_size > front_seg_size)
175 front_seg_size = seg_size;
177 bio->bi_seg_front_size = front_seg_size; 176 bio->bi_seg_front_size = front_seg_size;
178 if (seg_size > bio->bi_seg_back_size) 177 if (seg_size > bio->bi_seg_back_size)
179 bio->bi_seg_back_size = seg_size; 178 bio->bi_seg_back_size = seg_size;
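The blk-merge.c hunk moves the front_seg_size update so that the size of the first segment is also captured when the scan ends, or splits, while still inside that first segment, instead of only when a second segment is started. A simplified, self-contained version of the loop, using fixed-length chunks instead of bvecs and a made-up MAX_SEG_SIZE:

    #include <stdio.h>

    #define MAX_SEG_SIZE 16

    static unsigned scan_segments(const unsigned *len, int n, unsigned *front_seg_size)
    {
        unsigned nsegs = 0, seg_size = 0, front = 0;

        for (int i = 0; i < n; i++) {
            if (nsegs && seg_size + len[i] <= MAX_SEG_SIZE) {
                seg_size += len[i];         /* merge into the current segment */
                continue;
            }
            /* starting a new segment: close out the previous one */
            if (nsegs == 1 && seg_size > front)
                front = seg_size;
            nsegs++;
            seg_size = len[i];
        }
        /* the fix: the first segment may also be the last one */
        if (nsegs == 1 && seg_size > front)
            front = seg_size;

        *front_seg_size = front;
        return nsegs;
    }

    int main(void)
    {
        unsigned lens[] = { 4, 4, 20, 8 };
        unsigned front;
        unsigned nsegs = scan_segments(lens, 4, &front);

        printf("nsegs=%u front_seg_size=%u\n", nsegs, front);
        return 0;
    }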
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b56a4f35720d..21cbc1f071c6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
289 RQF_NAME(HASHED), 289 RQF_NAME(HASHED),
290 RQF_NAME(STATS), 290 RQF_NAME(STATS),
291 RQF_NAME(SPECIAL_PAYLOAD), 291 RQF_NAME(SPECIAL_PAYLOAD),
292 RQF_NAME(ZONE_WRITE_LOCKED),
293 RQF_NAME(MQ_TIMEOUT_EXPIRED),
294 RQF_NAME(MQ_POLL_SLEPT),
292}; 295};
293#undef RQF_NAME 296#undef RQF_NAME
294 297
295#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
296static const char *const rqaf_name[] = {
297 RQAF_NAME(COMPLETE),
298 RQAF_NAME(STARTED),
299 RQAF_NAME(POLL_SLEPT),
300};
301#undef RQAF_NAME
302
303int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) 298int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
304{ 299{
305 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; 300 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
316 seq_puts(m, ", .rq_flags="); 311 seq_puts(m, ", .rq_flags=");
317 blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, 312 blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
318 ARRAY_SIZE(rqf_name)); 313 ARRAY_SIZE(rqf_name));
319 seq_puts(m, ", .atomic_flags="); 314 seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
320 blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
321 seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, 315 seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
322 rq->internal_tag); 316 rq->internal_tag);
323 if (mq_ops->show_rq) 317 if (mq_ops->show_rq)
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
409 const struct show_busy_params *params = data; 403 const struct show_busy_params *params = data;
410 404
411 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && 405 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
412 test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 406 blk_mq_rq_state(rq) != MQ_RQ_IDLE)
413 __blk_mq_debugfs_rq_show(params->m, 407 __blk_mq_debugfs_rq_show(params->m,
414 list_entry_rq(&rq->queuelist)); 408 list_entry_rq(&rq->queuelist));
415} 409}
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
703 const struct blk_mq_debugfs_attr *attr = m->private; 697 const struct blk_mq_debugfs_attr *attr = m->private;
704 void *data = d_inode(file->f_path.dentry->d_parent)->i_private; 698 void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
705 699
706 if (!attr->write) 700 /*
701 * Attributes that only implement .seq_ops are read-only and 'attr' is
702 * the same with 'data' in this case.
703 */
704 if (attr == data || !attr->write)
707 return -EPERM; 705 return -EPERM;
708 706
709 return attr->write(data, buf, count, ppos); 707 return attr->write(data, buf, count, ppos);
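The blk-mq-debugfs.c fix above matters because attributes that only provide .seq_ops register the attribute itself as the inode's private data, so a write would have treated the attribute as the queue or hctx object. A small userspace analogue of the guard, with simplified stand-in structures rather than the real blk_mq_debugfs_attr:

    #include <errno.h>
    #include <stddef.h>
    #include <stdio.h>

    struct dbg_attr {
        const char *name;
        int (*show)(void *data);
        int (*write)(void *data, const char *buf, size_t len);
    };

    static int attr_write(const struct dbg_attr *attr, void *data,
                          const char *buf, size_t len)
    {
        /*
         * For seq_ops-only attributes the attribute doubles as the private
         * data, hence the attr == data comparison; either way, a missing
         * ->write handler means the file is read-only.
         */
        if ((const void *)attr == data || !attr->write)
            return -EPERM;

        return attr->write(data, buf, len);
    }

    static int show_state(void *data) { (void)data; return 0; }

    int main(void)
    {
        struct dbg_attr ro = { .name = "state", .show = show_state };

        printf("write to %s: %d\n", ro.name, attr_write(&ro, &ro, "x", 1));
        return 0;
    }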
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c117bd8fd1f6..55c0a745b427 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
172 WRITE_ONCE(hctx->dispatch_from, ctx); 172 WRITE_ONCE(hctx->dispatch_from, ctx);
173} 173}
174 174
175/* return true if hw queue need to be run again */
176void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) 175void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
177{ 176{
178 struct request_queue *q = hctx->queue; 177 struct request_queue *q = hctx->queue;
@@ -428,7 +427,7 @@ done:
428} 427}
429 428
430void blk_mq_sched_insert_request(struct request *rq, bool at_head, 429void blk_mq_sched_insert_request(struct request *rq, bool at_head,
431 bool run_queue, bool async, bool can_block) 430 bool run_queue, bool async)
432{ 431{
433 struct request_queue *q = rq->q; 432 struct request_queue *q = rq->q;
434 struct elevator_queue *e = q->elevator; 433 struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
19 19
20void blk_mq_sched_insert_request(struct request *rq, bool at_head, 20void blk_mq_sched_insert_request(struct request *rq, bool at_head,
21 bool run_queue, bool async, bool can_block); 21 bool run_queue, bool async);
22void blk_mq_sched_insert_requests(struct request_queue *q, 22void blk_mq_sched_insert_requests(struct request_queue *q,
23 struct blk_mq_ctx *ctx, 23 struct blk_mq_ctx *ctx,
24 struct list_head *list, bool run_queue_async); 24 struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
248 return ret; 248 return ret;
249} 249}
250 250
251static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 251void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
252{ 252{
253 struct blk_mq_hw_ctx *hctx; 253 struct blk_mq_hw_ctx *hctx;
254 int i; 254 int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
265 q->mq_sysfs_init_done = false; 265 q->mq_sysfs_init_done = false;
266} 266}
267 267
268void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
269{
270 mutex_lock(&q->sysfs_lock);
271 __blk_mq_unregister_dev(dev, q);
272 mutex_unlock(&q->sysfs_lock);
273}
274
275void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) 268void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
276{ 269{
277 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 270 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c81b40ecd3f1..336dde07b230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
134 ws = bt_wait_ptr(bt, data->hctx); 134 ws = bt_wait_ptr(bt, data->hctx);
135 drop_ctx = data->ctx == NULL; 135 drop_ctx = data->ctx == NULL;
136 do { 136 do {
137 prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
138
139 tag = __blk_mq_get_tag(data, bt);
140 if (tag != -1)
141 break;
142
143 /* 137 /*
144 * We're out of tags on this hardware queue, kick any 138 * We're out of tags on this hardware queue, kick any
145 * pending IO submits before going to sleep waiting for 139 * pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
155 if (tag != -1) 149 if (tag != -1)
156 break; 150 break;
157 151
152 prepare_to_wait_exclusive(&ws->wait, &wait,
153 TASK_UNINTERRUPTIBLE);
154
155 tag = __blk_mq_get_tag(data, bt);
156 if (tag != -1)
157 break;
158
158 if (data->ctx) 159 if (data->ctx)
159 blk_mq_put_ctx(data->ctx); 160 blk_mq_put_ctx(data->ctx);
160 161
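The blk-mq-tag.c reordering above does two things: it kicks the hardware queue before going to sleep, in case running pending submissions frees a tag, and it switches to an exclusive wait so a freed tag wakes only one waiter instead of the whole herd. Below is a loose pthread analogue of that shape; the mutex/condvar pair only stands in for the wait-queue discipline and does not reproduce the lockless prepare-then-recheck ordering the kernel relies on, and kick_pending_io() is a no-op placeholder for running the queue:

    #include <pthread.h>
    #include <stdio.h>

    struct tag_pool {
        pthread_mutex_t lock;
        pthread_cond_t free_tag;       /* one signal per freed tag */
        int available;
    };

    /* placeholder for "run the hw queue so pending I/O may free a tag" */
    static void kick_pending_io(struct tag_pool *p) { (void)p; }

    static int try_get_tag(struct tag_pool *p)
    {
        int ok;

        pthread_mutex_lock(&p->lock);
        ok = p->available > 0;
        if (ok)
            p->available--;
        pthread_mutex_unlock(&p->lock);
        return ok ? 0 : -1;
    }

    static int get_tag(struct tag_pool *p)
    {
        if (try_get_tag(p) == 0)
            return 0;

        /* kick first: a queue run may free a tag and spare us the sleep */
        kick_pending_io(p);
        if (try_get_tag(p) == 0)
            return 0;

        /* sleep; pthread_cond_signal() in put_tag() wakes a single waiter,
         * much like prepare_to_wait_exclusive() limits wakeups to one task */
        pthread_mutex_lock(&p->lock);
        while (p->available == 0)
            pthread_cond_wait(&p->free_tag, &p->lock);
        p->available--;
        pthread_mutex_unlock(&p->lock);
        return 0;
    }

    static void put_tag(struct tag_pool *p)
    {
        pthread_mutex_lock(&p->lock);
        p->available++;
        pthread_mutex_unlock(&p->lock);
        pthread_cond_signal(&p->free_tag);
    }

    int main(void)
    {
        static struct tag_pool p = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .free_tag = PTHREAD_COND_INITIALIZER,
            .available = 1,
        };

        get_tag(&p);
        put_tag(&p);
        printf("available=%d\n", p.available);
        return 0;
    }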
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3d3797327491..01f271d40825 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
95{ 95{
96 struct mq_inflight *mi = priv; 96 struct mq_inflight *mi = priv;
97 97
98 if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && 98 if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
99 !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
100 /* 99 /*
101 * index[0] counts the specific partition that was asked 100 * index[0] counts the specific partition that was asked
102 * for. index[1] counts the ones that are active on the 101 * for. index[1] counts the ones that are active on the
@@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
222 221
223 queue_for_each_hw_ctx(q, hctx, i) { 222 queue_for_each_hw_ctx(q, hctx, i) {
224 if (hctx->flags & BLK_MQ_F_BLOCKING) 223 if (hctx->flags & BLK_MQ_F_BLOCKING)
225 synchronize_srcu(hctx->queue_rq_srcu); 224 synchronize_srcu(hctx->srcu);
226 else 225 else
227 rcu = true; 226 rcu = true;
228 } 227 }
@@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
272{ 271{
273 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 272 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
274 struct request *rq = tags->static_rqs[tag]; 273 struct request *rq = tags->static_rqs[tag];
275 274 req_flags_t rq_flags = 0;
276 rq->rq_flags = 0;
277 275
278 if (data->flags & BLK_MQ_REQ_INTERNAL) { 276 if (data->flags & BLK_MQ_REQ_INTERNAL) {
279 rq->tag = -1; 277 rq->tag = -1;
280 rq->internal_tag = tag; 278 rq->internal_tag = tag;
281 } else { 279 } else {
282 if (blk_mq_tag_busy(data->hctx)) { 280 if (blk_mq_tag_busy(data->hctx)) {
283 rq->rq_flags = RQF_MQ_INFLIGHT; 281 rq_flags = RQF_MQ_INFLIGHT;
284 atomic_inc(&data->hctx->nr_active); 282 atomic_inc(&data->hctx->nr_active);
285 } 283 }
286 rq->tag = tag; 284 rq->tag = tag;
@@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
288 data->hctx->tags->rqs[rq->tag] = rq; 286 data->hctx->tags->rqs[rq->tag] = rq;
289 } 287 }
290 288
291 INIT_LIST_HEAD(&rq->queuelist);
292 /* csd/requeue_work/fifo_time is initialized before use */ 289 /* csd/requeue_work/fifo_time is initialized before use */
293 rq->q = data->q; 290 rq->q = data->q;
294 rq->mq_ctx = data->ctx; 291 rq->mq_ctx = data->ctx;
292 rq->rq_flags = rq_flags;
293 rq->cpu = -1;
295 rq->cmd_flags = op; 294 rq->cmd_flags = op;
296 if (data->flags & BLK_MQ_REQ_PREEMPT) 295 if (data->flags & BLK_MQ_REQ_PREEMPT)
297 rq->rq_flags |= RQF_PREEMPT; 296 rq->rq_flags |= RQF_PREEMPT;
298 if (blk_queue_io_stat(data->q)) 297 if (blk_queue_io_stat(data->q))
299 rq->rq_flags |= RQF_IO_STAT; 298 rq->rq_flags |= RQF_IO_STAT;
300 /* do not touch atomic flags, it needs atomic ops against the timer */ 299 INIT_LIST_HEAD(&rq->queuelist);
301 rq->cpu = -1;
302 INIT_HLIST_NODE(&rq->hash); 300 INIT_HLIST_NODE(&rq->hash);
303 RB_CLEAR_NODE(&rq->rb_node); 301 RB_CLEAR_NODE(&rq->rb_node);
304 rq->rq_disk = NULL; 302 rq->rq_disk = NULL;
305 rq->part = NULL; 303 rq->part = NULL;
306 rq->start_time = jiffies; 304 rq->start_time = jiffies;
307#ifdef CONFIG_BLK_CGROUP
308 rq->rl = NULL;
309 set_start_time_ns(rq);
310 rq->io_start_time_ns = 0;
311#endif
312 rq->nr_phys_segments = 0; 305 rq->nr_phys_segments = 0;
313#if defined(CONFIG_BLK_DEV_INTEGRITY) 306#if defined(CONFIG_BLK_DEV_INTEGRITY)
314 rq->nr_integrity_segments = 0; 307 rq->nr_integrity_segments = 0;
@@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
316 rq->special = NULL; 309 rq->special = NULL;
317 /* tag was already set */ 310 /* tag was already set */
318 rq->extra_len = 0; 311 rq->extra_len = 0;
312 rq->__deadline = 0;
319 313
320 INIT_LIST_HEAD(&rq->timeout_list); 314 INIT_LIST_HEAD(&rq->timeout_list);
321 rq->timeout = 0; 315 rq->timeout = 0;
@@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
324 rq->end_io_data = NULL; 318 rq->end_io_data = NULL;
325 rq->next_rq = NULL; 319 rq->next_rq = NULL;
326 320
321#ifdef CONFIG_BLK_CGROUP
322 rq->rl = NULL;
323 set_start_time_ns(rq);
324 rq->io_start_time_ns = 0;
325#endif
326
327 data->ctx->rq_dispatched[op_is_sync(op)]++; 327 data->ctx->rq_dispatched[op_is_sync(op)]++;
328 return rq; 328 return rq;
329} 329}
@@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
443 blk_queue_exit(q); 443 blk_queue_exit(q);
444 return ERR_PTR(-EXDEV); 444 return ERR_PTR(-EXDEV);
445 } 445 }
446 cpu = cpumask_first(alloc_data.hctx->cpumask); 446 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
447 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 447 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
448 448
449 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 449 rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq)
485 if (blk_rq_rl(rq)) 485 if (blk_rq_rl(rq))
486 blk_put_rl(blk_rq_rl(rq)); 486 blk_put_rl(blk_rq_rl(rq));
487 487
488 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 488 blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
489 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
490 if (rq->tag != -1) 489 if (rq->tag != -1)
491 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 490 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
492 if (sched_tag != -1) 491 if (sched_tag != -1)
@@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq)
532 bool shared = false; 531 bool shared = false;
533 int cpu; 532 int cpu;
534 533
534 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
535 blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
536
535 if (rq->internal_tag != -1) 537 if (rq->internal_tag != -1)
536 blk_mq_sched_completed_request(rq); 538 blk_mq_sched_completed_request(rq);
537 if (rq->rq_flags & RQF_STATS) { 539 if (rq->rq_flags & RQF_STATS) {
@@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq)
559 put_cpu(); 561 put_cpu();
560} 562}
561 563
564static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
565 __releases(hctx->srcu)
566{
567 if (!(hctx->flags & BLK_MQ_F_BLOCKING))
568 rcu_read_unlock();
569 else
570 srcu_read_unlock(hctx->srcu, srcu_idx);
571}
572
573static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
574 __acquires(hctx->srcu)
575{
576 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
577 /* shut up gcc false positive */
578 *srcu_idx = 0;
579 rcu_read_lock();
580 } else
581 *srcu_idx = srcu_read_lock(hctx->srcu);
582}
583
584static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
585{
586 unsigned long flags;
587
588 /*
589 * blk_mq_rq_aborted_gstate() is used from the completion path and
590 * can thus be called from irq context. u64_stats_fetch in the
591 * middle of update on the same CPU leads to lockup. Disable irq
592 * while updating.
593 */
594 local_irq_save(flags);
595 u64_stats_update_begin(&rq->aborted_gstate_sync);
596 rq->aborted_gstate = gstate;
597 u64_stats_update_end(&rq->aborted_gstate_sync);
598 local_irq_restore(flags);
599}
600
601static u64 blk_mq_rq_aborted_gstate(struct request *rq)
602{
603 unsigned int start;
604 u64 aborted_gstate;
605
606 do {
607 start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
608 aborted_gstate = rq->aborted_gstate;
609 } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
610
611 return aborted_gstate;
612}
613
562/** 614/**
563 * blk_mq_complete_request - end I/O on a request 615 * blk_mq_complete_request - end I/O on a request
564 * @rq: the request being processed 616 * @rq: the request being processed
@@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq)
570void blk_mq_complete_request(struct request *rq) 622void blk_mq_complete_request(struct request *rq)
571{ 623{
572 struct request_queue *q = rq->q; 624 struct request_queue *q = rq->q;
625 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
626 int srcu_idx;
573 627
574 if (unlikely(blk_should_fake_timeout(q))) 628 if (unlikely(blk_should_fake_timeout(q)))
575 return; 629 return;
576 if (!blk_mark_rq_complete(rq)) 630
631 /*
632 * If @rq->aborted_gstate equals the current instance, timeout is
633 * claiming @rq and we lost. This is synchronized through
634 * hctx_lock(). See blk_mq_timeout_work() for details.
635 *
636 * Completion path never blocks and we can directly use RCU here
637 * instead of hctx_lock() which can be either RCU or SRCU.
638 * However, that would complicate paths which want to synchronize
639 * against us. Let stay in sync with the issue path so that
640 * hctx_lock() covers both issue and completion paths.
641 */
642 hctx_lock(hctx, &srcu_idx);
643 if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
577 __blk_mq_complete_request(rq); 644 __blk_mq_complete_request(rq);
645 hctx_unlock(hctx, srcu_idx);
578} 646}
579EXPORT_SYMBOL(blk_mq_complete_request); 647EXPORT_SYMBOL(blk_mq_complete_request);
580 648
581int blk_mq_request_started(struct request *rq) 649int blk_mq_request_started(struct request *rq)
582{ 650{
583 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 651 return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
584} 652}
585EXPORT_SYMBOL_GPL(blk_mq_request_started); 653EXPORT_SYMBOL_GPL(blk_mq_request_started);
586 654
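Most of the blk-mq.c changes in this series revolve around ->gstate: the low bits hold the request state and the rest form a generation that advances each time the request is put in flight, so a timeout that recorded an older generation in ->aborted_gstate can no longer claim a reused request. The sketch below shows only the packing idea; the bit widths and helpers are illustrative, and the real code additionally protects the fields with gstate_seq and the u64_stats sync for 32-bit safety:

    #include <stdint.h>
    #include <stdio.h>

    #define MQ_RQ_STATE_BITS 2
    #define MQ_RQ_STATE_MASK ((1ULL << MQ_RQ_STATE_BITS) - 1)
    #define MQ_RQ_GEN_INC    (1ULL << MQ_RQ_STATE_BITS)

    enum rq_state { RQ_IDLE = 0, RQ_IN_FLIGHT = 1, RQ_COMPLETE = 2 };

    static inline unsigned rq_state(uint64_t gstate)
    {
        return gstate & MQ_RQ_STATE_MASK;
    }

    static inline uint64_t rq_set_state(uint64_t gstate, enum rq_state st)
    {
        gstate &= ~MQ_RQ_STATE_MASK;
        /* entering IN_FLIGHT starts a new "generation" of the request */
        if (st == RQ_IN_FLIGHT)
            gstate += MQ_RQ_GEN_INC;
        return gstate | st;
    }

    int main(void)
    {
        uint64_t gstate = 0;                  /* idle, generation 0 */
        uint64_t aborted;

        gstate = rq_set_state(gstate, RQ_IN_FLIGHT);
        aborted = gstate;                     /* timeout claims this instance */
        gstate = rq_set_state(gstate, RQ_IDLE);
        gstate = rq_set_state(gstate, RQ_IN_FLIGHT);  /* request reused */

        /* the stale claim no longer matches the live generation */
        printf("timeout still owns rq: %s (state=%u)\n",
               aborted == gstate ? "yes" : "no", rq_state(gstate));
        return 0;
    }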
@@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq)
598 wbt_issue(q->rq_wb, &rq->issue_stat); 666 wbt_issue(q->rq_wb, &rq->issue_stat);
599 } 667 }
600 668
601 blk_add_timer(rq); 669 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
602
603 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
604 670
605 /* 671 /*
606 * Mark us as started and clear complete. Complete might have been 672 * Mark @rq in-flight which also advances the generation number,
607 * set if requeue raced with timeout, which then marked it as 673 * and register for timeout. Protect with a seqcount to allow the
608 * complete. So be sure to clear complete again when we start 674 * timeout path to read both @rq->gstate and @rq->deadline
609 * the request, otherwise we'll ignore the completion event. 675 * coherently.
610 * 676 *
611 * Ensure that ->deadline is visible before we set STARTED, such that 677 * This is the only place where a request is marked in-flight. If
612 * blk_mq_check_expired() is guaranteed to observe our ->deadline when 678 * the timeout path reads an in-flight @rq->gstate, the
613 * it observes STARTED. 679 * @rq->deadline it reads together under @rq->gstate_seq is
680 * guaranteed to be the matching one.
614 */ 681 */
615 smp_wmb(); 682 preempt_disable();
616 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 683 write_seqcount_begin(&rq->gstate_seq);
617 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { 684
618 /* 685 blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
619 * Coherence order guarantees these consecutive stores to a 686 blk_add_timer(rq);
620 * single variable propagate in the specified order. Thus the 687
621 * clear_bit() is ordered _after_ the set bit. See 688 write_seqcount_end(&rq->gstate_seq);
622 * blk_mq_check_expired(). 689 preempt_enable();
623 *
624 * (the bits must be part of the same byte for this to be
625 * true).
626 */
627 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
628 }
629 690
630 if (q->dma_drain_size && blk_rq_bytes(rq)) { 691 if (q->dma_drain_size && blk_rq_bytes(rq)) {
631 /* 692 /*
@@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq)
639EXPORT_SYMBOL(blk_mq_start_request); 700EXPORT_SYMBOL(blk_mq_start_request);
640 701
641/* 702/*
642 * When we reach here because queue is busy, REQ_ATOM_COMPLETE 703 * When we reach here because queue is busy, it's safe to change the state
643 * flag isn't set yet, so there may be race with timeout handler, 704 * to IDLE without checking @rq->aborted_gstate because we should still be
644 * but given rq->deadline is just set in .queue_rq() under 705 * holding the RCU read lock and thus protected against timeout.
645 * this situation, the race won't be possible in reality because
646 * rq->timeout should be set as big enough to cover the window
647 * between blk_mq_start_request() called from .queue_rq() and
648 * clearing REQ_ATOM_STARTED here.
649 */ 706 */
650static void __blk_mq_requeue_request(struct request *rq) 707static void __blk_mq_requeue_request(struct request *rq)
651{ 708{
@@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq)
657 wbt_requeue(q->rq_wb, &rq->issue_stat); 714 wbt_requeue(q->rq_wb, &rq->issue_stat);
658 blk_mq_sched_requeue_request(rq); 715 blk_mq_sched_requeue_request(rq);
659 716
660 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 717 if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
718 blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
661 if (q->dma_drain_size && blk_rq_bytes(rq)) 719 if (q->dma_drain_size && blk_rq_bytes(rq))
662 rq->nr_phys_segments--; 720 rq->nr_phys_segments--;
663 } 721 }
@@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
689 747
690 rq->rq_flags &= ~RQF_SOFTBARRIER; 748 rq->rq_flags &= ~RQF_SOFTBARRIER;
691 list_del_init(&rq->queuelist); 749 list_del_init(&rq->queuelist);
692 blk_mq_sched_insert_request(rq, true, false, false, true); 750 blk_mq_sched_insert_request(rq, true, false, false);
693 } 751 }
694 752
695 while (!list_empty(&rq_list)) { 753 while (!list_empty(&rq_list)) {
696 rq = list_entry(rq_list.next, struct request, queuelist); 754 rq = list_entry(rq_list.next, struct request, queuelist);
697 list_del_init(&rq->queuelist); 755 list_del_init(&rq->queuelist);
698 blk_mq_sched_insert_request(rq, false, false, false, true); 756 blk_mq_sched_insert_request(rq, false, false, false);
699 } 757 }
700 758
701 blk_mq_run_hw_queues(q, false); 759 blk_mq_run_hw_queues(q, false);
@@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
729 787
730void blk_mq_kick_requeue_list(struct request_queue *q) 788void blk_mq_kick_requeue_list(struct request_queue *q)
731{ 789{
732 kblockd_schedule_delayed_work(&q->requeue_work, 0); 790 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
733} 791}
734EXPORT_SYMBOL(blk_mq_kick_requeue_list); 792EXPORT_SYMBOL(blk_mq_kick_requeue_list);
735 793
@@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
755struct blk_mq_timeout_data { 813struct blk_mq_timeout_data {
756 unsigned long next; 814 unsigned long next;
757 unsigned int next_set; 815 unsigned int next_set;
816 unsigned int nr_expired;
758}; 817};
759 818
760void blk_mq_rq_timed_out(struct request *req, bool reserved) 819static void blk_mq_rq_timed_out(struct request *req, bool reserved)
761{ 820{
762 const struct blk_mq_ops *ops = req->q->mq_ops; 821 const struct blk_mq_ops *ops = req->q->mq_ops;
763 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 822 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
764 823
765 /* 824 req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
766 * We know that complete is set at this point. If STARTED isn't set
767 * anymore, then the request isn't active and the "timeout" should
768 * just be ignored. This can happen due to the bitflag ordering.
769 * Timeout first checks if STARTED is set, and if it is, assumes
770 * the request is active. But if we race with completion, then
771 * both flags will get cleared. So check here again, and ignore
772 * a timeout event with a request that isn't active.
773 */
774 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
775 return;
776 825
777 if (ops->timeout) 826 if (ops->timeout)
778 ret = ops->timeout(req, reserved); 827 ret = ops->timeout(req, reserved);
@@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
782 __blk_mq_complete_request(req); 831 __blk_mq_complete_request(req);
783 break; 832 break;
784 case BLK_EH_RESET_TIMER: 833 case BLK_EH_RESET_TIMER:
834 /*
835 * As nothing prevents from completion happening while
836 * ->aborted_gstate is set, this may lead to ignored
837 * completions and further spurious timeouts.
838 */
839 blk_mq_rq_update_aborted_gstate(req, 0);
785 blk_add_timer(req); 840 blk_add_timer(req);
786 blk_clear_rq_complete(req);
787 break; 841 break;
788 case BLK_EH_NOT_HANDLED: 842 case BLK_EH_NOT_HANDLED:
789 break; 843 break;
@@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
797 struct request *rq, void *priv, bool reserved) 851 struct request *rq, void *priv, bool reserved)
798{ 852{
799 struct blk_mq_timeout_data *data = priv; 853 struct blk_mq_timeout_data *data = priv;
800 unsigned long deadline; 854 unsigned long gstate, deadline;
855 int start;
801 856
802 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 857 might_sleep();
803 return;
804 858
805 /* 859 if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
806 * Ensures that if we see STARTED we must also see our 860 return;
807 * up-to-date deadline, see blk_mq_start_request().
808 */
809 smp_rmb();
810 861
811 deadline = READ_ONCE(rq->deadline); 862 /* read coherent snapshots of @rq->state_gen and @rq->deadline */
863 while (true) {
864 start = read_seqcount_begin(&rq->gstate_seq);
865 gstate = READ_ONCE(rq->gstate);
866 deadline = blk_rq_deadline(rq);
867 if (!read_seqcount_retry(&rq->gstate_seq, start))
868 break;
869 cond_resched();
870 }
812 871
813 /* 872 /* if in-flight && overdue, mark for abortion */
814 * The rq being checked may have been freed and reallocated 873 if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
815 * out already here, we avoid this race by checking rq->deadline 874 time_after_eq(jiffies, deadline)) {
816 * and REQ_ATOM_COMPLETE flag together: 875 blk_mq_rq_update_aborted_gstate(rq, gstate);
817 * 876 data->nr_expired++;
818 * - if rq->deadline is observed as new value because of 877 hctx->nr_expired++;
819 * reusing, the rq won't be timed out because of timing.
820 * - if rq->deadline is observed as previous value,
821 * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
822 * because we put a barrier between setting rq->deadline
823 * and clearing the flag in blk_mq_start_request(), so
824 * this rq won't be timed out too.
825 */
826 if (time_after_eq(jiffies, deadline)) {
827 if (!blk_mark_rq_complete(rq)) {
828 /*
829 * Again coherence order ensures that consecutive reads
830 * from the same variable must be in that order. This
831 * ensures that if we see COMPLETE clear, we must then
832 * see STARTED set and we'll ignore this timeout.
833 *
834 * (There's also the MB implied by the test_and_clear())
835 */
836 blk_mq_rq_timed_out(rq, reserved);
837 }
838 } else if (!data->next_set || time_after(data->next, deadline)) { 878 } else if (!data->next_set || time_after(data->next, deadline)) {
839 data->next = deadline; 879 data->next = deadline;
840 data->next_set = 1; 880 data->next_set = 1;
841 } 881 }
842} 882}
843 883
884static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
885 struct request *rq, void *priv, bool reserved)
886{
887 /*
888 * We marked @rq->aborted_gstate and waited for RCU. If there were
889 * completions that we lost to, they would have finished and
890 * updated @rq->gstate by now; otherwise, the completion path is
891 * now guaranteed to see @rq->aborted_gstate and yield. If
892 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
893 */
894 if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
895 READ_ONCE(rq->gstate) == rq->aborted_gstate)
896 blk_mq_rq_timed_out(rq, reserved);
897}
898
844static void blk_mq_timeout_work(struct work_struct *work) 899static void blk_mq_timeout_work(struct work_struct *work)
845{ 900{
846 struct request_queue *q = 901 struct request_queue *q =
@@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
848 struct blk_mq_timeout_data data = { 903 struct blk_mq_timeout_data data = {
849 .next = 0, 904 .next = 0,
850 .next_set = 0, 905 .next_set = 0,
906 .nr_expired = 0,
851 }; 907 };
908 struct blk_mq_hw_ctx *hctx;
852 int i; 909 int i;
853 910
854 /* A deadlock might occur if a request is stuck requiring a 911 /* A deadlock might occur if a request is stuck requiring a
@@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work)
867 if (!percpu_ref_tryget(&q->q_usage_counter)) 924 if (!percpu_ref_tryget(&q->q_usage_counter))
868 return; 925 return;
869 926
927 /* scan for the expired ones and set their ->aborted_gstate */
870 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 928 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
871 929
930 if (data.nr_expired) {
931 bool has_rcu = false;
932
933 /*
934 * Wait till everyone sees ->aborted_gstate. The
935 * sequential waits for SRCUs aren't ideal. If this ever
936 * becomes a problem, we can add per-hw_ctx rcu_head and
937 * wait in parallel.
938 */
939 queue_for_each_hw_ctx(q, hctx, i) {
940 if (!hctx->nr_expired)
941 continue;
942
943 if (!(hctx->flags & BLK_MQ_F_BLOCKING))
944 has_rcu = true;
945 else
946 synchronize_srcu(hctx->srcu);
947
948 hctx->nr_expired = 0;
949 }
950 if (has_rcu)
951 synchronize_rcu();
952
953 /* terminate the ones we won */
954 blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
955 }
956
872 if (data.next_set) { 957 if (data.next_set) {
873 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 958 data.next = blk_rq_timeout(round_jiffies_up(data.next));
874 mod_timer(&q->timeout, data.next); 959 mod_timer(&q->timeout, data.next);
875 } else { 960 } else {
876 struct blk_mq_hw_ctx *hctx; 961 /*
877 962 * Request timeouts are handled as a forward rolling timer. If
963 * we end up here it means that no requests are pending and
964 * also that no request has been pending for a while. Mark
965 * each hctx as idle.
966 */
878 queue_for_each_hw_ctx(q, hctx, i) { 967 queue_for_each_hw_ctx(q, hctx, i) {
879 /* the hctx may be unmapped, so check it here */ 968 /* the hctx may be unmapped, so check it here */
880 if (blk_mq_hw_queue_mapped(hctx)) 969 if (blk_mq_hw_queue_mapped(hctx))
@@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1010 1099
1011/* 1100/*
1012 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1101 * Mark us waiting for a tag. For shared tags, this involves hooking us into
1013 * the tag wakeups. For non-shared tags, we can simply mark us nedeing a 1102 * the tag wakeups. For non-shared tags, we can simply mark us needing a
1014 * restart. For both caes, take care to check the condition again after 1103 * restart. For both cases, take care to check the condition again after
1015 * marking us as waiting. 1104 * marking us as waiting.
1016 */ 1105 */
1017static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, 1106static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
1018 struct request *rq) 1107 struct request *rq)
1019{ 1108{
1020 struct blk_mq_hw_ctx *this_hctx = *hctx; 1109 struct blk_mq_hw_ctx *this_hctx = *hctx;
1021 bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
1022 struct sbq_wait_state *ws; 1110 struct sbq_wait_state *ws;
1023 wait_queue_entry_t *wait; 1111 wait_queue_entry_t *wait;
1024 bool ret; 1112 bool ret;
1025 1113
1026 if (!shared_tags) { 1114 if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1027 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) 1115 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
1028 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); 1116 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
1029 } else {
1030 wait = &this_hctx->dispatch_wait;
1031 if (!list_empty_careful(&wait->entry))
1032 return false;
1033 1117
1034 spin_lock(&this_hctx->lock); 1118 /*
1035 if (!list_empty(&wait->entry)) { 1119 * It's possible that a tag was freed in the window between the
1036 spin_unlock(&this_hctx->lock); 1120 * allocation failure and adding the hardware queue to the wait
1037 return false; 1121 * queue.
1038 } 1122 *
1123 * Don't clear RESTART here, someone else could have set it.
1124 * At most this will cost an extra queue run.
1125 */
1126 return blk_mq_get_driver_tag(rq, hctx, false);
1127 }
1039 1128
1040 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); 1129 wait = &this_hctx->dispatch_wait;
1041 add_wait_queue(&ws->wait, wait); 1130 if (!list_empty_careful(&wait->entry))
1131 return false;
1132
1133 spin_lock(&this_hctx->lock);
1134 if (!list_empty(&wait->entry)) {
1135 spin_unlock(&this_hctx->lock);
1136 return false;
1042 } 1137 }
1043 1138
1139 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
1140 add_wait_queue(&ws->wait, wait);
1141
1044 /* 1142 /*
1045 * It's possible that a tag was freed in the window between the 1143 * It's possible that a tag was freed in the window between the
1046 * allocation failure and adding the hardware queue to the wait 1144 * allocation failure and adding the hardware queue to the wait
1047 * queue. 1145 * queue.
1048 */ 1146 */
1049 ret = blk_mq_get_driver_tag(rq, hctx, false); 1147 ret = blk_mq_get_driver_tag(rq, hctx, false);
1050 1148 if (!ret) {
1051 if (!shared_tags) {
1052 /*
1053 * Don't clear RESTART here, someone else could have set it.
1054 * At most this will cost an extra queue run.
1055 */
1056 return ret;
1057 } else {
1058 if (!ret) {
1059 spin_unlock(&this_hctx->lock);
1060 return false;
1061 }
1062
1063 /*
1064 * We got a tag, remove ourselves from the wait queue to ensure
1065 * someone else gets the wakeup.
1066 */
1067 spin_lock_irq(&ws->wait.lock);
1068 list_del_init(&wait->entry);
1069 spin_unlock_irq(&ws->wait.lock);
1070 spin_unlock(&this_hctx->lock); 1149 spin_unlock(&this_hctx->lock);
1071 return true; 1150 return false;
1072 } 1151 }
1152
1153 /*
1154 * We got a tag, remove ourselves from the wait queue to ensure
1155 * someone else gets the wakeup.
1156 */
1157 spin_lock_irq(&ws->wait.lock);
1158 list_del_init(&wait->entry);
1159 spin_unlock_irq(&ws->wait.lock);
1160 spin_unlock(&this_hctx->lock);
1161
1162 return true;
1073} 1163}
1074 1164
1075bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, 1165bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1206 /* 1296 /*
1207 * We should be running this queue from one of the CPUs that 1297 * We should be running this queue from one of the CPUs that
1208 * are mapped to it. 1298 * are mapped to it.
1299 *
1300 * There are at least two related races now between setting
1301 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
1302 * __blk_mq_run_hw_queue():
1303 *
1304 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
1305 * but later it becomes online, then this warning is harmless
1306 * at all
1307 *
1308 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
1309 * but later it becomes offline, then the warning can't be
1310 * triggered, and we depend on blk-mq timeout handler to
1311 * handle dispatched requests to this hctx
1209 */ 1312 */
1210 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 1313 if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1211 cpu_online(hctx->next_cpu)); 1314 cpu_online(hctx->next_cpu)) {
1315 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
1316 raw_smp_processor_id(),
1317 cpumask_empty(hctx->cpumask) ? "inactive": "active");
1318 dump_stack();
1319 }
1212 1320
1213 /* 1321 /*
1214 * We can't run the queue inline with ints disabled. Ensure that 1322 * We can't run the queue inline with ints disabled. Ensure that
@@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1216 */ 1324 */
1217 WARN_ON_ONCE(in_interrupt()); 1325 WARN_ON_ONCE(in_interrupt());
1218 1326
1219 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1327 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1220 rcu_read_lock();
1221 blk_mq_sched_dispatch_requests(hctx);
1222 rcu_read_unlock();
1223 } else {
1224 might_sleep();
1225 1328
1226 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1329 hctx_lock(hctx, &srcu_idx);
1227 blk_mq_sched_dispatch_requests(hctx); 1330 blk_mq_sched_dispatch_requests(hctx);
1228 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1331 hctx_unlock(hctx, srcu_idx);
1229 }
1230} 1332}
1231 1333
1232/* 1334/*
@@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1237 */ 1339 */
1238static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1340static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1239{ 1341{
1342 bool tried = false;
1343
1240 if (hctx->queue->nr_hw_queues == 1) 1344 if (hctx->queue->nr_hw_queues == 1)
1241 return WORK_CPU_UNBOUND; 1345 return WORK_CPU_UNBOUND;
1242 1346
1243 if (--hctx->next_cpu_batch <= 0) { 1347 if (--hctx->next_cpu_batch <= 0) {
1244 int next_cpu; 1348 int next_cpu;
1245 1349select_cpu:
1246 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 1350 next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
1351 cpu_online_mask);
1247 if (next_cpu >= nr_cpu_ids) 1352 if (next_cpu >= nr_cpu_ids)
1248 next_cpu = cpumask_first(hctx->cpumask); 1353 next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
1249 1354
1250 hctx->next_cpu = next_cpu; 1355 /*
1356 * No online CPU is found, so have to make sure hctx->next_cpu
1357 * is set correctly for not breaking workqueue.
1358 */
1359 if (next_cpu >= nr_cpu_ids)
1360 hctx->next_cpu = cpumask_first(hctx->cpumask);
1361 else
1362 hctx->next_cpu = next_cpu;
1251 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1363 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1252 } 1364 }
1253 1365
1366 /*
1367 * Do unbound schedule if we can't find a online CPU for this hctx,
1368 * and it should only happen in the path of handling CPU DEAD.
1369 */
1370 if (!cpu_online(hctx->next_cpu)) {
1371 if (!tried) {
1372 tried = true;
1373 goto select_cpu;
1374 }
1375
1376 /*
1377 * Make sure to re-select CPU next time once after CPUs
1378 * in hctx->cpumask become online again.
1379 */
1380 hctx->next_cpu_batch = 1;
1381 return WORK_CPU_UNBOUND;
1382 }
1254 return hctx->next_cpu; 1383 return hctx->next_cpu;
1255} 1384}
1256 1385
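The blk_mq_hctx_next_cpu() rework above restricts the round-robin to CPUs that are both in the hctx's map and currently online, retries the selection once, and falls back to an unbound work item when nothing suitable is left, for example while CPUs in the map are being unplugged. A self-contained sketch of the selection itself, using plain bitmaps instead of struct cpumask, a local sentinel for the unbound case, and ignoring the batching counter:

    #include <stdio.h>

    #define NR_CPUS 8
    #define CPU_UNBOUND (-1)   /* sentinel: let the workqueue pick a CPU */

    static int next_cpu_and(unsigned mask, unsigned online, int prev)
    {
        int cpu;

        /* like cpumask_next_and(): first bit after prev set in both masks */
        for (cpu = prev + 1; cpu < NR_CPUS; cpu++)
            if ((mask & online) & (1u << cpu))
                return cpu;
        /* wrap around, like cpumask_first_and() */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            if ((mask & online) & (1u << cpu))
                return cpu;
        return NR_CPUS;                       /* no online CPU in the mask */
    }

    static int pick_next_cpu(unsigned hctx_mask, unsigned online, int *next_cpu)
    {
        int cpu = next_cpu_and(hctx_mask, online, *next_cpu);

        if (cpu >= NR_CPUS)
            return CPU_UNBOUND;               /* e.g. during CPU hot-unplug */

        *next_cpu = cpu;
        return cpu;
    }

    int main(void)
    {
        unsigned hctx_mask = 0x0c;            /* CPUs 2 and 3 */
        unsigned online    = 0x0b;            /* CPUs 0, 1 and 3 online */
        int next_cpu = -1;

        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));
        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));

        online = 0x03;                        /* CPU 3 goes offline */
        printf("run on CPU %d\n", pick_next_cpu(hctx_mask, online, &next_cpu));
        return 0;
    }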
@@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1274 put_cpu(); 1403 put_cpu();
1275 } 1404 }
1276 1405
1277 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1406 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1278 &hctx->run_work, 1407 msecs_to_jiffies(msecs));
1279 msecs_to_jiffies(msecs));
1280} 1408}
1281 1409
1282void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1410void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1287 1415
1288bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1416bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1289{ 1417{
1290 if (blk_mq_hctx_has_pending(hctx)) { 1418 int srcu_idx;
1419 bool need_run;
1420
1421 /*
1422 * When queue is quiesced, we may be switching io scheduler, or
1423 * updating nr_hw_queues, or other things, and we can't run queue
1424 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1425 *
1426 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1427 * quiesced.
1428 */
1429 hctx_lock(hctx, &srcu_idx);
1430 need_run = !blk_queue_quiesced(hctx->queue) &&
1431 blk_mq_hctx_has_pending(hctx);
1432 hctx_unlock(hctx, srcu_idx);
1433
1434 if (need_run) {
1291 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1435 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1292 return true; 1436 return true;
1293 } 1437 }
@@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1595 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); 1739 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1596} 1740}
1597 1741
1598static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1742static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1599 struct request *rq, 1743 struct request *rq,
1600 blk_qc_t *cookie, bool may_sleep) 1744 blk_qc_t *cookie)
1601{ 1745{
1602 struct request_queue *q = rq->q; 1746 struct request_queue *q = rq->q;
1603 struct blk_mq_queue_data bd = { 1747 struct blk_mq_queue_data bd = {
@@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1606 }; 1750 };
1607 blk_qc_t new_cookie; 1751 blk_qc_t new_cookie;
1608 blk_status_t ret; 1752 blk_status_t ret;
1753
1754 new_cookie = request_to_qc_t(hctx, rq);
1755
1756 /*
1757 * For OK queue, we are done. For error, caller may kill it.
1758 * Any other error (busy), just add it to our list as we
1759 * previously would have done.
1760 */
1761 ret = q->mq_ops->queue_rq(hctx, &bd);
1762 switch (ret) {
1763 case BLK_STS_OK:
1764 *cookie = new_cookie;
1765 break;
1766 case BLK_STS_RESOURCE:
1767 __blk_mq_requeue_request(rq);
1768 break;
1769 default:
1770 *cookie = BLK_QC_T_NONE;
1771 break;
1772 }
1773
1774 return ret;
1775}
1776
1777static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1778 struct request *rq,
1779 blk_qc_t *cookie,
1780 bool bypass_insert)
1781{
1782 struct request_queue *q = rq->q;
1609 bool run_queue = true; 1783 bool run_queue = true;
1610 1784
1611 /* RCU or SRCU read lock is needed before checking quiesced flag */ 1785 /*
1786 * RCU or SRCU read lock is needed before checking quiesced flag.
1787 *
1788 * When queue is stopped or quiesced, ignore 'bypass_insert' from
1789 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
1790 * and avoid driver to try to dispatch again.
1791 */
1612 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 1792 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1613 run_queue = false; 1793 run_queue = false;
1794 bypass_insert = false;
1614 goto insert; 1795 goto insert;
1615 } 1796 }
1616 1797
1617 if (q->elevator) 1798 if (q->elevator && !bypass_insert)
1618 goto insert; 1799 goto insert;
1619 1800
1620 if (!blk_mq_get_driver_tag(rq, NULL, false)) 1801 if (!blk_mq_get_driver_tag(rq, NULL, false))
@@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1625 goto insert; 1806 goto insert;
1626 } 1807 }
1627 1808
1628 new_cookie = request_to_qc_t(hctx, rq); 1809 return __blk_mq_issue_directly(hctx, rq, cookie);
1629
1630 /*
1631 * For OK queue, we are done. For error, kill it. Any other
1632 * error (busy), just add it to our list as we previously
1633 * would have done
1634 */
1635 ret = q->mq_ops->queue_rq(hctx, &bd);
1636 switch (ret) {
1637 case BLK_STS_OK:
1638 *cookie = new_cookie;
1639 return;
1640 case BLK_STS_RESOURCE:
1641 __blk_mq_requeue_request(rq);
1642 goto insert;
1643 default:
1644 *cookie = BLK_QC_T_NONE;
1645 blk_mq_end_request(rq, ret);
1646 return;
1647 }
1648
1649insert: 1810insert:
1650 blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); 1811 if (bypass_insert)
1812 return BLK_STS_RESOURCE;
1813
1814 blk_mq_sched_insert_request(rq, false, run_queue, false);
1815 return BLK_STS_OK;
1651} 1816}
1652 1817
1653static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1818static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1654 struct request *rq, blk_qc_t *cookie) 1819 struct request *rq, blk_qc_t *cookie)
1655{ 1820{
1656 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1821 blk_status_t ret;
1657 rcu_read_lock(); 1822 int srcu_idx;
1658 __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1659 rcu_read_unlock();
1660 } else {
1661 unsigned int srcu_idx;
1662 1823
1663 might_sleep(); 1824 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1664 1825
1665 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1826 hctx_lock(hctx, &srcu_idx);
1666 __blk_mq_try_issue_directly(hctx, rq, cookie, true); 1827
1667 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1828 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1668 } 1829 if (ret == BLK_STS_RESOURCE)
1830 blk_mq_sched_insert_request(rq, false, true, false);
1831 else if (ret != BLK_STS_OK)
1832 blk_mq_end_request(rq, ret);
1833
1834 hctx_unlock(hctx, srcu_idx);
1835}
1836
1837blk_status_t blk_mq_request_issue_directly(struct request *rq)
1838{
1839 blk_status_t ret;
1840 int srcu_idx;
1841 blk_qc_t unused_cookie;
1842 struct blk_mq_ctx *ctx = rq->mq_ctx;
1843 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1844
1845 hctx_lock(hctx, &srcu_idx);
1846 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
1847 hctx_unlock(hctx, srcu_idx);
1848
1849 return ret;
1669} 1850}
1670 1851
1671static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1852static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
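The restructuring above splits the old direct-issue helper so that blk_insert_cloned_request(), via the new blk_mq_request_issue_directly(), can get BLK_STS_RESOURCE back and let a stacked driver such as device-mapper requeue, while the normal make_request path keeps falling back to a scheduler insert. A compressed sketch of that control flow, with the two levels collapsed into one helper and stand-in status values:

    #include <stdio.h>

    enum status { STS_OK, STS_RESOURCE, STS_IOERR };

    static void scheduler_insert(int rq)     { printf("rq %d -> scheduler\n", rq); }
    static void end_request(int rq, int err) { printf("rq %d ended, err %d\n", rq, err); }

    /* pretend odd-numbered requests hit a busy driver, rq 3 a hard error */
    static enum status driver_queue_rq(int rq)
    {
        if (rq == 3)
            return STS_IOERR;
        return (rq & 1) ? STS_RESOURCE : STS_OK;
    }

    static enum status try_issue_directly(int rq, int bypass_insert)
    {
        enum status ret = driver_queue_rq(rq);

        if (ret == STS_RESOURCE && bypass_insert)
            return STS_RESOURCE;        /* stacked caller handles the requeue */
        if (ret == STS_RESOURCE) {
            scheduler_insert(rq);       /* normal submission path falls back */
            return STS_OK;
        }
        if (ret != STS_OK)
            end_request(rq, ret);       /* hard error: fail the request */
        return ret;
    }

    int main(void)
    {
        /* make_request-style caller: never sees STS_RESOURCE */
        printf("make_request: %d\n", try_issue_directly(1, 0));
        /* cloned-request-style caller: gets the busy status back */
        printf("cloned insert: %d\n", try_issue_directly(1, 1));
        /* hard driver error is ended, not requeued */
        printf("hard error: %d\n", try_issue_directly(3, 0));
        return 0;
    }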
@@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1776 } else if (q->elevator) { 1957 } else if (q->elevator) {
1777 blk_mq_put_ctx(data.ctx); 1958 blk_mq_put_ctx(data.ctx);
1778 blk_mq_bio_to_request(rq, bio); 1959 blk_mq_bio_to_request(rq, bio);
1779 blk_mq_sched_insert_request(rq, false, true, true, true); 1960 blk_mq_sched_insert_request(rq, false, true, true);
1780 } else { 1961 } else {
1781 blk_mq_put_ctx(data.ctx); 1962 blk_mq_put_ctx(data.ctx);
1782 blk_mq_bio_to_request(rq, bio); 1963 blk_mq_bio_to_request(rq, bio);
@@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order)
1869 return (size_t)PAGE_SIZE << order; 2050 return (size_t)PAGE_SIZE << order;
1870} 2051}
1871 2052
2053static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2054 unsigned int hctx_idx, int node)
2055{
2056 int ret;
2057
2058 if (set->ops->init_request) {
2059 ret = set->ops->init_request(set, rq, hctx_idx, node);
2060 if (ret)
2061 return ret;
2062 }
2063
2064 seqcount_init(&rq->gstate_seq);
2065 u64_stats_init(&rq->aborted_gstate_sync);
2066 return 0;
2067}
2068
1872int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2069int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1873 unsigned int hctx_idx, unsigned int depth) 2070 unsigned int hctx_idx, unsigned int depth)
1874{ 2071{
@@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1930 struct request *rq = p; 2127 struct request *rq = p;
1931 2128
1932 tags->static_rqs[i] = rq; 2129 tags->static_rqs[i] = rq;
1933 if (set->ops->init_request) { 2130 if (blk_mq_init_request(set, rq, hctx_idx, node)) {
1934 if (set->ops->init_request(set, rq, hctx_idx, 2131 tags->static_rqs[i] = NULL;
1935 node)) { 2132 goto fail;
1936 tags->static_rqs[i] = NULL;
1937 goto fail;
1938 }
1939 } 2133 }
1940 2134
1941 p += rq_size; 2135 p += rq_size;
@@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
1994{ 2188{
1995 blk_mq_debugfs_unregister_hctx(hctx); 2189 blk_mq_debugfs_unregister_hctx(hctx);
1996 2190
1997 blk_mq_tag_idle(hctx); 2191 if (blk_mq_hw_queue_mapped(hctx))
2192 blk_mq_tag_idle(hctx);
1998 2193
1999 if (set->ops->exit_request) 2194 if (set->ops->exit_request)
2000 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 2195 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
@@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
2005 set->ops->exit_hctx(hctx, hctx_idx); 2200 set->ops->exit_hctx(hctx, hctx_idx);
2006 2201
2007 if (hctx->flags & BLK_MQ_F_BLOCKING) 2202 if (hctx->flags & BLK_MQ_F_BLOCKING)
2008 cleanup_srcu_struct(hctx->queue_rq_srcu); 2203 cleanup_srcu_struct(hctx->srcu);
2009 2204
2010 blk_mq_remove_cpuhp(hctx); 2205 blk_mq_remove_cpuhp(hctx);
2011 blk_free_flush_queue(hctx->fq); 2206 blk_free_flush_queue(hctx->fq);
@@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
2074 if (!hctx->fq) 2269 if (!hctx->fq)
2075 goto sched_exit_hctx; 2270 goto sched_exit_hctx;
2076 2271
2077 if (set->ops->init_request && 2272 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2078 set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
2079 node))
2080 goto free_fq; 2273 goto free_fq;
2081 2274
2082 if (hctx->flags & BLK_MQ_F_BLOCKING) 2275 if (hctx->flags & BLK_MQ_F_BLOCKING)
2083 init_srcu_struct(hctx->queue_rq_srcu); 2276 init_srcu_struct(hctx->srcu);
2084 2277
2085 blk_mq_debugfs_register_hctx(q, hctx); 2278 blk_mq_debugfs_register_hctx(q, hctx);
2086 2279
@@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
2116 INIT_LIST_HEAD(&__ctx->rq_list); 2309 INIT_LIST_HEAD(&__ctx->rq_list);
2117 __ctx->queue = q; 2310 __ctx->queue = q;
2118 2311
2119 /* If the cpu isn't present, the cpu is mapped to first hctx */
2120 if (!cpu_present(i))
2121 continue;
2122
2123 hctx = blk_mq_map_queue(q, i);
2124
2125 /* 2312 /*
2126 * Set local node, IFF we have more than one hw queue. If 2313 * Set local node, IFF we have more than one hw queue. If
2127 * not, we remain on the home node of the device 2314 * not, we remain on the home node of the device
2128 */ 2315 */
2316 hctx = blk_mq_map_queue(q, i);
2129 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2317 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2130 hctx->numa_node = local_memory_node(cpu_to_node(i)); 2318 hctx->numa_node = local_memory_node(cpu_to_node(i));
2131 } 2319 }
@@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2182 * 2370 *
2183 * If the cpu isn't present, the cpu is mapped to first hctx. 2371 * If the cpu isn't present, the cpu is mapped to first hctx.
2184 */ 2372 */
2185 for_each_present_cpu(i) { 2373 for_each_possible_cpu(i) {
2186 hctx_idx = q->mq_map[i]; 2374 hctx_idx = q->mq_map[i];
2187 /* unmapped hw queue can be remapped after CPU topo changed */ 2375 /* unmapped hw queue can be remapped after CPU topo changed */
2188 if (!set->tags[hctx_idx] && 2376 if (!set->tags[hctx_idx] &&
@@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2236 /* 2424 /*
2237 * Initialize batch roundrobin counts 2425 * Initialize batch roundrobin counts
2238 */ 2426 */
2239 hctx->next_cpu = cpumask_first(hctx->cpumask); 2427 hctx->next_cpu = cpumask_first_and(hctx->cpumask,
2428 cpu_online_mask);
2240 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2429 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2241 } 2430 }
2242} 2431}
@@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2369{ 2558{
2370 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2559 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2371 2560
2372 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), 2561 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2373 __alignof__(struct blk_mq_hw_ctx)) != 2562 __alignof__(struct blk_mq_hw_ctx)) !=
2374 sizeof(struct blk_mq_hw_ctx)); 2563 sizeof(struct blk_mq_hw_ctx));
2375 2564
@@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2386 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2575 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2387 2576
2388 blk_mq_sysfs_unregister(q); 2577 blk_mq_sysfs_unregister(q);
2578
2579 /* protect against switching io scheduler */
2580 mutex_lock(&q->sysfs_lock);
2389 for (i = 0; i < set->nr_hw_queues; i++) { 2581 for (i = 0; i < set->nr_hw_queues; i++) {
2390 int node; 2582 int node;
2391 2583
@@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2430 } 2622 }
2431 } 2623 }
2432 q->nr_hw_queues = i; 2624 q->nr_hw_queues = i;
2625 mutex_unlock(&q->sysfs_lock);
2433 blk_mq_sysfs_register(q); 2626 blk_mq_sysfs_register(q);
2434} 2627}
2435 2628
@@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2601 2794
2602static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2795static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2603{ 2796{
2604 if (set->ops->map_queues) 2797 if (set->ops->map_queues) {
2798 int cpu;
2799 /*
2800 * transport .map_queues is usually done in the following
2801 * way:
2802 *
2803 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2804 * mask = get_cpu_mask(queue)
2805 * for_each_cpu(cpu, mask)
2806 * set->mq_map[cpu] = queue;
2807 * }
2808 *
2809 * When we need to remap, the table has to be cleared for
2810 * killing stale mapping since one CPU may not be mapped
2811 * to any hw queue.
2812 */
2813 for_each_possible_cpu(cpu)
2814 set->mq_map[cpu] = 0;
2815
2605 return set->ops->map_queues(set); 2816 return set->ops->map_queues(set);
2606 else 2817 } else
2607 return blk_mq_map_queues(set); 2818 return blk_mq_map_queues(set);
2608} 2819}
2609 2820
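The comment added to blk_mq_update_queue_map() explains why set->mq_map is cleared before calling the driver's .map_queues: after a topology change a possible CPU may no longer be claimed by any hardware queue, and a stale entry would keep pointing at one. A small standalone sketch of that remap rule, with made-up names standing in for the tag-set fields:

/*
 * Standalone model (assumed names, not the kernel API) of why mq_map is
 * zeroed before the driver remaps: unclaimed CPUs must fall back to
 * hardware queue 0 rather than keep a stale mapping.
 */
#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

static unsigned int mq_map[NR_CPUS];

/* A driver-style mapping: each queue claims the CPUs in its mask. */
static void driver_map_queues(const unsigned int *queue_of_cpu, int nr_cpus)
{
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		unsigned int q = queue_of_cpu[cpu];

		if (q != (unsigned int)-1)	/* -1: CPU not claimed */
			mq_map[cpu] = q;
	}
}

static void remap(const unsigned int *queue_of_cpu)
{
	/* Kill stale mappings first; unclaimed CPUs land on queue 0. */
	memset(mq_map, 0, sizeof(mq_map));
	driver_map_queues(queue_of_cpu, NR_CPUS);
}

int main(void)
{
	unsigned int before[NR_CPUS] = { 0, 0, 1, 1 };
	unsigned int after[NR_CPUS]  = { 0, 0, 0, (unsigned int)-1 };

	remap(before);
	remap(after);	/* CPU 3 no longer claimed: mapped to queue 0 */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> hw queue %u\n", cpu, mq_map[cpu]);
	return 0;
}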
@@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2712 return -EINVAL; 2923 return -EINVAL;
2713 2924
2714 blk_mq_freeze_queue(q); 2925 blk_mq_freeze_queue(q);
2926 blk_mq_quiesce_queue(q);
2715 2927
2716 ret = 0; 2928 ret = 0;
2717 queue_for_each_hw_ctx(q, hctx, i) { 2929 queue_for_each_hw_ctx(q, hctx, i) {
@@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2735 if (!ret) 2947 if (!ret)
2736 q->nr_requests = nr; 2948 q->nr_requests = nr;
2737 2949
2950 blk_mq_unquiesce_queue(q);
2738 blk_mq_unfreeze_queue(q); 2951 blk_mq_unfreeze_queue(q);
2739 2952
2740 return ret; 2953 return ret;
@@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2850 unsigned int nsecs; 3063 unsigned int nsecs;
2851 ktime_t kt; 3064 ktime_t kt;
2852 3065
2853 if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) 3066 if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
2854 return false; 3067 return false;
2855 3068
2856 /* 3069 /*
@@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2870 if (!nsecs) 3083 if (!nsecs)
2871 return false; 3084 return false;
2872 3085
2873 set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 3086 rq->rq_flags |= RQF_MQ_POLL_SLEPT;
2874 3087
2875 /* 3088 /*
2876 * This will be replaced with the stats tracking code, using 3089 * This will be replaced with the stats tracking code, using
@@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2884 3097
2885 hrtimer_init_sleeper(&hs, current); 3098 hrtimer_init_sleeper(&hs, current);
2886 do { 3099 do {
2887 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 3100 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
2888 break; 3101 break;
2889 set_current_state(TASK_UNINTERRUPTIBLE); 3102 set_current_state(TASK_UNINTERRUPTIBLE);
2890 hrtimer_start_expires(&hs.timer, mode); 3103 hrtimer_start_expires(&hs.timer, mode);
@@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2970 3183
2971static int __init blk_mq_init(void) 3184static int __init blk_mq_init(void)
2972{ 3185{
2973 /*
2974 * See comment in block/blk.h rq_atomic_flags enum
2975 */
2976 BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
2977 (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
2978
2979 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 3186 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2980 blk_mq_hctx_notify_dead); 3187 blk_mq_hctx_notify_dead);
2981 return 0; 3188 return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf62..88c558f71819 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,20 @@ struct blk_mq_ctx {
27 struct kobject kobj; 27 struct kobject kobj;
28} ____cacheline_aligned_in_smp; 28} ____cacheline_aligned_in_smp;
29 29
30/*
31 * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
32 * and the upper bits the generation number.
33 */
34enum mq_rq_state {
35 MQ_RQ_IDLE = 0,
36 MQ_RQ_IN_FLIGHT = 1,
37 MQ_RQ_COMPLETE = 2,
38
39 MQ_RQ_STATE_BITS = 2,
40 MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
41 MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
42};
43
30void blk_mq_freeze_queue(struct request_queue *q); 44void blk_mq_freeze_queue(struct request_queue *q);
31void blk_mq_free_queue(struct request_queue *q); 45void blk_mq_free_queue(struct request_queue *q);
32int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 46int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
60void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 74void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
61 struct list_head *list); 75 struct list_head *list);
62 76
77/* Used by blk_insert_cloned_request() to issue request directly */
78blk_status_t blk_mq_request_issue_directly(struct request *rq);
79
63/* 80/*
64 * CPU -> queue mappings 81 * CPU -> queue mappings
65 */ 82 */
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
81extern void blk_mq_sysfs_unregister(struct request_queue *q); 98extern void blk_mq_sysfs_unregister(struct request_queue *q);
82extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); 99extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
83 100
84extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
85
86void blk_mq_release(struct request_queue *q); 101void blk_mq_release(struct request_queue *q);
87 102
103/**
104 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
105 * @rq: target request.
106 */
107static inline int blk_mq_rq_state(struct request *rq)
108{
109 return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
110}
111
112/**
113 * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
114 * @rq: target request.
115 * @state: new state to set.
116 *
117 * Set @rq's state to @state. The caller is responsible for ensuring that
118 * there are no other updaters. A request can transition into IN_FLIGHT
119 * only from IDLE and doing so increments the generation number.
120 */
121static inline void blk_mq_rq_update_state(struct request *rq,
122 enum mq_rq_state state)
123{
124 u64 old_val = READ_ONCE(rq->gstate);
125 u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
126
127 if (state == MQ_RQ_IN_FLIGHT) {
128 WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
129 new_val += MQ_RQ_GEN_INC;
130 }
131
132 /* avoid exposing interim values */
133 WRITE_ONCE(rq->gstate, new_val);
134}
135
88static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 136static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
89 unsigned int cpu) 137 unsigned int cpu)
90{ 138{
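blk-mq.h now packs a request's MQ_RQ_* state into the low two bits of rq->gstate and a generation counter into the remaining bits, bumping the generation on every IDLE to IN_FLIGHT transition so the reworked timeout code can tell a reused request from the one it armed a timer for. A standalone sketch of that encoding follows (plain loads and stores standing in for READ_ONCE()/WRITE_ONCE(), illustrative names only):

/*
 * Standalone sketch of the gstate encoding above: low two bits hold the
 * state, the rest a generation counter bumped on each IDLE -> IN_FLIGHT
 * transition.
 */
#include <stdint.h>
#include <stdio.h>

enum rq_state { RQ_IDLE = 0, RQ_IN_FLIGHT = 1, RQ_COMPLETE = 2 };

#define RQ_STATE_BITS	2
#define RQ_STATE_MASK	((1u << RQ_STATE_BITS) - 1)
#define RQ_GEN_INC	(1u << RQ_STATE_BITS)

static inline unsigned int rq_state(uint64_t gstate)
{
	return gstate & RQ_STATE_MASK;
}

static inline uint64_t rq_gen(uint64_t gstate)
{
	return gstate >> RQ_STATE_BITS;
}

static uint64_t rq_update_state(uint64_t gstate, enum rq_state state)
{
	uint64_t new_val = (gstate & ~(uint64_t)RQ_STATE_MASK) | state;

	/* Only the IDLE -> IN_FLIGHT transition starts a new generation. */
	if (state == RQ_IN_FLIGHT)
		new_val += RQ_GEN_INC;
	return new_val;
}

int main(void)
{
	uint64_t g = 0;	/* IDLE, generation 0 */

	g = rq_update_state(g, RQ_IN_FLIGHT);
	g = rq_update_state(g, RQ_COMPLETE);
	g = rq_update_state(g, RQ_IDLE);
	g = rq_update_state(g, RQ_IN_FLIGHT);	/* request reused */

	printf("state=%u generation=%llu\n", rq_state(g),
	       (unsigned long long)rq_gen(g));	/* state=1 generation=2 */
	return 0;
}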
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 870484eaed1f..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
853 .release = blk_release_queue, 853 .release = blk_release_queue,
854}; 854};
855 855
856/**
857 * blk_register_queue - register a block layer queue with sysfs
858 * @disk: Disk of which the request queue should be registered with sysfs.
859 */
856int blk_register_queue(struct gendisk *disk) 860int blk_register_queue(struct gendisk *disk)
857{ 861{
858 int ret; 862 int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
909 if (q->request_fn || (q->mq_ops && q->elevator)) { 913 if (q->request_fn || (q->mq_ops && q->elevator)) {
910 ret = elv_register_queue(q); 914 ret = elv_register_queue(q);
911 if (ret) { 915 if (ret) {
916 mutex_unlock(&q->sysfs_lock);
912 kobject_uevent(&q->kobj, KOBJ_REMOVE); 917 kobject_uevent(&q->kobj, KOBJ_REMOVE);
913 kobject_del(&q->kobj); 918 kobject_del(&q->kobj);
914 blk_trace_remove_sysfs(dev); 919 blk_trace_remove_sysfs(dev);
915 kobject_put(&dev->kobj); 920 kobject_put(&dev->kobj);
916 goto unlock; 921 return ret;
917 } 922 }
918 } 923 }
919 ret = 0; 924 ret = 0;
@@ -921,7 +926,15 @@ unlock:
921 mutex_unlock(&q->sysfs_lock); 926 mutex_unlock(&q->sysfs_lock);
922 return ret; 927 return ret;
923} 928}
929EXPORT_SYMBOL_GPL(blk_register_queue);
924 930
931/**
932 * blk_unregister_queue - counterpart of blk_register_queue()
933 * @disk: Disk of which the request queue should be unregistered from sysfs.
934 *
935 * Note: the caller is responsible for guaranteeing that this function is called
936 * after blk_register_queue() has finished.
937 */
925void blk_unregister_queue(struct gendisk *disk) 938void blk_unregister_queue(struct gendisk *disk)
926{ 939{
927 struct request_queue *q = disk->queue; 940 struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
929 if (WARN_ON(!q)) 942 if (WARN_ON(!q))
930 return; 943 return;
931 944
932 mutex_lock(&q->sysfs_lock); 945 /* Return early if disk->queue was never registered. */
933 queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); 946 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
934 mutex_unlock(&q->sysfs_lock); 947 return;
935 948
936 wbt_exit(q); 949 /*
950 * Since sysfs_remove_dir() prevents adding new directory entries
951 * before removal of existing entries starts, protect against
952 * concurrent elv_iosched_store() calls.
953 */
954 mutex_lock(&q->sysfs_lock);
937 955
956 spin_lock_irq(q->queue_lock);
957 queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
958 spin_unlock_irq(q->queue_lock);
938 959
960 /*
961 * Remove the sysfs attributes before unregistering the queue data
962 * structures that can be modified through sysfs.
963 */
939 if (q->mq_ops) 964 if (q->mq_ops)
940 blk_mq_unregister_dev(disk_to_dev(disk), q); 965 blk_mq_unregister_dev(disk_to_dev(disk), q);
941 966 mutex_unlock(&q->sysfs_lock);
942 if (q->request_fn || (q->mq_ops && q->elevator))
943 elv_unregister_queue(q);
944 967
945 kobject_uevent(&q->kobj, KOBJ_REMOVE); 968 kobject_uevent(&q->kobj, KOBJ_REMOVE);
946 kobject_del(&q->kobj); 969 kobject_del(&q->kobj);
947 blk_trace_remove_sysfs(disk_to_dev(disk)); 970 blk_trace_remove_sysfs(disk_to_dev(disk));
971
972 wbt_exit(q);
973
974 mutex_lock(&q->sysfs_lock);
975 if (q->request_fn || (q->mq_ops && q->elevator))
976 elv_unregister_queue(q);
977 mutex_unlock(&q->sysfs_lock);
978
948 kobject_put(&disk_to_dev(disk)->kobj); 979 kobject_put(&disk_to_dev(disk)->kobj);
949} 980}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d19f416d6101..c5a131673733 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,9 +216,9 @@ struct throtl_data
216 216
217 unsigned int scale; 217 unsigned int scale;
218 218
219 struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; 219 struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
220 struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; 220 struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
221 struct latency_bucket __percpu *latency_buckets; 221 struct latency_bucket __percpu *latency_buckets[2];
222 unsigned long last_calculate_time; 222 unsigned long last_calculate_time;
223 unsigned long filtered_latency; 223 unsigned long filtered_latency;
224 224
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
1511 .seq_show = blkg_print_stat_bytes, 1511 .seq_show = blkg_print_stat_bytes,
1512 }, 1512 },
1513 { 1513 {
1514 .name = "throttle.io_service_bytes_recursive",
1515 .private = (unsigned long)&blkcg_policy_throtl,
1516 .seq_show = blkg_print_stat_bytes_recursive,
1517 },
1518 {
1514 .name = "throttle.io_serviced", 1519 .name = "throttle.io_serviced",
1515 .private = (unsigned long)&blkcg_policy_throtl, 1520 .private = (unsigned long)&blkcg_policy_throtl,
1516 .seq_show = blkg_print_stat_ios, 1521 .seq_show = blkg_print_stat_ios,
1517 }, 1522 },
1523 {
1524 .name = "throttle.io_serviced_recursive",
1525 .private = (unsigned long)&blkcg_policy_throtl,
1526 .seq_show = blkg_print_stat_ios_recursive,
1527 },
1518 { } /* terminate */ 1528 { } /* terminate */
1519}; 1529};
1520 1530
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
2040#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2050#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2041static void throtl_update_latency_buckets(struct throtl_data *td) 2051static void throtl_update_latency_buckets(struct throtl_data *td)
2042{ 2052{
2043 struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; 2053 struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
2044 int i, cpu; 2054 int i, cpu, rw;
2045 unsigned long last_latency = 0; 2055 unsigned long last_latency[2] = { 0 };
2046 unsigned long latency; 2056 unsigned long latency[2];
2047 2057
2048 if (!blk_queue_nonrot(td->queue)) 2058 if (!blk_queue_nonrot(td->queue))
2049 return; 2059 return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
2052 td->last_calculate_time = jiffies; 2062 td->last_calculate_time = jiffies;
2053 2063
2054 memset(avg_latency, 0, sizeof(avg_latency)); 2064 memset(avg_latency, 0, sizeof(avg_latency));
2055 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 2065 for (rw = READ; rw <= WRITE; rw++) {
2056 struct latency_bucket *tmp = &td->tmp_buckets[i]; 2066 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2057 2067 struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
2058 for_each_possible_cpu(cpu) { 2068
2059 struct latency_bucket *bucket; 2069 for_each_possible_cpu(cpu) {
2060 2070 struct latency_bucket *bucket;
2061 /* this isn't race free, but ok in practice */ 2071
2062 bucket = per_cpu_ptr(td->latency_buckets, cpu); 2072 /* this isn't race free, but ok in practice */
2063 tmp->total_latency += bucket[i].total_latency; 2073 bucket = per_cpu_ptr(td->latency_buckets[rw],
2064 tmp->samples += bucket[i].samples; 2074 cpu);
2065 bucket[i].total_latency = 0; 2075 tmp->total_latency += bucket[i].total_latency;
2066 bucket[i].samples = 0; 2076 tmp->samples += bucket[i].samples;
2067 } 2077 bucket[i].total_latency = 0;
2078 bucket[i].samples = 0;
2079 }
2068 2080
2069 if (tmp->samples >= 32) { 2081 if (tmp->samples >= 32) {
2070 int samples = tmp->samples; 2082 int samples = tmp->samples;
2071 2083
2072 latency = tmp->total_latency; 2084 latency[rw] = tmp->total_latency;
2073 2085
2074 tmp->total_latency = 0; 2086 tmp->total_latency = 0;
2075 tmp->samples = 0; 2087 tmp->samples = 0;
2076 latency /= samples; 2088 latency[rw] /= samples;
2077 if (latency == 0) 2089 if (latency[rw] == 0)
2078 continue; 2090 continue;
2079 avg_latency[i].latency = latency; 2091 avg_latency[rw][i].latency = latency[rw];
2092 }
2080 } 2093 }
2081 } 2094 }
2082 2095
2083 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 2096 for (rw = READ; rw <= WRITE; rw++) {
2084 if (!avg_latency[i].latency) { 2097 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2085 if (td->avg_buckets[i].latency < last_latency) 2098 if (!avg_latency[rw][i].latency) {
2086 td->avg_buckets[i].latency = last_latency; 2099 if (td->avg_buckets[rw][i].latency < last_latency[rw])
2087 continue; 2100 td->avg_buckets[rw][i].latency =
2088 } 2101 last_latency[rw];
2102 continue;
2103 }
2089 2104
2090 if (!td->avg_buckets[i].valid) 2105 if (!td->avg_buckets[rw][i].valid)
2091 latency = avg_latency[i].latency; 2106 latency[rw] = avg_latency[rw][i].latency;
2092 else 2107 else
2093 latency = (td->avg_buckets[i].latency * 7 + 2108 latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
2094 avg_latency[i].latency) >> 3; 2109 avg_latency[rw][i].latency) >> 3;
2095 2110
2096 td->avg_buckets[i].latency = max(latency, last_latency); 2111 td->avg_buckets[rw][i].latency = max(latency[rw],
2097 td->avg_buckets[i].valid = true; 2112 last_latency[rw]);
2098 last_latency = td->avg_buckets[i].latency; 2113 td->avg_buckets[rw][i].valid = true;
2114 last_latency[rw] = td->avg_buckets[rw][i].latency;
2115 }
2099 } 2116 }
2100 2117
2101 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 2118 for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
2102 throtl_log(&td->service_queue, 2119 throtl_log(&td->service_queue,
2103 "Latency bucket %d: latency=%ld, valid=%d", i, 2120 "Latency bucket %d: read latency=%ld, read valid=%d, "
2104 td->avg_buckets[i].latency, td->avg_buckets[i].valid); 2121 "write latency=%ld, write valid=%d", i,
2122 td->avg_buckets[READ][i].latency,
2123 td->avg_buckets[READ][i].valid,
2124 td->avg_buckets[WRITE][i].latency,
2125 td->avg_buckets[WRITE][i].valid);
2105} 2126}
2106#else 2127#else
2107static inline void throtl_update_latency_buckets(struct throtl_data *td) 2128static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
2242 struct latency_bucket *latency; 2263 struct latency_bucket *latency;
2243 int index; 2264 int index;
2244 2265
2245 if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || 2266 if (!td || td->limit_index != LIMIT_LOW ||
2267 !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
2246 !blk_queue_nonrot(td->queue)) 2268 !blk_queue_nonrot(td->queue))
2247 return; 2269 return;
2248 2270
2249 index = request_bucket_index(size); 2271 index = request_bucket_index(size);
2250 2272
2251 latency = get_cpu_ptr(td->latency_buckets); 2273 latency = get_cpu_ptr(td->latency_buckets[op]);
2252 latency[index].total_latency += time; 2274 latency[index].total_latency += time;
2253 latency[index].samples++; 2275 latency[index].samples++;
2254 put_cpu_ptr(td->latency_buckets); 2276 put_cpu_ptr(td->latency_buckets[op]);
2255} 2277}
2256 2278
2257void blk_throtl_stat_add(struct request *rq, u64 time_ns) 2279void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
2270 unsigned long finish_time; 2292 unsigned long finish_time;
2271 unsigned long start_time; 2293 unsigned long start_time;
2272 unsigned long lat; 2294 unsigned long lat;
2295 int rw = bio_data_dir(bio);
2273 2296
2274 tg = bio->bi_cg_private; 2297 tg = bio->bi_cg_private;
2275 if (!tg) 2298 if (!tg)
@@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
2298 2321
2299 bucket = request_bucket_index( 2322 bucket = request_bucket_index(
2300 blk_stat_size(&bio->bi_issue_stat)); 2323 blk_stat_size(&bio->bi_issue_stat));
2301 threshold = tg->td->avg_buckets[bucket].latency + 2324 threshold = tg->td->avg_buckets[rw][bucket].latency +
2302 tg->latency_target; 2325 tg->latency_target;
2303 if (lat > threshold) 2326 if (lat > threshold)
2304 tg->bad_bio_cnt++; 2327 tg->bad_bio_cnt++;
@@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
2391 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 2414 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
2392 if (!td) 2415 if (!td)
2393 return -ENOMEM; 2416 return -ENOMEM;
2394 td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * 2417 td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
2395 LATENCY_BUCKET_SIZE, __alignof__(u64)); 2418 LATENCY_BUCKET_SIZE, __alignof__(u64));
2396 if (!td->latency_buckets) { 2419 if (!td->latency_buckets[READ]) {
2420 kfree(td);
2421 return -ENOMEM;
2422 }
2423 td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
2424 LATENCY_BUCKET_SIZE, __alignof__(u64));
2425 if (!td->latency_buckets[WRITE]) {
2426 free_percpu(td->latency_buckets[READ]);
2397 kfree(td); 2427 kfree(td);
2398 return -ENOMEM; 2428 return -ENOMEM;
2399 } 2429 }
@@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
2412 /* activate policy */ 2442 /* activate policy */
2413 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 2443 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
2414 if (ret) { 2444 if (ret) {
2415 free_percpu(td->latency_buckets); 2445 free_percpu(td->latency_buckets[READ]);
2446 free_percpu(td->latency_buckets[WRITE]);
2416 kfree(td); 2447 kfree(td);
2417 } 2448 }
2418 return ret; 2449 return ret;
@@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
2423 BUG_ON(!q->td); 2454 BUG_ON(!q->td);
2424 throtl_shutdown_wq(q); 2455 throtl_shutdown_wq(q);
2425 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 2456 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
2426 free_percpu(q->td->latency_buckets); 2457 free_percpu(q->td->latency_buckets[READ]);
2458 free_percpu(q->td->latency_buckets[WRITE]);
2427 kfree(q->td); 2459 kfree(q->td);
2428} 2460}
2429 2461
@@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
2441 } else { 2473 } else {
2442 td->throtl_slice = DFL_THROTL_SLICE_HD; 2474 td->throtl_slice = DFL_THROTL_SLICE_HD;
2443 td->filtered_latency = LATENCY_FILTERED_HD; 2475 td->filtered_latency = LATENCY_FILTERED_HD;
2444 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 2476 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2445 td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; 2477 td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
2478 td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
2479 }
2446 } 2480 }
2447#ifndef CONFIG_BLK_DEV_THROTTLING_LOW 2481#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
2448 /* if no low limit, use previous default */ 2482 /* if no low limit, use previous default */
2449 td->throtl_slice = DFL_THROTL_SLICE_HD; 2483 td->throtl_slice = DFL_THROTL_SLICE_HD;
2450#endif 2484#endif
2451 2485
2452 td->track_bio_latency = !q->mq_ops && !q->request_fn; 2486 td->track_bio_latency = !queue_is_rq_based(q);
2453 if (!td->track_bio_latency) 2487 if (!td->track_bio_latency)
2454 blk_stat_enable_accounting(q); 2488 blk_stat_enable_accounting(q);
2455} 2489}
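blk-throttle now keeps separate latency buckets for reads and writes and smooths each bucket with the same 7/8-weight moving average as before, clamped so a larger-IO bucket never reports less latency than the bucket below it. A standalone sketch of that per-direction update (demo array names, with a non-zero average standing in for the original 'valid' flag):

/*
 * Standalone sketch of per-direction bucket averaging: each (rw, bucket)
 * pair keeps a smoothed latency, updated with a 7/8-weight moving
 * average and clamped to be monotonic across buckets.
 */
#include <stdio.h>

#define NR_BUCKETS 4
enum { RD = 0, WR = 1 };

static unsigned long avg[2][NR_BUCKETS];	/* smoothed latency, usec */

static void update_buckets(const unsigned long sample[2][NR_BUCKETS])
{
	for (int rw = RD; rw <= WR; rw++) {
		unsigned long last = 0;

		for (int i = 0; i < NR_BUCKETS; i++) {
			unsigned long lat = sample[rw][i];

			if (!lat) {
				/* no samples: never fall below smaller IO */
				if (avg[rw][i] < last)
					avg[rw][i] = last;
			} else {
				if (avg[rw][i])
					lat = (avg[rw][i] * 7 + lat) >> 3;
				avg[rw][i] = lat > last ? lat : last;
			}
			last = avg[rw][i];
		}
	}
}

int main(void)
{
	unsigned long sample[2][NR_BUCKETS] = {
		[RD] = { 100, 0, 300, 250 },
		[WR] = { 400, 500, 0, 0 },
	};

	update_buckets(sample);
	for (int i = 0; i < NR_BUCKETS; i++)
		printf("bucket %d: read=%lu write=%lu\n",
		       i, avg[RD][i], avg[WR][i]);
	return 0;
}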
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb30..a05e3676d24a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
112static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 112static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
113 unsigned int *next_set) 113 unsigned int *next_set)
114{ 114{
115 if (time_after_eq(jiffies, rq->deadline)) { 115 const unsigned long deadline = blk_rq_deadline(rq);
116
117 if (time_after_eq(jiffies, deadline)) {
116 list_del_init(&rq->timeout_list); 118 list_del_init(&rq->timeout_list);
117 119
118 /* 120 /*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
120 */ 122 */
121 if (!blk_mark_rq_complete(rq)) 123 if (!blk_mark_rq_complete(rq))
122 blk_rq_timed_out(rq); 124 blk_rq_timed_out(rq);
123 } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { 125 } else if (!*next_set || time_after(*next_timeout, deadline)) {
124 *next_timeout = rq->deadline; 126 *next_timeout = deadline;
125 *next_set = 1; 127 *next_set = 1;
126 } 128 }
127} 129}
@@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
156 */ 158 */
157void blk_abort_request(struct request *req) 159void blk_abort_request(struct request *req)
158{ 160{
159 if (blk_mark_rq_complete(req))
160 return;
161
162 if (req->q->mq_ops) { 161 if (req->q->mq_ops) {
163 blk_mq_rq_timed_out(req, false); 162 /*
163 * All we need to ensure is that timeout scan takes place
164 * immediately and that scan sees the new timeout value.
165 * No need for fancy synchronizations.
166 */
167 blk_rq_set_deadline(req, jiffies);
168 mod_timer(&req->q->timeout, 0);
164 } else { 169 } else {
170 if (blk_mark_rq_complete(req))
171 return;
165 blk_delete_timer(req); 172 blk_delete_timer(req);
166 blk_rq_timed_out(req); 173 blk_rq_timed_out(req);
167 } 174 }
@@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
208 if (!req->timeout) 215 if (!req->timeout)
209 req->timeout = q->rq_timeout; 216 req->timeout = q->rq_timeout;
210 217
211 WRITE_ONCE(req->deadline, jiffies + req->timeout); 218 blk_rq_set_deadline(req, jiffies + req->timeout);
219 req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
212 220
213 /* 221 /*
214 * Only the non-mq case needs to add the request to a protected list. 222 * Only the non-mq case needs to add the request to a protected list.
@@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)
222 * than an existing one, modify the timer. Round up to next nearest 230 * than an existing one, modify the timer. Round up to next nearest
223 * second. 231 * second.
224 */ 232 */
225 expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); 233 expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
226 234
227 if (!timer_pending(&q->timeout) || 235 if (!timer_pending(&q->timeout) ||
228 time_before(expiry, q->timeout.expires)) { 236 time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
22} 22}
23 23
24/* 24/*
25 * Return true if a request is a write requests that needs zone write locking.
26 */
27bool blk_req_needs_zone_write_lock(struct request *rq)
28{
29 if (!rq->q->seq_zones_wlock)
30 return false;
31
32 if (blk_rq_is_passthrough(rq))
33 return false;
34
35 switch (req_op(rq)) {
36 case REQ_OP_WRITE_ZEROES:
37 case REQ_OP_WRITE_SAME:
38 case REQ_OP_WRITE:
39 return blk_rq_zone_is_seq(rq);
40 default:
41 return false;
42 }
43}
44EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
45
46void __blk_req_zone_write_lock(struct request *rq)
47{
48 if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
49 rq->q->seq_zones_wlock)))
50 return;
51
52 WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
53 rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
54}
55EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
56
57void __blk_req_zone_write_unlock(struct request *rq)
58{
59 rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
60 if (rq->q->seq_zones_wlock)
61 WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
62 rq->q->seq_zones_wlock));
63}
64EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
65
66/*
25 * Check that a zone report belongs to the partition. 67 * Check that a zone report belongs to the partition.
26 * If yes, fix its start sector and write pointer, copy it in the 68 * If yes, fix its start sector and write pointer, copy it in the
27 * zone information array and return true. Return false otherwise. 69 * zone information array and return true. Return false otherwise.
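blk-zoned.c introduces a per-zone write lock built on a bitmap: a write request takes its target zone's bit with test_and_set_bit() before dispatch and clears it on completion, so at most one write per sequential zone is in flight. A standalone sketch of that locking scheme, using C11 atomic_fetch_or()/atomic_fetch_and() in place of the kernel bitops:

/*
 * Standalone sketch of a per-zone write lock: one bit per sequential
 * zone, taken with an atomic test-and-set before dispatch and cleared
 * on completion.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 8
#define BITS_PER_WORD (8 * sizeof(unsigned long))

static _Atomic unsigned long zone_wlock[(NR_ZONES + BITS_PER_WORD - 1) /
					BITS_PER_WORD];

static bool zone_write_trylock(unsigned int zone)
{
	unsigned long mask = 1UL << (zone % BITS_PER_WORD);
	unsigned long old = atomic_fetch_or(&zone_wlock[zone / BITS_PER_WORD],
					    mask);

	return !(old & mask);	/* true if we took the lock */
}

static void zone_write_unlock(unsigned int zone)
{
	unsigned long mask = 1UL << (zone % BITS_PER_WORD);

	atomic_fetch_and(&zone_wlock[zone / BITS_PER_WORD], ~mask);
}

int main(void)
{
	printf("zone 3 lock: %d\n", zone_write_trylock(3));		/* 1 */
	printf("zone 3 lock again: %d\n", zone_write_trylock(3));	/* 0 */
	zone_write_unlock(3);
	printf("zone 3 lock after unlock: %d\n", zone_write_trylock(3)); /* 1 */
	return 0;
}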
diff --git a/block/blk.h b/block/blk.h
index 442098aa9463..46db5dc83dcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes);
120void blk_account_io_done(struct request *req); 120void blk_account_io_done(struct request *req);
121 121
122/* 122/*
123 * Internal atomic flags for request handling
124 */
125enum rq_atomic_flags {
126 /*
127 * Keep these two bits first - not because we depend on the
128 * value of them, but we do depend on them being in the same
129 * byte of storage to ensure ordering on writes. Keeping them
130 * first will achieve that nicely.
131 */
132 REQ_ATOM_COMPLETE = 0,
133 REQ_ATOM_STARTED,
134
135 REQ_ATOM_POLL_SLEPT,
136};
137
138/*
139 * EH timer and IO completion will both attempt to 'grab' the request, make 123 * EH timer and IO completion will both attempt to 'grab' the request, make
140 * sure that only one of them succeeds 124 * sure that only one of them succeeds. Steal the bottom bit of the
125 * __deadline field for this.
141 */ 126 */
142static inline int blk_mark_rq_complete(struct request *rq) 127static inline int blk_mark_rq_complete(struct request *rq)
143{ 128{
144 return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 129 return test_and_set_bit(0, &rq->__deadline);
145} 130}
146 131
147static inline void blk_clear_rq_complete(struct request *rq) 132static inline void blk_clear_rq_complete(struct request *rq)
148{ 133{
149 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 134 clear_bit(0, &rq->__deadline);
135}
136
137static inline bool blk_rq_is_complete(struct request *rq)
138{
139 return test_bit(0, &rq->__deadline);
150} 140}
151 141
152/* 142/*
@@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
172 e->type->ops.sq.elevator_deactivate_req_fn(q, rq); 162 e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
173} 163}
174 164
165int elv_register_queue(struct request_queue *q);
166void elv_unregister_queue(struct request_queue *q);
167
175struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); 168struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
176 169
177#ifdef CONFIG_FAIL_IO_TIMEOUT 170#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
246} 239}
247 240
248/* 241/*
242 * Steal a bit from this field for legacy IO path atomic IO marking. Note that
243 * setting the deadline clears the bottom bit, potentially clearing the
244 * completed bit. The user has to be OK with this (current ones are fine).
245 */
246static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
247{
248 rq->__deadline = time & ~0x1UL;
249}
250
251static inline unsigned long blk_rq_deadline(struct request *rq)
252{
253 return rq->__deadline & ~0x1UL;
254}
255
256/*
249 * Internal io_context interface 257 * Internal io_context interface
250 */ 258 */
251void get_io_context(struct io_context *ioc); 259void get_io_context(struct io_context *ioc);
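blk.h replaces the REQ_ATOM_COMPLETE atomic flag with bit 0 of the request's __deadline word: jiffies deadlines never need that bit, blk_rq_set_deadline() masks it off, and blk_mark_rq_complete() claims it with test_and_set_bit(). A standalone sketch of the same bit-stealing idea (a single global word instead of a struct request, for brevity):

/*
 * Standalone sketch of stealing bit 0 of a jiffies-style deadline for a
 * "complete" marker.  As the comment above warns, storing a new deadline
 * also clears the marker.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long __deadline;

static void rq_set_deadline(unsigned long time)
{
	__deadline = time & ~0x1UL;	/* also clears the complete bit */
}

static unsigned long rq_deadline(void)
{
	return __deadline & ~0x1UL;
}

/* Returns the previous value, like test_and_set_bit(0, ...). */
static bool rq_mark_complete(void)
{
	bool was_set = __deadline & 0x1UL;

	__deadline |= 0x1UL;
	return was_set;
}

int main(void)
{
	rq_set_deadline(1000);
	printf("first mark: %d\n", rq_mark_complete());		/* 0: we won */
	printf("second mark: %d\n", rq_mark_complete());	/* 1: already done */
	printf("deadline: %lu\n", rq_deadline());		/* 1000, bit 0 masked */
	return 0;
}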
diff --git a/block/bounce.c b/block/bounce.c
index 1d05c422c932..6a3e68292273 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
113static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 113static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
114{ 114{
115 unsigned char *vfrom; 115 unsigned char *vfrom;
116 struct bio_vec tovec, *fromvec = from->bi_io_vec; 116 struct bio_vec tovec, fromvec;
117 struct bvec_iter iter; 117 struct bvec_iter iter;
118 /*
119 * The bio of @from is created by bounce, so we can iterate
120 * its bvec from start to end, but the @from->bi_iter can't be
121 * trusted because it might be changed by splitting.
122 */
123 struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
118 124
119 bio_for_each_segment(tovec, to, iter) { 125 bio_for_each_segment(tovec, to, iter) {
120 if (tovec.bv_page != fromvec->bv_page) { 126 fromvec = bio_iter_iovec(from, from_iter);
127 if (tovec.bv_page != fromvec.bv_page) {
121 /* 128 /*
122 * fromvec->bv_offset and fromvec->bv_len might have 129 * fromvec->bv_offset and fromvec->bv_len might have
123 * been modified by the block layer, so use the original 130 * been modified by the block layer, so use the original
124 * copy, bounce_copy_vec already uses tovec->bv_len 131 * copy, bounce_copy_vec already uses tovec->bv_len
125 */ 132 */
126 vfrom = page_address(fromvec->bv_page) + 133 vfrom = page_address(fromvec.bv_page) +
127 tovec.bv_offset; 134 tovec.bv_offset;
128 135
129 bounce_copy_vec(&tovec, vfrom); 136 bounce_copy_vec(&tovec, vfrom);
130 flush_dcache_page(tovec.bv_page); 137 flush_dcache_page(tovec.bv_page);
131 } 138 }
132 139 bio_advance_iter(from, &from_iter, tovec.bv_len);
133 fromvec++;
134 } 140 }
135} 141}
136 142
137static void bounce_end_io(struct bio *bio, mempool_t *pool) 143static void bounce_end_io(struct bio *bio, mempool_t *pool)
138{ 144{
139 struct bio *bio_orig = bio->bi_private; 145 struct bio *bio_orig = bio->bi_private;
140 struct bio_vec *bvec, *org_vec; 146 struct bio_vec *bvec, orig_vec;
141 int i; 147 int i;
142 int start = bio_orig->bi_iter.bi_idx; 148 struct bvec_iter orig_iter = bio_orig->bi_iter;
143 149
144 /* 150 /*
145 * free up bounce indirect pages used 151 * free up bounce indirect pages used
146 */ 152 */
147 bio_for_each_segment_all(bvec, bio, i) { 153 bio_for_each_segment_all(bvec, bio, i) {
148 org_vec = bio_orig->bi_io_vec + i + start; 154 orig_vec = bio_iter_iovec(bio_orig, orig_iter);
149 155 if (bvec->bv_page != orig_vec.bv_page) {
150 if (bvec->bv_page == org_vec->bv_page) 156 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
151 continue; 157 mempool_free(bvec->bv_page, pool);
152 158 }
153 dec_zone_page_state(bvec->bv_page, NR_BOUNCE); 159 bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
154 mempool_free(bvec->bv_page, pool);
155 } 160 }
156 161
157 bio_orig->bi_status = bio->bi_status; 162 bio_orig->bi_status = bio->bi_status;
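The bounce code above stops indexing bio_orig->bi_io_vec directly and instead walks the original bio with its own bvec_iter, advancing it by the bytes consumed from the bounce bio, which keeps working once a single bvec can cover more than one page. A standalone sketch of that iterator pattern over variable-length segments (invented seg/seg_iter types, not the kernel bvec API):

/*
 * Standalone sketch of the iterator style the bounce code switches to:
 * keep a private iterator over the source vector and advance it by the
 * bytes consumed on the other side, instead of assuming one entry per
 * step.
 */
#include <stdio.h>

struct seg { const char *name; unsigned int len; };

struct seg_iter { unsigned int idx; unsigned int done; };	/* offset in seg */

/* Return the portion of the current segment the iterator points at. */
static struct seg iter_cur(const struct seg *v, const struct seg_iter *it)
{
	struct seg s = v[it->idx];

	s.len -= it->done;
	return s;
}

static void iter_advance(const struct seg *v, struct seg_iter *it,
			 unsigned int bytes)
{
	while (bytes) {
		unsigned int left = v[it->idx].len - it->done;
		unsigned int step = bytes < left ? bytes : left;

		it->done += step;
		bytes -= step;
		if (it->done == v[it->idx].len) {
			it->idx++;
			it->done = 0;
		}
	}
}

int main(void)
{
	/* "from" has one big segment, "to" consumes it in small steps. */
	struct seg from[] = { { "A", 3072 }, { "B", 1024 } };
	unsigned int to_lens[] = { 1024, 1024, 1024, 1024 };
	struct seg_iter it = { 0, 0 };

	for (unsigned int i = 0; i < 4; i++) {
		struct seg cur = iter_cur(from, &it);

		printf("step %u copies from segment %s (%u bytes left)\n",
		       i, cur.name, cur.len);
		iter_advance(from, &it, to_lens[i]);
	}
	return 0;
}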
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 15d25ccd51a5..1474153f73e3 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -30,7 +30,7 @@
30 30
31/** 31/**
32 * bsg_teardown_job - routine to teardown a bsg job 32 * bsg_teardown_job - routine to teardown a bsg job
33 * @job: bsg_job that is to be torn down 33 * @kref: kref inside bsg_job that is to be torn down
34 */ 34 */
35static void bsg_teardown_job(struct kref *kref) 35static void bsg_teardown_job(struct kref *kref)
36{ 36{
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
251 * @name: device to give bsg device 251 * @name: device to give bsg device
252 * @job_fn: bsg job handler 252 * @job_fn: bsg job handler
253 * @dd_job_size: size of LLD data needed for each job 253 * @dd_job_size: size of LLD data needed for each job
254 * @release: @dev release function
254 */ 255 */
255struct request_queue *bsg_setup_queue(struct device *dev, const char *name, 256struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
256 bsg_job_fn *job_fn, int dd_job_size, 257 bsg_job_fn *job_fn, int dd_job_size,
diff --git a/block/bsg.c b/block/bsg.c
index 452f94f1c5d4..a1bcbb6ba50b 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -32,6 +32,9 @@
32#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" 32#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
33#define BSG_VERSION "0.4" 33#define BSG_VERSION "0.4"
34 34
35#define bsg_dbg(bd, fmt, ...) \
36 pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
37
35struct bsg_device { 38struct bsg_device {
36 struct request_queue *queue; 39 struct request_queue *queue;
37 spinlock_t lock; 40 spinlock_t lock;
@@ -55,14 +58,6 @@ enum {
55#define BSG_DEFAULT_CMDS 64 58#define BSG_DEFAULT_CMDS 64
56#define BSG_MAX_DEVS 32768 59#define BSG_MAX_DEVS 32768
57 60
58#undef BSG_DEBUG
59
60#ifdef BSG_DEBUG
61#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
62#else
63#define dprintk(fmt, args...)
64#endif
65
66static DEFINE_MUTEX(bsg_mutex); 61static DEFINE_MUTEX(bsg_mutex);
67static DEFINE_IDR(bsg_minor_idr); 62static DEFINE_IDR(bsg_minor_idr);
68 63
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
123 118
124 bc->bd = bd; 119 bc->bd = bd;
125 INIT_LIST_HEAD(&bc->list); 120 INIT_LIST_HEAD(&bc->list);
126 dprintk("%s: returning free cmd %p\n", bd->name, bc); 121 bsg_dbg(bd, "returning free cmd %p\n", bc);
127 return bc; 122 return bc;
128out: 123out:
129 spin_unlock_irq(&bd->lock); 124 spin_unlock_irq(&bd->lock);
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
222 if (!bcd->class_dev) 217 if (!bcd->class_dev)
223 return ERR_PTR(-ENXIO); 218 return ERR_PTR(-ENXIO);
224 219
225 dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, 220 bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
221 (unsigned long long) hdr->dout_xferp,
226 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, 222 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
227 hdr->din_xfer_len); 223 hdr->din_xfer_len);
228 224
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
299 struct bsg_device *bd = bc->bd; 295 struct bsg_device *bd = bc->bd;
300 unsigned long flags; 296 unsigned long flags;
301 297
302 dprintk("%s: finished rq %p bc %p, bio %p\n", 298 bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
303 bd->name, rq, bc, bc->bio); 299 rq, bc, bc->bio);
304 300
305 bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); 301 bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
306 302
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
333 list_add_tail(&bc->list, &bd->busy_list); 329 list_add_tail(&bc->list, &bd->busy_list);
334 spin_unlock_irq(&bd->lock); 330 spin_unlock_irq(&bd->lock);
335 331
336 dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); 332 bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
337 333
338 rq->end_io_data = bc; 334 rq->end_io_data = bc;
339 blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); 335 blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
379 } 375 }
380 } while (1); 376 } while (1);
381 377
382 dprintk("%s: returning done %p\n", bd->name, bc); 378 bsg_dbg(bd, "returning done %p\n", bc);
383 379
384 return bc; 380 return bc;
385} 381}
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
390 struct scsi_request *req = scsi_req(rq); 386 struct scsi_request *req = scsi_req(rq);
391 int ret = 0; 387 int ret = 0;
392 388
393 dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); 389 pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
394 /* 390 /*
395 * fill in all the output members 391 * fill in all the output members
396 */ 392 */
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
469 struct bsg_command *bc; 465 struct bsg_command *bc;
470 int ret, tret; 466 int ret, tret;
471 467
472 dprintk("%s: entered\n", bd->name); 468 bsg_dbg(bd, "entered\n");
473 469
474 /* 470 /*
475 * wait for all commands to complete 471 * wait for all commands to complete
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
572 int ret; 568 int ret;
573 ssize_t bytes_read; 569 ssize_t bytes_read;
574 570
575 dprintk("%s: read %zd bytes\n", bd->name, count); 571 bsg_dbg(bd, "read %zd bytes\n", count);
576 572
577 bsg_set_block(bd, file); 573 bsg_set_block(bd, file);
578 574
@@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
646 ssize_t bytes_written; 642 ssize_t bytes_written;
647 int ret; 643 int ret;
648 644
649 dprintk("%s: write %zd bytes\n", bd->name, count); 645 bsg_dbg(bd, "write %zd bytes\n", count);
650 646
651 if (unlikely(uaccess_kernel())) 647 if (unlikely(uaccess_kernel()))
652 return -EINVAL; 648 return -EINVAL;
@@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
664 if (!bytes_written || err_block_err(ret)) 660 if (!bytes_written || err_block_err(ret))
665 bytes_written = ret; 661 bytes_written = ret;
666 662
667 dprintk("%s: returning %zd\n", bd->name, bytes_written); 663 bsg_dbg(bd, "returning %zd\n", bytes_written);
668 return bytes_written; 664 return bytes_written;
669} 665}
670 666
@@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
717 hlist_del(&bd->dev_list); 713 hlist_del(&bd->dev_list);
718 mutex_unlock(&bsg_mutex); 714 mutex_unlock(&bsg_mutex);
719 715
720 dprintk("%s: tearing down\n", bd->name); 716 bsg_dbg(bd, "tearing down\n");
721 717
722 /* 718 /*
723 * close can always block 719 * close can always block
@@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
744 struct file *file) 740 struct file *file)
745{ 741{
746 struct bsg_device *bd; 742 struct bsg_device *bd;
747#ifdef BSG_DEBUG
748 unsigned char buf[32]; 743 unsigned char buf[32];
749#endif
750 744
751 if (!blk_queue_scsi_passthrough(rq)) { 745 if (!blk_queue_scsi_passthrough(rq)) {
752 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); 746 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
771 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); 765 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
772 766
773 strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); 767 strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
774 dprintk("bound to <%s>, max queue %d\n", 768 bsg_dbg(bd, "bound to <%s>, max queue %d\n",
775 format_dev_t(buf, inode->i_rdev), bd->max_queue); 769 format_dev_t(buf, inode->i_rdev), bd->max_queue);
776 770
777 mutex_unlock(&bsg_mutex); 771 mutex_unlock(&bsg_mutex);
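bsg's hand-rolled dprintk(), gated by a local BSG_DEBUG define, is replaced above by a bsg_dbg() macro built on pr_debug(), so the messages participate in dynamic debug. A standalone sketch of the same macro shape, with fprintf() under a DEBUG define standing in for pr_debug() (compile with -DDEBUG to see output):

/*
 * Standalone sketch of a per-device debug macro: prefixes the device
 * name and compiles away to nothing unless debugging is enabled.
 */
#include <stdio.h>

struct bsg_device { char name[32]; };

#ifdef DEBUG
#define bsg_dbg(bd, fmt, ...) \
	fprintf(stderr, "%s: " fmt, (bd)->name, ##__VA_ARGS__)
#else
#define bsg_dbg(bd, fmt, ...) do { } while (0)
#endif

int main(void)
{
	struct bsg_device bd = { "bsg0" };

	bsg_dbg(&bd, "queueing rq %p\n", (void *)&bd);
	return 0;
}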
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
50 int front_merges; 50 int front_merges;
51}; 51};
52 52
53static void deadline_move_request(struct deadline_data *, struct request *);
54
55static inline struct rb_root * 53static inline struct rb_root *
56deadline_rb_root(struct deadline_data *dd, struct request *rq) 54deadline_rb_root(struct deadline_data *dd, struct request *rq)
57{ 55{
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
100 struct deadline_data *dd = q->elevator->elevator_data; 98 struct deadline_data *dd = q->elevator->elevator_data;
101 const int data_dir = rq_data_dir(rq); 99 const int data_dir = rq_data_dir(rq);
102 100
101 /*
102 * This may be a requeue of a write request that has locked its
103 * target zone. If it is the case, this releases the zone lock.
104 */
105 blk_req_zone_write_unlock(rq);
106
103 deadline_add_rq_rb(dd, rq); 107 deadline_add_rq_rb(dd, rq);
104 108
105 /* 109 /*
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
190{ 194{
191 struct request_queue *q = rq->q; 195 struct request_queue *q = rq->q;
192 196
197 /*
198 * For a zoned block device, write requests must write lock their
199 * target zone.
200 */
201 blk_req_zone_write_lock(rq);
202
193 deadline_remove_request(q, rq); 203 deadline_remove_request(q, rq);
194 elv_dispatch_add_tail(q, rq); 204 elv_dispatch_add_tail(q, rq);
195} 205}
@@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
231} 241}
232 242
233/* 243/*
244 * For the specified data direction, return the next request to dispatch using
245 * arrival ordered lists.
246 */
247static struct request *
248deadline_fifo_request(struct deadline_data *dd, int data_dir)
249{
250 struct request *rq;
251
252 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
253 return NULL;
254
255 if (list_empty(&dd->fifo_list[data_dir]))
256 return NULL;
257
258 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
259 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
260 return rq;
261
262 /*
263 * Look for a write request that can be dispatched, that is one with
264 * an unlocked target zone.
265 */
266 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
267 if (blk_req_can_dispatch_to_zone(rq))
268 return rq;
269 }
270
271 return NULL;
272}
273
274/*
275 * For the specified data direction, return the next request to dispatch using
276 * sector position sorted lists.
277 */
278static struct request *
279deadline_next_request(struct deadline_data *dd, int data_dir)
280{
281 struct request *rq;
282
283 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
284 return NULL;
285
286 rq = dd->next_rq[data_dir];
287 if (!rq)
288 return NULL;
289
290 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
291 return rq;
292
293 /*
294 * Look for a write request that can be dispatched, that is one with
295 * an unlocked target zone.
296 */
297 while (rq) {
298 if (blk_req_can_dispatch_to_zone(rq))
299 return rq;
300 rq = deadline_latter_request(rq);
301 }
302
303 return NULL;
304}
305
306/*
234 * deadline_dispatch_requests selects the best request according to 307 * deadline_dispatch_requests selects the best request according to
235 * read/write expire, fifo_batch, etc 308 * read/write expire, fifo_batch, etc
236 */ 309 */
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
239 struct deadline_data *dd = q->elevator->elevator_data; 312 struct deadline_data *dd = q->elevator->elevator_data;
240 const int reads = !list_empty(&dd->fifo_list[READ]); 313 const int reads = !list_empty(&dd->fifo_list[READ]);
241 const int writes = !list_empty(&dd->fifo_list[WRITE]); 314 const int writes = !list_empty(&dd->fifo_list[WRITE]);
242 struct request *rq; 315 struct request *rq, *next_rq;
243 int data_dir; 316 int data_dir;
244 317
245 /* 318 /*
246 * batches are currently reads XOR writes 319 * batches are currently reads XOR writes
247 */ 320 */
248 if (dd->next_rq[WRITE]) 321 rq = deadline_next_request(dd, WRITE);
249 rq = dd->next_rq[WRITE]; 322 if (!rq)
250 else 323 rq = deadline_next_request(dd, READ);
251 rq = dd->next_rq[READ];
252 324
253 if (rq && dd->batching < dd->fifo_batch) 325 if (rq && dd->batching < dd->fifo_batch)
254 /* we have a next request are still entitled to batch */ 326 /* we have a next request are still entitled to batch */
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
262 if (reads) { 334 if (reads) {
263 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); 335 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
264 336
265 if (writes && (dd->starved++ >= dd->writes_starved)) 337 if (deadline_fifo_request(dd, WRITE) &&
338 (dd->starved++ >= dd->writes_starved))
266 goto dispatch_writes; 339 goto dispatch_writes;
267 340
268 data_dir = READ; 341 data_dir = READ;
@@ -291,21 +364,29 @@ dispatch_find_request:
291 /* 364 /*
292 * we are not running a batch, find best request for selected data_dir 365 * we are not running a batch, find best request for selected data_dir
293 */ 366 */
294 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { 367 next_rq = deadline_next_request(dd, data_dir);
368 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
295 /* 369 /*
296 * A deadline has expired, the last request was in the other 370 * A deadline has expired, the last request was in the other
297 * direction, or we have run out of higher-sectored requests. 371 * direction, or we have run out of higher-sectored requests.
298 * Start again from the request with the earliest expiry time. 372 * Start again from the request with the earliest expiry time.
299 */ 373 */
300 rq = rq_entry_fifo(dd->fifo_list[data_dir].next); 374 rq = deadline_fifo_request(dd, data_dir);
301 } else { 375 } else {
302 /* 376 /*
303 * The last req was the same dir and we have a next request in 377 * The last req was the same dir and we have a next request in
304 * sort order. No expired requests so continue on from here. 378 * sort order. No expired requests so continue on from here.
305 */ 379 */
306 rq = dd->next_rq[data_dir]; 380 rq = next_rq;
307 } 381 }
308 382
383 /*
384 * For a zoned block device, if we only have writes queued and none of
385 * them can be dispatched, rq will be NULL.
386 */
387 if (!rq)
388 return 0;
389
309 dd->batching = 0; 390 dd->batching = 0;
310 391
311dispatch_request: 392dispatch_request:
@@ -318,6 +399,16 @@ dispatch_request:
318 return 1; 399 return 1;
319} 400}
320 401
402/*
403 * For zoned block devices, write unlock the target zone of completed
404 * write requests.
405 */
406static void
407deadline_completed_request(struct request_queue *q, struct request *rq)
408{
409 blk_req_zone_write_unlock(rq);
410}
411
321static void deadline_exit_queue(struct elevator_queue *e) 412static void deadline_exit_queue(struct elevator_queue *e)
322{ 413{
323 struct deadline_data *dd = e->elevator_data; 414 struct deadline_data *dd = e->elevator_data;
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
439 .elevator_merged_fn = deadline_merged_request, 530 .elevator_merged_fn = deadline_merged_request,
440 .elevator_merge_req_fn = deadline_merged_requests, 531 .elevator_merge_req_fn = deadline_merged_requests,
441 .elevator_dispatch_fn = deadline_dispatch_requests, 532 .elevator_dispatch_fn = deadline_dispatch_requests,
533 .elevator_completed_req_fn = deadline_completed_request,
442 .elevator_add_req_fn = deadline_add_request, 534 .elevator_add_req_fn = deadline_add_request,
443 .elevator_former_req_fn = elv_rb_former_request, 535 .elevator_former_req_fn = elv_rb_former_request,
444 .elevator_latter_req_fn = elv_rb_latter_request, 536 .elevator_latter_req_fn = elv_rb_latter_request,
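Both deadline schedulers now pick dispatch candidates zone-aware: reads keep plain FIFO/sector order, but a write is only eligible when its target zone's write lock is free, which serializes writes per sequential zone. A standalone sketch of that FIFO selection rule (demo request/zone types, not the elevator API):

/*
 * Standalone sketch of zone-aware FIFO selection: reads dispatch in
 * arrival order, writes only if their target zone's write lock is free.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_ZONES 4

struct request {
	bool write;
	unsigned int zone;
};

static bool zone_locked[NR_ZONES];

static bool can_dispatch(const struct request *rq)
{
	return !rq->write || !zone_locked[rq->zone];
}

/* FIFO scan: first read, or first write whose zone is unlocked. */
static const struct request *fifo_pick(const struct request *fifo, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (can_dispatch(&fifo[i]))
			return &fifo[i];
	return NULL;	/* all queued writes target locked zones */
}

int main(void)
{
	struct request fifo[] = {
		{ .write = true, .zone = 1 },
		{ .write = true, .zone = 2 },
	};
	const struct request *rq;

	zone_locked[1] = true;	/* a write to zone 1 is already in flight */

	rq = fifo_pick(fifo, 2);
	if (rq)
		printf("dispatch write to zone %u\n", rq->zone);	/* zone 2 */
	return 0;
}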
diff --git a/block/elevator.c b/block/elevator.c
index 7bda083d5968..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
869 struct elevator_queue *e = q->elevator; 869 struct elevator_queue *e = q->elevator;
870 int error; 870 int error;
871 871
872 lockdep_assert_held(&q->sysfs_lock);
873
872 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 874 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
873 if (!error) { 875 if (!error) {
874 struct elv_fs_entry *attr = e->type->elevator_attrs; 876 struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
886 } 888 }
887 return error; 889 return error;
888} 890}
889EXPORT_SYMBOL(elv_register_queue);
890 891
891void elv_unregister_queue(struct request_queue *q) 892void elv_unregister_queue(struct request_queue *q)
892{ 893{
894 lockdep_assert_held(&q->sysfs_lock);
895
893 if (q) { 896 if (q) {
894 struct elevator_queue *e = q->elevator; 897 struct elevator_queue *e = q->elevator;
895 898
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
900 wbt_enable_default(q); 903 wbt_enable_default(q);
901 } 904 }
902} 905}
903EXPORT_SYMBOL(elv_unregister_queue);
904 906
905int elv_register(struct elevator_type *e) 907int elv_register(struct elevator_type *e)
906{ 908{
@@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
967{ 969{
968 int ret; 970 int ret;
969 971
972 lockdep_assert_held(&q->sysfs_lock);
973
970 blk_mq_freeze_queue(q); 974 blk_mq_freeze_queue(q);
975 blk_mq_quiesce_queue(q);
971 976
972 if (q->elevator) { 977 if (q->elevator) {
973 if (q->elevator->registered) 978 if (q->elevator->registered)
@@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
994 blk_add_trace_msg(q, "elv switch: none"); 999 blk_add_trace_msg(q, "elv switch: none");
995 1000
996out: 1001out:
1002 blk_mq_unquiesce_queue(q);
997 blk_mq_unfreeze_queue(q); 1003 blk_mq_unfreeze_queue(q);
998 return ret; 1004 return ret;
999} 1005}
@@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1010 bool old_registered = false; 1016 bool old_registered = false;
1011 int err; 1017 int err;
1012 1018
1019 lockdep_assert_held(&q->sysfs_lock);
1020
1013 if (q->mq_ops) 1021 if (q->mq_ops)
1014 return elevator_switch_mq(q, new_e); 1022 return elevator_switch_mq(q, new_e);
1015 1023
diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..88a53c188cb7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -629,16 +629,18 @@ exit:
629} 629}
630 630
631/** 631/**
632 * device_add_disk - add partitioning information to kernel list 632 * __device_add_disk - add disk information to kernel list
633 * @parent: parent device for the disk 633 * @parent: parent device for the disk
634 * @disk: per-device partitioning information 634 * @disk: per-device partitioning information
635 * @register_queue: register the queue if set to true
635 * 636 *
636 * This function registers the partitioning information in @disk 637 * This function registers the partitioning information in @disk
637 * with the kernel. 638 * with the kernel.
638 * 639 *
639 * FIXME: error handling 640 * FIXME: error handling
640 */ 641 */
641void device_add_disk(struct device *parent, struct gendisk *disk) 642static void __device_add_disk(struct device *parent, struct gendisk *disk,
643 bool register_queue)
642{ 644{
643 dev_t devt; 645 dev_t devt;
644 int retval; 646 int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
682 exact_match, exact_lock, disk); 684 exact_match, exact_lock, disk);
683 } 685 }
684 register_disk(parent, disk); 686 register_disk(parent, disk);
685 blk_register_queue(disk); 687 if (register_queue)
688 blk_register_queue(disk);
686 689
687 /* 690 /*
688 * Take an extra ref on queue which will be put on disk_release() 691 * Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
693 disk_add_events(disk); 696 disk_add_events(disk);
694 blk_integrity_add(disk); 697 blk_integrity_add(disk);
695} 698}
699
700void device_add_disk(struct device *parent, struct gendisk *disk)
701{
702 __device_add_disk(parent, disk, true);
703}
696EXPORT_SYMBOL(device_add_disk); 704EXPORT_SYMBOL(device_add_disk);
697 705
706void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
707{
708 __device_add_disk(parent, disk, false);
709}
710EXPORT_SYMBOL(device_add_disk_no_queue_reg);
711
698void del_gendisk(struct gendisk *disk) 712void del_gendisk(struct gendisk *disk)
699{ 713{
700 struct disk_part_iter piter; 714 struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
725 * Unregister bdi before releasing device numbers (as they can 739 * Unregister bdi before releasing device numbers (as they can
726 * get reused and we'd get clashes in sysfs). 740 * get reused and we'd get clashes in sysfs).
727 */ 741 */
728 bdi_unregister(disk->queue->backing_dev_info); 742 if (!(disk->flags & GENHD_FL_HIDDEN))
743 bdi_unregister(disk->queue->backing_dev_info);
729 blk_unregister_queue(disk); 744 blk_unregister_queue(disk);
730 } else { 745 } else {
731 WARN_ON(1); 746 WARN_ON(1);
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..c56f211c8440 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
59 int front_merges; 59 int front_merges;
60 60
61 spinlock_t lock; 61 spinlock_t lock;
62 spinlock_t zone_lock;
62 struct list_head dispatch; 63 struct list_head dispatch;
63}; 64};
64 65
@@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
192} 193}
193 194
194/* 195/*
196 * For the specified data direction, return the next request to
197 * dispatch using arrival ordered lists.
198 */
199static struct request *
200deadline_fifo_request(struct deadline_data *dd, int data_dir)
201{
202 struct request *rq;
203 unsigned long flags;
204
205 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
206 return NULL;
207
208 if (list_empty(&dd->fifo_list[data_dir]))
209 return NULL;
210
211 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
212 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
213 return rq;
214
215 /*
216 * Look for a write request that can be dispatched, that is one with
217 * an unlocked target zone.
218 */
219 spin_lock_irqsave(&dd->zone_lock, flags);
220 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
221 if (blk_req_can_dispatch_to_zone(rq))
222 goto out;
223 }
224 rq = NULL;
225out:
226 spin_unlock_irqrestore(&dd->zone_lock, flags);
227
228 return rq;
229}
230
231/*
232 * For the specified data direction, return the next request to
233 * dispatch using sector position sorted lists.
234 */
235static struct request *
236deadline_next_request(struct deadline_data *dd, int data_dir)
237{
238 struct request *rq;
239 unsigned long flags;
240
241 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
242 return NULL;
243
244 rq = dd->next_rq[data_dir];
245 if (!rq)
246 return NULL;
247
248 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
249 return rq;
250
251 /*
252 * Look for a write request that can be dispatched, that is one with
253 * an unlocked target zone.
254 */
255 spin_lock_irqsave(&dd->zone_lock, flags);
256 while (rq) {
257 if (blk_req_can_dispatch_to_zone(rq))
258 break;
259 rq = deadline_latter_request(rq);
260 }
261 spin_unlock_irqrestore(&dd->zone_lock, flags);
262
263 return rq;
264}
265
266/*
195 * deadline_dispatch_requests selects the best request according to 267 * deadline_dispatch_requests selects the best request according to
196 * read/write expire, fifo_batch, etc 268 * read/write expire, fifo_batch, etc
197 */ 269 */
198static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 270static struct request *__dd_dispatch_request(struct deadline_data *dd)
199{ 271{
200 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 272 struct request *rq, *next_rq;
201 struct request *rq;
202 bool reads, writes; 273 bool reads, writes;
203 int data_dir; 274 int data_dir;
204 275
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
214 /* 285 /*
215 * batches are currently reads XOR writes 286 * batches are currently reads XOR writes
216 */ 287 */
217 if (dd->next_rq[WRITE]) 288 rq = deadline_next_request(dd, WRITE);
218 rq = dd->next_rq[WRITE]; 289 if (!rq)
219 else 290 rq = deadline_next_request(dd, READ);
220 rq = dd->next_rq[READ];
221 291
222 if (rq && dd->batching < dd->fifo_batch) 292 if (rq && dd->batching < dd->fifo_batch)
 223 /* we have a next request and are still entitled to batch */ 293
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
231 if (reads) { 301 if (reads) {
232 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); 302 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
233 303
234 if (writes && (dd->starved++ >= dd->writes_starved)) 304 if (deadline_fifo_request(dd, WRITE) &&
305 (dd->starved++ >= dd->writes_starved))
235 goto dispatch_writes; 306 goto dispatch_writes;
236 307
237 data_dir = READ; 308 data_dir = READ;
@@ -260,21 +331,29 @@ dispatch_find_request:
260 /* 331 /*
261 * we are not running a batch, find best request for selected data_dir 332 * we are not running a batch, find best request for selected data_dir
262 */ 333 */
263 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { 334 next_rq = deadline_next_request(dd, data_dir);
335 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
264 /* 336 /*
265 * A deadline has expired, the last request was in the other 337 * A deadline has expired, the last request was in the other
266 * direction, or we have run out of higher-sectored requests. 338 * direction, or we have run out of higher-sectored requests.
267 * Start again from the request with the earliest expiry time. 339 * Start again from the request with the earliest expiry time.
268 */ 340 */
269 rq = rq_entry_fifo(dd->fifo_list[data_dir].next); 341 rq = deadline_fifo_request(dd, data_dir);
270 } else { 342 } else {
271 /* 343 /*
272 * The last req was the same dir and we have a next request in 344 * The last req was the same dir and we have a next request in
273 * sort order. No expired requests so continue on from here. 345 * sort order. No expired requests so continue on from here.
274 */ 346 */
275 rq = dd->next_rq[data_dir]; 347 rq = next_rq;
276 } 348 }
277 349
350 /*
351 * For a zoned block device, if we only have writes queued and none of
352 * them can be dispatched, rq will be NULL.
353 */
354 if (!rq)
355 return NULL;
356
278 dd->batching = 0; 357 dd->batching = 0;
279 358
280dispatch_request: 359dispatch_request:
@@ -284,17 +363,27 @@ dispatch_request:
284 dd->batching++; 363 dd->batching++;
285 deadline_move_request(dd, rq); 364 deadline_move_request(dd, rq);
286done: 365done:
366 /*
367 * If the request needs its target zone locked, do it.
368 */
369 blk_req_zone_write_lock(rq);
287 rq->rq_flags |= RQF_STARTED; 370 rq->rq_flags |= RQF_STARTED;
288 return rq; 371 return rq;
289} 372}
290 373
374/*
375 * One confusing aspect here is that we get called for a specific
 376 * hardware queue, but we may return a request that is for a
377 * different hardware queue. This is because mq-deadline has shared
378 * state for all hardware queues, in terms of sorting, FIFOs, etc.
379 */
291static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 380static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
292{ 381{
293 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 382 struct deadline_data *dd = hctx->queue->elevator->elevator_data;
294 struct request *rq; 383 struct request *rq;
295 384
296 spin_lock(&dd->lock); 385 spin_lock(&dd->lock);
297 rq = __dd_dispatch_request(hctx); 386 rq = __dd_dispatch_request(dd);
298 spin_unlock(&dd->lock); 387 spin_unlock(&dd->lock);
299 388
300 return rq; 389 return rq;
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
339 dd->front_merges = 1; 428 dd->front_merges = 1;
340 dd->fifo_batch = fifo_batch; 429 dd->fifo_batch = fifo_batch;
341 spin_lock_init(&dd->lock); 430 spin_lock_init(&dd->lock);
431 spin_lock_init(&dd->zone_lock);
342 INIT_LIST_HEAD(&dd->dispatch); 432 INIT_LIST_HEAD(&dd->dispatch);
343 433
344 q->elevator = eq; 434 q->elevator = eq;
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
395 struct deadline_data *dd = q->elevator->elevator_data; 485 struct deadline_data *dd = q->elevator->elevator_data;
396 const int data_dir = rq_data_dir(rq); 486 const int data_dir = rq_data_dir(rq);
397 487
488 /*
489 * This may be a requeue of a write request that has locked its
490 * target zone. If it is the case, this releases the zone lock.
491 */
492 blk_req_zone_write_unlock(rq);
493
398 if (blk_mq_sched_try_insert_merge(q, rq)) 494 if (blk_mq_sched_try_insert_merge(q, rq))
399 return; 495 return;
400 496
@@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
439 spin_unlock(&dd->lock); 535 spin_unlock(&dd->lock);
440} 536}
441 537
538/*
539 * For zoned block devices, write unlock the target zone of
540 * completed write requests. Do this while holding the zone lock
541 * spinlock so that the zone is never unlocked while deadline_fifo_request()
542 * while deadline_next_request() are executing.
543 */
544static void dd_completed_request(struct request *rq)
545{
546 struct request_queue *q = rq->q;
547
548 if (blk_queue_is_zoned(q)) {
549 struct deadline_data *dd = q->elevator->elevator_data;
550 unsigned long flags;
551
552 spin_lock_irqsave(&dd->zone_lock, flags);
553 blk_req_zone_write_unlock(rq);
554 spin_unlock_irqrestore(&dd->zone_lock, flags);
555 }
556}
557
442static bool dd_has_work(struct blk_mq_hw_ctx *hctx) 558static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
443{ 559{
444 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 560 struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
640 .ops.mq = { 756 .ops.mq = {
641 .insert_requests = dd_insert_requests, 757 .insert_requests = dd_insert_requests,
642 .dispatch_request = dd_dispatch_request, 758 .dispatch_request = dd_dispatch_request,
759 .completed_request = dd_completed_request,
643 .next_request = elv_rb_latter_request, 760 .next_request = elv_rb_latter_request,
644 .former_request = elv_rb_former_request, 761 .former_request = elv_rb_former_request,
645 .bio_merge = dd_bio_merge, 762 .bio_merge = dd_bio_merge,
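The mq-deadline hunks above gate write dispatch on a per-zone write lock: a zone is locked when a write is handed out by __dd_dispatch_request() and released again in dd_completed_request(), with dd->zone_lock serializing those transitions against the FIFO and sector-sorted scans. A minimal userspace sketch of that gating follows; the fixed-size zone bitmap, pthread spinlock, and fake_request type are illustrative stand-ins, not the kernel's blk_req_zone_write_lock()/blk_req_zone_write_unlock() implementation.

/* Userspace model of per-zone write locking (illustrative only). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_request {
	int zone;			/* target zone of a write */
};

static unsigned long zone_locked;	/* one bit per zone */
static pthread_spinlock_t zone_lock;	/* models dd->zone_lock */

/* Try to take the target zone lock; a write may only dispatch on success. */
static bool zone_write_trylock(struct fake_request *rq)
{
	bool ok;

	pthread_spin_lock(&zone_lock);
	ok = !(zone_locked & (1UL << rq->zone));
	if (ok)
		zone_locked |= 1UL << rq->zone;
	pthread_spin_unlock(&zone_lock);
	return ok;
}

/* Completion path: release the zone so the next queued write can go. */
static void zone_write_unlock(struct fake_request *rq)
{
	pthread_spin_lock(&zone_lock);
	zone_locked &= ~(1UL << rq->zone);
	pthread_spin_unlock(&zone_lock);
}

int main(void)
{
	struct fake_request a = { .zone = 3 }, b = { .zone = 3 };

	pthread_spin_init(&zone_lock, PTHREAD_PROCESS_PRIVATE);

	printf("dispatch a: %s\n", zone_write_trylock(&a) ? "ok" : "deferred");
	printf("dispatch b: %s\n", zone_write_trylock(&b) ? "ok" : "deferred");
	zone_write_unlock(&a);		/* a completes */
	printf("dispatch b again: %s\n", zone_write_trylock(&b) ? "ok" : "deferred");

	pthread_spin_destroy(&zone_lock);
	return 0;
}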
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 0af3a3db6fb0..82c44f7df911 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
301 continue; 301 continue;
302 bsd_start = le32_to_cpu(p->p_offset); 302 bsd_start = le32_to_cpu(p->p_offset);
303 bsd_size = le32_to_cpu(p->p_size); 303 bsd_size = le32_to_cpu(p->p_size);
304 if (memcmp(flavour, "bsd\0", 4) == 0) 304 /* FreeBSD has relative offset if C partition offset is zero */
305 if (memcmp(flavour, "bsd\0", 4) == 0 &&
306 le32_to_cpu(l->d_partitions[2].p_offset) == 0)
305 bsd_start += offset; 307 bsd_start += offset;
306 if (offset == bsd_start && size == bsd_size) 308 if (offset == bsd_start && size == bsd_size)
307 /* full parent partition, we have it already */ 309 /* full parent partition, we have it already */
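The parse_bsd() change above only adds the parent slice offset when the label flavour is "bsd" and the whole-disk 'c' partition starts at sector 0, i.e. when a FreeBSD label stores relative offsets. A small standalone C illustration of that rule, with made-up sample values:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * FreeBSD ("bsd") disklabels store partition offsets relative to the
 * slice when the whole-disk 'c' partition starts at 0; otherwise the
 * offsets are already absolute.  Mirrors the check in the hunk above.
 */
static uint32_t bsd_abs_start(const char *flavour, uint32_t c_offset,
			      uint32_t p_offset, uint32_t slice_offset)
{
	uint32_t start = p_offset;

	if (memcmp(flavour, "bsd\0", 4) == 0 && c_offset == 0)
		start += slice_offset;
	return start;
}

int main(void)
{
	/* made-up sample: slice at LBA 2048, partition 64 sectors in */
	printf("%u\n", (unsigned)bsd_abs_start("bsd", 0, 64, 2048));	/* 2112 */
	printf("%u\n", (unsigned)bsd_abs_start("bsd", 2048, 2112, 2048)); /* 2112 */
	return 0;
}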
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index edcfff974527..60b471f8621b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -384,9 +384,10 @@ out_put_request:
384 384
385/** 385/**
386 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl 386 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
387 * @file: file this ioctl operates on (optional)
388 * @q: request queue to send scsi commands down 387 * @q: request queue to send scsi commands down
 389 * @disk: gendisk to operate on (optional) 388 * @disk: gendisk to operate on (optional)
389 * @mode: mode used to open the file through which the ioctl has been
390 * submitted
390 * @sic: userspace structure describing the command to perform 391 * @sic: userspace structure describing the command to perform
391 * 392 *
392 * Send down the scsi command described by @sic to the device below 393 * Send down the scsi command described by @sic to the device below
@@ -415,10 +416,10 @@ out_put_request:
415 * Positive numbers returned are the compacted SCSI error codes (4 416 * Positive numbers returned are the compacted SCSI error codes (4
416 * bytes in one int) where the lowest byte is the SCSI status. 417 * bytes in one int) where the lowest byte is the SCSI status.
417 */ 418 */
418#define OMAX_SB_LEN 16 /* For backward compatibility */
419int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, 419int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
420 struct scsi_ioctl_command __user *sic) 420 struct scsi_ioctl_command __user *sic)
421{ 421{
422 enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
422 struct request *rq; 423 struct request *rq;
423 struct scsi_request *req; 424 struct scsi_request *req;
424 int err; 425 int err;
@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
692 if (bd && bd == bd->bd_contains) 693 if (bd && bd == bd->bd_contains)
693 return 0; 694 return 0;
694 695
695 /* Actually none of these is particularly useful on a partition,
696 * but they are safe.
697 */
698 switch (cmd) {
699 case SCSI_IOCTL_GET_IDLUN:
700 case SCSI_IOCTL_GET_BUS_NUMBER:
701 case SCSI_IOCTL_GET_PCI:
702 case SCSI_IOCTL_PROBE_HOST:
703 case SG_GET_VERSION_NUM:
704 case SG_SET_TIMEOUT:
705 case SG_GET_TIMEOUT:
706 case SG_GET_RESERVED_SIZE:
707 case SG_SET_RESERVED_SIZE:
708 case SG_EMULATED_HOST:
709 return 0;
710 case CDROM_GET_CAPABILITY:
711 /* Keep this until we remove the printk below. udev sends it
712 * and we do not want to spam dmesg about it. CD-ROMs do
713 * not have partitions, so we get here only for disks.
714 */
715 return -ENOIOCTLCMD;
716 default:
717 break;
718 }
719
720 if (capable(CAP_SYS_RAWIO)) 696 if (capable(CAP_SYS_RAWIO))
721 return 0; 697 return 0;
722 698
723 /* In particular, rule out all resets and host-specific ioctls. */
724 printk_ratelimited(KERN_WARNING
725 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
726
727 return -ENOIOCTLCMD; 699 return -ENOIOCTLCMD;
728} 700}
729EXPORT_SYMBOL(scsi_verify_blk_ioctl); 701EXPORT_SYMBOL(scsi_verify_blk_ioctl);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index f7911963bb79..20360e040425 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -106,6 +106,7 @@ config CRYPTO_KPP
106config CRYPTO_ACOMP2 106config CRYPTO_ACOMP2
107 tristate 107 tristate
108 select CRYPTO_ALGAPI2 108 select CRYPTO_ALGAPI2
109 select SGL_ALLOC
109 110
110config CRYPTO_ACOMP 111config CRYPTO_ACOMP
111 tristate 112 tristate
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 2075e2c4e7df..968bbcf65c94 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
140 return ret; 140 return ret;
141} 141}
142 142
143static void crypto_scomp_sg_free(struct scatterlist *sgl)
144{
145 int i, n;
146 struct page *page;
147
148 if (!sgl)
149 return;
150
151 n = sg_nents(sgl);
152 for_each_sg(sgl, sgl, n, i) {
153 page = sg_page(sgl);
154 if (page)
155 __free_page(page);
156 }
157
158 kfree(sgl);
159}
160
161static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp)
162{
163 struct scatterlist *sgl;
164 struct page *page;
165 int i, n;
166
167 n = ((size - 1) >> PAGE_SHIFT) + 1;
168
169 sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp);
170 if (!sgl)
171 return NULL;
172
173 sg_init_table(sgl, n);
174
175 for (i = 0; i < n; i++) {
176 page = alloc_page(gfp);
177 if (!page)
178 goto err;
179 sg_set_page(sgl + i, page, PAGE_SIZE, 0);
180 }
181
182 return sgl;
183
184err:
185 sg_mark_end(sgl + i);
186 crypto_scomp_sg_free(sgl);
187 return NULL;
188}
189
190static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) 143static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
191{ 144{
192 struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); 145 struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
220 scratch_dst, &req->dlen, *ctx); 173 scratch_dst, &req->dlen, *ctx);
221 if (!ret) { 174 if (!ret) {
222 if (!req->dst) { 175 if (!req->dst) {
223 req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC); 176 req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
224 if (!req->dst) 177 if (!req->dst)
225 goto out; 178 goto out;
226 } 179 }
@@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm)
274 227
275 crt->compress = scomp_acomp_compress; 228 crt->compress = scomp_acomp_compress;
276 crt->decompress = scomp_acomp_decompress; 229 crt->decompress = scomp_acomp_decompress;
277 crt->dst_free = crypto_scomp_sg_free; 230 crt->dst_free = sgl_free;
278 crt->reqsize = sizeof(void *); 231 crt->reqsize = sizeof(void *);
279 232
280 return 0; 233 return 0;
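The scompress.c hunks drop the driver-local scatterlist allocator in favour of the generic sgl_alloc()/sgl_free() helpers selected via SGL_ALLOC. The sketch below only models the allocation arithmetic those helpers take over (enough page-sized chunks to cover a byte count, freed as a batch); the plain pointer array is a userspace stand-in, not the kernel's struct scatterlist.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Allocate ceil(size / PAGE_SIZE) page-sized chunks, or NULL on failure. */
static void **chunks_alloc(size_t size, size_t *nents)
{
	size_t n = (size + PAGE_SIZE - 1) / PAGE_SIZE;	/* pages needed */
	void **chunks = calloc(n, sizeof(*chunks));
	size_t i;

	if (!chunks)
		return NULL;
	for (i = 0; i < n; i++) {
		chunks[i] = malloc(PAGE_SIZE);
		if (!chunks[i])
			goto err;
	}
	*nents = n;
	return chunks;
err:
	while (i--)
		free(chunks[i]);
	free(chunks);
	return NULL;
}

/* Release every chunk and the table itself. */
static void chunks_free(void **chunks, size_t nents)
{
	for (size_t i = 0; i < nents; i++)
		free(chunks[i]);
	free(chunks);
}

int main(void)
{
	size_t nents = 0;
	void **sgl = chunks_alloc(10000, &nents);	/* needs 3 pages */

	printf("entries: %zu\n", sgl ? nents : 0);
	if (sgl)
		chunks_free(sgl, nents);
	return 0;
}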
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 442e777bdfb2..728075214959 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
6619 6619
6620#ifdef DAC960_GAM_MINOR 6620#ifdef DAC960_GAM_MINOR
6621 6621
6622/* 6622static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo)
6623 * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
6624*/
6625
6626static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6627 unsigned long Argument)
6628{ 6623{
6629 long ErrorCode = 0;
6630 if (!capable(CAP_SYS_ADMIN)) return -EACCES;
6631
6632 mutex_lock(&DAC960_mutex);
6633 switch (Request)
6634 {
6635 case DAC960_IOCTL_GET_CONTROLLER_COUNT:
6636 ErrorCode = DAC960_ControllerCount;
6637 break;
6638 case DAC960_IOCTL_GET_CONTROLLER_INFO:
6639 {
6640 DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
6641 (DAC960_ControllerInfo_T __user *) Argument;
6642 DAC960_ControllerInfo_T ControllerInfo; 6624 DAC960_ControllerInfo_T ControllerInfo;
6643 DAC960_Controller_T *Controller; 6625 DAC960_Controller_T *Controller;
6644 int ControllerNumber; 6626 int ControllerNumber;
6627 long ErrorCode;
6628
6645 if (UserSpaceControllerInfo == NULL) 6629 if (UserSpaceControllerInfo == NULL)
6646 ErrorCode = -EINVAL; 6630 ErrorCode = -EINVAL;
6647 else ErrorCode = get_user(ControllerNumber, 6631 else ErrorCode = get_user(ControllerNumber,
6648 &UserSpaceControllerInfo->ControllerNumber); 6632 &UserSpaceControllerInfo->ControllerNumber);
6649 if (ErrorCode != 0) 6633 if (ErrorCode != 0)
6650 break; 6634 goto out;
6651 ErrorCode = -ENXIO; 6635 ErrorCode = -ENXIO;
6652 if (ControllerNumber < 0 || 6636 if (ControllerNumber < 0 ||
6653 ControllerNumber > DAC960_ControllerCount - 1) { 6637 ControllerNumber > DAC960_ControllerCount - 1) {
6654 break; 6638 goto out;
6655 } 6639 }
6656 Controller = DAC960_Controllers[ControllerNumber]; 6640 Controller = DAC960_Controllers[ControllerNumber];
6657 if (Controller == NULL) 6641 if (Controller == NULL)
6658 break; 6642 goto out;
6659 memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); 6643 memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
6660 ControllerInfo.ControllerNumber = ControllerNumber; 6644 ControllerInfo.ControllerNumber = ControllerNumber;
6661 ControllerInfo.FirmwareType = Controller->FirmwareType; 6645 ControllerInfo.FirmwareType = Controller->FirmwareType;
@@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6670 strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); 6654 strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
6671 ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, 6655 ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
6672 sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); 6656 sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
6673 break; 6657out:
6674 } 6658 return ErrorCode;
6675 case DAC960_IOCTL_V1_EXECUTE_COMMAND: 6659}
6676 { 6660
6677 DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = 6661static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand)
6678 (DAC960_V1_UserCommand_T __user *) Argument; 6662{
6679 DAC960_V1_UserCommand_T UserCommand; 6663 DAC960_V1_UserCommand_T UserCommand;
6680 DAC960_Controller_T *Controller; 6664 DAC960_Controller_T *Controller;
6681 DAC960_Command_T *Command = NULL; 6665 DAC960_Command_T *Command = NULL;
@@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6688 int ControllerNumber, DataTransferLength; 6672 int ControllerNumber, DataTransferLength;
6689 unsigned char *DataTransferBuffer = NULL; 6673 unsigned char *DataTransferBuffer = NULL;
6690 dma_addr_t DataTransferBufferDMA; 6674 dma_addr_t DataTransferBufferDMA;
6675 long ErrorCode;
6676
6691 if (UserSpaceUserCommand == NULL) { 6677 if (UserSpaceUserCommand == NULL) {
6692 ErrorCode = -EINVAL; 6678 ErrorCode = -EINVAL;
6693 break; 6679 goto out;
6694 } 6680 }
6695 if (copy_from_user(&UserCommand, UserSpaceUserCommand, 6681 if (copy_from_user(&UserCommand, UserSpaceUserCommand,
6696 sizeof(DAC960_V1_UserCommand_T))) { 6682 sizeof(DAC960_V1_UserCommand_T))) {
6697 ErrorCode = -EFAULT; 6683 ErrorCode = -EFAULT;
6698 break; 6684 goto out;
6699 } 6685 }
6700 ControllerNumber = UserCommand.ControllerNumber; 6686 ControllerNumber = UserCommand.ControllerNumber;
6701 ErrorCode = -ENXIO; 6687 ErrorCode = -ENXIO;
6702 if (ControllerNumber < 0 || 6688 if (ControllerNumber < 0 ||
6703 ControllerNumber > DAC960_ControllerCount - 1) 6689 ControllerNumber > DAC960_ControllerCount - 1)
6704 break; 6690 goto out;
6705 Controller = DAC960_Controllers[ControllerNumber]; 6691 Controller = DAC960_Controllers[ControllerNumber];
6706 if (Controller == NULL) 6692 if (Controller == NULL)
6707 break; 6693 goto out;
6708 ErrorCode = -EINVAL; 6694 ErrorCode = -EINVAL;
6709 if (Controller->FirmwareType != DAC960_V1_Controller) 6695 if (Controller->FirmwareType != DAC960_V1_Controller)
6710 break; 6696 goto out;
6711 CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; 6697 CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
6712 DataTransferLength = UserCommand.DataTransferLength; 6698 DataTransferLength = UserCommand.DataTransferLength;
6713 if (CommandOpcode & 0x80) 6699 if (CommandOpcode & 0x80)
6714 break; 6700 goto out;
6715 if (CommandOpcode == DAC960_V1_DCDB) 6701 if (CommandOpcode == DAC960_V1_DCDB)
6716 { 6702 {
6717 if (copy_from_user(&DCDB, UserCommand.DCDB, 6703 if (copy_from_user(&DCDB, UserCommand.DCDB,
6718 sizeof(DAC960_V1_DCDB_T))) { 6704 sizeof(DAC960_V1_DCDB_T))) {
6719 ErrorCode = -EFAULT; 6705 ErrorCode = -EFAULT;
6720 break; 6706 goto out;
6721 } 6707 }
6722 if (DCDB.Channel >= DAC960_V1_MaxChannels) 6708 if (DCDB.Channel >= DAC960_V1_MaxChannels)
6723 break; 6709 goto out;
6724 if (!((DataTransferLength == 0 && 6710 if (!((DataTransferLength == 0 &&
6725 DCDB.Direction 6711 DCDB.Direction
6726 == DAC960_V1_DCDB_NoDataTransfer) || 6712 == DAC960_V1_DCDB_NoDataTransfer) ||
@@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6730 (DataTransferLength < 0 && 6716 (DataTransferLength < 0 &&
6731 DCDB.Direction 6717 DCDB.Direction
6732 == DAC960_V1_DCDB_DataTransferSystemToDevice))) 6718 == DAC960_V1_DCDB_DataTransferSystemToDevice)))
6733 break; 6719 goto out;
6734 if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) 6720 if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
6735 != abs(DataTransferLength)) 6721 != abs(DataTransferLength))
6736 break; 6722 goto out;
6737 DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, 6723 DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
6738 sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); 6724 sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
6739 if (DCDB_IOBUF == NULL) { 6725 if (DCDB_IOBUF == NULL) {
6740 ErrorCode = -ENOMEM; 6726 ErrorCode = -ENOMEM;
6741 break; 6727 goto out;
6742 } 6728 }
6743 } 6729 }
6744 ErrorCode = -ENOMEM; 6730 ErrorCode = -ENOMEM;
@@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6748 DataTransferLength, 6734 DataTransferLength,
6749 &DataTransferBufferDMA); 6735 &DataTransferBufferDMA);
6750 if (DataTransferBuffer == NULL) 6736 if (DataTransferBuffer == NULL)
6751 break; 6737 goto out;
6752 } 6738 }
6753 else if (DataTransferLength < 0) 6739 else if (DataTransferLength < 0)
6754 { 6740 {
6755 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, 6741 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
6756 -DataTransferLength, &DataTransferBufferDMA); 6742 -DataTransferLength, &DataTransferBufferDMA);
6757 if (DataTransferBuffer == NULL) 6743 if (DataTransferBuffer == NULL)
6758 break; 6744 goto out;
6759 if (copy_from_user(DataTransferBuffer, 6745 if (copy_from_user(DataTransferBuffer,
6760 UserCommand.DataTransferBuffer, 6746 UserCommand.DataTransferBuffer,
6761 -DataTransferLength)) { 6747 -DataTransferLength)) {
6762 ErrorCode = -EFAULT; 6748 ErrorCode = -EFAULT;
6763 break; 6749 goto out;
6764 } 6750 }
6765 } 6751 }
6766 if (CommandOpcode == DAC960_V1_DCDB) 6752 if (CommandOpcode == DAC960_V1_DCDB)
@@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6837 if (DCDB_IOBUF != NULL) 6823 if (DCDB_IOBUF != NULL)
6838 pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), 6824 pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
6839 DCDB_IOBUF, DCDB_IOBUFDMA); 6825 DCDB_IOBUF, DCDB_IOBUFDMA);
6840 break; 6826 out:
6841 } 6827 return ErrorCode;
6842 case DAC960_IOCTL_V2_EXECUTE_COMMAND: 6828}
6843 { 6829
6844 DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = 6830static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand)
6845 (DAC960_V2_UserCommand_T __user *) Argument; 6831{
6846 DAC960_V2_UserCommand_T UserCommand; 6832 DAC960_V2_UserCommand_T UserCommand;
6847 DAC960_Controller_T *Controller; 6833 DAC960_Controller_T *Controller;
6848 DAC960_Command_T *Command = NULL; 6834 DAC960_Command_T *Command = NULL;
@@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6855 dma_addr_t DataTransferBufferDMA; 6841 dma_addr_t DataTransferBufferDMA;
6856 unsigned char *RequestSenseBuffer = NULL; 6842 unsigned char *RequestSenseBuffer = NULL;
6857 dma_addr_t RequestSenseBufferDMA; 6843 dma_addr_t RequestSenseBufferDMA;
6844 long ErrorCode = -EINVAL;
6858 6845
6859 ErrorCode = -EINVAL;
6860 if (UserSpaceUserCommand == NULL) 6846 if (UserSpaceUserCommand == NULL)
6861 break; 6847 goto out;
6862 if (copy_from_user(&UserCommand, UserSpaceUserCommand, 6848 if (copy_from_user(&UserCommand, UserSpaceUserCommand,
6863 sizeof(DAC960_V2_UserCommand_T))) { 6849 sizeof(DAC960_V2_UserCommand_T))) {
6864 ErrorCode = -EFAULT; 6850 ErrorCode = -EFAULT;
6865 break; 6851 goto out;
6866 } 6852 }
6867 ErrorCode = -ENXIO; 6853 ErrorCode = -ENXIO;
6868 ControllerNumber = UserCommand.ControllerNumber; 6854 ControllerNumber = UserCommand.ControllerNumber;
6869 if (ControllerNumber < 0 || 6855 if (ControllerNumber < 0 ||
6870 ControllerNumber > DAC960_ControllerCount - 1) 6856 ControllerNumber > DAC960_ControllerCount - 1)
6871 break; 6857 goto out;
6872 Controller = DAC960_Controllers[ControllerNumber]; 6858 Controller = DAC960_Controllers[ControllerNumber];
6873 if (Controller == NULL) 6859 if (Controller == NULL)
6874 break; 6860 goto out;
6875 if (Controller->FirmwareType != DAC960_V2_Controller){ 6861 if (Controller->FirmwareType != DAC960_V2_Controller){
6876 ErrorCode = -EINVAL; 6862 ErrorCode = -EINVAL;
6877 break; 6863 goto out;
6878 } 6864 }
6879 DataTransferLength = UserCommand.DataTransferLength; 6865 DataTransferLength = UserCommand.DataTransferLength;
6880 ErrorCode = -ENOMEM; 6866 ErrorCode = -ENOMEM;
@@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6884 DataTransferLength, 6870 DataTransferLength,
6885 &DataTransferBufferDMA); 6871 &DataTransferBufferDMA);
6886 if (DataTransferBuffer == NULL) 6872 if (DataTransferBuffer == NULL)
6887 break; 6873 goto out;
6888 } 6874 }
6889 else if (DataTransferLength < 0) 6875 else if (DataTransferLength < 0)
6890 { 6876 {
6891 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, 6877 DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
6892 -DataTransferLength, &DataTransferBufferDMA); 6878 -DataTransferLength, &DataTransferBufferDMA);
6893 if (DataTransferBuffer == NULL) 6879 if (DataTransferBuffer == NULL)
6894 break; 6880 goto out;
6895 if (copy_from_user(DataTransferBuffer, 6881 if (copy_from_user(DataTransferBuffer,
6896 UserCommand.DataTransferBuffer, 6882 UserCommand.DataTransferBuffer,
6897 -DataTransferLength)) { 6883 -DataTransferLength)) {
@@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7001 if (RequestSenseBuffer != NULL) 6987 if (RequestSenseBuffer != NULL)
7002 pci_free_consistent(Controller->PCIDevice, RequestSenseLength, 6988 pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
7003 RequestSenseBuffer, RequestSenseBufferDMA); 6989 RequestSenseBuffer, RequestSenseBufferDMA);
7004 break; 6990out:
7005 } 6991 return ErrorCode;
7006 case DAC960_IOCTL_V2_GET_HEALTH_STATUS: 6992}
7007 { 6993
7008 DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = 6994static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus)
7009 (DAC960_V2_GetHealthStatus_T __user *) Argument; 6995{
7010 DAC960_V2_GetHealthStatus_T GetHealthStatus; 6996 DAC960_V2_GetHealthStatus_T GetHealthStatus;
7011 DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; 6997 DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
7012 DAC960_Controller_T *Controller; 6998 DAC960_Controller_T *Controller;
7013 int ControllerNumber; 6999 int ControllerNumber;
7000 long ErrorCode;
7001
7014 if (UserSpaceGetHealthStatus == NULL) { 7002 if (UserSpaceGetHealthStatus == NULL) {
7015 ErrorCode = -EINVAL; 7003 ErrorCode = -EINVAL;
7016 break; 7004 goto out;
7017 } 7005 }
7018 if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, 7006 if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
7019 sizeof(DAC960_V2_GetHealthStatus_T))) { 7007 sizeof(DAC960_V2_GetHealthStatus_T))) {
7020 ErrorCode = -EFAULT; 7008 ErrorCode = -EFAULT;
7021 break; 7009 goto out;
7022 } 7010 }
7023 ErrorCode = -ENXIO; 7011 ErrorCode = -ENXIO;
7024 ControllerNumber = GetHealthStatus.ControllerNumber; 7012 ControllerNumber = GetHealthStatus.ControllerNumber;
7025 if (ControllerNumber < 0 || 7013 if (ControllerNumber < 0 ||
7026 ControllerNumber > DAC960_ControllerCount - 1) 7014 ControllerNumber > DAC960_ControllerCount - 1)
7027 break; 7015 goto out;
7028 Controller = DAC960_Controllers[ControllerNumber]; 7016 Controller = DAC960_Controllers[ControllerNumber];
7029 if (Controller == NULL) 7017 if (Controller == NULL)
7030 break; 7018 goto out;
7031 if (Controller->FirmwareType != DAC960_V2_Controller) { 7019 if (Controller->FirmwareType != DAC960_V2_Controller) {
7032 ErrorCode = -EINVAL; 7020 ErrorCode = -EINVAL;
7033 break; 7021 goto out;
7034 } 7022 }
7035 if (copy_from_user(&HealthStatusBuffer, 7023 if (copy_from_user(&HealthStatusBuffer,
7036 GetHealthStatus.HealthStatusBuffer, 7024 GetHealthStatus.HealthStatusBuffer,
7037 sizeof(DAC960_V2_HealthStatusBuffer_T))) { 7025 sizeof(DAC960_V2_HealthStatusBuffer_T))) {
7038 ErrorCode = -EFAULT; 7026 ErrorCode = -EFAULT;
7039 break; 7027 goto out;
7040 } 7028 }
7041 ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, 7029 ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
7042 !(Controller->V2.HealthStatusBuffer->StatusChangeCounter 7030 !(Controller->V2.HealthStatusBuffer->StatusChangeCounter
@@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7046 DAC960_MonitoringTimerInterval); 7034 DAC960_MonitoringTimerInterval);
7047 if (ErrorCode == -ERESTARTSYS) { 7035 if (ErrorCode == -ERESTARTSYS) {
7048 ErrorCode = -EINTR; 7036 ErrorCode = -EINTR;
7049 break; 7037 goto out;
7050 } 7038 }
7051 if (copy_to_user(GetHealthStatus.HealthStatusBuffer, 7039 if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
7052 Controller->V2.HealthStatusBuffer, 7040 Controller->V2.HealthStatusBuffer,
@@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7054 ErrorCode = -EFAULT; 7042 ErrorCode = -EFAULT;
7055 else 7043 else
7056 ErrorCode = 0; 7044 ErrorCode = 0;
7057 } 7045
7046out:
7047 return ErrorCode;
7048}
7049
7050/*
7051 * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
7052*/
7053
7054static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7055 unsigned long Argument)
7056{
7057 long ErrorCode = 0;
7058 void __user *argp = (void __user *)Argument;
7059 if (!capable(CAP_SYS_ADMIN)) return -EACCES;
7060
7061 mutex_lock(&DAC960_mutex);
7062 switch (Request)
7063 {
7064 case DAC960_IOCTL_GET_CONTROLLER_COUNT:
7065 ErrorCode = DAC960_ControllerCount;
7066 break;
7067 case DAC960_IOCTL_GET_CONTROLLER_INFO:
7068 ErrorCode = DAC960_gam_get_controller_info(argp);
7069 break;
7070 case DAC960_IOCTL_V1_EXECUTE_COMMAND:
7071 ErrorCode = DAC960_gam_v1_execute_command(argp);
7072 break;
7073 case DAC960_IOCTL_V2_EXECUTE_COMMAND:
7074 ErrorCode = DAC960_gam_v2_execute_command(argp);
7075 break;
7076 case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
7077 ErrorCode = DAC960_gam_v2_get_health_status(argp);
7058 break; 7078 break;
7059 default: 7079 default:
7060 ErrorCode = -ENOTTY; 7080 ErrorCode = -ENOTTY;
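The DAC960 rework splits the single ioctl switch into one helper per command, trading the old switch-case 'break' exits for a 'goto out' cleanup path so each helper owns its cleanup and returns its own error code. A compact standalone example of that idiom, with hypothetical names rather than the driver's:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Shape of the per-command helpers: validate, bail to a single cleanup
 * label on error, and return the error code to the ioctl dispatcher.
 */
static long demo_command(const int *user_arg)
{
	long err;
	int *buf = NULL;

	if (!user_arg) {
		err = -EINVAL;
		goto out;
	}
	buf = malloc(sizeof(*buf));
	if (!buf) {
		err = -ENOMEM;
		goto out;
	}
	*buf = *user_arg;	/* stands in for copy_from_user() */
	err = 0;
out:
	free(buf);		/* free(NULL) is a no-op */
	return err;
}

int main(void)
{
	int v = 42;

	printf("%ld %ld\n", demo_command(&v), demo_command(NULL));
	return 0;
}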
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 40579d0cb3d1..ad9b687a236a 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
20 tristate "Null test block driver" 20 tristate "Null test block driver"
21 select CONFIGFS_FS 21 select CONFIGFS_FS
22 22
23config BLK_DEV_NULL_BLK_FAULT_INJECTION
24 bool "Support fault injection for Null test block driver"
25 depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
26
23config BLK_DEV_FD 27config BLK_DEV_FD
24 tristate "Normal floppy disk support" 28 tristate "Normal floppy disk support"
25 depends on ARCH_MAY_HAVE_PC_FDC 29 depends on ARCH_MAY_HAVE_PC_FDC
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9220f8e833d0..c0ebda1283cc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -112,8 +112,7 @@ enum frame_flags {
112struct frame { 112struct frame {
113 struct list_head head; 113 struct list_head head;
114 u32 tag; 114 u32 tag;
115 struct timeval sent; /* high-res time packet was sent */ 115 ktime_t sent; /* high-res time packet was sent */
116 u32 sent_jiffs; /* low-res jiffies-based sent time */
117 ulong waited; 116 ulong waited;
118 ulong waited_total; 117 ulong waited_total;
119 struct aoetgt *t; /* parent target I belong to */ 118 struct aoetgt *t; /* parent target I belong to */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 812fed069708..540bb60cd071 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
398 398
399 skb = skb_clone(f->skb, GFP_ATOMIC); 399 skb = skb_clone(f->skb, GFP_ATOMIC);
400 if (skb) { 400 if (skb) {
401 do_gettimeofday(&f->sent); 401 f->sent = ktime_get();
402 f->sent_jiffs = (u32) jiffies;
403 __skb_queue_head_init(&queue); 402 __skb_queue_head_init(&queue);
404 __skb_queue_tail(&queue, skb); 403 __skb_queue_tail(&queue, skb);
405 aoenet_xmit(&queue); 404 aoenet_xmit(&queue);
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
489 skb = skb_clone(skb, GFP_ATOMIC); 488 skb = skb_clone(skb, GFP_ATOMIC);
490 if (skb == NULL) 489 if (skb == NULL)
491 return; 490 return;
492 do_gettimeofday(&f->sent); 491 f->sent = ktime_get();
493 f->sent_jiffs = (u32) jiffies;
494 __skb_queue_head_init(&queue); 492 __skb_queue_head_init(&queue);
495 __skb_queue_tail(&queue, skb); 493 __skb_queue_tail(&queue, skb);
496 aoenet_xmit(&queue); 494 aoenet_xmit(&queue);
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
499static int 497static int
500tsince_hr(struct frame *f) 498tsince_hr(struct frame *f)
501{ 499{
502 struct timeval now; 500 u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
503 int n;
504 501
505 do_gettimeofday(&now); 502 /* delta is normally under 4.2 seconds, avoid 64-bit division */
506 n = now.tv_usec - f->sent.tv_usec; 503 if (likely(delta <= UINT_MAX))
507 n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; 504 return (u32)delta / NSEC_PER_USEC;
508 505
509 if (n < 0) 506 /* avoid overflow after 71 minutes */
510 n = -n; 507 if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
508 return INT_MAX;
511 509
512 /* For relatively long periods, use jiffies to avoid 510 return div_u64(delta, NSEC_PER_USEC);
513 * discrepancies caused by updates to the system time.
514 *
515 * On system with HZ of 1000, 32-bits is over 49 days
516 * worth of jiffies, or over 71 minutes worth of usecs.
517 *
518 * Jiffies overflow is handled by subtraction of unsigned ints:
519 * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
520 * $3 = 4
521 * (gdb)
522 */
523 if (n > USEC_PER_SEC / 4) {
524 n = ((u32) jiffies) - f->sent_jiffs;
525 n *= USEC_PER_SEC / HZ;
526 }
527
528 return n;
529} 511}
530 512
531static int 513static int
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
589 nf->waited = 0; 571 nf->waited = 0;
590 nf->waited_total = f->waited_total; 572 nf->waited_total = f->waited_total;
591 nf->sent = f->sent; 573 nf->sent = f->sent;
592 nf->sent_jiffs = f->sent_jiffs;
593 f->skb = skb; 574 f->skb = skb;
594 575
595 return nf; 576 return nf;
@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
633 614
634 skb = skb_clone(f->skb, GFP_ATOMIC); 615 skb = skb_clone(f->skb, GFP_ATOMIC);
635 if (skb) { 616 if (skb) {
636 do_gettimeofday(&f->sent); 617 f->sent = ktime_get();
637 f->sent_jiffs = (u32) jiffies;
638 __skb_queue_head_init(&queue); 618 __skb_queue_head_init(&queue);
639 __skb_queue_tail(&queue, skb); 619 __skb_queue_tail(&queue, skb);
640 aoenet_xmit(&queue); 620 aoenet_xmit(&queue);
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
1432 d->timer.function = rexmit_timer; 1412 d->timer.function = rexmit_timer;
1433 1413
1434 skb = skb_clone(skb, GFP_ATOMIC); 1414 skb = skb_clone(skb, GFP_ATOMIC);
1435 if (skb) { 1415 if (skb)
1436 do_gettimeofday(&f->sent); 1416 f->sent = ktime_get();
1437 f->sent_jiffs = (u32) jiffies;
1438 }
1439 1417
1440 return skb; 1418 return skb;
1441} 1419}
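With the conversion above, aoe keeps a single ktime_t timestamp per frame and computes the elapsed time in microseconds, using a cheap 32-bit division while the delta fits in 32 bits (about 4.2 seconds) and clamping at INT_MAX once the value would overflow an int (about 71 minutes). The standalone sketch below mirrors that arithmetic on top of clock_gettime(CLOCK_MONOTONIC); the helper names are illustrative only.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_USEC 1000u

/* Monotonic timestamp in nanoseconds, standing in for ktime_get(). */
static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/*
 * Elapsed microseconds since 'sent', mirroring the new tsince_hr():
 * 32-bit division while the delta fits in 32 bits, INT_MAX once the
 * result would overflow an int.
 */
static int elapsed_usecs(uint64_t sent)
{
	uint64_t delta = mono_ns() - sent;

	if (delta <= UINT_MAX)
		return (uint32_t)delta / NSEC_PER_USEC;
	if (delta > (uint64_t)INT_MAX * NSEC_PER_USEC)
		return INT_MAX;
	return (int)(delta / NSEC_PER_USEC);
}

int main(void)
{
	uint64_t sent = mono_ns();
	struct timespec nap = { .tv_sec = 0, .tv_nsec = 2000000 }; /* 2 ms */

	nanosleep(&nap, NULL);
	printf("waited ~%d us\n", elapsed_usecs(sent));
	return 0;
}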
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index bd97908c766f..9f4e6f502b84 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
953 struct drbd_bm_aio_ctx *ctx = bio->bi_private; 953 struct drbd_bm_aio_ctx *ctx = bio->bi_private;
954 struct drbd_device *device = ctx->device; 954 struct drbd_device *device = ctx->device;
955 struct drbd_bitmap *b = device->bitmap; 955 struct drbd_bitmap *b = device->bitmap;
956 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); 956 unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
957 957
958 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && 958 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
959 !bm_test_page_unchanged(b->bm_pages[idx])) 959 !bm_test_page_unchanged(b->bm_pages[idx]))
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ad0477ae820f..6655893a3a7a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -12,9 +12,9 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/blk-mq.h> 13#include <linux/blk-mq.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <linux/lightnvm.h>
16#include <linux/configfs.h> 15#include <linux/configfs.h>
17#include <linux/badblocks.h> 16#include <linux/badblocks.h>
17#include <linux/fault-inject.h>
18 18
19#define SECTOR_SHIFT 9 19#define SECTOR_SHIFT 9
20#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 20#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
@@ -27,6 +27,10 @@
27#define TICKS_PER_SEC 50ULL 27#define TICKS_PER_SEC 50ULL
28#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) 28#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
29 29
30#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
31static DECLARE_FAULT_ATTR(null_timeout_attr);
32#endif
33
30static inline u64 mb_per_tick(int mbps) 34static inline u64 mb_per_tick(int mbps)
31{ 35{
32 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); 36 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
@@ -107,7 +111,6 @@ struct nullb_device {
107 unsigned int hw_queue_depth; /* queue depth */ 111 unsigned int hw_queue_depth; /* queue depth */
108 unsigned int index; /* index of the disk, only valid with a disk */ 112 unsigned int index; /* index of the disk, only valid with a disk */
109 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ 113 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
110 bool use_lightnvm; /* register as a LightNVM device */
111 bool blocking; /* blocking blk-mq device */ 114 bool blocking; /* blocking blk-mq device */
112 bool use_per_node_hctx; /* use per-node allocation for hardware context */ 115 bool use_per_node_hctx; /* use per-node allocation for hardware context */
113 bool power; /* power on/off the device */ 116 bool power; /* power on/off the device */
@@ -121,7 +124,6 @@ struct nullb {
121 unsigned int index; 124 unsigned int index;
122 struct request_queue *q; 125 struct request_queue *q;
123 struct gendisk *disk; 126 struct gendisk *disk;
124 struct nvm_dev *ndev;
125 struct blk_mq_tag_set *tag_set; 127 struct blk_mq_tag_set *tag_set;
126 struct blk_mq_tag_set __tag_set; 128 struct blk_mq_tag_set __tag_set;
127 unsigned int queue_depth; 129 unsigned int queue_depth;
@@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list);
139static struct mutex lock; 141static struct mutex lock;
140static int null_major; 142static int null_major;
141static DEFINE_IDA(nullb_indexes); 143static DEFINE_IDA(nullb_indexes);
142static struct kmem_cache *ppa_cache;
143static struct blk_mq_tag_set tag_set; 144static struct blk_mq_tag_set tag_set;
144 145
145enum { 146enum {
@@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE;
166module_param_named(home_node, g_home_node, int, S_IRUGO); 167module_param_named(home_node, g_home_node, int, S_IRUGO);
167MODULE_PARM_DESC(home_node, "Home node for the device"); 168MODULE_PARM_DESC(home_node, "Home node for the device");
168 169
170#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
171static char g_timeout_str[80];
172module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
173#endif
174
169static int g_queue_mode = NULL_Q_MQ; 175static int g_queue_mode = NULL_Q_MQ;
170 176
171static int null_param_store_val(const char *str, int *val, int min, int max) 177static int null_param_store_val(const char *str, int *val, int min, int max)
@@ -208,10 +214,6 @@ static int nr_devices = 1;
208module_param(nr_devices, int, S_IRUGO); 214module_param(nr_devices, int, S_IRUGO);
209MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 215MODULE_PARM_DESC(nr_devices, "Number of devices to register");
210 216
211static bool g_use_lightnvm;
212module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
213MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
214
215static bool g_blocking; 217static bool g_blocking;
216module_param_named(blocking, g_blocking, bool, S_IRUGO); 218module_param_named(blocking, g_blocking, bool, S_IRUGO);
217MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 219MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint);
345NULLB_DEVICE_ATTR(irqmode, uint); 347NULLB_DEVICE_ATTR(irqmode, uint);
346NULLB_DEVICE_ATTR(hw_queue_depth, uint); 348NULLB_DEVICE_ATTR(hw_queue_depth, uint);
347NULLB_DEVICE_ATTR(index, uint); 349NULLB_DEVICE_ATTR(index, uint);
348NULLB_DEVICE_ATTR(use_lightnvm, bool);
349NULLB_DEVICE_ATTR(blocking, bool); 350NULLB_DEVICE_ATTR(blocking, bool);
350NULLB_DEVICE_ATTR(use_per_node_hctx, bool); 351NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
351NULLB_DEVICE_ATTR(memory_backed, bool); 352NULLB_DEVICE_ATTR(memory_backed, bool);
@@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = {
455 &nullb_device_attr_irqmode, 456 &nullb_device_attr_irqmode,
456 &nullb_device_attr_hw_queue_depth, 457 &nullb_device_attr_hw_queue_depth,
457 &nullb_device_attr_index, 458 &nullb_device_attr_index,
458 &nullb_device_attr_use_lightnvm,
459 &nullb_device_attr_blocking, 459 &nullb_device_attr_blocking,
460 &nullb_device_attr_use_per_node_hctx, 460 &nullb_device_attr_use_per_node_hctx,
461 &nullb_device_attr_power, 461 &nullb_device_attr_power,
@@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void)
573 dev->blocksize = g_bs; 573 dev->blocksize = g_bs;
574 dev->irqmode = g_irqmode; 574 dev->irqmode = g_irqmode;
575 dev->hw_queue_depth = g_hw_queue_depth; 575 dev->hw_queue_depth = g_hw_queue_depth;
576 dev->use_lightnvm = g_use_lightnvm;
577 dev->blocking = g_blocking; 576 dev->blocking = g_blocking;
578 dev->use_per_node_hctx = g_use_per_node_hctx; 577 dev->use_per_node_hctx = g_use_per_node_hctx;
579 return dev; 578 return dev;
@@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
1352 return BLK_QC_T_NONE; 1351 return BLK_QC_T_NONE;
1353} 1352}
1354 1353
1354static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
1355{
1356 pr_info("null: rq %p timed out\n", rq);
1357 return BLK_EH_HANDLED;
1358}
1359
1355static int null_rq_prep_fn(struct request_queue *q, struct request *req) 1360static int null_rq_prep_fn(struct request_queue *q, struct request *req)
1356{ 1361{
1357 struct nullb *nullb = q->queuedata; 1362 struct nullb *nullb = q->queuedata;
@@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
1369 return BLKPREP_DEFER; 1374 return BLKPREP_DEFER;
1370} 1375}
1371 1376
1377static bool should_timeout_request(struct request *rq)
1378{
1379#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1380 if (g_timeout_str[0])
1381 return should_fail(&null_timeout_attr, 1);
1382#endif
1383
1384 return false;
1385}
1386
1372static void null_request_fn(struct request_queue *q) 1387static void null_request_fn(struct request_queue *q)
1373{ 1388{
1374 struct request *rq; 1389 struct request *rq;
@@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q)
1376 while ((rq = blk_fetch_request(q)) != NULL) { 1391 while ((rq = blk_fetch_request(q)) != NULL) {
1377 struct nullb_cmd *cmd = rq->special; 1392 struct nullb_cmd *cmd = rq->special;
1378 1393
1379 spin_unlock_irq(q->queue_lock); 1394 if (!should_timeout_request(rq)) {
1380 null_handle_cmd(cmd); 1395 spin_unlock_irq(q->queue_lock);
1381 spin_lock_irq(q->queue_lock); 1396 null_handle_cmd(cmd);
1397 spin_lock_irq(q->queue_lock);
1398 }
1382 } 1399 }
1383} 1400}
1384 1401
1402static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
1403{
1404 pr_info("null: rq %p timed out\n", rq);
1405 return BLK_EH_HANDLED;
1406}
1407
1385static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, 1408static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1386 const struct blk_mq_queue_data *bd) 1409 const struct blk_mq_queue_data *bd)
1387{ 1410{
@@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1399 1422
1400 blk_mq_start_request(bd->rq); 1423 blk_mq_start_request(bd->rq);
1401 1424
1402 return null_handle_cmd(cmd); 1425 if (!should_timeout_request(bd->rq))
1426 return null_handle_cmd(cmd);
1427
1428 return BLK_STS_OK;
1403} 1429}
1404 1430
1405static const struct blk_mq_ops null_mq_ops = { 1431static const struct blk_mq_ops null_mq_ops = {
1406 .queue_rq = null_queue_rq, 1432 .queue_rq = null_queue_rq,
1407 .complete = null_softirq_done_fn, 1433 .complete = null_softirq_done_fn,
1434 .timeout = null_timeout_rq,
1408}; 1435};
1409 1436
1410static void cleanup_queue(struct nullb_queue *nq) 1437static void cleanup_queue(struct nullb_queue *nq)
@@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb)
1423 kfree(nullb->queues); 1450 kfree(nullb->queues);
1424} 1451}
1425 1452
1426#ifdef CONFIG_NVM
1427
1428static void null_lnvm_end_io(struct request *rq, blk_status_t status)
1429{
1430 struct nvm_rq *rqd = rq->end_io_data;
1431
1432 /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
1433 rqd->error = status ? -EIO : 0;
1434 nvm_end_io(rqd);
1435
1436 blk_put_request(rq);
1437}
1438
1439static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
1440{
1441 struct request_queue *q = dev->q;
1442 struct request *rq;
1443 struct bio *bio = rqd->bio;
1444
1445 rq = blk_mq_alloc_request(q,
1446 op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
1447 if (IS_ERR(rq))
1448 return -ENOMEM;
1449
1450 blk_init_request_from_bio(rq, bio);
1451
1452 rq->end_io_data = rqd;
1453
1454 blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
1455
1456 return 0;
1457}
1458
1459static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
1460{
1461 struct nullb *nullb = dev->q->queuedata;
1462 sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
1463 sector_t blksize;
1464 struct nvm_id_group *grp;
1465
1466 id->ver_id = 0x1;
1467 id->vmnt = 0;
1468 id->cap = 0x2;
1469 id->dom = 0x1;
1470
1471 id->ppaf.blk_offset = 0;
1472 id->ppaf.blk_len = 16;
1473 id->ppaf.pg_offset = 16;
1474 id->ppaf.pg_len = 16;
1475 id->ppaf.sect_offset = 32;
1476 id->ppaf.sect_len = 8;
1477 id->ppaf.pln_offset = 40;
1478 id->ppaf.pln_len = 8;
1479 id->ppaf.lun_offset = 48;
1480 id->ppaf.lun_len = 8;
1481 id->ppaf.ch_offset = 56;
1482 id->ppaf.ch_len = 8;
1483
1484 sector_div(size, nullb->dev->blocksize); /* convert size to pages */
1485 size >>= 8; /* concert size to pgs pr blk */
1486 grp = &id->grp;
1487 grp->mtype = 0;
1488 grp->fmtype = 0;
1489 grp->num_ch = 1;
1490 grp->num_pg = 256;
1491 blksize = size;
1492 size >>= 16;
1493 grp->num_lun = size + 1;
1494 sector_div(blksize, grp->num_lun);
1495 grp->num_blk = blksize;
1496 grp->num_pln = 1;
1497
1498 grp->fpg_sz = nullb->dev->blocksize;
1499 grp->csecs = nullb->dev->blocksize;
1500 grp->trdt = 25000;
1501 grp->trdm = 25000;
1502 grp->tprt = 500000;
1503 grp->tprm = 500000;
1504 grp->tbet = 1500000;
1505 grp->tbem = 1500000;
1506 grp->mpos = 0x010101; /* single plane rwe */
1507 grp->cpar = nullb->dev->hw_queue_depth;
1508
1509 return 0;
1510}
1511
1512static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
1513{
1514 mempool_t *virtmem_pool;
1515
1516 virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
1517 if (!virtmem_pool) {
1518 pr_err("null_blk: Unable to create virtual memory pool\n");
1519 return NULL;
1520 }
1521
1522 return virtmem_pool;
1523}
1524
1525static void null_lnvm_destroy_dma_pool(void *pool)
1526{
1527 mempool_destroy(pool);
1528}
1529
1530static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
1531 gfp_t mem_flags, dma_addr_t *dma_handler)
1532{
1533 return mempool_alloc(pool, mem_flags);
1534}
1535
1536static void null_lnvm_dev_dma_free(void *pool, void *entry,
1537 dma_addr_t dma_handler)
1538{
1539 mempool_free(entry, pool);
1540}
1541
1542static struct nvm_dev_ops null_lnvm_dev_ops = {
1543 .identity = null_lnvm_id,
1544 .submit_io = null_lnvm_submit_io,
1545
1546 .create_dma_pool = null_lnvm_create_dma_pool,
1547 .destroy_dma_pool = null_lnvm_destroy_dma_pool,
1548 .dev_dma_alloc = null_lnvm_dev_dma_alloc,
1549 .dev_dma_free = null_lnvm_dev_dma_free,
1550
1551 /* Simulate nvme protocol restriction */
1552 .max_phys_sect = 64,
1553};
1554
1555static int null_nvm_register(struct nullb *nullb)
1556{
1557 struct nvm_dev *dev;
1558 int rv;
1559
1560 dev = nvm_alloc_dev(0);
1561 if (!dev)
1562 return -ENOMEM;
1563
1564 dev->q = nullb->q;
1565 memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
1566 dev->ops = &null_lnvm_dev_ops;
1567
1568 rv = nvm_register(dev);
1569 if (rv) {
1570 kfree(dev);
1571 return rv;
1572 }
1573 nullb->ndev = dev;
1574 return 0;
1575}
1576
1577static void null_nvm_unregister(struct nullb *nullb)
1578{
1579 nvm_unregister(nullb->ndev);
1580}
1581#else
1582static int null_nvm_register(struct nullb *nullb)
1583{
1584 pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
1585 return -EINVAL;
1586}
1587static void null_nvm_unregister(struct nullb *nullb) {}
1588#endif /* CONFIG_NVM */
1589
1590static void null_del_dev(struct nullb *nullb) 1453static void null_del_dev(struct nullb *nullb)
1591{ 1454{
1592 struct nullb_device *dev = nullb->dev; 1455 struct nullb_device *dev = nullb->dev;
@@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb)
1595 1458
1596 list_del_init(&nullb->list); 1459 list_del_init(&nullb->list);
1597 1460
1598 if (dev->use_lightnvm) 1461 del_gendisk(nullb->disk);
1599 null_nvm_unregister(nullb);
1600 else
1601 del_gendisk(nullb->disk);
1602 1462
1603 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1463 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
1604 hrtimer_cancel(&nullb->bw_timer); 1464 hrtimer_cancel(&nullb->bw_timer);
@@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb)
1610 if (dev->queue_mode == NULL_Q_MQ && 1470 if (dev->queue_mode == NULL_Q_MQ &&
1611 nullb->tag_set == &nullb->__tag_set) 1471 nullb->tag_set == &nullb->__tag_set)
1612 blk_mq_free_tag_set(nullb->tag_set); 1472 blk_mq_free_tag_set(nullb->tag_set);
1613 if (!dev->use_lightnvm) 1473 put_disk(nullb->disk);
1614 put_disk(nullb->disk);
1615 cleanup_queues(nullb); 1474 cleanup_queues(nullb);
1616 if (null_cache_active(nullb)) 1475 if (null_cache_active(nullb))
1617 null_free_device_storage(nullb->dev, true); 1476 null_free_device_storage(nullb->dev, true);
@@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev)
1775{ 1634{
1776 dev->blocksize = round_down(dev->blocksize, 512); 1635 dev->blocksize = round_down(dev->blocksize, 512);
1777 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1636 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
1778 if (dev->use_lightnvm && dev->blocksize != 4096)
1779 dev->blocksize = 4096;
1780
1781 if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
1782 dev->queue_mode = NULL_Q_MQ;
1783 1637
1784 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 1638 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
1785 if (dev->submit_queues != nr_online_nodes) 1639 if (dev->submit_queues != nr_online_nodes)
@@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev)
1805 dev->mbps = 0; 1659 dev->mbps = 0;
1806} 1660}
1807 1661
1662static bool null_setup_fault(void)
1663{
1664#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1665 if (!g_timeout_str[0])
1666 return true;
1667
1668 if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
1669 return false;
1670
1671 null_timeout_attr.verbose = 0;
1672#endif
1673 return true;
1674}
1675
1808static int null_add_dev(struct nullb_device *dev) 1676static int null_add_dev(struct nullb_device *dev)
1809{ 1677{
1810 struct nullb *nullb; 1678 struct nullb *nullb;
@@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev)
1838 if (rv) 1706 if (rv)
1839 goto out_cleanup_queues; 1707 goto out_cleanup_queues;
1840 1708
1709 if (!null_setup_fault())
1710 goto out_cleanup_queues;
1711
1712 nullb->tag_set->timeout = 5 * HZ;
1841 nullb->q = blk_mq_init_queue(nullb->tag_set); 1713 nullb->q = blk_mq_init_queue(nullb->tag_set);
1842 if (IS_ERR(nullb->q)) { 1714 if (IS_ERR(nullb->q)) {
1843 rv = -ENOMEM; 1715 rv = -ENOMEM;
@@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev)
1861 rv = -ENOMEM; 1733 rv = -ENOMEM;
1862 goto out_cleanup_queues; 1734 goto out_cleanup_queues;
1863 } 1735 }
1736
1737 if (!null_setup_fault())
1738 goto out_cleanup_blk_queue;
1739
1864 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 1740 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
1865 blk_queue_softirq_done(nullb->q, null_softirq_done_fn); 1741 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
1742 blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
1743 nullb->q->rq_timeout = 5 * HZ;
1866 rv = init_driver_queues(nullb); 1744 rv = init_driver_queues(nullb);
1867 if (rv) 1745 if (rv)
1868 goto out_cleanup_blk_queue; 1746 goto out_cleanup_blk_queue;
@@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev)
1895 1773
1896 sprintf(nullb->disk_name, "nullb%d", nullb->index); 1774 sprintf(nullb->disk_name, "nullb%d", nullb->index);
1897 1775
1898 if (dev->use_lightnvm) 1776 rv = null_gendisk_register(nullb);
1899 rv = null_nvm_register(nullb);
1900 else
1901 rv = null_gendisk_register(nullb);
1902
1903 if (rv) 1777 if (rv)
1904 goto out_cleanup_blk_queue; 1778 goto out_cleanup_blk_queue;
1905 1779
@@ -1938,18 +1812,6 @@ static int __init null_init(void)
1938 g_bs = PAGE_SIZE; 1812 g_bs = PAGE_SIZE;
1939 } 1813 }
1940 1814
1941 if (g_use_lightnvm && g_bs != 4096) {
1942 pr_warn("null_blk: LightNVM only supports 4k block size\n");
1943 pr_warn("null_blk: defaults block size to 4k\n");
1944 g_bs = 4096;
1945 }
1946
1947 if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
1948 pr_warn("null_blk: LightNVM only supported for blk-mq\n");
1949 pr_warn("null_blk: defaults queue mode to blk-mq\n");
1950 g_queue_mode = NULL_Q_MQ;
1951 }
1952
1953 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { 1815 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
1954 if (g_submit_queues != nr_online_nodes) { 1816 if (g_submit_queues != nr_online_nodes) {
1955 pr_warn("null_blk: submit_queues param is set to %u.\n", 1817 pr_warn("null_blk: submit_queues param is set to %u.\n",
@@ -1982,16 +1844,6 @@ static int __init null_init(void)
1982 goto err_conf; 1844 goto err_conf;
1983 } 1845 }
1984 1846
1985 if (g_use_lightnvm) {
1986 ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
1987 0, 0, NULL);
1988 if (!ppa_cache) {
1989 pr_err("null_blk: unable to create ppa cache\n");
1990 ret = -ENOMEM;
1991 goto err_ppa;
1992 }
1993 }
1994
1995 for (i = 0; i < nr_devices; i++) { 1847 for (i = 0; i < nr_devices; i++) {
1996 dev = null_alloc_dev(); 1848 dev = null_alloc_dev();
1997 if (!dev) { 1849 if (!dev) {
@@ -2015,8 +1867,6 @@ err_dev:
2015 null_del_dev(nullb); 1867 null_del_dev(nullb);
2016 null_free_dev(dev); 1868 null_free_dev(dev);
2017 } 1869 }
2018 kmem_cache_destroy(ppa_cache);
2019err_ppa:
2020 unregister_blkdev(null_major, "nullb"); 1870 unregister_blkdev(null_major, "nullb");
2021err_conf: 1871err_conf:
2022 configfs_unregister_subsystem(&nullb_subsys); 1872 configfs_unregister_subsystem(&nullb_subsys);
@@ -2047,8 +1897,6 @@ static void __exit null_exit(void)
2047 1897
2048 if (g_queue_mode == NULL_Q_MQ && shared_tags) 1898 if (g_queue_mode == NULL_Q_MQ && shared_tags)
2049 blk_mq_free_tag_set(&tag_set); 1899 blk_mq_free_tag_set(&tag_set);
2050
2051 kmem_cache_destroy(ppa_cache);
2052} 1900}
2053 1901
2054module_init(null_init); 1902module_init(null_init);
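
The null_blk changes above wire a fault-injection attribute into the request timeout path: null_setup_fault() parses the module parameter string into null_timeout_attr, and the queue gets a 5-second rq_timeout plus a timed-out handler. A minimal sketch of how those pieces fit together is below; null_timeout_attr, g_timeout_str and null_setup_fault() are taken from the hunks above, while should_timeout_request(), the buffer size and its use in the completion path are illustrative only, not a copy of the driver.

#include <linux/fault-inject.h>
#include <linux/blkdev.h>

static DECLARE_FAULT_ATTR(null_timeout_attr);
/* "<interval>,<probability>,<space>,<times>"; buffer size illustrative */
static char g_timeout_str[80];

static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (!g_timeout_str[0])
		return true;
	if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
		return false;
	null_timeout_attr.verbose = 0;
#endif
	return true;
}

/* Illustrative: decide whether to drop a request so that the block
 * layer's timeout handling (rq_timeout = 5 * HZ above) reclaims it. */
static bool should_timeout_request(struct request *rq)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (g_timeout_str[0])
		return should_fail(&null_timeout_attr, 1);
#endif
	return false;
}

A request the driver deliberately never completes is then finished by the null_rq_timed_out_fn() handler registered above, which lets the timeout path be exercised without real hardware misbehaving.
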
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 67974796c350..531a0915066b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2579 bdev = bdget(dev); 2579 bdev = bdget(dev);
2580 if (!bdev) 2580 if (!bdev)
2581 return -ENOMEM; 2581 return -ENOMEM;
2582 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2583 if (ret)
2584 return ret;
2582 if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { 2585 if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
2583 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); 2586 WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
2584 bdput(bdev); 2587 blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
2585 return -EINVAL; 2588 return -EINVAL;
2586 } 2589 }
2587 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2588 if (ret)
2589 return ret;
2590 2590
2591 /* This is safe, since we have a reference from open(). */ 2591 /* This is safe, since we have a reference from open(). */
2592 __module_get(THIS_MODULE); 2592 __module_get(THIS_MODULE);
@@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2745 pd->pkt_dev = MKDEV(pktdev_major, idx); 2745 pd->pkt_dev = MKDEV(pktdev_major, idx);
2746 ret = pkt_new_dev(pd, dev); 2746 ret = pkt_new_dev(pd, dev);
2747 if (ret) 2747 if (ret)
2748 goto out_new_dev; 2748 goto out_mem2;
2749 2749
2750 /* inherit events of the host device */ 2750 /* inherit events of the host device */
2751 disk->events = pd->bdev->bd_disk->events; 2751 disk->events = pd->bdev->bd_disk->events;
@@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2763 mutex_unlock(&ctl_mutex); 2763 mutex_unlock(&ctl_mutex);
2764 return 0; 2764 return 0;
2765 2765
2766out_new_dev:
2767 blk_cleanup_queue(disk->queue);
2768out_mem2: 2766out_mem2:
2769 put_disk(disk); 2767 put_disk(disk);
2770out_mem: 2768out_mem:
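
The pkt_new_dev() hunk above reorders the open so the block device reference is taken before its queue is inspected, and the error path now drops that reference with the matching blkdev_put() instead of a bare bdput(). Reduced to the bare pattern it looks like the sketch below; the helper name is illustrative, the calls and mode flags are the ones used in the hunk.

#include <linux/fs.h>
#include <linux/blkdev.h>

static int pkt_claim_bdev(struct block_device *bdev)
{
	int ret;

	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
	if (ret)
		return ret;

	if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
		/* undo blkdev_get() with the same mode flags */
		blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
		return -EINVAL;
	}
	return 0;
}
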
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
deleted file mode 100644
index e5565fbaeb30..000000000000
--- a/drivers/block/smart1,2.h
+++ /dev/null
@@ -1,278 +0,0 @@
1/*
2 * Disk Array driver for Compaq SMART2 Controllers
3 * Copyright 1998 Compaq Computer Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
13 * NON INFRINGEMENT. See the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * Questions/Comments/Bugfixes to iss_storagedev@hp.com
20 *
21 * If you want to make changes, improve or add functionality to this
22 * driver, you'll probably need the Compaq Array Controller Interface
23 * Specificiation (Document number ECG086/1198)
24 */
25
26/*
27 * This file contains the controller communication implementation for
28 * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
29 * this should support:
30 *
31 * PCI:
32 * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
33 * Integerated SMART Array Controller, SMART-4200, SMART-4250ES
34 *
35 * EISA:
36 * SMART-2/E, SMART, IAES, IDA-2, IDA
37 */
38
39/*
40 * Memory mapped FIFO interface (SMART 42xx cards)
41 */
42static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
43{
44 writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
45}
46
47/*
48 * This card is the opposite of the other cards.
49 * 0 turns interrupts on...
50 * 0x08 turns them off...
51 */
52static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
53{
54 if (val)
55 { /* Turn interrupts on */
56 writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
57 } else /* Turn them off */
58 {
59 writel( S42XX_INTR_OFF,
60 h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
61 }
62}
63
64/*
65 * For older cards FIFO Full = 0.
66 * On this card 0 means there is room, anything else FIFO Full.
67 *
68 */
69static unsigned long smart4_fifo_full(ctlr_info_t *h)
70{
71
72 return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
73}
74
75/* This type of controller returns -1 if the fifo is empty,
76 * Not 0 like the others.
77 * And we need to let it know we read a value out
78 */
79static unsigned long smart4_completed(ctlr_info_t *h)
80{
81 long register_value
82 = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
83
84 /* Fifo is empty */
85 if( register_value == 0xffffffff)
86 return 0;
87
88 /* Need to let it know we got the reply */
89 /* We do this by writing a 0 to the port we just read from */
90 writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
91
92 return ((unsigned long) register_value);
93}
94
95 /*
96 * This hardware returns interrupt pending at a different place and
97 * it does not tell us if the fifo is empty, we will have check
98 * that by getting a 0 back from the command_completed call.
99 */
100static unsigned long smart4_intr_pending(ctlr_info_t *h)
101{
102 unsigned long register_value =
103 readl(h->vaddr + S42XX_INTR_STATUS);
104
105 if( register_value & S42XX_INTR_PENDING)
106 return FIFO_NOT_EMPTY;
107 return 0 ;
108}
109
110static struct access_method smart4_access = {
111 smart4_submit_command,
112 smart4_intr_mask,
113 smart4_fifo_full,
114 smart4_intr_pending,
115 smart4_completed,
116};
117
118/*
119 * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
120 */
121static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
122{
123 writel(c->busaddr, h->vaddr + COMMAND_FIFO);
124}
125
126static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
127{
128 writel(val, h->vaddr + INTR_MASK);
129}
130
131static unsigned long smart2_fifo_full(ctlr_info_t *h)
132{
133 return readl(h->vaddr + COMMAND_FIFO);
134}
135
136static unsigned long smart2_completed(ctlr_info_t *h)
137{
138 return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
139}
140
141static unsigned long smart2_intr_pending(ctlr_info_t *h)
142{
143 return readl(h->vaddr + INTR_PENDING);
144}
145
146static struct access_method smart2_access = {
147 smart2_submit_command,
148 smart2_intr_mask,
149 smart2_fifo_full,
150 smart2_intr_pending,
151 smart2_completed,
152};
153
154/*
155 * IO access for SMART-2/E cards
156 */
157static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
158{
159 outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
160}
161
162static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
163{
164 outl(val, h->io_mem_addr + INTR_MASK);
165}
166
167static unsigned long smart2e_fifo_full(ctlr_info_t *h)
168{
169 return inl(h->io_mem_addr + COMMAND_FIFO);
170}
171
172static unsigned long smart2e_completed(ctlr_info_t *h)
173{
174 return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
175}
176
177static unsigned long smart2e_intr_pending(ctlr_info_t *h)
178{
179 return inl(h->io_mem_addr + INTR_PENDING);
180}
181
182static struct access_method smart2e_access = {
183 smart2e_submit_command,
184 smart2e_intr_mask,
185 smart2e_fifo_full,
186 smart2e_intr_pending,
187 smart2e_completed,
188};
189
190/*
191 * IO access for older SMART-1 type cards
192 */
193#define SMART1_SYSTEM_MASK 0xC8E
194#define SMART1_SYSTEM_DOORBELL 0xC8F
195#define SMART1_LOCAL_MASK 0xC8C
196#define SMART1_LOCAL_DOORBELL 0xC8D
197#define SMART1_INTR_MASK 0xC89
198#define SMART1_LISTADDR 0xC90
199#define SMART1_LISTLEN 0xC94
200#define SMART1_TAG 0xC97
201#define SMART1_COMPLETE_ADDR 0xC98
202#define SMART1_LISTSTATUS 0xC9E
203
204#define CHANNEL_BUSY 0x01
205#define CHANNEL_CLEAR 0x02
206
207static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
208{
209 /*
210 * This __u16 is actually a bunch of control flags on SMART
211 * and below. We want them all to be zero.
212 */
213 c->hdr.size = 0;
214
215 outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
216
217 outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
218 outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
219
220 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
221}
222
223static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
224{
225 if (val == 1) {
226 outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
227 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
228 outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
229 outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
230 } else {
231 outb(0, h->io_mem_addr + 0xC8E);
232 }
233}
234
235static unsigned long smart1_fifo_full(ctlr_info_t *h)
236{
237 unsigned char chan;
238 chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
239 return chan;
240}
241
242static unsigned long smart1_completed(ctlr_info_t *h)
243{
244 unsigned char status;
245 unsigned long cmd;
246
247 if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
248 outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
249
250 cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
251 status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
252
253 outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
254
255 /*
256 * this is x86 (actually compaq x86) only, so it's ok
257 */
258 if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
259 } else {
260 cmd = 0;
261 }
262 return cmd;
263}
264
265static unsigned long smart1_intr_pending(ctlr_info_t *h)
266{
267 unsigned char chan;
268 chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
269 return chan;
270}
271
272static struct access_method smart1_access = {
273 smart1_submit_command,
274 smart1_intr_mask,
275 smart1_fifo_full,
276 smart1_intr_pending,
277 smart1_completed,
278};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d70eba30003a..0afa6c8c3857 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
430 430
431static void zram_page_end_io(struct bio *bio) 431static void zram_page_end_io(struct bio *bio)
432{ 432{
433 struct page *page = bio->bi_io_vec[0].bv_page; 433 struct page *page = bio_first_page_all(bio);
434 434
435 page_endio(page, op_is_write(bio_op(bio)), 435 page_endio(page, op_is_write(bio_op(bio)),
436 blk_status_to_errno(bio->bi_status)); 436 blk_status_to_errno(bio->bi_status));
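
The zram hunk replaces direct indexing of bi_io_vec with the bio_first_page_all() helper, so the end-io path no longer assumes anything about how the bio's pages are laid out internally. A minimal end-io callback using the same helpers could look like the sketch below; the function name is made up, the helper calls mirror the hunk.

#include <linux/bio.h>
#include <linux/pagemap.h>

static void my_page_end_io(struct bio *bio)
{
	/* first page backing the bio, independent of the bvec layout */
	struct page *page = bio_first_page_all(bio);

	page_endio(page, op_is_write(bio_op(bio)),
		   blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}
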
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 2a953efec4e1..10c08982185a 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -27,13 +27,6 @@ config NVM_DEBUG
27 27
28 It is required to create/remove targets without IOCTLs. 28 It is required to create/remove targets without IOCTLs.
29 29
30config NVM_RRPC
31 tristate "Round-robin Hybrid Open-Channel SSD target"
32 ---help---
33 Allows an open-channel SSD to be exposed as a block device to the
34 host. The target is implemented using a linear mapping table and
35 cost-based garbage collection. It is optimized for 4K IO sizes.
36
37config NVM_PBLK 30config NVM_PBLK
38 tristate "Physical Block Device Open-Channel SSD target" 31 tristate "Physical Block Device Open-Channel SSD target"
39 ---help--- 32 ---help---
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 2c3fd9d2c08c..97d9d7c71550 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,7 +4,6 @@
4# 4#
5 5
6obj-$(CONFIG_NVM) := core.o 6obj-$(CONFIG_NVM) := core.o
7obj-$(CONFIG_NVM_RRPC) += rrpc.o
8obj-$(CONFIG_NVM_PBLK) += pblk.o 7obj-$(CONFIG_NVM_PBLK) += pblk.o
9pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ 8pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
10 pblk-write.o pblk-cache.o pblk-read.o \ 9 pblk-write.o pblk-cache.o pblk-read.o \
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 83249b43dd06..dcc9e621e651 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -45,12 +45,6 @@ struct nvm_dev_map {
45 int nr_chnls; 45 int nr_chnls;
46}; 46};
47 47
48struct nvm_area {
49 struct list_head list;
50 sector_t begin;
51 sector_t end; /* end is excluded */
52};
53
54static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) 48static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
55{ 49{
56 struct nvm_target *tgt; 50 struct nvm_target *tgt;
@@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
62 return NULL; 56 return NULL;
63} 57}
64 58
59static bool nvm_target_exists(const char *name)
60{
61 struct nvm_dev *dev;
62 struct nvm_target *tgt;
63 bool ret = false;
64
65 down_write(&nvm_lock);
66 list_for_each_entry(dev, &nvm_devices, devices) {
67 mutex_lock(&dev->mlock);
68 list_for_each_entry(tgt, &dev->targets, list) {
69 if (!strcmp(name, tgt->disk->disk_name)) {
70 ret = true;
71 mutex_unlock(&dev->mlock);
72 goto out;
73 }
74 }
75 mutex_unlock(&dev->mlock);
76 }
77
78out:
79 up_write(&nvm_lock);
80 return ret;
81}
82
65static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) 83static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
66{ 84{
67 int i; 85 int i;
@@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
104 if (clear) { 122 if (clear) {
105 for (j = 0; j < ch_map->nr_luns; j++) { 123 for (j = 0; j < ch_map->nr_luns; j++) {
106 int lun = j + lun_offs[j]; 124 int lun = j + lun_offs[j];
107 int lunid = (ch * dev->geo.luns_per_chnl) + lun; 125 int lunid = (ch * dev->geo.nr_luns) + lun;
108 126
109 WARN_ON(!test_and_clear_bit(lunid, 127 WARN_ON(!test_and_clear_bit(lunid,
110 dev->lun_map)); 128 dev->lun_map));
@@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
122} 140}
123 141
124static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, 142static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
125 int lun_begin, int lun_end) 143 u16 lun_begin, u16 lun_end,
144 u16 op)
126{ 145{
127 struct nvm_tgt_dev *tgt_dev = NULL; 146 struct nvm_tgt_dev *tgt_dev = NULL;
128 struct nvm_dev_map *dev_rmap = dev->rmap; 147 struct nvm_dev_map *dev_rmap = dev->rmap;
@@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
130 struct ppa_addr *luns; 149 struct ppa_addr *luns;
131 int nr_luns = lun_end - lun_begin + 1; 150 int nr_luns = lun_end - lun_begin + 1;
132 int luns_left = nr_luns; 151 int luns_left = nr_luns;
133 int nr_chnls = nr_luns / dev->geo.luns_per_chnl; 152 int nr_chnls = nr_luns / dev->geo.nr_luns;
134 int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; 153 int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
135 int bch = lun_begin / dev->geo.luns_per_chnl; 154 int bch = lun_begin / dev->geo.nr_luns;
136 int blun = lun_begin % dev->geo.luns_per_chnl; 155 int blun = lun_begin % dev->geo.nr_luns;
137 int lunid = 0; 156 int lunid = 0;
138 int lun_balanced = 1; 157 int lun_balanced = 1;
139 int prev_nr_luns; 158 int prev_nr_luns;
@@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
154 if (!luns) 173 if (!luns)
155 goto err_luns; 174 goto err_luns;
156 175
157 prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? 176 prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
158 dev->geo.luns_per_chnl : luns_left; 177 dev->geo.nr_luns : luns_left;
159 for (i = 0; i < nr_chnls; i++) { 178 for (i = 0; i < nr_chnls; i++) {
160 struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; 179 struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
161 int *lun_roffs = ch_rmap->lun_offs; 180 int *lun_roffs = ch_rmap->lun_offs;
162 struct nvm_ch_map *ch_map = &dev_map->chnls[i]; 181 struct nvm_ch_map *ch_map = &dev_map->chnls[i];
163 int *lun_offs; 182 int *lun_offs;
164 int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? 183 int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
165 dev->geo.luns_per_chnl : luns_left; 184 dev->geo.nr_luns : luns_left;
166 185
167 if (lun_balanced && prev_nr_luns != luns_in_chnl) 186 if (lun_balanced && prev_nr_luns != luns_in_chnl)
168 lun_balanced = 0; 187 lun_balanced = 0;
@@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
199 memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); 218 memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
200 /* Target device only owns a portion of the physical device */ 219 /* Target device only owns a portion of the physical device */
201 tgt_dev->geo.nr_chnls = nr_chnls; 220 tgt_dev->geo.nr_chnls = nr_chnls;
202 tgt_dev->geo.nr_luns = nr_luns; 221 tgt_dev->geo.all_luns = nr_luns;
203 tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; 222 tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
223 tgt_dev->geo.op = op;
204 tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; 224 tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
205 tgt_dev->q = dev->q; 225 tgt_dev->q = dev->q;
206 tgt_dev->map = dev_map; 226 tgt_dev->map = dev_map;
@@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = {
226 .owner = THIS_MODULE, 246 .owner = THIS_MODULE,
227}; 247};
228 248
229static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) 249static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
230{ 250{
231 struct nvm_tgt_type *tmp, *tt = NULL; 251 struct nvm_tgt_type *tt;
232 252
233 if (lock) 253 list_for_each_entry(tt, &nvm_tgt_types, list)
234 down_write(&nvm_tgtt_lock); 254 if (!strcmp(name, tt->name))
255 return tt;
235 256
236 list_for_each_entry(tmp, &nvm_tgt_types, list) 257 return NULL;
237 if (!strcmp(name, tmp->name)) { 258}
238 tt = tmp; 259
239 break; 260static struct nvm_tgt_type *nvm_find_target_type(const char *name)
240 } 261{
262 struct nvm_tgt_type *tt;
263
264 down_write(&nvm_tgtt_lock);
265 tt = __nvm_find_target_type(name);
266 up_write(&nvm_tgtt_lock);
241 267
242 if (lock)
243 up_write(&nvm_tgtt_lock);
244 return tt; 268 return tt;
245} 269}
246 270
271static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
272 int lun_end)
273{
274 if (lun_begin > lun_end || lun_end >= geo->all_luns) {
275 pr_err("nvm: lun out of bound (%u:%u > %u)\n",
276 lun_begin, lun_end, geo->all_luns - 1);
277 return -EINVAL;
278 }
279
280 return 0;
281}
282
283static int __nvm_config_simple(struct nvm_dev *dev,
284 struct nvm_ioctl_create_simple *s)
285{
286 struct nvm_geo *geo = &dev->geo;
287
288 if (s->lun_begin == -1 && s->lun_end == -1) {
289 s->lun_begin = 0;
290 s->lun_end = geo->all_luns - 1;
291 }
292
293 return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
294}
295
296static int __nvm_config_extended(struct nvm_dev *dev,
297 struct nvm_ioctl_create_extended *e)
298{
299 struct nvm_geo *geo = &dev->geo;
300
301 if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
302 e->lun_begin = 0;
303 e->lun_end = dev->geo.all_luns - 1;
304 }
305
306 /* op not set falls into target's default */
307 if (e->op == 0xFFFF)
308 e->op = NVM_TARGET_DEFAULT_OP;
309
310 if (e->op < NVM_TARGET_MIN_OP ||
311 e->op > NVM_TARGET_MAX_OP) {
312 pr_err("nvm: invalid over provisioning value\n");
313 return -EINVAL;
314 }
315
316 return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
317}
318
247static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) 319static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
248{ 320{
249 struct nvm_ioctl_create_simple *s = &create->conf.s; 321 struct nvm_ioctl_create_extended e;
250 struct request_queue *tqueue; 322 struct request_queue *tqueue;
251 struct gendisk *tdisk; 323 struct gendisk *tdisk;
252 struct nvm_tgt_type *tt; 324 struct nvm_tgt_type *tt;
@@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
255 void *targetdata; 327 void *targetdata;
256 int ret; 328 int ret;
257 329
258 tt = nvm_find_target_type(create->tgttype, 1); 330 switch (create->conf.type) {
331 case NVM_CONFIG_TYPE_SIMPLE:
332 ret = __nvm_config_simple(dev, &create->conf.s);
333 if (ret)
334 return ret;
335
336 e.lun_begin = create->conf.s.lun_begin;
337 e.lun_end = create->conf.s.lun_end;
338 e.op = NVM_TARGET_DEFAULT_OP;
339 break;
340 case NVM_CONFIG_TYPE_EXTENDED:
341 ret = __nvm_config_extended(dev, &create->conf.e);
342 if (ret)
343 return ret;
344
345 e = create->conf.e;
346 break;
347 default:
348 pr_err("nvm: config type not valid\n");
349 return -EINVAL;
350 }
351
352 tt = nvm_find_target_type(create->tgttype);
259 if (!tt) { 353 if (!tt) {
260 pr_err("nvm: target type %s not found\n", create->tgttype); 354 pr_err("nvm: target type %s not found\n", create->tgttype);
261 return -EINVAL; 355 return -EINVAL;
262 } 356 }
263 357
264 mutex_lock(&dev->mlock); 358 if (nvm_target_exists(create->tgtname)) {
265 t = nvm_find_target(dev, create->tgtname); 359 pr_err("nvm: target name already exists (%s)\n",
266 if (t) { 360 create->tgtname);
267 pr_err("nvm: target name already exists.\n");
268 mutex_unlock(&dev->mlock);
269 return -EINVAL; 361 return -EINVAL;
270 } 362 }
271 mutex_unlock(&dev->mlock);
272 363
273 ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); 364 ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
274 if (ret) 365 if (ret)
275 return ret; 366 return ret;
276 367
@@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
280 goto err_reserve; 371 goto err_reserve;
281 } 372 }
282 373
283 tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); 374 tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
284 if (!tgt_dev) { 375 if (!tgt_dev) {
285 pr_err("nvm: could not create target device\n"); 376 pr_err("nvm: could not create target device\n");
286 ret = -ENOMEM; 377 ret = -ENOMEM;
@@ -350,7 +441,7 @@ err_dev:
350err_t: 441err_t:
351 kfree(t); 442 kfree(t);
352err_reserve: 443err_reserve:
353 nvm_release_luns_err(dev, s->lun_begin, s->lun_end); 444 nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
354 return ret; 445 return ret;
355} 446}
356 447
@@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev)
420 for (i = 0; i < dev->geo.nr_chnls; i++) { 511 for (i = 0; i < dev->geo.nr_chnls; i++) {
421 struct nvm_ch_map *ch_rmap; 512 struct nvm_ch_map *ch_rmap;
422 int *lun_roffs; 513 int *lun_roffs;
423 int luns_in_chnl = dev->geo.luns_per_chnl; 514 int luns_in_chnl = dev->geo.nr_luns;
424 515
425 ch_rmap = &rmap->chnls[i]; 516 ch_rmap = &rmap->chnls[i];
426 517
@@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
524 nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); 615 nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
525} 616}
526 617
527void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
528 int len)
529{
530 struct nvm_geo *geo = &dev->geo;
531 struct nvm_dev_map *dev_rmap = dev->rmap;
532 u64 i;
533
534 for (i = 0; i < len; i++) {
535 struct nvm_ch_map *ch_rmap;
536 int *lun_roffs;
537 struct ppa_addr gaddr;
538 u64 pba = le64_to_cpu(entries[i]);
539 u64 diff;
540
541 if (!pba)
542 continue;
543
544 gaddr = linear_to_generic_addr(geo, pba);
545 ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
546 lun_roffs = ch_rmap->lun_offs;
547
548 diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
549 (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
550
551 entries[i] -= cpu_to_le64(diff);
552 }
553}
554EXPORT_SYMBOL(nvm_part_to_tgt);
555
556int nvm_register_tgt_type(struct nvm_tgt_type *tt) 618int nvm_register_tgt_type(struct nvm_tgt_type *tt)
557{ 619{
558 int ret = 0; 620 int ret = 0;
559 621
560 down_write(&nvm_tgtt_lock); 622 down_write(&nvm_tgtt_lock);
561 if (nvm_find_target_type(tt->name, 0)) 623 if (__nvm_find_target_type(tt->name))
562 ret = -EEXIST; 624 ret = -EEXIST;
563 else 625 else
564 list_add(&tt->list, &nvm_tgt_types); 626 list_add(&tt->list, &nvm_tgt_types);
@@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
726} 788}
727EXPORT_SYMBOL(nvm_submit_io_sync); 789EXPORT_SYMBOL(nvm_submit_io_sync);
728 790
729int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
730 int nr_ppas)
731{
732 struct nvm_geo *geo = &tgt_dev->geo;
733 struct nvm_rq rqd;
734 int ret;
735
736 memset(&rqd, 0, sizeof(struct nvm_rq));
737
738 rqd.opcode = NVM_OP_ERASE;
739 rqd.flags = geo->plane_mode >> 1;
740
741 ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
742 if (ret)
743 return ret;
744
745 ret = nvm_submit_io_sync(tgt_dev, &rqd);
746 if (ret) {
747 pr_err("rrpr: erase I/O submission failed: %d\n", ret);
748 goto free_ppa_list;
749 }
750
751free_ppa_list:
752 nvm_free_rqd_ppalist(tgt_dev, &rqd);
753
754 return ret;
755}
756EXPORT_SYMBOL(nvm_erase_sync);
757
758int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
759 nvm_l2p_update_fn *update_l2p, void *priv)
760{
761 struct nvm_dev *dev = tgt_dev->parent;
762
763 if (!dev->ops->get_l2p_tbl)
764 return 0;
765
766 return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
767}
768EXPORT_SYMBOL(nvm_get_l2p_tbl);
769
770int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
771{
772 struct nvm_dev *dev = tgt_dev->parent;
773 struct nvm_geo *geo = &dev->geo;
774 struct nvm_area *area, *prev, *next;
775 sector_t begin = 0;
776 sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
777
778 if (len > max_sectors)
779 return -EINVAL;
780
781 area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
782 if (!area)
783 return -ENOMEM;
784
785 prev = NULL;
786
787 spin_lock(&dev->lock);
788 list_for_each_entry(next, &dev->area_list, list) {
789 if (begin + len > next->begin) {
790 begin = next->end;
791 prev = next;
792 continue;
793 }
794 break;
795 }
796
797 if ((begin + len) > max_sectors) {
798 spin_unlock(&dev->lock);
799 kfree(area);
800 return -EINVAL;
801 }
802
803 area->begin = *lba = begin;
804 area->end = begin + len;
805
806 if (prev) /* insert into sorted order */
807 list_add(&area->list, &prev->list);
808 else
809 list_add(&area->list, &dev->area_list);
810 spin_unlock(&dev->lock);
811
812 return 0;
813}
814EXPORT_SYMBOL(nvm_get_area);
815
816void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
817{
818 struct nvm_dev *dev = tgt_dev->parent;
819 struct nvm_area *area;
820
821 spin_lock(&dev->lock);
822 list_for_each_entry(area, &dev->area_list, list) {
823 if (area->begin != begin)
824 continue;
825
826 list_del(&area->list);
827 spin_unlock(&dev->lock);
828 kfree(area);
829 return;
830 }
831 spin_unlock(&dev->lock);
832}
833EXPORT_SYMBOL(nvm_put_area);
834
835void nvm_end_io(struct nvm_rq *rqd) 791void nvm_end_io(struct nvm_rq *rqd)
836{ 792{
837 struct nvm_tgt_dev *tgt_dev = rqd->dev; 793 struct nvm_tgt_dev *tgt_dev = rqd->dev;
@@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
858 struct nvm_geo *geo = &dev->geo; 814 struct nvm_geo *geo = &dev->geo;
859 int blk, offset, pl, blktype; 815 int blk, offset, pl, blktype;
860 816
861 if (nr_blks != geo->blks_per_lun * geo->plane_mode) 817 if (nr_blks != geo->nr_chks * geo->plane_mode)
862 return -EINVAL; 818 return -EINVAL;
863 819
864 for (blk = 0; blk < geo->blks_per_lun; blk++) { 820 for (blk = 0; blk < geo->nr_chks; blk++) {
865 offset = blk * geo->plane_mode; 821 offset = blk * geo->plane_mode;
866 blktype = blks[offset]; 822 blktype = blks[offset];
867 823
@@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
877 blks[blk] = blktype; 833 blks[blk] = blktype;
878 } 834 }
879 835
880 return geo->blks_per_lun; 836 return geo->nr_chks;
881} 837}
882EXPORT_SYMBOL(nvm_bb_tbl_fold); 838EXPORT_SYMBOL(nvm_bb_tbl_fold);
883 839
@@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
892} 848}
893EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); 849EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
894 850
895static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
896{
897 struct nvm_geo *geo = &dev->geo;
898 int i;
899
900 dev->lps_per_blk = geo->pgs_per_blk;
901 dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
902 if (!dev->lptbl)
903 return -ENOMEM;
904
905 /* Just a linear array */
906 for (i = 0; i < dev->lps_per_blk; i++)
907 dev->lptbl[i] = i;
908
909 return 0;
910}
911
912static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
913{
914 int i, p;
915 struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
916
917 if (!mlc->num_pairs)
918 return 0;
919
920 dev->lps_per_blk = mlc->num_pairs;
921 dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
922 if (!dev->lptbl)
923 return -ENOMEM;
924
925 /* The lower page table encoding consists of a list of bytes, where each
926 * has a lower and an upper half. The first half byte maintains the
927 * increment value and every value after is an offset added to the
928 * previous incrementation value
929 */
930 dev->lptbl[0] = mlc->pairs[0] & 0xF;
931 for (i = 1; i < dev->lps_per_blk; i++) {
932 p = mlc->pairs[i >> 1];
933 if (i & 0x1) /* upper */
934 dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
935 else /* lower */
936 dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
937 }
938
939 return 0;
940}
941
942static int nvm_core_init(struct nvm_dev *dev) 851static int nvm_core_init(struct nvm_dev *dev)
943{ 852{
944 struct nvm_id *id = &dev->identity; 853 struct nvm_id *id = &dev->identity;
@@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev)
946 struct nvm_geo *geo = &dev->geo; 855 struct nvm_geo *geo = &dev->geo;
947 int ret; 856 int ret;
948 857
858 memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
859
860 if (grp->mtype != 0) {
861 pr_err("nvm: memory type not supported\n");
862 return -EINVAL;
863 }
864
949 /* Whole device values */ 865 /* Whole device values */
950 geo->nr_chnls = grp->num_ch; 866 geo->nr_chnls = grp->num_ch;
951 geo->luns_per_chnl = grp->num_lun; 867 geo->nr_luns = grp->num_lun;
952 868
953 /* Generic device values */ 869 /* Generic device geometry values */
954 geo->pgs_per_blk = grp->num_pg; 870 geo->ws_min = grp->ws_min;
955 geo->blks_per_lun = grp->num_blk; 871 geo->ws_opt = grp->ws_opt;
956 geo->nr_planes = grp->num_pln; 872 geo->ws_seq = grp->ws_seq;
957 geo->fpg_size = grp->fpg_sz; 873 geo->ws_per_chk = grp->ws_per_chk;
958 geo->pfpg_size = grp->fpg_sz * grp->num_pln; 874 geo->nr_chks = grp->num_chk;
959 geo->sec_size = grp->csecs; 875 geo->sec_size = grp->csecs;
960 geo->oob_size = grp->sos; 876 geo->oob_size = grp->sos;
961 geo->sec_per_pg = grp->fpg_sz / grp->csecs;
962 geo->mccap = grp->mccap; 877 geo->mccap = grp->mccap;
963 memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
964
965 geo->plane_mode = NVM_PLANE_SINGLE;
966 geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; 878 geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
967 879
968 if (grp->mpos & 0x020202) 880 geo->sec_per_chk = grp->clba;
969 geo->plane_mode = NVM_PLANE_DOUBLE; 881 geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
970 if (grp->mpos & 0x040404) 882 geo->all_luns = geo->nr_luns * geo->nr_chnls;
971 geo->plane_mode = NVM_PLANE_QUAD;
972 883
973 if (grp->mtype != 0) { 884 /* 1.2 spec device geometry values */
974 pr_err("nvm: memory type not supported\n"); 885 geo->plane_mode = 1 << geo->ws_seq;
975 return -EINVAL; 886 geo->nr_planes = geo->ws_opt / geo->ws_min;
976 } 887 geo->sec_per_pg = geo->ws_min;
977
978 /* calculated values */
979 geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; 888 geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
980 geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk;
981 geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
982 geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
983 889
984 dev->total_secs = geo->nr_luns * geo->sec_per_lun; 890 dev->total_secs = geo->all_luns * geo->sec_per_lun;
985 dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), 891 dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
986 sizeof(unsigned long), GFP_KERNEL); 892 sizeof(unsigned long), GFP_KERNEL);
987 if (!dev->lun_map) 893 if (!dev->lun_map)
988 return -ENOMEM; 894 return -ENOMEM;
989 895
990 switch (grp->fmtype) {
991 case NVM_ID_FMTYPE_SLC:
992 if (nvm_init_slc_tbl(dev, grp)) {
993 ret = -ENOMEM;
994 goto err_fmtype;
995 }
996 break;
997 case NVM_ID_FMTYPE_MLC:
998 if (nvm_init_mlc_tbl(dev, grp)) {
999 ret = -ENOMEM;
1000 goto err_fmtype;
1001 }
1002 break;
1003 default:
1004 pr_err("nvm: flash type not supported\n");
1005 ret = -EINVAL;
1006 goto err_fmtype;
1007 }
1008
1009 INIT_LIST_HEAD(&dev->area_list); 896 INIT_LIST_HEAD(&dev->area_list);
1010 INIT_LIST_HEAD(&dev->targets); 897 INIT_LIST_HEAD(&dev->targets);
1011 mutex_init(&dev->mlock); 898 mutex_init(&dev->mlock);
@@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev)
1031 dev->ops->destroy_dma_pool(dev->dma_pool); 918 dev->ops->destroy_dma_pool(dev->dma_pool);
1032 919
1033 nvm_unregister_map(dev); 920 nvm_unregister_map(dev);
1034 kfree(dev->lptbl);
1035 kfree(dev->lun_map); 921 kfree(dev->lun_map);
1036 kfree(dev); 922 kfree(dev);
1037} 923}
@@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev)
1062 948
1063 pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", 949 pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
1064 dev->name, geo->sec_per_pg, geo->nr_planes, 950 dev->name, geo->sec_per_pg, geo->nr_planes,
1065 geo->pgs_per_blk, geo->blks_per_lun, 951 geo->ws_per_chk, geo->nr_chks,
1066 geo->nr_luns, geo->nr_chnls); 952 geo->all_luns, geo->nr_chnls);
1067 return 0; 953 return 0;
1068err: 954err:
1069 pr_err("nvm: failed to initialize nvm\n"); 955 pr_err("nvm: failed to initialize nvm\n");
@@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
1135static int __nvm_configure_create(struct nvm_ioctl_create *create) 1021static int __nvm_configure_create(struct nvm_ioctl_create *create)
1136{ 1022{
1137 struct nvm_dev *dev; 1023 struct nvm_dev *dev;
1138 struct nvm_ioctl_create_simple *s;
1139 1024
1140 down_write(&nvm_lock); 1025 down_write(&nvm_lock);
1141 dev = nvm_find_nvm_dev(create->dev); 1026 dev = nvm_find_nvm_dev(create->dev);
@@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
1146 return -EINVAL; 1031 return -EINVAL;
1147 } 1032 }
1148 1033
1149 if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
1150 pr_err("nvm: config type not valid\n");
1151 return -EINVAL;
1152 }
1153 s = &create->conf.s;
1154
1155 if (s->lun_begin == -1 && s->lun_end == -1) {
1156 s->lun_begin = 0;
1157 s->lun_end = dev->geo.nr_luns - 1;
1158 }
1159
1160 if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
1161 pr_err("nvm: lun out of bound (%u:%u > %u)\n",
1162 s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
1163 return -EINVAL;
1164 }
1165
1166 return nvm_create_tgt(dev, create); 1034 return nvm_create_tgt(dev, create);
1167} 1035}
1168 1036
@@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
1262 if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) 1130 if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
1263 return -EFAULT; 1131 return -EFAULT;
1264 1132
1133 if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
1134 create.conf.e.rsv != 0) {
1135 pr_err("nvm: reserved config field in use\n");
1136 return -EINVAL;
1137 }
1138
1265 create.dev[DISK_NAME_LEN - 1] = '\0'; 1139 create.dev[DISK_NAME_LEN - 1] = '\0';
1266 create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; 1140 create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
1267 create.tgtname[DISK_NAME_LEN - 1] = '\0'; 1141 create.tgtname[DISK_NAME_LEN - 1] = '\0';
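
The core.c changes above introduce an extended create configuration that carries an over-provisioning ratio next to the LUN range: 0xFFFF in lun_begin/lun_end/op means "use the default" (with op falling back to NVM_TARGET_DEFAULT_OP), the value is range-checked against NVM_TARGET_MIN_OP/NVM_TARGET_MAX_OP, and the reserved field must be zero. A hedged sketch of filling in such a request, with the field and constant names taken from the hunks and the wrapper itself purely illustrative:

#include <linux/lightnvm.h>
#include <linux/string.h>

static void fill_extended_create(struct nvm_ioctl_create *create,
				 const char *dev, const char *tgt)
{
	memset(create, 0, sizeof(*create));

	strscpy(create->dev, dev, DISK_NAME_LEN);
	strscpy(create->tgtname, tgt, DISK_NAME_LEN);
	strscpy(create->tgttype, "pblk", NVM_TTYPE_NAME_MAX);

	create->conf.type = NVM_CONFIG_TYPE_EXTENDED;
	create->conf.e.lun_begin = 0xFFFF;	/* default: first LUN */
	create->conf.e.lun_end = 0xFFFF;	/* default: last LUN */
	create->conf.e.op = 0xFFFF;		/* default over-provisioning */
	create->conf.e.rsv = 0;			/* must stay zero */
}
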
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 0d227ef7d1b9..000fcad38136 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -19,12 +19,16 @@
19 19
20int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) 20int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
21{ 21{
22 struct request_queue *q = pblk->dev->q;
22 struct pblk_w_ctx w_ctx; 23 struct pblk_w_ctx w_ctx;
23 sector_t lba = pblk_get_lba(bio); 24 sector_t lba = pblk_get_lba(bio);
25 unsigned long start_time = jiffies;
24 unsigned int bpos, pos; 26 unsigned int bpos, pos;
25 int nr_entries = pblk_get_secs(bio); 27 int nr_entries = pblk_get_secs(bio);
26 int i, ret; 28 int i, ret;
27 29
30 generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
31
28 /* Update the write buffer head (mem) with the entries that we can 32 /* Update the write buffer head (mem) with the entries that we can
29 * write. The write in itself cannot fail, so there is no need to 33 * write. The write in itself cannot fail, so there is no need to
30 * rollback from here on. 34 * rollback from here on.
@@ -67,6 +71,7 @@ retry:
67 pblk_rl_inserted(&pblk->rl, nr_entries); 71 pblk_rl_inserted(&pblk->rl, nr_entries);
68 72
69out: 73out:
74 generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
70 pblk_write_should_kick(pblk); 75 pblk_write_should_kick(pblk);
71 return ret; 76 return ret;
72} 77}
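
The pblk-cache.c hunk adds per-target I/O accounting around the buffered write path, so writes absorbed by the write buffer (and completed before reaching the device) still show up in the gendisk statistics. Stripped down to the accounting pattern itself it looks like the fragment below, written as if inside the driver's own translation unit; the surrounding function is illustrative, the two helper calls and their arguments match the hunk.

static int my_buffered_write(struct pblk *pblk, struct bio *bio)
{
	struct request_queue *q = pblk->dev->q;
	unsigned long start_time = jiffies;

	generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);

	/* ... copy the bio payload into the write buffer here ... */

	generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
	return 0;
}
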
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 76516ee84e9a..0487b9340c1d 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
32 struct pblk_line *line; 32 struct pblk_line *line;
33 int pos; 33 int pos;
34 34
35 line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; 35 line = &pblk->lines[pblk_ppa_to_line(*ppa)];
36 pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); 36 pos = pblk_ppa_to_pos(&dev->geo, *ppa);
37 37
38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", 38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
39 line->id, pos); 39 line->id, pos);
@@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
48{ 48{
49 struct nvm_tgt_dev *dev = pblk->dev; 49 struct nvm_tgt_dev *dev = pblk->dev;
50 struct nvm_geo *geo = &dev->geo; 50 struct nvm_geo *geo = &dev->geo;
51 int pos = pblk_dev_ppa_to_pos(geo, *ppa); 51 int pos = pblk_ppa_to_pos(geo, *ppa);
52 52
53 pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); 53 pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
54 atomic_long_inc(&pblk->erase_failed); 54 atomic_long_inc(&pblk->erase_failed);
@@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
66{ 66{
67 struct pblk_line *line; 67 struct pblk_line *line;
68 68
69 line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; 69 line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
70 atomic_dec(&line->left_seblks); 70 atomic_dec(&line->left_seblks);
71 71
72 if (rqd->error) { 72 if (rqd->error) {
@@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
144 BUG_ON(pblk_ppa_empty(ppa)); 144 BUG_ON(pblk_ppa_empty(ppa));
145#endif 145#endif
146 146
147 line_id = pblk_tgt_ppa_to_line(ppa); 147 line_id = pblk_ppa_to_line(ppa);
148 line = &pblk->lines[line_id]; 148 line = &pblk->lines[line_id];
149 paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); 149 paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
150 150
@@ -650,7 +650,7 @@ next_rq:
650 } else { 650 } else {
651 for (i = 0; i < rqd.nr_ppas; ) { 651 for (i = 0; i < rqd.nr_ppas; ) {
652 struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); 652 struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
653 int pos = pblk_dev_ppa_to_pos(geo, ppa); 653 int pos = pblk_ppa_to_pos(geo, ppa);
654 int read_type = PBLK_READ_RANDOM; 654 int read_type = PBLK_READ_RANDOM;
655 655
656 if (pblk_io_aligned(pblk, rq_ppas)) 656 if (pblk_io_aligned(pblk, rq_ppas))
@@ -668,7 +668,7 @@ next_rq:
668 } 668 }
669 669
670 ppa = addr_to_gen_ppa(pblk, paddr, id); 670 ppa = addr_to_gen_ppa(pblk, paddr, id);
671 pos = pblk_dev_ppa_to_pos(geo, ppa); 671 pos = pblk_ppa_to_pos(geo, ppa);
672 } 672 }
673 673
674 if (pblk_boundary_paddr_checks(pblk, paddr + min)) { 674 if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
@@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
742 cmd_op = NVM_OP_PWRITE; 742 cmd_op = NVM_OP_PWRITE;
743 flags = pblk_set_progr_mode(pblk, PBLK_WRITE); 743 flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
744 lba_list = emeta_to_lbas(pblk, line->emeta->buf); 744 lba_list = emeta_to_lbas(pblk, line->emeta->buf);
745 } else if (dir == PBLK_READ) { 745 } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
746 bio_op = REQ_OP_READ; 746 bio_op = REQ_OP_READ;
747 cmd_op = NVM_OP_PREAD; 747 cmd_op = NVM_OP_PREAD;
748 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); 748 flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
802 if (rqd.error) { 802 if (rqd.error) {
803 if (dir == PBLK_WRITE) 803 if (dir == PBLK_WRITE)
804 pblk_log_write_err(pblk, &rqd); 804 pblk_log_write_err(pblk, &rqd);
805 else 805 else if (dir == PBLK_READ)
806 pblk_log_read_err(pblk, &rqd); 806 pblk_log_read_err(pblk, &rqd);
807 } 807 }
808 808
@@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
816{ 816{
817 u64 bpaddr = pblk_line_smeta_start(pblk, line); 817 u64 bpaddr = pblk_line_smeta_start(pblk, line);
818 818
819 return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); 819 return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
820} 820}
821 821
822int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, 822int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
@@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
854 struct nvm_geo *geo = &dev->geo; 854 struct nvm_geo *geo = &dev->geo;
855 855
856 pr_err("pblk: could not sync erase line:%d,blk:%d\n", 856 pr_err("pblk: could not sync erase line:%d,blk:%d\n",
857 pblk_dev_ppa_to_line(ppa), 857 pblk_ppa_to_line(ppa),
858 pblk_dev_ppa_to_pos(geo, ppa)); 858 pblk_ppa_to_pos(geo, ppa));
859 859
860 rqd.error = ret; 860 rqd.error = ret;
861 goto out; 861 goto out;
@@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
979 979
980 /* Start metadata */ 980 /* Start metadata */
981 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); 981 smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
982 smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); 982 smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
983 983
984 /* Fill metadata among lines */ 984 /* Fill metadata among lines */
985 if (cur) { 985 if (cur) {
@@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1032 lm->sec_per_line); 1032 lm->sec_per_line);
1033 bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, 1033 bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
1034 lm->sec_per_line); 1034 lm->sec_per_line);
1035 line->sec_in_line -= geo->sec_per_blk; 1035 line->sec_in_line -= geo->sec_per_chk;
1036 if (bit >= lm->emeta_bb) 1036 if (bit >= lm->emeta_bb)
1037 nr_bb++; 1037 nr_bb++;
1038 } 1038 }
@@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
1145 } 1145 }
1146 spin_unlock(&l_mg->free_lock); 1146 spin_unlock(&l_mg->free_lock);
1147 1147
1148 pblk_rl_free_lines_dec(&pblk->rl, line); 1148 pblk_rl_free_lines_dec(&pblk->rl, line, true);
1149 1149
1150 if (!pblk_line_init_bb(pblk, line, 0)) { 1150 if (!pblk_line_init_bb(pblk, line, 0)) {
1151 list_add(&line->list, &l_mg->free_list); 1151 list_add(&line->list, &l_mg->free_list);
@@ -1233,7 +1233,7 @@ retry:
1233 l_mg->data_line = retry_line; 1233 l_mg->data_line = retry_line;
1234 spin_unlock(&l_mg->free_lock); 1234 spin_unlock(&l_mg->free_lock);
1235 1235
1236 pblk_rl_free_lines_dec(&pblk->rl, retry_line); 1236 pblk_rl_free_lines_dec(&pblk->rl, line, false);
1237 1237
1238 if (pblk_line_erase(pblk, retry_line)) 1238 if (pblk_line_erase(pblk, retry_line))
1239 goto retry; 1239 goto retry;
@@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1252{ 1252{
1253 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1253 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1254 struct pblk_line *line; 1254 struct pblk_line *line;
1255 int is_next = 0;
1256 1255
1257 spin_lock(&l_mg->free_lock); 1256 spin_lock(&l_mg->free_lock);
1258 line = pblk_line_get(pblk); 1257 line = pblk_line_get(pblk);
@@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1280 } else { 1279 } else {
1281 l_mg->data_next->seq_nr = l_mg->d_seq_nr++; 1280 l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
1282 l_mg->data_next->type = PBLK_LINETYPE_DATA; 1281 l_mg->data_next->type = PBLK_LINETYPE_DATA;
1283 is_next = 1;
1284 } 1282 }
1285 spin_unlock(&l_mg->free_lock); 1283 spin_unlock(&l_mg->free_lock);
1286 1284
@@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
1290 return NULL; 1288 return NULL;
1291 } 1289 }
1292 1290
1293 pblk_rl_free_lines_dec(&pblk->rl, line);
1294 if (is_next)
1295 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
1296
1297retry_setup: 1291retry_setup:
1298 if (!pblk_line_init_metadata(pblk, line, NULL)) { 1292 if (!pblk_line_init_metadata(pblk, line, NULL)) {
1299 line = pblk_line_retry(pblk, line); 1293 line = pblk_line_retry(pblk, line);
@@ -1311,6 +1305,8 @@ retry_setup:
1311 goto retry_setup; 1305 goto retry_setup;
1312 } 1306 }
1313 1307
1308 pblk_rl_free_lines_dec(&pblk->rl, line, true);
1309
1314 return line; 1310 return line;
1315} 1311}
1316 1312
@@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
1395 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1391 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
1396 struct pblk_line *cur, *new = NULL; 1392 struct pblk_line *cur, *new = NULL;
1397 unsigned int left_seblks; 1393 unsigned int left_seblks;
1398 int is_next = 0;
1399 1394
1400 cur = l_mg->data_line; 1395 cur = l_mg->data_line;
1401 new = l_mg->data_next; 1396 new = l_mg->data_next;
@@ -1444,6 +1439,8 @@ retry_setup:
1444 goto retry_setup; 1439 goto retry_setup;
1445 } 1440 }
1446 1441
1442 pblk_rl_free_lines_dec(&pblk->rl, new, true);
1443
1447 /* Allocate next line for preparation */ 1444 /* Allocate next line for preparation */
1448 spin_lock(&l_mg->free_lock); 1445 spin_lock(&l_mg->free_lock);
1449 l_mg->data_next = pblk_line_get(pblk); 1446 l_mg->data_next = pblk_line_get(pblk);
@@ -1457,13 +1454,9 @@ retry_setup:
1457 } else { 1454 } else {
1458 l_mg->data_next->seq_nr = l_mg->d_seq_nr++; 1455 l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
1459 l_mg->data_next->type = PBLK_LINETYPE_DATA; 1456 l_mg->data_next->type = PBLK_LINETYPE_DATA;
1460 is_next = 1;
1461 } 1457 }
1462 spin_unlock(&l_mg->free_lock); 1458 spin_unlock(&l_mg->free_lock);
1463 1459
1464 if (is_next)
1465 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
1466
1467out: 1460out:
1468 return new; 1461 return new;
1469} 1462}
@@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
1561 struct nvm_geo *geo = &dev->geo; 1554 struct nvm_geo *geo = &dev->geo;
1562 1555
1563 pr_err("pblk: could not async erase line:%d,blk:%d\n", 1556 pr_err("pblk: could not async erase line:%d,blk:%d\n",
1564 pblk_dev_ppa_to_line(ppa), 1557 pblk_ppa_to_line(ppa),
1565 pblk_dev_ppa_to_pos(geo, ppa)); 1558 pblk_ppa_to_pos(geo, ppa));
1566 } 1559 }
1567 1560
1568 return err; 1561 return err;
@@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
1746 struct nvm_tgt_dev *dev = pblk->dev; 1739 struct nvm_tgt_dev *dev = pblk->dev;
1747 struct nvm_geo *geo = &dev->geo; 1740 struct nvm_geo *geo = &dev->geo;
1748 struct pblk_lun *rlun; 1741 struct pblk_lun *rlun;
1749 int nr_luns = geo->nr_luns; 1742 int nr_luns = geo->all_luns;
1750 int bit = -1; 1743 int bit = -1;
1751 1744
1752 while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { 1745 while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
@@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
1884 1877
1885 /* If the L2P entry maps to a line, the reference is valid */ 1878 /* If the L2P entry maps to a line, the reference is valid */
1886 if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { 1879 if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
1887 int line_id = pblk_dev_ppa_to_line(ppa); 1880 int line_id = pblk_ppa_to_line(ppa);
1888 struct pblk_line *line = &pblk->lines[line_id]; 1881 struct pblk_line *line = &pblk->lines[line_id];
1889 1882
1890 kref_get(&line->ref); 1883 kref_get(&line->ref);
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 9c8e114c8a54..3d899383666e 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
169 * the line untouched. TODO: Implement a recovery routine that scans and 169 * the line untouched. TODO: Implement a recovery routine that scans and
170 * moves all sectors on the line. 170 * moves all sectors on the line.
171 */ 171 */
172 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); 172
173 ret = pblk_recov_check_emeta(pblk, emeta_buf);
174 if (ret) {
175 pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
176 goto fail_free_emeta;
177 }
178
179 lba_list = emeta_to_lbas(pblk, emeta_buf);
173 if (!lba_list) { 180 if (!lba_list) {
174 pr_err("pblk: could not interpret emeta (line %d)\n", line->id); 181 pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
175 goto fail_free_emeta; 182 goto fail_free_emeta;
@@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk)
519 } 526 }
520} 527}
521 528
522/*
523 * If flush_wq == 1 then no lock should be held by the caller since
524 * flush_workqueue can sleep
525 */
526static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
527{
528 pblk->gc.gc_active = 0;
529 pr_debug("pblk: gc stop\n");
530}
531
532void pblk_gc_should_stop(struct pblk *pblk) 529void pblk_gc_should_stop(struct pblk *pblk)
533{ 530{
534 struct pblk_gc *gc = &pblk->gc; 531 struct pblk_gc *gc = &pblk->gc;
535 532
536 if (gc->gc_active && !gc->gc_forced) 533 if (gc->gc_active && !gc->gc_forced)
537 pblk_gc_stop(pblk, 0); 534 gc->gc_active = 0;
538} 535}
539 536
540void pblk_gc_should_kick(struct pblk *pblk) 537void pblk_gc_should_kick(struct pblk *pblk)
@@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk)
660 657
661 gc->gc_enabled = 0; 658 gc->gc_enabled = 0;
662 del_timer_sync(&gc->gc_timer); 659 del_timer_sync(&gc->gc_timer);
663 pblk_gc_stop(pblk, 1); 660 gc->gc_active = 0;
664 661
665 if (gc->gc_ts) 662 if (gc->gc_ts)
666 kthread_stop(gc->gc_ts); 663 kthread_stop(gc->gc_ts);
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 695826a06b5d..93d671ca518e 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk)
169 } 169 }
170 ppaf.ch_len = power_len; 170 ppaf.ch_len = power_len;
171 171
172 power_len = get_count_order(geo->luns_per_chnl); 172 power_len = get_count_order(geo->nr_luns);
173 if (1 << power_len != geo->luns_per_chnl) { 173 if (1 << power_len != geo->nr_luns) {
174 pr_err("pblk: supports only power-of-two LUN config.\n"); 174 pr_err("pblk: supports only power-of-two LUN config.\n");
175 return -EINVAL; 175 return -EINVAL;
176 } 176 }
@@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk)
254 struct nvm_geo *geo = &dev->geo; 254 struct nvm_geo *geo = &dev->geo;
255 255
256 pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * 256 pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
257 geo->nr_planes * geo->nr_luns; 257 geo->nr_planes * geo->all_luns;
258 258
259 if (pblk_init_global_caches(pblk)) 259 if (pblk_init_global_caches(pblk))
260 return -ENOMEM; 260 return -ENOMEM;
@@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk)
270 if (!pblk->gen_ws_pool) 270 if (!pblk->gen_ws_pool)
271 goto free_page_bio_pool; 271 goto free_page_bio_pool;
272 272
273 pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); 273 pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
274 pblk_rec_cache);
274 if (!pblk->rec_pool) 275 if (!pblk->rec_pool)
275 goto free_gen_ws_pool; 276 goto free_gen_ws_pool;
276 277
277 pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, 278 pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
278 pblk_g_rq_cache); 279 pblk_g_rq_cache);
279 if (!pblk->r_rq_pool) 280 if (!pblk->r_rq_pool)
280 goto free_rec_pool; 281 goto free_rec_pool;
281 282
282 pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, 283 pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
283 pblk_g_rq_cache); 284 pblk_g_rq_cache);
284 if (!pblk->e_rq_pool) 285 if (!pblk->e_rq_pool)
285 goto free_r_rq_pool; 286 goto free_r_rq_pool;
286 287
287 pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, 288 pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
288 pblk_w_rq_cache); 289 pblk_w_rq_cache);
289 if (!pblk->w_rq_pool) 290 if (!pblk->w_rq_pool)
290 goto free_e_rq_pool; 291 goto free_e_rq_pool;
@@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk)
354 mempool_destroy(pblk->e_rq_pool); 355 mempool_destroy(pblk->e_rq_pool);
355 mempool_destroy(pblk->w_rq_pool); 356 mempool_destroy(pblk->w_rq_pool);
356 357
358 pblk_rwb_free(pblk);
359
357 pblk_free_global_caches(pblk); 360 pblk_free_global_caches(pblk);
358} 361}
359 362
@@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
409 u8 *blks; 412 u8 *blks;
410 int nr_blks, ret; 413 int nr_blks, ret;
411 414
412 nr_blks = geo->blks_per_lun * geo->plane_mode; 415 nr_blks = geo->nr_chks * geo->plane_mode;
413 blks = kmalloc(nr_blks, GFP_KERNEL); 416 blks = kmalloc(nr_blks, GFP_KERNEL);
414 if (!blks) 417 if (!blks)
415 return -ENOMEM; 418 return -ENOMEM;
@@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
482 int i, ret; 485 int i, ret;
483 486
484 /* TODO: Implement unbalanced LUN support */ 487 /* TODO: Implement unbalanced LUN support */
485 if (geo->luns_per_chnl < 0) { 488 if (geo->nr_luns < 0) {
486 pr_err("pblk: unbalanced LUN config.\n"); 489 pr_err("pblk: unbalanced LUN config.\n");
487 return -EINVAL; 490 return -EINVAL;
488 } 491 }
489 492
490 pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); 493 pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
494 GFP_KERNEL);
491 if (!pblk->luns) 495 if (!pblk->luns)
492 return -ENOMEM; 496 return -ENOMEM;
493 497
494 for (i = 0; i < geo->nr_luns; i++) { 498 for (i = 0; i < geo->all_luns; i++) {
495 /* Stripe across channels */ 499 /* Stripe across channels */
496 int ch = i % geo->nr_chnls; 500 int ch = i % geo->nr_chnls;
497 int lun_raw = i / geo->nr_chnls; 501 int lun_raw = i / geo->nr_chnls;
498 int lunid = lun_raw + ch * geo->luns_per_chnl; 502 int lunid = lun_raw + ch * geo->nr_luns;
499 503
500 rlun = &pblk->luns[i]; 504 rlun = &pblk->luns[i];
501 rlun->bppa = luns[lunid]; 505 rlun->bppa = luns[lunid];
@@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
577static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) 581static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
578{ 582{
579 struct nvm_tgt_dev *dev = pblk->dev; 583 struct nvm_tgt_dev *dev = pblk->dev;
584 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
585 struct pblk_line_meta *lm = &pblk->lm;
580 struct nvm_geo *geo = &dev->geo; 586 struct nvm_geo *geo = &dev->geo;
581 sector_t provisioned; 587 sector_t provisioned;
588 int sec_meta, blk_meta;
582 589
583 pblk->over_pct = 20; 590 if (geo->op == NVM_TARGET_DEFAULT_OP)
591 pblk->op = PBLK_DEFAULT_OP;
592 else
593 pblk->op = geo->op;
584 594
585 provisioned = nr_free_blks; 595 provisioned = nr_free_blks;
586 provisioned *= (100 - pblk->over_pct); 596 provisioned *= (100 - pblk->op);
587 sector_div(provisioned, 100); 597 sector_div(provisioned, 100);
588 598
599 pblk->op_blks = nr_free_blks - provisioned;
600
589 /* Internally pblk manages all free blocks, but all calculations based 601 /* Internally pblk manages all free blocks, but all calculations based
590 * on user capacity consider only provisioned blocks 602 * on user capacity consider only provisioned blocks
591 */ 603 */
592 pblk->rl.total_blocks = nr_free_blks; 604 pblk->rl.total_blocks = nr_free_blks;
593 pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; 605 pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
594 pblk->capacity = provisioned * geo->sec_per_blk; 606
607 /* Consider sectors used for metadata */
608 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
609 blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
610
611 pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
612
595 atomic_set(&pblk->rl.free_blocks, nr_free_blks); 613 atomic_set(&pblk->rl.free_blocks, nr_free_blks);
614 atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
596} 615}
597 616
598static int pblk_lines_alloc_metadata(struct pblk *pblk) 617static int pblk_lines_alloc_metadata(struct pblk *pblk)
@@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk)
683 int i, ret; 702 int i, ret;
684 703
685 pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); 704 pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
686 max_write_ppas = pblk->min_write_pgs * geo->nr_luns; 705 max_write_ppas = pblk->min_write_pgs * geo->all_luns;
687 pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? 706 pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
688 max_write_ppas : nvm_max_phys_sects(dev); 707 max_write_ppas : nvm_max_phys_sects(dev);
689 pblk_set_sec_per_write(pblk, pblk->min_write_pgs); 708 pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
@@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk)
693 return -EINVAL; 712 return -EINVAL;
694 } 713 }
695 714
696 div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); 715 div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
697 if (mod) { 716 if (mod) {
698 pr_err("pblk: bad configuration of sectors/pages\n"); 717 pr_err("pblk: bad configuration of sectors/pages\n");
699 return -EINVAL; 718 return -EINVAL;
700 } 719 }
701 720
702 l_mg->nr_lines = geo->blks_per_lun; 721 l_mg->nr_lines = geo->nr_chks;
703 l_mg->log_line = l_mg->data_line = NULL; 722 l_mg->log_line = l_mg->data_line = NULL;
704 l_mg->l_seq_nr = l_mg->d_seq_nr = 0; 723 l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
705 l_mg->nr_free_lines = 0; 724 l_mg->nr_free_lines = 0;
706 bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); 725 bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
707 726
708 lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; 727 lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
709 lm->blk_per_line = geo->nr_luns; 728 lm->blk_per_line = geo->all_luns;
710 lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 729 lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
711 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); 730 lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
712 lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); 731 lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
713 lm->mid_thrs = lm->sec_per_line / 2; 732 lm->mid_thrs = lm->sec_per_line / 2;
714 lm->high_thrs = lm->sec_per_line / 4; 733 lm->high_thrs = lm->sec_per_line / 4;
715 lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; 734 lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
716 735
717 /* Calculate necessary pages for smeta. See comment over struct 736 /* Calculate necessary pages for smeta. See comment over struct
718 * line_smeta definition 737 * line_smeta definition
@@ -742,12 +761,12 @@ add_emeta_page:
742 goto add_emeta_page; 761 goto add_emeta_page;
743 } 762 }
744 763
745 lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; 764 lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
746 765
747 lm->min_blk_line = 1; 766 lm->min_blk_line = 1;
748 if (geo->nr_luns > 1) 767 if (geo->all_luns > 1)
749 lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + 768 lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
750 lm->emeta_sec[0], geo->sec_per_blk); 769 lm->emeta_sec[0], geo->sec_per_chk);
751 770
752 if (lm->min_blk_line > lm->blk_per_line) { 771 if (lm->min_blk_line > lm->blk_per_line) {
753 pr_err("pblk: config. not supported. Min. LUN in line:%d\n", 772 pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
@@ -772,7 +791,7 @@ add_emeta_page:
772 goto fail_free_bb_template; 791 goto fail_free_bb_template;
773 } 792 }
774 793
775 bb_distance = (geo->nr_luns) * geo->sec_per_pl; 794 bb_distance = (geo->all_luns) * geo->sec_per_pl;
776 for (i = 0; i < lm->sec_per_line; i += bb_distance) 795 for (i = 0; i < lm->sec_per_line; i += bb_distance)
777 bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); 796 bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
778 797
@@ -844,7 +863,7 @@ add_emeta_page:
844 pblk_set_provision(pblk, nr_free_blks); 863 pblk_set_provision(pblk, nr_free_blks);
845 864
846 /* Cleanup per-LUN bad block lists - managed within lines on run-time */ 865 /* Cleanup per-LUN bad block lists - managed within lines on run-time */
847 for (i = 0; i < geo->nr_luns; i++) 866 for (i = 0; i < geo->all_luns; i++)
848 kfree(pblk->luns[i].bb_list); 867 kfree(pblk->luns[i].bb_list);
849 868
850 return 0; 869 return 0;
@@ -858,7 +877,7 @@ fail_free_bb_template:
858fail_free_meta: 877fail_free_meta:
859 pblk_line_meta_free(pblk); 878 pblk_line_meta_free(pblk);
860fail: 879fail:
861 for (i = 0; i < geo->nr_luns; i++) 880 for (i = 0; i < geo->all_luns; i++)
862 kfree(pblk->luns[i].bb_list); 881 kfree(pblk->luns[i].bb_list);
863 882
864 return ret; 883 return ret;
@@ -866,15 +885,19 @@ fail:
866 885
867static int pblk_writer_init(struct pblk *pblk) 886static int pblk_writer_init(struct pblk *pblk)
868{ 887{
869 timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
870 mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
871
872 pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); 888 pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
873 if (IS_ERR(pblk->writer_ts)) { 889 if (IS_ERR(pblk->writer_ts)) {
874 pr_err("pblk: could not allocate writer kthread\n"); 890 int err = PTR_ERR(pblk->writer_ts);
875 return PTR_ERR(pblk->writer_ts); 891
892 if (err != -EINTR)
893 pr_err("pblk: could not allocate writer kthread (%d)\n",
894 err);
895 return err;
876 } 896 }
877 897
898 timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
899 mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
900
878 return 0; 901 return 0;
879} 902}
880 903
@@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk)
910 pblk_pipeline_stop(pblk); 933 pblk_pipeline_stop(pblk);
911 pblk_writer_stop(pblk); 934 pblk_writer_stop(pblk);
912 pblk_rb_sync_l2p(&pblk->rwb); 935 pblk_rb_sync_l2p(&pblk->rwb);
913 pblk_rwb_free(pblk);
914 pblk_rl_free(&pblk->rl); 936 pblk_rl_free(&pblk->rl);
915 937
916 pr_debug("pblk: consistent tear down\n"); 938 pr_debug("pblk: consistent tear down\n");
@@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1025 1047
1026 ret = pblk_writer_init(pblk); 1048 ret = pblk_writer_init(pblk);
1027 if (ret) { 1049 if (ret) {
1028 pr_err("pblk: could not initialize write thread\n"); 1050 if (ret != -EINTR)
1051 pr_err("pblk: could not initialize write thread\n");
1029 goto fail_free_lines; 1052 goto fail_free_lines;
1030 } 1053 }
1031 1054
@@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1041 1064
1042 blk_queue_write_cache(tqueue, true, false); 1065 blk_queue_write_cache(tqueue, true, false);
1043 1066
1044 tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; 1067 tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
1045 tqueue->limits.discard_alignment = 0; 1068 tqueue->limits.discard_alignment = 0;
1046 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); 1069 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
1047 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); 1070 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
1048 1071
1049 pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", 1072 pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
1050 geo->nr_luns, pblk->l_mg.nr_lines, 1073 tdisk->disk_name,
1074 geo->all_luns, pblk->l_mg.nr_lines,
1051 (unsigned long long)pblk->rl.nr_secs, 1075 (unsigned long long)pblk->rl.nr_secs,
1052 pblk->rwb.nr_entries); 1076 pblk->rwb.nr_entries);
1053 1077
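
[Editor's note] The pblk-init.c hunks above replace the hard-coded 20% over-provisioning with the value supplied by the device geometry (falling back to PBLK_DEFAULT_OP, 11%), and subtract the blocks consumed by per-line metadata before computing the exposed capacity. A minimal userspace sketch of that arithmetic is shown below; the sample geometry numbers and the NVM_TARGET_DEFAULT_OP sentinel value are invented for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)    (((n) + (d) - 1) / (d))
#define PBLK_DEFAULT_OP       11         /* per the pblk.h hunk further down */
#define NVM_TARGET_DEFAULT_OP (-1)       /* placeholder sentinel for "not set" */

/* Sketch of the provisioning math in pblk_set_provision() (hypothetical values). */
int main(void)
{
	long nr_free_blks = 1020;        /* free chunks reported by the device */
	int geo_op = NVM_TARGET_DEFAULT_OP;
	int sec_per_chk = 4096;          /* sectors per chunk */
	int smeta_sec = 8, emeta_sec0 = 64, nr_free_lines = 60;

	int op = (geo_op == NVM_TARGET_DEFAULT_OP) ? PBLK_DEFAULT_OP : geo_op;
	long provisioned = nr_free_blks * (100 - op) / 100;
	long op_blks = nr_free_blks - provisioned;

	/* Blocks consumed by per-line start/end metadata. */
	long sec_meta = (long)(smeta_sec + emeta_sec0) * nr_free_lines;
	long blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);

	long capacity = (provisioned - blk_meta) * sec_per_chk;

	printf("op=%d%% op_blks=%ld capacity=%ld sectors\n", op, op_blks, capacity);
	return 0;
}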
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6f3ecde2140f..7445e6430c52 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
146 return; 146 return;
147 147
148 /* Erase blocks that are bad in this line but might not be in next */ 148 /* Erase blocks that are bad in this line but might not be in next */
149 if (unlikely(ppa_empty(*erase_ppa)) && 149 if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
150 bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { 150 bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
151 int bit = -1; 151 int bit = -1;
152 152
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b8f78e401482..ec8fc314646b 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
54 rb->seg_size = (1 << power_seg_sz); 54 rb->seg_size = (1 << power_seg_sz);
55 rb->nr_entries = (1 << power_size); 55 rb->nr_entries = (1 << power_size);
56 rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; 56 rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
57 rb->sync_point = EMPTY_ENTRY; 57 rb->flush_point = EMPTY_ENTRY;
58 58
59 spin_lock_init(&rb->w_lock); 59 spin_lock_init(&rb->w_lock);
60 spin_lock_init(&rb->s_lock); 60 spin_lock_init(&rb->s_lock);
@@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
112 up_write(&pblk_rb_lock); 112 up_write(&pblk_rb_lock);
113 113
114#ifdef CONFIG_NVM_DEBUG 114#ifdef CONFIG_NVM_DEBUG
115 atomic_set(&rb->inflight_sync_point, 0); 115 atomic_set(&rb->inflight_flush_point, 0);
116#endif 116#endif
117 117
118 /* 118 /*
@@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
226 pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, 226 pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
227 entry->cacheline); 227 entry->cacheline);
228 228
229 line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; 229 line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
230 kref_put(&line->ref, pblk_line_put); 230 kref_put(&line->ref, pblk_line_put);
231 clean_wctx(w_ctx); 231 clean_wctx(w_ctx);
232 rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); 232 rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
@@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
349 smp_store_release(&entry->w_ctx.flags, flags); 349 smp_store_release(&entry->w_ctx.flags, flags);
350} 350}
351 351
352static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, 352static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
353 unsigned int pos) 353 unsigned int pos)
354{ 354{
355 struct pblk_rb_entry *entry; 355 struct pblk_rb_entry *entry;
356 unsigned int subm, sync_point; 356 unsigned int sync, flush_point;
357 357
358 subm = READ_ONCE(rb->subm); 358 sync = READ_ONCE(rb->sync);
359
360 if (pos == sync)
361 return 0;
359 362
360#ifdef CONFIG_NVM_DEBUG 363#ifdef CONFIG_NVM_DEBUG
361 atomic_inc(&rb->inflight_sync_point); 364 atomic_inc(&rb->inflight_flush_point);
362#endif 365#endif
363 366
364 if (pos == subm) 367 flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
365 return 0; 368 entry = &rb->entries[flush_point];
366 369
367 sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); 370 pblk_rb_sync_init(rb, NULL);
368 entry = &rb->entries[sync_point];
369 371
370 /* Protect syncs */ 372 /* Protect flush points */
371 smp_store_release(&rb->sync_point, sync_point); 373 smp_store_release(&rb->flush_point, flush_point);
372 374
373 if (!bio) 375 if (bio)
374 return 0; 376 bio_list_add(&entry->w_ctx.bios, bio);
375 377
376 spin_lock_irq(&rb->s_lock); 378 pblk_rb_sync_end(rb, NULL);
377 bio_list_add(&entry->w_ctx.bios, bio);
378 spin_unlock_irq(&rb->s_lock);
379 379
380 return 1; 380 return bio ? 1 : 0;
381} 381}
382 382
383static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, 383static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
@@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
416 struct pblk *pblk = container_of(rb, struct pblk, rwb); 416 struct pblk *pblk = container_of(rb, struct pblk, rwb);
417 unsigned int mem = READ_ONCE(rb->mem); 417 unsigned int mem = READ_ONCE(rb->mem);
418 418
419 if (pblk_rb_sync_point_set(rb, NULL, mem)) 419 if (pblk_rb_flush_point_set(rb, NULL, mem))
420 return; 420 return;
421 421
422 pblk_write_should_kick(pblk); 422 pblk_write_should_kick(pblk);
@@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
440#ifdef CONFIG_NVM_DEBUG 440#ifdef CONFIG_NVM_DEBUG
441 atomic_long_inc(&pblk->nr_flush); 441 atomic_long_inc(&pblk->nr_flush);
442#endif 442#endif
443 if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) 443 if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
444 *io_ret = NVM_IO_OK; 444 *io_ret = NVM_IO_OK;
445 } 445 }
446 446
@@ -606,21 +606,6 @@ try:
606 return NVM_IO_ERR; 606 return NVM_IO_ERR;
607 } 607 }
608 608
609 if (flags & PBLK_FLUSH_ENTRY) {
610 unsigned int sync_point;
611
612 sync_point = READ_ONCE(rb->sync_point);
613 if (sync_point == pos) {
614 /* Protect syncs */
615 smp_store_release(&rb->sync_point, EMPTY_ENTRY);
616 }
617
618 flags &= ~PBLK_FLUSH_ENTRY;
619#ifdef CONFIG_NVM_DEBUG
620 atomic_dec(&rb->inflight_sync_point);
621#endif
622 }
623
624 flags &= ~PBLK_WRITTEN_DATA; 609 flags &= ~PBLK_WRITTEN_DATA;
625 flags |= PBLK_SUBMITTED_ENTRY; 610 flags |= PBLK_SUBMITTED_ENTRY;
626 611
@@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
730 715
731unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) 716unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
732{ 717{
733 unsigned int sync; 718 unsigned int sync, flush_point;
734 unsigned int i;
735
736 lockdep_assert_held(&rb->s_lock); 719 lockdep_assert_held(&rb->s_lock);
737 720
738 sync = READ_ONCE(rb->sync); 721 sync = READ_ONCE(rb->sync);
722 flush_point = READ_ONCE(rb->flush_point);
739 723
740 for (i = 0; i < nr_entries; i++) 724 if (flush_point != EMPTY_ENTRY) {
741 sync = (sync + 1) & (rb->nr_entries - 1); 725 unsigned int secs_to_flush;
726
727 secs_to_flush = pblk_rb_ring_count(flush_point, sync,
728 rb->nr_entries);
729 if (secs_to_flush < nr_entries) {
730 /* Protect flush points */
731 smp_store_release(&rb->flush_point, EMPTY_ENTRY);
732 }
733 }
734
735 sync = (sync + nr_entries) & (rb->nr_entries - 1);
742 736
743 /* Protect from counts */ 737 /* Protect from counts */
744 smp_store_release(&rb->sync, sync); 738 smp_store_release(&rb->sync, sync);
@@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
746 return sync; 740 return sync;
747} 741}
748 742
749unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) 743/* Calculate how many sectors to submit up to the current flush point. */
744unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
750{ 745{
751 unsigned int subm, sync_point; 746 unsigned int subm, sync, flush_point;
752 unsigned int count; 747 unsigned int submitted, to_flush;
753 748
754 /* Protect syncs */ 749 /* Protect flush points */
755 sync_point = smp_load_acquire(&rb->sync_point); 750 flush_point = smp_load_acquire(&rb->flush_point);
756 if (sync_point == EMPTY_ENTRY) 751 if (flush_point == EMPTY_ENTRY)
757 return 0; 752 return 0;
758 753
754 /* Protect syncs */
755 sync = smp_load_acquire(&rb->sync);
756
759 subm = READ_ONCE(rb->subm); 757 subm = READ_ONCE(rb->subm);
758 submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
760 759
761 /* The sync point itself counts as a sector to sync */ 760 /* The sync point itself counts as a sector to sync */
762 count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; 761 to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
763 762
764 return count; 763 return (submitted < to_flush) ? (to_flush - submitted) : 0;
765} 764}
766 765
767/* 766/*
@@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb)
801 800
802 if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && 801 if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
803 (rb->sync == rb->l2p_update) && 802 (rb->sync == rb->l2p_update) &&
804 (rb->sync_point == EMPTY_ENTRY)) { 803 (rb->flush_point == EMPTY_ENTRY)) {
805 goto out; 804 goto out;
806 } 805 }
807 806
@@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
848 queued_entries++; 847 queued_entries++;
849 spin_unlock_irq(&rb->s_lock); 848 spin_unlock_irq(&rb->s_lock);
850 849
851 if (rb->sync_point != EMPTY_ENTRY) 850 if (rb->flush_point != EMPTY_ENTRY)
852 offset = scnprintf(buf, PAGE_SIZE, 851 offset = scnprintf(buf, PAGE_SIZE,
853 "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", 852 "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
854 rb->nr_entries, 853 rb->nr_entries,
@@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
857 rb->sync, 856 rb->sync,
858 rb->l2p_update, 857 rb->l2p_update,
859#ifdef CONFIG_NVM_DEBUG 858#ifdef CONFIG_NVM_DEBUG
860 atomic_read(&rb->inflight_sync_point), 859 atomic_read(&rb->inflight_flush_point),
861#else 860#else
862 0, 861 0,
863#endif 862#endif
864 rb->sync_point, 863 rb->flush_point,
865 pblk_rb_read_count(rb), 864 pblk_rb_read_count(rb),
866 pblk_rb_space(rb), 865 pblk_rb_space(rb),
867 pblk_rb_sync_point_count(rb), 866 pblk_rb_flush_point_count(rb),
868 queued_entries); 867 queued_entries);
869 else 868 else
870 offset = scnprintf(buf, PAGE_SIZE, 869 offset = scnprintf(buf, PAGE_SIZE,
@@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
875 rb->sync, 874 rb->sync,
876 rb->l2p_update, 875 rb->l2p_update,
877#ifdef CONFIG_NVM_DEBUG 876#ifdef CONFIG_NVM_DEBUG
878 atomic_read(&rb->inflight_sync_point), 877 atomic_read(&rb->inflight_flush_point),
879#else 878#else
880 0, 879 0,
881#endif 880#endif
882 pblk_rb_read_count(rb), 881 pblk_rb_read_count(rb),
883 pblk_rb_space(rb), 882 pblk_rb_space(rb),
884 pblk_rb_sync_point_count(rb), 883 pblk_rb_flush_point_count(rb),
885 queued_entries); 884 queued_entries);
886 885
887 return offset; 886 return offset;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index ca79d8fb3e60..2f761283f43e 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
141 struct ppa_addr ppa = ppa_list[i]; 141 struct ppa_addr ppa = ppa_list[i];
142 struct pblk_line *line; 142 struct pblk_line *line;
143 143
144 line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; 144 line = &pblk->lines[pblk_ppa_to_line(ppa)];
145 kref_put(&line->ref, pblk_line_put_wq); 145 kref_put(&line->ref, pblk_line_put_wq);
146 } 146 }
147} 147}
@@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio)
158static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, 158static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
159 bool put_line) 159 bool put_line)
160{ 160{
161 struct nvm_tgt_dev *dev = pblk->dev;
161 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); 162 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
162 struct bio *bio = rqd->bio; 163 struct bio *bio = rqd->bio;
164 unsigned long start_time = r_ctx->start_time;
165
166 generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
163 167
164 if (rqd->error) 168 if (rqd->error)
165 pblk_log_read_err(pblk, rqd); 169 pblk_log_read_err(pblk, rqd);
@@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
193 __pblk_end_io_read(pblk, rqd, true); 197 __pblk_end_io_read(pblk, rqd, true);
194} 198}
195 199
196static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, 200static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
197 unsigned int bio_init_idx, 201 unsigned int bio_init_idx,
198 unsigned long *read_bitmap) 202 unsigned long *read_bitmap)
199{ 203{
200 struct bio *new_bio, *bio = rqd->bio; 204 struct bio *new_bio, *bio = rqd->bio;
201 struct pblk_sec_meta *meta_list = rqd->meta_list; 205 struct pblk_sec_meta *meta_list = rqd->meta_list;
@@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
270 i = 0; 274 i = 0;
271 hole = find_first_zero_bit(read_bitmap, nr_secs); 275 hole = find_first_zero_bit(read_bitmap, nr_secs);
272 do { 276 do {
273 int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); 277 int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
274 struct pblk_line *line = &pblk->lines[line_id]; 278 struct pblk_line *line = &pblk->lines[line_id];
275 279
276 kref_put(&line->ref, pblk_line_put); 280 kref_put(&line->ref, pblk_line_put);
@@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
306 return NVM_IO_OK; 310 return NVM_IO_OK;
307 311
308err: 312err:
313 pr_err("pblk: failed to perform partial read\n");
314
309 /* Free allocated pages in new bio */ 315 /* Free allocated pages in new bio */
310 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); 316 pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
311 __pblk_end_io_read(pblk, rqd, false); 317 __pblk_end_io_read(pblk, rqd, false);
@@ -357,6 +363,7 @@ retry:
357int pblk_submit_read(struct pblk *pblk, struct bio *bio) 363int pblk_submit_read(struct pblk *pblk, struct bio *bio)
358{ 364{
359 struct nvm_tgt_dev *dev = pblk->dev; 365 struct nvm_tgt_dev *dev = pblk->dev;
366 struct request_queue *q = dev->q;
360 sector_t blba = pblk_get_lba(bio); 367 sector_t blba = pblk_get_lba(bio);
361 unsigned int nr_secs = pblk_get_secs(bio); 368 unsigned int nr_secs = pblk_get_secs(bio);
362 struct pblk_g_ctx *r_ctx; 369 struct pblk_g_ctx *r_ctx;
@@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
372 return NVM_IO_ERR; 379 return NVM_IO_ERR;
373 } 380 }
374 381
382 generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
383
375 bitmap_zero(&read_bitmap, nr_secs); 384 bitmap_zero(&read_bitmap, nr_secs);
376 385
377 rqd = pblk_alloc_rqd(pblk, PBLK_READ); 386 rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
383 rqd->end_io = pblk_end_io_read; 392 rqd->end_io = pblk_end_io_read;
384 393
385 r_ctx = nvm_rq_to_pdu(rqd); 394 r_ctx = nvm_rq_to_pdu(rqd);
395 r_ctx->start_time = jiffies;
386 r_ctx->lba = blba; 396 r_ctx->lba = blba;
387 397
388 /* Save the index for this bio's start. This is needed in case 398 /* Save the index for this bio's start. This is needed in case
@@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
422 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); 432 int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
423 if (!int_bio) { 433 if (!int_bio) {
424 pr_err("pblk: could not clone read bio\n"); 434 pr_err("pblk: could not clone read bio\n");
425 return NVM_IO_ERR; 435 goto fail_end_io;
426 } 436 }
427 437
428 rqd->bio = int_bio; 438 rqd->bio = int_bio;
@@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
433 pr_err("pblk: read IO submission failed\n"); 443 pr_err("pblk: read IO submission failed\n");
434 if (int_bio) 444 if (int_bio)
435 bio_put(int_bio); 445 bio_put(int_bio);
436 return ret; 446 goto fail_end_io;
437 } 447 }
438 448
439 return NVM_IO_OK; 449 return NVM_IO_OK;
@@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
442 /* The read bio request could be partially filled by the write buffer, 452 /* The read bio request could be partially filled by the write buffer,
443 * but there are some holes that need to be read from the drive. 453 * but there are some holes that need to be read from the drive.
444 */ 454 */
445 ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); 455 return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
446 if (ret) {
447 pr_err("pblk: failed to perform partial read\n");
448 return ret;
449 }
450
451 return NVM_IO_OK;
452 456
453fail_rqd_free: 457fail_rqd_free:
454 pblk_free_rqd(pblk, rqd, PBLK_READ); 458 pblk_free_rqd(pblk, rqd, PBLK_READ);
455 return ret; 459 return ret;
460fail_end_io:
461 __pblk_end_io_read(pblk, rqd, false);
462 return ret;
456} 463}
457 464
458static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, 465static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
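
[Editor's note] The pblk-read.c changes record a start_time in the per-read context so that generic_start_io_acct()/generic_end_io_acct() bracket every user read, and route early failure paths through __pblk_end_io_read() so the accounting and line references are always released. The snippet below is only a userspace analogy for that submit/complete pairing, not the kernel API: the context layout, helper names, and the timestamp source are stand-ins.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for the per-read context carrying the start time,
 * mirroring the start_time field added to struct pblk_g_ctx. */
struct read_ctx {
	struct timespec start_time;
	unsigned long lba;
};

static void account_start(struct read_ctx *ctx)
{
	clock_gettime(CLOCK_MONOTONIC, &ctx->start_time);	/* like r_ctx->start_time = jiffies */
}

static void account_end(const struct read_ctx *ctx)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	printf("lba %lu serviced in %ld us\n", ctx->lba,
	       (long)((now.tv_sec - ctx->start_time.tv_sec) * 1000000L +
		      (now.tv_nsec - ctx->start_time.tv_nsec) / 1000L));
}

int main(void)
{
	struct read_ctx ctx = { .lba = 42 };

	account_start(&ctx);	/* submission path */
	account_end(&ctx);	/* completion path, also taken on error */
	return 0;
}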
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index eadb3eb5d4dc..1d5e961bf5e0 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
111 return 0; 111 return 0;
112} 112}
113 113
114__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) 114int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
115{ 115{
116 u32 crc; 116 u32 crc;
117 117
118 crc = pblk_calc_emeta_crc(pblk, emeta_buf); 118 crc = pblk_calc_emeta_crc(pblk, emeta_buf);
119 if (le32_to_cpu(emeta_buf->crc) != crc) 119 if (le32_to_cpu(emeta_buf->crc) != crc)
120 return NULL; 120 return 1;
121 121
122 if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) 122 if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
123 return NULL; 123 return 1;
124 124
125 return emeta_to_lbas(pblk, emeta_buf); 125 return 0;
126} 126}
127 127
128static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) 128static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
@@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
137 u64 nr_valid_lbas, nr_lbas = 0; 137 u64 nr_valid_lbas, nr_lbas = 0;
138 u64 i; 138 u64 i;
139 139
140 lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); 140 lba_list = emeta_to_lbas(pblk, emeta_buf);
141 if (!lba_list) 141 if (!lba_list)
142 return 1; 142 return 1;
143 143
@@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
149 struct ppa_addr ppa; 149 struct ppa_addr ppa;
150 int pos; 150 int pos;
151 151
152 ppa = addr_to_pblk_ppa(pblk, i, line->id); 152 ppa = addr_to_gen_ppa(pblk, i, line->id);
153 pos = pblk_ppa_to_pos(geo, ppa); 153 pos = pblk_ppa_to_pos(geo, ppa);
154 154
155 /* Do not update bad blocks */ 155 /* Do not update bad blocks */
@@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
188 int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); 188 int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
189 189
190 return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - 190 return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
191 nr_bb * geo->sec_per_blk; 191 nr_bb * geo->sec_per_chk;
192} 192}
193 193
194struct pblk_recov_alloc { 194struct pblk_recov_alloc {
@@ -263,12 +263,12 @@ next_read_rq:
263 int pos; 263 int pos;
264 264
265 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); 265 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
266 pos = pblk_dev_ppa_to_pos(geo, ppa); 266 pos = pblk_ppa_to_pos(geo, ppa);
267 267
268 while (test_bit(pos, line->blk_bitmap)) { 268 while (test_bit(pos, line->blk_bitmap)) {
269 r_ptr_int += pblk->min_write_pgs; 269 r_ptr_int += pblk->min_write_pgs;
270 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); 270 ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
271 pos = pblk_dev_ppa_to_pos(geo, ppa); 271 pos = pblk_ppa_to_pos(geo, ppa);
272 } 272 }
273 273
274 for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) 274 for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
@@ -288,7 +288,7 @@ next_read_rq:
288 /* At this point, the read should not fail. If it does, it is a problem 288 /* At this point, the read should not fail. If it does, it is a problem
289 * we cannot recover from here. Need FTL log. 289 * we cannot recover from here. Need FTL log.
290 */ 290 */
291 if (rqd->error) { 291 if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
292 pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); 292 pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
293 return -EINTR; 293 return -EINTR;
294 } 294 }
@@ -411,12 +411,12 @@ next_pad_rq:
411 int pos; 411 int pos;
412 412
413 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 413 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
414 ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); 414 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
415 pos = pblk_ppa_to_pos(geo, ppa); 415 pos = pblk_ppa_to_pos(geo, ppa);
416 416
417 while (test_bit(pos, line->blk_bitmap)) { 417 while (test_bit(pos, line->blk_bitmap)) {
418 w_ptr += pblk->min_write_pgs; 418 w_ptr += pblk->min_write_pgs;
419 ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); 419 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
420 pos = pblk_ppa_to_pos(geo, ppa); 420 pos = pblk_ppa_to_pos(geo, ppa);
421 } 421 }
422 422
@@ -541,12 +541,12 @@ next_rq:
541 541
542 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 542 w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
543 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); 543 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
544 pos = pblk_dev_ppa_to_pos(geo, ppa); 544 pos = pblk_ppa_to_pos(geo, ppa);
545 545
546 while (test_bit(pos, line->blk_bitmap)) { 546 while (test_bit(pos, line->blk_bitmap)) {
547 w_ptr += pblk->min_write_pgs; 547 w_ptr += pblk->min_write_pgs;
548 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); 548 ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
549 pos = pblk_dev_ppa_to_pos(geo, ppa); 549 pos = pblk_ppa_to_pos(geo, ppa);
550 } 550 }
551 551
552 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) 552 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
@@ -672,12 +672,12 @@ next_rq:
672 672
673 paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); 673 paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
674 ppa = addr_to_gen_ppa(pblk, paddr, line->id); 674 ppa = addr_to_gen_ppa(pblk, paddr, line->id);
675 pos = pblk_dev_ppa_to_pos(geo, ppa); 675 pos = pblk_ppa_to_pos(geo, ppa);
676 676
677 while (test_bit(pos, line->blk_bitmap)) { 677 while (test_bit(pos, line->blk_bitmap)) {
678 paddr += pblk->min_write_pgs; 678 paddr += pblk->min_write_pgs;
679 ppa = addr_to_gen_ppa(pblk, paddr, line->id); 679 ppa = addr_to_gen_ppa(pblk, paddr, line->id);
680 pos = pblk_dev_ppa_to_pos(geo, ppa); 680 pos = pblk_ppa_to_pos(geo, ppa);
681 } 681 }
682 682
683 for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) 683 for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
@@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
817 817
818 while (emeta_secs) { 818 while (emeta_secs) {
819 emeta_start--; 819 emeta_start--;
820 ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); 820 ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
821 pos = pblk_ppa_to_pos(geo, ppa); 821 pos = pblk_ppa_to_pos(geo, ppa);
822 if (!test_bit(pos, line->blk_bitmap)) 822 if (!test_bit(pos, line->blk_bitmap))
823 emeta_secs--; 823 emeta_secs--;
@@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
938 goto next; 938 goto next;
939 } 939 }
940 940
941 if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
942 pblk_recov_l2p_from_oob(pblk, line);
943 goto next;
944 }
945
941 if (pblk_recov_l2p_from_emeta(pblk, line)) 946 if (pblk_recov_l2p_from_emeta(pblk, line))
942 pblk_recov_l2p_from_oob(pblk, line); 947 pblk_recov_l2p_from_oob(pblk, line);
943 948
@@ -984,10 +989,8 @@ next:
984 } 989 }
985 spin_unlock(&l_mg->free_lock); 990 spin_unlock(&l_mg->free_lock);
986 991
987 if (is_next) { 992 if (is_next)
988 pblk_line_erase(pblk, l_mg->data_next); 993 pblk_line_erase(pblk, l_mg->data_next);
989 pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
990 }
991 994
992out: 995out:
993 if (found_lines != recovered_lines) 996 if (found_lines != recovered_lines)
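
[Editor's note] In pblk-recovery.c the old pblk_recov_get_lba_list() (returning a pointer or NULL) becomes pblk_recov_check_emeta(), a plain validity check that returns 0 on success, and the L2P scan now verifies the emeta CRC and magic before choosing between emeta-based and OOB-based recovery. A compact sketch of that decision flow follows; every helper, the toy CRC, and the magic value are stubs invented for illustration.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins only; the real code hashes the emeta buffer and
 * compares against the stored CRC and PBLK_MAGIC. */
struct line_emeta { uint32_t crc; uint32_t magic; };

#define PBLK_MAGIC 0x70626c6b	/* assumed value for the sketch */

static uint32_t calc_emeta_crc(const struct line_emeta *emeta)
{
	return emeta->magic ^ 0xa5a5a5a5;	/* toy CRC for demonstration */
}

/* Same contract as pblk_recov_check_emeta(): 0 = usable, non-zero = corrupt. */
static int recov_check_emeta(const struct line_emeta *emeta)
{
	if (emeta->crc != calc_emeta_crc(emeta))
		return 1;
	if (emeta->magic != PBLK_MAGIC)
		return 1;
	return 0;
}

static void recov_line(const struct line_emeta *emeta)
{
	if (recov_check_emeta(emeta)) {
		puts("emeta corrupt: recovering L2P from OOB area");
		return;
	}
	puts("emeta valid: recovering L2P from emeta LBA list");
}

int main(void)
{
	struct line_emeta good = { .magic = PBLK_MAGIC };
	struct line_emeta bad = { .magic = 0 };

	good.crc = calc_emeta_crc(&good);
	recov_line(&good);
	recov_line(&bad);
	return 0;
}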
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index dacc71922260..0d457b162f23 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
89 return atomic_read(&rl->free_blocks); 89 return atomic_read(&rl->free_blocks);
90} 90}
91 91
92/* 92unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
93 * We check for (i) the number of free blocks in the current LUN and (ii) the 93{
94 * total number of free blocks in the pblk instance. This is to even out the 94 return atomic_read(&rl->free_user_blocks);
95 * number of free blocks on each LUN when GC kicks in. 95}
96 * 96
97 * Only the total number of free blocks is used to configure the rate limiter. 97static void __pblk_rl_update_rates(struct pblk_rl *rl,
98 */ 98 unsigned long free_blocks)
99void pblk_rl_update_rates(struct pblk_rl *rl)
100{ 99{
101 struct pblk *pblk = container_of(rl, struct pblk, rl); 100 struct pblk *pblk = container_of(rl, struct pblk, rl);
102 unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
103 int max = rl->rb_budget; 101 int max = rl->rb_budget;
104 102
105 if (free_blocks >= rl->high) { 103 if (free_blocks >= rl->high) {
@@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl)
132 pblk_gc_should_stop(pblk); 130 pblk_gc_should_stop(pblk);
133} 131}
134 132
133void pblk_rl_update_rates(struct pblk_rl *rl)
134{
135 __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
136}
137
135void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) 138void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
136{ 139{
137 int blk_in_line = atomic_read(&line->blk_in_line); 140 int blk_in_line = atomic_read(&line->blk_in_line);
141 int free_blocks;
138 142
139 atomic_add(blk_in_line, &rl->free_blocks); 143 atomic_add(blk_in_line, &rl->free_blocks);
140 pblk_rl_update_rates(rl); 144 free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
145
146 __pblk_rl_update_rates(rl, free_blocks);
141} 147}
142 148
143void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) 149void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
150 bool used)
144{ 151{
145 int blk_in_line = atomic_read(&line->blk_in_line); 152 int blk_in_line = atomic_read(&line->blk_in_line);
153 int free_blocks;
146 154
147 atomic_sub(blk_in_line, &rl->free_blocks); 155 atomic_sub(blk_in_line, &rl->free_blocks);
148 pblk_rl_update_rates(rl); 156
157 if (used)
158 free_blocks = atomic_sub_return(blk_in_line,
159 &rl->free_user_blocks);
160 else
161 free_blocks = atomic_read(&rl->free_user_blocks);
162
163 __pblk_rl_update_rates(rl, free_blocks);
149} 164}
150 165
151int pblk_rl_high_thrs(struct pblk_rl *rl) 166int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl)
174void pblk_rl_init(struct pblk_rl *rl, int budget) 189void pblk_rl_init(struct pblk_rl *rl, int budget)
175{ 190{
176 struct pblk *pblk = container_of(rl, struct pblk, rl); 191 struct pblk *pblk = container_of(rl, struct pblk, rl);
192 struct nvm_tgt_dev *dev = pblk->dev;
193 struct nvm_geo *geo = &dev->geo;
194 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
177 struct pblk_line_meta *lm = &pblk->lm; 195 struct pblk_line_meta *lm = &pblk->lm;
178 int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; 196 int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
197 int sec_meta, blk_meta;
198
179 unsigned int rb_windows; 199 unsigned int rb_windows;
180 200
181 rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; 201 /* Consider sectors used for metadata */
182 rl->high_pw = get_count_order(rl->high); 202 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
203 blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
183 204
184 rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; 205 rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
185 if (rl->low < min_blocks) 206 rl->high_pw = get_count_order(rl->high);
186 rl->low = min_blocks;
187 207
188 rl->rsv_blocks = min_blocks; 208 rl->rsv_blocks = min_blocks;
189 209
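
[Editor's note] The pblk-rl.c hunks split the free-block accounting into a total count and a user-visible count (free_user_blocks, excluding over-provisioning), and derive the high watermark from the over-provisioned block budget minus metadata blocks and one line, instead of a fixed fraction of all blocks. The watermark arithmetic on its own is sketched below with invented geometry numbers; count_order() mimics what get_count_order() is used for here.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Smallest order with 2^order >= n. */
static int count_order(int n)
{
	int order = 0;

	while ((1 << order) < n)
		order++;
	return order;
}

/* Sketch of the new pblk_rl_init() watermark calculation (all values invented). */
int main(void)
{
	int op_blks = 113;		/* blocks held back by over-provisioning */
	int smeta_sec = 8, emeta_sec0 = 64;
	int nr_free_lines = 60, sec_per_chk = 4096;
	int blk_per_line = 16;

	int sec_meta = (smeta_sec + emeta_sec0) * nr_free_lines;
	int blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);

	/* Below this many user-free blocks the rate limiter starts throttling
	 * user I/O in favour of GC. */
	int high = op_blks - blk_meta - blk_per_line;

	printf("high=%d blocks, high_pw=%d\n", high, count_order(high));
	return 0;
}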
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index cd49e8875d4e..620bab853579 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
28 ssize_t sz = 0; 28 ssize_t sz = 0;
29 int i; 29 int i;
30 30
31 for (i = 0; i < geo->nr_luns; i++) { 31 for (i = 0; i < geo->all_luns; i++) {
32 int active = 1; 32 int active = 1;
33 33
34 rlun = &pblk->luns[i]; 34 rlun = &pblk->luns[i];
@@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
49 49
50static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) 50static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
51{ 51{
52 int free_blocks, total_blocks; 52 int free_blocks, free_user_blocks, total_blocks;
53 int rb_user_max, rb_user_cnt; 53 int rb_user_max, rb_user_cnt;
54 int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; 54 int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
55 55
56 free_blocks = atomic_read(&pblk->rl.free_blocks); 56 free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
57 free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
57 rb_user_max = pblk->rl.rb_user_max; 58 rb_user_max = pblk->rl.rb_user_max;
58 rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); 59 rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
59 rb_gc_max = pblk->rl.rb_gc_max; 60 rb_gc_max = pblk->rl.rb_gc_max;
@@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
64 total_blocks = pblk->rl.total_blocks; 65 total_blocks = pblk->rl.total_blocks;
65 66
66 return snprintf(page, PAGE_SIZE, 67 return snprintf(page, PAGE_SIZE,
67 "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", 68 "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
68 rb_user_cnt, 69 rb_user_cnt,
69 rb_user_max, 70 rb_user_max,
70 rb_gc_cnt, 71 rb_gc_cnt,
71 rb_gc_max, 72 rb_gc_max,
72 rb_state, 73 rb_state,
73 rb_budget, 74 rb_budget,
74 pblk->rl.low,
75 pblk->rl.high, 75 pblk->rl.high,
76 free_blocks, 76 free_blocks,
77 free_user_blocks,
77 total_blocks, 78 total_blocks,
78 READ_ONCE(pblk->rl.rb_user_active)); 79 READ_ONCE(pblk->rl.rb_user_active));
79} 80}
@@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
238 239
239 sz = snprintf(page, PAGE_SIZE - sz, 240 sz = snprintf(page, PAGE_SIZE - sz,
240 "line: nluns:%d, nblks:%d, nsecs:%d\n", 241 "line: nluns:%d, nblks:%d, nsecs:%d\n",
241 geo->nr_luns, lm->blk_per_line, lm->sec_per_line); 242 geo->all_luns, lm->blk_per_line, lm->sec_per_line);
242 243
243 sz += snprintf(page + sz, PAGE_SIZE - sz, 244 sz += snprintf(page + sz, PAGE_SIZE - sz,
244 "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", 245 "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
@@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
287 "blk_line:%d, sec_line:%d, sec_blk:%d\n", 288 "blk_line:%d, sec_line:%d, sec_blk:%d\n",
288 lm->blk_per_line, 289 lm->blk_per_line,
289 lm->sec_per_line, 290 lm->sec_per_line,
290 geo->sec_per_blk); 291 geo->sec_per_chk);
291 292
292 return sz; 293 return sz;
293} 294}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 6c1cafafef53..aae86ed60b98 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
21 struct pblk_c_ctx *c_ctx) 21 struct pblk_c_ctx *c_ctx)
22{ 22{
23 struct bio *original_bio; 23 struct bio *original_bio;
24 struct pblk_rb *rwb = &pblk->rwb;
24 unsigned long ret; 25 unsigned long ret;
25 int i; 26 int i;
26 27
27 for (i = 0; i < c_ctx->nr_valid; i++) { 28 for (i = 0; i < c_ctx->nr_valid; i++) {
28 struct pblk_w_ctx *w_ctx; 29 struct pblk_w_ctx *w_ctx;
30 int pos = c_ctx->sentry + i;
31 int flags;
32
33 w_ctx = pblk_rb_w_ctx(rwb, pos);
34 flags = READ_ONCE(w_ctx->flags);
35
36 if (flags & PBLK_FLUSH_ENTRY) {
37 flags &= ~PBLK_FLUSH_ENTRY;
38 /* Release flags on context. Protect from writes */
39 smp_store_release(&w_ctx->flags, flags);
40
41#ifdef CONFIG_NVM_DEBUG
42 atomic_dec(&rwb->inflight_flush_point);
43#endif
44 }
29 45
30 w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
31 while ((original_bio = bio_list_pop(&w_ctx->bios))) 46 while ((original_bio = bio_list_pop(&w_ctx->bios)))
32 bio_endio(original_bio); 47 bio_endio(original_bio);
33 } 48 }
@@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
439 struct pblk_line *meta_line; 454 struct pblk_line *meta_line;
440 int err; 455 int err;
441 456
442 ppa_set_empty(&erase_ppa); 457 pblk_ppa_set_empty(&erase_ppa);
443 458
444 /* Assign lbas to ppas and populate request structure */ 459 /* Assign lbas to ppas and populate request structure */
445 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); 460 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
@@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
457 return NVM_IO_ERR; 472 return NVM_IO_ERR;
458 } 473 }
459 474
460 if (!ppa_empty(erase_ppa)) { 475 if (!pblk_ppa_empty(erase_ppa)) {
461 /* Submit erase for next data line */ 476 /* Submit erase for next data line */
462 if (pblk_blk_erase_async(pblk, erase_ppa)) { 477 if (pblk_blk_erase_async(pblk, erase_ppa)) {
463 struct pblk_line *e_line = pblk_line_get_erase(pblk); 478 struct pblk_line *e_line = pblk_line_get_erase(pblk);
@@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk)
508 if (!secs_avail) 523 if (!secs_avail)
509 return 1; 524 return 1;
510 525
511 secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb); 526 secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
512 if (!secs_to_flush && secs_avail < pblk->min_write_pgs) 527 if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
513 return 1; 528 return 1;
514 529
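
[Editor's note] pblk-write.c now clears PBLK_FLUSH_ENTRY on the write-buffer context at write completion (where the flush-point debug counter is also decremented) and publishes the updated flags with smp_store_release(). Outside the kernel, the same publish-with-release pattern can be mimicked with C11 atomics; the sketch below is only an analogy for the memory-ordering intent, and the flag values and struct layout are illustrative, not the pblk definitions.

#include <stdatomic.h>
#include <stdio.h>

#define PBLK_FLUSH_ENTRY	(1 << 0)	/* illustrative flag values */
#define PBLK_SUBMITTED_ENTRY	(1 << 1)

/* Per-entry write context; flags are read and written across threads. */
struct w_ctx {
	_Atomic int flags;
};

/* Completion path: drop the flush marker and publish with release semantics,
 * analogous to the kernel's READ_ONCE()/smp_store_release() pairing. */
static void complete_entry(struct w_ctx *w_ctx)
{
	int flags = atomic_load_explicit(&w_ctx->flags, memory_order_relaxed);

	if (flags & PBLK_FLUSH_ENTRY)
		flags &= ~PBLK_FLUSH_ENTRY;	/* flush point satisfied */

	flags |= PBLK_SUBMITTED_ENTRY;
	atomic_store_explicit(&w_ctx->flags, flags, memory_order_release);
}

int main(void)
{
	struct w_ctx ctx = { .flags = PBLK_FLUSH_ENTRY };

	complete_entry(&ctx);
	printf("flags=0x%x\n", atomic_load(&ctx.flags));	/* prints 0x2 */
	return 0;
}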
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 59a64d461a5d..8c357fb6538e 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -51,17 +51,16 @@
51 51
52#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) 52#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
53 53
54#define pblk_for_each_lun(pblk, rlun, i) \
55 for ((i) = 0, rlun = &(pblk)->luns[0]; \
56 (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
57
58/* Static pool sizes */ 54/* Static pool sizes */
59#define PBLK_GEN_WS_POOL_SIZE (2) 55#define PBLK_GEN_WS_POOL_SIZE (2)
60 56
57#define PBLK_DEFAULT_OP (11)
58
61enum { 59enum {
62 PBLK_READ = READ, 60 PBLK_READ = READ,
63 PBLK_WRITE = WRITE,/* Write from write buffer */ 61 PBLK_WRITE = WRITE,/* Write from write buffer */
64 PBLK_WRITE_INT, /* Internal write - no write buffer */ 62 PBLK_WRITE_INT, /* Internal write - no write buffer */
63 PBLK_READ_RECOV, /* Recovery read - errors allowed */
65 PBLK_ERASE, 64 PBLK_ERASE,
66}; 65};
67 66
@@ -114,6 +113,7 @@ struct pblk_c_ctx {
114/* read context */ 113/* read context */
115struct pblk_g_ctx { 114struct pblk_g_ctx {
116 void *private; 115 void *private;
116 unsigned long start_time;
117 u64 lba; 117 u64 lba;
118}; 118};
119 119
@@ -170,7 +170,7 @@ struct pblk_rb {
170 * the last submitted entry that has 170 * the last submitted entry that has
171 * been successfully persisted to media 171 * been successfully persisted to media
172 */ 172 */
173 unsigned int sync_point; /* Sync point - last entry that must be 173 unsigned int flush_point; /* Sync point - last entry that must be
174 * flushed to the media. Used with 174 * flushed to the media. Used with
175 * REQ_FLUSH and REQ_FUA 175 * REQ_FLUSH and REQ_FUA
176 */ 176 */
@@ -193,7 +193,7 @@ struct pblk_rb {
193 spinlock_t s_lock; /* Sync lock */ 193 spinlock_t s_lock; /* Sync lock */
194 194
195#ifdef CONFIG_NVM_DEBUG 195#ifdef CONFIG_NVM_DEBUG
196 atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */ 196 atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
197#endif 197#endif
198}; 198};
199 199
@@ -256,9 +256,6 @@ struct pblk_rl {
256 unsigned int high; /* Upper threshold for rate limiter (free run - 256 unsigned int high; /* Upper threshold for rate limiter (free run -
257 * user I/O rate limiter 257 * user I/O rate limiter
258 */ 258 */
259 unsigned int low; /* Lower threshold for rate limiter (user I/O
260 * rate limiter - stall)
261 */
262 unsigned int high_pw; /* High rounded up as a power of 2 */ 259 unsigned int high_pw; /* High rounded up as a power of 2 */
263 260
264#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ 261#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
@@ -292,7 +289,9 @@ struct pblk_rl {
292 289
293 unsigned long long nr_secs; 290 unsigned long long nr_secs;
294 unsigned long total_blocks; 291 unsigned long total_blocks;
295 atomic_t free_blocks; 292
293 atomic_t free_blocks; /* Total number of free blocks (+ OP) */
294 atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
296}; 295};
297 296
298#define PBLK_LINE_EMPTY (~0U) 297#define PBLK_LINE_EMPTY (~0U)
@@ -583,7 +582,9 @@ struct pblk {
583 */ 582 */
584 583
585 sector_t capacity; /* Device capacity when bad blocks are subtracted */ 584 sector_t capacity; /* Device capacity when bad blocks are subtracted */
586 int over_pct; /* Percentage of device used for over-provisioning */ 585
586 int op; /* Percentage of device used for over-provisioning */
587 int op_blks; /* Number of blocks used for over-provisioning */
587 588
588 /* pblk provisioning values. Used by rate limiter */ 589 /* pblk provisioning values. Used by rate limiter */
589 struct pblk_rl rl; 590 struct pblk_rl rl;
@@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
691struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, 692struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
692 struct ppa_addr *ppa); 693 struct ppa_addr *ppa);
693void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); 694void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
694unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); 695unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
695 696
696unsigned int pblk_rb_read_count(struct pblk_rb *rb); 697unsigned int pblk_rb_read_count(struct pblk_rb *rb);
697unsigned int pblk_rb_sync_count(struct pblk_rb *rb); 698unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
@@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
812void pblk_submit_rec(struct work_struct *work); 813void pblk_submit_rec(struct work_struct *work);
813struct pblk_line *pblk_recov_l2p(struct pblk *pblk); 814struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
814int pblk_recov_pad(struct pblk *pblk); 815int pblk_recov_pad(struct pblk *pblk);
815__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); 816int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
816int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, 817int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
817 struct pblk_rec_ctx *recovery, u64 *comp_bits, 818 struct pblk_rec_ctx *recovery, u64 *comp_bits,
818 unsigned int comp); 819 unsigned int comp);
@@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl);
843void pblk_rl_update_rates(struct pblk_rl *rl); 844void pblk_rl_update_rates(struct pblk_rl *rl);
844int pblk_rl_high_thrs(struct pblk_rl *rl); 845int pblk_rl_high_thrs(struct pblk_rl *rl);
845unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); 846unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
847unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
846int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); 848int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
847void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); 849void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
848void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); 850void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
@@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
851void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); 853void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
852int pblk_rl_max_io(struct pblk_rl *rl); 854int pblk_rl_max_io(struct pblk_rl *rl);
853void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); 855void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
854void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); 856void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
857 bool used);
855int pblk_rl_is_limit(struct pblk_rl *rl); 858int pblk_rl_is_limit(struct pblk_rl *rl);
856 859
857/* 860/*
@@ -907,28 +910,47 @@ static inline int pblk_pad_distance(struct pblk *pblk)
907 struct nvm_tgt_dev *dev = pblk->dev; 910 struct nvm_tgt_dev *dev = pblk->dev;
908 struct nvm_geo *geo = &dev->geo; 911 struct nvm_geo *geo = &dev->geo;
909 912
-	return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+	return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
 }
 
-static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_line(struct ppa_addr p)
 {
 	return p.g.blk;
 }
 
-static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
 {
-	return p.g.blk;
+	return p.g.lun * geo->nr_chnls + p.g.ch;
 }
 
-static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+					      u64 line_id)
 {
-	return p.g.lun * geo->nr_chnls + p.g.ch;
+	struct ppa_addr ppa;
+
+	ppa.ppa = 0;
+	ppa.g.blk = line_id;
+	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+	return ppa;
 }
 
-/* A block within a line corresponds to the lun */
-static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+					    struct ppa_addr p)
 {
-	return p.g.lun * geo->nr_chnls + p.g.ch;
+	u64 paddr;
+
+	paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
+	paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+	paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+	paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+	paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+	return paddr;
 }
 
 static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
@@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
 	return ppa64;
 }
 
-static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
-						 sector_t lba)
-{
-	struct ppa_addr ppa;
-
-	if (pblk->ppaf_bitsize < 32) {
-		u32 *map = (u32 *)pblk->trans_map;
-
-		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
-	} else {
-		struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
-
-		ppa = map[lba];
-	}
-
-	return ppa;
-}
-
 static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
 {
 	u32 ppa32 = 0;
@@ -999,33 +1003,36 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
 	return ppa32;
 }
 
-static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
-				      struct ppa_addr ppa)
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+						 sector_t lba)
 {
+	struct ppa_addr ppa;
+
 	if (pblk->ppaf_bitsize < 32) {
 		u32 *map = (u32 *)pblk->trans_map;
 
-		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
 	} else {
-		u64 *map = (u64 *)pblk->trans_map;
+		struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
 
-		map[lba] = ppa.ppa;
+		ppa = map[lba];
 	}
+
+	return ppa;
 }
 
-static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
-					    struct ppa_addr p)
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+				      struct ppa_addr ppa)
 {
-	u64 paddr;
+	if (pblk->ppaf_bitsize < 32) {
+		u32 *map = (u32 *)pblk->trans_map;
 
-	paddr = 0;
-	paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
-	paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
-	paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
-	paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
-	paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+	} else {
+		u64 *map = (u64 *)pblk->trans_map;
 
-	return paddr;
+		map[lba] = ppa.ppa;
+	}
 }
 
 static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
@@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
 
 static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
 {
-	if (lppa.ppa == rppa.ppa)
-		return true;
-
-	return false;
+	return (lppa.ppa == rppa.ppa);
 }
 
 static inline int pblk_addr_in_cache(struct ppa_addr ppa)
@@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
 	return p;
 }
 
-static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
-					      u64 line_id)
-{
-	struct ppa_addr ppa;
-
-	ppa.ppa = 0;
-	ppa.g.blk = line_id;
-	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
-	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
-	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
-	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
-	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
-
-	return ppa;
-}
-
-static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
-					       u64 line_id)
-{
-	struct ppa_addr ppa;
-
-	ppa = addr_to_gen_ppa(pblk, paddr, line_id);
-
-	return ppa;
-}
-
 static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
 					    struct line_header *header)
 {
@@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
 
 		if (!ppa->c.is_cached &&
 			ppa->g.ch < geo->nr_chnls &&
-			ppa->g.lun < geo->luns_per_chnl &&
+			ppa->g.lun < geo->nr_luns &&
 			ppa->g.pl < geo->nr_planes &&
-			ppa->g.blk < geo->blks_per_lun &&
-			ppa->g.pg < geo->pgs_per_blk &&
+			ppa->g.blk < geo->nr_chks &&
+			ppa->g.pg < geo->ws_per_chk &&
 			ppa->g.sec < geo->sec_per_pg)
 			continue;
 
@@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
 
 	for (i = 0; i < rqd->nr_ppas; i++) {
 		ppa = ppa_list[i];
-		line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+		line = &pblk->lines[pblk_ppa_to_line(ppa)];
 
 		spin_lock(&line->lock);
 		if (line->state != PBLK_LINESTATE_OPEN) {
@@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
 	return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
 }
 
-static inline sector_t pblk_get_sector(sector_t lba)
-{
-	return lba * NR_PHY_IN_LOG;
-}
-
 static inline void pblk_setup_uuid(struct pblk *pblk)
 {
 	uuid_le uuid;
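
The addr_to_gen_ppa()/pblk_dev_ppa_to_line_addr() pair moved above is a plain mask-and-shift packing of a line-local physical address. Below is a minimal stand-alone sketch of that round trip; the field widths and offsets are made up for illustration (pblk derives the real masks and offsets from the device geometry at init time), so none of the constants come from the driver.

/* Hypothetical field layout: pg | lun | ch | pl | sec (low bits). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SEC_BITS 2
#define PL_BITS  1
#define CH_BITS  3
#define LUN_BITS 3
#define PG_BITS  8

#define SEC_OFF 0
#define PL_OFF  (SEC_OFF + SEC_BITS)
#define CH_OFF  (PL_OFF + PL_BITS)
#define LUN_OFF (CH_OFF + CH_BITS)
#define PG_OFF  (LUN_OFF + LUN_BITS)

struct gen_ppa { unsigned sec, pl, ch, lun, pg; };

/* pack: mirrors the shape of pblk_dev_ppa_to_line_addr() */
static uint64_t pack(struct gen_ppa p)
{
	return ((uint64_t)p.pg  << PG_OFF)  |
	       ((uint64_t)p.lun << LUN_OFF) |
	       ((uint64_t)p.ch  << CH_OFF)  |
	       ((uint64_t)p.pl  << PL_OFF)  |
	       ((uint64_t)p.sec << SEC_OFF);
}

/* unpack: mirrors the shape of addr_to_gen_ppa() */
static struct gen_ppa unpack(uint64_t paddr)
{
	struct gen_ppa p;

	p.sec = (paddr >> SEC_OFF) & ((1u << SEC_BITS) - 1);
	p.pl  = (paddr >> PL_OFF)  & ((1u << PL_BITS)  - 1);
	p.ch  = (paddr >> CH_OFF)  & ((1u << CH_BITS)  - 1);
	p.lun = (paddr >> LUN_OFF) & ((1u << LUN_BITS) - 1);
	p.pg  = (paddr >> PG_OFF)  & ((1u << PG_BITS)  - 1);
	return p;
}

int main(void)
{
	struct gen_ppa in = { .sec = 3, .pl = 1, .ch = 5, .lun = 2, .pg = 200 };
	struct gen_ppa out = unpack(pack(in));

	assert(in.sec == out.sec && in.pl == out.pl && in.ch == out.ch &&
	       in.lun == out.lun && in.pg == out.pg);
	printf("paddr=0x%llx round-trips\n", (unsigned long long)pack(in));
	return 0;
}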
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
deleted file mode 100644
index 0993c14be860..000000000000
--- a/drivers/lightnvm/rrpc.c
+++ /dev/null
@@ -1,1625 +0,0 @@
1/*
2 * Copyright (C) 2015 IT University of Copenhagen
3 * Initial release: Matias Bjorling <m@bjorling.me>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
15 */
16
17#include "rrpc.h"
18
19static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache;
20static DECLARE_RWSEM(rrpc_lock);
21
22static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
23 struct nvm_rq *rqd, unsigned long flags);
24
25#define rrpc_for_each_lun(rrpc, rlun, i) \
26 for ((i) = 0, rlun = &(rrpc)->luns[0]; \
27 (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)])
28
29static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
30{
31 struct nvm_tgt_dev *dev = rrpc->dev;
32 struct rrpc_block *rblk = a->rblk;
33 unsigned int pg_offset;
34
35 lockdep_assert_held(&rrpc->rev_lock);
36
37 if (a->addr == ADDR_EMPTY || !rblk)
38 return;
39
40 spin_lock(&rblk->lock);
41
42 div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset);
43 WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages));
44 rblk->nr_invalid_pages++;
45
46 spin_unlock(&rblk->lock);
47
48 rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY;
49}
50
51static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
52 unsigned int len)
53{
54 sector_t i;
55
56 spin_lock(&rrpc->rev_lock);
57 for (i = slba; i < slba + len; i++) {
58 struct rrpc_addr *gp = &rrpc->trans_map[i];
59
60 rrpc_page_invalidate(rrpc, gp);
61 gp->rblk = NULL;
62 }
63 spin_unlock(&rrpc->rev_lock);
64}
65
66static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
67 sector_t laddr, unsigned int pages)
68{
69 struct nvm_rq *rqd;
70 struct rrpc_inflight_rq *inf;
71
72 rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC);
73 if (!rqd)
74 return ERR_PTR(-ENOMEM);
75
76 inf = rrpc_get_inflight_rq(rqd);
77 if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) {
78 mempool_free(rqd, rrpc->rq_pool);
79 return NULL;
80 }
81
82 return rqd;
83}
84
85static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq *rqd)
86{
87 struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd);
88
89 rrpc_unlock_laddr(rrpc, inf);
90
91 mempool_free(rqd, rrpc->rq_pool);
92}
93
94static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
95{
96 sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
97 sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
98 struct nvm_rq *rqd;
99
100 while (1) {
101 rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
102 if (rqd)
103 break;
104
105 schedule();
106 }
107
108 if (IS_ERR(rqd)) {
109 pr_err("rrpc: unable to acquire inflight IO\n");
110 bio_io_error(bio);
111 return;
112 }
113
114 rrpc_invalidate_range(rrpc, slba, len);
115 rrpc_inflight_laddr_release(rrpc, rqd);
116}
117
118static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
119{
120 struct nvm_tgt_dev *dev = rrpc->dev;
121
122 return (rblk->next_page == dev->geo.sec_per_blk);
123}
124
125/* Calculate relative addr for the given block, considering instantiated LUNs */
126static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
127{
128 struct nvm_tgt_dev *dev = rrpc->dev;
129 struct rrpc_lun *rlun = rblk->rlun;
130
131 return rlun->id * dev->geo.sec_per_blk;
132}
133
134static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev,
135 struct rrpc_addr *gp)
136{
137 struct rrpc_block *rblk = gp->rblk;
138 struct rrpc_lun *rlun = rblk->rlun;
139 u64 addr = gp->addr;
140 struct ppa_addr paddr;
141
142 paddr.ppa = addr;
143 paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr);
144 paddr.g.ch = rlun->bppa.g.ch;
145 paddr.g.lun = rlun->bppa.g.lun;
146 paddr.g.blk = rblk->id;
147
148 return paddr;
149}
150
151/* requires lun->lock taken */
152static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
153 struct rrpc_block **cur_rblk)
154{
155 struct rrpc *rrpc = rlun->rrpc;
156
157 if (*cur_rblk) {
158 spin_lock(&(*cur_rblk)->lock);
159 WARN_ON(!block_is_full(rrpc, *cur_rblk));
160 spin_unlock(&(*cur_rblk)->lock);
161 }
162 *cur_rblk = new_rblk;
163}
164
165static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc,
166 struct rrpc_lun *rlun)
167{
168 struct rrpc_block *rblk = NULL;
169
170 if (list_empty(&rlun->free_list))
171 goto out;
172
173 rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list);
174
175 list_move_tail(&rblk->list, &rlun->used_list);
176 rblk->state = NVM_BLK_ST_TGT;
177 rlun->nr_free_blocks--;
178
179out:
180 return rblk;
181}
182
183static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
184 unsigned long flags)
185{
186 struct nvm_tgt_dev *dev = rrpc->dev;
187 struct rrpc_block *rblk;
188 int is_gc = flags & NVM_IOTYPE_GC;
189
190 spin_lock(&rlun->lock);
191 if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) {
192 pr_err("nvm: rrpc: cannot give block to non GC request\n");
193 spin_unlock(&rlun->lock);
194 return NULL;
195 }
196
197 rblk = __rrpc_get_blk(rrpc, rlun);
198 if (!rblk) {
199 pr_err("nvm: rrpc: cannot get new block\n");
200 spin_unlock(&rlun->lock);
201 return NULL;
202 }
203 spin_unlock(&rlun->lock);
204
205 bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk);
206 rblk->next_page = 0;
207 rblk->nr_invalid_pages = 0;
208 atomic_set(&rblk->data_cmnt_size, 0);
209
210 return rblk;
211}
212
213static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
214{
215 struct rrpc_lun *rlun = rblk->rlun;
216
217 spin_lock(&rlun->lock);
218 if (rblk->state & NVM_BLK_ST_TGT) {
219 list_move_tail(&rblk->list, &rlun->free_list);
220 rlun->nr_free_blocks++;
221 rblk->state = NVM_BLK_ST_FREE;
222 } else if (rblk->state & NVM_BLK_ST_BAD) {
223 list_move_tail(&rblk->list, &rlun->bb_list);
224 rblk->state = NVM_BLK_ST_BAD;
225 } else {
226 WARN_ON_ONCE(1);
227 pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n",
228 rlun->bppa.g.ch, rlun->bppa.g.lun,
229 rblk->id, rblk->state);
230 list_move_tail(&rblk->list, &rlun->bb_list);
231 }
232 spin_unlock(&rlun->lock);
233}
234
235static void rrpc_put_blks(struct rrpc *rrpc)
236{
237 struct rrpc_lun *rlun;
238 int i;
239
240 for (i = 0; i < rrpc->nr_luns; i++) {
241 rlun = &rrpc->luns[i];
242 if (rlun->cur)
243 rrpc_put_blk(rrpc, rlun->cur);
244 if (rlun->gc_cur)
245 rrpc_put_blk(rrpc, rlun->gc_cur);
246 }
247}
248
249static struct rrpc_lun *get_next_lun(struct rrpc *rrpc)
250{
251 int next = atomic_inc_return(&rrpc->next_lun);
252
253 return &rrpc->luns[next % rrpc->nr_luns];
254}
255
256static void rrpc_gc_kick(struct rrpc *rrpc)
257{
258 struct rrpc_lun *rlun;
259 unsigned int i;
260
261 for (i = 0; i < rrpc->nr_luns; i++) {
262 rlun = &rrpc->luns[i];
263 queue_work(rrpc->krqd_wq, &rlun->ws_gc);
264 }
265}
266
267/*
268 * timed GC every interval.
269 */
270static void rrpc_gc_timer(struct timer_list *t)
271{
272 struct rrpc *rrpc = from_timer(rrpc, t, gc_timer);
273
274 rrpc_gc_kick(rrpc);
275 mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
276}
277
278static void rrpc_end_sync_bio(struct bio *bio)
279{
280 struct completion *waiting = bio->bi_private;
281
282 if (bio->bi_status)
283 pr_err("nvm: gc request failed (%u).\n", bio->bi_status);
284
285 complete(waiting);
286}
287
288/*
289 * rrpc_move_valid_pages -- migrate live data off the block
290 * @rrpc: the 'rrpc' structure
291 * @block: the block from which to migrate live pages
292 *
293 * Description:
294 * GC algorithms may call this function to migrate remaining live
295 * pages off the block prior to erasing it. This function blocks
296 * further execution until the operation is complete.
297 */
298static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
299{
300 struct nvm_tgt_dev *dev = rrpc->dev;
301 struct request_queue *q = dev->q;
302 struct rrpc_rev_addr *rev;
303 struct nvm_rq *rqd;
304 struct bio *bio;
305 struct page *page;
306 int slot;
307 int nr_sec_per_blk = dev->geo.sec_per_blk;
308 u64 phys_addr;
309 DECLARE_COMPLETION_ONSTACK(wait);
310
311 if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk))
312 return 0;
313
314 bio = bio_alloc(GFP_NOIO, 1);
315 if (!bio) {
316 pr_err("nvm: could not alloc bio to gc\n");
317 return -ENOMEM;
318 }
319
320 page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
321
322 while ((slot = find_first_zero_bit(rblk->invalid_pages,
323 nr_sec_per_blk)) < nr_sec_per_blk) {
324
325 /* Lock laddr */
326 phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot;
327
328try:
329 spin_lock(&rrpc->rev_lock);
330 /* Get logical address from physical to logical table */
331 rev = &rrpc->rev_trans_map[phys_addr];
332 /* already updated by previous regular write */
333 if (rev->addr == ADDR_EMPTY) {
334 spin_unlock(&rrpc->rev_lock);
335 continue;
336 }
337
338 rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
339 if (IS_ERR_OR_NULL(rqd)) {
340 spin_unlock(&rrpc->rev_lock);
341 schedule();
342 goto try;
343 }
344
345 spin_unlock(&rrpc->rev_lock);
346
347 /* Perform read to do GC */
348 bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
349 bio_set_op_attrs(bio, REQ_OP_READ, 0);
350 bio->bi_private = &wait;
351 bio->bi_end_io = rrpc_end_sync_bio;
352
353 /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */
354 bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
355
356 if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
357 pr_err("rrpc: gc read failed.\n");
358 rrpc_inflight_laddr_release(rrpc, rqd);
359 goto finished;
360 }
361 wait_for_completion_io(&wait);
362 if (bio->bi_status) {
363 rrpc_inflight_laddr_release(rrpc, rqd);
364 goto finished;
365 }
366
367 bio_reset(bio);
368 reinit_completion(&wait);
369
370 bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
371 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
372 bio->bi_private = &wait;
373 bio->bi_end_io = rrpc_end_sync_bio;
374
375 bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
376
377 /* turn the command around and write the data back to a new
378 * address
379 */
380 if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
381 pr_err("rrpc: gc write failed.\n");
382 rrpc_inflight_laddr_release(rrpc, rqd);
383 goto finished;
384 }
385 wait_for_completion_io(&wait);
386
387 rrpc_inflight_laddr_release(rrpc, rqd);
388 if (bio->bi_status)
389 goto finished;
390
391 bio_reset(bio);
392 }
393
394finished:
395 mempool_free(page, rrpc->page_pool);
396 bio_put(bio);
397
398 if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) {
399 pr_err("nvm: failed to garbage collect block\n");
400 return -EIO;
401 }
402
403 return 0;
404}
405
406static void rrpc_block_gc(struct work_struct *work)
407{
408 struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
409 ws_gc);
410 struct rrpc *rrpc = gcb->rrpc;
411 struct rrpc_block *rblk = gcb->rblk;
412 struct rrpc_lun *rlun = rblk->rlun;
413 struct ppa_addr ppa;
414
415 mempool_free(gcb, rrpc->gcb_pool);
416 pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n",
417 rlun->bppa.g.ch, rlun->bppa.g.lun,
418 rblk->id);
419
420 if (rrpc_move_valid_pages(rrpc, rblk))
421 goto put_back;
422
423 ppa.ppa = 0;
424 ppa.g.ch = rlun->bppa.g.ch;
425 ppa.g.lun = rlun->bppa.g.lun;
426 ppa.g.blk = rblk->id;
427
428 if (nvm_erase_sync(rrpc->dev, &ppa, 1))
429 goto put_back;
430
431 rrpc_put_blk(rrpc, rblk);
432
433 return;
434
435put_back:
436 spin_lock(&rlun->lock);
437 list_add_tail(&rblk->prio, &rlun->prio_list);
438 spin_unlock(&rlun->lock);
439}
440
441/* the block with highest number of invalid pages, will be in the beginning
442 * of the list
443 */
444static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra,
445 struct rrpc_block *rb)
446{
447 if (ra->nr_invalid_pages == rb->nr_invalid_pages)
448 return ra;
449
450 return (ra->nr_invalid_pages < rb->nr_invalid_pages) ? rb : ra;
451}
452
453/* linearly find the block with highest number of invalid pages
454 * requires lun->lock
455 */
456static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun)
457{
458 struct list_head *prio_list = &rlun->prio_list;
459 struct rrpc_block *rblk, *max;
460
461 BUG_ON(list_empty(prio_list));
462
463 max = list_first_entry(prio_list, struct rrpc_block, prio);
464 list_for_each_entry(rblk, prio_list, prio)
465 max = rblk_max_invalid(max, rblk);
466
467 return max;
468}
469
470static void rrpc_lun_gc(struct work_struct *work)
471{
472 struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc);
473 struct rrpc *rrpc = rlun->rrpc;
474 struct nvm_tgt_dev *dev = rrpc->dev;
475 struct rrpc_block_gc *gcb;
476 unsigned int nr_blocks_need;
477
478 nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE;
479
480 if (nr_blocks_need < rrpc->nr_luns)
481 nr_blocks_need = rrpc->nr_luns;
482
483 spin_lock(&rlun->lock);
484 while (nr_blocks_need > rlun->nr_free_blocks &&
485 !list_empty(&rlun->prio_list)) {
486 struct rrpc_block *rblk = block_prio_find_max(rlun);
487
488 if (!rblk->nr_invalid_pages)
489 break;
490
491 gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
492 if (!gcb)
493 break;
494
495 list_del_init(&rblk->prio);
496
497 WARN_ON(!block_is_full(rrpc, rblk));
498
499 pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n",
500 rlun->bppa.g.ch, rlun->bppa.g.lun,
501 rblk->id);
502
503 gcb->rrpc = rrpc;
504 gcb->rblk = rblk;
505 INIT_WORK(&gcb->ws_gc, rrpc_block_gc);
506
507 queue_work(rrpc->kgc_wq, &gcb->ws_gc);
508
509 nr_blocks_need--;
510 }
511 spin_unlock(&rlun->lock);
512
513 /* TODO: Hint that request queue can be started again */
514}
515
516static void rrpc_gc_queue(struct work_struct *work)
517{
518 struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
519 ws_gc);
520 struct rrpc *rrpc = gcb->rrpc;
521 struct rrpc_block *rblk = gcb->rblk;
522 struct rrpc_lun *rlun = rblk->rlun;
523
524 spin_lock(&rlun->lock);
525 list_add_tail(&rblk->prio, &rlun->prio_list);
526 spin_unlock(&rlun->lock);
527
528 mempool_free(gcb, rrpc->gcb_pool);
529 pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n",
530 rlun->bppa.g.ch, rlun->bppa.g.lun,
531 rblk->id);
532}
533
534static const struct block_device_operations rrpc_fops = {
535 .owner = THIS_MODULE,
536};
537
538static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc)
539{
540 unsigned int i;
541 struct rrpc_lun *rlun, *max_free;
542
543 if (!is_gc)
544 return get_next_lun(rrpc);
545
546 /* during GC, we don't care about RR, instead we want to make
547 * sure that we maintain evenness between the block luns.
548 */
549 max_free = &rrpc->luns[0];
550 /* prevent GC-ing lun from devouring pages of a lun with
551 * little free blocks. We don't take the lock as we only need an
552 * estimate.
553 */
554 rrpc_for_each_lun(rrpc, rlun, i) {
555 if (rlun->nr_free_blocks > max_free->nr_free_blocks)
556 max_free = rlun;
557 }
558
559 return max_free;
560}
561
562static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr,
563 struct rrpc_block *rblk, u64 paddr)
564{
565 struct rrpc_addr *gp;
566 struct rrpc_rev_addr *rev;
567
568 BUG_ON(laddr >= rrpc->nr_sects);
569
570 gp = &rrpc->trans_map[laddr];
571 spin_lock(&rrpc->rev_lock);
572 if (gp->rblk)
573 rrpc_page_invalidate(rrpc, gp);
574
575 gp->addr = paddr;
576 gp->rblk = rblk;
577
578 rev = &rrpc->rev_trans_map[gp->addr];
579 rev->addr = laddr;
580 spin_unlock(&rrpc->rev_lock);
581
582 return gp;
583}
584
585static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
586{
587 u64 addr = ADDR_EMPTY;
588
589 spin_lock(&rblk->lock);
590 if (block_is_full(rrpc, rblk))
591 goto out;
592
593 addr = rblk->next_page;
594
595 rblk->next_page++;
596out:
597 spin_unlock(&rblk->lock);
598 return addr;
599}
600
601/* Map logical address to a physical page. The mapping implements a round robin
602 * approach and allocates a page from the next lun available.
603 *
604 * Returns rrpc_addr with the physical address and block. Returns NULL if no
605 * blocks in the next rlun are available.
606 */
607static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
608 int is_gc)
609{
610 struct nvm_tgt_dev *tgt_dev = rrpc->dev;
611 struct rrpc_lun *rlun;
612 struct rrpc_block *rblk, **cur_rblk;
613 struct rrpc_addr *p;
614 struct ppa_addr ppa;
615 u64 paddr;
616 int gc_force = 0;
617
618 ppa.ppa = ADDR_EMPTY;
619 rlun = rrpc_get_lun_rr(rrpc, is_gc);
620
621 if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4)
622 return ppa;
623
624 /*
625 * page allocation steps:
626 * 1. Try to allocate new page from current rblk
627 * 2a. If succeed, proceed to map it in and return
628 * 2b. If fail, first try to allocate a new block from media manger,
629 * and then retry step 1. Retry until the normal block pool is
630 * exhausted.
631 * 3. If exhausted, and garbage collector is requesting the block,
632 * go to the reserved block and retry step 1.
633 * In the case that this fails as well, or it is not GC
634 * requesting, report not able to retrieve a block and let the
635 * caller handle further processing.
636 */
637
638 spin_lock(&rlun->lock);
639 cur_rblk = &rlun->cur;
640 rblk = rlun->cur;
641retry:
642 paddr = rrpc_alloc_addr(rrpc, rblk);
643
644 if (paddr != ADDR_EMPTY)
645 goto done;
646
647 if (!list_empty(&rlun->wblk_list)) {
648new_blk:
649 rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block,
650 prio);
651 rrpc_set_lun_cur(rlun, rblk, cur_rblk);
652 list_del(&rblk->prio);
653 goto retry;
654 }
655 spin_unlock(&rlun->lock);
656
657 rblk = rrpc_get_blk(rrpc, rlun, gc_force);
658 if (rblk) {
659 spin_lock(&rlun->lock);
660 list_add_tail(&rblk->prio, &rlun->wblk_list);
661 /*
662 * another thread might already have added a new block,
663 * Therefore, make sure that one is used, instead of the
664 * one just added.
665 */
666 goto new_blk;
667 }
668
669 if (unlikely(is_gc) && !gc_force) {
670 /* retry from emergency gc block */
671 cur_rblk = &rlun->gc_cur;
672 rblk = rlun->gc_cur;
673 gc_force = 1;
674 spin_lock(&rlun->lock);
675 goto retry;
676 }
677
678 pr_err("rrpc: failed to allocate new block\n");
679 return ppa;
680done:
681 spin_unlock(&rlun->lock);
682 p = rrpc_update_map(rrpc, laddr, rblk, paddr);
683 if (!p)
684 return ppa;
685
686 /* return global address */
687 return rrpc_ppa_to_gaddr(tgt_dev, p);
688}
689
690static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
691{
692 struct rrpc_block_gc *gcb;
693
694 gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
695 if (!gcb) {
696 pr_err("rrpc: unable to queue block for gc.");
697 return;
698 }
699
700 gcb->rrpc = rrpc;
701 gcb->rblk = rblk;
702
703 INIT_WORK(&gcb->ws_gc, rrpc_gc_queue);
704 queue_work(rrpc->kgc_wq, &gcb->ws_gc);
705}
706
707static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p)
708{
709 struct rrpc_lun *rlun = NULL;
710 int i;
711
712 for (i = 0; i < rrpc->nr_luns; i++) {
713 if (rrpc->luns[i].bppa.g.ch == p.g.ch &&
714 rrpc->luns[i].bppa.g.lun == p.g.lun) {
715 rlun = &rrpc->luns[i];
716 break;
717 }
718 }
719
720 return rlun;
721}
722
723static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa)
724{
725 struct nvm_tgt_dev *dev = rrpc->dev;
726 struct rrpc_lun *rlun;
727 struct rrpc_block *rblk;
728
729 rlun = rrpc_ppa_to_lun(rrpc, ppa);
730 rblk = &rlun->blocks[ppa.g.blk];
731 rblk->state = NVM_BLK_ST_BAD;
732
733 nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);
734}
735
736static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
737{
738 void *comp_bits = &rqd->ppa_status;
739 struct ppa_addr ppa, prev_ppa;
740 int nr_ppas = rqd->nr_ppas;
741 int bit;
742
743 if (rqd->nr_ppas == 1)
744 __rrpc_mark_bad_block(rrpc, rqd->ppa_addr);
745
746 ppa_set_empty(&prev_ppa);
747 bit = -1;
748 while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
749 ppa = rqd->ppa_list[bit];
750 if (ppa_cmp_blk(ppa, prev_ppa))
751 continue;
752
753 __rrpc_mark_bad_block(rrpc, ppa);
754 }
755}
756
757static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
758 sector_t laddr, uint8_t npages)
759{
760 struct nvm_tgt_dev *dev = rrpc->dev;
761 struct rrpc_addr *p;
762 struct rrpc_block *rblk;
763 int cmnt_size, i;
764
765 for (i = 0; i < npages; i++) {
766 p = &rrpc->trans_map[laddr + i];
767 rblk = p->rblk;
768
769 cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
770 if (unlikely(cmnt_size == dev->geo.sec_per_blk))
771 rrpc_run_gc(rrpc, rblk);
772 }
773}
774
775static void rrpc_end_io(struct nvm_rq *rqd)
776{
777 struct rrpc *rrpc = rqd->private;
778 struct nvm_tgt_dev *dev = rrpc->dev;
779 struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
780 uint8_t npages = rqd->nr_ppas;
781 sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
782
783 if (bio_data_dir(rqd->bio) == WRITE) {
784 if (rqd->error == NVM_RSP_ERR_FAILWRITE)
785 rrpc_mark_bad_block(rrpc, rqd);
786
787 rrpc_end_io_write(rrpc, rrqd, laddr, npages);
788 }
789
790 bio_put(rqd->bio);
791
792 if (rrqd->flags & NVM_IOTYPE_GC)
793 return;
794
795 rrpc_unlock_rq(rrpc, rqd);
796
797 if (npages > 1)
798 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
799
800 mempool_free(rqd, rrpc->rq_pool);
801}
802
803static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
804 struct nvm_rq *rqd, unsigned long flags, int npages)
805{
806 struct nvm_tgt_dev *dev = rrpc->dev;
807 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
808 struct rrpc_addr *gp;
809 sector_t laddr = rrpc_get_laddr(bio);
810 int is_gc = flags & NVM_IOTYPE_GC;
811 int i;
812
813 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
814 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
815 return NVM_IO_REQUEUE;
816 }
817
818 for (i = 0; i < npages; i++) {
819 /* We assume that mapping occurs at 4KB granularity */
820 BUG_ON(!(laddr + i < rrpc->nr_sects));
821 gp = &rrpc->trans_map[laddr + i];
822
823 if (gp->rblk) {
824 rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp);
825 } else {
826 BUG_ON(is_gc);
827 rrpc_unlock_laddr(rrpc, r);
828 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
829 rqd->dma_ppa_list);
830 return NVM_IO_DONE;
831 }
832 }
833
834 rqd->opcode = NVM_OP_HBREAD;
835
836 return NVM_IO_OK;
837}
838
839static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
840 unsigned long flags)
841{
842 int is_gc = flags & NVM_IOTYPE_GC;
843 sector_t laddr = rrpc_get_laddr(bio);
844 struct rrpc_addr *gp;
845
846 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
847 return NVM_IO_REQUEUE;
848
849 BUG_ON(!(laddr < rrpc->nr_sects));
850 gp = &rrpc->trans_map[laddr];
851
852 if (gp->rblk) {
853 rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp);
854 } else {
855 BUG_ON(is_gc);
856 rrpc_unlock_rq(rrpc, rqd);
857 return NVM_IO_DONE;
858 }
859
860 rqd->opcode = NVM_OP_HBREAD;
861
862 return NVM_IO_OK;
863}
864
865static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
866 struct nvm_rq *rqd, unsigned long flags, int npages)
867{
868 struct nvm_tgt_dev *dev = rrpc->dev;
869 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
870 struct ppa_addr p;
871 sector_t laddr = rrpc_get_laddr(bio);
872 int is_gc = flags & NVM_IOTYPE_GC;
873 int i;
874
875 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
876 nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
877 return NVM_IO_REQUEUE;
878 }
879
880 for (i = 0; i < npages; i++) {
881 /* We assume that mapping occurs at 4KB granularity */
882 p = rrpc_map_page(rrpc, laddr + i, is_gc);
883 if (p.ppa == ADDR_EMPTY) {
884 BUG_ON(is_gc);
885 rrpc_unlock_laddr(rrpc, r);
886 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
887 rqd->dma_ppa_list);
888 rrpc_gc_kick(rrpc);
889 return NVM_IO_REQUEUE;
890 }
891
892 rqd->ppa_list[i] = p;
893 }
894
895 rqd->opcode = NVM_OP_HBWRITE;
896
897 return NVM_IO_OK;
898}
899
900static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
901 struct nvm_rq *rqd, unsigned long flags)
902{
903 struct ppa_addr p;
904 int is_gc = flags & NVM_IOTYPE_GC;
905 sector_t laddr = rrpc_get_laddr(bio);
906
907 if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
908 return NVM_IO_REQUEUE;
909
910 p = rrpc_map_page(rrpc, laddr, is_gc);
911 if (p.ppa == ADDR_EMPTY) {
912 BUG_ON(is_gc);
913 rrpc_unlock_rq(rrpc, rqd);
914 rrpc_gc_kick(rrpc);
915 return NVM_IO_REQUEUE;
916 }
917
918 rqd->ppa_addr = p;
919 rqd->opcode = NVM_OP_HBWRITE;
920
921 return NVM_IO_OK;
922}
923
924static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
925 struct nvm_rq *rqd, unsigned long flags, uint8_t npages)
926{
927 struct nvm_tgt_dev *dev = rrpc->dev;
928
929 if (npages > 1) {
930 rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
931 &rqd->dma_ppa_list);
932 if (!rqd->ppa_list) {
933 pr_err("rrpc: not able to allocate ppa list\n");
934 return NVM_IO_ERR;
935 }
936
937 if (bio_op(bio) == REQ_OP_WRITE)
938 return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
939 npages);
940
941 return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
942 }
943
944 if (bio_op(bio) == REQ_OP_WRITE)
945 return rrpc_write_rq(rrpc, bio, rqd, flags);
946
947 return rrpc_read_rq(rrpc, bio, rqd, flags);
948}
949
950static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
951 struct nvm_rq *rqd, unsigned long flags)
952{
953 struct nvm_tgt_dev *dev = rrpc->dev;
954 struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd);
955 uint8_t nr_pages = rrpc_get_pages(bio);
956 int bio_size = bio_sectors(bio) << 9;
957 int err;
958
959 if (bio_size < dev->geo.sec_size)
960 return NVM_IO_ERR;
961 else if (bio_size > dev->geo.max_rq_size)
962 return NVM_IO_ERR;
963
964 err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages);
965 if (err)
966 return err;
967
968 bio_get(bio);
969 rqd->bio = bio;
970 rqd->private = rrpc;
971 rqd->nr_ppas = nr_pages;
972 rqd->end_io = rrpc_end_io;
973 rrq->flags = flags;
974
975 err = nvm_submit_io(dev, rqd);
976 if (err) {
977 pr_err("rrpc: I/O submission failed: %d\n", err);
978 bio_put(bio);
979 if (!(flags & NVM_IOTYPE_GC)) {
980 rrpc_unlock_rq(rrpc, rqd);
981 if (rqd->nr_ppas > 1)
982 nvm_dev_dma_free(dev->parent, rqd->ppa_list,
983 rqd->dma_ppa_list);
984 }
985 return NVM_IO_ERR;
986 }
987
988 return NVM_IO_OK;
989}
990
991static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
992{
993 struct rrpc *rrpc = q->queuedata;
994 struct nvm_rq *rqd;
995 int err;
996
997 blk_queue_split(q, &bio);
998
999 if (bio_op(bio) == REQ_OP_DISCARD) {
1000 rrpc_discard(rrpc, bio);
1001 return BLK_QC_T_NONE;
1002 }
1003
1004 rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
1005 memset(rqd, 0, sizeof(struct nvm_rq));
1006
1007 err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
1008 switch (err) {
1009 case NVM_IO_OK:
1010 return BLK_QC_T_NONE;
1011 case NVM_IO_ERR:
1012 bio_io_error(bio);
1013 break;
1014 case NVM_IO_DONE:
1015 bio_endio(bio);
1016 break;
1017 case NVM_IO_REQUEUE:
1018 spin_lock(&rrpc->bio_lock);
1019 bio_list_add(&rrpc->requeue_bios, bio);
1020 spin_unlock(&rrpc->bio_lock);
1021 queue_work(rrpc->kgc_wq, &rrpc->ws_requeue);
1022 break;
1023 }
1024
1025 mempool_free(rqd, rrpc->rq_pool);
1026 return BLK_QC_T_NONE;
1027}
1028
1029static void rrpc_requeue(struct work_struct *work)
1030{
1031 struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue);
1032 struct bio_list bios;
1033 struct bio *bio;
1034
1035 bio_list_init(&bios);
1036
1037 spin_lock(&rrpc->bio_lock);
1038 bio_list_merge(&bios, &rrpc->requeue_bios);
1039 bio_list_init(&rrpc->requeue_bios);
1040 spin_unlock(&rrpc->bio_lock);
1041
1042 while ((bio = bio_list_pop(&bios)))
1043 rrpc_make_rq(rrpc->disk->queue, bio);
1044}
1045
1046static void rrpc_gc_free(struct rrpc *rrpc)
1047{
1048 if (rrpc->krqd_wq)
1049 destroy_workqueue(rrpc->krqd_wq);
1050
1051 if (rrpc->kgc_wq)
1052 destroy_workqueue(rrpc->kgc_wq);
1053}
1054
1055static int rrpc_gc_init(struct rrpc *rrpc)
1056{
1057 rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND,
1058 rrpc->nr_luns);
1059 if (!rrpc->krqd_wq)
1060 return -ENOMEM;
1061
1062 rrpc->kgc_wq = alloc_workqueue("rrpc-bg", WQ_MEM_RECLAIM, 1);
1063 if (!rrpc->kgc_wq)
1064 return -ENOMEM;
1065
1066 timer_setup(&rrpc->gc_timer, rrpc_gc_timer, 0);
1067
1068 return 0;
1069}
1070
1071static void rrpc_map_free(struct rrpc *rrpc)
1072{
1073 vfree(rrpc->rev_trans_map);
1074 vfree(rrpc->trans_map);
1075}
1076
1077static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
1078{
1079 struct rrpc *rrpc = (struct rrpc *)private;
1080 struct nvm_tgt_dev *dev = rrpc->dev;
1081 struct rrpc_addr *addr = rrpc->trans_map + slba;
1082 struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
1083 struct rrpc_lun *rlun;
1084 struct rrpc_block *rblk;
1085 u64 i;
1086
1087 for (i = 0; i < nlb; i++) {
1088 struct ppa_addr gaddr;
1089 u64 pba = le64_to_cpu(entries[i]);
1090 unsigned int mod;
1091
1092 /* LNVM treats address-spaces as silos, LBA and PBA are
1093 * equally large and zero-indexed.
1094 */
1095 if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
1096 pr_err("nvm: L2P data entry is out of bounds!\n");
1097 pr_err("nvm: Maybe loaded an old target L2P\n");
1098 return -EINVAL;
1099 }
1100
1101 /* Address zero is a special one. The first page on a disk is
1102 * protected. As it often holds internal device boot
1103 * information.
1104 */
1105 if (!pba)
1106 continue;
1107
1108 div_u64_rem(pba, rrpc->nr_sects, &mod);
1109
1110 gaddr = rrpc_recov_addr(dev, pba);
1111 rlun = rrpc_ppa_to_lun(rrpc, gaddr);
1112 if (!rlun) {
1113 pr_err("rrpc: l2p corruption on lba %llu\n",
1114 slba + i);
1115 return -EINVAL;
1116 }
1117
1118 rblk = &rlun->blocks[gaddr.g.blk];
1119 if (!rblk->state) {
1120 /* at this point, we don't know anything about the
1121 * block. It's up to the FTL on top to re-etablish the
1122 * block state. The block is assumed to be open.
1123 */
1124 list_move_tail(&rblk->list, &rlun->used_list);
1125 rblk->state = NVM_BLK_ST_TGT;
1126 rlun->nr_free_blocks--;
1127 }
1128
1129 addr[i].addr = pba;
1130 addr[i].rblk = rblk;
1131 raddr[mod].addr = slba + i;
1132 }
1133
1134 return 0;
1135}
1136
1137static int rrpc_map_init(struct rrpc *rrpc)
1138{
1139 struct nvm_tgt_dev *dev = rrpc->dev;
1140 sector_t i;
1141 int ret;
1142
1143 rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects);
1144 if (!rrpc->trans_map)
1145 return -ENOMEM;
1146
1147 rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr)
1148 * rrpc->nr_sects);
1149 if (!rrpc->rev_trans_map)
1150 return -ENOMEM;
1151
1152 for (i = 0; i < rrpc->nr_sects; i++) {
1153 struct rrpc_addr *p = &rrpc->trans_map[i];
1154 struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i];
1155
1156 p->addr = ADDR_EMPTY;
1157 r->addr = ADDR_EMPTY;
1158 }
1159
1160 /* Bring up the mapping table from device */
1161 ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects,
1162 rrpc_l2p_update, rrpc);
1163 if (ret) {
1164 pr_err("nvm: rrpc: could not read L2P table.\n");
1165 return -EINVAL;
1166 }
1167
1168 return 0;
1169}
1170
1171/* Minimum pages needed within a lun */
1172#define PAGE_POOL_SIZE 16
1173#define ADDR_POOL_SIZE 64
1174
1175static int rrpc_core_init(struct rrpc *rrpc)
1176{
1177 down_write(&rrpc_lock);
1178 if (!rrpc_gcb_cache) {
1179 rrpc_gcb_cache = kmem_cache_create("rrpc_gcb",
1180 sizeof(struct rrpc_block_gc), 0, 0, NULL);
1181 if (!rrpc_gcb_cache) {
1182 up_write(&rrpc_lock);
1183 return -ENOMEM;
1184 }
1185
1186 rrpc_rq_cache = kmem_cache_create("rrpc_rq",
1187 sizeof(struct nvm_rq) + sizeof(struct rrpc_rq),
1188 0, 0, NULL);
1189 if (!rrpc_rq_cache) {
1190 kmem_cache_destroy(rrpc_gcb_cache);
1191 up_write(&rrpc_lock);
1192 return -ENOMEM;
1193 }
1194 }
1195 up_write(&rrpc_lock);
1196
1197 rrpc->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
1198 if (!rrpc->page_pool)
1199 return -ENOMEM;
1200
1201 rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns,
1202 rrpc_gcb_cache);
1203 if (!rrpc->gcb_pool)
1204 return -ENOMEM;
1205
1206 rrpc->rq_pool = mempool_create_slab_pool(64, rrpc_rq_cache);
1207 if (!rrpc->rq_pool)
1208 return -ENOMEM;
1209
1210 spin_lock_init(&rrpc->inflights.lock);
1211 INIT_LIST_HEAD(&rrpc->inflights.reqs);
1212
1213 return 0;
1214}
1215
1216static void rrpc_core_free(struct rrpc *rrpc)
1217{
1218 mempool_destroy(rrpc->page_pool);
1219 mempool_destroy(rrpc->gcb_pool);
1220 mempool_destroy(rrpc->rq_pool);
1221}
1222
1223static void rrpc_luns_free(struct rrpc *rrpc)
1224{
1225 struct rrpc_lun *rlun;
1226 int i;
1227
1228 if (!rrpc->luns)
1229 return;
1230
1231 for (i = 0; i < rrpc->nr_luns; i++) {
1232 rlun = &rrpc->luns[i];
1233 vfree(rlun->blocks);
1234 }
1235
1236 kfree(rrpc->luns);
1237}
1238
1239static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
1240{
1241 struct nvm_geo *geo = &dev->geo;
1242 struct rrpc_block *rblk;
1243 struct ppa_addr ppa;
1244 u8 *blks;
1245 int nr_blks;
1246 int i;
1247 int ret;
1248
1249 if (!dev->parent->ops->get_bb_tbl)
1250 return 0;
1251
1252 nr_blks = geo->blks_per_lun * geo->plane_mode;
1253 blks = kmalloc(nr_blks, GFP_KERNEL);
1254 if (!blks)
1255 return -ENOMEM;
1256
1257 ppa.ppa = 0;
1258 ppa.g.ch = rlun->bppa.g.ch;
1259 ppa.g.lun = rlun->bppa.g.lun;
1260
1261 ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
1262 if (ret) {
1263 pr_err("rrpc: could not get BB table\n");
1264 goto out;
1265 }
1266
1267 nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
1268 if (nr_blks < 0) {
1269 ret = nr_blks;
1270 goto out;
1271 }
1272
1273 for (i = 0; i < nr_blks; i++) {
1274 if (blks[i] == NVM_BLK_T_FREE)
1275 continue;
1276
1277 rblk = &rlun->blocks[i];
1278 list_move_tail(&rblk->list, &rlun->bb_list);
1279 rblk->state = NVM_BLK_ST_BAD;
1280 rlun->nr_free_blocks--;
1281 }
1282
1283out:
1284 kfree(blks);
1285 return ret;
1286}
1287
1288static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa)
1289{
1290 rlun->bppa.ppa = 0;
1291 rlun->bppa.g.ch = ppa.g.ch;
1292 rlun->bppa.g.lun = ppa.g.lun;
1293}
1294
1295static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
1296{
1297 struct nvm_tgt_dev *dev = rrpc->dev;
1298 struct nvm_geo *geo = &dev->geo;
1299 struct rrpc_lun *rlun;
1300 int i, j, ret = -EINVAL;
1301
1302 if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
1303 pr_err("rrpc: number of pages per block too high.");
1304 return -EINVAL;
1305 }
1306
1307 spin_lock_init(&rrpc->rev_lock);
1308
1309 rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun),
1310 GFP_KERNEL);
1311 if (!rrpc->luns)
1312 return -ENOMEM;
1313
1314 /* 1:1 mapping */
1315 for (i = 0; i < rrpc->nr_luns; i++) {
1316 rlun = &rrpc->luns[i];
1317 rlun->id = i;
1318 rrpc_set_lun_ppa(rlun, luns[i]);
1319 rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
1320 geo->blks_per_lun);
1321 if (!rlun->blocks) {
1322 ret = -ENOMEM;
1323 goto err;
1324 }
1325
1326 INIT_LIST_HEAD(&rlun->free_list);
1327 INIT_LIST_HEAD(&rlun->used_list);
1328 INIT_LIST_HEAD(&rlun->bb_list);
1329
1330 for (j = 0; j < geo->blks_per_lun; j++) {
1331 struct rrpc_block *rblk = &rlun->blocks[j];
1332
1333 rblk->id = j;
1334 rblk->rlun = rlun;
1335 rblk->state = NVM_BLK_T_FREE;
1336 INIT_LIST_HEAD(&rblk->prio);
1337 INIT_LIST_HEAD(&rblk->list);
1338 spin_lock_init(&rblk->lock);
1339
1340 list_add_tail(&rblk->list, &rlun->free_list);
1341 }
1342
1343 rlun->rrpc = rrpc;
1344 rlun->nr_free_blocks = geo->blks_per_lun;
1345 rlun->reserved_blocks = 2; /* for GC only */
1346
1347 INIT_LIST_HEAD(&rlun->prio_list);
1348 INIT_LIST_HEAD(&rlun->wblk_list);
1349
1350 INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
1351 spin_lock_init(&rlun->lock);
1352
1353 if (rrpc_bb_discovery(dev, rlun))
1354 goto err;
1355
1356 }
1357
1358 return 0;
1359err:
1360 return ret;
1361}
1362
1363/* returns 0 on success and stores the beginning address in *begin */
1364static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
1365{
1366 struct nvm_tgt_dev *dev = rrpc->dev;
1367 sector_t size = rrpc->nr_sects * dev->geo.sec_size;
1368 int ret;
1369
1370 size >>= 9;
1371
1372 ret = nvm_get_area(dev, begin, size);
1373 if (!ret)
1374 *begin >>= (ilog2(dev->geo.sec_size) - 9);
1375
1376 return ret;
1377}
1378
1379static void rrpc_area_free(struct rrpc *rrpc)
1380{
1381 struct nvm_tgt_dev *dev = rrpc->dev;
1382 sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
1383
1384 nvm_put_area(dev, begin);
1385}
1386
1387static void rrpc_free(struct rrpc *rrpc)
1388{
1389 rrpc_gc_free(rrpc);
1390 rrpc_map_free(rrpc);
1391 rrpc_core_free(rrpc);
1392 rrpc_luns_free(rrpc);
1393 rrpc_area_free(rrpc);
1394
1395 kfree(rrpc);
1396}
1397
1398static void rrpc_exit(void *private)
1399{
1400 struct rrpc *rrpc = private;
1401
1402 del_timer(&rrpc->gc_timer);
1403
1404 flush_workqueue(rrpc->krqd_wq);
1405 flush_workqueue(rrpc->kgc_wq);
1406
1407 rrpc_free(rrpc);
1408}
1409
1410static sector_t rrpc_capacity(void *private)
1411{
1412 struct rrpc *rrpc = private;
1413 struct nvm_tgt_dev *dev = rrpc->dev;
1414 sector_t reserved, provisioned;
1415
1416 /* cur, gc, and two emergency blocks for each lun */
1417 reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4;
1418 provisioned = rrpc->nr_sects - reserved;
1419
1420 if (reserved > rrpc->nr_sects) {
1421 pr_err("rrpc: not enough space available to expose storage.\n");
1422 return 0;
1423 }
1424
1425 sector_div(provisioned, 10);
1426 return provisioned * 9 * NR_PHY_IN_LOG;
1427}
1428
1429/*
1430 * Looks up the logical address from reverse trans map and check if its valid by
1431 * comparing the logical to physical address with the physical address.
1432 * Returns 0 on free, otherwise 1 if in use
1433 */
1434static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
1435{
1436 struct nvm_tgt_dev *dev = rrpc->dev;
1437 int offset;
1438 struct rrpc_addr *laddr;
1439 u64 bpaddr, paddr, pladdr;
1440
1441 bpaddr = block_to_rel_addr(rrpc, rblk);
1442 for (offset = 0; offset < dev->geo.sec_per_blk; offset++) {
1443 paddr = bpaddr + offset;
1444
1445 pladdr = rrpc->rev_trans_map[paddr].addr;
1446 if (pladdr == ADDR_EMPTY)
1447 continue;
1448
1449 laddr = &rrpc->trans_map[pladdr];
1450
1451 if (paddr == laddr->addr) {
1452 laddr->rblk = rblk;
1453 } else {
1454 set_bit(offset, rblk->invalid_pages);
1455 rblk->nr_invalid_pages++;
1456 }
1457 }
1458}
1459
1460static int rrpc_blocks_init(struct rrpc *rrpc)
1461{
1462 struct nvm_tgt_dev *dev = rrpc->dev;
1463 struct rrpc_lun *rlun;
1464 struct rrpc_block *rblk;
1465 int lun_iter, blk_iter;
1466
1467 for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) {
1468 rlun = &rrpc->luns[lun_iter];
1469
1470 for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun;
1471 blk_iter++) {
1472 rblk = &rlun->blocks[blk_iter];
1473 rrpc_block_map_update(rrpc, rblk);
1474 }
1475 }
1476
1477 return 0;
1478}
1479
1480static int rrpc_luns_configure(struct rrpc *rrpc)
1481{
1482 struct rrpc_lun *rlun;
1483 struct rrpc_block *rblk;
1484 int i;
1485
1486 for (i = 0; i < rrpc->nr_luns; i++) {
1487 rlun = &rrpc->luns[i];
1488
1489 rblk = rrpc_get_blk(rrpc, rlun, 0);
1490 if (!rblk)
1491 goto err;
1492 rrpc_set_lun_cur(rlun, rblk, &rlun->cur);
1493
1494 /* Emergency gc block */
1495 rblk = rrpc_get_blk(rrpc, rlun, 1);
1496 if (!rblk)
1497 goto err;
1498 rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur);
1499 }
1500
1501 return 0;
1502err:
1503 rrpc_put_blks(rrpc);
1504 return -EINVAL;
1505}
1506
1507static struct nvm_tgt_type tt_rrpc;
1508
1509static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1510 int flags)
1511{
1512 struct request_queue *bqueue = dev->q;
1513 struct request_queue *tqueue = tdisk->queue;
1514 struct nvm_geo *geo = &dev->geo;
1515 struct rrpc *rrpc;
1516 sector_t soffset;
1517 int ret;
1518
1519 if (!(dev->identity.dom & NVM_RSP_L2P)) {
1520 pr_err("nvm: rrpc: device does not support l2p (%x)\n",
1521 dev->identity.dom);
1522 return ERR_PTR(-EINVAL);
1523 }
1524
1525 rrpc = kzalloc(sizeof(struct rrpc), GFP_KERNEL);
1526 if (!rrpc)
1527 return ERR_PTR(-ENOMEM);
1528
1529 rrpc->dev = dev;
1530 rrpc->disk = tdisk;
1531
1532 bio_list_init(&rrpc->requeue_bios);
1533 spin_lock_init(&rrpc->bio_lock);
1534 INIT_WORK(&rrpc->ws_requeue, rrpc_requeue);
1535
1536 rrpc->nr_luns = geo->nr_luns;
1537 rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns;
1538
1539 /* simple round-robin strategy */
1540 atomic_set(&rrpc->next_lun, -1);
1541
1542 ret = rrpc_area_init(rrpc, &soffset);
1543 if (ret < 0) {
1544 pr_err("nvm: rrpc: could not initialize area\n");
1545 return ERR_PTR(ret);
1546 }
1547 rrpc->soffset = soffset;
1548
1549 ret = rrpc_luns_init(rrpc, dev->luns);
1550 if (ret) {
1551 pr_err("nvm: rrpc: could not initialize luns\n");
1552 goto err;
1553 }
1554
1555 ret = rrpc_core_init(rrpc);
1556 if (ret) {
1557 pr_err("nvm: rrpc: could not initialize core\n");
1558 goto err;
1559 }
1560
1561 ret = rrpc_map_init(rrpc);
1562 if (ret) {
1563 pr_err("nvm: rrpc: could not initialize maps\n");
1564 goto err;
1565 }
1566
1567 ret = rrpc_blocks_init(rrpc);
1568 if (ret) {
1569 pr_err("nvm: rrpc: could not initialize state for blocks\n");
1570 goto err;
1571 }
1572
1573 ret = rrpc_luns_configure(rrpc);
1574 if (ret) {
1575 pr_err("nvm: rrpc: not enough blocks available in LUNs.\n");
1576 goto err;
1577 }
1578
1579 ret = rrpc_gc_init(rrpc);
1580 if (ret) {
1581 pr_err("nvm: rrpc: could not initialize gc\n");
1582 goto err;
1583 }
1584
1585 /* inherit the size from the underlying device */
1586 blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
1587 blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
1588
1589 pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n",
1590 rrpc->nr_luns, (unsigned long long)rrpc->nr_sects);
1591
1592 mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
1593
1594 return rrpc;
1595err:
1596 rrpc_free(rrpc);
1597 return ERR_PTR(ret);
1598}
1599
1600/* round robin, page-based FTL, and cost-based GC */
1601static struct nvm_tgt_type tt_rrpc = {
1602 .name = "rrpc",
1603 .version = {1, 0, 0},
1604
1605 .make_rq = rrpc_make_rq,
1606 .capacity = rrpc_capacity,
1607
1608 .init = rrpc_init,
1609 .exit = rrpc_exit,
1610};
1611
1612static int __init rrpc_module_init(void)
1613{
1614 return nvm_register_tgt_type(&tt_rrpc);
1615}
1616
1617static void rrpc_module_exit(void)
1618{
1619 nvm_unregister_tgt_type(&tt_rrpc);
1620}
1621
1622module_init(rrpc_module_init);
1623module_exit(rrpc_module_exit);
1624MODULE_LICENSE("GPL v2");
1625MODULE_DESCRIPTION("Block-Device Target for Open-Channel SSDs");
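
The rrpc target being removed above kept a forward (logical-to-physical) and a reverse (physical-to-logical) table in lockstep: rrpc_update_map() pointed an LBA at its new physical page, and rrpc_page_invalidate() cleared the stale reverse entry so GC could treat the old page as garbage. Here is a toy user-space model of just that bookkeeping, with arbitrary sizes and an assumed ADDR_EMPTY sentinel; the locking, block state and bitmap handling are deliberately omitted.

#include <stdint.h>
#include <stdio.h>

#define NR_SECTS   16
#define ADDR_EMPTY UINT64_MAX

static uint64_t trans_map[NR_SECTS];     /* logical -> physical */
static uint64_t rev_trans_map[NR_SECTS]; /* physical -> logical */

static void map_init(void)
{
	for (int i = 0; i < NR_SECTS; i++)
		trans_map[i] = rev_trans_map[i] = ADDR_EMPTY;
}

/* Point lba at a new physical address and drop the stale reverse entry,
 * so a later GC pass sees the old physical page as invalid.
 */
static void map_update(uint64_t lba, uint64_t new_paddr)
{
	uint64_t old_paddr = trans_map[lba];

	if (old_paddr != ADDR_EMPTY)
		rev_trans_map[old_paddr] = ADDR_EMPTY; /* invalidate old page */

	trans_map[lba] = new_paddr;
	rev_trans_map[new_paddr] = lba;
}

int main(void)
{
	map_init();
	map_update(3, 7);   /* first write of lba 3 */
	map_update(3, 12);  /* rewrite: paddr 7 becomes garbage */

	printf("lba 3 -> paddr %llu, paddr 7 live: %s\n",
	       (unsigned long long)trans_map[3],
	       rev_trans_map[7] == ADDR_EMPTY ? "no" : "yes");
	return 0;
}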
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
deleted file mode 100644
index fdb6ff902903..000000000000
--- a/drivers/lightnvm/rrpc.h
+++ /dev/null
@@ -1,290 +0,0 @@
1/*
2 * Copyright (C) 2015 IT University of Copenhagen
3 * Initial release: Matias Bjorling <m@bjorling.me>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
15 */
16
17#ifndef RRPC_H_
18#define RRPC_H_
19
20#include <linux/blkdev.h>
21#include <linux/blk-mq.h>
22#include <linux/bio.h>
23#include <linux/module.h>
24#include <linux/kthread.h>
25#include <linux/vmalloc.h>
26
27#include <linux/lightnvm.h>
28
29/* Run only GC if less than 1/X blocks are free */
30#define GC_LIMIT_INVERSE 10
31#define GC_TIME_SECS 100
32
33#define RRPC_SECTOR (512)
34#define RRPC_EXPOSED_PAGE_SIZE (4096)
35
36#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR)
37
38struct rrpc_inflight {
39 struct list_head reqs;
40 spinlock_t lock;
41};
42
43struct rrpc_inflight_rq {
44 struct list_head list;
45 sector_t l_start;
46 sector_t l_end;
47};
48
49struct rrpc_rq {
50 struct rrpc_inflight_rq inflight_rq;
51 unsigned long flags;
52};
53
54struct rrpc_block {
55 int id; /* id inside of LUN */
56 struct rrpc_lun *rlun;
57
58 struct list_head prio; /* LUN CG list */
59 struct list_head list; /* LUN free, used, bb list */
60
61#define MAX_INVALID_PAGES_STORAGE 8
62 /* Bitmap for invalid page intries */
63 unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
64 /* points to the next writable page within a block */
65 unsigned int next_page;
66 /* number of pages that are invalid, wrt host page size */
67 unsigned int nr_invalid_pages;
68
69 int state;
70
71 spinlock_t lock;
72 atomic_t data_cmnt_size; /* data pages committed to stable storage */
73};
74
75struct rrpc_lun {
76 struct rrpc *rrpc;
77
78 int id;
79 struct ppa_addr bppa;
80
81 struct rrpc_block *cur, *gc_cur;
82 struct rrpc_block *blocks; /* Reference to block allocation */
83
84 struct list_head prio_list; /* Blocks that may be GC'ed */
85 struct list_head wblk_list; /* Queued blocks to be written to */
86
87 /* lun block lists */
88 struct list_head used_list; /* In-use blocks */
89 struct list_head free_list; /* Not used blocks i.e. released
90 * and ready for use
91 */
92 struct list_head bb_list; /* Bad blocks. Mutually exclusive with
93 * free_list and used_list
94 */
95 unsigned int nr_free_blocks; /* Number of unused blocks */
96
97 struct work_struct ws_gc;
98
99 int reserved_blocks;
100
101 spinlock_t lock;
102};
103
104struct rrpc {
105 struct nvm_tgt_dev *dev;
106 struct gendisk *disk;
107
108 sector_t soffset; /* logical sector offset */
109
110 int nr_luns;
111 struct rrpc_lun *luns;
112
113 /* calculated values */
114 unsigned long long nr_sects;
115
116 /* Write strategy variables. Move these into each for structure for each
117 * strategy
118 */
119 atomic_t next_lun; /* Whenever a page is written, this is updated
120 * to point to the next write lun
121 */
122
123 spinlock_t bio_lock;
124 struct bio_list requeue_bios;
125 struct work_struct ws_requeue;
126
127 /* Simple translation map of logical addresses to physical addresses.
128 * The logical addresses is known by the host system, while the physical
129 * addresses are used when writing to the disk block device.
130 */
131 struct rrpc_addr *trans_map;
132 /* also store a reverse map for garbage collection */
133 struct rrpc_rev_addr *rev_trans_map;
134 spinlock_t rev_lock;
135
136 struct rrpc_inflight inflights;
137
138 mempool_t *addr_pool;
139 mempool_t *page_pool;
140 mempool_t *gcb_pool;
141 mempool_t *rq_pool;
142
143 struct timer_list gc_timer;
144 struct workqueue_struct *krqd_wq;
145 struct workqueue_struct *kgc_wq;
146};
147
148struct rrpc_block_gc {
149 struct rrpc *rrpc;
150 struct rrpc_block *rblk;
151 struct work_struct ws_gc;
152};
153
154/* Logical to physical mapping */
155struct rrpc_addr {
156 u64 addr;
157 struct rrpc_block *rblk;
158};
159
160/* Physical to logical mapping */
161struct rrpc_rev_addr {
162 u64 addr;
163};
164
165static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
166 struct ppa_addr r)
167{
168 struct ppa_addr l;
169 int secs, pgs;
170 sector_t ppa = r.ppa;
171
172 l.ppa = 0;
173
174 div_u64_rem(ppa, geo->sec_per_pg, &secs);
175 l.g.sec = secs;
176
177 sector_div(ppa, geo->sec_per_pg);
178 div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
179 l.g.pg = pgs;
180
181 return l;
182}
183
184static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
185{
186 return linear_to_generic_addr(&dev->geo, pba);
187}
188
189static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
190{
191 struct nvm_tgt_dev *dev = rrpc->dev;
192 struct nvm_geo *geo = &dev->geo;
193 struct rrpc_lun *rlun = rblk->rlun;
194
195 return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
196}
197
198static inline sector_t rrpc_get_laddr(struct bio *bio)
199{
200 return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
201}
202
203static inline unsigned int rrpc_get_pages(struct bio *bio)
204{
205 return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
206}
207
208static inline sector_t rrpc_get_sector(sector_t laddr)
209{
210 return laddr * NR_PHY_IN_LOG;
211}
212
213static inline int request_intersects(struct rrpc_inflight_rq *r,
214 sector_t laddr_start, sector_t laddr_end)
215{
216 return (laddr_end >= r->l_start) && (laddr_start <= r->l_end);
217}
218
219static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
220 unsigned int pages, struct rrpc_inflight_rq *r)
221{
222 sector_t laddr_end = laddr + pages - 1;
223 struct rrpc_inflight_rq *rtmp;
224
225 WARN_ON(irqs_disabled());
226
227 spin_lock_irq(&rrpc->inflights.lock);
228 list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) {
229 if (unlikely(request_intersects(rtmp, laddr, laddr_end))) {
230 /* existing, overlapping request, come back later */
231 spin_unlock_irq(&rrpc->inflights.lock);
232 return 1;
233 }
234 }
235
236 r->l_start = laddr;
237 r->l_end = laddr_end;
238
239 list_add_tail(&r->list, &rrpc->inflights.reqs);
240 spin_unlock_irq(&rrpc->inflights.lock);
241 return 0;
242}
243
244static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
245 unsigned int pages,
246 struct rrpc_inflight_rq *r)
247{
248 BUG_ON((laddr + pages) > rrpc->nr_sects);
249
250 return __rrpc_lock_laddr(rrpc, laddr, pages, r);
251}
252
253static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd)
254{
255 struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
256
257 return &rrqd->inflight_rq;
258}
259
260static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
261 struct nvm_rq *rqd)
262{
263 sector_t laddr = rrpc_get_laddr(bio);
264 unsigned int pages = rrpc_get_pages(bio);
265 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
266
267 return rrpc_lock_laddr(rrpc, laddr, pages, r);
268}
269
270static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
271 struct rrpc_inflight_rq *r)
272{
273 unsigned long flags;
274
275 spin_lock_irqsave(&rrpc->inflights.lock, flags);
276 list_del_init(&r->list);
277 spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
278}
279
280static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
281{
282 struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
283 uint8_t pages = rqd->nr_ppas;
284
285 BUG_ON((r->l_start + pages) > rrpc->nr_sects);
286
287 rrpc_unlock_laddr(rrpc, r);
288}
289
290#endif /* RRPC_H_ */
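
The inflight-request tracking in the removed rrpc.h reduces to an inclusive interval-overlap test: a new request has to back off whenever its LBA range intersects one already in flight. A minimal user-space version of just that predicate follows (the inflight list and spinlock are left out; the struct is a simplified stand-in for rrpc_inflight_rq).

#include <assert.h>
#include <stdint.h>

struct inflight_rq {
	uint64_t l_start;
	uint64_t l_end;	/* inclusive, as in rrpc_inflight_rq */
};

/* Two closed ranges overlap iff neither lies entirely before the other. */
static int request_intersects(const struct inflight_rq *r,
			      uint64_t laddr_start, uint64_t laddr_end)
{
	return laddr_end >= r->l_start && laddr_start <= r->l_end;
}

int main(void)
{
	struct inflight_rq r = { .l_start = 100, .l_end = 107 }; /* 8 pages */

	assert(request_intersects(&r, 104, 111));  /* tail overlap -> must wait */
	assert(!request_intersects(&r, 108, 115)); /* adjacent, no overlap */
	return 0;
}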
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc6d884..6cc6c0f9c3a9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -525,15 +525,21 @@ struct open_bucket {
525 525
526/* 526/*
527 * We keep multiple buckets open for writes, and try to segregate different 527 * We keep multiple buckets open for writes, and try to segregate different
528 * write streams for better cache utilization: first we look for a bucket where 528 * write streams for better cache utilization: first we try to segregate flash
529 * the last write to it was sequential with the current write, and failing that 529 * only volume write streams from cached devices, secondly we look for a bucket
530 * we look for a bucket that was last used by the same task. 530 * where the last write to it was sequential with the current write, and
531 * failing that we look for a bucket that was last used by the same task.
531 * 532 *
532 * The ideas is if you've got multiple tasks pulling data into the cache at the 533 * The ideas is if you've got multiple tasks pulling data into the cache at the
533 * same time, you'll get better cache utilization if you try to segregate their 534 * same time, you'll get better cache utilization if you try to segregate their
534 * data and preserve locality. 535 * data and preserve locality.
535 * 536 *
536 * For example, say you've starting Firefox at the same time you're copying a 537 * For example, dirty sectors of flash only volume is not reclaimable, if their
538 * dirty sectors mixed with dirty sectors of cached device, such buckets will
539 * be marked as dirty and won't be reclaimed, though the dirty data of cached
540 * device have been written back to backend device.
541 *
542 * And say you've starting Firefox at the same time you're copying a
537 * bunch of files. Firefox will likely end up being fairly hot and stay in the 543 * bunch of files. Firefox will likely end up being fairly hot and stay in the
538 * cache awhile, but the data you copied might not be; if you wrote all that 544 * cache awhile, but the data you copied might not be; if you wrote all that
539 * data to the same buckets it'd get invalidated at the same time. 545 * data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
550 struct open_bucket *ret, *ret_task = NULL; 556 struct open_bucket *ret, *ret_task = NULL;
551 557
552 list_for_each_entry_reverse(ret, &c->data_buckets, list) 558 list_for_each_entry_reverse(ret, &c->data_buckets, list)
553 if (!bkey_cmp(&ret->key, search)) 559 if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
560 UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
561 continue;
562 else if (!bkey_cmp(&ret->key, search))
554 goto found; 563 goto found;
555 else if (ret->last_write_point == write_point) 564 else if (ret->last_write_point == write_point)
556 ret_task = ret; 565 ret_task = ret;
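The new test in pick_data_bucket() above skips any open bucket whose owning device's flash-only flag differs from that of the incoming key, so flash-only volume writes never share a bucket with cached-device writes. Below is a minimal userspace sketch of the resulting selection order; struct open_bucket and pick_bucket() here are simplified stand-ins, not the kernel's bkey/UUID machinery.

#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-in for the kernel's open_bucket / bkey machinery. */
struct open_bucket {
	bool flash_only;              /* owner is a flash-only volume            */
	unsigned long last_write_point;
	unsigned long key;            /* last written offset, stands in for bkey */
};

/*
 * Walk candidate buckets in MRU order: never mix flash-only and
 * cached-device streams, prefer a bucket whose last write is contiguous
 * with this one, and fall back to a bucket last used by the same task.
 */
static struct open_bucket *pick_bucket(struct open_bucket *b, size_t n,
				       bool flash_only, unsigned long key,
				       unsigned long write_point)
{
	struct open_bucket *same_task = NULL;

	for (size_t i = 0; i < n; i++) {
		if (b[i].flash_only != flash_only)
			continue;                /* the new segregation rule    */
		if (b[i].key == key)
			return &b[i];            /* sequential with last write  */
		if (b[i].last_write_point == write_point)
			same_task = &b[i];
	}
	return same_task;                        /* may be NULL: allocate fresh */
}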
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..5e2d4e80198e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,14 +320,15 @@ struct cached_dev {
320 */ 320 */
321 atomic_t has_dirty; 321 atomic_t has_dirty;
322 322
323 struct bch_ratelimit writeback_rate;
324 struct delayed_work writeback_rate_update;
325
326 /* 323 /*
327 * Internal to the writeback code, so read_dirty() can keep track of 324 * Set to zero by things that touch the backing volume-- except
328 * where it's at. 325 * writeback. Incremented by writeback. Used to determine when to
326 * accelerate idle writeback.
329 */ 327 */
330 sector_t last_read; 328 atomic_t backing_idle;
329
330 struct bch_ratelimit writeback_rate;
331 struct delayed_work writeback_rate_update;
331 332
332 /* Limit number of writeback bios in flight */ 333 /* Limit number of writeback bios in flight */
333 struct semaphore in_flight; 334 struct semaphore in_flight;
@@ -336,6 +337,14 @@ struct cached_dev {
336 337
337 struct keybuf writeback_keys; 338 struct keybuf writeback_keys;
338 339
340 /*
341 * Order the write-half of writeback operations strongly in dispatch
342 * order. (Maintain LBA order; don't allow reads completing out of
343 * order to re-order the writes...)
344 */
345 struct closure_waitlist writeback_ordering_wait;
346 atomic_t writeback_sequence_next;
347
339 /* For tracking sequential IO */ 348 /* For tracking sequential IO */
340#define RECENT_IO_BITS 7 349#define RECENT_IO_BITS 7
341#define RECENT_IO (1 << RECENT_IO_BITS) 350#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
488 int caches_loaded; 497 int caches_loaded;
489 498
490 struct bcache_device **devices; 499 struct bcache_device **devices;
500 unsigned devices_max_used;
491 struct list_head cached_devs; 501 struct list_head cached_devs;
492 uint64_t cached_dev_sectors; 502 uint64_t cached_dev_sectors;
493 struct closure caching; 503 struct closure caching;
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
852 862
853/* Forward declarations */ 863/* Forward declarations */
854 864
855void bch_count_io_errors(struct cache *, blk_status_t, const char *); 865void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
856void bch_bbio_count_io_errors(struct cache_set *, struct bio *, 866void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
857 blk_status_t, const char *); 867 blk_status_t, const char *);
858void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, 868void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
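The new backing_idle counter in struct cached_dev is cleared by anything that sends a request to the backing device and bumped once per writeback pass, which lets writeback notice an idle backing disk and speed up. A rough sketch of that pattern with C11 atomics; the function names are illustrative, and the threshold of 3 is taken from the writeback.c changes later in this series.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int backing_idle;

/* Called on every foreground request to the backing device. */
static void note_foreground_io(void)
{
	atomic_store(&backing_idle, 0);
}

/*
 * Called once per writeback pass; after several passes with no foreground
 * I/O in between, the caller may drop its delay and write back at full speed.
 */
static bool backing_device_idle(void)
{
	return atomic_fetch_add(&backing_idle, 1) + 1 >= 3;
}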
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3dbe5e..bf3a48aa9a9a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i)); 420 bset_sector_offset(&b->keys, i));
421 421
422 if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 422 if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
423 int j; 423 int j;
424 struct bio_vec *bv; 424 struct bio_vec *bv;
425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
432 432
433 continue_at(cl, btree_node_write_done, NULL); 433 continue_at(cl, btree_node_write_done, NULL);
434 } else { 434 } else {
435 /* No problem for multipage bvec since the bio is just allocated */
435 b->bio->bi_vcnt = 0; 436 b->bio->bi_vcnt = 0;
436 bch_bio_map(b->bio, i); 437 bch_bio_map(b->bio, i);
437 438
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
1678 1679
1679 /* don't reclaim buckets to which writeback keys point */ 1680 /* don't reclaim buckets to which writeback keys point */
1680 rcu_read_lock(); 1681 rcu_read_lock();
1681 for (i = 0; i < c->nr_uuids; i++) { 1682 for (i = 0; i < c->devices_max_used; i++) {
1682 struct bcache_device *d = c->devices[i]; 1683 struct bcache_device *d = c->devices[i];
1683 struct cached_dev *dc; 1684 struct cached_dev *dc;
1684 struct keybuf_key *w, *n; 1685 struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
1803int bch_gc_thread_start(struct cache_set *c) 1804int bch_gc_thread_start(struct cache_set *c)
1804{ 1805{
1805 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); 1806 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1806 if (IS_ERR(c->gc_thread)) 1807 return PTR_ERR_OR_ZERO(c->gc_thread);
1807 return PTR_ERR(c->gc_thread);
1808
1809 return 0;
1810} 1808}
1811 1809
1812/* Initial partial gc */ 1810/* Initial partial gc */
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 1841d0359bac..7f12920c14f7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -8,6 +8,7 @@
8#include <linux/debugfs.h> 8#include <linux/debugfs.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/sched/debug.h>
11 12
12#include "closure.h" 13#include "closure.h"
13 14
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
18 BUG_ON(flags & CLOSURE_GUARD_MASK); 19 BUG_ON(flags & CLOSURE_GUARD_MASK);
19 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); 20 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
20 21
21 /* Must deliver precisely one wakeup */
22 if (r == 1 && (flags & CLOSURE_SLEEPING))
23 wake_up_process(cl->task);
24
25 if (!r) { 22 if (!r) {
26 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { 23 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
27 atomic_set(&cl->remaining, 24 atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
100} 97}
101EXPORT_SYMBOL(closure_wait); 98EXPORT_SYMBOL(closure_wait);
102 99
103/** 100struct closure_syncer {
104 * closure_sync - sleep until a closure has nothing left to wait on 101 struct task_struct *task;
105 * 102 int done;
106 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 103};
107 * the last refcount. 104
108 */ 105static void closure_sync_fn(struct closure *cl)
109void closure_sync(struct closure *cl)
110{ 106{
111 while (1) { 107 cl->s->done = 1;
112 __closure_start_sleep(cl); 108 wake_up_process(cl->s->task);
113 closure_set_ret_ip(cl); 109}
114 110
115 if ((atomic_read(&cl->remaining) & 111void __sched __closure_sync(struct closure *cl)
116 CLOSURE_REMAINING_MASK) == 1) 112{
117 break; 113 struct closure_syncer s = { .task = current };
118 114
115 cl->s = &s;
116 continue_at(cl, closure_sync_fn, NULL);
117
118 while (1) {
119 set_current_state(TASK_UNINTERRUPTIBLE);
120 if (s.done)
121 break;
119 schedule(); 122 schedule();
120 } 123 }
121 124
122 __closure_end_sleep(cl); 125 __set_current_state(TASK_RUNNING);
123} 126}
124EXPORT_SYMBOL(closure_sync); 127EXPORT_SYMBOL(__closure_sync);
125 128
126#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 129#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
127 130
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
168 cl, (void *) cl->ip, cl->fn, cl->parent, 171 cl, (void *) cl->ip, cl->fn, cl->parent,
169 r & CLOSURE_REMAINING_MASK); 172 r & CLOSURE_REMAINING_MASK);
170 173
171 seq_printf(f, "%s%s%s%s\n", 174 seq_printf(f, "%s%s\n",
172 test_bit(WORK_STRUCT_PENDING_BIT, 175 test_bit(WORK_STRUCT_PENDING_BIT,
173 work_data_bits(&cl->work)) ? "Q" : "", 176 work_data_bits(&cl->work)) ? "Q" : "",
174 r & CLOSURE_RUNNING ? "R" : "", 177 r & CLOSURE_RUNNING ? "R" : "");
175 r & CLOSURE_STACK ? "S" : "",
176 r & CLOSURE_SLEEPING ? "Sl" : "");
177 178
178 if (r & CLOSURE_WAITING) 179 if (r & CLOSURE_WAITING)
179 seq_printf(f, " W %pF\n", 180 seq_printf(f, " W %pF\n",
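The rewritten closure_sync() drops the CLOSURE_SLEEPING flag: the waiter parks an on-stack closure_syncer holding its task and a done flag, the final reference drop runs closure_sync_fn() which sets done and wakes the task, and the waiter loops until done. A userspace analogue of the same handshake, sketched with pthreads rather than the kernel's scheduler primitives:

#include <pthread.h>
#include <stdbool.h>

/* Userspace stand-in for struct closure_syncer: a done flag plus a way to
 * wake the single waiting thread. */
struct syncer {
	pthread_mutex_t lock;
	pthread_cond_t  wake;
	bool done;
};

#define SYNCER_INIT { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false }

/* Runs in whatever context drops the last reference (closure_sync_fn's role). */
static void syncer_finish(struct syncer *s)
{
	pthread_mutex_lock(&s->lock);
	s->done = true;
	pthread_cond_signal(&s->wake);        /* wake_up_process() analogue */
	pthread_mutex_unlock(&s->lock);
}

/* Runs in the thread that called closure_sync(): block until finished. */
static void syncer_wait(struct syncer *s)
{
	pthread_mutex_lock(&s->lock);
	while (!s->done)                      /* the TASK_UNINTERRUPTIBLE loop */
		pthread_cond_wait(&s->wake, &s->lock);
	pthread_mutex_unlock(&s->lock);
}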
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ccfbea6f9f6b..3b9dfc9962ad 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -103,6 +103,7 @@
103 */ 103 */
104 104
105struct closure; 105struct closure;
106struct closure_syncer;
106typedef void (closure_fn) (struct closure *); 107typedef void (closure_fn) (struct closure *);
107 108
108struct closure_waitlist { 109struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
115 * the thread that owns the closure, and cleared by the thread that's 116 * the thread that owns the closure, and cleared by the thread that's
116 * waking up the closure. 117 * waking up the closure.
117 * 118 *
118 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
119 * - indicates that cl->task is valid and closure_put() may wake it up.
120 * Only set or cleared by the thread that owns the closure.
121 *
122 * The rest are for debugging and don't affect behaviour: 119 * The rest are for debugging and don't affect behaviour:
123 * 120 *
124 * CLOSURE_RUNNING: Set when a closure is running (i.e. by 121 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
128 * continue_at() and closure_return() clear it for you, if you're doing 125 * continue_at() and closure_return() clear it for you, if you're doing
129 * something unusual you can use closure_set_dead() which also helps 126 * something unusual you can use closure_set_dead() which also helps
130 * annotate where references are being transferred. 127 * annotate where references are being transferred.
131 *
132 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
133 * closure with this flag set
134 */ 128 */
135 129
136 CLOSURE_BITS_START = (1 << 23), 130 CLOSURE_BITS_START = (1U << 26),
137 CLOSURE_DESTRUCTOR = (1 << 23), 131 CLOSURE_DESTRUCTOR = (1U << 26),
138 CLOSURE_WAITING = (1 << 25), 132 CLOSURE_WAITING = (1U << 28),
139 CLOSURE_SLEEPING = (1 << 27), 133 CLOSURE_RUNNING = (1U << 30),
140 CLOSURE_RUNNING = (1 << 29),
141 CLOSURE_STACK = (1 << 31),
142}; 134};
143 135
144#define CLOSURE_GUARD_MASK \ 136#define CLOSURE_GUARD_MASK \
145 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ 137 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
146 CLOSURE_RUNNING|CLOSURE_STACK) << 1)
147 138
148#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) 139#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
149#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) 140#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
152 union { 143 union {
153 struct { 144 struct {
154 struct workqueue_struct *wq; 145 struct workqueue_struct *wq;
155 struct task_struct *task; 146 struct closure_syncer *s;
156 struct llist_node list; 147 struct llist_node list;
157 closure_fn *fn; 148 closure_fn *fn;
158 }; 149 };
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
178void closure_put(struct closure *cl); 169void closure_put(struct closure *cl);
179void __closure_wake_up(struct closure_waitlist *list); 170void __closure_wake_up(struct closure_waitlist *list);
180bool closure_wait(struct closure_waitlist *list, struct closure *cl); 171bool closure_wait(struct closure_waitlist *list, struct closure *cl);
181void closure_sync(struct closure *cl); 172void __closure_sync(struct closure *cl);
173
174/**
175 * closure_sync - sleep until a closure has nothing left to wait on
176 *
177 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
178 * the last refcount.
179 */
180static inline void closure_sync(struct closure *cl)
181{
182 if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
183 __closure_sync(cl);
184}
182 185
183#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 186#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
184 187
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
215#endif 218#endif
216} 219}
217 220
218static inline void __closure_end_sleep(struct closure *cl)
219{
220 __set_current_state(TASK_RUNNING);
221
222 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
223 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
224}
225
226static inline void __closure_start_sleep(struct closure *cl)
227{
228 closure_set_ip(cl);
229 cl->task = current;
230 set_current_state(TASK_UNINTERRUPTIBLE);
231
232 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
233 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
234}
235
236static inline void closure_set_stopped(struct closure *cl) 221static inline void closure_set_stopped(struct closure *cl)
237{ 222{
238 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 223 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
241static inline void set_closure_fn(struct closure *cl, closure_fn *fn, 226static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
242 struct workqueue_struct *wq) 227 struct workqueue_struct *wq)
243{ 228{
244 BUG_ON(object_is_on_stack(cl));
245 closure_set_ip(cl); 229 closure_set_ip(cl);
246 cl->fn = fn; 230 cl->fn = fn;
247 cl->wq = wq; 231 cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
300static inline void closure_init_stack(struct closure *cl) 284static inline void closure_init_stack(struct closure *cl)
301{ 285{
302 memset(cl, 0, sizeof(struct closure)); 286 memset(cl, 0, sizeof(struct closure));
303 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 287 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
304} 288}
305 289
306/** 290/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
322 * This is because after calling continue_at() you no longer have a ref on @cl, 306 * This is because after calling continue_at() you no longer have a ref on @cl,
323 * and whatever @cl owns may be freed out from under you - a running closure fn 307 * and whatever @cl owns may be freed out from under you - a running closure fn
324 * has a ref on its own closure which continue_at() drops. 308 * has a ref on its own closure which continue_at() drops.
309 *
310 * Note you are expected to immediately return after using this macro.
325 */ 311 */
326#define continue_at(_cl, _fn, _wq) \ 312#define continue_at(_cl, _fn, _wq) \
327do { \ 313do { \
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c7a02c4900da..af89408befe8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
116 return; 116 return;
117 check->bi_opf = REQ_OP_READ; 117 check->bi_opf = REQ_OP_READ;
118 118
119 if (bio_alloc_pages(check, GFP_NOIO)) 119 if (bch_bio_alloc_pages(check, GFP_NOIO))
120 goto out_put; 120 goto out_put;
121 121
122 submit_bio_wait(check); 122 submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
251 251
252int __init bch_debug_init(struct kobject *kobj) 252int __init bch_debug_init(struct kobject *kobj)
253{ 253{
254 int ret = 0;
255
256 debug = debugfs_create_dir("bcache", NULL); 254 debug = debugfs_create_dir("bcache", NULL);
257 return ret; 255
256 return IS_ERR_OR_NULL(debug);
258} 257}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fac97ec2d0e2..a783c5a41ff1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
51 51
52/* IO errors */ 52/* IO errors */
53 53
54void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) 54void bch_count_io_errors(struct cache *ca,
55 blk_status_t error,
56 int is_read,
57 const char *m)
55{ 58{
56 /* 59 /*
57 * The halflife of an error is: 60 * The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
94 errors >>= IO_ERROR_SHIFT; 97 errors >>= IO_ERROR_SHIFT;
95 98
96 if (errors < ca->set->error_limit) 99 if (errors < ca->set->error_limit)
97 pr_err("%s: IO error on %s, recovering", 100 pr_err("%s: IO error on %s%s",
98 bdevname(ca->bdev, buf), m); 101 bdevname(ca->bdev, buf), m,
102 is_read ? ", recovering." : ".");
99 else 103 else
100 bch_cache_set_error(ca->set, 104 bch_cache_set_error(ca->set,
101 "%s: too many IO errors %s", 105 "%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
108{ 112{
109 struct bbio *b = container_of(bio, struct bbio, bio); 113 struct bbio *b = container_of(bio, struct bbio, bio);
110 struct cache *ca = PTR_CACHE(c, &b->key, 0); 114 struct cache *ca = PTR_CACHE(c, &b->key, 0);
115 int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
111 116
112 unsigned threshold = op_is_write(bio_op(bio)) 117 unsigned threshold = op_is_write(bio_op(bio))
113 ? c->congested_write_threshold_us 118 ? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
129 atomic_inc(&c->congested); 134 atomic_inc(&c->congested);
130 } 135 }
131 136
132 bch_count_io_errors(ca, error, m); 137 bch_count_io_errors(ca, error, is_read, m);
133} 138}
134 139
135void bch_bbio_endio(struct cache_set *c, struct bio *bio, 140void bch_bbio_endio(struct cache_set *c, struct bio *bio,
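bch_count_io_errors() now takes an is_read flag purely so the log message is honest: only failed reads can be transparently recovered (served from the backing device); failed writes cannot. A toy version of that reporting decision, with a plain counter standing in for the kernel's decaying error count:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: counts I/O errors on one cache device and decides
 * whether caching can keep going. */
struct cache_err {
	unsigned errors;
	unsigned error_limit;
};

static bool count_io_error(struct cache_err *ca, const char *what, bool is_read)
{
	if (++ca->errors < ca->error_limit) {
		/* Reads can fall back to the backing device, so only they
		 * are reported as recoverable. */
		fprintf(stderr, "cache: IO error %s%s\n", what,
			is_read ? ", recovering." : ".");
		return true;
	}
	fprintf(stderr, "cache: too many IO errors %s, disabling caching\n", what);
	return false;
}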
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d50c1c97da68..a24c3a95b2c0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
162 bio_set_op_attrs(bio, REQ_OP_READ, 0); 162 bio_set_op_attrs(bio, REQ_OP_READ, 0);
163 bio->bi_end_io = read_moving_endio; 163 bio->bi_end_io = read_moving_endio;
164 164
165 if (bio_alloc_pages(bio, GFP_KERNEL)) 165 if (bch_bio_alloc_pages(bio, GFP_KERNEL))
166 goto err; 166 goto err;
167 167
168 trace_bcache_gc_copy(&w->key); 168 trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021624f..1a46b41dac70 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
576{ 576{
577 struct search *s = container_of(cl, struct search, iop.cl); 577 struct search *s = container_of(cl, struct search, iop.cl);
578 struct bio *bio = &s->bio.bio; 578 struct bio *bio = &s->bio.bio;
579 struct cached_dev *dc;
579 int ret; 580 int ret;
580 581
581 bch_btree_op_init(&s->op, -1); 582 bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
588 return; 589 return;
589 } 590 }
590 591
592 /*
 593 * We might hit an error while searching the btree; if that happens,
 594 * ret is negative. In that case we must not recover data from the
 595 * backing device (when the cache device is dirty), because we don't
 596 * know whether all the bkeys the read request covers are clean.
 597 *
 598 * After such an error, s->iop.status still holds its initial value
 599 * from before s->bio.bio was submitted.
600 */
601 if (ret < 0) {
602 BUG_ON(ret == -EINTR);
603 if (s->d && s->d->c &&
604 !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
605 dc = container_of(s->d, struct cached_dev, disk);
606 if (dc && atomic_read(&dc->has_dirty))
607 s->recoverable = false;
608 }
609 if (!s->iop.status)
610 s->iop.status = BLK_STS_IOERR;
611 }
612
591 closure_return(cl); 613 closure_return(cl);
592} 614}
593 615
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
611static void bio_complete(struct search *s) 633static void bio_complete(struct search *s)
612{ 634{
613 if (s->orig_bio) { 635 if (s->orig_bio) {
614 struct request_queue *q = s->orig_bio->bi_disk->queue; 636 generic_end_io_acct(s->d->disk->queue,
615 generic_end_io_acct(q, bio_data_dir(s->orig_bio), 637 bio_data_dir(s->orig_bio),
616 &s->d->disk->part0, s->start_time); 638 &s->d->disk->part0, s->start_time);
617 639
618 trace_bcache_request_end(s->d, s->orig_bio); 640 trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
841 cache_bio->bi_private = &s->cl; 863 cache_bio->bi_private = &s->cl;
842 864
843 bch_bio_map(cache_bio, NULL); 865 bch_bio_map(cache_bio, NULL);
844 if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) 866 if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
845 goto out_put; 867 goto out_put;
846 868
847 if (reada) 869 if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
974 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 996 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
975 int rw = bio_data_dir(bio); 997 int rw = bio_data_dir(bio);
976 998
999 atomic_set(&dc->backing_idle, 0);
977 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); 1000 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
978 1001
979 bio_set_dev(bio, dc->bdev); 1002 bio_set_dev(bio, dc->bdev);
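The new error path in cache_lookup() refuses to "recover" a read from the backing device when the btree search itself failed on a non-flash-only device holding dirty data, since stale data could be returned; it also forces an I/O error status if none was set. A simplified sketch of that decision, with hypothetical field names in place of the kernel's search/cached_dev structures:

#include <stdbool.h>

/* Placeholder status codes standing in for the kernel's blk_status_t. */
enum sts { STS_OK = 0, STS_IOERR };

struct lookup_state {
	bool flash_only;    /* device is a flash-only volume                 */
	bool has_dirty;     /* the cache holds dirty data for this device    */
	bool recoverable;   /* may the read be retried on the backing device */
	enum sts status;
};

/*
 * Mirrors the new error path in cache_lookup(): if the btree search itself
 * failed on a dirty, non-flash-only device, retrying the read against the
 * backing device could return stale data, so mark it unrecoverable and make
 * sure an error status is reported.
 */
static void note_lookup_error(struct lookup_state *s, int ret)
{
	if (ret >= 0)
		return;
	if (!s->flash_only && s->has_dirty)
		s->recoverable = false;
	if (s->status == STS_OK)
		s->status = STS_IOERR;
}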
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4d28928dec5..133b81225ea9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
211 211
212static void __write_super(struct cache_sb *sb, struct bio *bio) 212static void __write_super(struct cache_sb *sb, struct bio *bio)
213{ 213{
214 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); 214 struct cache_sb *out = page_address(bio_first_page_all(bio));
215 unsigned i; 215 unsigned i;
216 216
217 bio->bi_iter.bi_sector = SB_SECTOR; 217 bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
274{ 274{
275 struct cache *ca = bio->bi_private; 275 struct cache *ca = bio->bi_private;
276 276
277 bch_count_io_errors(ca, bio->bi_status, "writing superblock"); 277 /* is_read = 0 */
278 bch_count_io_errors(ca, bio->bi_status, 0,
279 "writing superblock");
278 closure_put(&ca->set->sb_write); 280 closure_put(&ca->set->sb_write);
279} 281}
280 282
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
721 d->c = c; 723 d->c = c;
722 c->devices[id] = d; 724 c->devices[id] = d;
723 725
726 if (id >= c->devices_max_used)
727 c->devices_max_used = id + 1;
728
724 closure_get(&c->caching); 729 closure_get(&c->caching);
725} 730}
726 731
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
906 911
907 mutex_lock(&bch_register_lock); 912 mutex_lock(&bch_register_lock);
908 913
914 cancel_delayed_work_sync(&dc->writeback_rate_update);
915 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
916 kthread_stop(dc->writeback_thread);
917 dc->writeback_thread = NULL;
918 }
919
909 memset(&dc->sb.set_uuid, 0, 16); 920 memset(&dc->sb.set_uuid, 0, 16);
910 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); 921 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
911 922
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1166 dc->bdev->bd_holder = dc; 1177 dc->bdev->bd_holder = dc;
1167 1178
1168 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); 1179 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1169 dc->sb_bio.bi_io_vec[0].bv_page = sb_page; 1180 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1170 get_page(sb_page); 1181 get_page(sb_page);
1171 1182
1172 if (cached_dev_init(dc, sb->block_size << 9)) 1183 if (cached_dev_init(dc, sb->block_size << 9))
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
1261 struct uuid_entry *u; 1272 struct uuid_entry *u;
1262 1273
1263 for (u = c->uuids; 1274 for (u = c->uuids;
1264 u < c->uuids + c->nr_uuids && !ret; 1275 u < c->uuids + c->devices_max_used && !ret;
1265 u++) 1276 u++)
1266 if (UUID_FLASH_ONLY(u)) 1277 if (UUID_FLASH_ONLY(u))
1267 ret = flash_dev_run(c, u); 1278 ret = flash_dev_run(c, u);
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
1427 1438
1428 mutex_lock(&bch_register_lock); 1439 mutex_lock(&bch_register_lock);
1429 1440
1430 for (i = 0; i < c->nr_uuids; i++) 1441 for (i = 0; i < c->devices_max_used; i++)
1431 if (c->devices[i]) { 1442 if (c->devices[i]) {
1432 if (!UUID_FLASH_ONLY(&c->uuids[i]) && 1443 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1433 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { 1444 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1490 c->bucket_bits = ilog2(sb->bucket_size); 1501 c->bucket_bits = ilog2(sb->bucket_size);
1491 c->block_bits = ilog2(sb->block_size); 1502 c->block_bits = ilog2(sb->block_size);
1492 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1503 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1493 1504 c->devices_max_used = 0;
1494 c->btree_pages = bucket_pages(c); 1505 c->btree_pages = bucket_pages(c);
1495 if (c->btree_pages > BTREE_MAX_PAGES) 1506 if (c->btree_pages > BTREE_MAX_PAGES)
1496 c->btree_pages = max_t(int, c->btree_pages / 4, 1507 c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
1810 free_fifo(&ca->free[i]); 1821 free_fifo(&ca->free[i]);
1811 1822
1812 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1823 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1813 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1824 put_page(bio_first_page_all(&ca->sb_bio));
1814 1825
1815 if (!IS_ERR_OR_NULL(ca->bdev)) 1826 if (!IS_ERR_OR_NULL(ca->bdev))
1816 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1827 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
1864 ca->bdev->bd_holder = ca; 1875 ca->bdev->bd_holder = ca;
1865 1876
1866 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); 1877 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
1867 ca->sb_bio.bi_io_vec[0].bv_page = sb_page; 1878 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
1868 get_page(sb_page); 1879 get_page(sb_page);
1869 1880
1870 if (blk_queue_discard(bdev_get_queue(ca->bdev))) 1881 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e548b8b51322..a23cd6a14b74 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
249 : 0; 249 : 0;
250} 250}
251 251
252/*
253 * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
254 * the preferred way is bio_add_page, but in this case, bch_bio_map()
255 * assumes that the bvec table is empty, so it is safe to access
256 * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
257 * supported.
258 */
252void bch_bio_map(struct bio *bio, void *base) 259void bch_bio_map(struct bio *bio, void *base)
253{ 260{
254 size_t size = bio->bi_iter.bi_size; 261 size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
276 } 283 }
277} 284}
278 285
286/**
287 * bch_bio_alloc_pages - allocates a single page for each bvec in a bio
288 * @bio: bio to allocate pages for
289 * @gfp_mask: flags for allocation
290 *
291 * Allocates pages up to @bio->bi_vcnt.
292 *
293 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
294 * freed.
295 */
296int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
297{
298 int i;
299 struct bio_vec *bv;
300
301 bio_for_each_segment_all(bv, bio, i) {
302 bv->bv_page = alloc_page(gfp_mask);
303 if (!bv->bv_page) {
304 while (--bv >= bio->bi_io_vec)
305 __free_page(bv->bv_page);
306 return -ENOMEM;
307 }
308 }
309
310 return 0;
311}
312
279/* 313/*
280 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 314 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
281 * use permitted, subject to terms of PostgreSQL license; see.) 315 * use permitted, subject to terms of PostgreSQL license; see.)
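bch_bio_alloc_pages() above allocates one page per bvec and, on failure, walks backwards through the table freeing everything it already allocated before returning -ENOMEM. The same unwind-on-failure idiom in plain C, with malloc standing in for alloc_page:

#include <stdlib.h>

/* Allocate one buffer per slot; on failure free what was already allocated
 * and report the failure, just as bch_bio_alloc_pages() does with its bvec
 * table. */
static int alloc_all(void **bufs, size_t n, size_t size)
{
	for (size_t i = 0; i < n; i++) {
		bufs[i] = malloc(size);
		if (!bufs[i]) {
			while (i--)
				free(bufs[i]);
			return -1;   /* -ENOMEM in the kernel version */
		}
	}
	return 0;
}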
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ed5e8a412eb8..4df4c5c1cab2 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
558} 558}
559 559
560void bch_bio_map(struct bio *bio, void *base); 560void bch_bio_map(struct bio *bio, void *base);
561int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
561 562
562static inline sector_t bdev_sectors(struct block_device *bdev) 563static inline sector_t bdev_sectors(struct block_device *bdev)
563{ 564{
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a37884ca8b..51306a19ab03 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -18,17 +18,39 @@
18#include <trace/events/bcache.h> 18#include <trace/events/bcache.h>
19 19
20/* Rate limiting */ 20/* Rate limiting */
21 21static uint64_t __calc_target_rate(struct cached_dev *dc)
22static void __update_writeback_rate(struct cached_dev *dc)
23{ 22{
24 struct cache_set *c = dc->disk.c; 23 struct cache_set *c = dc->disk.c;
24
25 /*
26 * This is the size of the cache, minus the amount used for
27 * flash-only devices
28 */
25 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - 29 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
26 bcache_flash_devs_sectors_dirty(c); 30 bcache_flash_devs_sectors_dirty(c);
31
32 /*
33 * Unfortunately there is no control of global dirty data. If the
34 * user states that they want 10% dirty data in the cache, and has,
 35 * e.g., 5 backing volumes of equal size, we try to ensure each
36 * backing volume uses about 2% of the cache for dirty data.
37 */
38 uint32_t bdev_share =
39 div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
40 c->cached_dev_sectors);
41
27 uint64_t cache_dirty_target = 42 uint64_t cache_dirty_target =
28 div_u64(cache_sectors * dc->writeback_percent, 100); 43 div_u64(cache_sectors * dc->writeback_percent, 100);
29 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
30 c->cached_dev_sectors);
31 44
45 /* Ensure each backing dev gets at least one dirty share */
46 if (bdev_share < 1)
47 bdev_share = 1;
48
49 return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
50}
51
52static void __update_writeback_rate(struct cached_dev *dc)
53{
32 /* 54 /*
33 * PI controller: 55 * PI controller:
34 * Figures out the amount that should be written per second. 56 * Figures out the amount that should be written per second.
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
49 * This acts as a slow, long-term average that is not subject to 71 * This acts as a slow, long-term average that is not subject to
50 * variations in usage like the p term. 72 * variations in usage like the p term.
51 */ 73 */
74 int64_t target = __calc_target_rate(dc);
52 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); 75 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
53 int64_t error = dirty - target; 76 int64_t error = dirty - target;
54 int64_t proportional_scaled = 77 int64_t proportional_scaled =
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
116struct dirty_io { 139struct dirty_io {
117 struct closure cl; 140 struct closure cl;
118 struct cached_dev *dc; 141 struct cached_dev *dc;
142 uint16_t sequence;
119 struct bio bio; 143 struct bio bio;
120}; 144};
121 145
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
194{ 218{
195 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 219 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
196 struct keybuf_key *w = io->bio.bi_private; 220 struct keybuf_key *w = io->bio.bi_private;
221 struct cached_dev *dc = io->dc;
222
223 uint16_t next_sequence;
224
225 if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
226 /* Not our turn to write; wait for a write to complete */
227 closure_wait(&dc->writeback_ordering_wait, cl);
228
229 if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
230 /*
231 * Edge case-- it happened in indeterminate order
232 * relative to when we were added to wait list..
233 */
234 closure_wake_up(&dc->writeback_ordering_wait);
235 }
236
237 continue_at(cl, write_dirty, io->dc->writeback_write_wq);
238 return;
239 }
240
241 next_sequence = io->sequence + 1;
197 242
198 /* 243 /*
199 * IO errors are signalled using the dirty bit on the key. 244 * IO errors are signalled using the dirty bit on the key.
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
211 closure_bio_submit(&io->bio, cl); 256 closure_bio_submit(&io->bio, cl);
212 } 257 }
213 258
259 atomic_set(&dc->writeback_sequence_next, next_sequence);
260 closure_wake_up(&dc->writeback_ordering_wait);
261
214 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); 262 continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
215} 263}
216 264
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
219 struct keybuf_key *w = bio->bi_private; 267 struct keybuf_key *w = bio->bi_private;
220 struct dirty_io *io = w->private; 268 struct dirty_io *io = w->private;
221 269
270 /* is_read = 1 */
222 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), 271 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
223 bio->bi_status, "reading dirty data from cache"); 272 bio->bi_status, 1,
273 "reading dirty data from cache");
224 274
225 dirty_endio(bio); 275 dirty_endio(bio);
226} 276}
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
237static void read_dirty(struct cached_dev *dc) 287static void read_dirty(struct cached_dev *dc)
238{ 288{
239 unsigned delay = 0; 289 unsigned delay = 0;
240 struct keybuf_key *w; 290 struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
291 size_t size;
292 int nk, i;
241 struct dirty_io *io; 293 struct dirty_io *io;
242 struct closure cl; 294 struct closure cl;
295 uint16_t sequence = 0;
243 296
297 BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
298 atomic_set(&dc->writeback_sequence_next, sequence);
244 closure_init_stack(&cl); 299 closure_init_stack(&cl);
245 300
246 /* 301 /*
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
248 * mempools. 303 * mempools.
249 */ 304 */
250 305
251 while (!kthread_should_stop()) { 306 next = bch_keybuf_next(&dc->writeback_keys);
252 307
253 w = bch_keybuf_next(&dc->writeback_keys); 308 while (!kthread_should_stop() && next) {
254 if (!w) 309 size = 0;
255 break; 310 nk = 0;
256 311
257 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 312 do {
258 313 BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
259 if (KEY_START(&w->key) != dc->last_read || 314
260 jiffies_to_msecs(delay) > 50) 315 /*
261 while (!kthread_should_stop() && delay) 316 * Don't combine too many operations, even if they
262 delay = schedule_timeout_interruptible(delay); 317 * are all small.
263 318 */
264 dc->last_read = KEY_OFFSET(&w->key); 319 if (nk >= MAX_WRITEBACKS_IN_PASS)
265 320 break;
266 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) 321
267 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 322 /*
268 GFP_KERNEL); 323 * If the current operation is very large, don't
269 if (!io) 324 * further combine operations.
270 goto err; 325 */
271 326 if (size >= MAX_WRITESIZE_IN_PASS)
272 w->private = io; 327 break;
273 io->dc = dc; 328
274 329 /*
275 dirty_init(w); 330 * Operations are only eligible to be combined
276 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); 331 * if they are contiguous.
277 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); 332 *
278 bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); 333 * TODO: add a heuristic willing to fire a
279 io->bio.bi_end_io = read_dirty_endio; 334 * certain amount of non-contiguous IO per pass,
280 335 * so that we can benefit from backing device
281 if (bio_alloc_pages(&io->bio, GFP_KERNEL)) 336 * command queueing.
282 goto err_free; 337 */
338 if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
339 &START_KEY(&next->key)))
340 break;
341
342 size += KEY_SIZE(&next->key);
343 keys[nk++] = next;
344 } while ((next = bch_keybuf_next(&dc->writeback_keys)));
345
346 /* Now we have gathered a set of 1..5 keys to write back. */
347 for (i = 0; i < nk; i++) {
348 w = keys[i];
349
350 io = kzalloc(sizeof(struct dirty_io) +
351 sizeof(struct bio_vec) *
352 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
353 GFP_KERNEL);
354 if (!io)
355 goto err;
356
357 w->private = io;
358 io->dc = dc;
359 io->sequence = sequence++;
360
361 dirty_init(w);
362 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
363 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
364 bio_set_dev(&io->bio,
365 PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
366 io->bio.bi_end_io = read_dirty_endio;
367
368 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
369 goto err_free;
370
371 trace_bcache_writeback(&w->key);
372
373 down(&dc->in_flight);
374
375 /* We've acquired a semaphore for the maximum
376 * simultaneous number of writebacks; from here
377 * everything happens asynchronously.
378 */
379 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
380 }
283 381
284 trace_bcache_writeback(&w->key); 382 delay = writeback_delay(dc, size);
285 383
286 down(&dc->in_flight); 384 /* If the control system would wait for at least half a
287 closure_call(&io->cl, read_dirty_submit, NULL, &cl); 385 * second, and there's been no reqs hitting the backing disk
 386 * for a while: use an alternate mode where we have at most
387 * one contiguous set of writebacks in flight at a time. If
388 * someone wants to do IO it will be quick, as it will only
389 * have to contend with one operation in flight, and we'll
390 * be round-tripping data to the backing disk as quickly as
391 * it can accept it.
392 */
393 if (delay >= HZ / 2) {
394 /* 3 means at least 1.5 seconds, up to 7.5 if we
395 * have slowed way down.
396 */
397 if (atomic_inc_return(&dc->backing_idle) >= 3) {
398 /* Wait for current I/Os to finish */
399 closure_sync(&cl);
400 /* And immediately launch a new set. */
401 delay = 0;
402 }
403 }
288 404
289 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 405 while (!kthread_should_stop() && delay) {
406 schedule_timeout_interruptible(delay);
407 delay = writeback_delay(dc, 0);
408 }
290 } 409 }
291 410
292 if (0) { 411 if (0) {
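The writeback changes above stamp each dirty_io with a sequence number when it is read and only let the write proceed once writeback_sequence_next matches it, so writes reach the backing device in dispatch (LBA) order even though the cache reads complete out of order. A single-threaded sketch of that gate; the real code parks the closure on writeback_ordering_wait and is re-run from a workqueue rather than returning false:

#include <stdbool.h>

/* Illustrative model of the ordering gate: each writeback I/O carries the
 * sequence number it was read with, and only the I/O whose number equals
 * next_sequence may submit its write. */
struct wb_io {
	unsigned sequence;
	bool write_submitted;
};

static unsigned next_sequence;

/* Returns true if the write was dispatched, false if the caller must wait
 * for its turn (in the kernel, by parking on writeback_ordering_wait). */
static bool try_write_dirty(struct wb_io *io)
{
	if (io->sequence != next_sequence)
		return false;

	/* ... submit the write to the backing device here ... */
	io->write_submitted = true;

	/* Let the next I/O in dispatch order proceed. */
	next_sequence = io->sequence + 1;
	return true;
}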
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..66f1c527fa24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,16 @@
5#define CUTOFF_WRITEBACK 40 5#define CUTOFF_WRITEBACK 40
6#define CUTOFF_WRITEBACK_SYNC 70 6#define CUTOFF_WRITEBACK_SYNC 70
7 7
8#define MAX_WRITEBACKS_IN_PASS 5
9#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
10
11/*
 12 * A shift of 14 (shares counted in 16384ths) gives each backing device a
 13 * reasonably fine-grained fraction of the share, and the arithmetic does
 14 * not blow up until individual backing devices approach a petabyte.
15 */
16#define WRITEBACK_SHARE_SHIFT 14
17
8static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) 18static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
9{ 19{
10 uint64_t i, ret = 0; 20 uint64_t i, ret = 0;
@@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
21 31
22 mutex_lock(&bch_register_lock); 32 mutex_lock(&bch_register_lock);
23 33
24 for (i = 0; i < c->nr_uuids; i++) { 34 for (i = 0; i < c->devices_max_used; i++) {
25 struct bcache_device *d = c->devices[i]; 35 struct bcache_device *d = c->devices[i];
26 36
27 if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) 37 if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
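WRITEBACK_SHARE_SHIFT defined above is used by __calc_target_rate() (in the writeback.c hunk earlier) to split the global dirty target between backing devices in proportion to their size, with 14-bit fixed point (16384ths) so the ratio survives integer division. A standalone version of just that arithmetic, ignoring the flash-only correction and using made-up sizes:

#include <stdint.h>
#include <stdio.h>

#define WRITEBACK_SHARE_SHIFT 14

/*
 * Per-device dirty target: the whole-cache dirty target scaled by this
 * device's fraction of all cached sectors, expressed in 16384ths.
 */
static uint64_t calc_target(uint64_t cache_sectors, unsigned writeback_percent,
			    uint64_t bdev_sectors, uint64_t cached_dev_sectors)
{
	uint64_t cache_dirty_target = cache_sectors * writeback_percent / 100;
	uint64_t bdev_share = (bdev_sectors << WRITEBACK_SHARE_SHIFT) /
			      cached_dev_sectors;

	if (bdev_share < 1)          /* every device gets at least one share */
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}

int main(void)
{
	/* 1 TiB cache, 10% dirty target, one of five equal 4 TiB backing devs
	 * (all sizes in 512-byte sectors). */
	uint64_t t = calc_target(2048ULL << 20, 10,
				 8192ULL << 20, 5 * (8192ULL << 20));
	printf("per-device dirty target: %llu sectors\n", (unsigned long long)t);
	return 0;
}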
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 554d60394c06..2ad429100d25 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
1446 bio_for_each_segment_all(bv, clone, i) { 1446 bio_for_each_segment_all(bv, clone, i) {
1447 BUG_ON(!bv->bv_page); 1447 BUG_ON(!bv->bv_page);
1448 mempool_free(bv->bv_page, cc->page_pool); 1448 mempool_free(bv->bv_page, cc->page_pool);
1449 bv->bv_page = NULL;
1450 } 1449 }
1451} 1450}
1452 1451
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f7810cc869ac..ef57c6d1c887 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work)
1475 activate_or_offline_path(pgpath); 1475 activate_or_offline_path(pgpath);
1476} 1476}
1477 1477
1478static int noretry_error(blk_status_t error)
1479{
1480 switch (error) {
1481 case BLK_STS_NOTSUPP:
1482 case BLK_STS_NOSPC:
1483 case BLK_STS_TARGET:
1484 case BLK_STS_NEXUS:
1485 case BLK_STS_MEDIUM:
1486 return 1;
1487 }
1488
1489 /* Anything else could be a path failure, so should be retried */
1490 return 0;
1491}
1492
1493static int multipath_end_io(struct dm_target *ti, struct request *clone, 1478static int multipath_end_io(struct dm_target *ti, struct request *clone,
1494 blk_status_t error, union map_info *map_context) 1479 blk_status_t error, union map_info *map_context)
1495{ 1480{
@@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1508 * request into dm core, which will remake a clone request and 1493 * request into dm core, which will remake a clone request and
1509 * clone bios for it and resubmit it later. 1494 * clone bios for it and resubmit it later.
1510 */ 1495 */
1511 if (error && !noretry_error(error)) { 1496 if (error && blk_path_error(error)) {
1512 struct multipath *m = ti->private; 1497 struct multipath *m = ti->private;
1513 1498
1514 r = DM_ENDIO_REQUEUE; 1499 r = DM_ENDIO_REQUEUE;
@@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1544 unsigned long flags; 1529 unsigned long flags;
1545 int r = DM_ENDIO_DONE; 1530 int r = DM_ENDIO_DONE;
1546 1531
1547 if (!*error || noretry_error(*error)) 1532 if (!*error || !blk_path_error(*error))
1548 goto done; 1533 goto done;
1549 1534
1550 if (pgpath) 1535 if (pgpath)
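dm-mpath's private noretry_error() is replaced by the block layer's blk_path_error() helper: statuses that describe a target-side condition will fail identically on every path, so only the remaining statuses are worth retrying elsewhere. A sketch of the equivalent classification, mirroring the status list from the removed switch (placeholder names; the real helper is provided by the block layer):

#include <stdbool.h>

/* Placeholder codes standing in for the blk_status_t values in the switch. */
enum sts {
	STS_OK, STS_NOTSUPP, STS_NOSPC, STS_TARGET, STS_NEXUS, STS_MEDIUM,
	STS_TRANSPORT, STS_IOERR,
};

/*
 * Equivalent of the classification dm-mpath now gets from blk_path_error():
 * target-side conditions (unsupported op, no space, medium error, ...) will
 * fail the same way on every path, so only the rest merit a retry on
 * another path.
 */
static bool path_error(enum sts error)
{
	switch (error) {
	case STS_NOTSUPP:
	case STS_NOSPC:
	case STS_TARGET:
	case STS_NEXUS:
	case STS_MEDIUM:
		return false;
	default:
		return true;
	}
}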
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d32f25489c2..b7d175e94a02 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
395 dm_complete_request(tio->orig, error); 395 dm_complete_request(tio->orig, error);
396} 396}
397 397
398static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 398static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
399{ 399{
400 blk_status_t r; 400 blk_status_t r;
401 401
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
404 404
405 clone->start_time = jiffies; 405 clone->start_time = jiffies;
406 r = blk_insert_cloned_request(clone->q, clone); 406 r = blk_insert_cloned_request(clone->q, clone);
407 if (r) 407 if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
408 /* must complete clone in terms of original request */ 408 /* must complete clone in terms of original request */
409 dm_complete_request(rq, r); 409 dm_complete_request(rq, r);
410 return r;
410} 411}
411 412
412static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 413static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
476 struct mapped_device *md = tio->md; 477 struct mapped_device *md = tio->md;
477 struct request *rq = tio->orig; 478 struct request *rq = tio->orig;
478 struct request *clone = NULL; 479 struct request *clone = NULL;
480 blk_status_t ret;
479 481
480 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 482 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
483check_again:
481 switch (r) { 484 switch (r) {
482 case DM_MAPIO_SUBMITTED: 485 case DM_MAPIO_SUBMITTED:
483 /* The target has taken the I/O to submit by itself later */ 486 /* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
492 /* The target has remapped the I/O so dispatch it */ 495 /* The target has remapped the I/O so dispatch it */
493 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 496 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
494 blk_rq_pos(rq)); 497 blk_rq_pos(rq));
495 dm_dispatch_clone_request(clone, rq); 498 ret = dm_dispatch_clone_request(clone, rq);
499 if (ret == BLK_STS_RESOURCE) {
500 blk_rq_unprep_clone(clone);
501 tio->ti->type->release_clone_rq(clone);
502 tio->clone = NULL;
503 if (!rq->q->mq_ops)
504 r = DM_MAPIO_DELAY_REQUEUE;
505 else
506 r = DM_MAPIO_REQUEUE;
507 goto check_again;
508 }
496 break; 509 break;
497 case DM_MAPIO_REQUEUE: 510 case DM_MAPIO_REQUEUE:
498 /* The target wants to requeue the I/O */ 511 /* The target wants to requeue the I/O */
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
713 return error; 726 return error;
714 } 727 }
715 728
716 elv_register_queue(md->queue);
717
718 return 0; 729 return 0;
719} 730}
720 731
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
812 } 823 }
813 dm_init_md_queue(md); 824 dm_init_md_queue(md);
814 825
815 /* backfill 'mq' sysfs registration normally done in blk_register_queue */
816 err = blk_mq_register_dev(disk_to_dev(md->disk), q);
817 if (err)
818 goto out_cleanup_queue;
819
820 return 0; 826 return 0;
821 827
822out_cleanup_queue:
823 blk_cleanup_queue(q);
824out_tag_set: 828out_tag_set:
825 blk_mq_free_tag_set(md->tag_set); 829 blk_mq_free_tag_set(md->tag_set);
826out_kfree_tag_set: 830out_kfree_tag_set:
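map_request() now inspects the status returned by dm_dispatch_clone_request(): BLK_STS_RESOURCE means the underlying queue was merely busy, so the clone is released and the original request is requeued (with a delay on the legacy request path) instead of being completed with an error. A compact, hypothetical sketch of that control-flow decision, with simplified names:

/* Illustrative outcome codes for the mapping step. */
enum map_result { MAP_SUBMITTED, MAP_REQUEUE, MAP_DELAY_REQUEUE, MAP_KILL };
enum dispatch_sts { DISPATCH_OK, DISPATCH_BUSY, DISPATCH_ERROR };

/*
 * Mirrors the new check in map_request(): a busy underlying queue is not an
 * error, it just means "try again later"; only real errors are passed back
 * to the original request.
 */
static enum map_result handle_dispatch(enum dispatch_sts sts, int is_blk_mq)
{
	switch (sts) {
	case DISPATCH_OK:
		return MAP_SUBMITTED;
	case DISPATCH_BUSY:
		/* release the clone, then requeue the original request */
		return is_blk_mq ? MAP_REQUEUE : MAP_DELAY_REQUEUE;
	case DISPATCH_ERROR:
	default:
		return MAP_KILL;   /* the error reaches the original request */
	}
}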
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de17b7193299..8c26bfc35335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
920 return -EINVAL; 920 return -EINVAL;
921 } 921 }
922 922
923 ti->max_io_len = (uint32_t) len; 923 /*
924 * BIO based queue uses its own splitting. When multipage bvecs
925 * is switched on, size of the incoming bio may be too big to
926 * be handled in some targets, such as crypt.
927 *
928 * When these targets are ready for the big bio, we can remove
929 * the limit.
930 */
931 ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
924 932
925 return 0; 933 return 0;
926} 934}
@@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
1753 goto bad; 1761 goto bad;
1754 md->dax_dev = dax_dev; 1762 md->dax_dev = dax_dev;
1755 1763
1756 add_disk(md->disk); 1764 add_disk_no_queue_reg(md->disk);
1757 format_dev_t(md->name, MKDEV(_major, minor)); 1765 format_dev_t(md->name, MKDEV(_major, minor));
1758 1766
1759 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 1767 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2013int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2021int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2014{ 2022{
2015 int r; 2023 int r;
2024 struct queue_limits limits;
2016 enum dm_queue_mode type = dm_get_md_type(md); 2025 enum dm_queue_mode type = dm_get_md_type(md);
2017 2026
2018 switch (type) { 2027 switch (type) {
@@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2049 break; 2058 break;
2050 } 2059 }
2051 2060
2061 r = dm_calculate_queue_limits(t, &limits);
2062 if (r) {
2063 DMERR("Cannot calculate initial queue limits");
2064 return r;
2065 }
2066 dm_table_set_restrictions(t, md->queue, &limits);
2067 blk_register_queue(md->disk);
2068
2052 return 0; 2069 return 0;
2053} 2070}
2054 2071
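dm_set_target_max_io_len() now clamps the target's max_io_len so that a single bio never exceeds BIO_MAX_PAGES worth of data, protecting targets like dm-crypt until they can handle larger multipage-bvec bios. The clamp itself is just a min(); a tiny sketch, assuming 4 KiB pages and the then-current BIO_MAX_PAGES of 256:

#include <stdint.h>

#define BIO_MAX_PAGES 256      /* kernel value at the time of this series */
#define PAGE_SIZE     4096     /* assumed page size for this sketch       */

/* Cap a target's max_io_len at one BIO_MAX_PAGES-sized bio. */
static uint32_t clamp_max_io_len(uint64_t len)
{
	uint64_t cap = (uint64_t)BIO_MAX_PAGES * PAGE_SIZE;   /* 1 MiB here */

	return (uint32_t)(len < cap ? len : cap);
}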
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a25fd43650ad..441e67e3a9d7 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,4 +1,7 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2
3ccflags-y += -I$(src)
4
2obj-$(CONFIG_NVME_CORE) += nvme-core.o 5obj-$(CONFIG_NVME_CORE) += nvme-core.o
3obj-$(CONFIG_BLK_DEV_NVME) += nvme.o 6obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
4obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o 7obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
6obj-$(CONFIG_NVME_FC) += nvme-fc.o 9obj-$(CONFIG_NVME_FC) += nvme-fc.o
7 10
8nvme-core-y := core.o 11nvme-core-y := core.o
12nvme-core-$(CONFIG_TRACING) += trace.o
9nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o 13nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
10nvme-core-$(CONFIG_NVM) += lightnvm.o 14nvme-core-$(CONFIG_NVM) += lightnvm.o
11 15
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 839650e0926a..e8104871cbbf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -29,6 +29,9 @@
29#include <linux/pm_qos.h> 29#include <linux/pm_qos.h>
30#include <asm/unaligned.h> 30#include <asm/unaligned.h>
31 31
32#define CREATE_TRACE_POINTS
33#include "trace.h"
34
32#include "nvme.h" 35#include "nvme.h"
33#include "fabrics.h" 36#include "fabrics.h"
34 37
@@ -65,9 +68,26 @@ static bool streams;
65module_param(streams, bool, 0644); 68module_param(streams, bool, 0644);
66MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); 69MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
67 70
71/*
72 * nvme_wq - hosts nvme related works that are not reset or delete
73 * nvme_reset_wq - hosts nvme reset works
74 * nvme_delete_wq - hosts nvme delete works
75 *
 76 * nvme_wq will host works such as scan, aen handling, fw activation,
 77 * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
 78 * runs reset works, which also flush works hosted on nvme_wq for
 79 * serialization purposes. nvme_delete_wq hosts controller deletion
80 * works which flush reset works for serialization.
81 */
68struct workqueue_struct *nvme_wq; 82struct workqueue_struct *nvme_wq;
69EXPORT_SYMBOL_GPL(nvme_wq); 83EXPORT_SYMBOL_GPL(nvme_wq);
70 84
85struct workqueue_struct *nvme_reset_wq;
86EXPORT_SYMBOL_GPL(nvme_reset_wq);
87
88struct workqueue_struct *nvme_delete_wq;
89EXPORT_SYMBOL_GPL(nvme_delete_wq);
90
71static DEFINE_IDA(nvme_subsystems_ida); 91static DEFINE_IDA(nvme_subsystems_ida);
72static LIST_HEAD(nvme_subsystems); 92static LIST_HEAD(nvme_subsystems);
73static DEFINE_MUTEX(nvme_subsystems_lock); 93static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
89{ 109{
90 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 110 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
91 return -EBUSY; 111 return -EBUSY;
92 if (!queue_work(nvme_wq, &ctrl->reset_work)) 112 if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
93 return -EBUSY; 113 return -EBUSY;
94 return 0; 114 return 0;
95} 115}
96EXPORT_SYMBOL_GPL(nvme_reset_ctrl); 116EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
97 117
98static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) 118int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
99{ 119{
100 int ret; 120 int ret;
101 121
@@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
104 flush_work(&ctrl->reset_work); 124 flush_work(&ctrl->reset_work);
105 return ret; 125 return ret;
106} 126}
127EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
107 128
108static void nvme_delete_ctrl_work(struct work_struct *work) 129static void nvme_delete_ctrl_work(struct work_struct *work)
109{ 130{
@@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
122{ 143{
123 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 144 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
124 return -EBUSY; 145 return -EBUSY;
125 if (!queue_work(nvme_wq, &ctrl->delete_work)) 146 if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
126 return -EBUSY; 147 return -EBUSY;
127 return 0; 148 return 0;
128} 149}
@@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req)
157 return BLK_STS_OK; 178 return BLK_STS_OK;
158 case NVME_SC_CAP_EXCEEDED: 179 case NVME_SC_CAP_EXCEEDED:
159 return BLK_STS_NOSPC; 180 return BLK_STS_NOSPC;
181 case NVME_SC_LBA_RANGE:
182 return BLK_STS_TARGET;
183 case NVME_SC_BAD_ATTRIBUTES:
160 case NVME_SC_ONCS_NOT_SUPPORTED: 184 case NVME_SC_ONCS_NOT_SUPPORTED:
185 case NVME_SC_INVALID_OPCODE:
186 case NVME_SC_INVALID_FIELD:
187 case NVME_SC_INVALID_NS:
161 return BLK_STS_NOTSUPP; 188 return BLK_STS_NOTSUPP;
162 case NVME_SC_WRITE_FAULT: 189 case NVME_SC_WRITE_FAULT:
163 case NVME_SC_READ_ERROR: 190 case NVME_SC_READ_ERROR:
164 case NVME_SC_UNWRITTEN_BLOCK: 191 case NVME_SC_UNWRITTEN_BLOCK:
165 case NVME_SC_ACCESS_DENIED: 192 case NVME_SC_ACCESS_DENIED:
166 case NVME_SC_READ_ONLY: 193 case NVME_SC_READ_ONLY:
194 case NVME_SC_COMPARE_FAILED:
167 return BLK_STS_MEDIUM; 195 return BLK_STS_MEDIUM;
168 case NVME_SC_GUARD_CHECK: 196 case NVME_SC_GUARD_CHECK:
169 case NVME_SC_APPTAG_CHECK: 197 case NVME_SC_APPTAG_CHECK:
@@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req)
190 218
191void nvme_complete_rq(struct request *req) 219void nvme_complete_rq(struct request *req)
192{ 220{
193 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { 221 blk_status_t status = nvme_error_status(req);
194 if (nvme_req_needs_failover(req)) { 222
223 trace_nvme_complete_rq(req);
224
225 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
226 if (nvme_req_needs_failover(req, status)) {
195 nvme_failover_req(req); 227 nvme_failover_req(req);
196 return; 228 return;
197 } 229 }
@@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req)
202 return; 234 return;
203 } 235 }
204 } 236 }
205 237 blk_mq_end_request(req, status);
206 blk_mq_end_request(req, nvme_error_status(req));
207} 238}
208EXPORT_SYMBOL_GPL(nvme_complete_rq); 239EXPORT_SYMBOL_GPL(nvme_complete_rq);
209 240
@@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
232 263
233 old_state = ctrl->state; 264 old_state = ctrl->state;
234 switch (new_state) { 265 switch (new_state) {
266 case NVME_CTRL_ADMIN_ONLY:
267 switch (old_state) {
268 case NVME_CTRL_RECONNECTING:
269 changed = true;
270 /* FALLTHRU */
271 default:
272 break;
273 }
274 break;
235 case NVME_CTRL_LIVE: 275 case NVME_CTRL_LIVE:
236 switch (old_state) { 276 switch (old_state) {
237 case NVME_CTRL_NEW: 277 case NVME_CTRL_NEW:
@@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
247 switch (old_state) { 287 switch (old_state) {
248 case NVME_CTRL_NEW: 288 case NVME_CTRL_NEW:
249 case NVME_CTRL_LIVE: 289 case NVME_CTRL_LIVE:
290 case NVME_CTRL_ADMIN_ONLY:
250 changed = true; 291 changed = true;
251 /* FALLTHRU */ 292 /* FALLTHRU */
252 default: 293 default:
@@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
266 case NVME_CTRL_DELETING: 307 case NVME_CTRL_DELETING:
267 switch (old_state) { 308 switch (old_state) {
268 case NVME_CTRL_LIVE: 309 case NVME_CTRL_LIVE:
310 case NVME_CTRL_ADMIN_ONLY:
269 case NVME_CTRL_RESETTING: 311 case NVME_CTRL_RESETTING:
270 case NVME_CTRL_RECONNECTING: 312 case NVME_CTRL_RECONNECTING:
271 changed = true; 313 changed = true;
@@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
591 } 633 }
592 634
593 cmd->common.command_id = req->tag; 635 cmd->common.command_id = req->tag;
636 if (ns)
637 trace_nvme_setup_nvm_cmd(req->q->id, cmd);
638 else
639 trace_nvme_setup_admin_cmd(cmd);
594 return ret; 640 return ret;
595} 641}
596EXPORT_SYMBOL_GPL(nvme_setup_cmd); 642EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
1217#ifdef CONFIG_NVME_MULTIPATH 1263#ifdef CONFIG_NVME_MULTIPATH
1218 /* should never be called due to GENHD_FL_HIDDEN */ 1264 /* should never be called due to GENHD_FL_HIDDEN */
1219 if (WARN_ON_ONCE(ns->head->disk)) 1265 if (WARN_ON_ONCE(ns->head->disk))
1220 return -ENXIO; 1266 goto fail;
1221#endif 1267#endif
1222 if (!kref_get_unless_zero(&ns->kref)) 1268 if (!kref_get_unless_zero(&ns->kref))
1223 return -ENXIO; 1269 goto fail;
1270 if (!try_module_get(ns->ctrl->ops->module))
1271 goto fail_put_ns;
1272
1224 return 0; 1273 return 0;
1274
1275fail_put_ns:
1276 nvme_put_ns(ns);
1277fail:
1278 return -ENXIO;
1225} 1279}
1226 1280
1227static void nvme_release(struct gendisk *disk, fmode_t mode) 1281static void nvme_release(struct gendisk *disk, fmode_t mode)
1228{ 1282{
1229 nvme_put_ns(disk->private_data); 1283 struct nvme_ns *ns = disk->private_data;
1284
1285 module_put(ns->ctrl->ops->module);
1286 nvme_put_ns(ns);
1230} 1287}
1231 1288
1232static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1289static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2052 NULL, 2109 NULL,
2053}; 2110};
2054 2111
2112static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2113{
2114 int count = 0;
2115 struct nvme_ctrl *ctrl;
2116
2117 mutex_lock(&subsys->lock);
2118 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2119 if (ctrl->state != NVME_CTRL_DELETING &&
2120 ctrl->state != NVME_CTRL_DEAD)
2121 count++;
2122 }
2123 mutex_unlock(&subsys->lock);
2124
2125 return count;
2126}
2127
2055static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2128static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2056{ 2129{
2057 struct nvme_subsystem *subsys, *found; 2130 struct nvme_subsystem *subsys, *found;
@@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2090 * Verify that the subsystem actually supports multiple 2163 * Verify that the subsystem actually supports multiple
2091 * controllers, else bail out. 2164 * controllers, else bail out.
2092 */ 2165 */
2093 if (!(id->cmic & (1 << 1))) { 2166 if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2094 dev_err(ctrl->device, 2167 dev_err(ctrl->device,
2095 "ignoring ctrl due to duplicate subnqn (%s).\n", 2168 "ignoring ctrl due to duplicate subnqn (%s).\n",
2096 found->subnqn); 2169 found->subnqn);
@@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2257 shutdown_timeout, 60); 2330 shutdown_timeout, 60);
2258 2331
2259 if (ctrl->shutdown_timeout != shutdown_timeout) 2332 if (ctrl->shutdown_timeout != shutdown_timeout)
2260 dev_warn(ctrl->device, 2333 dev_info(ctrl->device,
2261 "Shutdown timeout set to %u seconds\n", 2334 "Shutdown timeout set to %u seconds\n",
2262 ctrl->shutdown_timeout); 2335 ctrl->shutdown_timeout);
2263 } else 2336 } else
@@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
2341 struct nvme_ctrl *ctrl = 2414 struct nvme_ctrl *ctrl =
2342 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 2415 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2343 2416
2344 if (ctrl->state != NVME_CTRL_LIVE) 2417 switch (ctrl->state) {
2418 case NVME_CTRL_LIVE:
2419 case NVME_CTRL_ADMIN_ONLY:
2420 break;
2421 default:
2345 return -EWOULDBLOCK; 2422 return -EWOULDBLOCK;
2423 }
2424
2346 file->private_data = ctrl; 2425 file->private_data = ctrl;
2347 return 0; 2426 return 0;
2348} 2427}
@@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
2606 static const char *const state_name[] = { 2685 static const char *const state_name[] = {
2607 [NVME_CTRL_NEW] = "new", 2686 [NVME_CTRL_NEW] = "new",
2608 [NVME_CTRL_LIVE] = "live", 2687 [NVME_CTRL_LIVE] = "live",
2688 [NVME_CTRL_ADMIN_ONLY] = "only-admin",
2609 [NVME_CTRL_RESETTING] = "resetting", 2689 [NVME_CTRL_RESETTING] = "resetting",
2610 [NVME_CTRL_RECONNECTING]= "reconnecting", 2690 [NVME_CTRL_RECONNECTING]= "reconnecting",
2611 [NVME_CTRL_DELETING] = "deleting", 2691 [NVME_CTRL_DELETING] = "deleting",
@@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work)
3079 if (ctrl->state != NVME_CTRL_LIVE) 3159 if (ctrl->state != NVME_CTRL_LIVE)
3080 return; 3160 return;
3081 3161
3162 WARN_ON_ONCE(!ctrl->tagset);
3163
3082 if (nvme_identify_ctrl(ctrl, &id)) 3164 if (nvme_identify_ctrl(ctrl, &id))
3083 return; 3165 return;
3084 3166
@@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work)
3099void nvme_queue_scan(struct nvme_ctrl *ctrl) 3181void nvme_queue_scan(struct nvme_ctrl *ctrl)
3100{ 3182{
3101 /* 3183 /*
3102 * Do not queue new scan work when a controller is reset during 3184 * Only new queue scan work when admin and IO queues are both alive
3103 * removal.
3104 */ 3185 */
3105 if (ctrl->state == NVME_CTRL_LIVE) 3186 if (ctrl->state == NVME_CTRL_LIVE)
3106 queue_work(nvme_wq, &ctrl->scan_work); 3187 queue_work(nvme_wq, &ctrl->scan_work);
@@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3477 3558
3478int __init nvme_core_init(void) 3559int __init nvme_core_init(void)
3479{ 3560{
3480 int result; 3561 int result = -ENOMEM;
3481 3562
3482 nvme_wq = alloc_workqueue("nvme-wq", 3563 nvme_wq = alloc_workqueue("nvme-wq",
3483 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3564 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3484 if (!nvme_wq) 3565 if (!nvme_wq)
3485 return -ENOMEM; 3566 goto out;
3567
3568 nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
3569 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3570 if (!nvme_reset_wq)
3571 goto destroy_wq;
3572
3573 nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
3574 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3575 if (!nvme_delete_wq)
3576 goto destroy_reset_wq;
3486 3577
3487 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); 3578 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
3488 if (result < 0) 3579 if (result < 0)
3489 goto destroy_wq; 3580 goto destroy_delete_wq;
3490 3581
3491 nvme_class = class_create(THIS_MODULE, "nvme"); 3582 nvme_class = class_create(THIS_MODULE, "nvme");
3492 if (IS_ERR(nvme_class)) { 3583 if (IS_ERR(nvme_class)) {
@@ -3505,8 +3596,13 @@ destroy_class:
3505 class_destroy(nvme_class); 3596 class_destroy(nvme_class);
3506unregister_chrdev: 3597unregister_chrdev:
3507 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3598 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3599destroy_delete_wq:
3600 destroy_workqueue(nvme_delete_wq);
3601destroy_reset_wq:
3602 destroy_workqueue(nvme_reset_wq);
3508destroy_wq: 3603destroy_wq:
3509 destroy_workqueue(nvme_wq); 3604 destroy_workqueue(nvme_wq);
3605out:
3510 return result; 3606 return result;
3511} 3607}
3512 3608
@@ -3516,6 +3612,8 @@ void nvme_core_exit(void)
3516 class_destroy(nvme_subsys_class); 3612 class_destroy(nvme_subsys_class);
3517 class_destroy(nvme_class); 3613 class_destroy(nvme_class);
3518 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3614 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3615 destroy_workqueue(nvme_delete_wq);
3616 destroy_workqueue(nvme_reset_wq);
3519 destroy_workqueue(nvme_wq); 3617 destroy_workqueue(nvme_wq);
3520} 3618}
3521 3619
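
With nvme_reset_wq and nvme_delete_wq joining nvme_wq, nvme_core_init() grows the usual goto unwind ladder and nvme_core_exit() tears the queues down in reverse allocation order; keeping delete work on its own queue also lets the rdma ib_client removal path further down flush only nvme_delete_wq instead of everything queued on nvme_wq. A condensed sketch of the idiom with hypothetical names (kernel-style, not the literal driver code):

/*
 * Condensed sketch of the init/exit ladder added above; hypothetical names.
 * Each failure label undoes exactly the allocations that already succeeded,
 * and the exit path repeats the destructions in reverse allocation order.
 */
static struct workqueue_struct *example_wq, *example_reset_wq;

static int __init example_init(void)
{
	int ret = -ENOMEM;

	example_wq = alloc_workqueue("example-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!example_wq)
		goto out;

	example_reset_wq = alloc_workqueue("example-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!example_reset_wq)
		goto destroy_wq;

	return 0;

destroy_wq:
	destroy_workqueue(example_wq);
out:
	return ret;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_reset_wq);	/* reverse order of allocation */
	destroy_workqueue(example_wq);
}
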
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 894c2ccb3891..5dd4ceefed8f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
493 */ 493 */
494int nvmf_register_transport(struct nvmf_transport_ops *ops) 494int nvmf_register_transport(struct nvmf_transport_ops *ops)
495{ 495{
496 if (!ops->create_ctrl) 496 if (!ops->create_ctrl || !ops->module)
497 return -EINVAL; 497 return -EINVAL;
498 498
499 down_write(&nvmf_transports_rwsem); 499 down_write(&nvmf_transports_rwsem);
@@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
739 ret = -ENOMEM; 739 ret = -ENOMEM;
740 goto out; 740 goto out;
741 } 741 }
742 if (uuid_parse(p, &hostid)) { 742 ret = uuid_parse(p, &hostid);
743 if (ret) {
743 pr_err("Invalid hostid %s\n", p); 744 pr_err("Invalid hostid %s\n", p);
744 ret = -EINVAL; 745 ret = -EINVAL;
746 kfree(p);
745 goto out; 747 goto out;
746 } 748 }
749 kfree(p);
747 break; 750 break;
748 case NVMF_OPT_DUP_CONNECT: 751 case NVMF_OPT_DUP_CONNECT:
749 opts->duplicate_connect = true; 752 opts->duplicate_connect = true;
@@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
869 goto out_unlock; 872 goto out_unlock;
870 } 873 }
871 874
875 if (!try_module_get(ops->module)) {
876 ret = -EBUSY;
877 goto out_unlock;
878 }
879
872 ret = nvmf_check_required_opts(opts, ops->required_opts); 880 ret = nvmf_check_required_opts(opts, ops->required_opts);
873 if (ret) 881 if (ret)
874 goto out_unlock; 882 goto out_module_put;
875 ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | 883 ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
876 ops->allowed_opts | ops->required_opts); 884 ops->allowed_opts | ops->required_opts);
877 if (ret) 885 if (ret)
878 goto out_unlock; 886 goto out_module_put;
879 887
880 ctrl = ops->create_ctrl(dev, opts); 888 ctrl = ops->create_ctrl(dev, opts);
881 if (IS_ERR(ctrl)) { 889 if (IS_ERR(ctrl)) {
882 ret = PTR_ERR(ctrl); 890 ret = PTR_ERR(ctrl);
883 goto out_unlock; 891 goto out_module_put;
884 } 892 }
885 893
886 if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { 894 if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
887 dev_warn(ctrl->device, 895 dev_warn(ctrl->device,
888 "controller returned incorrect NQN: \"%s\".\n", 896 "controller returned incorrect NQN: \"%s\".\n",
889 ctrl->subsys->subnqn); 897 ctrl->subsys->subnqn);
898 module_put(ops->module);
890 up_read(&nvmf_transports_rwsem); 899 up_read(&nvmf_transports_rwsem);
891 nvme_delete_ctrl_sync(ctrl); 900 nvme_delete_ctrl_sync(ctrl);
892 return ERR_PTR(-EINVAL); 901 return ERR_PTR(-EINVAL);
893 } 902 }
894 903
904 module_put(ops->module);
895 up_read(&nvmf_transports_rwsem); 905 up_read(&nvmf_transports_rwsem);
896 return ctrl; 906 return ctrl;
897 907
908out_module_put:
909 module_put(ops->module);
898out_unlock: 910out_unlock:
899 up_read(&nvmf_transports_rwsem); 911 up_read(&nvmf_transports_rwsem);
900out_free_opts: 912out_free_opts:
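
Two independent fixes land in nvmf_parse_options()/nvmf_create_ctrl() above: the duplicated hostid token is now kfree()d on both the error and the success path, and the transport module is pinned with try_module_get() for exactly as long as the core calls through its ops (every exit path, including the bad-NQN one, drops the reference again). A hedged, kernel-style sketch of that second pattern with hypothetical names:

/*
 * Kernel-style sketch (hypothetical names, not the literal driver code):
 * hold a reference on the module that owns an ops table for exactly as long
 * as we call through it, and drop it on every exit path.
 */
struct example_ctrl;

struct example_ops {
	struct module *module;
	struct example_ctrl *(*create_ctrl)(void);
};

static struct example_ctrl *example_create(struct example_ops *ops)
{
	struct example_ctrl *ctrl;

	if (!try_module_get(ops->module))	/* transport may be unloading */
		return ERR_PTR(-EBUSY);

	ctrl = ops->create_ctrl();		/* safe to call: module is pinned */

	module_put(ops->module);		/* dropped on success and on error */
	return ctrl;
}
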
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 9ba614953607..25b19f722f5b 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
108 * fabric implementation of NVMe fabrics. 108 * fabric implementation of NVMe fabrics.
109 * @entry: Used by the fabrics library to add the new 109 * @entry: Used by the fabrics library to add the new
110 * registration entry to its linked-list internal tree. 110 * registration entry to its linked-list internal tree.
111 * @module: Transport module reference
111 * @name: Name of the NVMe fabric driver implementation. 112 * @name: Name of the NVMe fabric driver implementation.
112 * @required_opts: sysfs command-line options that must be specified 113 * @required_opts: sysfs command-line options that must be specified
113 * when adding a new NVMe controller. 114 * when adding a new NVMe controller.
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
126 */ 127 */
127struct nvmf_transport_ops { 128struct nvmf_transport_ops {
128 struct list_head entry; 129 struct list_head entry;
130 struct module *module;
129 const char *name; 131 const char *name;
130 int required_opts; 132 int required_opts;
131 int allowed_opts; 133 int allowed_opts;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 794e66e4aa20..99bf51c7e513 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
2921 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); 2921 __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
2922 nvme_fc_free_queue(&ctrl->queues[0]); 2922 nvme_fc_free_queue(&ctrl->queues[0]);
2923 2923
2924 /* re-enable the admin_q so anything new can fast fail */
2925 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
2926
2924 nvme_fc_ctlr_inactive_on_rport(ctrl); 2927 nvme_fc_ctlr_inactive_on_rport(ctrl);
2925} 2928}
2926 2929
@@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
2935 * waiting for io to terminate 2938 * waiting for io to terminate
2936 */ 2939 */
2937 nvme_fc_delete_association(ctrl); 2940 nvme_fc_delete_association(ctrl);
2941
2942 /* resume the io queues so that things will fast fail */
2943 nvme_start_queues(nctrl);
2938} 2944}
2939 2945
2940static void 2946static void
@@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
3380 3386
3381static struct nvmf_transport_ops nvme_fc_transport = { 3387static struct nvmf_transport_ops nvme_fc_transport = {
3382 .name = "fc", 3388 .name = "fc",
3389 .module = THIS_MODULE,
3383 .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, 3390 .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
3384 .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, 3391 .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
3385 .create_ctrl = nvme_fc_create_ctrl, 3392 .create_ctrl = nvme_fc_create_ctrl,
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba3d7f3349e5..50ef71ee3d86 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -31,27 +31,10 @@
31 31
32enum nvme_nvm_admin_opcode { 32enum nvme_nvm_admin_opcode {
33 nvme_nvm_admin_identity = 0xe2, 33 nvme_nvm_admin_identity = 0xe2,
34 nvme_nvm_admin_get_l2p_tbl = 0xea,
35 nvme_nvm_admin_get_bb_tbl = 0xf2, 34 nvme_nvm_admin_get_bb_tbl = 0xf2,
36 nvme_nvm_admin_set_bb_tbl = 0xf1, 35 nvme_nvm_admin_set_bb_tbl = 0xf1,
37}; 36};
38 37
39struct nvme_nvm_hb_rw {
40 __u8 opcode;
41 __u8 flags;
42 __u16 command_id;
43 __le32 nsid;
44 __u64 rsvd2;
45 __le64 metadata;
46 __le64 prp1;
47 __le64 prp2;
48 __le64 spba;
49 __le16 length;
50 __le16 control;
51 __le32 dsmgmt;
52 __le64 slba;
53};
54
55struct nvme_nvm_ph_rw { 38struct nvme_nvm_ph_rw {
56 __u8 opcode; 39 __u8 opcode;
57 __u8 flags; 40 __u8 flags;
@@ -80,19 +63,6 @@ struct nvme_nvm_identity {
80 __u32 rsvd11[5]; 63 __u32 rsvd11[5];
81}; 64};
82 65
83struct nvme_nvm_l2ptbl {
84 __u8 opcode;
85 __u8 flags;
86 __u16 command_id;
87 __le32 nsid;
88 __le32 cdw2[4];
89 __le64 prp1;
90 __le64 prp2;
91 __le64 slba;
92 __le32 nlb;
93 __le16 cdw14[6];
94};
95
96struct nvme_nvm_getbbtbl { 66struct nvme_nvm_getbbtbl {
97 __u8 opcode; 67 __u8 opcode;
98 __u8 flags; 68 __u8 flags;
@@ -139,9 +109,7 @@ struct nvme_nvm_command {
139 union { 109 union {
140 struct nvme_common_command common; 110 struct nvme_common_command common;
141 struct nvme_nvm_identity identity; 111 struct nvme_nvm_identity identity;
142 struct nvme_nvm_hb_rw hb_rw;
143 struct nvme_nvm_ph_rw ph_rw; 112 struct nvme_nvm_ph_rw ph_rw;
144 struct nvme_nvm_l2ptbl l2p;
145 struct nvme_nvm_getbbtbl get_bb; 113 struct nvme_nvm_getbbtbl get_bb;
146 struct nvme_nvm_setbbtbl set_bb; 114 struct nvme_nvm_setbbtbl set_bb;
147 struct nvme_nvm_erase_blk erase; 115 struct nvme_nvm_erase_blk erase;
@@ -167,7 +135,7 @@ struct nvme_nvm_id_group {
167 __u8 num_lun; 135 __u8 num_lun;
168 __u8 num_pln; 136 __u8 num_pln;
169 __u8 rsvd1; 137 __u8 rsvd1;
170 __le16 num_blk; 138 __le16 num_chk;
171 __le16 num_pg; 139 __le16 num_pg;
172 __le16 fpg_sz; 140 __le16 fpg_sz;
173 __le16 csecs; 141 __le16 csecs;
@@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl {
234static inline void _nvme_nvm_check_size(void) 202static inline void _nvme_nvm_check_size(void)
235{ 203{
236 BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); 204 BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
237 BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64);
238 BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); 205 BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
239 BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); 206 BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
240 BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); 207 BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
241 BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
242 BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); 208 BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
243 BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); 209 BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
244 BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); 210 BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
@@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void)
249static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) 215static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
250{ 216{
251 struct nvme_nvm_id_group *src; 217 struct nvme_nvm_id_group *src;
252 struct nvm_id_group *dst; 218 struct nvm_id_group *grp;
219 int sec_per_pg, sec_per_pl, pg_per_blk;
253 220
254 if (nvme_nvm_id->cgrps != 1) 221 if (nvme_nvm_id->cgrps != 1)
255 return -EINVAL; 222 return -EINVAL;
256 223
257 src = &nvme_nvm_id->groups[0]; 224 src = &nvme_nvm_id->groups[0];
258 dst = &nvm_id->grp; 225 grp = &nvm_id->grp;
259 226
260 dst->mtype = src->mtype; 227 grp->mtype = src->mtype;
261 dst->fmtype = src->fmtype; 228 grp->fmtype = src->fmtype;
262 dst->num_ch = src->num_ch; 229
263 dst->num_lun = src->num_lun; 230 grp->num_ch = src->num_ch;
264 dst->num_pln = src->num_pln; 231 grp->num_lun = src->num_lun;
265 232
266 dst->num_pg = le16_to_cpu(src->num_pg); 233 grp->num_chk = le16_to_cpu(src->num_chk);
267 dst->num_blk = le16_to_cpu(src->num_blk); 234 grp->csecs = le16_to_cpu(src->csecs);
268 dst->fpg_sz = le16_to_cpu(src->fpg_sz); 235 grp->sos = le16_to_cpu(src->sos);
269 dst->csecs = le16_to_cpu(src->csecs); 236
270 dst->sos = le16_to_cpu(src->sos); 237 pg_per_blk = le16_to_cpu(src->num_pg);
271 238 sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
272 dst->trdt = le32_to_cpu(src->trdt); 239 sec_per_pl = sec_per_pg * src->num_pln;
273 dst->trdm = le32_to_cpu(src->trdm); 240 grp->clba = sec_per_pl * pg_per_blk;
274 dst->tprt = le32_to_cpu(src->tprt); 241 grp->ws_per_chk = pg_per_blk;
275 dst->tprm = le32_to_cpu(src->tprm);
276 dst->tbet = le32_to_cpu(src->tbet);
277 dst->tbem = le32_to_cpu(src->tbem);
278 dst->mpos = le32_to_cpu(src->mpos);
279 dst->mccap = le32_to_cpu(src->mccap);
280
281 dst->cpar = le16_to_cpu(src->cpar);
282
283 if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
284 memcpy(dst->lptbl.id, src->lptbl.id, 8);
285 dst->lptbl.mlc.num_pairs =
286 le16_to_cpu(src->lptbl.mlc.num_pairs);
287
288 if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
289 pr_err("nvm: number of MLC pairs not supported\n");
290 return -EINVAL;
291 }
292 242
293 memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, 243 grp->mpos = le32_to_cpu(src->mpos);
294 dst->lptbl.mlc.num_pairs); 244 grp->cpar = le16_to_cpu(src->cpar);
245 grp->mccap = le32_to_cpu(src->mccap);
246
247 grp->ws_opt = grp->ws_min = sec_per_pg;
248 grp->ws_seq = NVM_IO_SNGL_ACCESS;
249
250 if (grp->mpos & 0x020202) {
251 grp->ws_seq = NVM_IO_DUAL_ACCESS;
252 grp->ws_opt <<= 1;
253 } else if (grp->mpos & 0x040404) {
254 grp->ws_seq = NVM_IO_QUAD_ACCESS;
255 grp->ws_opt <<= 2;
295 } 256 }
296 257
258 grp->trdt = le32_to_cpu(src->trdt);
259 grp->trdm = le32_to_cpu(src->trdm);
260 grp->tprt = le32_to_cpu(src->tprt);
261 grp->tprm = le32_to_cpu(src->tprm);
262 grp->tbet = le32_to_cpu(src->tbet);
263 grp->tbem = le32_to_cpu(src->tbem);
264
265 /* 1.2 compatibility */
266 grp->num_pln = src->num_pln;
267 grp->num_pg = le16_to_cpu(src->num_pg);
268 grp->fpg_sz = le16_to_cpu(src->fpg_sz);
269
297 return 0; 270 return 0;
298} 271}
299 272
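
init_grps() now derives the chunk-oriented geometry (clba, ws_min/ws_opt, ws_per_chk) from the 1.2 identify fields instead of copying every field through, keeping num_pln/num_pg/fpg_sz only for 1.2 compatibility. A standalone arithmetic sketch with made-up but plausible 1.2 values shows how the derived numbers fall out:

/* Standalone arithmetic sketch of the derivation above, using made-up but
 * plausible 1.2 geometry values; compile and run as a normal C program. */
#include <stdio.h>

int main(void)
{
	unsigned csecs   = 4096;	/* sector size reported by the device */
	unsigned fpg_sz  = 16384;	/* flash page size in bytes */
	unsigned num_pln = 4;		/* planes per LUN */
	unsigned num_pg  = 512;		/* pages per block */
	unsigned mpos    = 0x020202;	/* multi-plane: dual-plane access bits */

	unsigned sec_per_pg = fpg_sz / csecs;		/* 4 */
	unsigned sec_per_pl = sec_per_pg * num_pln;	/* 16 */
	unsigned clba       = sec_per_pl * num_pg;	/* 8192 sectors per chunk */
	unsigned ws_per_chk = num_pg;			/* 512 */
	unsigned ws_min     = sec_per_pg;		/* 4 */
	unsigned ws_opt     = sec_per_pg;

	if (mpos & 0x020202)		/* dual-plane programming supported */
		ws_opt <<= 1;		/* 8 */
	else if (mpos & 0x040404)	/* quad-plane */
		ws_opt <<= 2;

	printf("clba=%u ws_per_chk=%u ws_min=%u ws_opt=%u\n",
	       clba, ws_per_chk, ws_min, ws_opt);
	return 0;
}
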
@@ -332,62 +305,6 @@ out:
332 return ret; 305 return ret;
333} 306}
334 307
335static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
336 nvm_l2p_update_fn *update_l2p, void *priv)
337{
338 struct nvme_ns *ns = nvmdev->q->queuedata;
339 struct nvme_nvm_command c = {};
340 u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
341 u32 nlb_pr_rq = len / sizeof(u64);
342 u64 cmd_slba = slba;
343 void *entries;
344 int ret = 0;
345
346 c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
347 c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
348 entries = kmalloc(len, GFP_KERNEL);
349 if (!entries)
350 return -ENOMEM;
351
352 while (nlb) {
353 u32 cmd_nlb = min(nlb_pr_rq, nlb);
354 u64 elba = slba + cmd_nlb;
355
356 c.l2p.slba = cpu_to_le64(cmd_slba);
357 c.l2p.nlb = cpu_to_le32(cmd_nlb);
358
359 ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
360 (struct nvme_command *)&c, entries, len);
361 if (ret) {
362 dev_err(ns->ctrl->device,
363 "L2P table transfer failed (%d)\n", ret);
364 ret = -EIO;
365 goto out;
366 }
367
368 if (unlikely(elba > nvmdev->total_secs)) {
369 pr_err("nvm: L2P data from device is out of bounds!\n");
370 ret = -EINVAL;
371 goto out;
372 }
373
374 /* Transform physical address to target address space */
375 nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
376
377 if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
378 ret = -EINTR;
379 goto out;
380 }
381
382 cmd_slba += cmd_nlb;
383 nlb -= cmd_nlb;
384 }
385
386out:
387 kfree(entries);
388 return ret;
389}
390
391static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, 308static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
392 u8 *blks) 309 u8 *blks)
393{ 310{
@@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
397 struct nvme_ctrl *ctrl = ns->ctrl; 314 struct nvme_ctrl *ctrl = ns->ctrl;
398 struct nvme_nvm_command c = {}; 315 struct nvme_nvm_command c = {};
399 struct nvme_nvm_bb_tbl *bb_tbl; 316 struct nvme_nvm_bb_tbl *bb_tbl;
400 int nr_blks = geo->blks_per_lun * geo->plane_mode; 317 int nr_blks = geo->nr_chks * geo->plane_mode;
401 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; 318 int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
402 int ret = 0; 319 int ret = 0;
403 320
@@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
438 goto out; 355 goto out;
439 } 356 }
440 357
441 memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); 358 memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
442out: 359out:
443 kfree(bb_tbl); 360 kfree(bb_tbl);
444 return ret; 361 return ret;
@@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
474 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); 391 c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
475 c->ph_rw.control = cpu_to_le16(rqd->flags); 392 c->ph_rw.control = cpu_to_le16(rqd->flags);
476 c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); 393 c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
477
478 if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
479 c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
480 rqd->bio->bi_iter.bi_sector));
481} 394}
482 395
483static void nvme_nvm_end_io(struct request *rq, blk_status_t status) 396static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
@@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr,
597static struct nvm_dev_ops nvme_nvm_dev_ops = { 510static struct nvm_dev_ops nvme_nvm_dev_ops = {
598 .identity = nvme_nvm_identity, 511 .identity = nvme_nvm_identity,
599 512
600 .get_l2p_tbl = nvme_nvm_get_l2p_tbl,
601
602 .get_bb_tbl = nvme_nvm_get_bb_tbl, 513 .get_bb_tbl = nvme_nvm_get_bb_tbl,
603 .set_bb_tbl = nvme_nvm_set_bb_tbl, 514 .set_bb_tbl = nvme_nvm_set_bb_tbl,
604 515
@@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
883 } else if (strcmp(attr->name, "num_planes") == 0) { 794 } else if (strcmp(attr->name, "num_planes") == 0) {
884 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); 795 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
885 } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ 796 } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
886 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); 797 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
887 } else if (strcmp(attr->name, "num_pages") == 0) { 798 } else if (strcmp(attr->name, "num_pages") == 0) {
888 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); 799 return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
889 } else if (strcmp(attr->name, "page_size") == 0) { 800 } else if (strcmp(attr->name, "page_size") == 0) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1218a9fca846..3b211d9e58b8 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
33 kblockd_schedule_work(&ns->head->requeue_work); 33 kblockd_schedule_work(&ns->head->requeue_work);
34} 34}
35 35
36bool nvme_req_needs_failover(struct request *req) 36bool nvme_req_needs_failover(struct request *req, blk_status_t error)
37{ 37{
38 if (!(req->cmd_flags & REQ_NVME_MPATH)) 38 if (!(req->cmd_flags & REQ_NVME_MPATH))
39 return false; 39 return false;
40 40 return blk_path_error(error);
41 switch (nvme_req(req)->status & 0x7ff) {
42 /*
43 * Generic command status:
44 */
45 case NVME_SC_INVALID_OPCODE:
46 case NVME_SC_INVALID_FIELD:
47 case NVME_SC_INVALID_NS:
48 case NVME_SC_LBA_RANGE:
49 case NVME_SC_CAP_EXCEEDED:
50 case NVME_SC_RESERVATION_CONFLICT:
51 return false;
52
53 /*
54 * I/O command set specific error. Unfortunately these values are
55 * reused for fabrics commands, but those should never get here.
56 */
57 case NVME_SC_BAD_ATTRIBUTES:
58 case NVME_SC_INVALID_PI:
59 case NVME_SC_READ_ONLY:
60 case NVME_SC_ONCS_NOT_SUPPORTED:
61 WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
62 nvme_fabrics_command);
63 return false;
64
65 /*
66 * Media and Data Integrity Errors:
67 */
68 case NVME_SC_WRITE_FAULT:
69 case NVME_SC_READ_ERROR:
70 case NVME_SC_GUARD_CHECK:
71 case NVME_SC_APPTAG_CHECK:
72 case NVME_SC_REFTAG_CHECK:
73 case NVME_SC_COMPARE_FAILED:
74 case NVME_SC_ACCESS_DENIED:
75 case NVME_SC_UNWRITTEN_BLOCK:
76 return false;
77 }
78
79 /* Everything else could be a path failure, so should be retried */
80 return true;
81} 41}
82 42
83void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 43void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
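
The per-status switch in nvme_req_needs_failover() collapses into a single blk_path_error() check on the blk_status_t that nvme_complete_rq() already computed via nvme_error_status(). blk_path_error() is introduced elsewhere in this series (include/linux/blk_types.h); reconstructed from memory and therefore only a sketch rather than an authoritative quote, it amounts to:

/* Rough reconstruction of blk_path_error() as used above; treat this as a
 * sketch, not the definitive header contents. */
static inline bool blk_path_error(blk_status_t error)
{
	switch (error) {
	case BLK_STS_NOTSUPP:
	case BLK_STS_NOSPC:
	case BLK_STS_TARGET:
	case BLK_STS_NEXUS:
	case BLK_STS_MEDIUM:
	case BLK_STS_PROTECTION:
		return false;
	}

	/* anything else could be a path failure, so retry on another path */
	return true;
}
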
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a00eabd06427..8e4550fa08f8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -32,6 +32,8 @@ extern unsigned int admin_timeout;
32#define NVME_KATO_GRACE 10 32#define NVME_KATO_GRACE 10
33 33
34extern struct workqueue_struct *nvme_wq; 34extern struct workqueue_struct *nvme_wq;
35extern struct workqueue_struct *nvme_reset_wq;
36extern struct workqueue_struct *nvme_delete_wq;
35 37
36enum { 38enum {
37 NVME_NS_LBA = 0, 39 NVME_NS_LBA = 0,
@@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
119enum nvme_ctrl_state { 121enum nvme_ctrl_state {
120 NVME_CTRL_NEW, 122 NVME_CTRL_NEW,
121 NVME_CTRL_LIVE, 123 NVME_CTRL_LIVE,
124 NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
122 NVME_CTRL_RESETTING, 125 NVME_CTRL_RESETTING,
123 NVME_CTRL_RECONNECTING, 126 NVME_CTRL_RECONNECTING,
124 NVME_CTRL_DELETING, 127 NVME_CTRL_DELETING,
@@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
393void nvme_start_keep_alive(struct nvme_ctrl *ctrl); 396void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
394void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 397void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
395int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 398int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
399int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
396int nvme_delete_ctrl(struct nvme_ctrl *ctrl); 400int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
397int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); 401int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
398 402
@@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
401 405
402#ifdef CONFIG_NVME_MULTIPATH 406#ifdef CONFIG_NVME_MULTIPATH
403void nvme_failover_req(struct request *req); 407void nvme_failover_req(struct request *req);
404bool nvme_req_needs_failover(struct request *req); 408bool nvme_req_needs_failover(struct request *req, blk_status_t error);
405void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); 409void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
406int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); 410int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
407void nvme_mpath_add_disk(struct nvme_ns_head *head); 411void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
430static inline void nvme_failover_req(struct request *req) 434static inline void nvme_failover_req(struct request *req)
431{ 435{
432} 436}
433static inline bool nvme_req_needs_failover(struct request *req) 437static inline bool nvme_req_needs_failover(struct request *req,
438 blk_status_t error)
434{ 439{
435 return false; 440 return false;
436} 441}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4276ebfff22b..6fe7af00a1f4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
75 * Represents an NVM Express device. Each nvme_dev is a PCI function. 75 * Represents an NVM Express device. Each nvme_dev is a PCI function.
76 */ 76 */
77struct nvme_dev { 77struct nvme_dev {
78 struct nvme_queue **queues; 78 struct nvme_queue *queues;
79 struct blk_mq_tag_set tagset; 79 struct blk_mq_tag_set tagset;
80 struct blk_mq_tag_set admin_tagset; 80 struct blk_mq_tag_set admin_tagset;
81 u32 __iomem *dbs; 81 u32 __iomem *dbs;
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
365 unsigned int hctx_idx) 365 unsigned int hctx_idx)
366{ 366{
367 struct nvme_dev *dev = data; 367 struct nvme_dev *dev = data;
368 struct nvme_queue *nvmeq = dev->queues[0]; 368 struct nvme_queue *nvmeq = &dev->queues[0];
369 369
370 WARN_ON(hctx_idx != 0); 370 WARN_ON(hctx_idx != 0);
371 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); 371 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
387 unsigned int hctx_idx) 387 unsigned int hctx_idx)
388{ 388{
389 struct nvme_dev *dev = data; 389 struct nvme_dev *dev = data;
390 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 390 struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
391 391
392 if (!nvmeq->tags) 392 if (!nvmeq->tags)
393 nvmeq->tags = &dev->tagset.tags[hctx_idx]; 393 nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
403 struct nvme_dev *dev = set->driver_data; 403 struct nvme_dev *dev = set->driver_data;
404 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 404 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
405 int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; 405 int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
406 struct nvme_queue *nvmeq = dev->queues[queue_idx]; 406 struct nvme_queue *nvmeq = &dev->queues[queue_idx];
407 407
408 BUG_ON(!nvmeq); 408 BUG_ON(!nvmeq);
409 iod->nvmeq = nvmeq; 409 iod->nvmeq = nvmeq;
@@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1044static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) 1044static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
1045{ 1045{
1046 struct nvme_dev *dev = to_nvme_dev(ctrl); 1046 struct nvme_dev *dev = to_nvme_dev(ctrl);
1047 struct nvme_queue *nvmeq = dev->queues[0]; 1047 struct nvme_queue *nvmeq = &dev->queues[0];
1048 struct nvme_command c; 1048 struct nvme_command c;
1049 1049
1050 memset(&c, 0, sizeof(c)); 1050 memset(&c, 0, sizeof(c));
@@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
1138 */ 1138 */
1139 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); 1139 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
1140 1140
1141 /* If there is a reset ongoing, we shouldn't reset again. */ 1141 /* If there is a reset/reinit ongoing, we shouldn't reset again. */
1142 if (dev->ctrl.state == NVME_CTRL_RESETTING) 1142 switch (dev->ctrl.state) {
1143 case NVME_CTRL_RESETTING:
1144 case NVME_CTRL_RECONNECTING:
1143 return false; 1145 return false;
1146 default:
1147 break;
1148 }
1144 1149
1145 /* We shouldn't reset unless the controller is on fatal error state 1150 /* We shouldn't reset unless the controller is on fatal error state
1146 * _or_ if we lost the communication with it. 1151 * _or_ if we lost the communication with it.
@@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
1280 if (nvmeq->sq_cmds) 1285 if (nvmeq->sq_cmds)
1281 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1286 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1282 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1287 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1283 kfree(nvmeq);
1284} 1288}
1285 1289
1286static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1290static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1288 int i; 1292 int i;
1289 1293
1290 for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { 1294 for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
1291 struct nvme_queue *nvmeq = dev->queues[i];
1292 dev->ctrl.queue_count--; 1295 dev->ctrl.queue_count--;
1293 dev->queues[i] = NULL; 1296 nvme_free_queue(&dev->queues[i]);
1294 nvme_free_queue(nvmeq);
1295 } 1297 }
1296} 1298}
1297 1299
@@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1323 1325
1324static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) 1326static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
1325{ 1327{
1326 struct nvme_queue *nvmeq = dev->queues[0]; 1328 struct nvme_queue *nvmeq = &dev->queues[0];
1327
1328 if (!nvmeq)
1329 return;
1330 if (nvme_suspend_queue(nvmeq))
1331 return;
1332 1329
1333 if (shutdown) 1330 if (shutdown)
1334 nvme_shutdown_ctrl(&dev->ctrl); 1331 nvme_shutdown_ctrl(&dev->ctrl);
@@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1367static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1364static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1368 int qid, int depth) 1365 int qid, int depth)
1369{ 1366{
1370 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { 1367 if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1371 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), 1368 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
1372 dev->ctrl.page_size); 1369 dev->ctrl.page_size);
1373 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; 1370 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
@@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1382 return 0; 1379 return 0;
1383} 1380}
1384 1381
1385static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1382static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
1386 int depth, int node) 1383 int depth, int node)
1387{ 1384{
1388 struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, 1385 struct nvme_queue *nvmeq = &dev->queues[qid];
1389 node); 1386
1390 if (!nvmeq) 1387 if (dev->ctrl.queue_count > qid)
1391 return NULL; 1388 return 0;
1392 1389
1393 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), 1390 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
1394 &nvmeq->cq_dma_addr, GFP_KERNEL); 1391 &nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1407 nvmeq->q_depth = depth; 1404 nvmeq->q_depth = depth;
1408 nvmeq->qid = qid; 1405 nvmeq->qid = qid;
1409 nvmeq->cq_vector = -1; 1406 nvmeq->cq_vector = -1;
1410 dev->queues[qid] = nvmeq;
1411 dev->ctrl.queue_count++; 1407 dev->ctrl.queue_count++;
1412 1408
1413 return nvmeq; 1409 return 0;
1414 1410
1415 free_cqdma: 1411 free_cqdma:
1416 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1412 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1417 nvmeq->cq_dma_addr); 1413 nvmeq->cq_dma_addr);
1418 free_nvmeq: 1414 free_nvmeq:
1419 kfree(nvmeq); 1415 return -ENOMEM;
1420 return NULL;
1421} 1416}
1422 1417
1423static int queue_request_irq(struct nvme_queue *nvmeq) 1418static int queue_request_irq(struct nvme_queue *nvmeq)
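
The pci.c hunks above and below replace the array of nvme_queue pointers with one contiguous array of struct nvme_queue inside struct nvme_dev: nvme_alloc_queue() now returns 0/-ENOMEM instead of a pointer, nvme_free_queue() no longer kfree()s anything, and every dev->queues[qid] access becomes &dev->queues[qid]. A standalone toy program contrasting the two layouts (hypothetical struct; the real one is struct nvme_queue):

/* Standalone toy illustrating the data-layout change: an array of pointers
 * with one allocation per element, versus one contiguous array of structs. */
#include <stdio.h>
#include <stdlib.h>

struct toy_queue { int qid; int depth; };

int main(void)
{
	int nr = 4, i;

	/* before: nr + 1 allocations, and a pointer chase on every access */
	struct toy_queue **ptrs = calloc(nr, sizeof(*ptrs));
	for (i = 0; i < nr; i++)
		ptrs[i] = calloc(1, sizeof(**ptrs));

	/* after: one allocation up front, queues addressed as &queues[i] */
	struct toy_queue *queues = calloc(nr, sizeof(*queues));
	queues[2].qid = 2;
	printf("qid of queue 2: %d\n", (&queues[2])->qid);

	for (i = 0; i < nr; i++)
		free(ptrs[i]);
	free(ptrs);
	free(queues);
	return 0;
}
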
@@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
1590 if (result < 0) 1585 if (result < 0)
1591 return result; 1586 return result;
1592 1587
1593 nvmeq = dev->queues[0]; 1588 result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
1594 if (!nvmeq) { 1589 dev_to_node(dev->dev));
1595 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 1590 if (result)
1596 dev_to_node(dev->dev)); 1591 return result;
1597 if (!nvmeq)
1598 return -ENOMEM;
1599 }
1600 1592
1593 nvmeq = &dev->queues[0];
1601 aqa = nvmeq->q_depth - 1; 1594 aqa = nvmeq->q_depth - 1;
1602 aqa |= aqa << 16; 1595 aqa |= aqa << 16;
1603 1596
@@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
1627 1620
1628 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 1621 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
1629 /* vector == qid - 1, match nvme_create_queue */ 1622 /* vector == qid - 1, match nvme_create_queue */
1630 if (!nvme_alloc_queue(dev, i, dev->q_depth, 1623 if (nvme_alloc_queue(dev, i, dev->q_depth,
1631 pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { 1624 pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
1632 ret = -ENOMEM; 1625 ret = -ENOMEM;
1633 break; 1626 break;
@@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
1636 1629
1637 max = min(dev->max_qid, dev->ctrl.queue_count - 1); 1630 max = min(dev->max_qid, dev->ctrl.queue_count - 1);
1638 for (i = dev->online_queues; i <= max; i++) { 1631 for (i = dev->online_queues; i <= max; i++) {
1639 ret = nvme_create_queue(dev->queues[i], i); 1632 ret = nvme_create_queue(&dev->queues[i], i);
1640 if (ret) 1633 if (ret)
1641 break; 1634 break;
1642 } 1635 }
1643 1636
1644 /* 1637 /*
1645 * Ignore failing Create SQ/CQ commands, we can continue with less 1638 * Ignore failing Create SQ/CQ commands, we can continue with less
1646 * than the desired aount of queues, and even a controller without 1639 * than the desired amount of queues, and even a controller without
1647 * I/O queues an still be used to issue admin commands. This might 1640 * I/O queues can still be used to issue admin commands. This might
1648 * be useful to upgrade a buggy firmware for example. 1641 * be useful to upgrade a buggy firmware for example.
1649 */ 1642 */
1650 return ret >= 0 ? 0 : ret; 1643 return ret >= 0 ? 0 : ret;
@@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev,
1661} 1654}
1662static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); 1655static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
1663 1656
1664static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 1657static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
1665{ 1658{
1666 u64 szu, size, offset; 1659 u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
1660
1661 return 1ULL << (12 + 4 * szu);
1662}
1663
1664static u32 nvme_cmb_size(struct nvme_dev *dev)
1665{
1666 return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
1667}
1668
1669static void nvme_map_cmb(struct nvme_dev *dev)
1670{
1671 u64 size, offset;
1667 resource_size_t bar_size; 1672 resource_size_t bar_size;
1668 struct pci_dev *pdev = to_pci_dev(dev->dev); 1673 struct pci_dev *pdev = to_pci_dev(dev->dev);
1669 void __iomem *cmb;
1670 int bar; 1674 int bar;
1671 1675
1672 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); 1676 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
1673 if (!(NVME_CMB_SZ(dev->cmbsz))) 1677 if (!dev->cmbsz)
1674 return NULL; 1678 return;
1675 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 1679 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1676 1680
1677 if (!use_cmb_sqes) 1681 if (!use_cmb_sqes)
1678 return NULL; 1682 return;
1679 1683
1680 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 1684 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
1681 size = szu * NVME_CMB_SZ(dev->cmbsz); 1685 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1682 offset = szu * NVME_CMB_OFST(dev->cmbloc);
1683 bar = NVME_CMB_BIR(dev->cmbloc); 1686 bar = NVME_CMB_BIR(dev->cmbloc);
1684 bar_size = pci_resource_len(pdev, bar); 1687 bar_size = pci_resource_len(pdev, bar);
1685 1688
1686 if (offset > bar_size) 1689 if (offset > bar_size)
1687 return NULL; 1690 return;
1688 1691
1689 /* 1692 /*
1690 * Controllers may support a CMB size larger than their BAR, 1693 * Controllers may support a CMB size larger than their BAR,
@@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
1694 if (size > bar_size - offset) 1697 if (size > bar_size - offset)
1695 size = bar_size - offset; 1698 size = bar_size - offset;
1696 1699
1697 cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); 1700 dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
1698 if (!cmb) 1701 if (!dev->cmb)
1699 return NULL; 1702 return;
1700
1701 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; 1703 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
1702 dev->cmb_size = size; 1704 dev->cmb_size = size;
1703 return cmb; 1705
1706 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
1707 &dev_attr_cmb.attr, NULL))
1708 dev_warn(dev->ctrl.device,
1709 "failed to add sysfs attribute for CMB\n");
1704} 1710}
1705 1711
1706static inline void nvme_release_cmb(struct nvme_dev *dev) 1712static inline void nvme_release_cmb(struct nvme_dev *dev)
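
nvme_map_cmb() now decodes the CMB registers through two small helpers: CMBSZ.SZU selects a unit of 2^(12 + 4*SZU) bytes, CMBSZ.SZ counts how many of those units the controller exposes, and CMBLOC.OFST is expressed in the same unit. A standalone arithmetic example with made-up register fields:

/* Standalone arithmetic example for the CMB decoding above; the register
 * values are made up, the formulas mirror nvme_cmb_size_unit()/nvme_cmb_size(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned szu  = 2;	/* CMBSZ.SZU: size unit selector */
	unsigned sz   = 16;	/* CMBSZ.SZ: number of units */
	unsigned ofst = 4;	/* CMBLOC.OFST: offset in the same unit */

	uint64_t unit   = 1ULL << (12 + 4 * szu);	/* 2^20 = 1 MiB */
	uint64_t size   = unit * sz;			/* 16 MiB of CMB */
	uint64_t offset = unit * ofst;			/* 4 MiB into the BAR */

	printf("unit=%llu size=%llu offset=%llu\n",
	       (unsigned long long)unit,
	       (unsigned long long)size,
	       (unsigned long long)offset);
	return 0;
}
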
@@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
1768 dma_addr_t descs_dma; 1774 dma_addr_t descs_dma;
1769 int i = 0; 1775 int i = 0;
1770 void **bufs; 1776 void **bufs;
1771 u64 size = 0, tmp; 1777 u64 size, tmp;
1772 1778
1773 tmp = (preferred + chunk_size - 1); 1779 tmp = (preferred + chunk_size - 1);
1774 do_div(tmp, chunk_size); 1780 do_div(tmp, chunk_size);
@@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
1851 u64 preferred = (u64)dev->ctrl.hmpre * 4096; 1857 u64 preferred = (u64)dev->ctrl.hmpre * 4096;
1852 u64 min = (u64)dev->ctrl.hmmin * 4096; 1858 u64 min = (u64)dev->ctrl.hmmin * 4096;
1853 u32 enable_bits = NVME_HOST_MEM_ENABLE; 1859 u32 enable_bits = NVME_HOST_MEM_ENABLE;
1854 int ret = 0; 1860 int ret;
1855 1861
1856 preferred = min(preferred, max); 1862 preferred = min(preferred, max);
1857 if (min > max) { 1863 if (min > max) {
@@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
1892 1898
1893static int nvme_setup_io_queues(struct nvme_dev *dev) 1899static int nvme_setup_io_queues(struct nvme_dev *dev)
1894{ 1900{
1895 struct nvme_queue *adminq = dev->queues[0]; 1901 struct nvme_queue *adminq = &dev->queues[0];
1896 struct pci_dev *pdev = to_pci_dev(dev->dev); 1902 struct pci_dev *pdev = to_pci_dev(dev->dev);
1897 int result, nr_io_queues; 1903 int result, nr_io_queues;
1898 unsigned long size; 1904 unsigned long size;
@@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1905 if (nr_io_queues == 0) 1911 if (nr_io_queues == 0)
1906 return 0; 1912 return 0;
1907 1913
1908 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 1914 if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1909 result = nvme_cmb_qdepth(dev, nr_io_queues, 1915 result = nvme_cmb_qdepth(dev, nr_io_queues,
1910 sizeof(struct nvme_command)); 1916 sizeof(struct nvme_command));
1911 if (result > 0) 1917 if (result > 0)
@@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2005 return 0; 2011 return 0;
2006} 2012}
2007 2013
2008static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) 2014static void nvme_disable_io_queues(struct nvme_dev *dev)
2009{ 2015{
2010 int pass; 2016 int pass, queues = dev->online_queues - 1;
2011 unsigned long timeout; 2017 unsigned long timeout;
2012 u8 opcode = nvme_admin_delete_sq; 2018 u8 opcode = nvme_admin_delete_sq;
2013 2019
@@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
2018 retry: 2024 retry:
2019 timeout = ADMIN_TIMEOUT; 2025 timeout = ADMIN_TIMEOUT;
2020 for (; i > 0; i--, sent++) 2026 for (; i > 0; i--, sent++)
2021 if (nvme_delete_queue(dev->queues[i], opcode)) 2027 if (nvme_delete_queue(&dev->queues[i], opcode))
2022 break; 2028 break;
2023 2029
2024 while (sent--) { 2030 while (sent--) {
@@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
2033} 2039}
2034 2040
2035/* 2041/*
2036 * Return: error value if an error occurred setting up the queues or calling 2042 * return error value only when tagset allocation failed
2037 * Identify Device. 0 if these succeeded, even if adding some of the
2038 * namespaces failed. At the moment, these failures are silent. TBD which
2039 * failures should be reported.
2040 */ 2043 */
2041static int nvme_dev_add(struct nvme_dev *dev) 2044static int nvme_dev_add(struct nvme_dev *dev)
2042{ 2045{
2046 int ret;
2047
2043 if (!dev->ctrl.tagset) { 2048 if (!dev->ctrl.tagset) {
2044 dev->tagset.ops = &nvme_mq_ops; 2049 dev->tagset.ops = &nvme_mq_ops;
2045 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2050 dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
2055 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2060 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2056 dev->tagset.driver_data = dev; 2061 dev->tagset.driver_data = dev;
2057 2062
2058 if (blk_mq_alloc_tag_set(&dev->tagset)) 2063 ret = blk_mq_alloc_tag_set(&dev->tagset);
2059 return 0; 2064 if (ret) {
2065 dev_warn(dev->ctrl.device,
2066 "IO queues tagset allocation failed %d\n", ret);
2067 return ret;
2068 }
2060 dev->ctrl.tagset = &dev->tagset; 2069 dev->ctrl.tagset = &dev->tagset;
2061 2070
2062 nvme_dbbuf_set(dev); 2071 nvme_dbbuf_set(dev);
@@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
2122 "set queue depth=%u\n", dev->q_depth); 2131 "set queue depth=%u\n", dev->q_depth);
2123 } 2132 }
2124 2133
2125 /* 2134 nvme_map_cmb(dev);
2126 * CMBs can currently only exist on >=1.2 PCIe devices. We only
2127 * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
2128 * has no name we can pass NULL as final argument to
2129 * sysfs_add_file_to_group.
2130 */
2131
2132 if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
2133 dev->cmb = nvme_map_cmb(dev);
2134 if (dev->cmb) {
2135 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
2136 &dev_attr_cmb.attr, NULL))
2137 dev_warn(dev->ctrl.device,
2138 "failed to add sysfs attribute for CMB\n");
2139 }
2140 }
2141 2135
2142 pci_enable_pcie_error_reporting(pdev); 2136 pci_enable_pcie_error_reporting(pdev);
2143 pci_save_state(pdev); 2137 pci_save_state(pdev);
@@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
2170 2164
2171static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 2165static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2172{ 2166{
2173 int i, queues; 2167 int i;
2174 bool dead = true; 2168 bool dead = true;
2175 struct pci_dev *pdev = to_pci_dev(dev->dev); 2169 struct pci_dev *pdev = to_pci_dev(dev->dev);
2176 2170
@@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2205 } 2199 }
2206 nvme_stop_queues(&dev->ctrl); 2200 nvme_stop_queues(&dev->ctrl);
2207 2201
2208 queues = dev->online_queues - 1; 2202 if (!dead) {
2209 for (i = dev->ctrl.queue_count - 1; i > 0; i--) 2203 nvme_disable_io_queues(dev);
2210 nvme_suspend_queue(dev->queues[i]);
2211
2212 if (dead) {
2213 /* A device might become IO incapable very soon during
2214 * probe, before the admin queue is configured. Thus,
2215 * queue_count can be 0 here.
2216 */
2217 if (dev->ctrl.queue_count)
2218 nvme_suspend_queue(dev->queues[0]);
2219 } else {
2220 nvme_disable_io_queues(dev, queues);
2221 nvme_disable_admin_queue(dev, shutdown); 2204 nvme_disable_admin_queue(dev, shutdown);
2222 } 2205 }
2206 for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
2207 nvme_suspend_queue(&dev->queues[i]);
2208
2223 nvme_pci_disable(dev); 2209 nvme_pci_disable(dev);
2224 2210
2225 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); 2211 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work)
2289 container_of(work, struct nvme_dev, ctrl.reset_work); 2275 container_of(work, struct nvme_dev, ctrl.reset_work);
2290 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); 2276 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2291 int result = -ENODEV; 2277 int result = -ENODEV;
2278 enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
2292 2279
2293 if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) 2280 if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
2294 goto out; 2281 goto out;
@@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work)
2300 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) 2287 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2301 nvme_dev_disable(dev, false); 2288 nvme_dev_disable(dev, false);
2302 2289
2290 /*
2291 * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
2292 * initializing procedure here.
2293 */
2294 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
2295 dev_warn(dev->ctrl.device,
2296 "failed to mark controller RECONNECTING\n");
2297 goto out;
2298 }
2299
2303 result = nvme_pci_enable(dev); 2300 result = nvme_pci_enable(dev);
2304 if (result) 2301 if (result)
2305 goto out; 2302 goto out;
@@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work)
2352 dev_warn(dev->ctrl.device, "IO queues not created\n"); 2349 dev_warn(dev->ctrl.device, "IO queues not created\n");
2353 nvme_kill_queues(&dev->ctrl); 2350 nvme_kill_queues(&dev->ctrl);
2354 nvme_remove_namespaces(&dev->ctrl); 2351 nvme_remove_namespaces(&dev->ctrl);
2352 new_state = NVME_CTRL_ADMIN_ONLY;
2355 } else { 2353 } else {
2356 nvme_start_queues(&dev->ctrl); 2354 nvme_start_queues(&dev->ctrl);
2357 nvme_wait_freeze(&dev->ctrl); 2355 nvme_wait_freeze(&dev->ctrl);
2358 nvme_dev_add(dev); 2356 /* hit this only when allocate tagset fails */
2357 if (nvme_dev_add(dev))
2358 new_state = NVME_CTRL_ADMIN_ONLY;
2359 nvme_unfreeze(&dev->ctrl); 2359 nvme_unfreeze(&dev->ctrl);
2360 } 2360 }
2361 2361
2362 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { 2362 /*
2363 dev_warn(dev->ctrl.device, "failed to mark controller live\n"); 2363 * If only admin queue live, keep it to do further investigation or
2364 * recovery.
2365 */
2366 if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
2367 dev_warn(dev->ctrl.device,
2368 "failed to mark controller state %d\n", new_state);
2364 goto out; 2369 goto out;
2365 } 2370 }
2366 2371
@@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2468 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 2473 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
2469 if (!dev) 2474 if (!dev)
2470 return -ENOMEM; 2475 return -ENOMEM;
2471 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), 2476
2472 GFP_KERNEL, node); 2477 dev->queues = kcalloc_node(num_possible_cpus() + 1,
2478 sizeof(struct nvme_queue), GFP_KERNEL, node);
2473 if (!dev->queues) 2479 if (!dev->queues)
2474 goto free; 2480 goto free;
2475 2481
@@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2496 if (result) 2502 if (result)
2497 goto release_pools; 2503 goto release_pools;
2498 2504
2499 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
2500 dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); 2505 dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
2501 2506
2502 queue_work(nvme_wq, &dev->ctrl.reset_work); 2507 nvme_reset_ctrl(&dev->ctrl);
2508
2503 return 0; 2509 return 0;
2504 2510
2505 release_pools: 2511 release_pools:
@@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev)
2523static void nvme_reset_done(struct pci_dev *pdev) 2529static void nvme_reset_done(struct pci_dev *pdev)
2524{ 2530{
2525 struct nvme_dev *dev = pci_get_drvdata(pdev); 2531 struct nvme_dev *dev = pci_get_drvdata(pdev);
2526 nvme_reset_ctrl(&dev->ctrl); 2532 nvme_reset_ctrl_sync(&dev->ctrl);
2527} 2533}
2528 2534
2529static void nvme_shutdown(struct pci_dev *pdev) 2535static void nvme_shutdown(struct pci_dev *pdev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2a0bba7f50cf..2bc059f7d73c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -66,7 +66,6 @@ struct nvme_rdma_request {
66 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; 66 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
67 u32 num_sge; 67 u32 num_sge;
68 int nents; 68 int nents;
69 bool inline_data;
70 struct ib_reg_wr reg_wr; 69 struct ib_reg_wr reg_wr;
71 struct ib_cqe reg_cqe; 70 struct ib_cqe reg_cqe;
72 struct nvme_rdma_queue *queue; 71 struct nvme_rdma_queue *queue;
@@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1092 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); 1091 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
1093 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; 1092 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1094 1093
1095 req->inline_data = true;
1096 req->num_sge++; 1094 req->num_sge++;
1097 return 0; 1095 return 0;
1098} 1096}
@@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1164 int count, ret; 1162 int count, ret;
1165 1163
1166 req->num_sge = 1; 1164 req->num_sge = 1;
1167 req->inline_data = false;
1168 refcount_set(&req->ref, 2); /* send and recv completions */ 1165 refcount_set(&req->ref, 2); /* send and recv completions */
1169 1166
1170 c->common.flags |= NVME_CMD_SGL_METABUF; 1167 c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2018,6 +2015,7 @@ out_free_ctrl:
2018 2015
2019static struct nvmf_transport_ops nvme_rdma_transport = { 2016static struct nvmf_transport_ops nvme_rdma_transport = {
2020 .name = "rdma", 2017 .name = "rdma",
2018 .module = THIS_MODULE,
2021 .required_opts = NVMF_OPT_TRADDR, 2019 .required_opts = NVMF_OPT_TRADDR,
2022 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2020 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2023 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, 2021 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2040 } 2038 }
2041 mutex_unlock(&nvme_rdma_ctrl_mutex); 2039 mutex_unlock(&nvme_rdma_ctrl_mutex);
2042 2040
2043 flush_workqueue(nvme_wq); 2041 flush_workqueue(nvme_delete_wq);
2044} 2042}
2045 2043
2046static struct ib_client nvme_rdma_ib_client = { 2044static struct ib_client nvme_rdma_ib_client = {
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
new file mode 100644
index 000000000000..41944bbef835
--- /dev/null
+++ b/drivers/nvme/host/trace.c
@@ -0,0 +1,130 @@
1/*
2 * NVM Express device driver tracepoints
3 * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <asm/unaligned.h>
16#include "trace.h"
17
18static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
19{
20 const char *ret = trace_seq_buffer_ptr(p);
21 u16 sqid = get_unaligned_le16(cdw10);
22 u16 qsize = get_unaligned_le16(cdw10 + 2);
23 u16 sq_flags = get_unaligned_le16(cdw10 + 4);
24 u16 cqid = get_unaligned_le16(cdw10 + 6);
25
26
27 trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u",
28 sqid, qsize, sq_flags, cqid);
29 trace_seq_putc(p, 0);
30
31 return ret;
32}
33
34static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
35{
36 const char *ret = trace_seq_buffer_ptr(p);
37 u16 cqid = get_unaligned_le16(cdw10);
38 u16 qsize = get_unaligned_le16(cdw10 + 2);
39 u16 cq_flags = get_unaligned_le16(cdw10 + 4);
40 u16 irq_vector = get_unaligned_le16(cdw10 + 6);
41
42 trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u",
43 cqid, qsize, cq_flags, irq_vector);
44 trace_seq_putc(p, 0);
45
46 return ret;
47}
48
49static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
50{
51 const char *ret = trace_seq_buffer_ptr(p);
52 u8 cns = cdw10[0];
53 u16 ctrlid = get_unaligned_le16(cdw10 + 2);
54
55 trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid);
56 trace_seq_putc(p, 0);
57
58 return ret;
59}
60
61
62
63static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
64{
65 const char *ret = trace_seq_buffer_ptr(p);
66 u64 slba = get_unaligned_le64(cdw10);
67 u16 length = get_unaligned_le16(cdw10 + 8);
68 u16 control = get_unaligned_le16(cdw10 + 10);
69 u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
70 u32 reftag = get_unaligned_le32(cdw10 + 16);
71
72 trace_seq_printf(p,
73 "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
74 slba, length, control, dsmgmt, reftag);
75 trace_seq_putc(p, 0);
76
77 return ret;
78}
79
80static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
81{
82 const char *ret = trace_seq_buffer_ptr(p);
83
84 trace_seq_printf(p, "nr=%u, attributes=%u",
85 get_unaligned_le32(cdw10),
86 get_unaligned_le32(cdw10 + 4));
87 trace_seq_putc(p, 0);
88
89 return ret;
90}
91
92static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
93{
94 const char *ret = trace_seq_buffer_ptr(p);
95
96 trace_seq_printf(p, "cdw10=%*ph", 24, cdw10);
97 trace_seq_putc(p, 0);
98
99 return ret;
100}
101
102const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
103 u8 opcode, u8 *cdw10)
104{
105 switch (opcode) {
106 case nvme_admin_create_sq:
107 return nvme_trace_create_sq(p, cdw10);
108 case nvme_admin_create_cq:
109 return nvme_trace_create_cq(p, cdw10);
110 case nvme_admin_identify:
111 return nvme_trace_admin_identify(p, cdw10);
112 default:
113 return nvme_trace_common(p, cdw10);
114 }
115}
116
117const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
118 u8 opcode, u8 *cdw10)
119{
120 switch (opcode) {
121 case nvme_cmd_read:
122 case nvme_cmd_write:
123 case nvme_cmd_write_zeroes:
124 return nvme_trace_read_write(p, cdw10);
125 case nvme_cmd_dsm:
126 return nvme_trace_dsm(p, cdw10);
127 default:
128 return nvme_trace_common(p, cdw10);
129 }
130}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
new file mode 100644
index 000000000000..ea91fccd1bc0
--- /dev/null
+++ b/drivers/nvme/host/trace.h
@@ -0,0 +1,165 @@
1/*
2 * NVM Express device driver tracepoints
3 * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#undef TRACE_SYSTEM
16#define TRACE_SYSTEM nvme
17
18#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
19#define _TRACE_NVME_H
20
21#include <linux/nvme.h>
22#include <linux/tracepoint.h>
23#include <linux/trace_seq.h>
24
25#include "nvme.h"
26
27#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
28#define show_admin_opcode_name(val) \
29 __print_symbolic(val, \
30 nvme_admin_opcode_name(nvme_admin_delete_sq), \
31 nvme_admin_opcode_name(nvme_admin_create_sq), \
32 nvme_admin_opcode_name(nvme_admin_get_log_page), \
33 nvme_admin_opcode_name(nvme_admin_delete_cq), \
34 nvme_admin_opcode_name(nvme_admin_create_cq), \
35 nvme_admin_opcode_name(nvme_admin_identify), \
36 nvme_admin_opcode_name(nvme_admin_abort_cmd), \
37 nvme_admin_opcode_name(nvme_admin_set_features), \
38 nvme_admin_opcode_name(nvme_admin_get_features), \
39 nvme_admin_opcode_name(nvme_admin_async_event), \
40 nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
41 nvme_admin_opcode_name(nvme_admin_activate_fw), \
42 nvme_admin_opcode_name(nvme_admin_download_fw), \
43 nvme_admin_opcode_name(nvme_admin_ns_attach), \
44 nvme_admin_opcode_name(nvme_admin_keep_alive), \
45 nvme_admin_opcode_name(nvme_admin_directive_send), \
46 nvme_admin_opcode_name(nvme_admin_directive_recv), \
47 nvme_admin_opcode_name(nvme_admin_dbbuf), \
48 nvme_admin_opcode_name(nvme_admin_format_nvm), \
49 nvme_admin_opcode_name(nvme_admin_security_send), \
50 nvme_admin_opcode_name(nvme_admin_security_recv), \
51 nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
52
53const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
54 u8 *cdw10);
55#define __parse_nvme_admin_cmd(opcode, cdw10) \
56 nvme_trace_parse_admin_cmd(p, opcode, cdw10)
57
58#define nvme_opcode_name(opcode) { opcode, #opcode }
59#define show_opcode_name(val) \
60 __print_symbolic(val, \
61 nvme_opcode_name(nvme_cmd_flush), \
62 nvme_opcode_name(nvme_cmd_write), \
63 nvme_opcode_name(nvme_cmd_read), \
64 nvme_opcode_name(nvme_cmd_write_uncor), \
65 nvme_opcode_name(nvme_cmd_compare), \
66 nvme_opcode_name(nvme_cmd_write_zeroes), \
67 nvme_opcode_name(nvme_cmd_dsm), \
68 nvme_opcode_name(nvme_cmd_resv_register), \
69 nvme_opcode_name(nvme_cmd_resv_report), \
70 nvme_opcode_name(nvme_cmd_resv_acquire), \
71 nvme_opcode_name(nvme_cmd_resv_release))
72
73const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
74 u8 *cdw10);
75#define __parse_nvme_cmd(opcode, cdw10) \
76 nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
77
78TRACE_EVENT(nvme_setup_admin_cmd,
79 TP_PROTO(struct nvme_command *cmd),
80 TP_ARGS(cmd),
81 TP_STRUCT__entry(
82 __field(u8, opcode)
83 __field(u8, flags)
84 __field(u16, cid)
85 __field(u64, metadata)
86 __array(u8, cdw10, 24)
87 ),
88 TP_fast_assign(
89 __entry->opcode = cmd->common.opcode;
90 __entry->flags = cmd->common.flags;
91 __entry->cid = cmd->common.command_id;
92 __entry->metadata = le64_to_cpu(cmd->common.metadata);
93 memcpy(__entry->cdw10, cmd->common.cdw10,
94 sizeof(__entry->cdw10));
95 ),
96 TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
97 __entry->cid, __entry->flags, __entry->metadata,
98 show_admin_opcode_name(__entry->opcode),
99 __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
100);
101
102
103TRACE_EVENT(nvme_setup_nvm_cmd,
104 TP_PROTO(int qid, struct nvme_command *cmd),
105 TP_ARGS(qid, cmd),
106 TP_STRUCT__entry(
107 __field(int, qid)
108 __field(u8, opcode)
109 __field(u8, flags)
110 __field(u16, cid)
111 __field(u32, nsid)
112 __field(u64, metadata)
113 __array(u8, cdw10, 24)
114 ),
115 TP_fast_assign(
116 __entry->qid = qid;
117 __entry->opcode = cmd->common.opcode;
118 __entry->flags = cmd->common.flags;
119 __entry->cid = cmd->common.command_id;
120 __entry->nsid = le32_to_cpu(cmd->common.nsid);
121 __entry->metadata = le64_to_cpu(cmd->common.metadata);
122 memcpy(__entry->cdw10, cmd->common.cdw10,
123 sizeof(__entry->cdw10));
124 ),
125 TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
126 __entry->qid, __entry->nsid, __entry->cid,
127 __entry->flags, __entry->metadata,
128 show_opcode_name(__entry->opcode),
129 __parse_nvme_cmd(__entry->opcode, __entry->cdw10))
130);
131
132TRACE_EVENT(nvme_complete_rq,
133 TP_PROTO(struct request *req),
134 TP_ARGS(req),
135 TP_STRUCT__entry(
136 __field(int, qid)
137 __field(int, cid)
138 __field(u64, result)
139 __field(u8, retries)
140 __field(u8, flags)
141 __field(u16, status)
142 ),
143 TP_fast_assign(
144 __entry->qid = req->q->id;
145 __entry->cid = req->tag;
146 __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
147 __entry->retries = nvme_req(req)->retries;
148 __entry->flags = nvme_req(req)->flags;
149 __entry->status = nvme_req(req)->status;
150 ),
151 TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
152 __entry->cid, __entry->qid, __entry->result,
153 __entry->retries, __entry->flags, __entry->status)
154
155);
156
157#endif /* _TRACE_NVME_H */
158
159#undef TRACE_INCLUDE_PATH
160#define TRACE_INCLUDE_PATH .
161#undef TRACE_INCLUDE_FILE
162#define TRACE_INCLUDE_FILE trace
163
164/* This part must be outside protection */
165#include <trace/define_trace.h>
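Editor's note: the TRACE_EVENT() definitions above generate trace_nvme_setup_admin_cmd(), trace_nvme_setup_nvm_cmd() and trace_nvme_complete_rq() wrappers intended to be called from the command setup and completion paths of the host core. A minimal sketch of such call sites follows; the example_* functions and the is_admin parameter are illustrative only and are not the actual core.c hunks (which fall outside this section).

#include <linux/blkdev.h>
#include "trace.h"

/* Illustrative only: emit the setup event while a command is being built. */
static void example_nvme_trace_setup(struct request *req,
				     struct nvme_command *cmd, bool is_admin)
{
	if (is_admin)
		trace_nvme_setup_admin_cmd(cmd);
	else
		trace_nvme_setup_nvm_cmd(req->q->id, cmd);
}

/* Illustrative only: emit the completion event from the rq completion path. */
static void example_nvme_trace_complete(struct request *req)
{
	trace_nvme_complete_rq(req);
}

Once built in, the events appear under the "nvme" trace system in tracefs and can be enabled per event like any other tracepoint.
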
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 03e4ab65fe77..5f4f8b16685f 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -29,6 +29,7 @@ config NVME_TARGET_RDMA
29 tristate "NVMe over Fabrics RDMA target support" 29 tristate "NVMe over Fabrics RDMA target support"
30 depends on INFINIBAND 30 depends on INFINIBAND
31 depends on NVME_TARGET 31 depends on NVME_TARGET
32 select SGL_ALLOC
32 help 33 help
33 This enables the NVMe RDMA target support, which allows exporting NVMe 34 This enables the NVMe RDMA target support, which allows exporting NVMe
34 devices over RDMA. 35 devices over RDMA.
@@ -39,6 +40,7 @@ config NVME_TARGET_FC
39 tristate "NVMe over Fabrics FC target driver" 40 tristate "NVMe over Fabrics FC target driver"
40 depends on NVME_TARGET 41 depends on NVME_TARGET
41 depends on HAS_DMA 42 depends on HAS_DMA
43 select SGL_ALLOC
42 help 44 help
43 This enables the NVMe FC target support, which allows exporting NVMe 45 This enables the NVMe FC target support, which allows exporting NVMe
44 devices over FC. 46 devices over FC.
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b54748ad5f48..0bd737117a80 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
512 req->sg_cnt = 0; 512 req->sg_cnt = 0;
513 req->transfer_len = 0; 513 req->transfer_len = 0;
514 req->rsp->status = 0; 514 req->rsp->status = 0;
515 req->ns = NULL;
515 516
516 /* no support for fused commands yet */ 517 /* no support for fused commands yet */
517 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { 518 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
557void nvmet_req_uninit(struct nvmet_req *req) 558void nvmet_req_uninit(struct nvmet_req *req)
558{ 559{
559 percpu_ref_put(&req->sq->ref); 560 percpu_ref_put(&req->sq->ref);
561 if (req->ns)
562 nvmet_put_namespace(req->ns);
560} 563}
561EXPORT_SYMBOL_GPL(nvmet_req_uninit); 564EXPORT_SYMBOL_GPL(nvmet_req_uninit);
562 565
@@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
830 /* Don't accept keep-alive timeout for discovery controllers */ 833 /* Don't accept keep-alive timeout for discovery controllers */
831 if (kato) { 834 if (kato) {
832 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 835 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
833 goto out_free_sqs; 836 goto out_remove_ida;
834 } 837 }
835 838
836 /* 839 /*
@@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
860 *ctrlp = ctrl; 863 *ctrlp = ctrl;
861 return 0; 864 return 0;
862 865
866out_remove_ida:
867 ida_simple_remove(&cntlid_ida, ctrl->cntlid);
863out_free_sqs: 868out_free_sqs:
864 kfree(ctrl->sqs); 869 kfree(ctrl->sqs);
865out_free_cqs: 870out_free_cqs:
@@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref)
877 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); 882 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
878 struct nvmet_subsys *subsys = ctrl->subsys; 883 struct nvmet_subsys *subsys = ctrl->subsys;
879 884
880 nvmet_stop_keep_alive_timer(ctrl);
881
882 mutex_lock(&subsys->lock); 885 mutex_lock(&subsys->lock);
883 list_del(&ctrl->subsys_entry); 886 list_del(&ctrl->subsys_entry);
884 mutex_unlock(&subsys->lock); 887 mutex_unlock(&subsys->lock);
885 888
889 nvmet_stop_keep_alive_timer(ctrl);
890
886 flush_work(&ctrl->async_event_work); 891 flush_work(&ctrl->async_event_work);
887 cancel_work_sync(&ctrl->fatal_err_work); 892 cancel_work_sync(&ctrl->fatal_err_work);
888 893
889 ida_simple_remove(&cntlid_ida, ctrl->cntlid); 894 ida_simple_remove(&cntlid_ida, ctrl->cntlid);
890 nvmet_subsys_put(subsys);
891 895
892 kfree(ctrl->sqs); 896 kfree(ctrl->sqs);
893 kfree(ctrl->cqs); 897 kfree(ctrl->cqs);
894 kfree(ctrl); 898 kfree(ctrl);
899
900 nvmet_subsys_put(subsys);
895} 901}
896 902
897void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) 903void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index db3bf6b8bf9e..19e9e42ae943 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
225 goto out_ctrl_put; 225 goto out_ctrl_put;
226 } 226 }
227 227
228 pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); 228 pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
229 229
230out: 230out:
231 kfree(d); 231 kfree(d);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 5fd86039e353..9b39a6cb1935 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1697,31 +1697,12 @@ static int
1697nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) 1697nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1698{ 1698{
1699 struct scatterlist *sg; 1699 struct scatterlist *sg;
1700 struct page *page;
1701 unsigned int nent; 1700 unsigned int nent;
1702 u32 page_len, length;
1703 int i = 0;
1704 1701
1705 length = fod->req.transfer_len; 1702 sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent);
1706 nent = DIV_ROUND_UP(length, PAGE_SIZE);
1707 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
1708 if (!sg) 1703 if (!sg)
1709 goto out; 1704 goto out;
1710 1705
1711 sg_init_table(sg, nent);
1712
1713 while (length) {
1714 page_len = min_t(u32, length, PAGE_SIZE);
1715
1716 page = alloc_page(GFP_KERNEL);
1717 if (!page)
1718 goto out_free_pages;
1719
1720 sg_set_page(&sg[i], page, page_len, 0);
1721 length -= page_len;
1722 i++;
1723 }
1724
1725 fod->data_sg = sg; 1706 fod->data_sg = sg;
1726 fod->data_sg_cnt = nent; 1707 fod->data_sg_cnt = nent;
1727 fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, 1708 fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
@@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1731 1712
1732 return 0; 1713 return 0;
1733 1714
1734out_free_pages:
1735 while (i > 0) {
1736 i--;
1737 __free_page(sg_page(&sg[i]));
1738 }
1739 kfree(sg);
1740 fod->data_sg = NULL;
1741 fod->data_sg_cnt = 0;
1742out: 1715out:
1743 return NVME_SC_INTERNAL; 1716 return NVME_SC_INTERNAL;
1744} 1717}
@@ -1746,18 +1719,13 @@ out:
1746static void 1719static void
1747nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) 1720nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
1748{ 1721{
1749 struct scatterlist *sg;
1750 int count;
1751
1752 if (!fod->data_sg || !fod->data_sg_cnt) 1722 if (!fod->data_sg || !fod->data_sg_cnt)
1753 return; 1723 return;
1754 1724
1755 fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, 1725 fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
1756 ((fod->io_dir == NVMET_FCP_WRITE) ? 1726 ((fod->io_dir == NVMET_FCP_WRITE) ?
1757 DMA_FROM_DEVICE : DMA_TO_DEVICE)); 1727 DMA_FROM_DEVICE : DMA_TO_DEVICE));
1758 for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) 1728 sgl_free(fod->data_sg);
1759 __free_page(sg_page(sg));
1760 kfree(fod->data_sg);
1761 fod->data_sg = NULL; 1729 fod->data_sg = NULL;
1762 fod->data_sg_cnt = 0; 1730 fod->data_sg_cnt = 0;
1763} 1731}
@@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
2522 list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { 2490 list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
2523 if ((tgtport->fc_target_port.node_name == traddr.nn) && 2491 if ((tgtport->fc_target_port.node_name == traddr.nn) &&
2524 (tgtport->fc_target_port.port_name == traddr.pn)) { 2492 (tgtport->fc_target_port.port_name == traddr.pn)) {
2525 /* a FC port can only be 1 nvmet port id */ 2493 tgtport->port = port;
2526 if (!tgtport->port) { 2494 ret = 0;
2527 tgtport->port = port;
2528 port->priv = tgtport;
2529 nvmet_fc_tgtport_get(tgtport);
2530 ret = 0;
2531 } else
2532 ret = -EALREADY;
2533 break; 2495 break;
2534 } 2496 }
2535 } 2497 }
@@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
2540static void 2502static void
2541nvmet_fc_remove_port(struct nvmet_port *port) 2503nvmet_fc_remove_port(struct nvmet_port *port)
2542{ 2504{
2543 struct nvmet_fc_tgtport *tgtport = port->priv; 2505 /* nothing to do */
2544 unsigned long flags;
2545 bool matched = false;
2546
2547 spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
2548 if (tgtport->port == port) {
2549 matched = true;
2550 tgtport->port = NULL;
2551 }
2552 spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
2553
2554 if (matched)
2555 nvmet_fc_tgtport_put(tgtport);
2556} 2506}
2557 2507
2558static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { 2508static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
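Editor's note: the hunks above drop the open-coded page-by-page scatterlist construction in favour of the sgl_alloc()/sgl_free() helpers from lib/scatterlist (pulled in by the SGL_ALLOC select added to the Kconfig earlier in this series). A minimal sketch of the resulting allocate/map/unmap/free pattern; the example_* wrappers are hypothetical and error handling is trimmed.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* Sketch only: build a page-backed SGL covering @len bytes and map it. */
static int example_map_buffer(struct device *dev, u32 len,
			      struct scatterlist **sgp, unsigned int *nentp)
{
	struct scatterlist *sg;
	unsigned int nent;

	sg = sgl_alloc(len, GFP_KERNEL, &nent);	/* replaces the alloc_page() loop */
	if (!sg)
		return -ENOMEM;

	if (!dma_map_sg(dev, sg, nent, DMA_FROM_DEVICE)) {
		sgl_free(sg);			/* frees the pages and the table */
		return -EIO;
	}

	*sgp = sg;
	*nentp = nent;
	return 0;
}

/* Sketch only: undo the above. */
static void example_unmap_buffer(struct device *dev, struct scatterlist *sg,
				 unsigned int nent)
{
	dma_unmap_sg(dev, sg, nent, DMA_FROM_DEVICE);
	sgl_free(sg);
}
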
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 6a018a0bd6ce..34712def81b1 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -204,6 +204,10 @@ struct fcloop_lport {
204 struct completion unreg_done; 204 struct completion unreg_done;
205}; 205};
206 206
207struct fcloop_lport_priv {
208 struct fcloop_lport *lport;
209};
210
207struct fcloop_rport { 211struct fcloop_rport {
208 struct nvme_fc_remote_port *remoteport; 212 struct nvme_fc_remote_port *remoteport;
209 struct nvmet_fc_target_port *targetport; 213 struct nvmet_fc_target_port *targetport;
@@ -238,21 +242,32 @@ struct fcloop_lsreq {
238 int status; 242 int status;
239}; 243};
240 244
245enum {
246 INI_IO_START = 0,
247 INI_IO_ACTIVE = 1,
248 INI_IO_ABORTED = 2,
249 INI_IO_COMPLETED = 3,
250};
251
241struct fcloop_fcpreq { 252struct fcloop_fcpreq {
242 struct fcloop_tport *tport; 253 struct fcloop_tport *tport;
243 struct nvmefc_fcp_req *fcpreq; 254 struct nvmefc_fcp_req *fcpreq;
244 spinlock_t reqlock; 255 spinlock_t reqlock;
245 u16 status; 256 u16 status;
257 u32 inistate;
246 bool active; 258 bool active;
247 bool aborted; 259 bool aborted;
248 struct work_struct work; 260 struct kref ref;
261 struct work_struct fcp_rcv_work;
262 struct work_struct abort_rcv_work;
263 struct work_struct tio_done_work;
249 struct nvmefc_tgt_fcp_req tgt_fcp_req; 264 struct nvmefc_tgt_fcp_req tgt_fcp_req;
250}; 265};
251 266
252struct fcloop_ini_fcpreq { 267struct fcloop_ini_fcpreq {
253 struct nvmefc_fcp_req *fcpreq; 268 struct nvmefc_fcp_req *fcpreq;
254 struct fcloop_fcpreq *tfcp_req; 269 struct fcloop_fcpreq *tfcp_req;
255 struct work_struct iniwork; 270 spinlock_t inilock;
256}; 271};
257 272
258static inline struct fcloop_lsreq * 273static inline struct fcloop_lsreq *
@@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
343 return 0; 358 return 0;
344} 359}
345 360
346/*
347 * FCP IO operation done by initiator abort.
348 * call back up initiator "done" flows.
349 */
350static void 361static void
351fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) 362fcloop_tfcp_req_free(struct kref *ref)
352{ 363{
353 struct fcloop_ini_fcpreq *inireq = 364 struct fcloop_fcpreq *tfcp_req =
354 container_of(work, struct fcloop_ini_fcpreq, iniwork); 365 container_of(ref, struct fcloop_fcpreq, ref);
366
367 kfree(tfcp_req);
368}
369
370static void
371fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
372{
373 kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
374}
375
376static int
377fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
378{
379 return kref_get_unless_zero(&tfcp_req->ref);
380}
381
382static void
383fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
384 struct fcloop_fcpreq *tfcp_req, int status)
385{
386 struct fcloop_ini_fcpreq *inireq = NULL;
387
388 if (fcpreq) {
389 inireq = fcpreq->private;
390 spin_lock(&inireq->inilock);
391 inireq->tfcp_req = NULL;
392 spin_unlock(&inireq->inilock);
393
394 fcpreq->status = status;
395 fcpreq->done(fcpreq);
396 }
397
398 /* release original io reference on tgt struct */
399 fcloop_tfcp_req_put(tfcp_req);
400}
401
402static void
403fcloop_fcp_recv_work(struct work_struct *work)
404{
405 struct fcloop_fcpreq *tfcp_req =
406 container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
407 struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
408 int ret = 0;
409 bool aborted = false;
410
411 spin_lock(&tfcp_req->reqlock);
412 switch (tfcp_req->inistate) {
413 case INI_IO_START:
414 tfcp_req->inistate = INI_IO_ACTIVE;
415 break;
416 case INI_IO_ABORTED:
417 aborted = true;
418 break;
419 default:
420 spin_unlock(&tfcp_req->reqlock);
421 WARN_ON(1);
422 return;
423 }
424 spin_unlock(&tfcp_req->reqlock);
425
426 if (unlikely(aborted))
427 ret = -ECANCELED;
428 else
429 ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
430 &tfcp_req->tgt_fcp_req,
431 fcpreq->cmdaddr, fcpreq->cmdlen);
432 if (ret)
433 fcloop_call_host_done(fcpreq, tfcp_req, ret);
434
435 return;
436}
437
438static void
439fcloop_fcp_abort_recv_work(struct work_struct *work)
440{
441 struct fcloop_fcpreq *tfcp_req =
442 container_of(work, struct fcloop_fcpreq, abort_rcv_work);
443 struct nvmefc_fcp_req *fcpreq;
444 bool completed = false;
445
446 spin_lock(&tfcp_req->reqlock);
447 fcpreq = tfcp_req->fcpreq;
448 switch (tfcp_req->inistate) {
449 case INI_IO_ABORTED:
450 break;
451 case INI_IO_COMPLETED:
452 completed = true;
453 break;
454 default:
455 spin_unlock(&tfcp_req->reqlock);
456 WARN_ON(1);
457 return;
458 }
459 spin_unlock(&tfcp_req->reqlock);
460
461 if (unlikely(completed)) {
462 /* remove reference taken in original abort downcall */
463 fcloop_tfcp_req_put(tfcp_req);
464 return;
465 }
355 466
356 inireq->fcpreq->done(inireq->fcpreq); 467 if (tfcp_req->tport->targetport)
468 nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
469 &tfcp_req->tgt_fcp_req);
470
471 spin_lock(&tfcp_req->reqlock);
472 tfcp_req->fcpreq = NULL;
473 spin_unlock(&tfcp_req->reqlock);
474
475 fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
476 /* call_host_done releases reference for abort downcall */
357} 477}
358 478
359/* 479/*
@@ -364,20 +484,15 @@ static void
364fcloop_tgt_fcprqst_done_work(struct work_struct *work) 484fcloop_tgt_fcprqst_done_work(struct work_struct *work)
365{ 485{
366 struct fcloop_fcpreq *tfcp_req = 486 struct fcloop_fcpreq *tfcp_req =
367 container_of(work, struct fcloop_fcpreq, work); 487 container_of(work, struct fcloop_fcpreq, tio_done_work);
368 struct fcloop_tport *tport = tfcp_req->tport;
369 struct nvmefc_fcp_req *fcpreq; 488 struct nvmefc_fcp_req *fcpreq;
370 489
371 spin_lock(&tfcp_req->reqlock); 490 spin_lock(&tfcp_req->reqlock);
372 fcpreq = tfcp_req->fcpreq; 491 fcpreq = tfcp_req->fcpreq;
492 tfcp_req->inistate = INI_IO_COMPLETED;
373 spin_unlock(&tfcp_req->reqlock); 493 spin_unlock(&tfcp_req->reqlock);
374 494
375 if (tport->remoteport && fcpreq) { 495 fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
376 fcpreq->status = tfcp_req->status;
377 fcpreq->done(fcpreq);
378 }
379
380 kfree(tfcp_req);
381} 496}
382 497
383 498
@@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
390 struct fcloop_rport *rport = remoteport->private; 505 struct fcloop_rport *rport = remoteport->private;
391 struct fcloop_ini_fcpreq *inireq = fcpreq->private; 506 struct fcloop_ini_fcpreq *inireq = fcpreq->private;
392 struct fcloop_fcpreq *tfcp_req; 507 struct fcloop_fcpreq *tfcp_req;
393 int ret = 0;
394 508
395 if (!rport->targetport) 509 if (!rport->targetport)
396 return -ECONNREFUSED; 510 return -ECONNREFUSED;
@@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
401 515
402 inireq->fcpreq = fcpreq; 516 inireq->fcpreq = fcpreq;
403 inireq->tfcp_req = tfcp_req; 517 inireq->tfcp_req = tfcp_req;
404 INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); 518 spin_lock_init(&inireq->inilock);
519
405 tfcp_req->fcpreq = fcpreq; 520 tfcp_req->fcpreq = fcpreq;
406 tfcp_req->tport = rport->targetport->private; 521 tfcp_req->tport = rport->targetport->private;
522 tfcp_req->inistate = INI_IO_START;
407 spin_lock_init(&tfcp_req->reqlock); 523 spin_lock_init(&tfcp_req->reqlock);
408 INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); 524 INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
525 INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
526 INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
527 kref_init(&tfcp_req->ref);
409 528
410 ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, 529 schedule_work(&tfcp_req->fcp_rcv_work);
411 fcpreq->cmdaddr, fcpreq->cmdlen);
412 530
413 return ret; 531 return 0;
414} 532}
415 533
416static void 534static void
@@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
589{ 707{
590 struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); 708 struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
591 709
592 schedule_work(&tfcp_req->work); 710 schedule_work(&tfcp_req->tio_done_work);
593} 711}
594 712
595static void 713static void
@@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
605 void *hw_queue_handle, 723 void *hw_queue_handle,
606 struct nvmefc_fcp_req *fcpreq) 724 struct nvmefc_fcp_req *fcpreq)
607{ 725{
608 struct fcloop_rport *rport = remoteport->private;
609 struct fcloop_ini_fcpreq *inireq = fcpreq->private; 726 struct fcloop_ini_fcpreq *inireq = fcpreq->private;
610 struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; 727 struct fcloop_fcpreq *tfcp_req;
728 bool abortio = true;
729
730 spin_lock(&inireq->inilock);
731 tfcp_req = inireq->tfcp_req;
732 if (tfcp_req)
733 fcloop_tfcp_req_get(tfcp_req);
734 spin_unlock(&inireq->inilock);
611 735
612 if (!tfcp_req) 736 if (!tfcp_req)
613 /* abort has already been called */ 737 /* abort has already been called */
614 return; 738 return;
615 739
616 if (rport->targetport)
617 nvmet_fc_rcv_fcp_abort(rport->targetport,
618 &tfcp_req->tgt_fcp_req);
619
620 /* break initiator/target relationship for io */ 740 /* break initiator/target relationship for io */
621 spin_lock(&tfcp_req->reqlock); 741 spin_lock(&tfcp_req->reqlock);
622 inireq->tfcp_req = NULL; 742 switch (tfcp_req->inistate) {
623 tfcp_req->fcpreq = NULL; 743 case INI_IO_START:
744 case INI_IO_ACTIVE:
745 tfcp_req->inistate = INI_IO_ABORTED;
746 break;
747 case INI_IO_COMPLETED:
748 abortio = false;
749 break;
750 default:
751 spin_unlock(&tfcp_req->reqlock);
752 WARN_ON(1);
753 return;
754 }
624 spin_unlock(&tfcp_req->reqlock); 755 spin_unlock(&tfcp_req->reqlock);
625 756
626 /* post the aborted io completion */ 757 if (abortio)
627 fcpreq->status = -ECANCELED; 758 /* leave the reference while the work item is scheduled */
628 schedule_work(&inireq->iniwork); 759 WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
760 else {
761 /*
762 * as the io has already had the done callback made,
763 * nothing more to do. So release the reference taken above
764 */
765 fcloop_tfcp_req_put(tfcp_req);
766 }
629} 767}
630 768
631static void 769static void
@@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
657static void 795static void
658fcloop_localport_delete(struct nvme_fc_local_port *localport) 796fcloop_localport_delete(struct nvme_fc_local_port *localport)
659{ 797{
660 struct fcloop_lport *lport = localport->private; 798 struct fcloop_lport_priv *lport_priv = localport->private;
799 struct fcloop_lport *lport = lport_priv->lport;
661 800
662 /* release any threads waiting for the unreg to complete */ 801 /* release any threads waiting for the unreg to complete */
663 complete(&lport->unreg_done); 802 complete(&lport->unreg_done);
@@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = {
697 .max_dif_sgl_segments = FCLOOP_SGL_SEGS, 836 .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
698 .dma_boundary = FCLOOP_DMABOUND_4G, 837 .dma_boundary = FCLOOP_DMABOUND_4G,
699 /* sizes of additional private data for data structures */ 838 /* sizes of additional private data for data structures */
700 .local_priv_sz = sizeof(struct fcloop_lport), 839 .local_priv_sz = sizeof(struct fcloop_lport_priv),
701 .remote_priv_sz = sizeof(struct fcloop_rport), 840 .remote_priv_sz = sizeof(struct fcloop_rport),
702 .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), 841 .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
703 .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), 842 .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
@@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = {
714 .max_dif_sgl_segments = FCLOOP_SGL_SEGS, 853 .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
715 .dma_boundary = FCLOOP_DMABOUND_4G, 854 .dma_boundary = FCLOOP_DMABOUND_4G,
716 /* optional features */ 855 /* optional features */
717 .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | 856 .target_features = 0,
718 NVMET_FCTGTFEAT_OPDONE_IN_ISR,
719 /* sizes of additional private data for data structures */ 857 /* sizes of additional private data for data structures */
720 .target_priv_sz = sizeof(struct fcloop_tport), 858 .target_priv_sz = sizeof(struct fcloop_tport),
721}; 859};
@@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
728 struct fcloop_ctrl_options *opts; 866 struct fcloop_ctrl_options *opts;
729 struct nvme_fc_local_port *localport; 867 struct nvme_fc_local_port *localport;
730 struct fcloop_lport *lport; 868 struct fcloop_lport *lport;
731 int ret; 869 struct fcloop_lport_priv *lport_priv;
870 unsigned long flags;
871 int ret = -ENOMEM;
872
873 lport = kzalloc(sizeof(*lport), GFP_KERNEL);
874 if (!lport)
875 return -ENOMEM;
732 876
733 opts = kzalloc(sizeof(*opts), GFP_KERNEL); 877 opts = kzalloc(sizeof(*opts), GFP_KERNEL);
734 if (!opts) 878 if (!opts)
735 return -ENOMEM; 879 goto out_free_lport;
736 880
737 ret = fcloop_parse_options(opts, buf); 881 ret = fcloop_parse_options(opts, buf);
738 if (ret) 882 if (ret)
@@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
752 896
753 ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); 897 ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
754 if (!ret) { 898 if (!ret) {
755 unsigned long flags;
756
757 /* success */ 899 /* success */
758 lport = localport->private; 900 lport_priv = localport->private;
901 lport_priv->lport = lport;
902
759 lport->localport = localport; 903 lport->localport = localport;
760 INIT_LIST_HEAD(&lport->lport_list); 904 INIT_LIST_HEAD(&lport->lport_list);
761 905
762 spin_lock_irqsave(&fcloop_lock, flags); 906 spin_lock_irqsave(&fcloop_lock, flags);
763 list_add_tail(&lport->lport_list, &fcloop_lports); 907 list_add_tail(&lport->lport_list, &fcloop_lports);
764 spin_unlock_irqrestore(&fcloop_lock, flags); 908 spin_unlock_irqrestore(&fcloop_lock, flags);
765
766 /* mark all of the input buffer consumed */
767 ret = count;
768 } 909 }
769 910
770out_free_opts: 911out_free_opts:
771 kfree(opts); 912 kfree(opts);
913out_free_lport:
914 /* free only if we're going to fail */
915 if (ret)
916 kfree(lport);
917
772 return ret ? ret : count; 918 return ret ? ret : count;
773} 919}
774 920
@@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
790 936
791 wait_for_completion(&lport->unreg_done); 937 wait_for_completion(&lport->unreg_done);
792 938
939 kfree(lport);
940
793 return ret; 941 return ret;
794} 942}
795 943
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 1e21b286f299..7991ec3a17db 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
686 686
687static struct nvmf_transport_ops nvme_loop_transport = { 687static struct nvmf_transport_ops nvme_loop_transport = {
688 .name = "loop", 688 .name = "loop",
689 .module = THIS_MODULE,
689 .create_ctrl = nvme_loop_create_ctrl, 690 .create_ctrl = nvme_loop_create_ctrl,
690}; 691};
691 692
@@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void)
716 nvme_delete_ctrl(&ctrl->ctrl); 717 nvme_delete_ctrl(&ctrl->ctrl);
717 mutex_unlock(&nvme_loop_ctrl_mutex); 718 mutex_unlock(&nvme_loop_ctrl_mutex);
718 719
719 flush_workqueue(nvme_wq); 720 flush_workqueue(nvme_delete_wq);
720} 721}
721 722
722module_init(nvme_loop_init_module); 723module_init(nvme_loop_init_module);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 49912909c298..978e169c11bf 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
185 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); 185 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
186} 186}
187 187
188static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
189{
190 struct scatterlist *sg;
191 int count;
192
193 if (!sgl || !nents)
194 return;
195
196 for_each_sg(sgl, sg, nents, count)
197 __free_page(sg_page(sg));
198 kfree(sgl);
199}
200
201static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
202 u32 length)
203{
204 struct scatterlist *sg;
205 struct page *page;
206 unsigned int nent;
207 int i = 0;
208
209 nent = DIV_ROUND_UP(length, PAGE_SIZE);
210 sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
211 if (!sg)
212 goto out;
213
214 sg_init_table(sg, nent);
215
216 while (length) {
217 u32 page_len = min_t(u32, length, PAGE_SIZE);
218
219 page = alloc_page(GFP_KERNEL);
220 if (!page)
221 goto out_free_pages;
222
223 sg_set_page(&sg[i], page, page_len, 0);
224 length -= page_len;
225 i++;
226 }
227 *sgl = sg;
228 *nents = nent;
229 return 0;
230
231out_free_pages:
232 while (i > 0) {
233 i--;
234 __free_page(sg_page(&sg[i]));
235 }
236 kfree(sg);
237out:
238 return NVME_SC_INTERNAL;
239}
240
241static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, 188static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
242 struct nvmet_rdma_cmd *c, bool admin) 189 struct nvmet_rdma_cmd *c, bool admin)
243{ 190{
@@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
484 } 431 }
485 432
486 if (rsp->req.sg != &rsp->cmd->inline_sg) 433 if (rsp->req.sg != &rsp->cmd->inline_sg)
487 nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); 434 sgl_free(rsp->req.sg);
488 435
489 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) 436 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
490 nvmet_rdma_process_wr_wait_list(queue); 437 nvmet_rdma_process_wr_wait_list(queue);
@@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
621 u32 len = get_unaligned_le24(sgl->length); 568 u32 len = get_unaligned_le24(sgl->length);
622 u32 key = get_unaligned_le32(sgl->key); 569 u32 key = get_unaligned_le32(sgl->key);
623 int ret; 570 int ret;
624 u16 status;
625 571
626 /* no data command? */ 572 /* no data command? */
627 if (!len) 573 if (!len)
628 return 0; 574 return 0;
629 575
630 status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, 576 rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
631 len); 577 if (!rsp->req.sg)
632 if (status) 578 return NVME_SC_INTERNAL;
633 return status;
634 579
635 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, 580 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
636 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, 581 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
@@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
976 921
977static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) 922static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
978{ 923{
979 pr_info("freeing queue %d\n", queue->idx); 924 pr_debug("freeing queue %d\n", queue->idx);
980 925
981 nvmet_sq_destroy(&queue->nvme_sq); 926 nvmet_sq_destroy(&queue->nvme_sq);
982 927
@@ -1558,25 +1503,9 @@ err_ib_client:
1558 1503
1559static void __exit nvmet_rdma_exit(void) 1504static void __exit nvmet_rdma_exit(void)
1560{ 1505{
1561 struct nvmet_rdma_queue *queue;
1562
1563 nvmet_unregister_transport(&nvmet_rdma_ops); 1506 nvmet_unregister_transport(&nvmet_rdma_ops);
1564
1565 flush_scheduled_work();
1566
1567 mutex_lock(&nvmet_rdma_queue_mutex);
1568 while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
1569 struct nvmet_rdma_queue, queue_list))) {
1570 list_del_init(&queue->queue_list);
1571
1572 mutex_unlock(&nvmet_rdma_queue_mutex);
1573 __nvmet_rdma_queue_disconnect(queue);
1574 mutex_lock(&nvmet_rdma_queue_mutex);
1575 }
1576 mutex_unlock(&nvmet_rdma_queue_mutex);
1577
1578 flush_scheduled_work();
1579 ib_unregister_client(&nvmet_rdma_ib_client); 1507 ib_unregister_client(&nvmet_rdma_ib_client);
1508 WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1580 ida_destroy(&nvmet_rdma_queue_ida); 1509 ida_destroy(&nvmet_rdma_queue_ida);
1581} 1510}
1582 1511
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
index e2bc99980f75..4c44d7bed01a 100644
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -5,6 +5,7 @@ menuconfig TARGET_CORE
5 select CONFIGFS_FS 5 select CONFIGFS_FS
6 select CRC_T10DIF 6 select CRC_T10DIF
7 select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. 7 select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
8 select SGL_ALLOC
8 default n 9 default n
9 help 10 help
10 Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled 11 Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 58caacd54a3b..c03a78ee26cd 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2300,13 +2300,7 @@ queue_full:
2300 2300
2301void target_free_sgl(struct scatterlist *sgl, int nents) 2301void target_free_sgl(struct scatterlist *sgl, int nents)
2302{ 2302{
2303 struct scatterlist *sg; 2303 sgl_free_n_order(sgl, nents, 0);
2304 int count;
2305
2306 for_each_sg(sgl, sg, nents, count)
2307 __free_page(sg_page(sg));
2308
2309 kfree(sgl);
2310} 2304}
2311EXPORT_SYMBOL(target_free_sgl); 2305EXPORT_SYMBOL(target_free_sgl);
2312 2306
@@ -2414,42 +2408,10 @@ int
2414target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, 2408target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
2415 bool zero_page, bool chainable) 2409 bool zero_page, bool chainable)
2416{ 2410{
2417 struct scatterlist *sg; 2411 gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0);
2418 struct page *page;
2419 gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
2420 unsigned int nalloc, nent;
2421 int i = 0;
2422
2423 nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE);
2424 if (chainable)
2425 nalloc++;
2426 sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
2427 if (!sg)
2428 return -ENOMEM;
2429 2412
2430 sg_init_table(sg, nalloc); 2413 *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents);
2431 2414 return *sgl ? 0 : -ENOMEM;
2432 while (length) {
2433 u32 page_len = min_t(u32, length, PAGE_SIZE);
2434 page = alloc_page(GFP_KERNEL | zero_flag);
2435 if (!page)
2436 goto out;
2437
2438 sg_set_page(&sg[i], page, page_len, 0);
2439 length -= page_len;
2440 i++;
2441 }
2442 *sgl = sg;
2443 *nents = nent;
2444 return 0;
2445
2446out:
2447 while (i > 0) {
2448 i--;
2449 __free_page(sg_page(&sg[i]));
2450 }
2451 kfree(sg);
2452 return -ENOMEM;
2453} 2415}
2454EXPORT_SYMBOL(target_alloc_sgl); 2416EXPORT_SYMBOL(target_alloc_sgl);
2455 2417
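Editor's note: target_alloc_sgl()/target_free_sgl() now wrap sgl_alloc_order() and sgl_free_n_order(), which additionally take an allocation order and, on the alloc side, a chainable flag that reserves a slot for sg chaining. A small sketch of the same call pattern outside the target code; the example_* wrappers are hypothetical.

#include <linux/scatterlist.h>

/* Sketch only: a zeroed, chainable SGL built from order-0 (single) pages. */
static struct scatterlist *example_alloc(u32 len, unsigned int *nents)
{
	return sgl_alloc_order(len, 0, true, GFP_KERNEL | __GFP_ZERO, nents);
}

static void example_free(struct scatterlist *sg, int nents)
{
	/* frees up to @nents entries worth of order-0 pages, then the table */
	sgl_free_n_order(sg, nents, 0);
}
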
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5982c8a71f02..75610d23d197 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
411 411
412static u64 bio_end_offset(struct bio *bio) 412static u64 bio_end_offset(struct bio *bio)
413{ 413{
414 struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; 414 struct bio_vec *last = bio_last_bvec_all(bio);
415 415
416 return page_offset(last->bv_page) + last->bv_len + last->bv_offset; 416 return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
417} 417}
@@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
563 /* we need the actual starting offset of this extent in the file */ 563 /* we need the actual starting offset of this extent in the file */
564 read_lock(&em_tree->lock); 564 read_lock(&em_tree->lock);
565 em = lookup_extent_mapping(em_tree, 565 em = lookup_extent_mapping(em_tree,
566 page_offset(bio->bi_io_vec->bv_page), 566 page_offset(bio_first_page_all(bio)),
567 PAGE_SIZE); 567 PAGE_SIZE);
568 read_unlock(&em_tree->lock); 568 read_unlock(&em_tree->lock);
569 if (!em) 569 if (!em)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 012d63870b99..d43360b33ef6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2257,7 +2257,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2257 return 0; 2257 return 0;
2258} 2258}
2259 2259
2260bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2260bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2261 struct io_failure_record *failrec, int failed_mirror) 2261 struct io_failure_record *failrec, int failed_mirror)
2262{ 2262{
2263 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2263 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2281,7 +2281,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
2281 * a) deliver good data to the caller 2281 * a) deliver good data to the caller
2282 * b) correct the bad sectors on disk 2282 * b) correct the bad sectors on disk
2283 */ 2283 */
2284 if (failed_bio->bi_vcnt > 1) { 2284 if (failed_bio_pages > 1) {
2285 /* 2285 /*
2286 * to fulfill b), we need to know the exact failing sectors, as 2286 * to fulfill b), we need to know the exact failing sectors, as
2287 * we don't want to rewrite any more than the failed ones. thus, 2287 * we don't want to rewrite any more than the failed ones. thus,
@@ -2374,6 +2374,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2374 int read_mode = 0; 2374 int read_mode = 0;
2375 blk_status_t status; 2375 blk_status_t status;
2376 int ret; 2376 int ret;
2377 unsigned failed_bio_pages = bio_pages_all(failed_bio);
2377 2378
2378 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2379 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2379 2380
@@ -2381,13 +2382,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2381 if (ret) 2382 if (ret)
2382 return ret; 2383 return ret;
2383 2384
2384 if (!btrfs_check_repairable(inode, failed_bio, failrec, 2385 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
2385 failed_mirror)) { 2386 failed_mirror)) {
2386 free_io_failure(failure_tree, tree, failrec); 2387 free_io_failure(failure_tree, tree, failrec);
2387 return -EIO; 2388 return -EIO;
2388 } 2389 }
2389 2390
2390 if (failed_bio->bi_vcnt > 1) 2391 if (failed_bio_pages > 1)
2391 read_mode |= REQ_FAILFAST_DEV; 2392 read_mode |= REQ_FAILFAST_DEV;
2392 2393
2393 phy_offset >>= inode->i_sb->s_blocksize_bits; 2394 phy_offset >>= inode->i_sb->s_blocksize_bits;
@@ -2724,7 +2725,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2724 unsigned long bio_flags) 2725 unsigned long bio_flags)
2725{ 2726{
2726 blk_status_t ret = 0; 2727 blk_status_t ret = 0;
2727 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2728 struct bio_vec *bvec = bio_last_bvec_all(bio);
2728 struct page *page = bvec->bv_page; 2729 struct page *page = bvec->bv_page;
2729 struct extent_io_tree *tree = bio->bi_private; 2730 struct extent_io_tree *tree = bio->bi_private;
2730 u64 start; 2731 u64 start;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 93dcae0c3183..20854d63c75b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -540,7 +540,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
540 u64 end); 540 u64 end);
541int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 541int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
542 struct io_failure_record **failrec_ret); 542 struct io_failure_record **failrec_ret);
543bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 543bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
544 struct io_failure_record *failrec, int fail_mirror); 544 struct io_failure_record *failrec, int fail_mirror);
545struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 545struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
546 struct io_failure_record *failrec, 546 struct io_failure_record *failrec,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1a7f3cb5be9..cb1e2d201434 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8015,6 +8015,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
8015 int segs; 8015 int segs;
8016 int ret; 8016 int ret;
8017 blk_status_t status; 8017 blk_status_t status;
8018 struct bio_vec bvec;
8018 8019
8019 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 8020 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
8020 8021
@@ -8030,8 +8031,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
8030 } 8031 }
8031 8032
8032 segs = bio_segments(failed_bio); 8033 segs = bio_segments(failed_bio);
8034 bio_get_first_bvec(failed_bio, &bvec);
8033 if (segs > 1 || 8035 if (segs > 1 ||
8034 (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) 8036 (bvec.bv_len > btrfs_inode_sectorsize(inode)))
8035 read_mode |= REQ_FAILFAST_DEV; 8037 read_mode |= REQ_FAILFAST_DEV;
8036 8038
8037 isector = start - btrfs_io_bio(failed_bio)->logical; 8039 isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -8074,7 +8076,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
8074 ASSERT(bio->bi_vcnt == 1); 8076 ASSERT(bio->bi_vcnt == 1);
8075 io_tree = &BTRFS_I(inode)->io_tree; 8077 io_tree = &BTRFS_I(inode)->io_tree;
8076 failure_tree = &BTRFS_I(inode)->io_failure_tree; 8078 failure_tree = &BTRFS_I(inode)->io_failure_tree;
8077 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); 8079 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
8078 8080
8079 done->uptodate = 1; 8081 done->uptodate = 1;
8080 ASSERT(!bio_flagged(bio, BIO_CLONED)); 8082 ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -8164,7 +8166,7 @@ static void btrfs_retry_endio(struct bio *bio)
8164 uptodate = 1; 8166 uptodate = 1;
8165 8167
8166 ASSERT(bio->bi_vcnt == 1); 8168 ASSERT(bio->bi_vcnt == 1);
8167 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); 8169 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
8168 8170
8169 io_tree = &BTRFS_I(inode)->io_tree; 8171 io_tree = &BTRFS_I(inode)->io_tree;
8170 failure_tree = &BTRFS_I(inode)->io_failure_tree; 8172 failure_tree = &BTRFS_I(inode)->io_failure_tree;
diff --git a/fs/buffer.c b/fs/buffer.c
index 0736a6a2e2f0..8b26295a56fe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3014,7 +3014,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
3014void guard_bio_eod(int op, struct bio *bio) 3014void guard_bio_eod(int op, struct bio *bio)
3015{ 3015{
3016 sector_t maxsector; 3016 sector_t maxsector;
3017 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; 3017 struct bio_vec *bvec = bio_last_bvec_all(bio);
3018 unsigned truncated_bytes; 3018 unsigned truncated_bytes;
3019 struct hd_struct *part; 3019 struct hd_struct *part;
3020 3020
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 516fa0d3ff9c..455f086cce3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio)
56 int i; 56 int i;
57 57
58#ifdef CONFIG_F2FS_FAULT_INJECTION 58#ifdef CONFIG_F2FS_FAULT_INJECTION
59 if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { 59 if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
60 f2fs_show_injection_info(FAULT_IO); 60 f2fs_show_injection_info(FAULT_IO);
61 bio->bi_status = BLK_STS_IOERR; 61 bio->bi_status = BLK_STS_IOERR;
62 } 62 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cea4836385b7..d4d04fee568a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
126 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list 126 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
127 * @inode: inode to be moved 127 * @inode: inode to be moved
128 * @wb: target bdi_writeback 128 * @wb: target bdi_writeback
129 * @head: one of @wb->b_{dirty|io|more_io} 129 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
130 * 130 *
131 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. 131 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
132 * Returns %true if @inode is the first occupant of the !dirty_time IO 132 * Returns %true if @inode is the first occupant of the !dirty_time IO
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 23d29b39f71e..d0eb659fa733 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
300 bv->bv_len = iter.bi_bvec_done; 300 bv->bv_len = iter.bi_bvec_done;
301} 301}
302 302
303static inline unsigned bio_pages_all(struct bio *bio)
304{
305 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
306 return bio->bi_vcnt;
307}
308
309static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
310{
311 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
312 return bio->bi_io_vec;
313}
314
315static inline struct page *bio_first_page_all(struct bio *bio)
316{
317 return bio_first_bvec_all(bio)->bv_page;
318}
319
320static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
321{
322 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
323 return &bio->bi_io_vec[bio->bi_vcnt - 1];
324}
325
303enum bip_flags { 326enum bip_flags {
304 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ 327 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
305 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ 328 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
@@ -477,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
477#endif 500#endif
478 501
479extern void bio_copy_data(struct bio *dst, struct bio *src); 502extern void bio_copy_data(struct bio *dst, struct bio *src);
480extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
481extern void bio_free_pages(struct bio *bio); 503extern void bio_free_pages(struct bio *bio);
482 504
483extern struct bio *bio_copy_user_iov(struct request_queue *, 505extern struct bio *bio_copy_user_iov(struct request_queue *,
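Editor's note: these accessors back the conversions in the btrfs, f2fs and fs/buffer.c hunks above, and they WARN if used on a cloned bio, where bi_io_vec/bi_vcnt no longer describe the data still to be transferred. A sketch of the old-to-new mapping; the example function itself is hypothetical, with the old open-coded accesses shown in the trailing comments.

#include <linux/bio.h>

/* Sketch only: check for a bio carrying exactly one sector-sized page. */
static bool example_single_sector_bio(struct bio *bio, unsigned int sectorsize)
{
	struct page *first = bio_first_page_all(bio);	/* bio->bi_io_vec->bv_page */
	struct bio_vec *last = bio_last_bvec_all(bio);	/* &bio->bi_io_vec[bio->bi_vcnt - 1] */

	return bio_pages_all(bio) == 1 &&		/* bio->bi_vcnt */
	       last->bv_len == sectorsize &&
	       first == last->bv_page;
}
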
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e9825ff57b15..69bea82ebeb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
660static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, 660static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
661 struct blkg_rwstat *from) 661 struct blkg_rwstat *from)
662{ 662{
663 struct blkg_rwstat v = blkg_rwstat_read(from); 663 u64 sum[BLKG_RWSTAT_NR];
664 int i; 664 int i;
665 665
666 for (i = 0; i < BLKG_RWSTAT_NR; i++) 666 for (i = 0; i < BLKG_RWSTAT_NR; i++)
667 atomic64_add(atomic64_read(&v.aux_cnt[i]) + 667 sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
668 atomic64_read(&from->aux_cnt[i]), 668
669 for (i = 0; i < BLKG_RWSTAT_NR; i++)
670 atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
669 &to->aux_cnt[i]); 671 &to->aux_cnt[i]);
670} 672}
671 673
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c9a5c862e2..8efcf49796a3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
51 unsigned int queue_num; 51 unsigned int queue_num;
52 52
53 atomic_t nr_active; 53 atomic_t nr_active;
54 unsigned int nr_expired;
54 55
55 struct hlist_node cpuhp_dead; 56 struct hlist_node cpuhp_dead;
56 struct kobject kobj; 57 struct kobject kobj;
@@ -65,7 +66,7 @@ struct blk_mq_hw_ctx {
65#endif 66#endif
66 67
67 /* Must be the last member - see also blk_mq_hw_ctx_size(). */ 68 /* Must be the last member - see also blk_mq_hw_ctx_size(). */
68 struct srcu_struct queue_rq_srcu[0]; 69 struct srcu_struct srcu[0];
69}; 70};
70 71
71struct blk_mq_tag_set { 72struct blk_mq_tag_set {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e7d8bd776d2..c5d3db0d83f8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t;
39 39
40#define BLK_STS_AGAIN ((__force blk_status_t)12) 40#define BLK_STS_AGAIN ((__force blk_status_t)12)
41 41
42/**
43 * blk_path_error - returns true if error may be path related
44 * @error: status the request was completed with
45 *
46 * Description:
47 * This classifies block error status into non-retryable errors and ones
48 * that may be successful if retried on a failover path.
49 *
50 * Return:
51 * %false - retrying failover path will not help
52 * %true - may succeed if retried
53 */
54static inline bool blk_path_error(blk_status_t error)
55{
56 switch (error) {
57 case BLK_STS_NOTSUPP:
58 case BLK_STS_NOSPC:
59 case BLK_STS_TARGET:
60 case BLK_STS_NEXUS:
61 case BLK_STS_MEDIUM:
62 case BLK_STS_PROTECTION:
63 return false;
64 }
65
66 /* Anything else could be a path failure, so should be retried */
67 return true;
68}
69
42struct blk_issue_stat { 70struct blk_issue_stat {
43 u64 stat; 71 u64 stat;
44}; 72};
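Editor's note: blk_path_error() gives stacked multipath consumers a single place to classify a completion status. A sketch of how a failover decision might consult it; the surrounding function is hypothetical.

#include <linux/blk_types.h>

/* Sketch only: decide whether retrying on another path can possibly help. */
static bool example_should_failover(blk_status_t error)
{
	if (!error)
		return false;		/* request completed fine */
	if (!blk_path_error(error))
		return false;		/* e.g. BLK_STS_MEDIUM: no path will help */
	return true;			/* path-related; worth a retry elsewhere */
}
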
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0ce8a372d506..4f3df807cf8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
27#include <linux/percpu-refcount.h> 27#include <linux/percpu-refcount.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/blkzoned.h> 29#include <linux/blkzoned.h>
30#include <linux/seqlock.h>
31#include <linux/u64_stats_sync.h>
30 32
31struct module; 33struct module;
32struct scsi_ioctl_command; 34struct scsi_ioctl_command;
@@ -121,6 +123,12 @@ typedef __u32 __bitwise req_flags_t;
121/* Look at ->special_vec for the actual data payload instead of the 123/* Look at ->special_vec for the actual data payload instead of the
122 bio chain. */ 124 bio chain. */
123#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) 125#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
126/* The per-zone write lock is held for this request */
127#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
128/* timeout is expired */
129#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20))
130/* already slept for hybrid poll */
131#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21))
124 132
125/* flags that prevent us from merging requests: */ 133/* flags that prevent us from merging requests: */
126#define RQF_NOMERGE_FLAGS \ 134#define RQF_NOMERGE_FLAGS \
@@ -133,12 +141,6 @@ typedef __u32 __bitwise req_flags_t;
133 * especially blk_mq_rq_ctx_init() to take care of the added fields. 141 * especially blk_mq_rq_ctx_init() to take care of the added fields.
134 */ 142 */
135struct request { 143struct request {
136 struct list_head queuelist;
137 union {
138 struct __call_single_data csd;
139 u64 fifo_time;
140 };
141
142 struct request_queue *q; 144 struct request_queue *q;
143 struct blk_mq_ctx *mq_ctx; 145 struct blk_mq_ctx *mq_ctx;
144 146
@@ -148,8 +150,6 @@ struct request {
148 150
149 int internal_tag; 151 int internal_tag;
150 152
151 unsigned long atomic_flags;
152
153 /* the following two fields are internal, NEVER access directly */ 153 /* the following two fields are internal, NEVER access directly */
154 unsigned int __data_len; /* total data len */ 154 unsigned int __data_len; /* total data len */
155 int tag; 155 int tag;
@@ -158,6 +158,8 @@ struct request {
158 struct bio *bio; 158 struct bio *bio;
159 struct bio *biotail; 159 struct bio *biotail;
160 160
161 struct list_head queuelist;
162
161 /* 163 /*
162 * The hash is used inside the scheduler, and killed once the 164 * The hash is used inside the scheduler, and killed once the
163 * request reaches the dispatch list. The ipi_list is only used 165 * request reaches the dispatch list. The ipi_list is only used
@@ -205,19 +207,16 @@ struct request {
205 struct hd_struct *part; 207 struct hd_struct *part;
206 unsigned long start_time; 208 unsigned long start_time;
207 struct blk_issue_stat issue_stat; 209 struct blk_issue_stat issue_stat;
208#ifdef CONFIG_BLK_CGROUP
209 struct request_list *rl; /* rl this rq is alloced from */
210 unsigned long long start_time_ns;
211 unsigned long long io_start_time_ns; /* when passed to hardware */
212#endif
213 /* Number of scatter-gather DMA addr+len pairs after 210 /* Number of scatter-gather DMA addr+len pairs after
214 * physical address coalescing is performed. 211 * physical address coalescing is performed.
215 */ 212 */
216 unsigned short nr_phys_segments; 213 unsigned short nr_phys_segments;
214
217#if defined(CONFIG_BLK_DEV_INTEGRITY) 215#if defined(CONFIG_BLK_DEV_INTEGRITY)
218 unsigned short nr_integrity_segments; 216 unsigned short nr_integrity_segments;
219#endif 217#endif
220 218
219 unsigned short write_hint;
221 unsigned short ioprio; 220 unsigned short ioprio;
222 221
223 unsigned int timeout; 222 unsigned int timeout;
@@ -226,11 +225,37 @@ struct request {
226 225
227 unsigned int extra_len; /* length of alignment and padding */ 226 unsigned int extra_len; /* length of alignment and padding */
228 227
229 unsigned short write_hint; 228 /*
229 * On blk-mq, the lower bits of ->gstate (generation number and
230 * state) carry the MQ_RQ_* state value and the upper bits the
231 * generation number which is monotonically incremented and used to
232 * distinguish the reuse instances.
233 *
234 * ->gstate_seq allows updates to ->gstate and other fields
235 * (currently ->deadline) during request start to be read
236 * atomically from the timeout path, so that it can operate on a
237 * coherent set of information.
238 */
239 seqcount_t gstate_seq;
240 u64 gstate;
241
242 /*
243 * ->aborted_gstate is used by the timeout to claim a specific
244 * recycle instance of this request. See blk_mq_timeout_work().
245 */
246 struct u64_stats_sync aborted_gstate_sync;
247 u64 aborted_gstate;
248
249 /* access through blk_rq_set_deadline, blk_rq_deadline */
250 unsigned long __deadline;
230 251
231 unsigned long deadline;
232 struct list_head timeout_list; 252 struct list_head timeout_list;
233 253
254 union {
255 struct __call_single_data csd;
256 u64 fifo_time;
257 };
258
234 /* 259 /*
235 * completion callback. 260 * completion callback.
236 */ 261 */
@@ -239,6 +264,12 @@ struct request {
239 264
240 /* for bidi */ 265 /* for bidi */
241 struct request *next_rq; 266 struct request *next_rq;
267
268#ifdef CONFIG_BLK_CGROUP
269 struct request_list *rl; /* rl this rq is alloced from */
270 unsigned long long start_time_ns;
271 unsigned long long io_start_time_ns; /* when passed to hardware */
272#endif
242}; 273};
243 274
244static inline bool blk_op_is_scsi(unsigned int op) 275static inline bool blk_op_is_scsi(unsigned int op)
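The ->gstate comment above describes a single u64 carrying the blk-mq request state in its low bits and a monotonically increasing generation number in the remaining bits, so the timeout path can tell request reuse apart from a genuine timeout. Purely as an illustration of that encoding (the MY_* names are hypothetical; the real constants are block-layer internals and should not be open-coded by drivers):

#define MY_RQ_STATE_BITS	2
#define MY_RQ_STATE_MASK	((1ULL << MY_RQ_STATE_BITS) - 1)

static inline unsigned int my_rq_state(u64 gstate)
{
	return gstate & MY_RQ_STATE_MASK;	/* one of the MQ_RQ_* values */
}

static inline u64 my_rq_generation(u64 gstate)
{
	return gstate >> MY_RQ_STATE_BITS;	/* bumped on every reuse */
}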
@@ -564,6 +595,22 @@ struct request_queue {
564 struct queue_limits limits; 595 struct queue_limits limits;
565 596
566 /* 597 /*
598 * Zoned block device information for request dispatch control.
599 * nr_zones is the total number of zones of the device. This is always
600 * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
601 * bits which indicates if a zone is conventional (bit clear) or
602 * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
603 * bits which indicates if a zone is write locked, that is, if a write
604 * request targeting the zone was dispatched. All three fields are
605 * initialized by the low level device driver (e.g. scsi/sd.c).
606 * Stacking drivers (device mappers) may or may not initialize
607 * these fields.
608 */
609 unsigned int nr_zones;
610 unsigned long *seq_zones_bitmap;
611 unsigned long *seq_zones_wlock;
612
613 /*
567 * sg stuff 614 * sg stuff
568 */ 615 */
569 unsigned int sg_timeout; 616 unsigned int sg_timeout;
@@ -807,6 +854,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
807 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; 854 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
808} 855}
809 856
857static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
858{
859 return q->nr_zones;
860}
861
862static inline unsigned int blk_queue_zone_no(struct request_queue *q,
863 sector_t sector)
864{
865 if (!blk_queue_is_zoned(q))
866 return 0;
867 return sector >> ilog2(q->limits.chunk_sectors);
868}
869
870static inline bool blk_queue_zone_is_seq(struct request_queue *q,
871 sector_t sector)
872{
873 if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
874 return false;
875 return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
876}
877
810static inline bool rq_is_sync(struct request *rq) 878static inline bool rq_is_sync(struct request *rq)
811{ 879{
812 return op_is_sync(rq->cmd_flags); 880 return op_is_sync(rq->cmd_flags);
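With the new request_queue zone fields, drivers and schedulers can map a sector to its zone and check the zone model without touching the bitmaps directly. A small sketch (my_sector_needs_serial_write() is a hypothetical helper; the blk_queue_* calls are the ones added above):

static bool my_sector_needs_serial_write(struct request_queue *q,
					 sector_t sector)
{
	if (!blk_queue_is_zoned(q))
		return false;	/* regular device, nr_zones is 0 */

	pr_debug("sector %llu falls in zone %u of %u\n",
		 (unsigned long long)sector,
		 blk_queue_zone_no(q, sector), blk_queue_nr_zones(q));

	/* only sequential-write zones need write serialization */
	return blk_queue_zone_is_seq(q, sector);
}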
@@ -1046,6 +1114,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
1046 return blk_rq_cur_bytes(rq) >> 9; 1114 return blk_rq_cur_bytes(rq) >> 9;
1047} 1115}
1048 1116
1117static inline unsigned int blk_rq_zone_no(struct request *rq)
1118{
1119 return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
1120}
1121
1122static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
1123{
1124 return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
1125}
1126
1049/* 1127/*
1050 * Some commands like WRITE SAME have a payload or data transfer size which 1128 * Some commands like WRITE SAME have a payload or data transfer size which
1051 * is different from the size of the request. Any driver that supports such 1129 * is different from the size of the request. Any driver that supports such
@@ -1595,7 +1673,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
1595 1673
1596 if (q) 1674 if (q)
1597 return blk_queue_zone_sectors(q); 1675 return blk_queue_zone_sectors(q);
1676 return 0;
1677}
1678
1679static inline unsigned int bdev_nr_zones(struct block_device *bdev)
1680{
1681 struct request_queue *q = bdev_get_queue(bdev);
1598 1682
1683 if (q)
1684 return blk_queue_nr_zones(q);
1599 return 0; 1685 return 0;
1600} 1686}
1601 1687
@@ -1731,8 +1817,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
1731 1817
1732int kblockd_schedule_work(struct work_struct *work); 1818int kblockd_schedule_work(struct work_struct *work);
1733int kblockd_schedule_work_on(int cpu, struct work_struct *work); 1819int kblockd_schedule_work_on(int cpu, struct work_struct *work);
1734int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
1735int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1736int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); 1820int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1737 1821
1738#ifdef CONFIG_BLK_CGROUP 1822#ifdef CONFIG_BLK_CGROUP
@@ -1971,6 +2055,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1971extern int bdev_read_page(struct block_device *, sector_t, struct page *); 2055extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1972extern int bdev_write_page(struct block_device *, sector_t, struct page *, 2056extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1973 struct writeback_control *); 2057 struct writeback_control *);
2058
2059#ifdef CONFIG_BLK_DEV_ZONED
2060bool blk_req_needs_zone_write_lock(struct request *rq);
2061void __blk_req_zone_write_lock(struct request *rq);
2062void __blk_req_zone_write_unlock(struct request *rq);
2063
2064static inline void blk_req_zone_write_lock(struct request *rq)
2065{
2066 if (blk_req_needs_zone_write_lock(rq))
2067 __blk_req_zone_write_lock(rq);
2068}
2069
2070static inline void blk_req_zone_write_unlock(struct request *rq)
2071{
2072 if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
2073 __blk_req_zone_write_unlock(rq);
2074}
2075
2076static inline bool blk_req_zone_is_write_locked(struct request *rq)
2077{
2078 return rq->q->seq_zones_wlock &&
2079 test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
2080}
2081
2082static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
2083{
2084 if (!blk_req_needs_zone_write_lock(rq))
2085 return true;
2086 return !blk_req_zone_is_write_locked(rq);
2087}
2088#else
2089static inline bool blk_req_needs_zone_write_lock(struct request *rq)
2090{
2091 return false;
2092}
2093
2094static inline void blk_req_zone_write_lock(struct request *rq)
2095{
2096}
2097
2098static inline void blk_req_zone_write_unlock(struct request *rq)
2099{
2100}
2101static inline bool blk_req_zone_is_write_locked(struct request *rq)
2102{
2103 return false;
2104}
2105
2106static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
2107{
2108 return true;
2109}
2110#endif /* CONFIG_BLK_DEV_ZONED */
2111
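Taken together, these helpers let an I/O scheduler serialize writes per sequential zone, which is what the deadline/mq-deadline SMR support in this series does. A sketch of the dispatch-side usage, with my_* names standing in for the scheduler's own hooks:

/* Called when considering @rq for dispatch. */
static struct request *my_try_dispatch_write(struct request *rq)
{
	if (!blk_req_can_dispatch_to_zone(rq))
		return NULL;		/* zone already write-locked, pick another rq */

	blk_req_zone_write_lock(rq);	/* no-op unless the target zone is sequential */
	return rq;
}

/* Called on completion or requeue of a dispatched write. */
static void my_write_done(struct request *rq)
{
	blk_req_zone_write_unlock(rq);	/* no-op unless RQF_ZONE_WRITE_LOCKED is set */
}

With CONFIG_BLK_DEV_ZONED disabled, the stubs above make both calls free, so the scheduler code needs no #ifdefs.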
1974#else /* CONFIG_BLOCK */ 2112#else /* CONFIG_BLOCK */
1975 2113
1976struct block_device; 2114struct block_device;
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ec8a4d7af6bd..fe7a22dd133b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv,
125 ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ 125 ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
126 bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) 126 bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
127 127
128/* for iterating one bio from start to end */
129#define BVEC_ITER_ALL_INIT (struct bvec_iter) \
130{ \
131 .bi_sector = 0, \
132 .bi_size = UINT_MAX, \
133 .bi_idx = 0, \
134 .bi_bvec_done = 0, \
135}
136
128#endif /* __LINUX_BVEC_ITER_H */ 137#endif /* __LINUX_BVEC_ITER_H */
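BVEC_ITER_ALL_INIT produces an iterator that starts at the beginning of a bvec table with an effectively unbounded size, so a bio can be walked from its first segment even after bi_iter has been advanced. A sketch, where the explicit bi_vcnt bound is this example's own guard against the unbounded bi_size and my_dump_bio_segments() is hypothetical:

static void my_dump_bio_segments(struct bio *bio)
{
	struct bvec_iter iter = BVEC_ITER_ALL_INIT;
	struct bio_vec bv;
	unsigned int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		bv = bvec_iter_bvec(bio->bi_io_vec, iter);
		pr_debug("bvec %u: offset %u len %u\n", i, bv.bv_offset, bv.bv_len);
		bvec_iter_advance(bio->bi_io_vec, &iter, bv.bv_len);
	}
}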
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 3d794b3dc532..6d9e230dffd2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
198extern void elv_requeue_request(struct request_queue *, struct request *); 198extern void elv_requeue_request(struct request_queue *, struct request *);
199extern struct request *elv_former_request(struct request_queue *, struct request *); 199extern struct request *elv_former_request(struct request_queue *, struct request *);
200extern struct request *elv_latter_request(struct request_queue *, struct request *); 200extern struct request *elv_latter_request(struct request_queue *, struct request *);
201extern int elv_register_queue(struct request_queue *q);
202extern void elv_unregister_queue(struct request_queue *q);
203extern int elv_may_queue(struct request_queue *, unsigned int); 201extern int elv_may_queue(struct request_queue *, unsigned int);
204extern void elv_completed_request(struct request_queue *, struct request *); 202extern void elv_completed_request(struct request_queue *, struct request *);
205extern int elv_set_request(struct request_queue *q, struct request *rq, 203extern int elv_set_request(struct request_queue *q, struct request *rq,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5144ebe046c9..5e3531027b51 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk)
395{ 395{
396 device_add_disk(NULL, disk); 396 device_add_disk(NULL, disk);
397} 397}
398extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
399static inline void add_disk_no_queue_reg(struct gendisk *disk)
400{
401 device_add_disk_no_queue_reg(NULL, disk);
402}
398 403
399extern void del_gendisk(struct gendisk *gp); 404extern void del_gendisk(struct gendisk *gp);
400extern struct gendisk *get_gendisk(dev_t dev, int *partno); 405extern struct gendisk *get_gendisk(dev_t dev, int *partno);
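add_disk_no_queue_reg() exists for stacking drivers such as dm that only know how their queue must be configured after the disk has been created. A sketch of the deferred flow, assuming a hypothetical my_setup_queue() step and using the existing blk_register_queue() for the final registration:

static int my_create_disk(struct gendisk *disk)
{
	/* make the disk known, but keep its queue out of sysfs for now */
	add_disk_no_queue_reg(disk);

	if (my_setup_queue(disk->queue))
		return -EINVAL;		/* e.g. mq vs legacy / elevator setup failed */

	/* the queue is now fully formed; register it like add_disk() would have */
	return blk_register_queue(disk);
}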
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2d1d9de06728..7f4b60abdf27 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -50,10 +50,7 @@ struct nvm_id;
50struct nvm_dev; 50struct nvm_dev;
51struct nvm_tgt_dev; 51struct nvm_tgt_dev;
52 52
53typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
54typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); 53typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
55typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
56 nvm_l2p_update_fn *, void *);
57typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); 54typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
58typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); 55typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
59typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); 56typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
@@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
66 63
67struct nvm_dev_ops { 64struct nvm_dev_ops {
68 nvm_id_fn *identity; 65 nvm_id_fn *identity;
69 nvm_get_l2p_tbl_fn *get_l2p_tbl;
70 nvm_op_bb_tbl_fn *get_bb_tbl; 66 nvm_op_bb_tbl_fn *get_bb_tbl;
71 nvm_op_set_bb_fn *set_bb_tbl; 67 nvm_op_set_bb_fn *set_bb_tbl;
72 68
@@ -112,8 +108,6 @@ enum {
112 NVM_RSP_WARN_HIGHECC = 0x4700, 108 NVM_RSP_WARN_HIGHECC = 0x4700,
113 109
114 /* Device opcodes */ 110 /* Device opcodes */
115 NVM_OP_HBREAD = 0x02,
116 NVM_OP_HBWRITE = 0x81,
117 NVM_OP_PWRITE = 0x91, 111 NVM_OP_PWRITE = 0x91,
118 NVM_OP_PREAD = 0x92, 112 NVM_OP_PREAD = 0x92,
119 NVM_OP_ERASE = 0x90, 113 NVM_OP_ERASE = 0x90,
@@ -165,12 +159,16 @@ struct nvm_id_group {
165 u8 fmtype; 159 u8 fmtype;
166 u8 num_ch; 160 u8 num_ch;
167 u8 num_lun; 161 u8 num_lun;
168 u8 num_pln; 162 u16 num_chk;
169 u16 num_blk; 163 u16 clba;
170 u16 num_pg;
171 u16 fpg_sz;
172 u16 csecs; 164 u16 csecs;
173 u16 sos; 165 u16 sos;
166
167 u16 ws_min;
168 u16 ws_opt;
169 u16 ws_seq;
170 u16 ws_per_chk;
171
174 u32 trdt; 172 u32 trdt;
175 u32 trdm; 173 u32 trdm;
176 u32 tprt; 174 u32 tprt;
@@ -181,7 +179,10 @@ struct nvm_id_group {
181 u32 mccap; 179 u32 mccap;
182 u16 cpar; 180 u16 cpar;
183 181
184 struct nvm_id_lp_tbl lptbl; 182 /* 1.2 compatibility */
183 u8 num_pln;
184 u16 num_pg;
185 u16 fpg_sz;
185}; 186};
186 187
187struct nvm_addr_format { 188struct nvm_addr_format {
@@ -217,6 +218,10 @@ struct nvm_target {
217 218
218#define ADDR_EMPTY (~0ULL) 219#define ADDR_EMPTY (~0ULL)
219 220
221#define NVM_TARGET_DEFAULT_OP (101)
222#define NVM_TARGET_MIN_OP (3)
223#define NVM_TARGET_MAX_OP (80)
224
220#define NVM_VERSION_MAJOR 1 225#define NVM_VERSION_MAJOR 1
221#define NVM_VERSION_MINOR 0 226#define NVM_VERSION_MINOR 0
222#define NVM_VERSION_PATCH 0 227#define NVM_VERSION_PATCH 0
@@ -239,7 +244,6 @@ struct nvm_rq {
239 void *meta_list; 244 void *meta_list;
240 dma_addr_t dma_meta_list; 245 dma_addr_t dma_meta_list;
241 246
242 struct completion *wait;
243 nvm_end_io_fn *end_io; 247 nvm_end_io_fn *end_io;
244 248
245 uint8_t opcode; 249 uint8_t opcode;
@@ -268,31 +272,38 @@ enum {
268 NVM_BLK_ST_BAD = 0x8, /* Bad block */ 272 NVM_BLK_ST_BAD = 0x8, /* Bad block */
269}; 273};
270 274
275
271/* Device generic information */ 276/* Device generic information */
272struct nvm_geo { 277struct nvm_geo {
278 /* generic geometry */
273 int nr_chnls; 279 int nr_chnls;
274 int nr_luns; 280 int all_luns; /* across channels */
275 int luns_per_chnl; /* -1 if channels are not symmetric */ 281 int nr_luns; /* per channel */
276 int nr_planes; 282 int nr_chks; /* per lun */
277 int sec_per_pg; /* only sectors for a single page */ 283
278 int pgs_per_blk;
279 int blks_per_lun;
280 int fpg_size;
281 int pfpg_size; /* size of buffer if all pages are to be read */
282 int sec_size; 284 int sec_size;
283 int oob_size; 285 int oob_size;
284 int mccap; 286 int mccap;
285 struct nvm_addr_format ppaf;
286 287
287 /* Calculated/Cached values. These do not reflect the actual usable 288 int sec_per_chk;
288 * blocks at run-time. 289 int sec_per_lun;
289 */ 290
291 int ws_min;
292 int ws_opt;
293 int ws_seq;
294 int ws_per_chk;
295
290 int max_rq_size; 296 int max_rq_size;
291 int plane_mode; /* drive device in single, double or quad mode */
292 297
298 int op;
299
300 struct nvm_addr_format ppaf;
301
302 /* Legacy 1.2 specific geometry */
303 int plane_mode; /* drive device in single, double or quad mode */
304 int nr_planes;
305 int sec_per_pg; /* only sectors for a single page */
293 int sec_per_pl; /* all sectors across planes */ 306 int sec_per_pl; /* all sectors across planes */
294 int sec_per_blk;
295 int sec_per_lun;
296}; 307};
297 308
298/* sub-device structure */ 309/* sub-device structure */
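The reworked nvm_geo now expresses the 2.0-style generic geometry (channels, LUNs, chunks, sectors per chunk), keeping the 1.2 plane/page fields only for compatibility. As a rough sketch of how the generic fields compose, with the arithmetic treated as this example's assumption rather than a documented formula:

static unsigned long my_geo_total_secs(const struct nvm_geo *geo)
{
	/* all LUNs across channels, chunks per LUN, sectors per chunk */
	return (unsigned long)geo->all_luns * geo->nr_chks * geo->sec_per_chk;
}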
@@ -320,10 +331,6 @@ struct nvm_dev {
320 /* Device information */ 331 /* Device information */
321 struct nvm_geo geo; 332 struct nvm_geo geo;
322 333
323 /* lower page table */
324 int lps_per_blk;
325 int *lptbl;
326
327 unsigned long total_secs; 334 unsigned long total_secs;
328 335
329 unsigned long *lun_map; 336 unsigned long *lun_map;
@@ -346,36 +353,6 @@ struct nvm_dev {
346 struct list_head targets; 353 struct list_head targets;
347}; 354};
348 355
349static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
350 u64 pba)
351{
352 struct ppa_addr l;
353 int secs, pgs, blks, luns;
354 sector_t ppa = pba;
355
356 l.ppa = 0;
357
358 div_u64_rem(ppa, geo->sec_per_pg, &secs);
359 l.g.sec = secs;
360
361 sector_div(ppa, geo->sec_per_pg);
362 div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
363 l.g.pg = pgs;
364
365 sector_div(ppa, geo->pgs_per_blk);
366 div_u64_rem(ppa, geo->blks_per_lun, &blks);
367 l.g.blk = blks;
368
369 sector_div(ppa, geo->blks_per_lun);
370 div_u64_rem(ppa, geo->luns_per_chnl, &luns);
371 l.g.lun = luns;
372
373 sector_div(ppa, geo->luns_per_chnl);
374 l.g.ch = ppa;
375
376 return l;
377}
378
379static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, 356static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev,
380 struct ppa_addr r) 357 struct ppa_addr r)
381{ 358{
@@ -418,25 +395,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev,
418 return l; 395 return l;
419} 396}
420 397
421static inline int ppa_empty(struct ppa_addr ppa_addr)
422{
423 return (ppa_addr.ppa == ADDR_EMPTY);
424}
425
426static inline void ppa_set_empty(struct ppa_addr *ppa_addr)
427{
428 ppa_addr->ppa = ADDR_EMPTY;
429}
430
431static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
432{
433 if (ppa_empty(ppa1) || ppa_empty(ppa2))
434 return 0;
435
436 return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) &&
437 (ppa1.g.blk == ppa2.g.blk));
438}
439
440typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); 398typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
441typedef sector_t (nvm_tgt_capacity_fn)(void *); 399typedef sector_t (nvm_tgt_capacity_fn)(void *);
442typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, 400typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
@@ -481,17 +439,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
481extern int nvm_max_phys_sects(struct nvm_tgt_dev *); 439extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
482extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); 440extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
483extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); 441extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
484extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
485extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
486 void *);
487extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
488extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
489extern void nvm_end_io(struct nvm_rq *); 442extern void nvm_end_io(struct nvm_rq *);
490extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); 443extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
491extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); 444extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
492 445
493extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
494
495#else /* CONFIG_NVM */ 446#else /* CONFIG_NVM */
496struct nvm_dev_ops; 447struct nvm_dev_ops;
497 448
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index aea87f0d917b..4112e2bd747f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -124,14 +124,20 @@ enum {
124 124
125#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) 125#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7)
126#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) 126#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff)
127#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) 127
128#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) 128enum {
129 129 NVME_CMBSZ_SQS = 1 << 0,
130#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) 130 NVME_CMBSZ_CQS = 1 << 1,
131#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) 131 NVME_CMBSZ_LISTS = 1 << 2,
132#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) 132 NVME_CMBSZ_RDS = 1 << 3,
133#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) 133 NVME_CMBSZ_WDS = 1 << 4,
134#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) 134
135 NVME_CMBSZ_SZ_SHIFT = 12,
136 NVME_CMBSZ_SZ_MASK = 0xfffff,
137
138 NVME_CMBSZ_SZU_SHIFT = 8,
139 NVME_CMBSZ_SZU_MASK = 0xf,
140};
135 141
136/* 142/*
137 * Submission and Completion Queue Entry Sizes for the NVM command set. 143 * Submission and Completion Queue Entry Sizes for the NVM command set.
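The CMBSZ bit macros are replaced by enum constants, leaving the shift/mask arithmetic to the caller. A sketch of computing the Controller Memory Buffer size in bytes from a raw CMBSZ value; the 4KiB base unit scaled by a factor of 16 per SZU step follows the NVMe specification, and my_cmb_size_bytes() itself is hypothetical:

static u64 my_cmb_size_bytes(u32 cmbsz)
{
	u32 sz  = (cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
	u32 szu = (cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	/* SZU 0 means 4KiB units; each increment multiplies the unit by 16 */
	return (u64)sz << (12 + 4 * szu);
}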
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b7c83254c566..22b2131bcdcd 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -276,6 +276,17 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
276 unsigned int n_pages, unsigned int offset, 276 unsigned int n_pages, unsigned int offset,
277 unsigned long size, gfp_t gfp_mask); 277 unsigned long size, gfp_t gfp_mask);
278 278
279#ifdef CONFIG_SGL_ALLOC
280struct scatterlist *sgl_alloc_order(unsigned long long length,
281 unsigned int order, bool chainable,
282 gfp_t gfp, unsigned int *nent_p);
283struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
284 unsigned int *nent_p);
285void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
286void sgl_free_order(struct scatterlist *sgl, int order);
287void sgl_free(struct scatterlist *sgl);
288#endif /* CONFIG_SGL_ALLOC */
289
279size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, 290size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
280 size_t buflen, off_t skip, bool to_buffer); 291 size_t buflen, off_t skip, bool to_buffer);
281 292
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index 42d1a434af29..f9a1be7fc696 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple {
75 __u32 lun_end; 75 __u32 lun_end;
76}; 76};
77 77
78struct nvm_ioctl_create_extended {
79 __u16 lun_begin;
80 __u16 lun_end;
81 __u16 op;
82 __u16 rsv;
83};
84
78enum { 85enum {
79 NVM_CONFIG_TYPE_SIMPLE = 0, 86 NVM_CONFIG_TYPE_SIMPLE = 0,
87 NVM_CONFIG_TYPE_EXTENDED = 1,
80}; 88};
81 89
82struct nvm_ioctl_create_conf { 90struct nvm_ioctl_create_conf {
83 __u32 type; 91 __u32 type;
84 union { 92 union {
85 struct nvm_ioctl_create_simple s; 93 struct nvm_ioctl_create_simple s;
94 struct nvm_ioctl_create_extended e;
86 }; 95 };
87}; 96};
88 97
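User space can now pass an explicit over-provisioning ratio (and a 16-bit LUN range) when creating a target by selecting the extended configuration type. A sketch of filling the new union member; the surrounding struct nvm_ioctl_create setup and the create ioctl call on the LightNVM control device are omitted:

struct nvm_ioctl_create_conf conf = {
	.type = NVM_CONFIG_TYPE_EXTENDED,
	.e = {
		.lun_begin = 0,
		.lun_end   = 127,
		.op        = 11,	/* 11% over-provisioning */
	},
};

Out-of-range ratios are expected to be rejected against the NVM_TARGET_MIN_OP/NVM_TARGET_MAX_OP bounds added in linux/lightnvm.h.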
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
39 } 39 }
40} 40}
41 41
42static cpumask_var_t *alloc_node_to_present_cpumask(void) 42static cpumask_var_t *alloc_node_to_possible_cpumask(void)
43{ 43{
44 cpumask_var_t *masks; 44 cpumask_var_t *masks;
45 int node; 45 int node;
@@ -62,7 +62,7 @@ out_unwind:
62 return NULL; 62 return NULL;
63} 63}
64 64
65static void free_node_to_present_cpumask(cpumask_var_t *masks) 65static void free_node_to_possible_cpumask(cpumask_var_t *masks)
66{ 66{
67 int node; 67 int node;
68 68
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
71 kfree(masks); 71 kfree(masks);
72} 72}
73 73
74static void build_node_to_present_cpumask(cpumask_var_t *masks) 74static void build_node_to_possible_cpumask(cpumask_var_t *masks)
75{ 75{
76 int cpu; 76 int cpu;
77 77
78 for_each_present_cpu(cpu) 78 for_each_possible_cpu(cpu)
79 cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); 79 cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
80} 80}
81 81
82static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, 82static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
83 const struct cpumask *mask, nodemask_t *nodemsk) 83 const struct cpumask *mask, nodemask_t *nodemsk)
84{ 84{
85 int n, nodes = 0; 85 int n, nodes = 0;
86 86
87 /* Calculate the number of nodes in the supplied affinity mask */ 87 /* Calculate the number of nodes in the supplied affinity mask */
88 for_each_node(n) { 88 for_each_node(n) {
89 if (cpumask_intersects(mask, node_to_present_cpumask[n])) { 89 if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
90 node_set(n, *nodemsk); 90 node_set(n, *nodemsk);
91 nodes++; 91 nodes++;
92 } 92 }
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
109 int last_affv = affv + affd->pre_vectors; 109 int last_affv = affv + affd->pre_vectors;
110 nodemask_t nodemsk = NODE_MASK_NONE; 110 nodemask_t nodemsk = NODE_MASK_NONE;
111 struct cpumask *masks; 111 struct cpumask *masks;
112 cpumask_var_t nmsk, *node_to_present_cpumask; 112 cpumask_var_t nmsk, *node_to_possible_cpumask;
113 113
114 /* 114 /*
115 * If there aren't any vectors left after applying the pre/post 115 * If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
125 if (!masks) 125 if (!masks)
126 goto out; 126 goto out;
127 127
128 node_to_present_cpumask = alloc_node_to_present_cpumask(); 128 node_to_possible_cpumask = alloc_node_to_possible_cpumask();
129 if (!node_to_present_cpumask) 129 if (!node_to_possible_cpumask)
130 goto out; 130 goto out;
131 131
132 /* Fill out vectors at the beginning that don't need affinity */ 132 /* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
135 135
136 /* Stabilize the cpumasks */ 136 /* Stabilize the cpumasks */
137 get_online_cpus(); 137 get_online_cpus();
138 build_node_to_present_cpumask(node_to_present_cpumask); 138 build_node_to_possible_cpumask(node_to_possible_cpumask);
139 nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, 139 nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
140 &nodemsk); 140 &nodemsk);
141 141
142 /* 142 /*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
146 if (affv <= nodes) { 146 if (affv <= nodes) {
147 for_each_node_mask(n, nodemsk) { 147 for_each_node_mask(n, nodemsk) {
148 cpumask_copy(masks + curvec, 148 cpumask_copy(masks + curvec,
149 node_to_present_cpumask[n]); 149 node_to_possible_cpumask[n]);
150 if (++curvec == last_affv) 150 if (++curvec == last_affv)
151 break; 151 break;
152 } 152 }
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
160 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; 160 vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
161 161
162 /* Get the cpus on this node which are in the mask */ 162 /* Get the cpus on this node which are in the mask */
163 cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); 163 cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
164 164
165 /* Calculate the number of cpus per vector */ 165 /* Calculate the number of cpus per vector */
166 ncpus = cpumask_weight(nmsk); 166 ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ done:
192 /* Fill out vectors at the end that don't need affinity */ 192 /* Fill out vectors at the end that don't need affinity */
193 for (; curvec < nvecs; curvec++) 193 for (; curvec < nvecs; curvec++)
194 cpumask_copy(masks + curvec, irq_default_affinity); 194 cpumask_copy(masks + curvec, irq_default_affinity);
195 free_node_to_present_cpumask(node_to_present_cpumask); 195 free_node_to_possible_cpumask(node_to_possible_cpumask);
196out: 196out:
197 free_cpumask_var(nmsk); 197 free_cpumask_var(nmsk);
198 return masks; 198 return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
214 return 0; 214 return 0;
215 215
216 get_online_cpus(); 216 get_online_cpus();
217 ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; 217 ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
218 put_online_cpus(); 218 put_online_cpus();
219 return ret; 219 return ret;
220} 220}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a46be1261c09..11b4282c2d20 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb)
240static void hib_end_io(struct bio *bio) 240static void hib_end_io(struct bio *bio)
241{ 241{
242 struct hib_bio_batch *hb = bio->bi_private; 242 struct hib_bio_batch *hb = bio->bi_private;
243 struct page *page = bio->bi_io_vec[0].bv_page; 243 struct page *page = bio_first_page_all(bio);
244 244
245 if (bio->bi_status) { 245 if (bio->bi_status) {
246 pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", 246 pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
diff --git a/lib/Kconfig b/lib/Kconfig
index c5e84fbcb30b..4dd5c11366f9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -409,6 +409,10 @@ config HAS_DMA
409 depends on !NO_DMA 409 depends on !NO_DMA
410 default y 410 default y
411 411
412config SGL_ALLOC
413 bool
414 default n
415
412config DMA_NOOP_OPS 416config DMA_NOOP_OPS
413 bool 417 bool
414 depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) 418 depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 80aa8d5463fa..42b5ca0acf93 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -462,7 +462,7 @@ static void sbq_wake_up(struct sbitmap_queue *sbq)
462 */ 462 */
463 atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); 463 atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
464 sbq_index_atomic_inc(&sbq->wake_index); 464 sbq_index_atomic_inc(&sbq->wake_index);
465 wake_up(&ws->wait); 465 wake_up_nr(&ws->wait, wake_batch);
466 } 466 }
467} 467}
468 468
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 7c1c55f7daaa..53728d391d3a 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -474,6 +474,133 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
474} 474}
475EXPORT_SYMBOL(sg_alloc_table_from_pages); 475EXPORT_SYMBOL(sg_alloc_table_from_pages);
476 476
477#ifdef CONFIG_SGL_ALLOC
478
479/**
480 * sgl_alloc_order - allocate a scatterlist and its pages
481 * @length: Length in bytes of the scatterlist. Must be at least one
482 * @order: Second argument for alloc_pages()
483 * @chainable: Whether or not to allocate an extra element in the scatterlist
484 * for scatterlist chaining purposes
485 * @gfp: Memory allocation flags
486 * @nent_p: [out] Number of entries in the scatterlist that have pages
487 *
488 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
489 */
490struct scatterlist *sgl_alloc_order(unsigned long long length,
491 unsigned int order, bool chainable,
492 gfp_t gfp, unsigned int *nent_p)
493{
494 struct scatterlist *sgl, *sg;
495 struct page *page;
496 unsigned int nent, nalloc;
497 u32 elem_len;
498
499 nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order);
500 /* Check for integer overflow */
501 if (length > (nent << (PAGE_SHIFT + order)))
502 return NULL;
503 nalloc = nent;
504 if (chainable) {
505 /* Check for integer overflow */
506 if (nalloc + 1 < nalloc)
507 return NULL;
508 nalloc++;
509 }
510 sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
511 (gfp & ~GFP_DMA) | __GFP_ZERO);
512 if (!sgl)
513 return NULL;
514
515 sg_init_table(sgl, nalloc);
516 sg = sgl;
517 while (length) {
518 elem_len = min_t(u64, length, PAGE_SIZE << order);
519 page = alloc_pages(gfp, order);
520 if (!page) {
521 sgl_free(sgl);
522 return NULL;
523 }
524
525 sg_set_page(sg, page, elem_len, 0);
526 length -= elem_len;
527 sg = sg_next(sg);
528 }
529 WARN_ONCE(length, "length = %lld\n", length);
530 if (nent_p)
531 *nent_p = nent;
532 return sgl;
533}
534EXPORT_SYMBOL(sgl_alloc_order);
535
536/**
537 * sgl_alloc - allocate a scatterlist and its pages
538 * @length: Length in bytes of the scatterlist
539 * @gfp: Memory allocation flags
540 * @nent_p: [out] Number of entries in the scatterlist
541 *
542 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
543 */
544struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
545 unsigned int *nent_p)
546{
547 return sgl_alloc_order(length, 0, false, gfp, nent_p);
548}
549EXPORT_SYMBOL(sgl_alloc);
550
551/**
552 * sgl_free_n_order - free a scatterlist and its pages
553 * @sgl: Scatterlist with one or more elements
554 * @nents: Maximum number of elements to free
555 * @order: Second argument for __free_pages()
556 *
557 * Notes:
558 * - If several scatterlists have been chained and each chain element is
559 * freed separately then it's essential to set nents correctly to avoid that a
560 * page would get freed twice.
561 * - All pages in a chained scatterlist can be freed at once by setting @nents
562 * to a high number.
563 */
564void sgl_free_n_order(struct scatterlist *sgl, int nents, int order)
565{
566 struct scatterlist *sg;
567 struct page *page;
568 int i;
569
570 for_each_sg(sgl, sg, nents, i) {
571 if (!sg)
572 break;
573 page = sg_page(sg);
574 if (page)
575 __free_pages(page, order);
576 }
577 kfree(sgl);
578}
579EXPORT_SYMBOL(sgl_free_n_order);
580
581/**
582 * sgl_free_order - free a scatterlist and its pages
583 * @sgl: Scatterlist with one or more elements
584 * @order: Second argument for __free_pages()
585 */
586void sgl_free_order(struct scatterlist *sgl, int order)
587{
588 sgl_free_n_order(sgl, INT_MAX, order);
589}
590EXPORT_SYMBOL(sgl_free_order);
591
592/**
593 * sgl_free - free a scatterlist and its pages
594 * @sgl: Scatterlist with one or more elements
595 */
596void sgl_free(struct scatterlist *sgl)
597{
598 sgl_free_order(sgl, 0);
599}
600EXPORT_SYMBOL(sgl_free);
601
602#endif /* CONFIG_SGL_ALLOC */
603
477void __sg_page_iter_start(struct sg_page_iter *piter, 604void __sg_page_iter_start(struct sg_page_iter *piter,
478 struct scatterlist *sglist, unsigned int nents, 605 struct scatterlist *sglist, unsigned int nents,
479 unsigned long pgoffset) 606 unsigned long pgoffset)
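The sgl_alloc*/sgl_free* family gives drivers (whose Kconfig entries select SGL_ALLOC) a common way to allocate a scatterlist together with its backing pages and to free both again. A minimal usage sketch with hypothetical my_* wrappers:

static struct scatterlist *my_alloc_bounce(size_t len, unsigned int *nents)
{
	/* order-0 pages, no chaining; returns NULL on allocation failure */
	return sgl_alloc(len, GFP_KERNEL, nents);
}

static void my_free_bounce(struct scatterlist *sgl)
{
	/* releases the pages and then the scatterlist array itself */
	sgl_free(sgl);
}

Chained scatterlists freed piecewise should go through sgl_free_n_order() with an accurate nents, per the note above, so no page is freed twice.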
diff --git a/mm/page_io.c b/mm/page_io.c
index e93f1a4cacd7..b41cf9644585 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -50,7 +50,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
50 50
51void end_swap_bio_write(struct bio *bio) 51void end_swap_bio_write(struct bio *bio)
52{ 52{
53 struct page *page = bio->bi_io_vec[0].bv_page; 53 struct page *page = bio_first_page_all(bio);
54 54
55 if (bio->bi_status) { 55 if (bio->bi_status) {
56 SetPageError(page); 56 SetPageError(page);
@@ -122,7 +122,7 @@ static void swap_slot_free_notify(struct page *page)
122 122
123static void end_swap_bio_read(struct bio *bio) 123static void end_swap_bio_read(struct bio *bio)
124{ 124{
125 struct page *page = bio->bi_io_vec[0].bv_page; 125 struct page *page = bio_first_page_all(bio);
126 struct task_struct *waiter = bio->bi_private; 126 struct task_struct *waiter = bio->bi_private;
127 127
128 if (bio->bi_status) { 128 if (bio->bi_status) {