author     Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-01-29 14:51:49 -0500
commit     0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0
tree       cefccd67dc1f27bb45830f6b8065dd4a1c05e83b
parent     9697e9da84299d0d715d515dd2cc48f1eceb277d
parent     796baeeef85a40b3495a907fb7425086e7010102
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main pull request for block IO related changes for the
4.16 kernel. Nothing major in this pull request, but a good amount of
improvements and fixes all over the map. This contains:
- BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
Paolo.
- Support for SMR zones for deadline and mq-deadline from Damien and
Christoph.
- Set of fixes for bcache by way of Michael Lyle, including fixes
from himself, Kent, Rui, Tang, and Coly.
- Series from Matias for lightnvm with fixes from Hans Holmberg,
Javier, and Matias. Mostly centered around pblk, and the removal of
rrpc 1.2 in preparation for supporting 2.0.
- A couple of NVMe pull requests from Christoph. Nothing major in
here, just fixes and cleanups, and support for command tracing from
Johannes.
- Support in blk-throttle for tracking reads and writes separately.
From Joseph Qi. A few cleanups/fixes also for blk-throttle from
Weiping.
- Series from Mike Snitzer that enables dm to register its queue more
logically, something that's always been problematic on dm since
it's a stacked device.
- Series from Ming cleaning up some of the bio accessor use, in
preparation for supporting multipage bvecs.
- Various fixes from Ming closing up holes around queue mapping and
quiescing.
- BSD partition fix from Richard Narron, fixing a problem where we
can't mount newer (10/11) FreeBSD partitions.
- Series from Tejun reworking blk-mq timeout handling. The previous
scheme relied on atomic bits, but it had races where we would think
a request had timed out if it was reused at the wrong time.
- null_blk now supports faking timeouts, to enable us to better
exercise and test that functionality separately. From me.
- Kill the separate atomic poll bit in the request struct. After
this, we don't use the atomic bits on blk-mq anymore at all. From
me.
- sgl_alloc/free helpers from Bart.
- Heavily contended tag case scalability improvement from me.
- Various little fixes and cleanups from Arnd, Bart, Corentin,
Douglas, Eryu, Goldwyn, and myself"
* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
block: remove smart1,2.h
nvme: add tracepoint for nvme_complete_rq
nvme: add tracepoint for nvme_setup_cmd
nvme-pci: introduce RECONNECTING state to mark initializing procedure
nvme-rdma: remove redundant boolean for inline_data
nvme: don't free uuid pointer before printing it
nvme-pci: Suspend queues after deleting them
bsg: use pr_debug instead of hand crafted macros
blk-mq-debugfs: don't allow write on attributes with seq_operations set
nvme-pci: Fix queue double allocations
block: Set BIO_TRACE_COMPLETION on new bio during split
blk-throttle: use queue_is_rq_based
block: Remove kblockd_schedule_delayed_work{,_on}()
blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
lib/scatterlist: Fix chaining support in sgl_alloc_order()
blk-throttle: track read and write request individually
block: add bdev_read_only() checks to common helpers
block: fail op_is_write() requests to read-only partitions
blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
...
124 files changed, 3884 insertions, 4729 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index da1525ec4c87..d819dc77fe65 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) | |||
775 | unsigned long flags; | 775 | unsigned long flags; |
776 | int i; | 776 | int i; |
777 | 777 | ||
778 | spin_lock_irqsave(&bfqd->lock, flags); | ||
779 | |||
778 | if (!entity) /* root group */ | 780 | if (!entity) /* root group */ |
779 | return; | 781 | goto put_async_queues; |
780 | 782 | ||
781 | spin_lock_irqsave(&bfqd->lock, flags); | ||
782 | /* | 783 | /* |
783 | * Empty all service_trees belonging to this group before | 784 | * Empty all service_trees belonging to this group before |
784 | * deactivating the group itself. | 785 | * deactivating the group itself. |
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) | |||
809 | } | 810 | } |
810 | 811 | ||
811 | __bfq_deactivate_entity(entity, false); | 812 | __bfq_deactivate_entity(entity, false); |
813 | |||
814 | put_async_queues: | ||
812 | bfq_put_async_queues(bfqd, bfqg); | 815 | bfq_put_async_queues(bfqd, bfqg); |
813 | 816 | ||
814 | spin_unlock_irqrestore(&bfqd->lock, flags); | 817 | spin_unlock_irqrestore(&bfqd->lock, flags); |
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bcb6d21baf12..47e6ec7427c4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10; | |||
166 | /* Default timeout values, in jiffies, approximating CFQ defaults. */ | 166 | /* Default timeout values, in jiffies, approximating CFQ defaults. */ |
167 | const int bfq_timeout = HZ / 8; | 167 | const int bfq_timeout = HZ / 8; |
168 | 168 | ||
169 | /* | ||
170 | * Time limit for merging (see comments in bfq_setup_cooperator). Set | ||
171 | * to the slowest value that, in our tests, proved to be effective in | ||
172 | * removing false positives, while not causing true positives to miss | ||
173 | * queue merging. | ||
174 | * | ||
175 | * As can be deduced from the low time limit below, queue merging, if | ||
176 | * successful, happens at the very beginning of the I/O of the involved | ||
177 | * cooperating processes, as a consequence of the arrival of the very | ||
178 | * first requests from each cooperator. After that, there is very | ||
179 | * little chance to find cooperators. | ||
180 | */ | ||
181 | static const unsigned long bfq_merge_time_limit = HZ/10; | ||
182 | |||
169 | static struct kmem_cache *bfq_pool; | 183 | static struct kmem_cache *bfq_pool; |
170 | 184 | ||
171 | /* Below this threshold (in ns), we consider thinktime immediate. */ | 185 | /* Below this threshold (in ns), we consider thinktime immediate. */ |
@@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool; | |||
178 | #define BFQQ_SEEK_THR (sector_t)(8 * 100) | 192 | #define BFQQ_SEEK_THR (sector_t)(8 * 100) |
179 | #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) | 193 | #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) |
180 | #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) | 194 | #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) |
181 | #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) | 195 | #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) |
182 | 196 | ||
183 | /* Min number of samples required to perform peak-rate update */ | 197 | /* Min number of samples required to perform peak-rate update */ |
184 | #define BFQ_RATE_MIN_SAMPLES 32 | 198 | #define BFQ_RATE_MIN_SAMPLES 32 |
@@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool; | |||
195 | * interactive applications automatically, using the following formula: | 209 | * interactive applications automatically, using the following formula: |
196 | * duration = (R / r) * T, where r is the peak rate of the device, and | 210 | * duration = (R / r) * T, where r is the peak rate of the device, and |
197 | * R and T are two reference parameters. | 211 | * R and T are two reference parameters. |
198 | * In particular, R is the peak rate of the reference device (see below), | 212 | * In particular, R is the peak rate of the reference device (see |
199 | * and T is a reference time: given the systems that are likely to be | 213 | * below), and T is a reference time: given the systems that are |
200 | * installed on the reference device according to its speed class, T is | 214 | * likely to be installed on the reference device according to its |
201 | * about the maximum time needed, under BFQ and while reading two files in | 215 | * speed class, T is about the maximum time needed, under BFQ and |
202 | * parallel, to load typical large applications on these systems. | 216 | * while reading two files in parallel, to load typical large |
203 | * In practice, the slower/faster the device at hand is, the more/less it | 217 | * applications on these systems (see the comments on |
204 | * takes to load applications with respect to the reference device. | 218 | * max_service_from_wr below, for more details on how T is obtained). |
205 | * Accordingly, the longer/shorter BFQ grants weight raising to interactive | 219 | * In practice, the slower/faster the device at hand is, the more/less |
206 | * applications. | 220 | * it takes to load applications with respect to the reference device. |
221 | * Accordingly, the longer/shorter BFQ grants weight raising to | ||
222 | * interactive applications. | ||
207 | * | 223 | * |
208 | * BFQ uses four different reference pairs (R, T), depending on: | 224 | * BFQ uses four different reference pairs (R, T), depending on: |
209 | * . whether the device is rotational or non-rotational; | 225 | * . whether the device is rotational or non-rotational; |
@@ -240,6 +256,60 @@ static int T_slow[2]; | |||
240 | static int T_fast[2]; | 256 | static int T_fast[2]; |
241 | static int device_speed_thresh[2]; | 257 | static int device_speed_thresh[2]; |
242 | 258 | ||
259 | /* | ||
260 | * BFQ uses the above-detailed, time-based weight-raising mechanism to | ||
261 | * privilege interactive tasks. This mechanism is vulnerable to the | ||
262 | * following false positives: I/O-bound applications that will go on | ||
263 | * doing I/O for much longer than the duration of weight | ||
264 | * raising. These applications have basically no benefit from being | ||
265 | * weight-raised at the beginning of their I/O. On the opposite end, | ||
266 | * while being weight-raised, these applications | ||
267 | * a) unjustly steal throughput to applications that may actually need | ||
268 | * low latency; | ||
269 | * b) make BFQ uselessly perform device idling; device idling results | ||
270 | * in loss of device throughput with most flash-based storage, and may | ||
271 | * increase latencies when used purposelessly. | ||
272 | * | ||
273 | * BFQ tries to reduce these problems, by adopting the following | ||
274 | * countermeasure. To introduce this countermeasure, we need first to | ||
275 | * finish explaining how the duration of weight-raising for | ||
276 | * interactive tasks is computed. | ||
277 | * | ||
278 | * For a bfq_queue deemed as interactive, the duration of weight | ||
279 | * raising is dynamically adjusted, as a function of the estimated | ||
280 | * peak rate of the device, so as to be equal to the time needed to | ||
281 | * execute the 'largest' interactive task we benchmarked so far. By | ||
282 | * largest task, we mean the task for which each involved process has | ||
283 | * to do more I/O than for any of the other tasks we benchmarked. This | ||
284 | * reference interactive task is the start-up of LibreOffice Writer, | ||
285 | * and in this task each process/bfq_queue needs to have at most ~110K | ||
286 | * sectors transferred. | ||
287 | * | ||
288 | * This last piece of information enables BFQ to reduce the actual | ||
289 | * duration of weight-raising for at least one class of I/O-bound | ||
290 | * applications: those doing sequential or quasi-sequential I/O. An | ||
291 | * example is file copy. In fact, once started, the main I/O-bound | ||
292 | * processes of these applications usually consume the above 110K | ||
293 | * sectors in much less time than the processes of an application that | ||
294 | * is starting, because these I/O-bound processes will greedily devote | ||
295 | * almost all their CPU cycles only to their target, | ||
296 | * throughput-friendly I/O operations. This is even more true if BFQ | ||
297 | * happens to be underestimating the device peak rate, and thus | ||
298 | * overestimating the duration of weight raising. But, according to | ||
299 | * our measurements, once transferred 110K sectors, these processes | ||
300 | * have no right to be weight-raised any longer. | ||
301 | * | ||
302 | * Basing on the last consideration, BFQ ends weight-raising for a | ||
303 | * bfq_queue if the latter happens to have received an amount of | ||
304 | * service at least equal to the following constant. The constant is | ||
305 | * set to slightly more than 110K, to have a minimum safety margin. | ||
306 | * | ||
307 | * This early ending of weight-raising reduces the amount of time | ||
308 | * during which interactive false positives cause the two problems | ||
309 | * described at the beginning of these comments. | ||
310 | */ | ||
311 | static const unsigned long max_service_from_wr = 120000; | ||
312 | |||
243 | #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) | 313 | #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) |
244 | #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) | 314 | #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
245 | 315 | ||
@@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, | |||
403 | } | 473 | } |
404 | } | 474 | } |
405 | 475 | ||
476 | /* | ||
477 | * See the comments on bfq_limit_depth for the purpose of | ||
478 | * the depths set in the function. | ||
479 | */ | ||
480 | static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) | ||
481 | { | ||
482 | bfqd->sb_shift = bt->sb.shift; | ||
483 | |||
484 | /* | ||
485 | * In-word depths if no bfq_queue is being weight-raised: | ||
486 | * leaving 25% of tags only for sync reads. | ||
487 | * | ||
488 | * In next formulas, right-shift the value | ||
489 | * (1U<<bfqd->sb_shift), instead of computing directly | ||
490 | * (1U<<(bfqd->sb_shift - something)), to be robust against | ||
491 | * any possible value of bfqd->sb_shift, without having to | ||
492 | * limit 'something'. | ||
493 | */ | ||
494 | /* no more than 50% of tags for async I/O */ | ||
495 | bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); | ||
496 | /* | ||
497 | * no more than 75% of tags for sync writes (25% extra tags | ||
498 | * w.r.t. async I/O, to prevent async I/O from starving sync | ||
499 | * writes) | ||
500 | */ | ||
501 | bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); | ||
502 | |||
503 | /* | ||
504 | * In-word depths in case some bfq_queue is being weight- | ||
505 | * raised: leaving ~63% of tags for sync reads. This is the | ||
506 | * highest percentage for which, in our tests, application | ||
507 | * start-up times didn't suffer from any regression due to tag | ||
508 | * shortage. | ||
509 | */ | ||
510 | /* no more than ~18% of tags for async I/O */ | ||
511 | bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); | ||
512 | /* no more than ~37% of tags for sync writes (~20% extra tags) */ | ||
513 | bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * Async I/O can easily starve sync I/O (both sync reads and sync | ||
518 | * writes), by consuming all tags. Similarly, storms of sync writes, | ||
519 | * such as those that sync(2) may trigger, can starve sync reads. | ||
520 | * Limit depths of async I/O and sync writes so as to counter both | ||
521 | * problems. | ||
522 | */ | ||
523 | static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) | ||
524 | { | ||
525 | struct blk_mq_tags *tags = blk_mq_tags_from_data(data); | ||
526 | struct bfq_data *bfqd = data->q->elevator->elevator_data; | ||
527 | struct sbitmap_queue *bt; | ||
528 | |||
529 | if (op_is_sync(op) && !op_is_write(op)) | ||
530 | return; | ||
531 | |||
532 | if (data->flags & BLK_MQ_REQ_RESERVED) { | ||
533 | if (unlikely(!tags->nr_reserved_tags)) { | ||
534 | WARN_ON_ONCE(1); | ||
535 | return; | ||
536 | } | ||
537 | bt = &tags->breserved_tags; | ||
538 | } else | ||
539 | bt = &tags->bitmap_tags; | ||
540 | |||
541 | if (unlikely(bfqd->sb_shift != bt->sb.shift)) | ||
542 | bfq_update_depths(bfqd, bt); | ||
543 | |||
544 | data->shallow_depth = | ||
545 | bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; | ||
546 | |||
547 | bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", | ||
548 | __func__, bfqd->wr_busy_queues, op_is_sync(op), | ||
549 | data->shallow_depth); | ||
550 | } | ||
551 | |||
406 | static struct bfq_queue * | 552 | static struct bfq_queue * |
407 | bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, | 553 | bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, |
408 | sector_t sector, struct rb_node **ret_parent, | 554 | sector_t sector, struct rb_node **ret_parent, |
@@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, | |||
444 | return bfqq; | 590 | return bfqq; |
445 | } | 591 | } |
446 | 592 | ||
593 | static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) | ||
594 | { | ||
595 | return bfqq->service_from_backlogged > 0 && | ||
596 | time_is_before_jiffies(bfqq->first_IO_time + | ||
597 | bfq_merge_time_limit); | ||
598 | } | ||
599 | |||
447 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) | 600 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
448 | { | 601 | { |
449 | struct rb_node **p, *parent; | 602 | struct rb_node **p, *parent; |
@@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) | |||
454 | bfqq->pos_root = NULL; | 607 | bfqq->pos_root = NULL; |
455 | } | 608 | } |
456 | 609 | ||
610 | /* | ||
611 | * bfqq cannot be merged any longer (see comments in | ||
612 | * bfq_setup_cooperator): no point in adding bfqq into the | ||
613 | * position tree. | ||
614 | */ | ||
615 | if (bfq_too_late_for_merging(bfqq)) | ||
616 | return; | ||
617 | |||
457 | if (bfq_class_idle(bfqq)) | 618 | if (bfq_class_idle(bfqq)) |
458 | return; | 619 | return; |
459 | if (!bfqq->next_rq) | 620 | if (!bfqq->next_rq) |
@@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, | |||
1247 | if (old_wr_coeff == 1 && wr_or_deserves_wr) { | 1408 | if (old_wr_coeff == 1 && wr_or_deserves_wr) { |
1248 | /* start a weight-raising period */ | 1409 | /* start a weight-raising period */ |
1249 | if (interactive) { | 1410 | if (interactive) { |
1411 | bfqq->service_from_wr = 0; | ||
1250 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; | 1412 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; |
1251 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); | 1413 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
1252 | } else { | 1414 | } else { |
@@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q, | |||
1627 | rb_erase(&bfqq->pos_node, bfqq->pos_root); | 1789 | rb_erase(&bfqq->pos_node, bfqq->pos_root); |
1628 | bfqq->pos_root = NULL; | 1790 | bfqq->pos_root = NULL; |
1629 | } | 1791 | } |
1792 | } else { | ||
1793 | bfq_pos_tree_add_move(bfqd, bfqq); | ||
1630 | } | 1794 | } |
1631 | 1795 | ||
1632 | if (rq->cmd_flags & REQ_META) | 1796 | if (rq->cmd_flags & REQ_META) |
@@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) | |||
1933 | static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, | 2097 | static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, |
1934 | struct bfq_queue *new_bfqq) | 2098 | struct bfq_queue *new_bfqq) |
1935 | { | 2099 | { |
2100 | if (bfq_too_late_for_merging(new_bfqq)) | ||
2101 | return false; | ||
2102 | |||
1936 | if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || | 2103 | if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || |
1937 | (bfqq->ioprio_class != new_bfqq->ioprio_class)) | 2104 | (bfqq->ioprio_class != new_bfqq->ioprio_class)) |
1938 | return false; | 2105 | return false; |
@@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, | |||
1957 | } | 2124 | } |
1958 | 2125 | ||
1959 | /* | 2126 | /* |
1960 | * If this function returns true, then bfqq cannot be merged. The idea | ||
1961 | * is that true cooperation happens very early after processes start | ||
1962 | * to do I/O. Usually, late cooperations are just accidental false | ||
1963 | * positives. In case bfqq is weight-raised, such false positives | ||
1964 | * would evidently degrade latency guarantees for bfqq. | ||
1965 | */ | ||
1966 | static bool wr_from_too_long(struct bfq_queue *bfqq) | ||
1967 | { | ||
1968 | return bfqq->wr_coeff > 1 && | ||
1969 | time_is_before_jiffies(bfqq->last_wr_start_finish + | ||
1970 | msecs_to_jiffies(100)); | ||
1971 | } | ||
1972 | |||
1973 | /* | ||
1974 | * Attempt to schedule a merge of bfqq with the currently in-service | 2127 | * Attempt to schedule a merge of bfqq with the currently in-service |
1975 | * queue or with a close queue among the scheduled queues. Return | 2128 | * queue or with a close queue among the scheduled queues. Return |
1976 | * NULL if no merge was scheduled, a pointer to the shared bfq_queue | 2129 | * NULL if no merge was scheduled, a pointer to the shared bfq_queue |
@@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) | |||
1983 | * to maintain. Besides, in such a critical condition as an out of memory, | 2136 | * to maintain. Besides, in such a critical condition as an out of memory, |
1984 | * the benefits of queue merging may be little relevant, or even negligible. | 2137 | * the benefits of queue merging may be little relevant, or even negligible. |
1985 | * | 2138 | * |
1986 | * Weight-raised queues can be merged only if their weight-raising | ||
1987 | * period has just started. In fact cooperating processes are usually | ||
1988 | * started together. Thus, with this filter we avoid false positives | ||
1989 | * that would jeopardize low-latency guarantees. | ||
1990 | * | ||
1991 | * WARNING: queue merging may impair fairness among non-weight raised | 2139 | * WARNING: queue merging may impair fairness among non-weight raised |
1992 | * queues, for at least two reasons: 1) the original weight of a | 2140 | * queues, for at least two reasons: 1) the original weight of a |
1993 | * merged queue may change during the merged state, 2) even being the | 2141 | * merged queue may change during the merged state, 2) even being the |
@@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
2001 | { | 2149 | { |
2002 | struct bfq_queue *in_service_bfqq, *new_bfqq; | 2150 | struct bfq_queue *in_service_bfqq, *new_bfqq; |
2003 | 2151 | ||
2152 | /* | ||
2153 | * Prevent bfqq from being merged if it has been created too | ||
2154 | * long ago. The idea is that true cooperating processes, and | ||
2155 | * thus their associated bfq_queues, are supposed to be | ||
2156 | * created shortly after each other. This is the case, e.g., | ||
2157 | * for KVM/QEMU and dump I/O threads. Basing on this | ||
2158 | * assumption, the following filtering greatly reduces the | ||
2159 | * probability that two non-cooperating processes, which just | ||
2160 | * happen to do close I/O for some short time interval, have | ||
2161 | * their queues merged by mistake. | ||
2162 | */ | ||
2163 | if (bfq_too_late_for_merging(bfqq)) | ||
2164 | return NULL; | ||
2165 | |||
2004 | if (bfqq->new_bfqq) | 2166 | if (bfqq->new_bfqq) |
2005 | return bfqq->new_bfqq; | 2167 | return bfqq->new_bfqq; |
2006 | 2168 | ||
2007 | if (!io_struct || | 2169 | if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) |
2008 | wr_from_too_long(bfqq) || | ||
2009 | unlikely(bfqq == &bfqd->oom_bfqq)) | ||
2010 | return NULL; | 2170 | return NULL; |
2011 | 2171 | ||
2012 | /* If there is only one backlogged queue, don't search. */ | 2172 | /* If there is only one backlogged queue, don't search. */ |
@@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
2015 | 2175 | ||
2016 | in_service_bfqq = bfqd->in_service_queue; | 2176 | in_service_bfqq = bfqd->in_service_queue; |
2017 | 2177 | ||
2018 | if (!in_service_bfqq || in_service_bfqq == bfqq | 2178 | if (in_service_bfqq && in_service_bfqq != bfqq && |
2019 | || wr_from_too_long(in_service_bfqq) || | 2179 | likely(in_service_bfqq != &bfqd->oom_bfqq) && |
2020 | unlikely(in_service_bfqq == &bfqd->oom_bfqq)) | 2180 | bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && |
2021 | goto check_scheduled; | ||
2022 | |||
2023 | if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && | ||
2024 | bfqq->entity.parent == in_service_bfqq->entity.parent && | 2181 | bfqq->entity.parent == in_service_bfqq->entity.parent && |
2025 | bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { | 2182 | bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { |
2026 | new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); | 2183 | new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); |
@@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
2032 | * queues. The only thing we need is that the bio/request is not | 2189 | * queues. The only thing we need is that the bio/request is not |
2033 | * NULL, as we need it to establish whether a cooperator exists. | 2190 | * NULL, as we need it to establish whether a cooperator exists. |
2034 | */ | 2191 | */ |
2035 | check_scheduled: | ||
2036 | new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, | 2192 | new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, |
2037 | bfq_io_struct_pos(io_struct, request)); | 2193 | bfq_io_struct_pos(io_struct, request)); |
2038 | 2194 | ||
2039 | if (new_bfqq && !wr_from_too_long(new_bfqq) && | 2195 | if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && |
2040 | likely(new_bfqq != &bfqd->oom_bfqq) && | ||
2041 | bfq_may_be_close_cooperator(bfqq, new_bfqq)) | 2196 | bfq_may_be_close_cooperator(bfqq, new_bfqq)) |
2042 | return bfq_setup_merge(bfqq, new_bfqq); | 2197 | return bfq_setup_merge(bfqq, new_bfqq); |
2043 | 2198 | ||
@@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) | |||
2062 | bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); | 2217 | bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); |
2063 | bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); | 2218 | bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); |
2064 | if (unlikely(bfq_bfqq_just_created(bfqq) && | 2219 | if (unlikely(bfq_bfqq_just_created(bfqq) && |
2065 | !bfq_bfqq_in_large_burst(bfqq))) { | 2220 | !bfq_bfqq_in_large_burst(bfqq) && |
2221 | bfqq->bfqd->low_latency)) { | ||
2066 | /* | 2222 | /* |
2067 | * bfqq being merged right after being created: bfqq | 2223 | * bfqq being merged right after being created: bfqq |
2068 | * would have deserved interactive weight raising, but | 2224 | * would have deserved interactive weight raising, but |
@@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
2917 | * whereas soft_rt_next_start is set to infinity for applications that do | 3073 | * whereas soft_rt_next_start is set to infinity for applications that do |
2918 | * not. | 3074 | * not. |
2919 | * | 3075 | * |
2920 | * Unfortunately, even a greedy application may happen to behave in an | 3076 | * Unfortunately, even a greedy (i.e., I/O-bound) application may |
2921 | * isochronous way if the CPU load is high. In fact, the application may | 3077 | * happen to meet, occasionally or systematically, both the above |
2922 | * stop issuing requests while the CPUs are busy serving other processes, | 3078 | * bandwidth and isochrony requirements. This may happen at least in |
2923 | * then restart, then stop again for a while, and so on. In addition, if | 3079 | * the following circumstances. First, if the CPU load is high. The |
2924 | * the disk achieves a low enough throughput with the request pattern | 3080 | * application may stop issuing requests while the CPUs are busy |
2925 | * issued by the application (e.g., because the request pattern is random | 3081 | * serving other processes, then restart, then stop again for a while, |
2926 | * and/or the device is slow), then the application may meet the above | 3082 | * and so on. The other circumstances are related to the storage |
2927 | * bandwidth requirement too. To prevent such a greedy application to be | 3083 | * device: the storage device is highly loaded or reaches a low-enough |
2928 | * deemed as soft real-time, a further rule is used in the computation of | 3084 | * throughput with the I/O of the application (e.g., because the I/O |
2929 | * soft_rt_next_start: soft_rt_next_start must be higher than the current | 3085 | * is random and/or the device is slow). In all these cases, the |
2930 | * time plus the maximum time for which the arrival of a request is waited | 3086 | * I/O of the application may be simply slowed down enough to meet |
2931 | * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. | 3087 | * the bandwidth and isochrony requirements. To reduce the probability |
2932 | * This filters out greedy applications, as the latter issue instead their | 3088 | * that greedy applications are deemed as soft real-time in these |
2933 | * next request as soon as possible after the last one has been completed | 3089 | * corner cases, a further rule is used in the computation of |
2934 | * (in contrast, when a batch of requests is completed, a soft real-time | 3090 | * soft_rt_next_start: the return value of this function is forced to |
2935 | * application spends some time processing data). | 3091 | * be higher than the maximum between the following two quantities. |
3092 | * | ||
3093 | * (a) Current time plus: (1) the maximum time for which the arrival | ||
3094 | * of a request is waited for when a sync queue becomes idle, | ||
3095 | * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We | ||
3096 | * postpone for a moment the reason for adding a few extra | ||
3097 | * jiffies; we get back to it after next item (b). Lower-bounding | ||
3098 | * the return value of this function with the current time plus | ||
3099 | * bfqd->bfq_slice_idle tends to filter out greedy applications, | ||
3100 | * because the latter issue their next request as soon as possible | ||
3101 | * after the last one has been completed. In contrast, a soft | ||
3102 | * real-time application spends some time processing data, after a | ||
3103 | * batch of its requests has been completed. | ||
2936 | * | 3104 | * |
2937 | * Unfortunately, the last filter may easily generate false positives if | 3105 | * (b) Current value of bfqq->soft_rt_next_start. As pointed out |
2938 | * only bfqd->bfq_slice_idle is used as a reference time interval and one | 3106 | * above, greedy applications may happen to meet both the |
2939 | * or both the following cases occur: | 3107 | * bandwidth and isochrony requirements under heavy CPU or |
2940 | * 1) HZ is so low that the duration of a jiffy is comparable to or higher | 3108 | * storage-device load. In more detail, in these scenarios, these |
2941 | * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with | 3109 | * applications happen, only for limited time periods, to do I/O |
2942 | * HZ=100. | 3110 | * slowly enough to meet all the requirements described so far, |
3111 | * including the filtering in above item (a). These slow-speed | ||
3112 | * time intervals are usually interspersed between other time | ||
3113 | * intervals during which these applications do I/O at a very high | ||
3114 | * speed. Fortunately, exactly because of the high speed of the | ||
3115 | * I/O in the high-speed intervals, the values returned by this | ||
3116 | * function happen to be so high, near the end of any such | ||
3117 | * high-speed interval, to be likely to fall *after* the end of | ||
3118 | * the low-speed time interval that follows. These high values are | ||
3119 | * stored in bfqq->soft_rt_next_start after each invocation of | ||
3120 | * this function. As a consequence, if the last value of | ||
3121 | * bfqq->soft_rt_next_start is constantly used to lower-bound the | ||
3122 | * next value that this function may return, then, from the very | ||
3123 | * beginning of a low-speed interval, bfqq->soft_rt_next_start is | ||
3124 | * likely to be constantly kept so high that any I/O request | ||
3125 | * issued during the low-speed interval is considered as arriving | ||
3126 | * to soon for the application to be deemed as soft | ||
3127 | * real-time. Then, in the high-speed interval that follows, the | ||
3128 | * application will not be deemed as soft real-time, just because | ||
3129 | * it will do I/O at a high speed. And so on. | ||
3130 | * | ||
3131 | * Getting back to the filtering in item (a), in the following two | ||
3132 | * cases this filtering might be easily passed by a greedy | ||
3133 | * application, if the reference quantity was just | ||
3134 | * bfqd->bfq_slice_idle: | ||
3135 | * 1) HZ is so low that the duration of a jiffy is comparable to or | ||
3136 | * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow | ||
3137 | * devices with HZ=100. The time granularity may be so coarse | ||
3138 | * that the approximation, in jiffies, of bfqd->bfq_slice_idle | ||
3139 | * is rather lower than the exact value. | ||
2943 | * 2) jiffies, instead of increasing at a constant rate, may stop increasing | 3140 | * 2) jiffies, instead of increasing at a constant rate, may stop increasing |
2944 | * for a while, then suddenly 'jump' by several units to recover the lost | 3141 | * for a while, then suddenly 'jump' by several units to recover the lost |
2945 | * increments. This seems to happen, e.g., inside virtual machines. | 3142 | * increments. This seems to happen, e.g., inside virtual machines. |
2946 | * To address this issue, we do not use as a reference time interval just | 3143 | * To address this issue, in the filtering in (a) we do not use as a |
2947 | * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In | 3144 | * reference time interval just bfqd->bfq_slice_idle, but |
2948 | * particular we add the minimum number of jiffies for which the filter | 3145 | * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the |
2949 | * seems to be quite precise also in embedded systems and KVM/QEMU virtual | 3146 | * minimum number of jiffies for which the filter seems to be quite |
2950 | * machines. | 3147 | * precise also in embedded systems and KVM/QEMU virtual machines. |
2951 | */ | 3148 | */ |
2952 | static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, | 3149 | static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
2953 | struct bfq_queue *bfqq) | 3150 | struct bfq_queue *bfqq) |
2954 | { | 3151 | { |
2955 | return max(bfqq->last_idle_bklogged + | 3152 | return max3(bfqq->soft_rt_next_start, |
2956 | HZ * bfqq->service_from_backlogged / | 3153 | bfqq->last_idle_bklogged + |
2957 | bfqd->bfq_wr_max_softrt_rate, | 3154 | HZ * bfqq->service_from_backlogged / |
2958 | jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); | 3155 | bfqd->bfq_wr_max_softrt_rate, |
3156 | jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); | ||
2959 | } | 3157 | } |
2960 | 3158 | ||
2961 | /** | 3159 | /** |
@@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, | |||
3000 | slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); | 3198 | slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); |
3001 | 3199 | ||
3002 | /* | 3200 | /* |
3003 | * Increase service_from_backlogged before next statement, | ||
3004 | * because the possible next invocation of | ||
3005 | * bfq_bfqq_charge_time would likely inflate | ||
3006 | * entity->service. In contrast, service_from_backlogged must | ||
3007 | * contain real service, to enable the soft real-time | ||
3008 | * heuristic to correctly compute the bandwidth consumed by | ||
3009 | * bfqq. | ||
3010 | */ | ||
3011 | bfqq->service_from_backlogged += entity->service; | ||
3012 | |||
3013 | /* | ||
3014 | * As above explained, charge slow (typically seeky) and | 3201 | * As above explained, charge slow (typically seeky) and |
3015 | * timed-out queues with the time and not the service | 3202 | * timed-out queues with the time and not the service |
3016 | * received, to favor sequential workloads. | 3203 | * received, to favor sequential workloads. |
@@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) | |||
3535 | bfqq->entity.prio_changed = 1; | 3722 | bfqq->entity.prio_changed = 1; |
3536 | } | 3723 | } |
3537 | } | 3724 | } |
3725 | if (bfqq->wr_coeff > 1 && | ||
3726 | bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && | ||
3727 | bfqq->service_from_wr > max_service_from_wr) { | ||
3728 | /* see comments on max_service_from_wr */ | ||
3729 | bfq_bfqq_end_wr(bfqq); | ||
3730 | } | ||
3538 | } | 3731 | } |
3539 | /* | 3732 | /* |
3540 | * To improve latency (for this or other queues), immediately | 3733 | * To improve latency (for this or other queues), immediately |
@@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
3630 | } | 3823 | } |
3631 | 3824 | ||
3632 | /* | 3825 | /* |
3633 | * We exploit the put_rq_private hook to decrement | 3826 | * We exploit the bfq_finish_request hook to decrement |
3634 | * rq_in_driver, but put_rq_private will not be | 3827 | * rq_in_driver, but bfq_finish_request will not be |
3635 | * invoked on this request. So, to avoid unbalance, | 3828 | * invoked on this request. So, to avoid unbalance, |
3636 | * just start this request, without incrementing | 3829 | * just start this request, without incrementing |
3637 | * rq_in_driver. As a negative consequence, | 3830 | * rq_in_driver. As a negative consequence, |
@@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
3640 | * bfq_schedule_dispatch to be invoked uselessly. | 3833 | * bfq_schedule_dispatch to be invoked uselessly. |
3641 | * | 3834 | * |
3642 | * As for implementing an exact solution, the | 3835 | * As for implementing an exact solution, the |
3643 | * put_request hook, if defined, is probably invoked | 3836 | * bfq_finish_request hook, if defined, is probably |
3644 | * also on this request. So, by exploiting this hook, | 3837 | * invoked also on this request. So, by exploiting |
3645 | * we could 1) increment rq_in_driver here, and 2) | 3838 | * this hook, we could 1) increment rq_in_driver here, |
3646 | * decrement it in put_request. Such a solution would | 3839 | * and 2) decrement it in bfq_finish_request. Such a |
3647 | * let the value of the counter be always accurate, | 3840 | * solution would let the value of the counter be |
3648 | * but it would entail using an extra interface | 3841 | * always accurate, but it would entail using an extra |
3649 | * function. This cost seems higher than the benefit, | 3842 | * interface function. This cost seems higher than the |
3650 | * being the frequency of non-elevator-private | 3843 | * benefit, being the frequency of non-elevator-private |
3651 | * requests very low. | 3844 | * requests very low. |
3652 | */ | 3845 | */ |
3653 | goto start_rq; | 3846 | goto start_rq; |
@@ -3689,35 +3882,16 @@ exit: | |||
3689 | return rq; | 3882 | return rq; |
3690 | } | 3883 | } |
3691 | 3884 | ||
3692 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | ||
3693 | { | ||
3694 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; | ||
3695 | struct request *rq; | ||
3696 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 3885 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) |
3697 | struct bfq_queue *in_serv_queue, *bfqq; | 3886 | static void bfq_update_dispatch_stats(struct request_queue *q, |
3698 | bool waiting_rq, idle_timer_disabled; | 3887 | struct request *rq, |
3699 | #endif | 3888 | struct bfq_queue *in_serv_queue, |
3700 | 3889 | bool idle_timer_disabled) | |
3701 | spin_lock_irq(&bfqd->lock); | 3890 | { |
3702 | 3891 | struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; | |
3703 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
3704 | in_serv_queue = bfqd->in_service_queue; | ||
3705 | waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); | ||
3706 | |||
3707 | rq = __bfq_dispatch_request(hctx); | ||
3708 | |||
3709 | idle_timer_disabled = | ||
3710 | waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); | ||
3711 | |||
3712 | #else | ||
3713 | rq = __bfq_dispatch_request(hctx); | ||
3714 | #endif | ||
3715 | spin_unlock_irq(&bfqd->lock); | ||
3716 | 3892 | ||
3717 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
3718 | bfqq = rq ? RQ_BFQQ(rq) : NULL; | ||
3719 | if (!idle_timer_disabled && !bfqq) | 3893 | if (!idle_timer_disabled && !bfqq) |
3720 | return rq; | 3894 | return; |
3721 | 3895 | ||
3722 | /* | 3896 | /* |
3723 | * rq and bfqq are guaranteed to exist until this function | 3897 | * rq and bfqq are guaranteed to exist until this function |
@@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
3732 | * In addition, the following queue lock guarantees that | 3906 | * In addition, the following queue lock guarantees that |
3733 | * bfqq_group(bfqq) exists as well. | 3907 | * bfqq_group(bfqq) exists as well. |
3734 | */ | 3908 | */ |
3735 | spin_lock_irq(hctx->queue->queue_lock); | 3909 | spin_lock_irq(q->queue_lock); |
3736 | if (idle_timer_disabled) | 3910 | if (idle_timer_disabled) |
3737 | /* | 3911 | /* |
3738 | * Since the idle timer has been disabled, | 3912 | * Since the idle timer has been disabled, |
@@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
3751 | bfqg_stats_set_start_empty_time(bfqg); | 3925 | bfqg_stats_set_start_empty_time(bfqg); |
3752 | bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); | 3926 | bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); |
3753 | } | 3927 | } |
3754 | spin_unlock_irq(hctx->queue->queue_lock); | 3928 | spin_unlock_irq(q->queue_lock); |
3929 | } | ||
3930 | #else | ||
3931 | static inline void bfq_update_dispatch_stats(struct request_queue *q, | ||
3932 | struct request *rq, | ||
3933 | struct bfq_queue *in_serv_queue, | ||
3934 | bool idle_timer_disabled) {} | ||
3755 | #endif | 3935 | #endif |
3756 | 3936 | ||
3937 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | ||
3938 | { | ||
3939 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; | ||
3940 | struct request *rq; | ||
3941 | struct bfq_queue *in_serv_queue; | ||
3942 | bool waiting_rq, idle_timer_disabled; | ||
3943 | |||
3944 | spin_lock_irq(&bfqd->lock); | ||
3945 | |||
3946 | in_serv_queue = bfqd->in_service_queue; | ||
3947 | waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); | ||
3948 | |||
3949 | rq = __bfq_dispatch_request(hctx); | ||
3950 | |||
3951 | idle_timer_disabled = | ||
3952 | waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); | ||
3953 | |||
3954 | spin_unlock_irq(&bfqd->lock); | ||
3955 | |||
3956 | bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, | ||
3957 | idle_timer_disabled); | ||
3958 | |||
3757 | return rq; | 3959 | return rq; |
3758 | } | 3960 | } |
3759 | 3961 | ||
@@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
4002 | bfqq->split_time = bfq_smallest_from_now(); | 4204 | bfqq->split_time = bfq_smallest_from_now(); |
4003 | 4205 | ||
4004 | /* | 4206 | /* |
4005 | * Set to the value for which bfqq will not be deemed as | 4207 | * To not forget the possibly high bandwidth consumed by a |
4006 | * soft rt when it becomes backlogged. | 4208 | * process/queue in the recent past, |
4209 | * bfq_bfqq_softrt_next_start() returns a value at least equal | ||
4210 | * to the current value of bfqq->soft_rt_next_start (see | ||
4211 | * comments on bfq_bfqq_softrt_next_start). Set | ||
4212 | * soft_rt_next_start to now, to mean that bfqq has consumed | ||
4213 | * no bandwidth so far. | ||
4007 | */ | 4214 | */ |
4008 | bfqq->soft_rt_next_start = bfq_greatest_from_now(); | 4215 | bfqq->soft_rt_next_start = jiffies; |
4009 | 4216 | ||
4010 | /* first request is almost certainly seeky */ | 4217 | /* first request is almost certainly seeky */ |
4011 | bfqq->seek_history = 1; | 4218 | bfqq->seek_history = 1; |
@@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) | |||
4276 | return idle_timer_disabled; | 4483 | return idle_timer_disabled; |
4277 | } | 4484 | } |
4278 | 4485 | ||
4486 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
4487 | static void bfq_update_insert_stats(struct request_queue *q, | ||
4488 | struct bfq_queue *bfqq, | ||
4489 | bool idle_timer_disabled, | ||
4490 | unsigned int cmd_flags) | ||
4491 | { | ||
4492 | if (!bfqq) | ||
4493 | return; | ||
4494 | |||
4495 | /* | ||
4496 | * bfqq still exists, because it can disappear only after | ||
4497 | * either it is merged with another queue, or the process it | ||
4498 | * is associated with exits. But both actions must be taken by | ||
4499 | * the same process currently executing this flow of | ||
4500 | * instructions. | ||
4501 | * | ||
4502 | * In addition, the following queue lock guarantees that | ||
4503 | * bfqq_group(bfqq) exists as well. | ||
4504 | */ | ||
4505 | spin_lock_irq(q->queue_lock); | ||
4506 | bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); | ||
4507 | if (idle_timer_disabled) | ||
4508 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); | ||
4509 | spin_unlock_irq(q->queue_lock); | ||
4510 | } | ||
4511 | #else | ||
4512 | static inline void bfq_update_insert_stats(struct request_queue *q, | ||
4513 | struct bfq_queue *bfqq, | ||
4514 | bool idle_timer_disabled, | ||
4515 | unsigned int cmd_flags) {} | ||
4516 | #endif | ||
4517 | |||
4279 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | 4518 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
4280 | bool at_head) | 4519 | bool at_head) |
4281 | { | 4520 | { |
4282 | struct request_queue *q = hctx->queue; | 4521 | struct request_queue *q = hctx->queue; |
4283 | struct bfq_data *bfqd = q->elevator->elevator_data; | 4522 | struct bfq_data *bfqd = q->elevator->elevator_data; |
4284 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
4285 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | 4523 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
4286 | bool idle_timer_disabled = false; | 4524 | bool idle_timer_disabled = false; |
4287 | unsigned int cmd_flags; | 4525 | unsigned int cmd_flags; |
4288 | #endif | ||
4289 | 4526 | ||
4290 | spin_lock_irq(&bfqd->lock); | 4527 | spin_lock_irq(&bfqd->lock); |
4291 | if (blk_mq_sched_try_insert_merge(q, rq)) { | 4528 | if (blk_mq_sched_try_insert_merge(q, rq)) { |
@@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
4304 | else | 4541 | else |
4305 | list_add_tail(&rq->queuelist, &bfqd->dispatch); | 4542 | list_add_tail(&rq->queuelist, &bfqd->dispatch); |
4306 | } else { | 4543 | } else { |
4307 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
4308 | idle_timer_disabled = __bfq_insert_request(bfqd, rq); | 4544 | idle_timer_disabled = __bfq_insert_request(bfqd, rq); |
4309 | /* | 4545 | /* |
4310 | * Update bfqq, because, if a queue merge has occurred | 4546 | * Update bfqq, because, if a queue merge has occurred |
@@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
4312 | * redirected into a new queue. | 4548 | * redirected into a new queue. |
4313 | */ | 4549 | */ |
4314 | bfqq = RQ_BFQQ(rq); | 4550 | bfqq = RQ_BFQQ(rq); |
4315 | #else | ||
4316 | __bfq_insert_request(bfqd, rq); | ||
4317 | #endif | ||
4318 | 4551 | ||
4319 | if (rq_mergeable(rq)) { | 4552 | if (rq_mergeable(rq)) { |
4320 | elv_rqhash_add(q, rq); | 4553 | elv_rqhash_add(q, rq); |
@@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
4323 | } | 4556 | } |
4324 | } | 4557 | } |
4325 | 4558 | ||
4326 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
4327 | /* | 4559 | /* |
4328 | * Cache cmd_flags before releasing scheduler lock, because rq | 4560 | * Cache cmd_flags before releasing scheduler lock, because rq |
4329 | * may disappear afterwards (for example, because of a request | 4561 | * may disappear afterwards (for example, because of a request |
4330 | * merge). | 4562 | * merge). |
4331 | */ | 4563 | */ |
4332 | cmd_flags = rq->cmd_flags; | 4564 | cmd_flags = rq->cmd_flags; |
4333 | #endif | 4565 | |
4334 | spin_unlock_irq(&bfqd->lock); | 4566 | spin_unlock_irq(&bfqd->lock); |
4335 | 4567 | ||
4336 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 4568 | bfq_update_insert_stats(q, bfqq, idle_timer_disabled, |
4337 | if (!bfqq) | 4569 | cmd_flags); |
4338 | return; | ||
4339 | /* | ||
4340 | * bfqq still exists, because it can disappear only after | ||
4341 | * either it is merged with another queue, or the process it | ||
4342 | * is associated with exits. But both actions must be taken by | ||
4343 | * the same process currently executing this flow of | ||
4344 | * instruction. | ||
4345 | * | ||
4346 | * In addition, the following queue lock guarantees that | ||
4347 | * bfqq_group(bfqq) exists as well. | ||
4348 | */ | ||
4349 | spin_lock_irq(q->queue_lock); | ||
4350 | bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); | ||
4351 | if (idle_timer_disabled) | ||
4352 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); | ||
4353 | spin_unlock_irq(q->queue_lock); | ||
4354 | #endif | ||
4355 | } | 4570 | } |
4356 | 4571 | ||
4357 | static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, | 4572 | static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, |
@@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) | |||
4482 | bfq_schedule_dispatch(bfqd); | 4697 | bfq_schedule_dispatch(bfqd); |
4483 | } | 4698 | } |
4484 | 4699 | ||
4485 | static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) | 4700 | static void bfq_finish_request_body(struct bfq_queue *bfqq) |
4486 | { | 4701 | { |
4487 | bfqq->allocated--; | 4702 | bfqq->allocated--; |
4488 | 4703 | ||
@@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq) | |||
4512 | spin_lock_irqsave(&bfqd->lock, flags); | 4727 | spin_lock_irqsave(&bfqd->lock, flags); |
4513 | 4728 | ||
4514 | bfq_completed_request(bfqq, bfqd); | 4729 | bfq_completed_request(bfqq, bfqd); |
4515 | bfq_put_rq_priv_body(bfqq); | 4730 | bfq_finish_request_body(bfqq); |
4516 | 4731 | ||
4517 | spin_unlock_irqrestore(&bfqd->lock, flags); | 4732 | spin_unlock_irqrestore(&bfqd->lock, flags); |
4518 | } else { | 4733 | } else { |
@@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq) | |||
4533 | bfqg_stats_update_io_remove(bfqq_group(bfqq), | 4748 | bfqg_stats_update_io_remove(bfqq_group(bfqq), |
4534 | rq->cmd_flags); | 4749 | rq->cmd_flags); |
4535 | } | 4750 | } |
4536 | bfq_put_rq_priv_body(bfqq); | 4751 | bfq_finish_request_body(bfqq); |
4537 | } | 4752 | } |
4538 | 4753 | ||
4539 | rq->elv.priv[0] = NULL; | 4754 | rq->elv.priv[0] = NULL; |
@@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e) | |||
4818 | hrtimer_cancel(&bfqd->idle_slice_timer); | 5033 | hrtimer_cancel(&bfqd->idle_slice_timer); |
4819 | 5034 | ||
4820 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 5035 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
5036 | /* release oom-queue reference to root group */ | ||
5037 | bfqg_and_blkg_put(bfqd->root_group); | ||
5038 | |||
4821 | blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); | 5039 | blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); |
4822 | #else | 5040 | #else |
4823 | spin_lock_irq(&bfqd->lock); | 5041 | spin_lock_irq(&bfqd->lock); |
@@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = { | |||
5206 | 5424 | ||
5207 | static struct elevator_type iosched_bfq_mq = { | 5425 | static struct elevator_type iosched_bfq_mq = { |
5208 | .ops.mq = { | 5426 | .ops.mq = { |
5427 | .limit_depth = bfq_limit_depth, | ||
5209 | .prepare_request = bfq_prepare_request, | 5428 | .prepare_request = bfq_prepare_request, |
5210 | .finish_request = bfq_finish_request, | 5429 | .finish_request = bfq_finish_request, |
5211 | .exit_icq = bfq_exit_icq, | 5430 | .exit_icq = bfq_exit_icq, |
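As a quick sanity check of the in-word depth limits computed by bfq_update_depths() in the hunks above, the stand-alone sketch below reproduces the same shift arithmetic in userspace for an assumed sbitmap shift of 6 (64 tags per word). It is illustrative only: sb_shift is device dependent, and the MAX macro is a stand-in for the kernel's max().

```c
/*
 * Illustrative reproduction of the word_depths[][] arithmetic from
 * bfq_update_depths(), for an assumed sbitmap shift of 6 (64 tags/word).
 */
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))   /* stand-in for kernel max() */

int main(void)
{
	unsigned int sb_shift = 6;          /* assumed; device dependent */
	unsigned int depths[2][2];

	/* no bfq_queue weight-raised: <=50% async, <=75% sync writes */
	depths[0][0] = MAX((1U << sb_shift) >> 1, 1U);
	depths[0][1] = MAX(((1U << sb_shift) * 3) >> 2, 1U);

	/* some bfq_queue weight-raised: ~18% async, ~37% sync writes */
	depths[1][0] = MAX(((1U << sb_shift) * 3) >> 4, 1U);
	depths[1][1] = MAX(((1U << sb_shift) * 6) >> 4, 1U);

	/* prints "32 48 12 24" for sb_shift == 6 */
	printf("%u %u %u %u\n",
	       depths[0][0], depths[0][1], depths[1][0], depths[1][1]);
	return 0;
}
```

For sb_shift == 6 this leaves 25% of the tags to sync reads when no queue is weight-raised and roughly 63% when some queue is, matching the percentages quoted in the comments of the patch.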
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 91c4390903a1..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue { | |||
337 | * last transition from idle to backlogged. | 337 | * last transition from idle to backlogged. |
338 | */ | 338 | */ |
339 | unsigned long service_from_backlogged; | 339 | unsigned long service_from_backlogged; |
340 | /* | ||
341 | * Cumulative service received from the @bfq_queue since its | ||
342 | * last transition to weight-raised state. | ||
343 | */ | ||
344 | unsigned long service_from_wr; | ||
340 | 345 | ||
341 | /* | 346 | /* |
342 | * Value of wr start time when switching to soft rt | 347 | * Value of wr start time when switching to soft rt |
@@ -344,6 +349,8 @@ struct bfq_queue { | |||
344 | unsigned long wr_start_at_switch_to_srt; | 349 | unsigned long wr_start_at_switch_to_srt; |
345 | 350 | ||
346 | unsigned long split_time; /* time of last split */ | 351 | unsigned long split_time; /* time of last split */ |
352 | |||
353 | unsigned long first_IO_time; /* time of first I/O for this queue */ | ||
347 | }; | 354 | }; |
348 | 355 | ||
349 | /** | 356 | /** |
@@ -627,6 +634,18 @@ struct bfq_data { | |||
627 | struct bfq_io_cq *bio_bic; | 634 | struct bfq_io_cq *bio_bic; |
628 | /* bfqq associated with the task issuing current bio for merging */ | 635 | /* bfqq associated with the task issuing current bio for merging */ |
629 | struct bfq_queue *bio_bfqq; | 636 | struct bfq_queue *bio_bfqq; |
637 | |||
638 | /* | ||
639 | * Cached sbitmap shift, used to compute depth limits in | ||
640 | * bfq_update_depths. | ||
641 | */ | ||
642 | unsigned int sb_shift; | ||
643 | |||
644 | /* | ||
645 | * Depth limits used in bfq_limit_depth (see comments on the | ||
646 | * function) | ||
647 | */ | ||
648 | unsigned int word_depths[2][2]; | ||
630 | }; | 649 | }; |
631 | 650 | ||
632 | enum bfqq_state_flags { | 651 | enum bfqq_state_flags { |
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index e495d3f9b4b0..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) | |||
835 | struct bfq_entity *entity = &bfqq->entity; | 835 | struct bfq_entity *entity = &bfqq->entity; |
836 | struct bfq_service_tree *st; | 836 | struct bfq_service_tree *st; |
837 | 837 | ||
838 | if (!bfqq->service_from_backlogged) | ||
839 | bfqq->first_IO_time = jiffies; | ||
840 | |||
841 | if (bfqq->wr_coeff > 1) | ||
842 | bfqq->service_from_wr += served; | ||
843 | |||
844 | bfqq->service_from_backlogged += served; | ||
838 | for_each_entity(entity) { | 845 | for_each_entity(entity) { |
839 | st = bfq_entity_service_tree(entity); | 846 | st = bfq_entity_service_tree(entity); |
840 | 847 | ||
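The two counters added above, first_IO_time and service_from_wr, feed the new cut-offs in bfq-iosched.c: bfq_too_late_for_merging() (no queue merging after roughly bfq_merge_time_limit = HZ/10 of backlogged I/O) and the early end of weight raising once service_from_wr exceeds max_service_from_wr. The following stand-alone model mirrors that logic under stated assumptions: HZ = 250, plain comparisons instead of the wraparound-safe time_is_before_jiffies(), and a simplified mini_bfqq struct. It is a sketch of the checks, not kernel code.

```c
#include <stdbool.h>
#include <stdio.h>

#define HZ                   250           /* assumed tick rate */
#define MERGE_TIME_LIMIT     (HZ / 10)     /* mirrors bfq_merge_time_limit */
#define MAX_SERVICE_FROM_WR  120000UL      /* mirrors max_service_from_wr */

struct mini_bfqq {                         /* simplified stand-in */
	unsigned long first_IO_time;           /* "jiffies" of first served I/O */
	unsigned long service_from_backlogged; /* sectors since backlogged */
	unsigned long service_from_wr;         /* sectors while weight-raised */
	unsigned int  wr_coeff;                /* > 1 while weight-raised */
};

/* mirrors the accounting added to bfq_bfqq_served() */
static void mini_served(struct mini_bfqq *q, unsigned long now, int served)
{
	if (!q->service_from_backlogged)
		q->first_IO_time = now;
	if (q->wr_coeff > 1)
		q->service_from_wr += served;
	q->service_from_backlogged += served;
}

/* mirrors bfq_too_late_for_merging(): merges only early in a queue's life */
static bool mini_too_late_for_merging(const struct mini_bfqq *q,
				      unsigned long now)
{
	return q->service_from_backlogged > 0 &&
	       now > q->first_IO_time + MERGE_TIME_LIMIT;
}

/* mirrors the new check in bfq_update_wr_data(): stop weight raising early */
static bool mini_should_end_wr(const struct mini_bfqq *q)
{
	return q->wr_coeff > 1 && q->service_from_wr > MAX_SERVICE_FROM_WR;
}

int main(void)
{
	struct mini_bfqq q = { .wr_coeff = 30 };     /* a weight-raised queue */
	unsigned long now = 1000;

	mini_served(&q, now, 4096);                  /* first burst of I/O */
	now += HZ / 5;                               /* 200 ms later */

	/* prints "too_late=1 end_wr=0": merge window closed, wr still running */
	printf("too_late=%d end_wr=%d\n",
	       mini_too_late_for_merging(&q, now), mini_should_end_wr(&q));
	return 0;
}
```

Only the comparisons matter here; the 30 used for wr_coeff simply stands for "greater than 1".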
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work) | |||
374 | /** | 374 | /** |
375 | * __bio_integrity_endio - Integrity I/O completion function | 375 | * __bio_integrity_endio - Integrity I/O completion function |
376 | * @bio: Protected bio | 376 | * @bio: Protected bio |
377 | * @error: Pointer to errno | ||
378 | * | 377 | * |
379 | * Description: Completion for integrity I/O | 378 | * Description: Completion for integrity I/O |
380 | * | 379 | * |
diff --git a/block/bio.c b/block/bio.c
index 9ef6cf3addb3..e1708db48258 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -971,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes) | |||
971 | EXPORT_SYMBOL(bio_advance); | 971 | EXPORT_SYMBOL(bio_advance); |
972 | 972 | ||
973 | /** | 973 | /** |
974 | * bio_alloc_pages - allocates a single page for each bvec in a bio | ||
975 | * @bio: bio to allocate pages for | ||
976 | * @gfp_mask: flags for allocation | ||
977 | * | ||
978 | * Allocates pages up to @bio->bi_vcnt. | ||
979 | * | ||
980 | * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are | ||
981 | * freed. | ||
982 | */ | ||
983 | int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) | ||
984 | { | ||
985 | int i; | ||
986 | struct bio_vec *bv; | ||
987 | |||
988 | bio_for_each_segment_all(bv, bio, i) { | ||
989 | bv->bv_page = alloc_page(gfp_mask); | ||
990 | if (!bv->bv_page) { | ||
991 | while (--bv >= bio->bi_io_vec) | ||
992 | __free_page(bv->bv_page); | ||
993 | return -ENOMEM; | ||
994 | } | ||
995 | } | ||
996 | |||
997 | return 0; | ||
998 | } | ||
999 | EXPORT_SYMBOL(bio_alloc_pages); | ||
1000 | |||
1001 | /** | ||
1002 | * bio_copy_data - copy contents of data buffers from one chain of bios to | 974 | * bio_copy_data - copy contents of data buffers from one chain of bios to |
1003 | * another | 975 | * another |
1004 | * @src: source bio list | 976 | * @src: source bio list |
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors, | |||
1838 | bio_advance(bio, split->bi_iter.bi_size); | 1810 | bio_advance(bio, split->bi_iter.bi_size); |
1839 | 1811 | ||
1840 | if (bio_flagged(bio, BIO_TRACE_COMPLETION)) | 1812 | if (bio_flagged(bio, BIO_TRACE_COMPLETION)) |
1841 | bio_set_flag(bio, BIO_TRACE_COMPLETION); | 1813 | bio_set_flag(split, BIO_TRACE_COMPLETION); |
1842 | 1814 | ||
1843 | return split; | 1815 | return split; |
1844 | } | 1816 | } |
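The one-character bio_split() fix above changes which bio the BIO_TRACE_COMPLETION flag lands on. As a hedged illustration only (fake_bio and the FAKE_ constant are stand-ins, not the kernel types), the corrected propagation looks like this:

#include <stdio.h>

#define FAKE_BIO_TRACE_COMPLETION (1u << 0)

struct fake_bio {
	unsigned int flags;
};

/* The corrected pattern from bio_split(): a flag seen on the parent is set
 * on the newly created split, instead of being re-set on the parent (which
 * was a no-op). */
static void propagate_trace_flag(const struct fake_bio *parent,
				 struct fake_bio *split)
{
	if (parent->flags & FAKE_BIO_TRACE_COMPLETION)
		split->flags |= FAKE_BIO_TRACE_COMPLETION;
}

int main(void)
{
	struct fake_bio parent = { .flags = FAKE_BIO_TRACE_COMPLETION };
	struct fake_bio split = { 0 };

	propagate_trace_flag(&parent, &split);
	printf("split inherits trace flag: %u\n",
	       split.flags & FAKE_BIO_TRACE_COMPLETION);
	return 0;
}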
diff --git a/block/blk-core.c b/block/blk-core.c index 3ba4326a63b5..a2005a485335 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
126 | rq->start_time = jiffies; | 126 | rq->start_time = jiffies; |
127 | set_start_time_ns(rq); | 127 | set_start_time_ns(rq); |
128 | rq->part = NULL; | 128 | rq->part = NULL; |
129 | seqcount_init(&rq->gstate_seq); | ||
130 | u64_stats_init(&rq->aborted_gstate_sync); | ||
129 | } | 131 | } |
130 | EXPORT_SYMBOL(blk_rq_init); | 132 | EXPORT_SYMBOL(blk_rq_init); |
131 | 133 | ||
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q) | |||
699 | queue_flag_set(QUEUE_FLAG_DEAD, q); | 701 | queue_flag_set(QUEUE_FLAG_DEAD, q); |
700 | spin_unlock_irq(lock); | 702 | spin_unlock_irq(lock); |
701 | 703 | ||
704 | /* | ||
705 | * make sure all in-progress dispatch are completed because | ||
706 | * blk_freeze_queue() can only complete all requests, and | ||
707 | * dispatch may still be in-progress since we dispatch requests | ||
708 | * from more than one context | ||
709 | */ | ||
710 | if (q->mq_ops) | ||
711 | blk_mq_quiesce_queue(q); | ||
712 | |||
702 | /* for synchronous bio-based driver finish in-flight integrity i/o */ | 713 | /* for synchronous bio-based driver finish in-flight integrity i/o */ |
703 | blk_flush_integrity(); | 714 | blk_flush_integrity(); |
704 | 715 | ||
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
1646 | 1657 | ||
1647 | lockdep_assert_held(q->queue_lock); | 1658 | lockdep_assert_held(q->queue_lock); |
1648 | 1659 | ||
1660 | blk_req_zone_write_unlock(req); | ||
1649 | blk_pm_put_request(req); | 1661 | blk_pm_put_request(req); |
1650 | 1662 | ||
1651 | elv_completed_request(q, req); | 1663 | elv_completed_request(q, req); |
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part, | |||
2055 | 2067 | ||
2056 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ | 2068 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ |
2057 | 2069 | ||
2070 | static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) | ||
2071 | { | ||
2072 | if (part->policy && op_is_write(bio_op(bio))) { | ||
2073 | char b[BDEVNAME_SIZE]; | ||
2074 | |||
2075 | printk(KERN_ERR | ||
2076 | "generic_make_request: Trying to write " | ||
2077 | "to read-only block-device %s (partno %d)\n", | ||
2078 | bio_devname(bio, b), part->partno); | ||
2079 | return true; | ||
2080 | } | ||
2081 | |||
2082 | return false; | ||
2083 | } | ||
2084 | |||
2058 | /* | 2085 | /* |
2059 | * Remap block n of partition p to block n+start(p) of the disk. | 2086 | * Remap block n of partition p to block n+start(p) of the disk. |
2060 | */ | 2087 | */ |
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio) | |||
2063 | struct hd_struct *p; | 2090 | struct hd_struct *p; |
2064 | int ret = 0; | 2091 | int ret = 0; |
2065 | 2092 | ||
2093 | rcu_read_lock(); | ||
2094 | p = __disk_get_part(bio->bi_disk, bio->bi_partno); | ||
2095 | if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || | ||
2096 | bio_check_ro(bio, p))) { | ||
2097 | ret = -EIO; | ||
2098 | goto out; | ||
2099 | } | ||
2100 | |||
2066 | /* | 2101 | /* |
2067 | * Zone reset does not include bi_size so bio_sectors() is always 0. | 2102 | * Zone reset does not include bi_size so bio_sectors() is always 0. |
2068 | * Include a test for the reset op code and perform the remap if needed. | 2103 | * Include a test for the reset op code and perform the remap if needed. |
2069 | */ | 2104 | */ |
2070 | if (!bio->bi_partno || | 2105 | if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) |
2071 | (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) | 2106 | goto out; |
2072 | return 0; | ||
2073 | 2107 | ||
2074 | rcu_read_lock(); | 2108 | bio->bi_iter.bi_sector += p->start_sect; |
2075 | p = __disk_get_part(bio->bi_disk, bio->bi_partno); | 2109 | bio->bi_partno = 0; |
2076 | if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { | 2110 | trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), |
2077 | bio->bi_iter.bi_sector += p->start_sect; | 2111 | bio->bi_iter.bi_sector - p->start_sect); |
2078 | bio->bi_partno = 0; | ||
2079 | trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), | ||
2080 | bio->bi_iter.bi_sector - p->start_sect); | ||
2081 | } else { | ||
2082 | printk("%s: fail for partition %d\n", __func__, bio->bi_partno); | ||
2083 | ret = -EIO; | ||
2084 | } | ||
2085 | rcu_read_unlock(); | ||
2086 | 2112 | ||
2113 | out: | ||
2114 | rcu_read_unlock(); | ||
2087 | return ret; | 2115 | return ret; |
2088 | } | 2116 | } |
2089 | 2117 | ||
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio) | |||
2142 | * For a REQ_NOWAIT based request, return -EOPNOTSUPP | 2170 | * For a REQ_NOWAIT based request, return -EOPNOTSUPP |
2143 | * if queue is not a request based queue. | 2171 | * if queue is not a request based queue. |
2144 | */ | 2172 | */ |
2145 | |||
2146 | if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) | 2173 | if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) |
2147 | goto not_supported; | 2174 | goto not_supported; |
2148 | 2175 | ||
2149 | if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) | 2176 | if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) |
2150 | goto end_io; | 2177 | goto end_io; |
2151 | 2178 | ||
2152 | if (blk_partition_remap(bio)) | 2179 | if (!bio->bi_partno) { |
2153 | goto end_io; | 2180 | if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) |
2181 | goto end_io; | ||
2182 | } else { | ||
2183 | if (blk_partition_remap(bio)) | ||
2184 | goto end_io; | ||
2185 | } | ||
2154 | 2186 | ||
2155 | if (bio_check_eod(bio, nr_sectors)) | 2187 | if (bio_check_eod(bio, nr_sectors)) |
2156 | goto end_io; | 2188 | goto end_io; |
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * | |||
2493 | * bypass a potential scheduler on the bottom device for | 2525 | * bypass a potential scheduler on the bottom device for |
2494 | * insert. | 2526 | * insert. |
2495 | */ | 2527 | */ |
2496 | blk_mq_request_bypass_insert(rq, true); | 2528 | return blk_mq_request_issue_directly(rq); |
2497 | return BLK_STS_OK; | ||
2498 | } | 2529 | } |
2499 | 2530 | ||
2500 | spin_lock_irqsave(q->queue_lock, flags); | 2531 | spin_lock_irqsave(q->queue_lock, flags); |
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req) | |||
2846 | wbt_issue(req->q->rq_wb, &req->issue_stat); | 2877 | wbt_issue(req->q->rq_wb, &req->issue_stat); |
2847 | } | 2878 | } |
2848 | 2879 | ||
2849 | BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); | 2880 | BUG_ON(blk_rq_is_complete(req)); |
2850 | blk_add_timer(req); | 2881 | blk_add_timer(req); |
2851 | } | 2882 | } |
2852 | EXPORT_SYMBOL(blk_start_request); | 2883 | EXPORT_SYMBOL(blk_start_request); |
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, | |||
3415 | } | 3446 | } |
3416 | EXPORT_SYMBOL(kblockd_mod_delayed_work_on); | 3447 | EXPORT_SYMBOL(kblockd_mod_delayed_work_on); |
3417 | 3448 | ||
3418 | int kblockd_schedule_delayed_work(struct delayed_work *dwork, | ||
3419 | unsigned long delay) | ||
3420 | { | ||
3421 | return queue_delayed_work(kblockd_workqueue, dwork, delay); | ||
3422 | } | ||
3423 | EXPORT_SYMBOL(kblockd_schedule_delayed_work); | ||
3424 | |||
3425 | int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, | ||
3426 | unsigned long delay) | ||
3427 | { | ||
3428 | return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); | ||
3429 | } | ||
3430 | EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); | ||
3431 | |||
3432 | /** | 3449 | /** |
3433 | * blk_start_plug - initialize blk_plug and track it inside the task_struct | 3450 | * blk_start_plug - initialize blk_plug and track it inside the task_struct |
3434 | * @plug: The &struct blk_plug that needs to be initialized | 3451 | * @plug: The &struct blk_plug that needs to be initialized |
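For readers following the reshuffled blk_partition_remap() above, the remap arithmetic itself is unchanged: the partition-relative sector is offset by the partition's start sector and the partition number is cleared so the bio addresses the whole disk from then on. A minimal stand-alone model, with fake_part/fake_bio as illustrative stand-ins for the kernel structures:

#include <stdio.h>

/* Toy model of the remap performed by blk_partition_remap(). */
struct fake_part {
	unsigned long long start_sect;
};

struct fake_bio {
	unsigned long long bi_sector;
	int bi_partno;
};

static void partition_remap(struct fake_bio *bio, const struct fake_part *p)
{
	bio->bi_sector += p->start_sect;
	bio->bi_partno = 0;	/* the bio now addresses the whole disk */
}

int main(void)
{
	struct fake_part p = { .start_sect = 2048 };
	struct fake_bio bio = { .bi_sector = 16, .bi_partno = 1 };

	partition_remap(&bio, &p);
	printf("disk-relative sector %llu, partno %d\n",
	       bio.bi_sector, bio.bi_partno);
	return 0;
}

What the hunk actually changes is the surrounding structure: the partition lookup, the should_fail_request() check and the new bio_check_ro() check are now done under a single rcu_read_lock() section before the remap, and generic_make_request_checks() applies the same read-only check to part0 bios that never enter the remap path.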
diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc446dc..f7b292f12449 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
61 | * be reused after dying flag is set | 61 | * be reused after dying flag is set |
62 | */ | 62 | */ |
63 | if (q->mq_ops) { | 63 | if (q->mq_ops) { |
64 | blk_mq_sched_insert_request(rq, at_head, true, false, false); | 64 | blk_mq_sched_insert_request(rq, at_head, true, false); |
65 | return; | 65 | return; |
66 | } | 66 | } |
67 | 67 | ||
diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544ce3d2e..a676084d4740 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
37 | if (!q) | 37 | if (!q) |
38 | return -ENXIO; | 38 | return -ENXIO; |
39 | 39 | ||
40 | if (bdev_read_only(bdev)) | ||
41 | return -EPERM; | ||
42 | |||
40 | if (flags & BLKDEV_DISCARD_SECURE) { | 43 | if (flags & BLKDEV_DISCARD_SECURE) { |
41 | if (!blk_queue_secure_erase(q)) | 44 | if (!blk_queue_secure_erase(q)) |
42 | return -EOPNOTSUPP; | 45 | return -EOPNOTSUPP; |
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | |||
156 | if (!q) | 159 | if (!q) |
157 | return -ENXIO; | 160 | return -ENXIO; |
158 | 161 | ||
162 | if (bdev_read_only(bdev)) | ||
163 | return -EPERM; | ||
164 | |||
159 | bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; | 165 | bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; |
160 | if ((sector | nr_sects) & bs_mask) | 166 | if ((sector | nr_sects) & bs_mask) |
161 | return -EINVAL; | 167 | return -EINVAL; |
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, | |||
233 | if (!q) | 239 | if (!q) |
234 | return -ENXIO; | 240 | return -ENXIO; |
235 | 241 | ||
242 | if (bdev_read_only(bdev)) | ||
243 | return -EPERM; | ||
244 | |||
236 | /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ | 245 | /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ |
237 | max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); | 246 | max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); |
238 | 247 | ||
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, | |||
287 | if (!q) | 296 | if (!q) |
288 | return -ENXIO; | 297 | return -ENXIO; |
289 | 298 | ||
299 | if (bdev_read_only(bdev)) | ||
300 | return -EPERM; | ||
301 | |||
290 | while (nr_sects != 0) { | 302 | while (nr_sects != 0) { |
291 | bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), | 303 | bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), |
292 | gfp_mask); | 304 | gfp_mask); |
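All four helpers above gain the same early guard. The pattern, sketched with a toy fake_bdev type rather than the real block_device and bdev_read_only():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy block device; the real check is bdev_read_only(bdev). */
struct fake_bdev {
	bool read_only;
};

/* The guard added to __blkdev_issue_discard(), __blkdev_issue_write_same(),
 * __blkdev_issue_write_zeroes() and __blkdev_issue_zero_pages(): refuse with
 * -EPERM before any bio is built if the device is read-only. */
static int issue_discard(const struct fake_bdev *bdev,
			 unsigned long long sector, unsigned long long nr_sects)
{
	if (bdev->read_only)
		return -EPERM;
	printf("discarding %llu sectors at %llu\n", nr_sects, sector);
	return 0;
}

int main(void)
{
	struct fake_bdev ro = { .read_only = true };
	struct fake_bdev rw = { .read_only = false };

	printf("read-only device: %d\n", issue_discard(&ro, 0, 8));
	issue_discard(&rw, 0, 8);
	return 0;
}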
diff --git a/block/blk-map.c b/block/blk-map.c index d3a94719f03f..db9373bd31ac 100644 --- a/block/blk-map.c +++ b/block/blk-map.c | |||
@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, | |||
119 | unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); | 119 | unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); |
120 | struct bio *bio = NULL; | 120 | struct bio *bio = NULL; |
121 | struct iov_iter i; | 121 | struct iov_iter i; |
122 | int ret; | 122 | int ret = -EINVAL; |
123 | 123 | ||
124 | if (!iter_is_iovec(iter)) | 124 | if (!iter_is_iovec(iter)) |
125 | goto fail; | 125 | goto fail; |
@@ -148,7 +148,7 @@ unmap_rq: | |||
148 | __blk_rq_unmap_user(bio); | 148 | __blk_rq_unmap_user(bio); |
149 | fail: | 149 | fail: |
150 | rq->bio = NULL; | 150 | rq->bio = NULL; |
151 | return -EINVAL; | 151 | return ret; |
152 | } | 152 | } |
153 | EXPORT_SYMBOL(blk_rq_map_user_iov); | 153 | EXPORT_SYMBOL(blk_rq_map_user_iov); |
154 | 154 | ||
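The blk_rq_map_user_iov() change is purely about which errno reaches the caller. A rough stand-alone model of the control flow; the two integer flags here are simplifications of the real iterator and mapping checks:

#include <errno.h>
#include <stdio.h>

/* Start from -EINVAL for the "not an iovec" bailout, but let later failures
 * overwrite ret so the caller sees the real cause instead of an
 * unconditional -EINVAL. */
static int map_user_iov(int is_iovec, int mapping_fails)
{
	int ret = -EINVAL;

	if (!is_iovec)
		goto fail;

	if (mapping_fails) {
		ret = -ENOMEM;	/* preserved on the failure path */
		goto fail;
	}
	return 0;
fail:
	return ret;
}

int main(void)
{
	printf("not iovec: %d, mapping fails: %d, ok: %d\n",
	       map_user_iov(0, 0), map_user_iov(1, 1), map_user_iov(1, 0));
	return 0;
}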
diff --git a/block/blk-merge.c b/block/blk-merge.c index f5dedd57dff6..8452fc7164cc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, | |||
128 | nsegs++; | 128 | nsegs++; |
129 | sectors = max_sectors; | 129 | sectors = max_sectors; |
130 | } | 130 | } |
131 | if (sectors) | 131 | goto split; |
132 | goto split; | ||
133 | /* Make this single bvec as the 1st segment */ | ||
134 | } | 132 | } |
135 | 133 | ||
136 | if (bvprvp && blk_queue_cluster(q)) { | 134 | if (bvprvp && blk_queue_cluster(q)) { |
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, | |||
146 | bvprvp = &bvprv; | 144 | bvprvp = &bvprv; |
147 | sectors += bv.bv_len >> 9; | 145 | sectors += bv.bv_len >> 9; |
148 | 146 | ||
149 | if (nsegs == 1 && seg_size > front_seg_size) | ||
150 | front_seg_size = seg_size; | ||
151 | continue; | 147 | continue; |
152 | } | 148 | } |
153 | new_segment: | 149 | new_segment: |
154 | if (nsegs == queue_max_segments(q)) | 150 | if (nsegs == queue_max_segments(q)) |
155 | goto split; | 151 | goto split; |
156 | 152 | ||
153 | if (nsegs == 1 && seg_size > front_seg_size) | ||
154 | front_seg_size = seg_size; | ||
155 | |||
157 | nsegs++; | 156 | nsegs++; |
158 | bvprv = bv; | 157 | bvprv = bv; |
159 | bvprvp = &bvprv; | 158 | bvprvp = &bvprv; |
160 | seg_size = bv.bv_len; | 159 | seg_size = bv.bv_len; |
161 | sectors += bv.bv_len >> 9; | 160 | sectors += bv.bv_len >> 9; |
162 | 161 | ||
163 | if (nsegs == 1 && seg_size > front_seg_size) | ||
164 | front_seg_size = seg_size; | ||
165 | } | 162 | } |
166 | 163 | ||
167 | do_split = false; | 164 | do_split = false; |
@@ -174,6 +171,8 @@ split: | |||
174 | bio = new; | 171 | bio = new; |
175 | } | 172 | } |
176 | 173 | ||
174 | if (nsegs == 1 && seg_size > front_seg_size) | ||
175 | front_seg_size = seg_size; | ||
177 | bio->bi_seg_front_size = front_seg_size; | 176 | bio->bi_seg_front_size = front_seg_size; |
178 | if (seg_size > bio->bi_seg_back_size) | 177 | if (seg_size > bio->bi_seg_back_size) |
179 | bio->bi_seg_back_size = seg_size; | 178 | bio->bi_seg_back_size = seg_size; |
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b56a4f35720d..21cbc1f071c6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = { | |||
289 | RQF_NAME(HASHED), | 289 | RQF_NAME(HASHED), |
290 | RQF_NAME(STATS), | 290 | RQF_NAME(STATS), |
291 | RQF_NAME(SPECIAL_PAYLOAD), | 291 | RQF_NAME(SPECIAL_PAYLOAD), |
292 | RQF_NAME(ZONE_WRITE_LOCKED), | ||
293 | RQF_NAME(MQ_TIMEOUT_EXPIRED), | ||
294 | RQF_NAME(MQ_POLL_SLEPT), | ||
292 | }; | 295 | }; |
293 | #undef RQF_NAME | 296 | #undef RQF_NAME |
294 | 297 | ||
295 | #define RQAF_NAME(name) [REQ_ATOM_##name] = #name | ||
296 | static const char *const rqaf_name[] = { | ||
297 | RQAF_NAME(COMPLETE), | ||
298 | RQAF_NAME(STARTED), | ||
299 | RQAF_NAME(POLL_SLEPT), | ||
300 | }; | ||
301 | #undef RQAF_NAME | ||
302 | |||
303 | int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) | 298 | int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) |
304 | { | 299 | { |
305 | const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; | 300 | const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; |
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) | |||
316 | seq_puts(m, ", .rq_flags="); | 311 | seq_puts(m, ", .rq_flags="); |
317 | blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, | 312 | blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, |
318 | ARRAY_SIZE(rqf_name)); | 313 | ARRAY_SIZE(rqf_name)); |
319 | seq_puts(m, ", .atomic_flags="); | 314 | seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); |
320 | blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); | ||
321 | seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, | 315 | seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, |
322 | rq->internal_tag); | 316 | rq->internal_tag); |
323 | if (mq_ops->show_rq) | 317 | if (mq_ops->show_rq) |
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) | |||
409 | const struct show_busy_params *params = data; | 403 | const struct show_busy_params *params = data; |
410 | 404 | ||
411 | if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && | 405 | if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && |
412 | test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 406 | blk_mq_rq_state(rq) != MQ_RQ_IDLE) |
413 | __blk_mq_debugfs_rq_show(params->m, | 407 | __blk_mq_debugfs_rq_show(params->m, |
414 | list_entry_rq(&rq->queuelist)); | 408 | list_entry_rq(&rq->queuelist)); |
415 | } | 409 | } |
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, | |||
703 | const struct blk_mq_debugfs_attr *attr = m->private; | 697 | const struct blk_mq_debugfs_attr *attr = m->private; |
704 | void *data = d_inode(file->f_path.dentry->d_parent)->i_private; | 698 | void *data = d_inode(file->f_path.dentry->d_parent)->i_private; |
705 | 699 | ||
706 | if (!attr->write) | 700 | /* |
701 | * Attributes that only implement .seq_ops are read-only and 'attr' is | ||
702 | * the same with 'data' in this case. | ||
703 | */ | ||
704 | if (attr == data || !attr->write) | ||
707 | return -EPERM; | 705 | return -EPERM; |
708 | 706 | ||
709 | return attr->write(data, buf, count, ppos); | 707 | return attr->write(data, buf, count, ppos); |
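The debugfs write guard relies on a registration detail spelled out in the new comment: attributes that only provide .seq_ops are registered with the attribute itself as the inode's private data, so attr == data identifies them as read-only. A toy model of that check (dbg_attr is a stand-in, not the real blk_mq_debugfs_attr):

#include <errno.h>
#include <stdio.h>

/* Stand-in for blk_mq_debugfs_attr; not the real structure. */
struct dbg_attr {
	int has_write;
};

/* seq_ops-only attributes are registered with the attribute itself as the
 * inode's private data, so attr == data marks them read-only and a write is
 * refused with -EPERM. */
static long debugfs_write(const struct dbg_attr *attr, const void *data)
{
	if ((const void *)attr == data || !attr->has_write)
		return -EPERM;
	return 0;	/* would dispatch to attr->write(data, ...) here */
}

int main(void)
{
	struct dbg_attr seq_only = { .has_write = 0 };
	struct dbg_attr writable = { .has_write = 1 };
	int queue_data;

	printf("seq_ops-only attr: %ld\n", debugfs_write(&seq_only, &seq_only));
	printf("writable attr:     %ld\n", debugfs_write(&writable, &queue_data));
	return 0;
}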
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c117bd8fd1f6..55c0a745b427 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) | |||
172 | WRITE_ONCE(hctx->dispatch_from, ctx); | 172 | WRITE_ONCE(hctx->dispatch_from, ctx); |
173 | } | 173 | } |
174 | 174 | ||
175 | /* return true if hw queue need to be run again */ | ||
176 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | 175 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) |
177 | { | 176 | { |
178 | struct request_queue *q = hctx->queue; | 177 | struct request_queue *q = hctx->queue; |
@@ -428,7 +427,7 @@ done: | |||
428 | } | 427 | } |
429 | 428 | ||
430 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 429 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
431 | bool run_queue, bool async, bool can_block) | 430 | bool run_queue, bool async) |
432 | { | 431 | { |
433 | struct request_queue *q = rq->q; | 432 | struct request_queue *q = rq->q; |
434 | struct elevator_queue *e = q->elevator; | 433 | struct elevator_queue *e = q->elevator; |
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d1418a96d..1e9c9018ace1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h | |||
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); | |||
18 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); | 18 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); |
19 | 19 | ||
20 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 20 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
21 | bool run_queue, bool async, bool can_block); | 21 | bool run_queue, bool async); |
22 | void blk_mq_sched_insert_requests(struct request_queue *q, | 22 | void blk_mq_sched_insert_requests(struct request_queue *q, |
23 | struct blk_mq_ctx *ctx, | 23 | struct blk_mq_ctx *ctx, |
24 | struct list_head *list, bool run_queue_async); | 24 | struct list_head *list, bool run_queue_async); |
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3c234f..a54b4b070f1c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c | |||
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) | |||
248 | return ret; | 248 | return ret; |
249 | } | 249 | } |
250 | 250 | ||
251 | static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | 251 | void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) |
252 | { | 252 | { |
253 | struct blk_mq_hw_ctx *hctx; | 253 | struct blk_mq_hw_ctx *hctx; |
254 | int i; | 254 | int i; |
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | |||
265 | q->mq_sysfs_init_done = false; | 265 | q->mq_sysfs_init_done = false; |
266 | } | 266 | } |
267 | 267 | ||
268 | void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | ||
269 | { | ||
270 | mutex_lock(&q->sysfs_lock); | ||
271 | __blk_mq_unregister_dev(dev, q); | ||
272 | mutex_unlock(&q->sysfs_lock); | ||
273 | } | ||
274 | |||
275 | void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) | 268 | void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) |
276 | { | 269 | { |
277 | kobject_init(&hctx->kobj, &blk_mq_hw_ktype); | 270 | kobject_init(&hctx->kobj, &blk_mq_hw_ktype); |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c81b40ecd3f1..336dde07b230 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
134 | ws = bt_wait_ptr(bt, data->hctx); | 134 | ws = bt_wait_ptr(bt, data->hctx); |
135 | drop_ctx = data->ctx == NULL; | 135 | drop_ctx = data->ctx == NULL; |
136 | do { | 136 | do { |
137 | prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
138 | |||
139 | tag = __blk_mq_get_tag(data, bt); | ||
140 | if (tag != -1) | ||
141 | break; | ||
142 | |||
143 | /* | 137 | /* |
144 | * We're out of tags on this hardware queue, kick any | 138 | * We're out of tags on this hardware queue, kick any |
145 | * pending IO submits before going to sleep waiting for | 139 | * pending IO submits before going to sleep waiting for |
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
155 | if (tag != -1) | 149 | if (tag != -1) |
156 | break; | 150 | break; |
157 | 151 | ||
152 | prepare_to_wait_exclusive(&ws->wait, &wait, | ||
153 | TASK_UNINTERRUPTIBLE); | ||
154 | |||
155 | tag = __blk_mq_get_tag(data, bt); | ||
156 | if (tag != -1) | ||
157 | break; | ||
158 | |||
158 | if (data->ctx) | 159 | if (data->ctx) |
159 | blk_mq_put_ctx(data->ctx); | 160 | blk_mq_put_ctx(data->ctx); |
160 | 161 | ||
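The reordered tag-wait loop above kicks pending submissions and retries the tag first, and only then registers with prepare_to_wait_exclusive(), so a freed tag wakes a single waiter rather than every sleeping submitter. The following is a userspace analogue of that design choice using a condition variable that is signalled, not broadcast, per released tag; it is an analogy only, not the sbitmap implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t tag_freed = PTHREAD_COND_INITIALIZER;
static int free_tags;

/* Sleep until a tag is available; waiters are woken one at a time. */
static void get_tag(void)
{
	pthread_mutex_lock(&lock);
	while (!free_tags)
		pthread_cond_wait(&tag_freed, &lock);
	free_tags--;
	pthread_mutex_unlock(&lock);
}

/* Release a tag and wake exactly one waiter (signal, not broadcast),
 * the userspace counterpart of an exclusive wait queue entry. */
static void put_tag(void)
{
	pthread_mutex_lock(&lock);
	free_tags++;
	pthread_cond_signal(&tag_freed);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	put_tag();
	get_tag();
	printf("got a tag without waking other waiters\n");
	return 0;
}

Build with -lpthread. The exclusive wakeup is what keeps a single freed tag from causing a thundering herd of submitters that mostly fail to allocate again.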
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d3797327491..01f271d40825 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, | |||
95 | { | 95 | { |
96 | struct mq_inflight *mi = priv; | 96 | struct mq_inflight *mi = priv; |
97 | 97 | ||
98 | if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && | 98 | if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { |
99 | !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { | ||
100 | /* | 99 | /* |
101 | * index[0] counts the specific partition that was asked | 100 | * index[0] counts the specific partition that was asked |
102 | * for. index[1] counts the ones that are active on the | 101 | * for. index[1] counts the ones that are active on the |
@@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) | |||
222 | 221 | ||
223 | queue_for_each_hw_ctx(q, hctx, i) { | 222 | queue_for_each_hw_ctx(q, hctx, i) { |
224 | if (hctx->flags & BLK_MQ_F_BLOCKING) | 223 | if (hctx->flags & BLK_MQ_F_BLOCKING) |
225 | synchronize_srcu(hctx->queue_rq_srcu); | 224 | synchronize_srcu(hctx->srcu); |
226 | else | 225 | else |
227 | rcu = true; | 226 | rcu = true; |
228 | } | 227 | } |
@@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
272 | { | 271 | { |
273 | struct blk_mq_tags *tags = blk_mq_tags_from_data(data); | 272 | struct blk_mq_tags *tags = blk_mq_tags_from_data(data); |
274 | struct request *rq = tags->static_rqs[tag]; | 273 | struct request *rq = tags->static_rqs[tag]; |
275 | 274 | req_flags_t rq_flags = 0; | |
276 | rq->rq_flags = 0; | ||
277 | 275 | ||
278 | if (data->flags & BLK_MQ_REQ_INTERNAL) { | 276 | if (data->flags & BLK_MQ_REQ_INTERNAL) { |
279 | rq->tag = -1; | 277 | rq->tag = -1; |
280 | rq->internal_tag = tag; | 278 | rq->internal_tag = tag; |
281 | } else { | 279 | } else { |
282 | if (blk_mq_tag_busy(data->hctx)) { | 280 | if (blk_mq_tag_busy(data->hctx)) { |
283 | rq->rq_flags = RQF_MQ_INFLIGHT; | 281 | rq_flags = RQF_MQ_INFLIGHT; |
284 | atomic_inc(&data->hctx->nr_active); | 282 | atomic_inc(&data->hctx->nr_active); |
285 | } | 283 | } |
286 | rq->tag = tag; | 284 | rq->tag = tag; |
@@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
288 | data->hctx->tags->rqs[rq->tag] = rq; | 286 | data->hctx->tags->rqs[rq->tag] = rq; |
289 | } | 287 | } |
290 | 288 | ||
291 | INIT_LIST_HEAD(&rq->queuelist); | ||
292 | /* csd/requeue_work/fifo_time is initialized before use */ | 289 | /* csd/requeue_work/fifo_time is initialized before use */ |
293 | rq->q = data->q; | 290 | rq->q = data->q; |
294 | rq->mq_ctx = data->ctx; | 291 | rq->mq_ctx = data->ctx; |
292 | rq->rq_flags = rq_flags; | ||
293 | rq->cpu = -1; | ||
295 | rq->cmd_flags = op; | 294 | rq->cmd_flags = op; |
296 | if (data->flags & BLK_MQ_REQ_PREEMPT) | 295 | if (data->flags & BLK_MQ_REQ_PREEMPT) |
297 | rq->rq_flags |= RQF_PREEMPT; | 296 | rq->rq_flags |= RQF_PREEMPT; |
298 | if (blk_queue_io_stat(data->q)) | 297 | if (blk_queue_io_stat(data->q)) |
299 | rq->rq_flags |= RQF_IO_STAT; | 298 | rq->rq_flags |= RQF_IO_STAT; |
300 | /* do not touch atomic flags, it needs atomic ops against the timer */ | 299 | INIT_LIST_HEAD(&rq->queuelist); |
301 | rq->cpu = -1; | ||
302 | INIT_HLIST_NODE(&rq->hash); | 300 | INIT_HLIST_NODE(&rq->hash); |
303 | RB_CLEAR_NODE(&rq->rb_node); | 301 | RB_CLEAR_NODE(&rq->rb_node); |
304 | rq->rq_disk = NULL; | 302 | rq->rq_disk = NULL; |
305 | rq->part = NULL; | 303 | rq->part = NULL; |
306 | rq->start_time = jiffies; | 304 | rq->start_time = jiffies; |
307 | #ifdef CONFIG_BLK_CGROUP | ||
308 | rq->rl = NULL; | ||
309 | set_start_time_ns(rq); | ||
310 | rq->io_start_time_ns = 0; | ||
311 | #endif | ||
312 | rq->nr_phys_segments = 0; | 305 | rq->nr_phys_segments = 0; |
313 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 306 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
314 | rq->nr_integrity_segments = 0; | 307 | rq->nr_integrity_segments = 0; |
@@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
316 | rq->special = NULL; | 309 | rq->special = NULL; |
317 | /* tag was already set */ | 310 | /* tag was already set */ |
318 | rq->extra_len = 0; | 311 | rq->extra_len = 0; |
312 | rq->__deadline = 0; | ||
319 | 313 | ||
320 | INIT_LIST_HEAD(&rq->timeout_list); | 314 | INIT_LIST_HEAD(&rq->timeout_list); |
321 | rq->timeout = 0; | 315 | rq->timeout = 0; |
@@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
324 | rq->end_io_data = NULL; | 318 | rq->end_io_data = NULL; |
325 | rq->next_rq = NULL; | 319 | rq->next_rq = NULL; |
326 | 320 | ||
321 | #ifdef CONFIG_BLK_CGROUP | ||
322 | rq->rl = NULL; | ||
323 | set_start_time_ns(rq); | ||
324 | rq->io_start_time_ns = 0; | ||
325 | #endif | ||
326 | |||
327 | data->ctx->rq_dispatched[op_is_sync(op)]++; | 327 | data->ctx->rq_dispatched[op_is_sync(op)]++; |
328 | return rq; | 328 | return rq; |
329 | } | 329 | } |
@@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | |||
443 | blk_queue_exit(q); | 443 | blk_queue_exit(q); |
444 | return ERR_PTR(-EXDEV); | 444 | return ERR_PTR(-EXDEV); |
445 | } | 445 | } |
446 | cpu = cpumask_first(alloc_data.hctx->cpumask); | 446 | cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); |
447 | alloc_data.ctx = __blk_mq_get_ctx(q, cpu); | 447 | alloc_data.ctx = __blk_mq_get_ctx(q, cpu); |
448 | 448 | ||
449 | rq = blk_mq_get_request(q, NULL, op, &alloc_data); | 449 | rq = blk_mq_get_request(q, NULL, op, &alloc_data); |
@@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq) | |||
485 | if (blk_rq_rl(rq)) | 485 | if (blk_rq_rl(rq)) |
486 | blk_put_rl(blk_rq_rl(rq)); | 486 | blk_put_rl(blk_rq_rl(rq)); |
487 | 487 | ||
488 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 488 | blk_mq_rq_update_state(rq, MQ_RQ_IDLE); |
489 | clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); | ||
490 | if (rq->tag != -1) | 489 | if (rq->tag != -1) |
491 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); | 490 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); |
492 | if (sched_tag != -1) | 491 | if (sched_tag != -1) |
@@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq) | |||
532 | bool shared = false; | 531 | bool shared = false; |
533 | int cpu; | 532 | int cpu; |
534 | 533 | ||
534 | WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); | ||
535 | blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); | ||
536 | |||
535 | if (rq->internal_tag != -1) | 537 | if (rq->internal_tag != -1) |
536 | blk_mq_sched_completed_request(rq); | 538 | blk_mq_sched_completed_request(rq); |
537 | if (rq->rq_flags & RQF_STATS) { | 539 | if (rq->rq_flags & RQF_STATS) { |
@@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq) | |||
559 | put_cpu(); | 561 | put_cpu(); |
560 | } | 562 | } |
561 | 563 | ||
564 | static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) | ||
565 | __releases(hctx->srcu) | ||
566 | { | ||
567 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) | ||
568 | rcu_read_unlock(); | ||
569 | else | ||
570 | srcu_read_unlock(hctx->srcu, srcu_idx); | ||
571 | } | ||
572 | |||
573 | static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) | ||
574 | __acquires(hctx->srcu) | ||
575 | { | ||
576 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { | ||
577 | /* shut up gcc false positive */ | ||
578 | *srcu_idx = 0; | ||
579 | rcu_read_lock(); | ||
580 | } else | ||
581 | *srcu_idx = srcu_read_lock(hctx->srcu); | ||
582 | } | ||
583 | |||
584 | static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) | ||
585 | { | ||
586 | unsigned long flags; | ||
587 | |||
588 | /* | ||
589 | * blk_mq_rq_aborted_gstate() is used from the completion path and | ||
590 | * can thus be called from irq context. u64_stats_fetch in the | ||
591 | * middle of update on the same CPU leads to lockup. Disable irq | ||
592 | * while updating. | ||
593 | */ | ||
594 | local_irq_save(flags); | ||
595 | u64_stats_update_begin(&rq->aborted_gstate_sync); | ||
596 | rq->aborted_gstate = gstate; | ||
597 | u64_stats_update_end(&rq->aborted_gstate_sync); | ||
598 | local_irq_restore(flags); | ||
599 | } | ||
600 | |||
601 | static u64 blk_mq_rq_aborted_gstate(struct request *rq) | ||
602 | { | ||
603 | unsigned int start; | ||
604 | u64 aborted_gstate; | ||
605 | |||
606 | do { | ||
607 | start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); | ||
608 | aborted_gstate = rq->aborted_gstate; | ||
609 | } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); | ||
610 | |||
611 | return aborted_gstate; | ||
612 | } | ||
613 | |||
562 | /** | 614 | /** |
563 | * blk_mq_complete_request - end I/O on a request | 615 | * blk_mq_complete_request - end I/O on a request |
564 | * @rq: the request being processed | 616 | * @rq: the request being processed |
@@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq) | |||
570 | void blk_mq_complete_request(struct request *rq) | 622 | void blk_mq_complete_request(struct request *rq) |
571 | { | 623 | { |
572 | struct request_queue *q = rq->q; | 624 | struct request_queue *q = rq->q; |
625 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); | ||
626 | int srcu_idx; | ||
573 | 627 | ||
574 | if (unlikely(blk_should_fake_timeout(q))) | 628 | if (unlikely(blk_should_fake_timeout(q))) |
575 | return; | 629 | return; |
576 | if (!blk_mark_rq_complete(rq)) | 630 | |
631 | /* | ||
632 | * If @rq->aborted_gstate equals the current instance, timeout is | ||
633 | * claiming @rq and we lost. This is synchronized through | ||
634 | * hctx_lock(). See blk_mq_timeout_work() for details. | ||
635 | * | ||
636 | * Completion path never blocks and we can directly use RCU here | ||
637 | * instead of hctx_lock() which can be either RCU or SRCU. | ||
638 | * However, that would complicate paths which want to synchronize | ||
639 | * against us. Let's stay in sync with the issue path so that | ||
640 | * hctx_lock() covers both issue and completion paths. | ||
641 | */ | ||
642 | hctx_lock(hctx, &srcu_idx); | ||
643 | if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) | ||
577 | __blk_mq_complete_request(rq); | 644 | __blk_mq_complete_request(rq); |
645 | hctx_unlock(hctx, srcu_idx); | ||
578 | } | 646 | } |
579 | EXPORT_SYMBOL(blk_mq_complete_request); | 647 | EXPORT_SYMBOL(blk_mq_complete_request); |
580 | 648 | ||
581 | int blk_mq_request_started(struct request *rq) | 649 | int blk_mq_request_started(struct request *rq) |
582 | { | 650 | { |
583 | return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 651 | return blk_mq_rq_state(rq) != MQ_RQ_IDLE; |
584 | } | 652 | } |
585 | EXPORT_SYMBOL_GPL(blk_mq_request_started); | 653 | EXPORT_SYMBOL_GPL(blk_mq_request_started); |
586 | 654 | ||
@@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq) | |||
598 | wbt_issue(q->rq_wb, &rq->issue_stat); | 666 | wbt_issue(q->rq_wb, &rq->issue_stat); |
599 | } | 667 | } |
600 | 668 | ||
601 | blk_add_timer(rq); | 669 | WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); |
602 | |||
603 | WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); | ||
604 | 670 | ||
605 | /* | 671 | /* |
606 | * Mark us as started and clear complete. Complete might have been | 672 | * Mark @rq in-flight which also advances the generation number, |
607 | * set if requeue raced with timeout, which then marked it as | 673 | * and register for timeout. Protect with a seqcount to allow the |
608 | * complete. So be sure to clear complete again when we start | 674 | * timeout path to read both @rq->gstate and @rq->deadline |
609 | * the request, otherwise we'll ignore the completion event. | 675 | * coherently. |
610 | * | 676 | * |
611 | * Ensure that ->deadline is visible before we set STARTED, such that | 677 | * This is the only place where a request is marked in-flight. If |
612 | * blk_mq_check_expired() is guaranteed to observe our ->deadline when | 678 | * the timeout path reads an in-flight @rq->gstate, the |
613 | * it observes STARTED. | 679 | * @rq->deadline it reads together under @rq->gstate_seq is |
680 | * guaranteed to be the matching one. | ||
614 | */ | 681 | */ |
615 | smp_wmb(); | 682 | preempt_disable(); |
616 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 683 | write_seqcount_begin(&rq->gstate_seq); |
617 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { | 684 | |
618 | /* | 685 | blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); |
619 | * Coherence order guarantees these consecutive stores to a | 686 | blk_add_timer(rq); |
620 | * single variable propagate in the specified order. Thus the | 687 | |
621 | * clear_bit() is ordered _after_ the set bit. See | 688 | write_seqcount_end(&rq->gstate_seq); |
622 | * blk_mq_check_expired(). | 689 | preempt_enable(); |
623 | * | ||
624 | * (the bits must be part of the same byte for this to be | ||
625 | * true). | ||
626 | */ | ||
627 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | ||
628 | } | ||
629 | 690 | ||
630 | if (q->dma_drain_size && blk_rq_bytes(rq)) { | 691 | if (q->dma_drain_size && blk_rq_bytes(rq)) { |
631 | /* | 692 | /* |
@@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq) | |||
639 | EXPORT_SYMBOL(blk_mq_start_request); | 700 | EXPORT_SYMBOL(blk_mq_start_request); |
640 | 701 | ||
641 | /* | 702 | /* |
642 | * When we reach here because queue is busy, REQ_ATOM_COMPLETE | 703 | * When we reach here because queue is busy, it's safe to change the state |
643 | * flag isn't set yet, so there may be race with timeout handler, | 704 | * to IDLE without checking @rq->aborted_gstate because we should still be |
644 | * but given rq->deadline is just set in .queue_rq() under | 705 | * holding the RCU read lock and thus protected against timeout. |
645 | * this situation, the race won't be possible in reality because | ||
646 | * rq->timeout should be set as big enough to cover the window | ||
647 | * between blk_mq_start_request() called from .queue_rq() and | ||
648 | * clearing REQ_ATOM_STARTED here. | ||
649 | */ | 706 | */ |
650 | static void __blk_mq_requeue_request(struct request *rq) | 707 | static void __blk_mq_requeue_request(struct request *rq) |
651 | { | 708 | { |
@@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq) | |||
657 | wbt_requeue(q->rq_wb, &rq->issue_stat); | 714 | wbt_requeue(q->rq_wb, &rq->issue_stat); |
658 | blk_mq_sched_requeue_request(rq); | 715 | blk_mq_sched_requeue_request(rq); |
659 | 716 | ||
660 | if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { | 717 | if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { |
718 | blk_mq_rq_update_state(rq, MQ_RQ_IDLE); | ||
661 | if (q->dma_drain_size && blk_rq_bytes(rq)) | 719 | if (q->dma_drain_size && blk_rq_bytes(rq)) |
662 | rq->nr_phys_segments--; | 720 | rq->nr_phys_segments--; |
663 | } | 721 | } |
@@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work) | |||
689 | 747 | ||
690 | rq->rq_flags &= ~RQF_SOFTBARRIER; | 748 | rq->rq_flags &= ~RQF_SOFTBARRIER; |
691 | list_del_init(&rq->queuelist); | 749 | list_del_init(&rq->queuelist); |
692 | blk_mq_sched_insert_request(rq, true, false, false, true); | 750 | blk_mq_sched_insert_request(rq, true, false, false); |
693 | } | 751 | } |
694 | 752 | ||
695 | while (!list_empty(&rq_list)) { | 753 | while (!list_empty(&rq_list)) { |
696 | rq = list_entry(rq_list.next, struct request, queuelist); | 754 | rq = list_entry(rq_list.next, struct request, queuelist); |
697 | list_del_init(&rq->queuelist); | 755 | list_del_init(&rq->queuelist); |
698 | blk_mq_sched_insert_request(rq, false, false, false, true); | 756 | blk_mq_sched_insert_request(rq, false, false, false); |
699 | } | 757 | } |
700 | 758 | ||
701 | blk_mq_run_hw_queues(q, false); | 759 | blk_mq_run_hw_queues(q, false); |
@@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); | |||
729 | 787 | ||
730 | void blk_mq_kick_requeue_list(struct request_queue *q) | 788 | void blk_mq_kick_requeue_list(struct request_queue *q) |
731 | { | 789 | { |
732 | kblockd_schedule_delayed_work(&q->requeue_work, 0); | 790 | kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); |
733 | } | 791 | } |
734 | EXPORT_SYMBOL(blk_mq_kick_requeue_list); | 792 | EXPORT_SYMBOL(blk_mq_kick_requeue_list); |
735 | 793 | ||
@@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); | |||
755 | struct blk_mq_timeout_data { | 813 | struct blk_mq_timeout_data { |
756 | unsigned long next; | 814 | unsigned long next; |
757 | unsigned int next_set; | 815 | unsigned int next_set; |
816 | unsigned int nr_expired; | ||
758 | }; | 817 | }; |
759 | 818 | ||
760 | void blk_mq_rq_timed_out(struct request *req, bool reserved) | 819 | static void blk_mq_rq_timed_out(struct request *req, bool reserved) |
761 | { | 820 | { |
762 | const struct blk_mq_ops *ops = req->q->mq_ops; | 821 | const struct blk_mq_ops *ops = req->q->mq_ops; |
763 | enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; | 822 | enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; |
764 | 823 | ||
765 | /* | 824 | req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; |
766 | * We know that complete is set at this point. If STARTED isn't set | ||
767 | * anymore, then the request isn't active and the "timeout" should | ||
768 | * just be ignored. This can happen due to the bitflag ordering. | ||
769 | * Timeout first checks if STARTED is set, and if it is, assumes | ||
770 | * the request is active. But if we race with completion, then | ||
771 | * both flags will get cleared. So check here again, and ignore | ||
772 | * a timeout event with a request that isn't active. | ||
773 | */ | ||
774 | if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) | ||
775 | return; | ||
776 | 825 | ||
777 | if (ops->timeout) | 826 | if (ops->timeout) |
778 | ret = ops->timeout(req, reserved); | 827 | ret = ops->timeout(req, reserved); |
@@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) | |||
782 | __blk_mq_complete_request(req); | 831 | __blk_mq_complete_request(req); |
783 | break; | 832 | break; |
784 | case BLK_EH_RESET_TIMER: | 833 | case BLK_EH_RESET_TIMER: |
834 | /* | ||
835 | * As nothing prevents from completion happening while | ||
836 | * ->aborted_gstate is set, this may lead to ignored | ||
837 | * completions and further spurious timeouts. | ||
838 | */ | ||
839 | blk_mq_rq_update_aborted_gstate(req, 0); | ||
785 | blk_add_timer(req); | 840 | blk_add_timer(req); |
786 | blk_clear_rq_complete(req); | ||
787 | break; | 841 | break; |
788 | case BLK_EH_NOT_HANDLED: | 842 | case BLK_EH_NOT_HANDLED: |
789 | break; | 843 | break; |
@@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
797 | struct request *rq, void *priv, bool reserved) | 851 | struct request *rq, void *priv, bool reserved) |
798 | { | 852 | { |
799 | struct blk_mq_timeout_data *data = priv; | 853 | struct blk_mq_timeout_data *data = priv; |
800 | unsigned long deadline; | 854 | unsigned long gstate, deadline; |
855 | int start; | ||
801 | 856 | ||
802 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 857 | might_sleep(); |
803 | return; | ||
804 | 858 | ||
805 | /* | 859 | if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) |
806 | * Ensures that if we see STARTED we must also see our | 860 | return; |
807 | * up-to-date deadline, see blk_mq_start_request(). | ||
808 | */ | ||
809 | smp_rmb(); | ||
810 | 861 | ||
811 | deadline = READ_ONCE(rq->deadline); | 862 | /* read coherent snapshots of @rq->state_gen and @rq->deadline */ |
863 | while (true) { | ||
864 | start = read_seqcount_begin(&rq->gstate_seq); | ||
865 | gstate = READ_ONCE(rq->gstate); | ||
866 | deadline = blk_rq_deadline(rq); | ||
867 | if (!read_seqcount_retry(&rq->gstate_seq, start)) | ||
868 | break; | ||
869 | cond_resched(); | ||
870 | } | ||
812 | 871 | ||
813 | /* | 872 | /* if in-flight && overdue, mark for abortion */ |
814 | * The rq being checked may have been freed and reallocated | 873 | if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && |
815 | * out already here, we avoid this race by checking rq->deadline | 874 | time_after_eq(jiffies, deadline)) { |
816 | * and REQ_ATOM_COMPLETE flag together: | 875 | blk_mq_rq_update_aborted_gstate(rq, gstate); |
817 | * | 876 | data->nr_expired++; |
818 | * - if rq->deadline is observed as new value because of | 877 | hctx->nr_expired++; |
819 | * reusing, the rq won't be timed out because of timing. | ||
820 | * - if rq->deadline is observed as previous value, | ||
821 | * REQ_ATOM_COMPLETE flag won't be cleared in reuse path | ||
822 | * because we put a barrier between setting rq->deadline | ||
823 | * and clearing the flag in blk_mq_start_request(), so | ||
824 | * this rq won't be timed out too. | ||
825 | */ | ||
826 | if (time_after_eq(jiffies, deadline)) { | ||
827 | if (!blk_mark_rq_complete(rq)) { | ||
828 | /* | ||
829 | * Again coherence order ensures that consecutive reads | ||
830 | * from the same variable must be in that order. This | ||
831 | * ensures that if we see COMPLETE clear, we must then | ||
832 | * see STARTED set and we'll ignore this timeout. | ||
833 | * | ||
834 | * (There's also the MB implied by the test_and_clear()) | ||
835 | */ | ||
836 | blk_mq_rq_timed_out(rq, reserved); | ||
837 | } | ||
838 | } else if (!data->next_set || time_after(data->next, deadline)) { | 878 | } else if (!data->next_set || time_after(data->next, deadline)) { |
839 | data->next = deadline; | 879 | data->next = deadline; |
840 | data->next_set = 1; | 880 | data->next_set = 1; |
841 | } | 881 | } |
842 | } | 882 | } |
843 | 883 | ||
884 | static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, | ||
885 | struct request *rq, void *priv, bool reserved) | ||
886 | { | ||
887 | /* | ||
888 | * We marked @rq->aborted_gstate and waited for RCU. If there were | ||
889 | * completions that we lost to, they would have finished and | ||
890 | * updated @rq->gstate by now; otherwise, the completion path is | ||
891 | * now guaranteed to see @rq->aborted_gstate and yield. If | ||
892 | * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. | ||
893 | */ | ||
894 | if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && | ||
895 | READ_ONCE(rq->gstate) == rq->aborted_gstate) | ||
896 | blk_mq_rq_timed_out(rq, reserved); | ||
897 | } | ||
898 | |||
844 | static void blk_mq_timeout_work(struct work_struct *work) | 899 | static void blk_mq_timeout_work(struct work_struct *work) |
845 | { | 900 | { |
846 | struct request_queue *q = | 901 | struct request_queue *q = |
@@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work) | |||
848 | struct blk_mq_timeout_data data = { | 903 | struct blk_mq_timeout_data data = { |
849 | .next = 0, | 904 | .next = 0, |
850 | .next_set = 0, | 905 | .next_set = 0, |
906 | .nr_expired = 0, | ||
851 | }; | 907 | }; |
908 | struct blk_mq_hw_ctx *hctx; | ||
852 | int i; | 909 | int i; |
853 | 910 | ||
854 | /* A deadlock might occur if a request is stuck requiring a | 911 | /* A deadlock might occur if a request is stuck requiring a |
@@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work) | |||
867 | if (!percpu_ref_tryget(&q->q_usage_counter)) | 924 | if (!percpu_ref_tryget(&q->q_usage_counter)) |
868 | return; | 925 | return; |
869 | 926 | ||
927 | /* scan for the expired ones and set their ->aborted_gstate */ | ||
870 | blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); | 928 | blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); |
871 | 929 | ||
930 | if (data.nr_expired) { | ||
931 | bool has_rcu = false; | ||
932 | |||
933 | /* | ||
934 | * Wait till everyone sees ->aborted_gstate. The | ||
935 | * sequential waits for SRCUs aren't ideal. If this ever | ||
936 | * becomes a problem, we can add per-hw_ctx rcu_head and | ||
937 | * wait in parallel. | ||
938 | */ | ||
939 | queue_for_each_hw_ctx(q, hctx, i) { | ||
940 | if (!hctx->nr_expired) | ||
941 | continue; | ||
942 | |||
943 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) | ||
944 | has_rcu = true; | ||
945 | else | ||
946 | synchronize_srcu(hctx->srcu); | ||
947 | |||
948 | hctx->nr_expired = 0; | ||
949 | } | ||
950 | if (has_rcu) | ||
951 | synchronize_rcu(); | ||
952 | |||
953 | /* terminate the ones we won */ | ||
954 | blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); | ||
955 | } | ||
956 | |||
872 | if (data.next_set) { | 957 | if (data.next_set) { |
873 | data.next = blk_rq_timeout(round_jiffies_up(data.next)); | 958 | data.next = blk_rq_timeout(round_jiffies_up(data.next)); |
874 | mod_timer(&q->timeout, data.next); | 959 | mod_timer(&q->timeout, data.next); |
875 | } else { | 960 | } else { |
876 | struct blk_mq_hw_ctx *hctx; | 961 | /* |
877 | 962 | * Request timeouts are handled as a forward rolling timer. If | |
963 | * we end up here it means that no requests are pending and | ||
964 | * also that no request has been pending for a while. Mark | ||
965 | * each hctx as idle. | ||
966 | */ | ||
878 | queue_for_each_hw_ctx(q, hctx, i) { | 967 | queue_for_each_hw_ctx(q, hctx, i) { |
879 | /* the hctx may be unmapped, so check it here */ | 968 | /* the hctx may be unmapped, so check it here */ |
880 | if (blk_mq_hw_queue_mapped(hctx)) | 969 | if (blk_mq_hw_queue_mapped(hctx)) |
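The timeout rework visible in the last few hunks replaces the REQ_ATOM_STARTED/REQ_ATOM_COMPLETE bit dance with a per-request generation number: the scan records the generation it decided to abort, the work function waits for RCU/SRCU so every completion path sees that mark, and only requests whose generation still matches are timed out. The sketch below is a deliberately single-threaded simplification of that idea; the real rq->gstate also encodes the MQ_RQ_* state in its low bits and is read under gstate_seq and aborted_gstate_sync, none of which is modelled here.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* The timeout scan records the generation it decided to abort; the request
 * is only treated as timed out if it still carries that generation, i.e. it
 * was not completed and recycled for a new I/O in between. */
struct fake_rq {
	uint64_t gstate;		/* bumped every time the rq is (re)issued */
	uint64_t aborted_gstate;	/* generation the timeout path claimed */
};

static void issue(struct fake_rq *rq)		{ rq->gstate++; }
static void mark_aborted(struct fake_rq *rq)	{ rq->aborted_gstate = rq->gstate; }

static bool timeout_wins(const struct fake_rq *rq)
{
	return rq->aborted_gstate == rq->gstate;
}

int main(void)
{
	struct fake_rq rq = { 0 };

	issue(&rq);
	mark_aborted(&rq);	/* timeout scan flags this generation */
	printf("stale request times out: %d\n", timeout_wins(&rq));	/* 1 */

	issue(&rq);		/* completed and reissued before the abort ran */
	printf("recycled request times out: %d\n", timeout_wins(&rq));	/* 0 */
	return 0;
}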
@@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, | |||
1010 | 1099 | ||
1011 | /* | 1100 | /* |
1012 | * Mark us waiting for a tag. For shared tags, this involves hooking us into | 1101 | * Mark us waiting for a tag. For shared tags, this involves hooking us into |
1013 | * the tag wakeups. For non-shared tags, we can simply mark us nedeing a | 1102 | * the tag wakeups. For non-shared tags, we can simply mark us needing a |
1014 | * restart. For both caes, take care to check the condition again after | 1103 | * restart. For both cases, take care to check the condition again after |
1015 | * marking us as waiting. | 1104 | * marking us as waiting. |
1016 | */ | 1105 | */ |
1017 | static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, | 1106 | static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, |
1018 | struct request *rq) | 1107 | struct request *rq) |
1019 | { | 1108 | { |
1020 | struct blk_mq_hw_ctx *this_hctx = *hctx; | 1109 | struct blk_mq_hw_ctx *this_hctx = *hctx; |
1021 | bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; | ||
1022 | struct sbq_wait_state *ws; | 1110 | struct sbq_wait_state *ws; |
1023 | wait_queue_entry_t *wait; | 1111 | wait_queue_entry_t *wait; |
1024 | bool ret; | 1112 | bool ret; |
1025 | 1113 | ||
1026 | if (!shared_tags) { | 1114 | if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { |
1027 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) | 1115 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) |
1028 | set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); | 1116 | set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); |
1029 | } else { | ||
1030 | wait = &this_hctx->dispatch_wait; | ||
1031 | if (!list_empty_careful(&wait->entry)) | ||
1032 | return false; | ||
1033 | 1117 | ||
1034 | spin_lock(&this_hctx->lock); | 1118 | /* |
1035 | if (!list_empty(&wait->entry)) { | 1119 | * It's possible that a tag was freed in the window between the |
1036 | spin_unlock(&this_hctx->lock); | 1120 | * allocation failure and adding the hardware queue to the wait |
1037 | return false; | 1121 | * queue. |
1038 | } | 1122 | * |
1123 | * Don't clear RESTART here, someone else could have set it. | ||
1124 | * At most this will cost an extra queue run. | ||
1125 | */ | ||
1126 | return blk_mq_get_driver_tag(rq, hctx, false); | ||
1127 | } | ||
1039 | 1128 | ||
1040 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); | 1129 | wait = &this_hctx->dispatch_wait; |
1041 | add_wait_queue(&ws->wait, wait); | 1130 | if (!list_empty_careful(&wait->entry)) |
1131 | return false; | ||
1132 | |||
1133 | spin_lock(&this_hctx->lock); | ||
1134 | if (!list_empty(&wait->entry)) { | ||
1135 | spin_unlock(&this_hctx->lock); | ||
1136 | return false; | ||
1042 | } | 1137 | } |
1043 | 1138 | ||
1139 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); | ||
1140 | add_wait_queue(&ws->wait, wait); | ||
1141 | |||
1044 | /* | 1142 | /* |
1045 | * It's possible that a tag was freed in the window between the | 1143 | * It's possible that a tag was freed in the window between the |
1046 | * allocation failure and adding the hardware queue to the wait | 1144 | * allocation failure and adding the hardware queue to the wait |
1047 | * queue. | 1145 | * queue. |
1048 | */ | 1146 | */ |
1049 | ret = blk_mq_get_driver_tag(rq, hctx, false); | 1147 | ret = blk_mq_get_driver_tag(rq, hctx, false); |
1050 | 1148 | if (!ret) { | |
1051 | if (!shared_tags) { | ||
1052 | /* | ||
1053 | * Don't clear RESTART here, someone else could have set it. | ||
1054 | * At most this will cost an extra queue run. | ||
1055 | */ | ||
1056 | return ret; | ||
1057 | } else { | ||
1058 | if (!ret) { | ||
1059 | spin_unlock(&this_hctx->lock); | ||
1060 | return false; | ||
1061 | } | ||
1062 | |||
1063 | /* | ||
1064 | * We got a tag, remove ourselves from the wait queue to ensure | ||
1065 | * someone else gets the wakeup. | ||
1066 | */ | ||
1067 | spin_lock_irq(&ws->wait.lock); | ||
1068 | list_del_init(&wait->entry); | ||
1069 | spin_unlock_irq(&ws->wait.lock); | ||
1070 | spin_unlock(&this_hctx->lock); | 1149 | spin_unlock(&this_hctx->lock); |
1071 | return true; | 1150 | return false; |
1072 | } | 1151 | } |
1152 | |||
1153 | /* | ||
1154 | * We got a tag, remove ourselves from the wait queue to ensure | ||
1155 | * someone else gets the wakeup. | ||
1156 | */ | ||
1157 | spin_lock_irq(&ws->wait.lock); | ||
1158 | list_del_init(&wait->entry); | ||
1159 | spin_unlock_irq(&ws->wait.lock); | ||
1160 | spin_unlock(&this_hctx->lock); | ||
1161 | |||
1162 | return true; | ||
1073 | } | 1163 | } |
1074 | 1164 | ||
1075 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | 1165 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, |
@@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
1206 | /* | 1296 | /* |
1207 | * We should be running this queue from one of the CPUs that | 1297 | * We should be running this queue from one of the CPUs that |
1208 | * are mapped to it. | 1298 | * are mapped to it. |
1299 | * | ||
1300 | * There are at least two related races now between setting | ||
1301 | * hctx->next_cpu from blk_mq_hctx_next_cpu() and running | ||
1302 | * __blk_mq_run_hw_queue(): | ||
1303 | * | ||
1304 | * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), | ||
1305 | * but later it becomes online, then this warning is harmless | ||
1306 | * at all | ||
1307 | * | ||
1308 | * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), | ||
1309 | * but later it becomes offline, then the warning can't be | ||
1310 | * triggered, and we depend on blk-mq timeout handler to | ||
1311 | * handle dispatched requests to this hctx | ||
1209 | */ | 1312 | */ |
1210 | WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && | 1313 | if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && |
1211 | cpu_online(hctx->next_cpu)); | 1314 | cpu_online(hctx->next_cpu)) { |
1315 | printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", | ||
1316 | raw_smp_processor_id(), | ||
1317 | cpumask_empty(hctx->cpumask) ? "inactive": "active"); | ||
1318 | dump_stack(); | ||
1319 | } | ||
1212 | 1320 | ||
1213 | /* | 1321 | /* |
1214 | * We can't run the queue inline with ints disabled. Ensure that | 1322 | * We can't run the queue inline with ints disabled. Ensure that |
@@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
1216 | */ | 1324 | */ |
1217 | WARN_ON_ONCE(in_interrupt()); | 1325 | WARN_ON_ONCE(in_interrupt()); |
1218 | 1326 | ||
1219 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { | 1327 | might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); |
1220 | rcu_read_lock(); | ||
1221 | blk_mq_sched_dispatch_requests(hctx); | ||
1222 | rcu_read_unlock(); | ||
1223 | } else { | ||
1224 | might_sleep(); | ||
1225 | 1328 | ||
1226 | srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); | 1329 | hctx_lock(hctx, &srcu_idx); |
1227 | blk_mq_sched_dispatch_requests(hctx); | 1330 | blk_mq_sched_dispatch_requests(hctx); |
1228 | srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); | 1331 | hctx_unlock(hctx, srcu_idx); |
1229 | } | ||
1230 | } | 1332 | } |
1231 | 1333 | ||
1232 | /* | 1334 | /* |
@@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
1237 | */ | 1339 | */ |
1238 | static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) | 1340 | static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) |
1239 | { | 1341 | { |
1342 | bool tried = false; | ||
1343 | |||
1240 | if (hctx->queue->nr_hw_queues == 1) | 1344 | if (hctx->queue->nr_hw_queues == 1) |
1241 | return WORK_CPU_UNBOUND; | 1345 | return WORK_CPU_UNBOUND; |
1242 | 1346 | ||
1243 | if (--hctx->next_cpu_batch <= 0) { | 1347 | if (--hctx->next_cpu_batch <= 0) { |
1244 | int next_cpu; | 1348 | int next_cpu; |
1245 | 1349 | select_cpu: | |
1246 | next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); | 1350 | next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, |
1351 | cpu_online_mask); | ||
1247 | if (next_cpu >= nr_cpu_ids) | 1352 | if (next_cpu >= nr_cpu_ids) |
1248 | next_cpu = cpumask_first(hctx->cpumask); | 1353 | next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); |
1249 | 1354 | ||
1250 | hctx->next_cpu = next_cpu; | 1355 | /* |
1356 | * No online CPU was found, so we still have to make sure that | ||
1357 | * hctx->next_cpu is set to a valid CPU so as not to break the workqueue. | ||
1358 | */ | ||
1359 | if (next_cpu >= nr_cpu_ids) | ||
1360 | hctx->next_cpu = cpumask_first(hctx->cpumask); | ||
1361 | else | ||
1362 | hctx->next_cpu = next_cpu; | ||
1251 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | 1363 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; |
1252 | } | 1364 | } |
1253 | 1365 | ||
1366 | /* | ||
1367 | * Schedule the work unbound if we can't find an online CPU for this | ||
1368 | * hctx; this should only happen while handling the CPU DEAD event. | ||
1369 | */ | ||
1370 | if (!cpu_online(hctx->next_cpu)) { | ||
1371 | if (!tried) { | ||
1372 | tried = true; | ||
1373 | goto select_cpu; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Make sure to re-select a CPU next time, once CPUs | ||
1378 | * in hctx->cpumask come back online. | ||
1379 | */ | ||
1380 | hctx->next_cpu_batch = 1; | ||
1381 | return WORK_CPU_UNBOUND; | ||
1382 | } | ||
1254 | return hctx->next_cpu; | 1383 | return hctx->next_cpu; |
1255 | } | 1384 | } |
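The selection above boils down to a round-robin walk over the online CPUs in hctx->cpumask, retried once and falling back to WORK_CPU_UNBOUND when every mapped CPU is offline (the CPU DEAD path). A stripped-down sketch of the walk itself, with an illustrative helper name that is not in the patch:

/* Pick the next online CPU in @mask after @prev, wrapping around. */
static int pick_next_online_cpu(int prev, const struct cpumask *mask)
{
	int cpu = cpumask_next_and(prev, mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first_and(mask, cpu_online_mask);

	return cpu;	/* still >= nr_cpu_ids if no CPU in @mask is online */
}

Setting next_cpu_batch to 1 in the fallback case is what forces a fresh selection as soon as a mapped CPU comes back online.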
1256 | 1385 | ||
@@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, | |||
1274 | put_cpu(); | 1403 | put_cpu(); |
1275 | } | 1404 | } |
1276 | 1405 | ||
1277 | kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), | 1406 | kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, |
1278 | &hctx->run_work, | 1407 | msecs_to_jiffies(msecs)); |
1279 | msecs_to_jiffies(msecs)); | ||
1280 | } | 1408 | } |
1281 | 1409 | ||
1282 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | 1410 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) |
@@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); | |||
1287 | 1415 | ||
1288 | bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | 1416 | bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) |
1289 | { | 1417 | { |
1290 | if (blk_mq_hctx_has_pending(hctx)) { | 1418 | int srcu_idx; |
1419 | bool need_run; | ||
1420 | |||
1421 | /* | ||
1422 | * When the queue is quiesced we may be switching the io scheduler, | ||
1423 | * updating nr_hw_queues, or doing other things, and the queue can't be | ||
1424 | * run any more; even __blk_mq_hctx_has_pending() can't be called safely. | ||
1425 | * | ||
1426 | * The queue will be rerun by blk_mq_unquiesce_queue() if it is | ||
1427 | * quiesced. | ||
1428 | */ | ||
1429 | hctx_lock(hctx, &srcu_idx); | ||
1430 | need_run = !blk_queue_quiesced(hctx->queue) && | ||
1431 | blk_mq_hctx_has_pending(hctx); | ||
1432 | hctx_unlock(hctx, srcu_idx); | ||
1433 | |||
1434 | if (need_run) { | ||
1291 | __blk_mq_delay_run_hw_queue(hctx, async, 0); | 1435 | __blk_mq_delay_run_hw_queue(hctx, async, 0); |
1292 | return true; | 1436 | return true; |
1293 | } | 1437 | } |
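The comment relies on blk_mq_unquiesce_queue() rerunning the hardware queues that were skipped while quiesced. That function is outside this hunk; roughly, as a sketch based on the 4.15-era implementation (details may differ):

void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* dispatch the requests that were inserted while quiesced */
	blk_mq_run_hw_queues(q, true);
}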
@@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) | |||
1595 | return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); | 1739 | return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); |
1596 | } | 1740 | } |
1597 | 1741 | ||
1598 | static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | 1742 | static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, |
1599 | struct request *rq, | 1743 | struct request *rq, |
1600 | blk_qc_t *cookie, bool may_sleep) | 1744 | blk_qc_t *cookie) |
1601 | { | 1745 | { |
1602 | struct request_queue *q = rq->q; | 1746 | struct request_queue *q = rq->q; |
1603 | struct blk_mq_queue_data bd = { | 1747 | struct blk_mq_queue_data bd = { |
@@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
1606 | }; | 1750 | }; |
1607 | blk_qc_t new_cookie; | 1751 | blk_qc_t new_cookie; |
1608 | blk_status_t ret; | 1752 | blk_status_t ret; |
1753 | |||
1754 | new_cookie = request_to_qc_t(hctx, rq); | ||
1755 | |||
1756 | /* | ||
1757 | * For OK queue, we are done. For error, caller may kill it. | ||
1758 | * Any other error (busy), just add it to our list as we | ||
1759 | * previously would have done. | ||
1760 | */ | ||
1761 | ret = q->mq_ops->queue_rq(hctx, &bd); | ||
1762 | switch (ret) { | ||
1763 | case BLK_STS_OK: | ||
1764 | *cookie = new_cookie; | ||
1765 | break; | ||
1766 | case BLK_STS_RESOURCE: | ||
1767 | __blk_mq_requeue_request(rq); | ||
1768 | break; | ||
1769 | default: | ||
1770 | *cookie = BLK_QC_T_NONE; | ||
1771 | break; | ||
1772 | } | ||
1773 | |||
1774 | return ret; | ||
1775 | } | ||
1776 | |||
1777 | static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | ||
1778 | struct request *rq, | ||
1779 | blk_qc_t *cookie, | ||
1780 | bool bypass_insert) | ||
1781 | { | ||
1782 | struct request_queue *q = rq->q; | ||
1609 | bool run_queue = true; | 1783 | bool run_queue = true; |
1610 | 1784 | ||
1611 | /* RCU or SRCU read lock is needed before checking quiesced flag */ | 1785 | /* |
1786 | * RCU or SRCU read lock is needed before checking quiesced flag. | ||
1787 | * | ||
1788 | * When the queue is stopped or quiesced, ignore 'bypass_insert' from | ||
1789 | * blk_mq_request_issue_directly() and return BLK_STS_OK to the caller, | ||
1790 | * so the driver doesn't try to dispatch again. | ||
1791 | */ | ||
1612 | if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { | 1792 | if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { |
1613 | run_queue = false; | 1793 | run_queue = false; |
1794 | bypass_insert = false; | ||
1614 | goto insert; | 1795 | goto insert; |
1615 | } | 1796 | } |
1616 | 1797 | ||
1617 | if (q->elevator) | 1798 | if (q->elevator && !bypass_insert) |
1618 | goto insert; | 1799 | goto insert; |
1619 | 1800 | ||
1620 | if (!blk_mq_get_driver_tag(rq, NULL, false)) | 1801 | if (!blk_mq_get_driver_tag(rq, NULL, false)) |
@@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
1625 | goto insert; | 1806 | goto insert; |
1626 | } | 1807 | } |
1627 | 1808 | ||
1628 | new_cookie = request_to_qc_t(hctx, rq); | 1809 | return __blk_mq_issue_directly(hctx, rq, cookie); |
1629 | |||
1630 | /* | ||
1631 | * For OK queue, we are done. For error, kill it. Any other | ||
1632 | * error (busy), just add it to our list as we previously | ||
1633 | * would have done | ||
1634 | */ | ||
1635 | ret = q->mq_ops->queue_rq(hctx, &bd); | ||
1636 | switch (ret) { | ||
1637 | case BLK_STS_OK: | ||
1638 | *cookie = new_cookie; | ||
1639 | return; | ||
1640 | case BLK_STS_RESOURCE: | ||
1641 | __blk_mq_requeue_request(rq); | ||
1642 | goto insert; | ||
1643 | default: | ||
1644 | *cookie = BLK_QC_T_NONE; | ||
1645 | blk_mq_end_request(rq, ret); | ||
1646 | return; | ||
1647 | } | ||
1648 | |||
1649 | insert: | 1810 | insert: |
1650 | blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); | 1811 | if (bypass_insert) |
1812 | return BLK_STS_RESOURCE; | ||
1813 | |||
1814 | blk_mq_sched_insert_request(rq, false, run_queue, false); | ||
1815 | return BLK_STS_OK; | ||
1651 | } | 1816 | } |
1652 | 1817 | ||
1653 | static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | 1818 | static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, |
1654 | struct request *rq, blk_qc_t *cookie) | 1819 | struct request *rq, blk_qc_t *cookie) |
1655 | { | 1820 | { |
1656 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { | 1821 | blk_status_t ret; |
1657 | rcu_read_lock(); | 1822 | int srcu_idx; |
1658 | __blk_mq_try_issue_directly(hctx, rq, cookie, false); | ||
1659 | rcu_read_unlock(); | ||
1660 | } else { | ||
1661 | unsigned int srcu_idx; | ||
1662 | 1823 | ||
1663 | might_sleep(); | 1824 | might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); |
1664 | 1825 | ||
1665 | srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); | 1826 | hctx_lock(hctx, &srcu_idx); |
1666 | __blk_mq_try_issue_directly(hctx, rq, cookie, true); | 1827 | |
1667 | srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); | 1828 | ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); |
1668 | } | 1829 | if (ret == BLK_STS_RESOURCE) |
1830 | blk_mq_sched_insert_request(rq, false, true, false); | ||
1831 | else if (ret != BLK_STS_OK) | ||
1832 | blk_mq_end_request(rq, ret); | ||
1833 | |||
1834 | hctx_unlock(hctx, srcu_idx); | ||
1835 | } | ||
1836 | |||
1837 | blk_status_t blk_mq_request_issue_directly(struct request *rq) | ||
1838 | { | ||
1839 | blk_status_t ret; | ||
1840 | int srcu_idx; | ||
1841 | blk_qc_t unused_cookie; | ||
1842 | struct blk_mq_ctx *ctx = rq->mq_ctx; | ||
1843 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); | ||
1844 | |||
1845 | hctx_lock(hctx, &srcu_idx); | ||
1846 | ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); | ||
1847 | hctx_unlock(hctx, srcu_idx); | ||
1848 | |||
1849 | return ret; | ||
1669 | } | 1850 | } |
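blk_mq_request_issue_directly() is the entry point the dm-rq stacking path uses: blk_insert_cloned_request() can push a cloned request straight at the bottom driver and get BLK_STS_RESOURCE back for its own requeue handling instead of having the request silently inserted. The caller side is not shown in this section; a simplified sketch of what it looks like:

blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
	if (q->mq_ops) {
		/*
		 * Bypass a potential scheduler on the bottom device; a busy
		 * driver is reported back as BLK_STS_RESOURCE so dm-rq can
		 * decide when to retry.
		 */
		return blk_mq_request_issue_directly(rq);
	}

	/* legacy request_fn insertion path omitted from this sketch */
	return BLK_STS_OK;
}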
1670 | 1851 | ||
1671 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | 1852 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) |
@@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1776 | } else if (q->elevator) { | 1957 | } else if (q->elevator) { |
1777 | blk_mq_put_ctx(data.ctx); | 1958 | blk_mq_put_ctx(data.ctx); |
1778 | blk_mq_bio_to_request(rq, bio); | 1959 | blk_mq_bio_to_request(rq, bio); |
1779 | blk_mq_sched_insert_request(rq, false, true, true, true); | 1960 | blk_mq_sched_insert_request(rq, false, true, true); |
1780 | } else { | 1961 | } else { |
1781 | blk_mq_put_ctx(data.ctx); | 1962 | blk_mq_put_ctx(data.ctx); |
1782 | blk_mq_bio_to_request(rq, bio); | 1963 | blk_mq_bio_to_request(rq, bio); |
@@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order) | |||
1869 | return (size_t)PAGE_SIZE << order; | 2050 | return (size_t)PAGE_SIZE << order; |
1870 | } | 2051 | } |
1871 | 2052 | ||
2053 | static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, | ||
2054 | unsigned int hctx_idx, int node) | ||
2055 | { | ||
2056 | int ret; | ||
2057 | |||
2058 | if (set->ops->init_request) { | ||
2059 | ret = set->ops->init_request(set, rq, hctx_idx, node); | ||
2060 | if (ret) | ||
2061 | return ret; | ||
2062 | } | ||
2063 | |||
2064 | seqcount_init(&rq->gstate_seq); | ||
2065 | u64_stats_init(&rq->aborted_gstate_sync); | ||
2066 | return 0; | ||
2067 | } | ||
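Besides calling the driver's init_request, the new helper seeds the fields used by the reworked timeout handling: gstate_seq lets readers take a consistent snapshot of the generation/state and the deadline, and aborted_gstate_sync protects aborted_gstate updates on 32-bit machines. The write side pairs with it roughly like this (a sketch of the blk_mq_start_request() sequence elsewhere in the series, not part of this hunk):

	preempt_disable();
	write_seqcount_begin(&rq->gstate_seq);

	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);	/* bumps the generation */
	blk_add_timer(rq);				/* records the new deadline */

	write_seqcount_end(&rq->gstate_seq);
	preempt_enable();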
2068 | |||
1872 | int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, | 2069 | int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, |
1873 | unsigned int hctx_idx, unsigned int depth) | 2070 | unsigned int hctx_idx, unsigned int depth) |
1874 | { | 2071 | { |
@@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, | |||
1930 | struct request *rq = p; | 2127 | struct request *rq = p; |
1931 | 2128 | ||
1932 | tags->static_rqs[i] = rq; | 2129 | tags->static_rqs[i] = rq; |
1933 | if (set->ops->init_request) { | 2130 | if (blk_mq_init_request(set, rq, hctx_idx, node)) { |
1934 | if (set->ops->init_request(set, rq, hctx_idx, | 2131 | tags->static_rqs[i] = NULL; |
1935 | node)) { | 2132 | goto fail; |
1936 | tags->static_rqs[i] = NULL; | ||
1937 | goto fail; | ||
1938 | } | ||
1939 | } | 2133 | } |
1940 | 2134 | ||
1941 | p += rq_size; | 2135 | p += rq_size; |
@@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, | |||
1994 | { | 2188 | { |
1995 | blk_mq_debugfs_unregister_hctx(hctx); | 2189 | blk_mq_debugfs_unregister_hctx(hctx); |
1996 | 2190 | ||
1997 | blk_mq_tag_idle(hctx); | 2191 | if (blk_mq_hw_queue_mapped(hctx)) |
2192 | blk_mq_tag_idle(hctx); | ||
1998 | 2193 | ||
1999 | if (set->ops->exit_request) | 2194 | if (set->ops->exit_request) |
2000 | set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); | 2195 | set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); |
@@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, | |||
2005 | set->ops->exit_hctx(hctx, hctx_idx); | 2200 | set->ops->exit_hctx(hctx, hctx_idx); |
2006 | 2201 | ||
2007 | if (hctx->flags & BLK_MQ_F_BLOCKING) | 2202 | if (hctx->flags & BLK_MQ_F_BLOCKING) |
2008 | cleanup_srcu_struct(hctx->queue_rq_srcu); | 2203 | cleanup_srcu_struct(hctx->srcu); |
2009 | 2204 | ||
2010 | blk_mq_remove_cpuhp(hctx); | 2205 | blk_mq_remove_cpuhp(hctx); |
2011 | blk_free_flush_queue(hctx->fq); | 2206 | blk_free_flush_queue(hctx->fq); |
@@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2074 | if (!hctx->fq) | 2269 | if (!hctx->fq) |
2075 | goto sched_exit_hctx; | 2270 | goto sched_exit_hctx; |
2076 | 2271 | ||
2077 | if (set->ops->init_request && | 2272 | if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) |
2078 | set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, | ||
2079 | node)) | ||
2080 | goto free_fq; | 2273 | goto free_fq; |
2081 | 2274 | ||
2082 | if (hctx->flags & BLK_MQ_F_BLOCKING) | 2275 | if (hctx->flags & BLK_MQ_F_BLOCKING) |
2083 | init_srcu_struct(hctx->queue_rq_srcu); | 2276 | init_srcu_struct(hctx->srcu); |
2084 | 2277 | ||
2085 | blk_mq_debugfs_register_hctx(q, hctx); | 2278 | blk_mq_debugfs_register_hctx(q, hctx); |
2086 | 2279 | ||
@@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, | |||
2116 | INIT_LIST_HEAD(&__ctx->rq_list); | 2309 | INIT_LIST_HEAD(&__ctx->rq_list); |
2117 | __ctx->queue = q; | 2310 | __ctx->queue = q; |
2118 | 2311 | ||
2119 | /* If the cpu isn't present, the cpu is mapped to first hctx */ | ||
2120 | if (!cpu_present(i)) | ||
2121 | continue; | ||
2122 | |||
2123 | hctx = blk_mq_map_queue(q, i); | ||
2124 | |||
2125 | /* | 2312 | /* |
2126 | * Set local node, IFF we have more than one hw queue. If | 2313 | * Set local node, IFF we have more than one hw queue. If |
2127 | * not, we remain on the home node of the device | 2314 | * not, we remain on the home node of the device |
2128 | */ | 2315 | */ |
2316 | hctx = blk_mq_map_queue(q, i); | ||
2129 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) | 2317 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) |
2130 | hctx->numa_node = local_memory_node(cpu_to_node(i)); | 2318 | hctx->numa_node = local_memory_node(cpu_to_node(i)); |
2131 | } | 2319 | } |
@@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
2182 | * | 2370 | * |
2183 | * If the cpu isn't present, the cpu is mapped to first hctx. | 2371 | * If the cpu isn't present, the cpu is mapped to first hctx. |
2184 | */ | 2372 | */ |
2185 | for_each_present_cpu(i) { | 2373 | for_each_possible_cpu(i) { |
2186 | hctx_idx = q->mq_map[i]; | 2374 | hctx_idx = q->mq_map[i]; |
2187 | /* unmapped hw queue can be remapped after CPU topo changed */ | 2375 | /* unmapped hw queue can be remapped after CPU topo changed */ |
2188 | if (!set->tags[hctx_idx] && | 2376 | if (!set->tags[hctx_idx] && |
@@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
2236 | /* | 2424 | /* |
2237 | * Initialize batch roundrobin counts | 2425 | * Initialize batch roundrobin counts |
2238 | */ | 2426 | */ |
2239 | hctx->next_cpu = cpumask_first(hctx->cpumask); | 2427 | hctx->next_cpu = cpumask_first_and(hctx->cpumask, |
2428 | cpu_online_mask); | ||
2240 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | 2429 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; |
2241 | } | 2430 | } |
2242 | } | 2431 | } |
@@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) | |||
2369 | { | 2558 | { |
2370 | int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); | 2559 | int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); |
2371 | 2560 | ||
2372 | BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), | 2561 | BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), |
2373 | __alignof__(struct blk_mq_hw_ctx)) != | 2562 | __alignof__(struct blk_mq_hw_ctx)) != |
2374 | sizeof(struct blk_mq_hw_ctx)); | 2563 | sizeof(struct blk_mq_hw_ctx)); |
2375 | 2564 | ||
@@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, | |||
2386 | struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; | 2575 | struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; |
2387 | 2576 | ||
2388 | blk_mq_sysfs_unregister(q); | 2577 | blk_mq_sysfs_unregister(q); |
2578 | |||
2579 | /* protect against switching io scheduler */ | ||
2580 | mutex_lock(&q->sysfs_lock); | ||
2389 | for (i = 0; i < set->nr_hw_queues; i++) { | 2581 | for (i = 0; i < set->nr_hw_queues; i++) { |
2390 | int node; | 2582 | int node; |
2391 | 2583 | ||
@@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, | |||
2430 | } | 2622 | } |
2431 | } | 2623 | } |
2432 | q->nr_hw_queues = i; | 2624 | q->nr_hw_queues = i; |
2625 | mutex_unlock(&q->sysfs_lock); | ||
2433 | blk_mq_sysfs_register(q); | 2626 | blk_mq_sysfs_register(q); |
2434 | } | 2627 | } |
2435 | 2628 | ||
@@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | |||
2601 | 2794 | ||
2602 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | 2795 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) |
2603 | { | 2796 | { |
2604 | if (set->ops->map_queues) | 2797 | if (set->ops->map_queues) { |
2798 | int cpu; | ||
2799 | /* | ||
2800 | * transport .map_queues is usually done in the following | ||
2801 | * way: | ||
2802 | * | ||
2803 | * for (queue = 0; queue < set->nr_hw_queues; queue++) { | ||
2804 | * mask = get_cpu_mask(queue) | ||
2805 | * for_each_cpu(cpu, mask) | ||
2806 | * set->mq_map[cpu] = queue; | ||
2807 | * } | ||
2808 | * | ||
2809 | * When we need to remap, the table has to be cleared first to | ||
2810 | * kill stale mappings, since a CPU may end up not being mapped | ||
2811 | * to any hw queue by the new map. | ||
2812 | */ | ||
2813 | for_each_possible_cpu(cpu) | ||
2814 | set->mq_map[cpu] = 0; | ||
2815 | |||
2605 | return set->ops->map_queues(set); | 2816 | return set->ops->map_queues(set); |
2606 | else | 2817 | } else |
2607 | return blk_mq_map_queues(set); | 2818 | return blk_mq_map_queues(set); |
2608 | } | 2819 | } |
2609 | 2820 | ||
@@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) | |||
2712 | return -EINVAL; | 2923 | return -EINVAL; |
2713 | 2924 | ||
2714 | blk_mq_freeze_queue(q); | 2925 | blk_mq_freeze_queue(q); |
2926 | blk_mq_quiesce_queue(q); | ||
2715 | 2927 | ||
2716 | ret = 0; | 2928 | ret = 0; |
2717 | queue_for_each_hw_ctx(q, hctx, i) { | 2929 | queue_for_each_hw_ctx(q, hctx, i) { |
@@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) | |||
2735 | if (!ret) | 2947 | if (!ret) |
2736 | q->nr_requests = nr; | 2948 | q->nr_requests = nr; |
2737 | 2949 | ||
2950 | blk_mq_unquiesce_queue(q); | ||
2738 | blk_mq_unfreeze_queue(q); | 2951 | blk_mq_unfreeze_queue(q); |
2739 | 2952 | ||
2740 | return ret; | 2953 | return ret; |
@@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |||
2850 | unsigned int nsecs; | 3063 | unsigned int nsecs; |
2851 | ktime_t kt; | 3064 | ktime_t kt; |
2852 | 3065 | ||
2853 | if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) | 3066 | if (rq->rq_flags & RQF_MQ_POLL_SLEPT) |
2854 | return false; | 3067 | return false; |
2855 | 3068 | ||
2856 | /* | 3069 | /* |
@@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |||
2870 | if (!nsecs) | 3083 | if (!nsecs) |
2871 | return false; | 3084 | return false; |
2872 | 3085 | ||
2873 | set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); | 3086 | rq->rq_flags |= RQF_MQ_POLL_SLEPT; |
2874 | 3087 | ||
2875 | /* | 3088 | /* |
2876 | * This will be replaced with the stats tracking code, using | 3089 | * This will be replaced with the stats tracking code, using |
@@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |||
2884 | 3097 | ||
2885 | hrtimer_init_sleeper(&hs, current); | 3098 | hrtimer_init_sleeper(&hs, current); |
2886 | do { | 3099 | do { |
2887 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) | 3100 | if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) |
2888 | break; | 3101 | break; |
2889 | set_current_state(TASK_UNINTERRUPTIBLE); | 3102 | set_current_state(TASK_UNINTERRUPTIBLE); |
2890 | hrtimer_start_expires(&hs.timer, mode); | 3103 | hrtimer_start_expires(&hs.timer, mode); |
@@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) | |||
2970 | 3183 | ||
2971 | static int __init blk_mq_init(void) | 3184 | static int __init blk_mq_init(void) |
2972 | { | 3185 | { |
2973 | /* | ||
2974 | * See comment in block/blk.h rq_atomic_flags enum | ||
2975 | */ | ||
2976 | BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != | ||
2977 | (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); | ||
2978 | |||
2979 | cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, | 3186 | cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, |
2980 | blk_mq_hctx_notify_dead); | 3187 | blk_mq_hctx_notify_dead); |
2981 | return 0; | 3188 | return 0; |
diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff5bf62..88c558f71819 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
@@ -27,6 +27,20 @@ struct blk_mq_ctx { | |||
27 | struct kobject kobj; | 27 | struct kobject kobj; |
28 | } ____cacheline_aligned_in_smp; | 28 | } ____cacheline_aligned_in_smp; |
29 | 29 | ||
30 | /* | ||
31 | * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value | ||
32 | * and the upper bits the generation number. | ||
33 | */ | ||
34 | enum mq_rq_state { | ||
35 | MQ_RQ_IDLE = 0, | ||
36 | MQ_RQ_IN_FLIGHT = 1, | ||
37 | MQ_RQ_COMPLETE = 2, | ||
38 | |||
39 | MQ_RQ_STATE_BITS = 2, | ||
40 | MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, | ||
41 | MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, | ||
42 | }; | ||
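In other words, gstate packs a monotonically increasing generation into the upper bits and the MQ_RQ_* state into the lower two. For example (the concrete values are illustrative):

	u64 gstate = (3 << MQ_RQ_STATE_BITS) | MQ_RQ_IN_FLIGHT;	/* 0xd */

	int state = gstate & MQ_RQ_STATE_MASK;		/* MQ_RQ_IN_FLIGHT */
	u64 gen   = gstate >> MQ_RQ_STATE_BITS;		/* generation 3 */

The timeout rework compares these generations to tell a recycled request apart from the instance it armed a timer for.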
43 | |||
30 | void blk_mq_freeze_queue(struct request_queue *q); | 44 | void blk_mq_freeze_queue(struct request_queue *q); |
31 | void blk_mq_free_queue(struct request_queue *q); | 45 | void blk_mq_free_queue(struct request_queue *q); |
32 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); | 46 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); |
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); | |||
60 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | 74 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, |
61 | struct list_head *list); | 75 | struct list_head *list); |
62 | 76 | ||
77 | /* Used by blk_insert_cloned_request() to issue request directly */ | ||
78 | blk_status_t blk_mq_request_issue_directly(struct request *rq); | ||
79 | |||
63 | /* | 80 | /* |
64 | * CPU -> queue mappings | 81 | * CPU -> queue mappings |
65 | */ | 82 | */ |
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q); | |||
81 | extern void blk_mq_sysfs_unregister(struct request_queue *q); | 98 | extern void blk_mq_sysfs_unregister(struct request_queue *q); |
82 | extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); | 99 | extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); |
83 | 100 | ||
84 | extern void blk_mq_rq_timed_out(struct request *req, bool reserved); | ||
85 | |||
86 | void blk_mq_release(struct request_queue *q); | 101 | void blk_mq_release(struct request_queue *q); |
87 | 102 | ||
103 | /** | ||
104 | * blk_mq_rq_state() - read the current MQ_RQ_* state of a request | ||
105 | * @rq: target request. | ||
106 | */ | ||
107 | static inline int blk_mq_rq_state(struct request *rq) | ||
108 | { | ||
109 | return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; | ||
110 | } | ||
111 | |||
112 | /** | ||
113 | * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request | ||
114 | * @rq: target request. | ||
115 | * @state: new state to set. | ||
116 | * | ||
117 | * Set @rq's state to @state. The caller is responsible for ensuring that | ||
118 | * there are no other updaters. A request can transition into IN_FLIGHT | ||
119 | * only from IDLE and doing so increments the generation number. | ||
120 | */ | ||
121 | static inline void blk_mq_rq_update_state(struct request *rq, | ||
122 | enum mq_rq_state state) | ||
123 | { | ||
124 | u64 old_val = READ_ONCE(rq->gstate); | ||
125 | u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; | ||
126 | |||
127 | if (state == MQ_RQ_IN_FLIGHT) { | ||
128 | WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); | ||
129 | new_val += MQ_RQ_GEN_INC; | ||
130 | } | ||
131 | |||
132 | /* avoid exposing interim values */ | ||
133 | WRITE_ONCE(rq->gstate, new_val); | ||
134 | } | ||
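A typical life cycle through these helpers, assuming the single-updater rule described above (illustrative, not lifted from the patch):

	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);	/* IDLE -> IN_FLIGHT, generation++ */
	/* ... the driver completes the request ... */
	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_COMPLETE);
	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);		/* ready for reuse */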
135 | |||
88 | static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, | 136 | static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, |
89 | unsigned int cpu) | 137 | unsigned int cpu) |
90 | { | 138 | { |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484eaed1f..cbea895a5547 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = { | |||
853 | .release = blk_release_queue, | 853 | .release = blk_release_queue, |
854 | }; | 854 | }; |
855 | 855 | ||
856 | /** | ||
857 | * blk_register_queue - register a block layer queue with sysfs | ||
858 | * @disk: Disk of which the request queue should be registered with sysfs. | ||
859 | */ | ||
856 | int blk_register_queue(struct gendisk *disk) | 860 | int blk_register_queue(struct gendisk *disk) |
857 | { | 861 | { |
858 | int ret; | 862 | int ret; |
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk) | |||
909 | if (q->request_fn || (q->mq_ops && q->elevator)) { | 913 | if (q->request_fn || (q->mq_ops && q->elevator)) { |
910 | ret = elv_register_queue(q); | 914 | ret = elv_register_queue(q); |
911 | if (ret) { | 915 | if (ret) { |
916 | mutex_unlock(&q->sysfs_lock); | ||
912 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 917 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
913 | kobject_del(&q->kobj); | 918 | kobject_del(&q->kobj); |
914 | blk_trace_remove_sysfs(dev); | 919 | blk_trace_remove_sysfs(dev); |
915 | kobject_put(&dev->kobj); | 920 | kobject_put(&dev->kobj); |
916 | goto unlock; | 921 | return ret; |
917 | } | 922 | } |
918 | } | 923 | } |
919 | ret = 0; | 924 | ret = 0; |
@@ -921,7 +926,15 @@ unlock: | |||
921 | mutex_unlock(&q->sysfs_lock); | 926 | mutex_unlock(&q->sysfs_lock); |
922 | return ret; | 927 | return ret; |
923 | } | 928 | } |
929 | EXPORT_SYMBOL_GPL(blk_register_queue); | ||
924 | 930 | ||
931 | /** | ||
932 | * blk_unregister_queue - counterpart of blk_register_queue() | ||
933 | * @disk: Disk of which the request queue should be unregistered from sysfs. | ||
934 | * | ||
935 | * Note: the caller is responsible for guaranteeing that this function is called | ||
936 | * after blk_register_queue() has finished. | ||
937 | */ | ||
925 | void blk_unregister_queue(struct gendisk *disk) | 938 | void blk_unregister_queue(struct gendisk *disk) |
926 | { | 939 | { |
927 | struct request_queue *q = disk->queue; | 940 | struct request_queue *q = disk->queue; |
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk) | |||
929 | if (WARN_ON(!q)) | 942 | if (WARN_ON(!q)) |
930 | return; | 943 | return; |
931 | 944 | ||
932 | mutex_lock(&q->sysfs_lock); | 945 | /* Return early if disk->queue was never registered. */ |
933 | queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); | 946 | if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) |
934 | mutex_unlock(&q->sysfs_lock); | 947 | return; |
935 | 948 | ||
936 | wbt_exit(q); | 949 | /* |
950 | * Since sysfs_remove_dir() prevents adding new directory entries | ||
951 | * before removal of existing entries starts, protect against | ||
952 | * concurrent elv_iosched_store() calls. | ||
953 | */ | ||
954 | mutex_lock(&q->sysfs_lock); | ||
937 | 955 | ||
956 | spin_lock_irq(q->queue_lock); | ||
957 | queue_flag_clear(QUEUE_FLAG_REGISTERED, q); | ||
958 | spin_unlock_irq(q->queue_lock); | ||
938 | 959 | ||
960 | /* | ||
961 | * Remove the sysfs attributes before unregistering the queue data | ||
962 | * structures that can be modified through sysfs. | ||
963 | */ | ||
939 | if (q->mq_ops) | 964 | if (q->mq_ops) |
940 | blk_mq_unregister_dev(disk_to_dev(disk), q); | 965 | blk_mq_unregister_dev(disk_to_dev(disk), q); |
941 | 966 | mutex_unlock(&q->sysfs_lock); | |
942 | if (q->request_fn || (q->mq_ops && q->elevator)) | ||
943 | elv_unregister_queue(q); | ||
944 | 967 | ||
945 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 968 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
946 | kobject_del(&q->kobj); | 969 | kobject_del(&q->kobj); |
947 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 970 | blk_trace_remove_sysfs(disk_to_dev(disk)); |
971 | |||
972 | wbt_exit(q); | ||
973 | |||
974 | mutex_lock(&q->sysfs_lock); | ||
975 | if (q->request_fn || (q->mq_ops && q->elevator)) | ||
976 | elv_unregister_queue(q); | ||
977 | mutex_unlock(&q->sysfs_lock); | ||
978 | |||
948 | kobject_put(&disk_to_dev(disk)->kobj); | 979 | kobject_put(&disk_to_dev(disk)->kobj); |
949 | } | 980 | } |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d19f416d6101..c5a131673733 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -216,9 +216,9 @@ struct throtl_data | |||
216 | 216 | ||
217 | unsigned int scale; | 217 | unsigned int scale; |
218 | 218 | ||
219 | struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; | 219 | struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; |
220 | struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; | 220 | struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; |
221 | struct latency_bucket __percpu *latency_buckets; | 221 | struct latency_bucket __percpu *latency_buckets[2]; |
222 | unsigned long last_calculate_time; | 222 | unsigned long last_calculate_time; |
223 | unsigned long filtered_latency; | 223 | unsigned long filtered_latency; |
224 | 224 | ||
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = { | |||
1511 | .seq_show = blkg_print_stat_bytes, | 1511 | .seq_show = blkg_print_stat_bytes, |
1512 | }, | 1512 | }, |
1513 | { | 1513 | { |
1514 | .name = "throttle.io_service_bytes_recursive", | ||
1515 | .private = (unsigned long)&blkcg_policy_throtl, | ||
1516 | .seq_show = blkg_print_stat_bytes_recursive, | ||
1517 | }, | ||
1518 | { | ||
1514 | .name = "throttle.io_serviced", | 1519 | .name = "throttle.io_serviced", |
1515 | .private = (unsigned long)&blkcg_policy_throtl, | 1520 | .private = (unsigned long)&blkcg_policy_throtl, |
1516 | .seq_show = blkg_print_stat_ios, | 1521 | .seq_show = blkg_print_stat_ios, |
1517 | }, | 1522 | }, |
1523 | { | ||
1524 | .name = "throttle.io_serviced_recursive", | ||
1525 | .private = (unsigned long)&blkcg_policy_throtl, | ||
1526 | .seq_show = blkg_print_stat_ios_recursive, | ||
1527 | }, | ||
1518 | { } /* terminate */ | 1528 | { } /* terminate */ |
1519 | }; | 1529 | }; |
1520 | 1530 | ||
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) | |||
2040 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | 2050 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW |
2041 | static void throtl_update_latency_buckets(struct throtl_data *td) | 2051 | static void throtl_update_latency_buckets(struct throtl_data *td) |
2042 | { | 2052 | { |
2043 | struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; | 2053 | struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; |
2044 | int i, cpu; | 2054 | int i, cpu, rw; |
2045 | unsigned long last_latency = 0; | 2055 | unsigned long last_latency[2] = { 0 }; |
2046 | unsigned long latency; | 2056 | unsigned long latency[2]; |
2047 | 2057 | ||
2048 | if (!blk_queue_nonrot(td->queue)) | 2058 | if (!blk_queue_nonrot(td->queue)) |
2049 | return; | 2059 | return; |
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td) | |||
2052 | td->last_calculate_time = jiffies; | 2062 | td->last_calculate_time = jiffies; |
2053 | 2063 | ||
2054 | memset(avg_latency, 0, sizeof(avg_latency)); | 2064 | memset(avg_latency, 0, sizeof(avg_latency)); |
2055 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { | 2065 | for (rw = READ; rw <= WRITE; rw++) { |
2056 | struct latency_bucket *tmp = &td->tmp_buckets[i]; | 2066 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { |
2057 | 2067 | struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; | |
2058 | for_each_possible_cpu(cpu) { | 2068 | |
2059 | struct latency_bucket *bucket; | 2069 | for_each_possible_cpu(cpu) { |
2060 | 2070 | struct latency_bucket *bucket; | |
2061 | /* this isn't race free, but ok in practice */ | 2071 | |
2062 | bucket = per_cpu_ptr(td->latency_buckets, cpu); | 2072 | /* this isn't race free, but ok in practice */ |
2063 | tmp->total_latency += bucket[i].total_latency; | 2073 | bucket = per_cpu_ptr(td->latency_buckets[rw], |
2064 | tmp->samples += bucket[i].samples; | 2074 | cpu); |
2065 | bucket[i].total_latency = 0; | 2075 | tmp->total_latency += bucket[i].total_latency; |
2066 | bucket[i].samples = 0; | 2076 | tmp->samples += bucket[i].samples; |
2067 | } | 2077 | bucket[i].total_latency = 0; |
2078 | bucket[i].samples = 0; | ||
2079 | } | ||
2068 | 2080 | ||
2069 | if (tmp->samples >= 32) { | 2081 | if (tmp->samples >= 32) { |
2070 | int samples = tmp->samples; | 2082 | int samples = tmp->samples; |
2071 | 2083 | ||
2072 | latency = tmp->total_latency; | 2084 | latency[rw] = tmp->total_latency; |
2073 | 2085 | ||
2074 | tmp->total_latency = 0; | 2086 | tmp->total_latency = 0; |
2075 | tmp->samples = 0; | 2087 | tmp->samples = 0; |
2076 | latency /= samples; | 2088 | latency[rw] /= samples; |
2077 | if (latency == 0) | 2089 | if (latency[rw] == 0) |
2078 | continue; | 2090 | continue; |
2079 | avg_latency[i].latency = latency; | 2091 | avg_latency[rw][i].latency = latency[rw]; |
2092 | } | ||
2080 | } | 2093 | } |
2081 | } | 2094 | } |
2082 | 2095 | ||
2083 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { | 2096 | for (rw = READ; rw <= WRITE; rw++) { |
2084 | if (!avg_latency[i].latency) { | 2097 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { |
2085 | if (td->avg_buckets[i].latency < last_latency) | 2098 | if (!avg_latency[rw][i].latency) { |
2086 | td->avg_buckets[i].latency = last_latency; | 2099 | if (td->avg_buckets[rw][i].latency < last_latency[rw]) |
2087 | continue; | 2100 | td->avg_buckets[rw][i].latency = |
2088 | } | 2101 | last_latency[rw]; |
2102 | continue; | ||
2103 | } | ||
2089 | 2104 | ||
2090 | if (!td->avg_buckets[i].valid) | 2105 | if (!td->avg_buckets[rw][i].valid) |
2091 | latency = avg_latency[i].latency; | 2106 | latency[rw] = avg_latency[rw][i].latency; |
2092 | else | 2107 | else |
2093 | latency = (td->avg_buckets[i].latency * 7 + | 2108 | latency[rw] = (td->avg_buckets[rw][i].latency * 7 + |
2094 | avg_latency[i].latency) >> 3; | 2109 | avg_latency[rw][i].latency) >> 3; |
2095 | 2110 | ||
2096 | td->avg_buckets[i].latency = max(latency, last_latency); | 2111 | td->avg_buckets[rw][i].latency = max(latency[rw], |
2097 | td->avg_buckets[i].valid = true; | 2112 | last_latency[rw]); |
2098 | last_latency = td->avg_buckets[i].latency; | 2113 | td->avg_buckets[rw][i].valid = true; |
2114 | last_latency[rw] = td->avg_buckets[rw][i].latency; | ||
2115 | } | ||
2099 | } | 2116 | } |
2100 | 2117 | ||
2101 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) | 2118 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) |
2102 | throtl_log(&td->service_queue, | 2119 | throtl_log(&td->service_queue, |
2103 | "Latency bucket %d: latency=%ld, valid=%d", i, | 2120 | "Latency bucket %d: read latency=%ld, read valid=%d, " |
2104 | td->avg_buckets[i].latency, td->avg_buckets[i].valid); | 2121 | "write latency=%ld, write valid=%d", i, |
2122 | td->avg_buckets[READ][i].latency, | ||
2123 | td->avg_buckets[READ][i].valid, | ||
2124 | td->avg_buckets[WRITE][i].latency, | ||
2125 | td->avg_buckets[WRITE][i].valid); | ||
2105 | } | 2126 | } |
2106 | #else | 2127 | #else |
2107 | static inline void throtl_update_latency_buckets(struct throtl_data *td) | 2128 | static inline void throtl_update_latency_buckets(struct throtl_data *td) |
@@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, | |||
2242 | struct latency_bucket *latency; | 2263 | struct latency_bucket *latency; |
2243 | int index; | 2264 | int index; |
2244 | 2265 | ||
2245 | if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || | 2266 | if (!td || td->limit_index != LIMIT_LOW || |
2267 | !(op == REQ_OP_READ || op == REQ_OP_WRITE) || | ||
2246 | !blk_queue_nonrot(td->queue)) | 2268 | !blk_queue_nonrot(td->queue)) |
2247 | return; | 2269 | return; |
2248 | 2270 | ||
2249 | index = request_bucket_index(size); | 2271 | index = request_bucket_index(size); |
2250 | 2272 | ||
2251 | latency = get_cpu_ptr(td->latency_buckets); | 2273 | latency = get_cpu_ptr(td->latency_buckets[op]); |
2252 | latency[index].total_latency += time; | 2274 | latency[index].total_latency += time; |
2253 | latency[index].samples++; | 2275 | latency[index].samples++; |
2254 | put_cpu_ptr(td->latency_buckets); | 2276 | put_cpu_ptr(td->latency_buckets[op]); |
2255 | } | 2277 | } |
2256 | 2278 | ||
2257 | void blk_throtl_stat_add(struct request *rq, u64 time_ns) | 2279 | void blk_throtl_stat_add(struct request *rq, u64 time_ns) |
@@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
2270 | unsigned long finish_time; | 2292 | unsigned long finish_time; |
2271 | unsigned long start_time; | 2293 | unsigned long start_time; |
2272 | unsigned long lat; | 2294 | unsigned long lat; |
2295 | int rw = bio_data_dir(bio); | ||
2273 | 2296 | ||
2274 | tg = bio->bi_cg_private; | 2297 | tg = bio->bi_cg_private; |
2275 | if (!tg) | 2298 | if (!tg) |
@@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
2298 | 2321 | ||
2299 | bucket = request_bucket_index( | 2322 | bucket = request_bucket_index( |
2300 | blk_stat_size(&bio->bi_issue_stat)); | 2323 | blk_stat_size(&bio->bi_issue_stat)); |
2301 | threshold = tg->td->avg_buckets[bucket].latency + | 2324 | threshold = tg->td->avg_buckets[rw][bucket].latency + |
2302 | tg->latency_target; | 2325 | tg->latency_target; |
2303 | if (lat > threshold) | 2326 | if (lat > threshold) |
2304 | tg->bad_bio_cnt++; | 2327 | tg->bad_bio_cnt++; |
@@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q) | |||
2391 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | 2414 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); |
2392 | if (!td) | 2415 | if (!td) |
2393 | return -ENOMEM; | 2416 | return -ENOMEM; |
2394 | td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * | 2417 | td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * |
2395 | LATENCY_BUCKET_SIZE, __alignof__(u64)); | 2418 | LATENCY_BUCKET_SIZE, __alignof__(u64)); |
2396 | if (!td->latency_buckets) { | 2419 | if (!td->latency_buckets[READ]) { |
2420 | kfree(td); | ||
2421 | return -ENOMEM; | ||
2422 | } | ||
2423 | td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * | ||
2424 | LATENCY_BUCKET_SIZE, __alignof__(u64)); | ||
2425 | if (!td->latency_buckets[WRITE]) { | ||
2426 | free_percpu(td->latency_buckets[READ]); | ||
2397 | kfree(td); | 2427 | kfree(td); |
2398 | return -ENOMEM; | 2428 | return -ENOMEM; |
2399 | } | 2429 | } |
@@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q) | |||
2412 | /* activate policy */ | 2442 | /* activate policy */ |
2413 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); | 2443 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); |
2414 | if (ret) { | 2444 | if (ret) { |
2415 | free_percpu(td->latency_buckets); | 2445 | free_percpu(td->latency_buckets[READ]); |
2446 | free_percpu(td->latency_buckets[WRITE]); | ||
2416 | kfree(td); | 2447 | kfree(td); |
2417 | } | 2448 | } |
2418 | return ret; | 2449 | return ret; |
@@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q) | |||
2423 | BUG_ON(!q->td); | 2454 | BUG_ON(!q->td); |
2424 | throtl_shutdown_wq(q); | 2455 | throtl_shutdown_wq(q); |
2425 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); | 2456 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); |
2426 | free_percpu(q->td->latency_buckets); | 2457 | free_percpu(q->td->latency_buckets[READ]); |
2458 | free_percpu(q->td->latency_buckets[WRITE]); | ||
2427 | kfree(q->td); | 2459 | kfree(q->td); |
2428 | } | 2460 | } |
2429 | 2461 | ||
@@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q) | |||
2441 | } else { | 2473 | } else { |
2442 | td->throtl_slice = DFL_THROTL_SLICE_HD; | 2474 | td->throtl_slice = DFL_THROTL_SLICE_HD; |
2443 | td->filtered_latency = LATENCY_FILTERED_HD; | 2475 | td->filtered_latency = LATENCY_FILTERED_HD; |
2444 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) | 2476 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { |
2445 | td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; | 2477 | td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; |
2478 | td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; | ||
2479 | } | ||
2446 | } | 2480 | } |
2447 | #ifndef CONFIG_BLK_DEV_THROTTLING_LOW | 2481 | #ifndef CONFIG_BLK_DEV_THROTTLING_LOW |
2448 | /* if no low limit, use previous default */ | 2482 | /* if no low limit, use previous default */ |
2449 | td->throtl_slice = DFL_THROTL_SLICE_HD; | 2483 | td->throtl_slice = DFL_THROTL_SLICE_HD; |
2450 | #endif | 2484 | #endif |
2451 | 2485 | ||
2452 | td->track_bio_latency = !q->mq_ops && !q->request_fn; | 2486 | td->track_bio_latency = !queue_is_rq_based(q); |
2453 | if (!td->track_bio_latency) | 2487 | if (!td->track_bio_latency) |
2454 | blk_stat_enable_accounting(q); | 2488 | blk_stat_enable_accounting(q); |
2455 | } | 2489 | } |
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9aeb30..a05e3676d24a 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req) | |||
112 | static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, | 112 | static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, |
113 | unsigned int *next_set) | 113 | unsigned int *next_set) |
114 | { | 114 | { |
115 | if (time_after_eq(jiffies, rq->deadline)) { | 115 | const unsigned long deadline = blk_rq_deadline(rq); |
116 | |||
117 | if (time_after_eq(jiffies, deadline)) { | ||
116 | list_del_init(&rq->timeout_list); | 118 | list_del_init(&rq->timeout_list); |
117 | 119 | ||
118 | /* | 120 | /* |
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout | |||
120 | */ | 122 | */ |
121 | if (!blk_mark_rq_complete(rq)) | 123 | if (!blk_mark_rq_complete(rq)) |
122 | blk_rq_timed_out(rq); | 124 | blk_rq_timed_out(rq); |
123 | } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { | 125 | } else if (!*next_set || time_after(*next_timeout, deadline)) { |
124 | *next_timeout = rq->deadline; | 126 | *next_timeout = deadline; |
125 | *next_set = 1; | 127 | *next_set = 1; |
126 | } | 128 | } |
127 | } | 129 | } |
@@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work) | |||
156 | */ | 158 | */ |
157 | void blk_abort_request(struct request *req) | 159 | void blk_abort_request(struct request *req) |
158 | { | 160 | { |
159 | if (blk_mark_rq_complete(req)) | ||
160 | return; | ||
161 | |||
162 | if (req->q->mq_ops) { | 161 | if (req->q->mq_ops) { |
163 | blk_mq_rq_timed_out(req, false); | 162 | /* |
163 | * All we need to ensure is that the timeout scan takes place | ||
164 | * immediately and that it sees the new timeout value. | ||
165 | * No need for fancy synchronizations. | ||
166 | */ | ||
167 | blk_rq_set_deadline(req, jiffies); | ||
168 | mod_timer(&req->q->timeout, 0); | ||
164 | } else { | 169 | } else { |
170 | if (blk_mark_rq_complete(req)) | ||
171 | return; | ||
165 | blk_delete_timer(req); | 172 | blk_delete_timer(req); |
166 | blk_rq_timed_out(req); | 173 | blk_rq_timed_out(req); |
167 | } | 174 | } |
@@ -208,7 +215,8 @@ void blk_add_timer(struct request *req) | |||
208 | if (!req->timeout) | 215 | if (!req->timeout) |
209 | req->timeout = q->rq_timeout; | 216 | req->timeout = q->rq_timeout; |
210 | 217 | ||
211 | WRITE_ONCE(req->deadline, jiffies + req->timeout); | 218 | blk_rq_set_deadline(req, jiffies + req->timeout); |
219 | req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; | ||
212 | 220 | ||
213 | /* | 221 | /* |
214 | * Only the non-mq case needs to add the request to a protected list. | 222 | * Only the non-mq case needs to add the request to a protected list. |
@@ -222,7 +230,7 @@ void blk_add_timer(struct request *req) | |||
222 | * than an existing one, modify the timer. Round up to next nearest | 230 | * than an existing one, modify the timer. Round up to next nearest |
223 | * second. | 231 | * second. |
224 | */ | 232 | */ |
225 | expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); | 233 | expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); |
226 | 234 | ||
227 | if (!timer_pending(&q->timeout) || | 235 | if (!timer_pending(&q->timeout) || |
228 | time_before(expiry, q->timeout.expires)) { | 236 | time_before(expiry, q->timeout.expires)) { |
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb51b338..acb7252c7e81 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c | |||
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q, | |||
22 | } | 22 | } |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Return true if a request is a write request that needs zone write locking. | ||
26 | */ | ||
27 | bool blk_req_needs_zone_write_lock(struct request *rq) | ||
28 | { | ||
29 | if (!rq->q->seq_zones_wlock) | ||
30 | return false; | ||
31 | |||
32 | if (blk_rq_is_passthrough(rq)) | ||
33 | return false; | ||
34 | |||
35 | switch (req_op(rq)) { | ||
36 | case REQ_OP_WRITE_ZEROES: | ||
37 | case REQ_OP_WRITE_SAME: | ||
38 | case REQ_OP_WRITE: | ||
39 | return blk_rq_zone_is_seq(rq); | ||
40 | default: | ||
41 | return false; | ||
42 | } | ||
43 | } | ||
44 | EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); | ||
45 | |||
46 | void __blk_req_zone_write_lock(struct request *rq) | ||
47 | { | ||
48 | if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), | ||
49 | rq->q->seq_zones_wlock))) | ||
50 | return; | ||
51 | |||
52 | WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); | ||
53 | rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; | ||
54 | } | ||
55 | EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); | ||
56 | |||
57 | void __blk_req_zone_write_unlock(struct request *rq) | ||
58 | { | ||
59 | rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; | ||
60 | if (rq->q->seq_zones_wlock) | ||
61 | WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), | ||
62 | rq->q->seq_zones_wlock)); | ||
63 | } | ||
64 | EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); | ||
65 | |||
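These exports give request-based drivers and the deadline schedulers a per-zone write lock for sequential zones, backed by the q->seq_zones_wlock bitmap. The RQF_ZONE_WRITE_LOCKED-checking inline wrappers live in blkdev.h and are not in this hunk; the expected caller pattern is roughly (a sketch, not lifted verbatim from the patch):

	/* at dispatch time: serialize writes targeting the same sequential zone */
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);

	/* ... the request is issued to the device ... */

	/* at completion or requeue: let the next write to that zone proceed */
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);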
66 | /* | ||
25 | * Check that a zone report belongs to the partition. | 67 | * Check that a zone report belongs to the partition. |
26 | * If yes, fix its start sector and write pointer, copy it in the | 68 | * If yes, fix its start sector and write pointer, copy it in the |
27 | * zone information array and return true. Return false otherwise. | 69 | * zone information array and return true. Return false otherwise. |
diff --git a/block/blk.h b/block/blk.h index 442098aa9463..46db5dc83dcb 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes); | |||
120 | void blk_account_io_done(struct request *req); | 120 | void blk_account_io_done(struct request *req); |
121 | 121 | ||
122 | /* | 122 | /* |
123 | * Internal atomic flags for request handling | ||
124 | */ | ||
125 | enum rq_atomic_flags { | ||
126 | /* | ||
127 | * Keep these two bits first - not because we depend on the | ||
128 | * value of them, but we do depend on them being in the same | ||
129 | * byte of storage to ensure ordering on writes. Keeping them | ||
130 | * first will achieve that nicely. | ||
131 | */ | ||
132 | REQ_ATOM_COMPLETE = 0, | ||
133 | REQ_ATOM_STARTED, | ||
134 | |||
135 | REQ_ATOM_POLL_SLEPT, | ||
136 | }; | ||
137 | |||
138 | /* | ||
139 | * EH timer and IO completion will both attempt to 'grab' the request, make | 123 | * EH timer and IO completion will both attempt to 'grab' the request, make |
140 | * sure that only one of them succeeds | 124 | * sure that only one of them succeeds. Steal the bottom bit of the |
125 | * __deadline field for this. | ||
141 | */ | 126 | */ |
142 | static inline int blk_mark_rq_complete(struct request *rq) | 127 | static inline int blk_mark_rq_complete(struct request *rq) |
143 | { | 128 | { |
144 | return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | 129 | return test_and_set_bit(0, &rq->__deadline); |
145 | } | 130 | } |
146 | 131 | ||
147 | static inline void blk_clear_rq_complete(struct request *rq) | 132 | static inline void blk_clear_rq_complete(struct request *rq) |
148 | { | 133 | { |
149 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | 134 | clear_bit(0, &rq->__deadline); |
135 | } | ||
136 | |||
137 | static inline bool blk_rq_is_complete(struct request *rq) | ||
138 | { | ||
139 | return test_bit(0, &rq->__deadline); | ||
150 | } | 140 | } |
151 | 141 | ||
152 | /* | 142 | /* |
@@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq | |||
172 | e->type->ops.sq.elevator_deactivate_req_fn(q, rq); | 162 | e->type->ops.sq.elevator_deactivate_req_fn(q, rq); |
173 | } | 163 | } |
174 | 164 | ||
165 | int elv_register_queue(struct request_queue *q); | ||
166 | void elv_unregister_queue(struct request_queue *q); | ||
167 | |||
175 | struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); | 168 | struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); |
176 | 169 | ||
177 | #ifdef CONFIG_FAIL_IO_TIMEOUT | 170 | #ifdef CONFIG_FAIL_IO_TIMEOUT |
@@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) | |||
246 | } | 239 | } |
247 | 240 | ||
248 | /* | 241 | /* |
242 | * Steal a bit from this field for legacy IO path atomic IO marking. Note that | ||
243 | * setting the deadline clears the bottom bit, potentially clearing the | ||
244 | * completed bit. The user has to be OK with this (current ones are fine). | ||
245 | */ | ||
246 | static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) | ||
247 | { | ||
248 | rq->__deadline = time & ~0x1UL; | ||
249 | } | ||
250 | |||
251 | static inline unsigned long blk_rq_deadline(struct request *rq) | ||
252 | { | ||
253 | return rq->__deadline & ~0x1UL; | ||
254 | } | ||
255 | |||
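To make the bit sharing concrete (an illustration, not code from the patch): the deadline is stored with bit 0 cleared, so the completion marker set by blk_mark_rq_complete() never leaks into the value read back, at the cost of one jiffy of deadline resolution:

	blk_rq_set_deadline(rq, jiffies + 30 * HZ);	/* stores the deadline, clears bit 0 */
	blk_mark_rq_complete(rq);			/* test_and_set_bit() on bit 0 */

	WARN_ON(blk_rq_deadline(rq) & 0x1UL);		/* the accessor masks bit 0 out */
	WARN_ON(!blk_rq_is_complete(rq));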
256 | /* | ||
249 | * Internal io_context interface | 257 | * Internal io_context interface |
250 | */ | 258 | */ |
251 | void get_io_context(struct io_context *ioc); | 259 | void get_io_context(struct io_context *ioc); |
diff --git a/block/bounce.c b/block/bounce.c index 1d05c422c932..6a3e68292273 100644 --- a/block/bounce.c +++ b/block/bounce.c | |||
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void) | |||
113 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | 113 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) |
114 | { | 114 | { |
115 | unsigned char *vfrom; | 115 | unsigned char *vfrom; |
116 | struct bio_vec tovec, *fromvec = from->bi_io_vec; | 116 | struct bio_vec tovec, fromvec; |
117 | struct bvec_iter iter; | 117 | struct bvec_iter iter; |
118 | /* | ||
119 | * The bio of @from is created by bounce, so we can iterate | ||
120 | * its bvec from start to end, but the @from->bi_iter can't be | ||
121 | * trusted because it might be changed by splitting. | ||
122 | */ | ||
123 | struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; | ||
118 | 124 | ||
119 | bio_for_each_segment(tovec, to, iter) { | 125 | bio_for_each_segment(tovec, to, iter) { |
120 | if (tovec.bv_page != fromvec->bv_page) { | 126 | fromvec = bio_iter_iovec(from, from_iter); |
127 | if (tovec.bv_page != fromvec.bv_page) { | ||
121 | /* | 128 | /* |
122 | * fromvec->bv_offset and fromvec->bv_len might have | 129 | * fromvec->bv_offset and fromvec->bv_len might have |
123 | * been modified by the block layer, so use the original | 130 | * been modified by the block layer, so use the original |
124 | * copy, bounce_copy_vec already uses tovec->bv_len | 131 | * copy, bounce_copy_vec already uses tovec->bv_len |
125 | */ | 132 | */ |
126 | vfrom = page_address(fromvec->bv_page) + | 133 | vfrom = page_address(fromvec.bv_page) + |
127 | tovec.bv_offset; | 134 | tovec.bv_offset; |
128 | 135 | ||
129 | bounce_copy_vec(&tovec, vfrom); | 136 | bounce_copy_vec(&tovec, vfrom); |
130 | flush_dcache_page(tovec.bv_page); | 137 | flush_dcache_page(tovec.bv_page); |
131 | } | 138 | } |
132 | 139 | bio_advance_iter(from, &from_iter, tovec.bv_len); | |
133 | fromvec++; | ||
134 | } | 140 | } |
135 | } | 141 | } |
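BVEC_ITER_ALL_INIT comes from Ming's multipage-bvec preparation work: an iterator that starts at bvec 0 with an effectively unlimited size, so bio_iter_iovec()/bio_advance_iter() walk the bounce bio's own vector table instead of trusting the (possibly split) bi_iter. Its definition is not in this hunk; it amounts to something along these lines:

#define BVEC_ITER_ALL_INIT (struct bvec_iter)				\
{									\
	.bi_sector	= 0,						\
	.bi_size	= UINT_MAX,					\
	.bi_idx		= 0,						\
	.bi_bvec_done	= 0,						\
}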
136 | 142 | ||
137 | static void bounce_end_io(struct bio *bio, mempool_t *pool) | 143 | static void bounce_end_io(struct bio *bio, mempool_t *pool) |
138 | { | 144 | { |
139 | struct bio *bio_orig = bio->bi_private; | 145 | struct bio *bio_orig = bio->bi_private; |
140 | struct bio_vec *bvec, *org_vec; | 146 | struct bio_vec *bvec, orig_vec; |
141 | int i; | 147 | int i; |
142 | int start = bio_orig->bi_iter.bi_idx; | 148 | struct bvec_iter orig_iter = bio_orig->bi_iter; |
143 | 149 | ||
144 | /* | 150 | /* |
145 | * free up bounce indirect pages used | 151 | * free up bounce indirect pages used |
146 | */ | 152 | */ |
147 | bio_for_each_segment_all(bvec, bio, i) { | 153 | bio_for_each_segment_all(bvec, bio, i) { |
148 | org_vec = bio_orig->bi_io_vec + i + start; | 154 | orig_vec = bio_iter_iovec(bio_orig, orig_iter); |
149 | 155 | if (bvec->bv_page != orig_vec.bv_page) { | |
150 | if (bvec->bv_page == org_vec->bv_page) | 156 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); |
151 | continue; | 157 | mempool_free(bvec->bv_page, pool); |
152 | 158 | } | |
153 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | 159 | bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); |
154 | mempool_free(bvec->bv_page, pool); | ||
155 | } | 160 | } |
156 | 161 | ||
157 | bio_orig->bi_status = bio->bi_status; | 162 | bio_orig->bi_status = bio->bi_status; |
diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 15d25ccd51a5..1474153f73e3 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c | |||
@@ -30,7 +30,7 @@ | |||
30 | 30 | ||
31 | /** | 31 | /** |
32 | * bsg_teardown_job - routine to teardown a bsg job | 32 | * bsg_teardown_job - routine to teardown a bsg job |
33 | * @job: bsg_job that is to be torn down | 33 | * @kref: kref inside bsg_job that is to be torn down |
34 | */ | 34 | */ |
35 | static void bsg_teardown_job(struct kref *kref) | 35 | static void bsg_teardown_job(struct kref *kref) |
36 | { | 36 | { |
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) | |||
251 | * @name: device to give bsg device | 251 | * @name: device to give bsg device |
252 | * @job_fn: bsg job handler | 252 | * @job_fn: bsg job handler |
253 | * @dd_job_size: size of LLD data needed for each job | 253 | * @dd_job_size: size of LLD data needed for each job |
254 | * @release: @dev release function | ||
254 | */ | 255 | */ |
255 | struct request_queue *bsg_setup_queue(struct device *dev, const char *name, | 256 | struct request_queue *bsg_setup_queue(struct device *dev, const char *name, |
256 | bsg_job_fn *job_fn, int dd_job_size, | 257 | bsg_job_fn *job_fn, int dd_job_size, |
diff --git a/block/bsg.c b/block/bsg.c index 452f94f1c5d4..a1bcbb6ba50b 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -32,6 +32,9 @@ | |||
32 | #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" | 32 | #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" |
33 | #define BSG_VERSION "0.4" | 33 | #define BSG_VERSION "0.4" |
34 | 34 | ||
35 | #define bsg_dbg(bd, fmt, ...) \ | ||
36 | pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) | ||
37 | |||
35 | struct bsg_device { | 38 | struct bsg_device { |
36 | struct request_queue *queue; | 39 | struct request_queue *queue; |
37 | spinlock_t lock; | 40 | spinlock_t lock; |
@@ -55,14 +58,6 @@ enum { | |||
55 | #define BSG_DEFAULT_CMDS 64 | 58 | #define BSG_DEFAULT_CMDS 64 |
56 | #define BSG_MAX_DEVS 32768 | 59 | #define BSG_MAX_DEVS 32768 |
57 | 60 | ||
58 | #undef BSG_DEBUG | ||
59 | |||
60 | #ifdef BSG_DEBUG | ||
61 | #define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args) | ||
62 | #else | ||
63 | #define dprintk(fmt, args...) | ||
64 | #endif | ||
65 | |||
66 | static DEFINE_MUTEX(bsg_mutex); | 61 | static DEFINE_MUTEX(bsg_mutex); |
67 | static DEFINE_IDR(bsg_minor_idr); | 62 | static DEFINE_IDR(bsg_minor_idr); |
68 | 63 | ||
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) | |||
123 | 118 | ||
124 | bc->bd = bd; | 119 | bc->bd = bd; |
125 | INIT_LIST_HEAD(&bc->list); | 120 | INIT_LIST_HEAD(&bc->list); |
126 | dprintk("%s: returning free cmd %p\n", bd->name, bc); | 121 | bsg_dbg(bd, "returning free cmd %p\n", bc); |
127 | return bc; | 122 | return bc; |
128 | out: | 123 | out: |
129 | spin_unlock_irq(&bd->lock); | 124 | spin_unlock_irq(&bd->lock); |
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) | |||
222 | if (!bcd->class_dev) | 217 | if (!bcd->class_dev) |
223 | return ERR_PTR(-ENXIO); | 218 | return ERR_PTR(-ENXIO); |
224 | 219 | ||
225 | dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, | 220 | bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", |
221 | (unsigned long long) hdr->dout_xferp, | ||
226 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, | 222 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, |
227 | hdr->din_xfer_len); | 223 | hdr->din_xfer_len); |
228 | 224 | ||
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status) | |||
299 | struct bsg_device *bd = bc->bd; | 295 | struct bsg_device *bd = bc->bd; |
300 | unsigned long flags; | 296 | unsigned long flags; |
301 | 297 | ||
302 | dprintk("%s: finished rq %p bc %p, bio %p\n", | 298 | bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", |
303 | bd->name, rq, bc, bc->bio); | 299 | rq, bc, bc->bio); |
304 | 300 | ||
305 | bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); | 301 | bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); |
306 | 302 | ||
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, | |||
333 | list_add_tail(&bc->list, &bd->busy_list); | 329 | list_add_tail(&bc->list, &bd->busy_list); |
334 | spin_unlock_irq(&bd->lock); | 330 | spin_unlock_irq(&bd->lock); |
335 | 331 | ||
336 | dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); | 332 | bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); |
337 | 333 | ||
338 | rq->end_io_data = bc; | 334 | rq->end_io_data = bc; |
339 | blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); | 335 | blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); |
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) | |||
379 | } | 375 | } |
380 | } while (1); | 376 | } while (1); |
381 | 377 | ||
382 | dprintk("%s: returning done %p\n", bd->name, bc); | 378 | bsg_dbg(bd, "returning done %p\n", bc); |
383 | 379 | ||
384 | return bc; | 380 | return bc; |
385 | } | 381 | } |
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, | |||
390 | struct scsi_request *req = scsi_req(rq); | 386 | struct scsi_request *req = scsi_req(rq); |
391 | int ret = 0; | 387 | int ret = 0; |
392 | 388 | ||
393 | dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); | 389 | pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result); |
394 | /* | 390 | /* |
395 | * fill in all the output members | 391 | * fill in all the output members |
396 | */ | 392 | */ |
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) | |||
469 | struct bsg_command *bc; | 465 | struct bsg_command *bc; |
470 | int ret, tret; | 466 | int ret, tret; |
471 | 467 | ||
472 | dprintk("%s: entered\n", bd->name); | 468 | bsg_dbg(bd, "entered\n"); |
473 | 469 | ||
474 | /* | 470 | /* |
475 | * wait for all commands to complete | 471 | * wait for all commands to complete |
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
572 | int ret; | 568 | int ret; |
573 | ssize_t bytes_read; | 569 | ssize_t bytes_read; |
574 | 570 | ||
575 | dprintk("%s: read %zd bytes\n", bd->name, count); | 571 | bsg_dbg(bd, "read %zd bytes\n", count); |
576 | 572 | ||
577 | bsg_set_block(bd, file); | 573 | bsg_set_block(bd, file); |
578 | 574 | ||
@@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | |||
646 | ssize_t bytes_written; | 642 | ssize_t bytes_written; |
647 | int ret; | 643 | int ret; |
648 | 644 | ||
649 | dprintk("%s: write %zd bytes\n", bd->name, count); | 645 | bsg_dbg(bd, "write %zd bytes\n", count); |
650 | 646 | ||
651 | if (unlikely(uaccess_kernel())) | 647 | if (unlikely(uaccess_kernel())) |
652 | return -EINVAL; | 648 | return -EINVAL; |
@@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | |||
664 | if (!bytes_written || err_block_err(ret)) | 660 | if (!bytes_written || err_block_err(ret)) |
665 | bytes_written = ret; | 661 | bytes_written = ret; |
666 | 662 | ||
667 | dprintk("%s: returning %zd\n", bd->name, bytes_written); | 663 | bsg_dbg(bd, "returning %zd\n", bytes_written); |
668 | return bytes_written; | 664 | return bytes_written; |
669 | } | 665 | } |
670 | 666 | ||
@@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd) | |||
717 | hlist_del(&bd->dev_list); | 713 | hlist_del(&bd->dev_list); |
718 | mutex_unlock(&bsg_mutex); | 714 | mutex_unlock(&bsg_mutex); |
719 | 715 | ||
720 | dprintk("%s: tearing down\n", bd->name); | 716 | bsg_dbg(bd, "tearing down\n"); |
721 | 717 | ||
722 | /* | 718 | /* |
723 | * close can always block | 719 | * close can always block |
@@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, | |||
744 | struct file *file) | 740 | struct file *file) |
745 | { | 741 | { |
746 | struct bsg_device *bd; | 742 | struct bsg_device *bd; |
747 | #ifdef BSG_DEBUG | ||
748 | unsigned char buf[32]; | 743 | unsigned char buf[32]; |
749 | #endif | ||
750 | 744 | ||
751 | if (!blk_queue_scsi_passthrough(rq)) { | 745 | if (!blk_queue_scsi_passthrough(rq)) { |
752 | WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); | 746 | WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); |
@@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, | |||
771 | hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); | 765 | hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); |
772 | 766 | ||
773 | strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); | 767 | strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); |
774 | dprintk("bound to <%s>, max queue %d\n", | 768 | bsg_dbg(bd, "bound to <%s>, max queue %d\n", |
775 | format_dev_t(buf, inode->i_rdev), bd->max_queue); | 769 | format_dev_t(buf, inode->i_rdev), bd->max_queue); |
776 | 770 | ||
777 | mutex_unlock(&bsg_mutex); | 771 | mutex_unlock(&bsg_mutex); |
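The bsg.c hunks above replace the hand-rolled, compile-time dprintk() with a thin wrapper around pr_debug(), so the messages are controlled by dynamic debug (or a DEBUG build) instead of requiring the #undef to be edited and the driver recompiled. The same pattern for a hypothetical driver, where foo_device and its name field are assumptions:

    #define foo_dbg(fd, fmt, ...) \
            pr_debug("%s: " fmt, (fd)->name, ##__VA_ARGS__)

With CONFIG_DYNAMIC_DEBUG the call sites can then be enabled at run time, for example by writing 'file bsg.c +p' to /sys/kernel/debug/dynamic_debug/control.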
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b83f77460d28..9de9f156e203 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -50,8 +50,6 @@ struct deadline_data { | |||
50 | int front_merges; | 50 | int front_merges; |
51 | }; | 51 | }; |
52 | 52 | ||
53 | static void deadline_move_request(struct deadline_data *, struct request *); | ||
54 | |||
55 | static inline struct rb_root * | 53 | static inline struct rb_root * |
56 | deadline_rb_root(struct deadline_data *dd, struct request *rq) | 54 | deadline_rb_root(struct deadline_data *dd, struct request *rq) |
57 | { | 55 | { |
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq) | |||
100 | struct deadline_data *dd = q->elevator->elevator_data; | 98 | struct deadline_data *dd = q->elevator->elevator_data; |
101 | const int data_dir = rq_data_dir(rq); | 99 | const int data_dir = rq_data_dir(rq); |
102 | 100 | ||
101 | /* | ||
102 | * This may be a requeue of a write request that has locked its | ||
103 | * target zone. If it is the case, this releases the zone lock. | ||
104 | */ | ||
105 | blk_req_zone_write_unlock(rq); | ||
106 | |||
103 | deadline_add_rq_rb(dd, rq); | 107 | deadline_add_rq_rb(dd, rq); |
104 | 108 | ||
105 | /* | 109 | /* |
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) | |||
190 | { | 194 | { |
191 | struct request_queue *q = rq->q; | 195 | struct request_queue *q = rq->q; |
192 | 196 | ||
197 | /* | ||
198 | * For a zoned block device, write requests must write lock their | ||
199 | * target zone. | ||
200 | */ | ||
201 | blk_req_zone_write_lock(rq); | ||
202 | |||
193 | deadline_remove_request(q, rq); | 203 | deadline_remove_request(q, rq); |
194 | elv_dispatch_add_tail(q, rq); | 204 | elv_dispatch_add_tail(q, rq); |
195 | } | 205 | } |
@@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | |||
231 | } | 241 | } |
232 | 242 | ||
233 | /* | 243 | /* |
244 | * For the specified data direction, return the next request to dispatch using | ||
245 | * arrival ordered lists. | ||
246 | */ | ||
247 | static struct request * | ||
248 | deadline_fifo_request(struct deadline_data *dd, int data_dir) | ||
249 | { | ||
250 | struct request *rq; | ||
251 | |||
252 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
253 | return NULL; | ||
254 | |||
255 | if (list_empty(&dd->fifo_list[data_dir])) | ||
256 | return NULL; | ||
257 | |||
258 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | ||
259 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
260 | return rq; | ||
261 | |||
262 | /* | ||
263 | * Look for a write request that can be dispatched, that is one with | ||
264 | * an unlocked target zone. | ||
265 | */ | ||
266 | list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { | ||
267 | if (blk_req_can_dispatch_to_zone(rq)) | ||
268 | return rq; | ||
269 | } | ||
270 | |||
271 | return NULL; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * For the specified data direction, return the next request to dispatch using | ||
276 | * sector position sorted lists. | ||
277 | */ | ||
278 | static struct request * | ||
279 | deadline_next_request(struct deadline_data *dd, int data_dir) | ||
280 | { | ||
281 | struct request *rq; | ||
282 | |||
283 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
284 | return NULL; | ||
285 | |||
286 | rq = dd->next_rq[data_dir]; | ||
287 | if (!rq) | ||
288 | return NULL; | ||
289 | |||
290 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
291 | return rq; | ||
292 | |||
293 | /* | ||
294 | * Look for a write request that can be dispatched, that is one with | ||
295 | * an unlocked target zone. | ||
296 | */ | ||
297 | while (rq) { | ||
298 | if (blk_req_can_dispatch_to_zone(rq)) | ||
299 | return rq; | ||
300 | rq = deadline_latter_request(rq); | ||
301 | } | ||
302 | |||
303 | return NULL; | ||
304 | } | ||
305 | |||
306 | /* | ||
234 | * deadline_dispatch_requests selects the best request according to | 307 | * deadline_dispatch_requests selects the best request according to |
235 | * read/write expire, fifo_batch, etc | 308 | * read/write expire, fifo_batch, etc |
236 | */ | 309 | */ |
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) | |||
239 | struct deadline_data *dd = q->elevator->elevator_data; | 312 | struct deadline_data *dd = q->elevator->elevator_data; |
240 | const int reads = !list_empty(&dd->fifo_list[READ]); | 313 | const int reads = !list_empty(&dd->fifo_list[READ]); |
241 | const int writes = !list_empty(&dd->fifo_list[WRITE]); | 314 | const int writes = !list_empty(&dd->fifo_list[WRITE]); |
242 | struct request *rq; | 315 | struct request *rq, *next_rq; |
243 | int data_dir; | 316 | int data_dir; |
244 | 317 | ||
245 | /* | 318 | /* |
246 | * batches are currently reads XOR writes | 319 | * batches are currently reads XOR writes |
247 | */ | 320 | */ |
248 | if (dd->next_rq[WRITE]) | 321 | rq = deadline_next_request(dd, WRITE); |
249 | rq = dd->next_rq[WRITE]; | 322 | if (!rq) |
250 | else | 323 | rq = deadline_next_request(dd, READ); |
251 | rq = dd->next_rq[READ]; | ||
252 | 324 | ||
253 | if (rq && dd->batching < dd->fifo_batch) | 325 | if (rq && dd->batching < dd->fifo_batch) |
254 | /* we have a next request are still entitled to batch */ | 326 | /* we have a next request are still entitled to batch */ |
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) | |||
262 | if (reads) { | 334 | if (reads) { |
263 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); | 335 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); |
264 | 336 | ||
265 | if (writes && (dd->starved++ >= dd->writes_starved)) | 337 | if (deadline_fifo_request(dd, WRITE) && |
338 | (dd->starved++ >= dd->writes_starved)) | ||
266 | goto dispatch_writes; | 339 | goto dispatch_writes; |
267 | 340 | ||
268 | data_dir = READ; | 341 | data_dir = READ; |
@@ -291,21 +364,29 @@ dispatch_find_request: | |||
291 | /* | 364 | /* |
292 | * we are not running a batch, find best request for selected data_dir | 365 | * we are not running a batch, find best request for selected data_dir |
293 | */ | 366 | */ |
294 | if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { | 367 | next_rq = deadline_next_request(dd, data_dir); |
368 | if (deadline_check_fifo(dd, data_dir) || !next_rq) { | ||
295 | /* | 369 | /* |
296 | * A deadline has expired, the last request was in the other | 370 | * A deadline has expired, the last request was in the other |
297 | * direction, or we have run out of higher-sectored requests. | 371 | * direction, or we have run out of higher-sectored requests. |
298 | * Start again from the request with the earliest expiry time. | 372 | * Start again from the request with the earliest expiry time. |
299 | */ | 373 | */ |
300 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | 374 | rq = deadline_fifo_request(dd, data_dir); |
301 | } else { | 375 | } else { |
302 | /* | 376 | /* |
303 | * The last req was the same dir and we have a next request in | 377 | * The last req was the same dir and we have a next request in |
304 | * sort order. No expired requests so continue on from here. | 378 | * sort order. No expired requests so continue on from here. |
305 | */ | 379 | */ |
306 | rq = dd->next_rq[data_dir]; | 380 | rq = next_rq; |
307 | } | 381 | } |
308 | 382 | ||
383 | /* | ||
384 | * For a zoned block device, if we only have writes queued and none of | ||
385 | * them can be dispatched, rq will be NULL. | ||
386 | */ | ||
387 | if (!rq) | ||
388 | return 0; | ||
389 | |||
309 | dd->batching = 0; | 390 | dd->batching = 0; |
310 | 391 | ||
311 | dispatch_request: | 392 | dispatch_request: |
@@ -318,6 +399,16 @@ dispatch_request: | |||
318 | return 1; | 399 | return 1; |
319 | } | 400 | } |
320 | 401 | ||
402 | /* | ||
403 | * For zoned block devices, write unlock the target zone of completed | ||
404 | * write requests. | ||
405 | */ | ||
406 | static void | ||
407 | deadline_completed_request(struct request_queue *q, struct request *rq) | ||
408 | { | ||
409 | blk_req_zone_write_unlock(rq); | ||
410 | } | ||
411 | |||
321 | static void deadline_exit_queue(struct elevator_queue *e) | 412 | static void deadline_exit_queue(struct elevator_queue *e) |
322 | { | 413 | { |
323 | struct deadline_data *dd = e->elevator_data; | 414 | struct deadline_data *dd = e->elevator_data; |
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = { | |||
439 | .elevator_merged_fn = deadline_merged_request, | 530 | .elevator_merged_fn = deadline_merged_request, |
440 | .elevator_merge_req_fn = deadline_merged_requests, | 531 | .elevator_merge_req_fn = deadline_merged_requests, |
441 | .elevator_dispatch_fn = deadline_dispatch_requests, | 532 | .elevator_dispatch_fn = deadline_dispatch_requests, |
533 | .elevator_completed_req_fn = deadline_completed_request, | ||
442 | .elevator_add_req_fn = deadline_add_request, | 534 | .elevator_add_req_fn = deadline_add_request, |
443 | .elevator_former_req_fn = elv_rb_former_request, | 535 | .elevator_former_req_fn = elv_rb_former_request, |
444 | .elevator_latter_req_fn = elv_rb_latter_request, | 536 | .elevator_latter_req_fn = elv_rb_latter_request, |
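The deadline-iosched.c changes above add zone write locking to the legacy scheduler: a write locks its target zone when it moves to the dispatch list, completion or a requeue unlocks it, and the two new lookup helpers skip writes whose zone is still locked. The dispatchability rule they implement, condensed into one sketch function (illustrative only):

    /*
     * A request may be handed to the driver if it is a read, if the queue
     * is not zoned, or if its target zone is not currently write-locked.
     */
    static bool deadline_rq_dispatchable(struct request *rq)
    {
            if (rq_data_dir(rq) == READ || !blk_queue_is_zoned(rq->q))
                    return true;
            return blk_req_can_dispatch_to_zone(rq);
    }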
diff --git a/block/elevator.c b/block/elevator.c index 7bda083d5968..e87e9b43aba0 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q) | |||
869 | struct elevator_queue *e = q->elevator; | 869 | struct elevator_queue *e = q->elevator; |
870 | int error; | 870 | int error; |
871 | 871 | ||
872 | lockdep_assert_held(&q->sysfs_lock); | ||
873 | |||
872 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); | 874 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); |
873 | if (!error) { | 875 | if (!error) { |
874 | struct elv_fs_entry *attr = e->type->elevator_attrs; | 876 | struct elv_fs_entry *attr = e->type->elevator_attrs; |
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q) | |||
886 | } | 888 | } |
887 | return error; | 889 | return error; |
888 | } | 890 | } |
889 | EXPORT_SYMBOL(elv_register_queue); | ||
890 | 891 | ||
891 | void elv_unregister_queue(struct request_queue *q) | 892 | void elv_unregister_queue(struct request_queue *q) |
892 | { | 893 | { |
894 | lockdep_assert_held(&q->sysfs_lock); | ||
895 | |||
893 | if (q) { | 896 | if (q) { |
894 | struct elevator_queue *e = q->elevator; | 897 | struct elevator_queue *e = q->elevator; |
895 | 898 | ||
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q) | |||
900 | wbt_enable_default(q); | 903 | wbt_enable_default(q); |
901 | } | 904 | } |
902 | } | 905 | } |
903 | EXPORT_SYMBOL(elv_unregister_queue); | ||
904 | 906 | ||
905 | int elv_register(struct elevator_type *e) | 907 | int elv_register(struct elevator_type *e) |
906 | { | 908 | { |
@@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q, | |||
967 | { | 969 | { |
968 | int ret; | 970 | int ret; |
969 | 971 | ||
972 | lockdep_assert_held(&q->sysfs_lock); | ||
973 | |||
970 | blk_mq_freeze_queue(q); | 974 | blk_mq_freeze_queue(q); |
975 | blk_mq_quiesce_queue(q); | ||
971 | 976 | ||
972 | if (q->elevator) { | 977 | if (q->elevator) { |
973 | if (q->elevator->registered) | 978 | if (q->elevator->registered) |
@@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q, | |||
994 | blk_add_trace_msg(q, "elv switch: none"); | 999 | blk_add_trace_msg(q, "elv switch: none"); |
995 | 1000 | ||
996 | out: | 1001 | out: |
1002 | blk_mq_unquiesce_queue(q); | ||
997 | blk_mq_unfreeze_queue(q); | 1003 | blk_mq_unfreeze_queue(q); |
998 | return ret; | 1004 | return ret; |
999 | } | 1005 | } |
@@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
1010 | bool old_registered = false; | 1016 | bool old_registered = false; |
1011 | int err; | 1017 | int err; |
1012 | 1018 | ||
1019 | lockdep_assert_held(&q->sysfs_lock); | ||
1020 | |||
1013 | if (q->mq_ops) | 1021 | if (q->mq_ops) |
1014 | return elevator_switch_mq(q, new_e); | 1022 | return elevator_switch_mq(q, new_e); |
1015 | 1023 | ||
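The elevator.c hunk makes the locking contract explicit with lockdep_assert_held() and widens the mq scheduler-switch critical section: the queue is both frozen and quiesced while the elevator is swapped, so neither new submissions nor in-flight dispatches can observe a half-switched scheduler. The shape of that path, reduced to its synchronization (a sketch, not the full function):

    static int switch_sched_locked(struct request_queue *q)
    {
            lockdep_assert_held(&q->sysfs_lock);    /* caller holds sysfs_lock */

            blk_mq_freeze_queue(q);                 /* drain and block new requests */
            blk_mq_quiesce_queue(q);                /* wait out running dispatches */

            /* ... tear down the old elevator, set up the new one ... */

            blk_mq_unquiesce_queue(q);
            blk_mq_unfreeze_queue(q);
            return 0;
    }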
diff --git a/block/genhd.c b/block/genhd.c index 96a66f671720..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -629,16 +629,18 @@ exit: | |||
629 | } | 629 | } |
630 | 630 | ||
631 | /** | 631 | /** |
632 | * device_add_disk - add partitioning information to kernel list | 632 | * __device_add_disk - add disk information to kernel list |
633 | * @parent: parent device for the disk | 633 | * @parent: parent device for the disk |
634 | * @disk: per-device partitioning information | 634 | * @disk: per-device partitioning information |
635 | * @register_queue: register the queue if set to true | ||
635 | * | 636 | * |
636 | * This function registers the partitioning information in @disk | 637 | * This function registers the partitioning information in @disk |
637 | * with the kernel. | 638 | * with the kernel. |
638 | * | 639 | * |
639 | * FIXME: error handling | 640 | * FIXME: error handling |
640 | */ | 641 | */ |
641 | void device_add_disk(struct device *parent, struct gendisk *disk) | 642 | static void __device_add_disk(struct device *parent, struct gendisk *disk, |
643 | bool register_queue) | ||
642 | { | 644 | { |
643 | dev_t devt; | 645 | dev_t devt; |
644 | int retval; | 646 | int retval; |
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
682 | exact_match, exact_lock, disk); | 684 | exact_match, exact_lock, disk); |
683 | } | 685 | } |
684 | register_disk(parent, disk); | 686 | register_disk(parent, disk); |
685 | blk_register_queue(disk); | 687 | if (register_queue) |
688 | blk_register_queue(disk); | ||
686 | 689 | ||
687 | /* | 690 | /* |
688 | * Take an extra ref on queue which will be put on disk_release() | 691 | * Take an extra ref on queue which will be put on disk_release() |
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
693 | disk_add_events(disk); | 696 | disk_add_events(disk); |
694 | blk_integrity_add(disk); | 697 | blk_integrity_add(disk); |
695 | } | 698 | } |
699 | |||
700 | void device_add_disk(struct device *parent, struct gendisk *disk) | ||
701 | { | ||
702 | __device_add_disk(parent, disk, true); | ||
703 | } | ||
696 | EXPORT_SYMBOL(device_add_disk); | 704 | EXPORT_SYMBOL(device_add_disk); |
697 | 705 | ||
706 | void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) | ||
707 | { | ||
708 | __device_add_disk(parent, disk, false); | ||
709 | } | ||
710 | EXPORT_SYMBOL(device_add_disk_no_queue_reg); | ||
711 | |||
698 | void del_gendisk(struct gendisk *disk) | 712 | void del_gendisk(struct gendisk *disk) |
699 | { | 713 | { |
700 | struct disk_part_iter piter; | 714 | struct disk_part_iter piter; |
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk) | |||
725 | * Unregister bdi before releasing device numbers (as they can | 739 | * Unregister bdi before releasing device numbers (as they can |
726 | * get reused and we'd get clashes in sysfs). | 740 | * get reused and we'd get clashes in sysfs). |
727 | */ | 741 | */ |
728 | bdi_unregister(disk->queue->backing_dev_info); | 742 | if (!(disk->flags & GENHD_FL_HIDDEN)) |
743 | bdi_unregister(disk->queue->backing_dev_info); | ||
729 | blk_unregister_queue(disk); | 744 | blk_unregister_queue(disk); |
730 | } else { | 745 | } else { |
731 | WARN_ON(1); | 746 | WARN_ON(1); |
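device_add_disk_no_queue_reg() lets a stacked driver add its disk first and register the request_queue only once the queue is fully configured; the GENHD_FL_HIDDEN check around bdi_unregister() is the matching teardown-side fix. A hedged sketch of the intended calling sequence (the surrounding driver function is an assumption):

    static int stacked_disk_add(struct device *parent, struct gendisk *disk)
    {
            /* make the disk known without registering queue sysfs yet */
            device_add_disk_no_queue_reg(parent, disk);

            /* ... finish configuring the queue: limits, scheduler, ... */

            /* publish the queue (sysfs, elevator) once it is ready */
            blk_register_queue(disk);
            return 0;
    }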
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 0179e484ec98..c56f211c8440 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c | |||
@@ -59,6 +59,7 @@ struct deadline_data { | |||
59 | int front_merges; | 59 | int front_merges; |
60 | 60 | ||
61 | spinlock_t lock; | 61 | spinlock_t lock; |
62 | spinlock_t zone_lock; | ||
62 | struct list_head dispatch; | 63 | struct list_head dispatch; |
63 | }; | 64 | }; |
64 | 65 | ||
@@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | |||
192 | } | 193 | } |
193 | 194 | ||
194 | /* | 195 | /* |
196 | * For the specified data direction, return the next request to | ||
197 | * dispatch using arrival ordered lists. | ||
198 | */ | ||
199 | static struct request * | ||
200 | deadline_fifo_request(struct deadline_data *dd, int data_dir) | ||
201 | { | ||
202 | struct request *rq; | ||
203 | unsigned long flags; | ||
204 | |||
205 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
206 | return NULL; | ||
207 | |||
208 | if (list_empty(&dd->fifo_list[data_dir])) | ||
209 | return NULL; | ||
210 | |||
211 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | ||
212 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
213 | return rq; | ||
214 | |||
215 | /* | ||
216 | * Look for a write request that can be dispatched, that is one with | ||
217 | * an unlocked target zone. | ||
218 | */ | ||
219 | spin_lock_irqsave(&dd->zone_lock, flags); | ||
220 | list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { | ||
221 | if (blk_req_can_dispatch_to_zone(rq)) | ||
222 | goto out; | ||
223 | } | ||
224 | rq = NULL; | ||
225 | out: | ||
226 | spin_unlock_irqrestore(&dd->zone_lock, flags); | ||
227 | |||
228 | return rq; | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * For the specified data direction, return the next request to | ||
233 | * dispatch using sector position sorted lists. | ||
234 | */ | ||
235 | static struct request * | ||
236 | deadline_next_request(struct deadline_data *dd, int data_dir) | ||
237 | { | ||
238 | struct request *rq; | ||
239 | unsigned long flags; | ||
240 | |||
241 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
242 | return NULL; | ||
243 | |||
244 | rq = dd->next_rq[data_dir]; | ||
245 | if (!rq) | ||
246 | return NULL; | ||
247 | |||
248 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
249 | return rq; | ||
250 | |||
251 | /* | ||
252 | * Look for a write request that can be dispatched, that is one with | ||
253 | * an unlocked target zone. | ||
254 | */ | ||
255 | spin_lock_irqsave(&dd->zone_lock, flags); | ||
256 | while (rq) { | ||
257 | if (blk_req_can_dispatch_to_zone(rq)) | ||
258 | break; | ||
259 | rq = deadline_latter_request(rq); | ||
260 | } | ||
261 | spin_unlock_irqrestore(&dd->zone_lock, flags); | ||
262 | |||
263 | return rq; | ||
264 | } | ||
265 | |||
266 | /* | ||
195 | * deadline_dispatch_requests selects the best request according to | 267 | * deadline_dispatch_requests selects the best request according to |
196 | * read/write expire, fifo_batch, etc | 268 | * read/write expire, fifo_batch, etc |
197 | */ | 269 | */ |
198 | static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | 270 | static struct request *__dd_dispatch_request(struct deadline_data *dd) |
199 | { | 271 | { |
200 | struct deadline_data *dd = hctx->queue->elevator->elevator_data; | 272 | struct request *rq, *next_rq; |
201 | struct request *rq; | ||
202 | bool reads, writes; | 273 | bool reads, writes; |
203 | int data_dir; | 274 | int data_dir; |
204 | 275 | ||
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
214 | /* | 285 | /* |
215 | * batches are currently reads XOR writes | 286 | * batches are currently reads XOR writes |
216 | */ | 287 | */ |
217 | if (dd->next_rq[WRITE]) | 288 | rq = deadline_next_request(dd, WRITE); |
218 | rq = dd->next_rq[WRITE]; | 289 | if (!rq) |
219 | else | 290 | rq = deadline_next_request(dd, READ); |
220 | rq = dd->next_rq[READ]; | ||
221 | 291 | ||
222 | if (rq && dd->batching < dd->fifo_batch) | 292 | if (rq && dd->batching < dd->fifo_batch) |
223 | /* we have a next request are still entitled to batch */ | 293 | /* we have a next request are still entitled to batch */ |
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
231 | if (reads) { | 301 | if (reads) { |
232 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); | 302 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); |
233 | 303 | ||
234 | if (writes && (dd->starved++ >= dd->writes_starved)) | 304 | if (deadline_fifo_request(dd, WRITE) && |
305 | (dd->starved++ >= dd->writes_starved)) | ||
235 | goto dispatch_writes; | 306 | goto dispatch_writes; |
236 | 307 | ||
237 | data_dir = READ; | 308 | data_dir = READ; |
@@ -260,21 +331,29 @@ dispatch_find_request: | |||
260 | /* | 331 | /* |
261 | * we are not running a batch, find best request for selected data_dir | 332 | * we are not running a batch, find best request for selected data_dir |
262 | */ | 333 | */ |
263 | if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { | 334 | next_rq = deadline_next_request(dd, data_dir); |
335 | if (deadline_check_fifo(dd, data_dir) || !next_rq) { | ||
264 | /* | 336 | /* |
265 | * A deadline has expired, the last request was in the other | 337 | * A deadline has expired, the last request was in the other |
266 | * direction, or we have run out of higher-sectored requests. | 338 | * direction, or we have run out of higher-sectored requests. |
267 | * Start again from the request with the earliest expiry time. | 339 | * Start again from the request with the earliest expiry time. |
268 | */ | 340 | */ |
269 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | 341 | rq = deadline_fifo_request(dd, data_dir); |
270 | } else { | 342 | } else { |
271 | /* | 343 | /* |
272 | * The last req was the same dir and we have a next request in | 344 | * The last req was the same dir and we have a next request in |
273 | * sort order. No expired requests so continue on from here. | 345 | * sort order. No expired requests so continue on from here. |
274 | */ | 346 | */ |
275 | rq = dd->next_rq[data_dir]; | 347 | rq = next_rq; |
276 | } | 348 | } |
277 | 349 | ||
350 | /* | ||
351 | * For a zoned block device, if we only have writes queued and none of | ||
352 | * them can be dispatched, rq will be NULL. | ||
353 | */ | ||
354 | if (!rq) | ||
355 | return NULL; | ||
356 | |||
278 | dd->batching = 0; | 357 | dd->batching = 0; |
279 | 358 | ||
280 | dispatch_request: | 359 | dispatch_request: |
@@ -284,17 +363,27 @@ dispatch_request: | |||
284 | dd->batching++; | 363 | dd->batching++; |
285 | deadline_move_request(dd, rq); | 364 | deadline_move_request(dd, rq); |
286 | done: | 365 | done: |
366 | /* | ||
367 | * If the request needs its target zone locked, do it. | ||
368 | */ | ||
369 | blk_req_zone_write_lock(rq); | ||
287 | rq->rq_flags |= RQF_STARTED; | 370 | rq->rq_flags |= RQF_STARTED; |
288 | return rq; | 371 | return rq; |
289 | } | 372 | } |
290 | 373 | ||
374 | /* | ||
375 | * One confusing aspect here is that we get called for a specific | ||
376 | * hardware queue, but we may return a request that is for a | ||
377 | * different hardware queue. This is because mq-deadline has shared | ||
378 | * state for all hardware queues, in terms of sorting, FIFOs, etc. | ||
379 | */ | ||
291 | static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | 380 | static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) |
292 | { | 381 | { |
293 | struct deadline_data *dd = hctx->queue->elevator->elevator_data; | 382 | struct deadline_data *dd = hctx->queue->elevator->elevator_data; |
294 | struct request *rq; | 383 | struct request *rq; |
295 | 384 | ||
296 | spin_lock(&dd->lock); | 385 | spin_lock(&dd->lock); |
297 | rq = __dd_dispatch_request(hctx); | 386 | rq = __dd_dispatch_request(dd); |
298 | spin_unlock(&dd->lock); | 387 | spin_unlock(&dd->lock); |
299 | 388 | ||
300 | return rq; | 389 | return rq; |
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) | |||
339 | dd->front_merges = 1; | 428 | dd->front_merges = 1; |
340 | dd->fifo_batch = fifo_batch; | 429 | dd->fifo_batch = fifo_batch; |
341 | spin_lock_init(&dd->lock); | 430 | spin_lock_init(&dd->lock); |
431 | spin_lock_init(&dd->zone_lock); | ||
342 | INIT_LIST_HEAD(&dd->dispatch); | 432 | INIT_LIST_HEAD(&dd->dispatch); |
343 | 433 | ||
344 | q->elevator = eq; | 434 | q->elevator = eq; |
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
395 | struct deadline_data *dd = q->elevator->elevator_data; | 485 | struct deadline_data *dd = q->elevator->elevator_data; |
396 | const int data_dir = rq_data_dir(rq); | 486 | const int data_dir = rq_data_dir(rq); |
397 | 487 | ||
488 | /* | ||
489 | * This may be a requeue of a write request that has locked its | ||
490 | * target zone. If it is the case, this releases the zone lock. | ||
491 | */ | ||
492 | blk_req_zone_write_unlock(rq); | ||
493 | |||
398 | if (blk_mq_sched_try_insert_merge(q, rq)) | 494 | if (blk_mq_sched_try_insert_merge(q, rq)) |
399 | return; | 495 | return; |
400 | 496 | ||
@@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, | |||
439 | spin_unlock(&dd->lock); | 535 | spin_unlock(&dd->lock); |
440 | } | 536 | } |
441 | 537 | ||
538 | /* | ||
539 | * For zoned block devices, write unlock the target zone of | ||
540 | * completed write requests. Do this while holding the zone lock | ||
541 | * spinlock so that the zone is never unlocked while deadline_fifo_request() | ||
542 | * or deadline_next_request() are executing. | ||
543 | */ | ||
544 | static void dd_completed_request(struct request *rq) | ||
545 | { | ||
546 | struct request_queue *q = rq->q; | ||
547 | |||
548 | if (blk_queue_is_zoned(q)) { | ||
549 | struct deadline_data *dd = q->elevator->elevator_data; | ||
550 | unsigned long flags; | ||
551 | |||
552 | spin_lock_irqsave(&dd->zone_lock, flags); | ||
553 | blk_req_zone_write_unlock(rq); | ||
554 | spin_unlock_irqrestore(&dd->zone_lock, flags); | ||
555 | } | ||
556 | } | ||
557 | |||
442 | static bool dd_has_work(struct blk_mq_hw_ctx *hctx) | 558 | static bool dd_has_work(struct blk_mq_hw_ctx *hctx) |
443 | { | 559 | { |
444 | struct deadline_data *dd = hctx->queue->elevator->elevator_data; | 560 | struct deadline_data *dd = hctx->queue->elevator->elevator_data; |
@@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = { | |||
640 | .ops.mq = { | 756 | .ops.mq = { |
641 | .insert_requests = dd_insert_requests, | 757 | .insert_requests = dd_insert_requests, |
642 | .dispatch_request = dd_dispatch_request, | 758 | .dispatch_request = dd_dispatch_request, |
759 | .completed_request = dd_completed_request, | ||
643 | .next_request = elv_rb_latter_request, | 760 | .next_request = elv_rb_latter_request, |
644 | .former_request = elv_rb_former_request, | 761 | .former_request = elv_rb_former_request, |
645 | .bio_merge = dd_bio_merge, | 762 | .bio_merge = dd_bio_merge, |
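mq-deadline gets the same zone write locking as the legacy scheduler, plus a dedicated dd->zone_lock spinlock: completions drop the zone write lock under it, and the dispatch-side scans take it as well, so a zone can never flip to unlocked in the middle of deadline_fifo_request() or deadline_next_request(). The two critical sections, reduced to sketch helpers (illustrative only):

    /* dispatch side: test a candidate write under zone_lock */
    static bool zone_dispatchable(struct deadline_data *dd, struct request *rq)
    {
            unsigned long flags;
            bool ok;

            spin_lock_irqsave(&dd->zone_lock, flags);
            ok = blk_req_can_dispatch_to_zone(rq);
            spin_unlock_irqrestore(&dd->zone_lock, flags);
            return ok;
    }

    /* completion side: release the zone under the same lock */
    static void zone_release(struct deadline_data *dd, struct request *rq)
    {
            unsigned long flags;

            spin_lock_irqsave(&dd->zone_lock, flags);
            blk_req_zone_write_unlock(rq);
            spin_unlock_irqrestore(&dd->zone_lock, flags);
    }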
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 0af3a3db6fb0..82c44f7df911 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c | |||
@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state, | |||
301 | continue; | 301 | continue; |
302 | bsd_start = le32_to_cpu(p->p_offset); | 302 | bsd_start = le32_to_cpu(p->p_offset); |
303 | bsd_size = le32_to_cpu(p->p_size); | 303 | bsd_size = le32_to_cpu(p->p_size); |
304 | if (memcmp(flavour, "bsd\0", 4) == 0) | 304 | /* FreeBSD has relative offset if C partition offset is zero */ |
305 | if (memcmp(flavour, "bsd\0", 4) == 0 && | ||
306 | le32_to_cpu(l->d_partitions[2].p_offset) == 0) | ||
305 | bsd_start += offset; | 307 | bsd_start += offset; |
306 | if (offset == bsd_start && size == bsd_size) | 308 | if (offset == bsd_start && size == bsd_size) |
307 | /* full parent partition, we have it already */ | 309 | /* full parent partition, we have it already */ |
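The parse_bsd() change keys off the BSD "c" partition: per the new comment, a FreeBSD disklabel stores slice-relative offsets only when the "c" partition offset is zero, so the enclosing MBR slice offset is added only in that case; otherwise the offsets are taken as absolute. The rule as a small sketch (helper name and types are illustrative):

    static u32 bsd_abs_start(u32 slice_offset, u32 c_part_offset, u32 p_offset)
    {
            if (c_part_offset == 0)                 /* relative disklabel */
                    return slice_offset + p_offset;
            return p_offset;                        /* already absolute */
    }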
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index edcfff974527..60b471f8621b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -384,9 +384,10 @@ out_put_request: | |||
384 | 384 | ||
385 | /** | 385 | /** |
386 | * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl | 386 | * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl |
387 | * @file: file this ioctl operates on (optional) | ||
388 | * @q: request queue to send scsi commands down | 387 | * @q: request queue to send scsi commands down |
389 | * @disk: gendisk to operate on (option) | 388 | * @disk: gendisk to operate on (option) |
389 | * @mode: mode used to open the file through which the ioctl has been | ||
390 | * submitted | ||
390 | * @sic: userspace structure describing the command to perform | 391 | * @sic: userspace structure describing the command to perform |
391 | * | 392 | * |
392 | * Send down the scsi command described by @sic to the device below | 393 | * Send down the scsi command described by @sic to the device below |
@@ -415,10 +416,10 @@ out_put_request: | |||
415 | * Positive numbers returned are the compacted SCSI error codes (4 | 416 | * Positive numbers returned are the compacted SCSI error codes (4 |
416 | * bytes in one int) where the lowest byte is the SCSI status. | 417 | * bytes in one int) where the lowest byte is the SCSI status. |
417 | */ | 418 | */ |
418 | #define OMAX_SB_LEN 16 /* For backward compatibility */ | ||
419 | int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | 419 | int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, |
420 | struct scsi_ioctl_command __user *sic) | 420 | struct scsi_ioctl_command __user *sic) |
421 | { | 421 | { |
422 | enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ | ||
422 | struct request *rq; | 423 | struct request *rq; |
423 | struct scsi_request *req; | 424 | struct scsi_request *req; |
424 | int err; | 425 | int err; |
@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) | |||
692 | if (bd && bd == bd->bd_contains) | 693 | if (bd && bd == bd->bd_contains) |
693 | return 0; | 694 | return 0; |
694 | 695 | ||
695 | /* Actually none of these is particularly useful on a partition, | ||
696 | * but they are safe. | ||
697 | */ | ||
698 | switch (cmd) { | ||
699 | case SCSI_IOCTL_GET_IDLUN: | ||
700 | case SCSI_IOCTL_GET_BUS_NUMBER: | ||
701 | case SCSI_IOCTL_GET_PCI: | ||
702 | case SCSI_IOCTL_PROBE_HOST: | ||
703 | case SG_GET_VERSION_NUM: | ||
704 | case SG_SET_TIMEOUT: | ||
705 | case SG_GET_TIMEOUT: | ||
706 | case SG_GET_RESERVED_SIZE: | ||
707 | case SG_SET_RESERVED_SIZE: | ||
708 | case SG_EMULATED_HOST: | ||
709 | return 0; | ||
710 | case CDROM_GET_CAPABILITY: | ||
711 | /* Keep this until we remove the printk below. udev sends it | ||
712 | * and we do not want to spam dmesg about it. CD-ROMs do | ||
713 | * not have partitions, so we get here only for disks. | ||
714 | */ | ||
715 | return -ENOIOCTLCMD; | ||
716 | default: | ||
717 | break; | ||
718 | } | ||
719 | |||
720 | if (capable(CAP_SYS_RAWIO)) | 696 | if (capable(CAP_SYS_RAWIO)) |
721 | return 0; | 697 | return 0; |
722 | 698 | ||
723 | /* In particular, rule out all resets and host-specific ioctls. */ | ||
724 | printk_ratelimited(KERN_WARNING | ||
725 | "%s: sending ioctl %x to a partition!\n", current->comm, cmd); | ||
726 | |||
727 | return -ENOIOCTLCMD; | 699 | return -ENOIOCTLCMD; |
728 | } | 700 | } |
729 | EXPORT_SYMBOL(scsi_verify_blk_ioctl); | 701 | EXPORT_SYMBOL(scsi_verify_blk_ioctl); |
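With the per-command whitelist gone, scsi_verify_blk_ioctl() is reduced to a single policy: SCSI ioctls are always allowed on a whole device, allowed on a partition only with CAP_SYS_RAWIO, and otherwise return -ENOIOCTLCMD without the old rate-limited warning. Condensed into a sketch:

    static int verify_blk_ioctl_policy(struct block_device *bd)
    {
            if (bd && bd == bd->bd_contains)        /* whole device */
                    return 0;
            if (capable(CAP_SYS_RAWIO))             /* privileged, even on a partition */
                    return 0;
            return -ENOIOCTLCMD;                    /* let the caller fall back */
    }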
diff --git a/crypto/Kconfig b/crypto/Kconfig index f7911963bb79..20360e040425 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig | |||
@@ -106,6 +106,7 @@ config CRYPTO_KPP | |||
106 | config CRYPTO_ACOMP2 | 106 | config CRYPTO_ACOMP2 |
107 | tristate | 107 | tristate |
108 | select CRYPTO_ALGAPI2 | 108 | select CRYPTO_ALGAPI2 |
109 | select SGL_ALLOC | ||
109 | 110 | ||
110 | config CRYPTO_ACOMP | 111 | config CRYPTO_ACOMP |
111 | tristate | 112 | tristate |
diff --git a/crypto/scompress.c b/crypto/scompress.c index 2075e2c4e7df..968bbcf65c94 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c | |||
@@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm) | |||
140 | return ret; | 140 | return ret; |
141 | } | 141 | } |
142 | 142 | ||
143 | static void crypto_scomp_sg_free(struct scatterlist *sgl) | ||
144 | { | ||
145 | int i, n; | ||
146 | struct page *page; | ||
147 | |||
148 | if (!sgl) | ||
149 | return; | ||
150 | |||
151 | n = sg_nents(sgl); | ||
152 | for_each_sg(sgl, sgl, n, i) { | ||
153 | page = sg_page(sgl); | ||
154 | if (page) | ||
155 | __free_page(page); | ||
156 | } | ||
157 | |||
158 | kfree(sgl); | ||
159 | } | ||
160 | |||
161 | static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp) | ||
162 | { | ||
163 | struct scatterlist *sgl; | ||
164 | struct page *page; | ||
165 | int i, n; | ||
166 | |||
167 | n = ((size - 1) >> PAGE_SHIFT) + 1; | ||
168 | |||
169 | sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp); | ||
170 | if (!sgl) | ||
171 | return NULL; | ||
172 | |||
173 | sg_init_table(sgl, n); | ||
174 | |||
175 | for (i = 0; i < n; i++) { | ||
176 | page = alloc_page(gfp); | ||
177 | if (!page) | ||
178 | goto err; | ||
179 | sg_set_page(sgl + i, page, PAGE_SIZE, 0); | ||
180 | } | ||
181 | |||
182 | return sgl; | ||
183 | |||
184 | err: | ||
185 | sg_mark_end(sgl + i); | ||
186 | crypto_scomp_sg_free(sgl); | ||
187 | return NULL; | ||
188 | } | ||
189 | |||
190 | static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) | 143 | static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) |
191 | { | 144 | { |
192 | struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); | 145 | struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); |
@@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) | |||
220 | scratch_dst, &req->dlen, *ctx); | 173 | scratch_dst, &req->dlen, *ctx); |
221 | if (!ret) { | 174 | if (!ret) { |
222 | if (!req->dst) { | 175 | if (!req->dst) { |
223 | req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC); | 176 | req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL); |
224 | if (!req->dst) | 177 | if (!req->dst) |
225 | goto out; | 178 | goto out; |
226 | } | 179 | } |
@@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm) | |||
274 | 227 | ||
275 | crt->compress = scomp_acomp_compress; | 228 | crt->compress = scomp_acomp_compress; |
276 | crt->decompress = scomp_acomp_decompress; | 229 | crt->decompress = scomp_acomp_decompress; |
277 | crt->dst_free = crypto_scomp_sg_free; | 230 | crt->dst_free = sgl_free; |
278 | crt->reqsize = sizeof(void *); | 231 | crt->reqsize = sizeof(void *); |
279 | 232 | ||
280 | return 0; | 233 | return 0; |
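crypto/scompress.c drops its open-coded page-backed scatterlist allocator in favour of the new lib/scatterlist helpers selected via SGL_ALLOC. Their use in isolation (the wrapper names here are illustrative):

    #include <linux/scatterlist.h>

    /* allocate a scatterlist backed by freshly allocated pages for @len bytes */
    static struct scatterlist *scomp_dst_alloc(size_t len)
    {
            return sgl_alloc(len, GFP_ATOMIC, NULL);  /* NULL: element count not needed */
    }

    /* free both the pages and the scatterlist array in one call */
    static void scomp_dst_free(struct scatterlist *sgl)
    {
            sgl_free(sgl);
    }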
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 442e777bdfb2..728075214959 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c | |||
@@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller) | |||
6619 | 6619 | ||
6620 | #ifdef DAC960_GAM_MINOR | 6620 | #ifdef DAC960_GAM_MINOR |
6621 | 6621 | ||
6622 | /* | 6622 | static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo) |
6623 | * DAC960_gam_ioctl is the ioctl function for performing RAID operations. | ||
6624 | */ | ||
6625 | |||
6626 | static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | ||
6627 | unsigned long Argument) | ||
6628 | { | 6623 | { |
6629 | long ErrorCode = 0; | ||
6630 | if (!capable(CAP_SYS_ADMIN)) return -EACCES; | ||
6631 | |||
6632 | mutex_lock(&DAC960_mutex); | ||
6633 | switch (Request) | ||
6634 | { | ||
6635 | case DAC960_IOCTL_GET_CONTROLLER_COUNT: | ||
6636 | ErrorCode = DAC960_ControllerCount; | ||
6637 | break; | ||
6638 | case DAC960_IOCTL_GET_CONTROLLER_INFO: | ||
6639 | { | ||
6640 | DAC960_ControllerInfo_T __user *UserSpaceControllerInfo = | ||
6641 | (DAC960_ControllerInfo_T __user *) Argument; | ||
6642 | DAC960_ControllerInfo_T ControllerInfo; | 6624 | DAC960_ControllerInfo_T ControllerInfo; |
6643 | DAC960_Controller_T *Controller; | 6625 | DAC960_Controller_T *Controller; |
6644 | int ControllerNumber; | 6626 | int ControllerNumber; |
6627 | long ErrorCode; | ||
6628 | |||
6645 | if (UserSpaceControllerInfo == NULL) | 6629 | if (UserSpaceControllerInfo == NULL) |
6646 | ErrorCode = -EINVAL; | 6630 | ErrorCode = -EINVAL; |
6647 | else ErrorCode = get_user(ControllerNumber, | 6631 | else ErrorCode = get_user(ControllerNumber, |
6648 | &UserSpaceControllerInfo->ControllerNumber); | 6632 | &UserSpaceControllerInfo->ControllerNumber); |
6649 | if (ErrorCode != 0) | 6633 | if (ErrorCode != 0) |
6650 | break; | 6634 | goto out; |
6651 | ErrorCode = -ENXIO; | 6635 | ErrorCode = -ENXIO; |
6652 | if (ControllerNumber < 0 || | 6636 | if (ControllerNumber < 0 || |
6653 | ControllerNumber > DAC960_ControllerCount - 1) { | 6637 | ControllerNumber > DAC960_ControllerCount - 1) { |
6654 | break; | 6638 | goto out; |
6655 | } | 6639 | } |
6656 | Controller = DAC960_Controllers[ControllerNumber]; | 6640 | Controller = DAC960_Controllers[ControllerNumber]; |
6657 | if (Controller == NULL) | 6641 | if (Controller == NULL) |
6658 | break; | 6642 | goto out; |
6659 | memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); | 6643 | memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); |
6660 | ControllerInfo.ControllerNumber = ControllerNumber; | 6644 | ControllerInfo.ControllerNumber = ControllerNumber; |
6661 | ControllerInfo.FirmwareType = Controller->FirmwareType; | 6645 | ControllerInfo.FirmwareType = Controller->FirmwareType; |
@@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6670 | strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); | 6654 | strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); |
6671 | ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, | 6655 | ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, |
6672 | sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); | 6656 | sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); |
6673 | break; | 6657 | out: |
6674 | } | 6658 | return ErrorCode; |
6675 | case DAC960_IOCTL_V1_EXECUTE_COMMAND: | 6659 | } |
6676 | { | 6660 | |
6677 | DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = | 6661 | static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand) |
6678 | (DAC960_V1_UserCommand_T __user *) Argument; | 6662 | { |
6679 | DAC960_V1_UserCommand_T UserCommand; | 6663 | DAC960_V1_UserCommand_T UserCommand; |
6680 | DAC960_Controller_T *Controller; | 6664 | DAC960_Controller_T *Controller; |
6681 | DAC960_Command_T *Command = NULL; | 6665 | DAC960_Command_T *Command = NULL; |
@@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6688 | int ControllerNumber, DataTransferLength; | 6672 | int ControllerNumber, DataTransferLength; |
6689 | unsigned char *DataTransferBuffer = NULL; | 6673 | unsigned char *DataTransferBuffer = NULL; |
6690 | dma_addr_t DataTransferBufferDMA; | 6674 | dma_addr_t DataTransferBufferDMA; |
6675 | long ErrorCode; | ||
6676 | |||
6691 | if (UserSpaceUserCommand == NULL) { | 6677 | if (UserSpaceUserCommand == NULL) { |
6692 | ErrorCode = -EINVAL; | 6678 | ErrorCode = -EINVAL; |
6693 | break; | 6679 | goto out; |
6694 | } | 6680 | } |
6695 | if (copy_from_user(&UserCommand, UserSpaceUserCommand, | 6681 | if (copy_from_user(&UserCommand, UserSpaceUserCommand, |
6696 | sizeof(DAC960_V1_UserCommand_T))) { | 6682 | sizeof(DAC960_V1_UserCommand_T))) { |
6697 | ErrorCode = -EFAULT; | 6683 | ErrorCode = -EFAULT; |
6698 | break; | 6684 | goto out; |
6699 | } | 6685 | } |
6700 | ControllerNumber = UserCommand.ControllerNumber; | 6686 | ControllerNumber = UserCommand.ControllerNumber; |
6701 | ErrorCode = -ENXIO; | 6687 | ErrorCode = -ENXIO; |
6702 | if (ControllerNumber < 0 || | 6688 | if (ControllerNumber < 0 || |
6703 | ControllerNumber > DAC960_ControllerCount - 1) | 6689 | ControllerNumber > DAC960_ControllerCount - 1) |
6704 | break; | 6690 | goto out; |
6705 | Controller = DAC960_Controllers[ControllerNumber]; | 6691 | Controller = DAC960_Controllers[ControllerNumber]; |
6706 | if (Controller == NULL) | 6692 | if (Controller == NULL) |
6707 | break; | 6693 | goto out; |
6708 | ErrorCode = -EINVAL; | 6694 | ErrorCode = -EINVAL; |
6709 | if (Controller->FirmwareType != DAC960_V1_Controller) | 6695 | if (Controller->FirmwareType != DAC960_V1_Controller) |
6710 | break; | 6696 | goto out; |
6711 | CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; | 6697 | CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; |
6712 | DataTransferLength = UserCommand.DataTransferLength; | 6698 | DataTransferLength = UserCommand.DataTransferLength; |
6713 | if (CommandOpcode & 0x80) | 6699 | if (CommandOpcode & 0x80) |
6714 | break; | 6700 | goto out; |
6715 | if (CommandOpcode == DAC960_V1_DCDB) | 6701 | if (CommandOpcode == DAC960_V1_DCDB) |
6716 | { | 6702 | { |
6717 | if (copy_from_user(&DCDB, UserCommand.DCDB, | 6703 | if (copy_from_user(&DCDB, UserCommand.DCDB, |
6718 | sizeof(DAC960_V1_DCDB_T))) { | 6704 | sizeof(DAC960_V1_DCDB_T))) { |
6719 | ErrorCode = -EFAULT; | 6705 | ErrorCode = -EFAULT; |
6720 | break; | 6706 | goto out; |
6721 | } | 6707 | } |
6722 | if (DCDB.Channel >= DAC960_V1_MaxChannels) | 6708 | if (DCDB.Channel >= DAC960_V1_MaxChannels) |
6723 | break; | 6709 | goto out; |
6724 | if (!((DataTransferLength == 0 && | 6710 | if (!((DataTransferLength == 0 && |
6725 | DCDB.Direction | 6711 | DCDB.Direction |
6726 | == DAC960_V1_DCDB_NoDataTransfer) || | 6712 | == DAC960_V1_DCDB_NoDataTransfer) || |
@@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6730 | (DataTransferLength < 0 && | 6716 | (DataTransferLength < 0 && |
6731 | DCDB.Direction | 6717 | DCDB.Direction |
6732 | == DAC960_V1_DCDB_DataTransferSystemToDevice))) | 6718 | == DAC960_V1_DCDB_DataTransferSystemToDevice))) |
6733 | break; | 6719 | goto out; |
6734 | if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) | 6720 | if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) |
6735 | != abs(DataTransferLength)) | 6721 | != abs(DataTransferLength)) |
6736 | break; | 6722 | goto out; |
6737 | DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, | 6723 | DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, |
6738 | sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); | 6724 | sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); |
6739 | if (DCDB_IOBUF == NULL) { | 6725 | if (DCDB_IOBUF == NULL) { |
6740 | ErrorCode = -ENOMEM; | 6726 | ErrorCode = -ENOMEM; |
6741 | break; | 6727 | goto out; |
6742 | } | 6728 | } |
6743 | } | 6729 | } |
6744 | ErrorCode = -ENOMEM; | 6730 | ErrorCode = -ENOMEM; |
@@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6748 | DataTransferLength, | 6734 | DataTransferLength, |
6749 | &DataTransferBufferDMA); | 6735 | &DataTransferBufferDMA); |
6750 | if (DataTransferBuffer == NULL) | 6736 | if (DataTransferBuffer == NULL) |
6751 | break; | 6737 | goto out; |
6752 | } | 6738 | } |
6753 | else if (DataTransferLength < 0) | 6739 | else if (DataTransferLength < 0) |
6754 | { | 6740 | { |
6755 | DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, | 6741 | DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, |
6756 | -DataTransferLength, &DataTransferBufferDMA); | 6742 | -DataTransferLength, &DataTransferBufferDMA); |
6757 | if (DataTransferBuffer == NULL) | 6743 | if (DataTransferBuffer == NULL) |
6758 | break; | 6744 | goto out; |
6759 | if (copy_from_user(DataTransferBuffer, | 6745 | if (copy_from_user(DataTransferBuffer, |
6760 | UserCommand.DataTransferBuffer, | 6746 | UserCommand.DataTransferBuffer, |
6761 | -DataTransferLength)) { | 6747 | -DataTransferLength)) { |
6762 | ErrorCode = -EFAULT; | 6748 | ErrorCode = -EFAULT; |
6763 | break; | 6749 | goto out; |
6764 | } | 6750 | } |
6765 | } | 6751 | } |
6766 | if (CommandOpcode == DAC960_V1_DCDB) | 6752 | if (CommandOpcode == DAC960_V1_DCDB) |
@@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6837 | if (DCDB_IOBUF != NULL) | 6823 | if (DCDB_IOBUF != NULL) |
6838 | pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), | 6824 | pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), |
6839 | DCDB_IOBUF, DCDB_IOBUFDMA); | 6825 | DCDB_IOBUF, DCDB_IOBUFDMA); |
6840 | break; | 6826 | out: |
6841 | } | 6827 | return ErrorCode; |
6842 | case DAC960_IOCTL_V2_EXECUTE_COMMAND: | 6828 | } |
6843 | { | 6829 | |
6844 | DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = | 6830 | static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand) |
6845 | (DAC960_V2_UserCommand_T __user *) Argument; | 6831 | { |
6846 | DAC960_V2_UserCommand_T UserCommand; | 6832 | DAC960_V2_UserCommand_T UserCommand; |
6847 | DAC960_Controller_T *Controller; | 6833 | DAC960_Controller_T *Controller; |
6848 | DAC960_Command_T *Command = NULL; | 6834 | DAC960_Command_T *Command = NULL; |
@@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6855 | dma_addr_t DataTransferBufferDMA; | 6841 | dma_addr_t DataTransferBufferDMA; |
6856 | unsigned char *RequestSenseBuffer = NULL; | 6842 | unsigned char *RequestSenseBuffer = NULL; |
6857 | dma_addr_t RequestSenseBufferDMA; | 6843 | dma_addr_t RequestSenseBufferDMA; |
6844 | long ErrorCode = -EINVAL; | ||
6858 | 6845 | ||
6859 | ErrorCode = -EINVAL; | ||
6860 | if (UserSpaceUserCommand == NULL) | 6846 | if (UserSpaceUserCommand == NULL) |
6861 | break; | 6847 | goto out; |
6862 | if (copy_from_user(&UserCommand, UserSpaceUserCommand, | 6848 | if (copy_from_user(&UserCommand, UserSpaceUserCommand, |
6863 | sizeof(DAC960_V2_UserCommand_T))) { | 6849 | sizeof(DAC960_V2_UserCommand_T))) { |
6864 | ErrorCode = -EFAULT; | 6850 | ErrorCode = -EFAULT; |
6865 | break; | 6851 | goto out; |
6866 | } | 6852 | } |
6867 | ErrorCode = -ENXIO; | 6853 | ErrorCode = -ENXIO; |
6868 | ControllerNumber = UserCommand.ControllerNumber; | 6854 | ControllerNumber = UserCommand.ControllerNumber; |
6869 | if (ControllerNumber < 0 || | 6855 | if (ControllerNumber < 0 || |
6870 | ControllerNumber > DAC960_ControllerCount - 1) | 6856 | ControllerNumber > DAC960_ControllerCount - 1) |
6871 | break; | 6857 | goto out; |
6872 | Controller = DAC960_Controllers[ControllerNumber]; | 6858 | Controller = DAC960_Controllers[ControllerNumber]; |
6873 | if (Controller == NULL) | 6859 | if (Controller == NULL) |
6874 | break; | 6860 | goto out; |
6875 | if (Controller->FirmwareType != DAC960_V2_Controller){ | 6861 | if (Controller->FirmwareType != DAC960_V2_Controller){ |
6876 | ErrorCode = -EINVAL; | 6862 | ErrorCode = -EINVAL; |
6877 | break; | 6863 | goto out; |
6878 | } | 6864 | } |
6879 | DataTransferLength = UserCommand.DataTransferLength; | 6865 | DataTransferLength = UserCommand.DataTransferLength; |
6880 | ErrorCode = -ENOMEM; | 6866 | ErrorCode = -ENOMEM; |
@@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
6884 | DataTransferLength, | 6870 | DataTransferLength, |
6885 | &DataTransferBufferDMA); | 6871 | &DataTransferBufferDMA); |
6886 | if (DataTransferBuffer == NULL) | 6872 | if (DataTransferBuffer == NULL) |
6887 | break; | 6873 | goto out; |
6888 | } | 6874 | } |
6889 | else if (DataTransferLength < 0) | 6875 | else if (DataTransferLength < 0) |
6890 | { | 6876 | { |
6891 | DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, | 6877 | DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, |
6892 | -DataTransferLength, &DataTransferBufferDMA); | 6878 | -DataTransferLength, &DataTransferBufferDMA); |
6893 | if (DataTransferBuffer == NULL) | 6879 | if (DataTransferBuffer == NULL) |
6894 | break; | 6880 | goto out; |
6895 | if (copy_from_user(DataTransferBuffer, | 6881 | if (copy_from_user(DataTransferBuffer, |
6896 | UserCommand.DataTransferBuffer, | 6882 | UserCommand.DataTransferBuffer, |
6897 | -DataTransferLength)) { | 6883 | -DataTransferLength)) { |
@@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
7001 | if (RequestSenseBuffer != NULL) | 6987 | if (RequestSenseBuffer != NULL) |
7002 | pci_free_consistent(Controller->PCIDevice, RequestSenseLength, | 6988 | pci_free_consistent(Controller->PCIDevice, RequestSenseLength, |
7003 | RequestSenseBuffer, RequestSenseBufferDMA); | 6989 | RequestSenseBuffer, RequestSenseBufferDMA); |
7004 | break; | 6990 | out: |
7005 | } | 6991 | return ErrorCode; |
7006 | case DAC960_IOCTL_V2_GET_HEALTH_STATUS: | 6992 | } |
7007 | { | 6993 | |
7008 | DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = | 6994 | static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus) |
7009 | (DAC960_V2_GetHealthStatus_T __user *) Argument; | 6995 | { |
7010 | DAC960_V2_GetHealthStatus_T GetHealthStatus; | 6996 | DAC960_V2_GetHealthStatus_T GetHealthStatus; |
7011 | DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; | 6997 | DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; |
7012 | DAC960_Controller_T *Controller; | 6998 | DAC960_Controller_T *Controller; |
7013 | int ControllerNumber; | 6999 | int ControllerNumber; |
7000 | long ErrorCode; | ||
7001 | |||
7014 | if (UserSpaceGetHealthStatus == NULL) { | 7002 | if (UserSpaceGetHealthStatus == NULL) { |
7015 | ErrorCode = -EINVAL; | 7003 | ErrorCode = -EINVAL; |
7016 | break; | 7004 | goto out; |
7017 | } | 7005 | } |
7018 | if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, | 7006 | if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, |
7019 | sizeof(DAC960_V2_GetHealthStatus_T))) { | 7007 | sizeof(DAC960_V2_GetHealthStatus_T))) { |
7020 | ErrorCode = -EFAULT; | 7008 | ErrorCode = -EFAULT; |
7021 | break; | 7009 | goto out; |
7022 | } | 7010 | } |
7023 | ErrorCode = -ENXIO; | 7011 | ErrorCode = -ENXIO; |
7024 | ControllerNumber = GetHealthStatus.ControllerNumber; | 7012 | ControllerNumber = GetHealthStatus.ControllerNumber; |
7025 | if (ControllerNumber < 0 || | 7013 | if (ControllerNumber < 0 || |
7026 | ControllerNumber > DAC960_ControllerCount - 1) | 7014 | ControllerNumber > DAC960_ControllerCount - 1) |
7027 | break; | 7015 | goto out; |
7028 | Controller = DAC960_Controllers[ControllerNumber]; | 7016 | Controller = DAC960_Controllers[ControllerNumber]; |
7029 | if (Controller == NULL) | 7017 | if (Controller == NULL) |
7030 | break; | 7018 | goto out; |
7031 | if (Controller->FirmwareType != DAC960_V2_Controller) { | 7019 | if (Controller->FirmwareType != DAC960_V2_Controller) { |
7032 | ErrorCode = -EINVAL; | 7020 | ErrorCode = -EINVAL; |
7033 | break; | 7021 | goto out; |
7034 | } | 7022 | } |
7035 | if (copy_from_user(&HealthStatusBuffer, | 7023 | if (copy_from_user(&HealthStatusBuffer, |
7036 | GetHealthStatus.HealthStatusBuffer, | 7024 | GetHealthStatus.HealthStatusBuffer, |
7037 | sizeof(DAC960_V2_HealthStatusBuffer_T))) { | 7025 | sizeof(DAC960_V2_HealthStatusBuffer_T))) { |
7038 | ErrorCode = -EFAULT; | 7026 | ErrorCode = -EFAULT; |
7039 | break; | 7027 | goto out; |
7040 | } | 7028 | } |
7041 | ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, | 7029 | ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, |
7042 | !(Controller->V2.HealthStatusBuffer->StatusChangeCounter | 7030 | !(Controller->V2.HealthStatusBuffer->StatusChangeCounter |
@@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
7046 | DAC960_MonitoringTimerInterval); | 7034 | DAC960_MonitoringTimerInterval); |
7047 | if (ErrorCode == -ERESTARTSYS) { | 7035 | if (ErrorCode == -ERESTARTSYS) { |
7048 | ErrorCode = -EINTR; | 7036 | ErrorCode = -EINTR; |
7049 | break; | 7037 | goto out; |
7050 | } | 7038 | } |
7051 | if (copy_to_user(GetHealthStatus.HealthStatusBuffer, | 7039 | if (copy_to_user(GetHealthStatus.HealthStatusBuffer, |
7052 | Controller->V2.HealthStatusBuffer, | 7040 | Controller->V2.HealthStatusBuffer, |
@@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | |||
7054 | ErrorCode = -EFAULT; | 7042 | ErrorCode = -EFAULT; |
7055 | else | 7043 | else |
7056 | ErrorCode = 0; | 7044 | ErrorCode = 0; |
7057 | } | 7045 | |
7046 | out: | ||
7047 | return ErrorCode; | ||
7048 | } | ||
7049 | |||
7050 | /* | ||
7051 | * DAC960_gam_ioctl is the ioctl function for performing RAID operations. | ||
7052 | */ | ||
7053 | |||
7054 | static long DAC960_gam_ioctl(struct file *file, unsigned int Request, | ||
7055 | unsigned long Argument) | ||
7056 | { | ||
7057 | long ErrorCode = 0; | ||
7058 | void __user *argp = (void __user *)Argument; | ||
7059 | if (!capable(CAP_SYS_ADMIN)) return -EACCES; | ||
7060 | |||
7061 | mutex_lock(&DAC960_mutex); | ||
7062 | switch (Request) | ||
7063 | { | ||
7064 | case DAC960_IOCTL_GET_CONTROLLER_COUNT: | ||
7065 | ErrorCode = DAC960_ControllerCount; | ||
7066 | break; | ||
7067 | case DAC960_IOCTL_GET_CONTROLLER_INFO: | ||
7068 | ErrorCode = DAC960_gam_get_controller_info(argp); | ||
7069 | break; | ||
7070 | case DAC960_IOCTL_V1_EXECUTE_COMMAND: | ||
7071 | ErrorCode = DAC960_gam_v1_execute_command(argp); | ||
7072 | break; | ||
7073 | case DAC960_IOCTL_V2_EXECUTE_COMMAND: | ||
7074 | ErrorCode = DAC960_gam_v2_execute_command(argp); | ||
7075 | break; | ||
7076 | case DAC960_IOCTL_V2_GET_HEALTH_STATUS: | ||
7077 | ErrorCode = DAC960_gam_v2_get_health_status(argp); | ||
7058 | break; | 7078 | break; |
7059 | default: | 7079 | default: |
7060 | ErrorCode = -ENOTTY; | 7080 | ErrorCode = -ENOTTY; |
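The DAC960 hunks above split the single DAC960_gam_ioctl() switch statement into one helper per ioctl command; each helper replaces the old break-out-of-switch error handling with a goto out / return ErrorCode exit. A minimal sketch of that shape follows — the struct, the bound, and the helper name are illustrative only, not taken from the driver:

#include <linux/uaccess.h>	/* copy_from_user() */

struct example_args {
	int index;
};

static int example_count = 4;	/* stand-in for a DAC960_ControllerCount-style bound */

static long example_ioctl_helper(void __user *argp)
{
	struct example_args args;
	long err;

	if (!argp)
		return -EINVAL;
	if (copy_from_user(&args, argp, sizeof(args))) {
		err = -EFAULT;
		goto out;
	}
	err = -ENXIO;
	if (args.index < 0 || args.index >= example_count)
		goto out;

	err = 0;	/* command-specific work would run here */
out:
	return err;	/* single exit point, as in the new helpers */
}

The dispatcher then shrinks to a switch that forwards argp to the matching helper under DAC960_mutex, exactly as the new DAC960_gam_ioctl() body shows.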
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 40579d0cb3d1..ad9b687a236a 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK | |||
20 | tristate "Null test block driver" | 20 | tristate "Null test block driver" |
21 | select CONFIGFS_FS | 21 | select CONFIGFS_FS |
22 | 22 | ||
23 | config BLK_DEV_NULL_BLK_FAULT_INJECTION | ||
24 | bool "Support fault injection for Null test block driver" | ||
25 | depends on BLK_DEV_NULL_BLK && FAULT_INJECTION | ||
26 | |||
23 | config BLK_DEV_FD | 27 | config BLK_DEV_FD |
24 | tristate "Normal floppy disk support" | 28 | tristate "Normal floppy disk support" |
25 | depends on ARCH_MAY_HAVE_PC_FDC | 29 | depends on ARCH_MAY_HAVE_PC_FDC |
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 9220f8e833d0..c0ebda1283cc 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h | |||
@@ -112,8 +112,7 @@ enum frame_flags { | |||
112 | struct frame { | 112 | struct frame { |
113 | struct list_head head; | 113 | struct list_head head; |
114 | u32 tag; | 114 | u32 tag; |
115 | struct timeval sent; /* high-res time packet was sent */ | 115 | ktime_t sent; /* high-res time packet was sent */ |
116 | u32 sent_jiffs; /* low-res jiffies-based sent time */ | ||
117 | ulong waited; | 116 | ulong waited; |
118 | ulong waited_total; | 117 | ulong waited_total; |
119 | struct aoetgt *t; /* parent target I belong to */ | 118 | struct aoetgt *t; /* parent target I belong to */ |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 812fed069708..540bb60cd071 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c | |||
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d) | |||
398 | 398 | ||
399 | skb = skb_clone(f->skb, GFP_ATOMIC); | 399 | skb = skb_clone(f->skb, GFP_ATOMIC); |
400 | if (skb) { | 400 | if (skb) { |
401 | do_gettimeofday(&f->sent); | 401 | f->sent = ktime_get(); |
402 | f->sent_jiffs = (u32) jiffies; | ||
403 | __skb_queue_head_init(&queue); | 402 | __skb_queue_head_init(&queue); |
404 | __skb_queue_tail(&queue, skb); | 403 | __skb_queue_tail(&queue, skb); |
405 | aoenet_xmit(&queue); | 404 | aoenet_xmit(&queue); |
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f) | |||
489 | skb = skb_clone(skb, GFP_ATOMIC); | 488 | skb = skb_clone(skb, GFP_ATOMIC); |
490 | if (skb == NULL) | 489 | if (skb == NULL) |
491 | return; | 490 | return; |
492 | do_gettimeofday(&f->sent); | 491 | f->sent = ktime_get(); |
493 | f->sent_jiffs = (u32) jiffies; | ||
494 | __skb_queue_head_init(&queue); | 492 | __skb_queue_head_init(&queue); |
495 | __skb_queue_tail(&queue, skb); | 493 | __skb_queue_tail(&queue, skb); |
496 | aoenet_xmit(&queue); | 494 | aoenet_xmit(&queue); |
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f) | |||
499 | static int | 497 | static int |
500 | tsince_hr(struct frame *f) | 498 | tsince_hr(struct frame *f) |
501 | { | 499 | { |
502 | struct timeval now; | 500 | u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent)); |
503 | int n; | ||
504 | 501 | ||
505 | do_gettimeofday(&now); | 502 | /* delta is normally under 4.2 seconds, avoid 64-bit division */ |
506 | n = now.tv_usec - f->sent.tv_usec; | 503 | if (likely(delta <= UINT_MAX)) |
507 | n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; | 504 | return (u32)delta / NSEC_PER_USEC; |
508 | 505 | ||
509 | if (n < 0) | 506 | /* avoid overflow after 71 minutes */ |
510 | n = -n; | 507 | if (delta > ((u64)INT_MAX * NSEC_PER_USEC)) |
508 | return INT_MAX; | ||
511 | 509 | ||
512 | /* For relatively long periods, use jiffies to avoid | 510 | return div_u64(delta, NSEC_PER_USEC); |
513 | * discrepancies caused by updates to the system time. | ||
514 | * | ||
515 | * On system with HZ of 1000, 32-bits is over 49 days | ||
516 | * worth of jiffies, or over 71 minutes worth of usecs. | ||
517 | * | ||
518 | * Jiffies overflow is handled by subtraction of unsigned ints: | ||
519 | * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe | ||
520 | * $3 = 4 | ||
521 | * (gdb) | ||
522 | */ | ||
523 | if (n > USEC_PER_SEC / 4) { | ||
524 | n = ((u32) jiffies) - f->sent_jiffs; | ||
525 | n *= USEC_PER_SEC / HZ; | ||
526 | } | ||
527 | |||
528 | return n; | ||
529 | } | 511 | } |
530 | 512 | ||
531 | static int | 513 | static int |
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f) | |||
589 | nf->waited = 0; | 571 | nf->waited = 0; |
590 | nf->waited_total = f->waited_total; | 572 | nf->waited_total = f->waited_total; |
591 | nf->sent = f->sent; | 573 | nf->sent = f->sent; |
592 | nf->sent_jiffs = f->sent_jiffs; | ||
593 | f->skb = skb; | 574 | f->skb = skb; |
594 | 575 | ||
595 | return nf; | 576 | return nf; |
@@ -633,8 +614,7 @@ probe(struct aoetgt *t) | |||
633 | 614 | ||
634 | skb = skb_clone(f->skb, GFP_ATOMIC); | 615 | skb = skb_clone(f->skb, GFP_ATOMIC); |
635 | if (skb) { | 616 | if (skb) { |
636 | do_gettimeofday(&f->sent); | 617 | f->sent = ktime_get(); |
637 | f->sent_jiffs = (u32) jiffies; | ||
638 | __skb_queue_head_init(&queue); | 618 | __skb_queue_head_init(&queue); |
639 | __skb_queue_tail(&queue, skb); | 619 | __skb_queue_tail(&queue, skb); |
640 | aoenet_xmit(&queue); | 620 | aoenet_xmit(&queue); |
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d) | |||
1432 | d->timer.function = rexmit_timer; | 1412 | d->timer.function = rexmit_timer; |
1433 | 1413 | ||
1434 | skb = skb_clone(skb, GFP_ATOMIC); | 1414 | skb = skb_clone(skb, GFP_ATOMIC); |
1435 | if (skb) { | 1415 | if (skb) |
1436 | do_gettimeofday(&f->sent); | 1416 | f->sent = ktime_get(); |
1437 | f->sent_jiffs = (u32) jiffies; | ||
1438 | } | ||
1439 | 1417 | ||
1440 | return skb; | 1418 | return skb; |
1441 | } | 1419 | } |
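The aoe changes drop the dual struct timeval / jiffies timestamps in favour of a single monotonic ktime_t, so tsince_hr() becomes one ktime subtraction with a 64-bit division only for long intervals. The same measurement pattern, condensed into a standalone helper (the function name is illustrative):

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/math64.h>

/* Elapsed microseconds since 'sent', mirroring the new tsince_hr():
 * cheap 32-bit division for the common short interval, div_u64()
 * only once the delta no longer fits in a u32, capped at INT_MAX.
 */
static int elapsed_usecs(ktime_t sent)
{
	u64 delta = ktime_to_ns(ktime_sub(ktime_get(), sent));

	if (delta <= UINT_MAX)
		return (u32)delta / NSEC_PER_USEC;
	if (delta > (u64)INT_MAX * NSEC_PER_USEC)
		return INT_MAX;
	return div_u64(delta, NSEC_PER_USEC);
}

Because ktime_get() is monotonic, the removed jiffies fallback and its worry about wall-clock updates skewing the measurement are no longer needed.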
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index bd97908c766f..9f4e6f502b84 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio) | |||
953 | struct drbd_bm_aio_ctx *ctx = bio->bi_private; | 953 | struct drbd_bm_aio_ctx *ctx = bio->bi_private; |
954 | struct drbd_device *device = ctx->device; | 954 | struct drbd_device *device = ctx->device; |
955 | struct drbd_bitmap *b = device->bitmap; | 955 | struct drbd_bitmap *b = device->bitmap; |
956 | unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); | 956 | unsigned int idx = bm_page_to_idx(bio_first_page_all(bio)); |
957 | 957 | ||
958 | if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && | 958 | if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && |
959 | !bm_test_page_unchanged(b->bm_pages[idx])) | 959 | !bm_test_page_unchanged(b->bm_pages[idx])) |
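Here drbd stops dereferencing bio->bi_io_vec[0].bv_page directly and uses the bio_first_page_all() accessor instead; zram_page_end_io() further down gets the same substitution, so completions no longer open-code bvec indexing. An illustrative completion handler (not drbd's) showing the accessor in use:

#include <linux/bio.h>

static void example_end_io(struct bio *bio)
{
	/* was: struct page *page = bio->bi_io_vec[0].bv_page; */
	struct page *page = bio_first_page_all(bio);

	/* hand 'page' to whatever completion logic owns it ... */
	bio_put(bio);
}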
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index ad0477ae820f..6655893a3a7a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c | |||
@@ -12,9 +12,9 @@ | |||
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/blk-mq.h> | 13 | #include <linux/blk-mq.h> |
14 | #include <linux/hrtimer.h> | 14 | #include <linux/hrtimer.h> |
15 | #include <linux/lightnvm.h> | ||
16 | #include <linux/configfs.h> | 15 | #include <linux/configfs.h> |
17 | #include <linux/badblocks.h> | 16 | #include <linux/badblocks.h> |
17 | #include <linux/fault-inject.h> | ||
18 | 18 | ||
19 | #define SECTOR_SHIFT 9 | 19 | #define SECTOR_SHIFT 9 |
20 | #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) | 20 | #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) |
@@ -27,6 +27,10 @@ | |||
27 | #define TICKS_PER_SEC 50ULL | 27 | #define TICKS_PER_SEC 50ULL |
28 | #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) | 28 | #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) |
29 | 29 | ||
30 | #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION | ||
31 | static DECLARE_FAULT_ATTR(null_timeout_attr); | ||
32 | #endif | ||
33 | |||
30 | static inline u64 mb_per_tick(int mbps) | 34 | static inline u64 mb_per_tick(int mbps) |
31 | { | 35 | { |
32 | return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); | 36 | return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); |
@@ -107,7 +111,6 @@ struct nullb_device { | |||
107 | unsigned int hw_queue_depth; /* queue depth */ | 111 | unsigned int hw_queue_depth; /* queue depth */ |
108 | unsigned int index; /* index of the disk, only valid with a disk */ | 112 | unsigned int index; /* index of the disk, only valid with a disk */ |
109 | unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ | 113 | unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ |
110 | bool use_lightnvm; /* register as a LightNVM device */ | ||
111 | bool blocking; /* blocking blk-mq device */ | 114 | bool blocking; /* blocking blk-mq device */ |
112 | bool use_per_node_hctx; /* use per-node allocation for hardware context */ | 115 | bool use_per_node_hctx; /* use per-node allocation for hardware context */ |
113 | bool power; /* power on/off the device */ | 116 | bool power; /* power on/off the device */ |
@@ -121,7 +124,6 @@ struct nullb { | |||
121 | unsigned int index; | 124 | unsigned int index; |
122 | struct request_queue *q; | 125 | struct request_queue *q; |
123 | struct gendisk *disk; | 126 | struct gendisk *disk; |
124 | struct nvm_dev *ndev; | ||
125 | struct blk_mq_tag_set *tag_set; | 127 | struct blk_mq_tag_set *tag_set; |
126 | struct blk_mq_tag_set __tag_set; | 128 | struct blk_mq_tag_set __tag_set; |
127 | unsigned int queue_depth; | 129 | unsigned int queue_depth; |
@@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list); | |||
139 | static struct mutex lock; | 141 | static struct mutex lock; |
140 | static int null_major; | 142 | static int null_major; |
141 | static DEFINE_IDA(nullb_indexes); | 143 | static DEFINE_IDA(nullb_indexes); |
142 | static struct kmem_cache *ppa_cache; | ||
143 | static struct blk_mq_tag_set tag_set; | 144 | static struct blk_mq_tag_set tag_set; |
144 | 145 | ||
145 | enum { | 146 | enum { |
@@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE; | |||
166 | module_param_named(home_node, g_home_node, int, S_IRUGO); | 167 | module_param_named(home_node, g_home_node, int, S_IRUGO); |
167 | MODULE_PARM_DESC(home_node, "Home node for the device"); | 168 | MODULE_PARM_DESC(home_node, "Home node for the device"); |
168 | 169 | ||
170 | #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION | ||
171 | static char g_timeout_str[80]; | ||
172 | module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO); | ||
173 | #endif | ||
174 | |||
169 | static int g_queue_mode = NULL_Q_MQ; | 175 | static int g_queue_mode = NULL_Q_MQ; |
170 | 176 | ||
171 | static int null_param_store_val(const char *str, int *val, int min, int max) | 177 | static int null_param_store_val(const char *str, int *val, int min, int max) |
@@ -208,10 +214,6 @@ static int nr_devices = 1; | |||
208 | module_param(nr_devices, int, S_IRUGO); | 214 | module_param(nr_devices, int, S_IRUGO); |
209 | MODULE_PARM_DESC(nr_devices, "Number of devices to register"); | 215 | MODULE_PARM_DESC(nr_devices, "Number of devices to register"); |
210 | 216 | ||
211 | static bool g_use_lightnvm; | ||
212 | module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO); | ||
213 | MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); | ||
214 | |||
215 | static bool g_blocking; | 217 | static bool g_blocking; |
216 | module_param_named(blocking, g_blocking, bool, S_IRUGO); | 218 | module_param_named(blocking, g_blocking, bool, S_IRUGO); |
217 | MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); | 219 | MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); |
@@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint); | |||
345 | NULLB_DEVICE_ATTR(irqmode, uint); | 347 | NULLB_DEVICE_ATTR(irqmode, uint); |
346 | NULLB_DEVICE_ATTR(hw_queue_depth, uint); | 348 | NULLB_DEVICE_ATTR(hw_queue_depth, uint); |
347 | NULLB_DEVICE_ATTR(index, uint); | 349 | NULLB_DEVICE_ATTR(index, uint); |
348 | NULLB_DEVICE_ATTR(use_lightnvm, bool); | ||
349 | NULLB_DEVICE_ATTR(blocking, bool); | 350 | NULLB_DEVICE_ATTR(blocking, bool); |
350 | NULLB_DEVICE_ATTR(use_per_node_hctx, bool); | 351 | NULLB_DEVICE_ATTR(use_per_node_hctx, bool); |
351 | NULLB_DEVICE_ATTR(memory_backed, bool); | 352 | NULLB_DEVICE_ATTR(memory_backed, bool); |
@@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = { | |||
455 | &nullb_device_attr_irqmode, | 456 | &nullb_device_attr_irqmode, |
456 | &nullb_device_attr_hw_queue_depth, | 457 | &nullb_device_attr_hw_queue_depth, |
457 | &nullb_device_attr_index, | 458 | &nullb_device_attr_index, |
458 | &nullb_device_attr_use_lightnvm, | ||
459 | &nullb_device_attr_blocking, | 459 | &nullb_device_attr_blocking, |
460 | &nullb_device_attr_use_per_node_hctx, | 460 | &nullb_device_attr_use_per_node_hctx, |
461 | &nullb_device_attr_power, | 461 | &nullb_device_attr_power, |
@@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void) | |||
573 | dev->blocksize = g_bs; | 573 | dev->blocksize = g_bs; |
574 | dev->irqmode = g_irqmode; | 574 | dev->irqmode = g_irqmode; |
575 | dev->hw_queue_depth = g_hw_queue_depth; | 575 | dev->hw_queue_depth = g_hw_queue_depth; |
576 | dev->use_lightnvm = g_use_lightnvm; | ||
577 | dev->blocking = g_blocking; | 576 | dev->blocking = g_blocking; |
578 | dev->use_per_node_hctx = g_use_per_node_hctx; | 577 | dev->use_per_node_hctx = g_use_per_node_hctx; |
579 | return dev; | 578 | return dev; |
@@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) | |||
1352 | return BLK_QC_T_NONE; | 1351 | return BLK_QC_T_NONE; |
1353 | } | 1352 | } |
1354 | 1353 | ||
1354 | static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq) | ||
1355 | { | ||
1356 | pr_info("null: rq %p timed out\n", rq); | ||
1357 | return BLK_EH_HANDLED; | ||
1358 | } | ||
1359 | |||
1355 | static int null_rq_prep_fn(struct request_queue *q, struct request *req) | 1360 | static int null_rq_prep_fn(struct request_queue *q, struct request *req) |
1356 | { | 1361 | { |
1357 | struct nullb *nullb = q->queuedata; | 1362 | struct nullb *nullb = q->queuedata; |
@@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req) | |||
1369 | return BLKPREP_DEFER; | 1374 | return BLKPREP_DEFER; |
1370 | } | 1375 | } |
1371 | 1376 | ||
1377 | static bool should_timeout_request(struct request *rq) | ||
1378 | { | ||
1379 | #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION | ||
1380 | if (g_timeout_str[0]) | ||
1381 | return should_fail(&null_timeout_attr, 1); | ||
1382 | #endif | ||
1383 | |||
1384 | return false; | ||
1385 | } | ||
1386 | |||
1372 | static void null_request_fn(struct request_queue *q) | 1387 | static void null_request_fn(struct request_queue *q) |
1373 | { | 1388 | { |
1374 | struct request *rq; | 1389 | struct request *rq; |
@@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q) | |||
1376 | while ((rq = blk_fetch_request(q)) != NULL) { | 1391 | while ((rq = blk_fetch_request(q)) != NULL) { |
1377 | struct nullb_cmd *cmd = rq->special; | 1392 | struct nullb_cmd *cmd = rq->special; |
1378 | 1393 | ||
1379 | spin_unlock_irq(q->queue_lock); | 1394 | if (!should_timeout_request(rq)) { |
1380 | null_handle_cmd(cmd); | 1395 | spin_unlock_irq(q->queue_lock); |
1381 | spin_lock_irq(q->queue_lock); | 1396 | null_handle_cmd(cmd); |
1397 | spin_lock_irq(q->queue_lock); | ||
1398 | } | ||
1382 | } | 1399 | } |
1383 | } | 1400 | } |
1384 | 1401 | ||
1402 | static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) | ||
1403 | { | ||
1404 | pr_info("null: rq %p timed out\n", rq); | ||
1405 | return BLK_EH_HANDLED; | ||
1406 | } | ||
1407 | |||
1385 | static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, | 1408 | static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, |
1386 | const struct blk_mq_queue_data *bd) | 1409 | const struct blk_mq_queue_data *bd) |
1387 | { | 1410 | { |
@@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1399 | 1422 | ||
1400 | blk_mq_start_request(bd->rq); | 1423 | blk_mq_start_request(bd->rq); |
1401 | 1424 | ||
1402 | return null_handle_cmd(cmd); | 1425 | if (!should_timeout_request(bd->rq)) |
1426 | return null_handle_cmd(cmd); | ||
1427 | |||
1428 | return BLK_STS_OK; | ||
1403 | } | 1429 | } |
1404 | 1430 | ||
1405 | static const struct blk_mq_ops null_mq_ops = { | 1431 | static const struct blk_mq_ops null_mq_ops = { |
1406 | .queue_rq = null_queue_rq, | 1432 | .queue_rq = null_queue_rq, |
1407 | .complete = null_softirq_done_fn, | 1433 | .complete = null_softirq_done_fn, |
1434 | .timeout = null_timeout_rq, | ||
1408 | }; | 1435 | }; |
1409 | 1436 | ||
1410 | static void cleanup_queue(struct nullb_queue *nq) | 1437 | static void cleanup_queue(struct nullb_queue *nq) |
@@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb) | |||
1423 | kfree(nullb->queues); | 1450 | kfree(nullb->queues); |
1424 | } | 1451 | } |
1425 | 1452 | ||
1426 | #ifdef CONFIG_NVM | ||
1427 | |||
1428 | static void null_lnvm_end_io(struct request *rq, blk_status_t status) | ||
1429 | { | ||
1430 | struct nvm_rq *rqd = rq->end_io_data; | ||
1431 | |||
1432 | /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */ | ||
1433 | rqd->error = status ? -EIO : 0; | ||
1434 | nvm_end_io(rqd); | ||
1435 | |||
1436 | blk_put_request(rq); | ||
1437 | } | ||
1438 | |||
1439 | static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) | ||
1440 | { | ||
1441 | struct request_queue *q = dev->q; | ||
1442 | struct request *rq; | ||
1443 | struct bio *bio = rqd->bio; | ||
1444 | |||
1445 | rq = blk_mq_alloc_request(q, | ||
1446 | op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); | ||
1447 | if (IS_ERR(rq)) | ||
1448 | return -ENOMEM; | ||
1449 | |||
1450 | blk_init_request_from_bio(rq, bio); | ||
1451 | |||
1452 | rq->end_io_data = rqd; | ||
1453 | |||
1454 | blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io); | ||
1455 | |||
1456 | return 0; | ||
1457 | } | ||
1458 | |||
1459 | static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) | ||
1460 | { | ||
1461 | struct nullb *nullb = dev->q->queuedata; | ||
1462 | sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL; | ||
1463 | sector_t blksize; | ||
1464 | struct nvm_id_group *grp; | ||
1465 | |||
1466 | id->ver_id = 0x1; | ||
1467 | id->vmnt = 0; | ||
1468 | id->cap = 0x2; | ||
1469 | id->dom = 0x1; | ||
1470 | |||
1471 | id->ppaf.blk_offset = 0; | ||
1472 | id->ppaf.blk_len = 16; | ||
1473 | id->ppaf.pg_offset = 16; | ||
1474 | id->ppaf.pg_len = 16; | ||
1475 | id->ppaf.sect_offset = 32; | ||
1476 | id->ppaf.sect_len = 8; | ||
1477 | id->ppaf.pln_offset = 40; | ||
1478 | id->ppaf.pln_len = 8; | ||
1479 | id->ppaf.lun_offset = 48; | ||
1480 | id->ppaf.lun_len = 8; | ||
1481 | id->ppaf.ch_offset = 56; | ||
1482 | id->ppaf.ch_len = 8; | ||
1483 | |||
1484 | sector_div(size, nullb->dev->blocksize); /* convert size to pages */ | ||
1485 | size >>= 8; /* concert size to pgs pr blk */ | ||
1486 | grp = &id->grp; | ||
1487 | grp->mtype = 0; | ||
1488 | grp->fmtype = 0; | ||
1489 | grp->num_ch = 1; | ||
1490 | grp->num_pg = 256; | ||
1491 | blksize = size; | ||
1492 | size >>= 16; | ||
1493 | grp->num_lun = size + 1; | ||
1494 | sector_div(blksize, grp->num_lun); | ||
1495 | grp->num_blk = blksize; | ||
1496 | grp->num_pln = 1; | ||
1497 | |||
1498 | grp->fpg_sz = nullb->dev->blocksize; | ||
1499 | grp->csecs = nullb->dev->blocksize; | ||
1500 | grp->trdt = 25000; | ||
1501 | grp->trdm = 25000; | ||
1502 | grp->tprt = 500000; | ||
1503 | grp->tprm = 500000; | ||
1504 | grp->tbet = 1500000; | ||
1505 | grp->tbem = 1500000; | ||
1506 | grp->mpos = 0x010101; /* single plane rwe */ | ||
1507 | grp->cpar = nullb->dev->hw_queue_depth; | ||
1508 | |||
1509 | return 0; | ||
1510 | } | ||
1511 | |||
1512 | static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) | ||
1513 | { | ||
1514 | mempool_t *virtmem_pool; | ||
1515 | |||
1516 | virtmem_pool = mempool_create_slab_pool(64, ppa_cache); | ||
1517 | if (!virtmem_pool) { | ||
1518 | pr_err("null_blk: Unable to create virtual memory pool\n"); | ||
1519 | return NULL; | ||
1520 | } | ||
1521 | |||
1522 | return virtmem_pool; | ||
1523 | } | ||
1524 | |||
1525 | static void null_lnvm_destroy_dma_pool(void *pool) | ||
1526 | { | ||
1527 | mempool_destroy(pool); | ||
1528 | } | ||
1529 | |||
1530 | static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, | ||
1531 | gfp_t mem_flags, dma_addr_t *dma_handler) | ||
1532 | { | ||
1533 | return mempool_alloc(pool, mem_flags); | ||
1534 | } | ||
1535 | |||
1536 | static void null_lnvm_dev_dma_free(void *pool, void *entry, | ||
1537 | dma_addr_t dma_handler) | ||
1538 | { | ||
1539 | mempool_free(entry, pool); | ||
1540 | } | ||
1541 | |||
1542 | static struct nvm_dev_ops null_lnvm_dev_ops = { | ||
1543 | .identity = null_lnvm_id, | ||
1544 | .submit_io = null_lnvm_submit_io, | ||
1545 | |||
1546 | .create_dma_pool = null_lnvm_create_dma_pool, | ||
1547 | .destroy_dma_pool = null_lnvm_destroy_dma_pool, | ||
1548 | .dev_dma_alloc = null_lnvm_dev_dma_alloc, | ||
1549 | .dev_dma_free = null_lnvm_dev_dma_free, | ||
1550 | |||
1551 | /* Simulate nvme protocol restriction */ | ||
1552 | .max_phys_sect = 64, | ||
1553 | }; | ||
1554 | |||
1555 | static int null_nvm_register(struct nullb *nullb) | ||
1556 | { | ||
1557 | struct nvm_dev *dev; | ||
1558 | int rv; | ||
1559 | |||
1560 | dev = nvm_alloc_dev(0); | ||
1561 | if (!dev) | ||
1562 | return -ENOMEM; | ||
1563 | |||
1564 | dev->q = nullb->q; | ||
1565 | memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); | ||
1566 | dev->ops = &null_lnvm_dev_ops; | ||
1567 | |||
1568 | rv = nvm_register(dev); | ||
1569 | if (rv) { | ||
1570 | kfree(dev); | ||
1571 | return rv; | ||
1572 | } | ||
1573 | nullb->ndev = dev; | ||
1574 | return 0; | ||
1575 | } | ||
1576 | |||
1577 | static void null_nvm_unregister(struct nullb *nullb) | ||
1578 | { | ||
1579 | nvm_unregister(nullb->ndev); | ||
1580 | } | ||
1581 | #else | ||
1582 | static int null_nvm_register(struct nullb *nullb) | ||
1583 | { | ||
1584 | pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n"); | ||
1585 | return -EINVAL; | ||
1586 | } | ||
1587 | static void null_nvm_unregister(struct nullb *nullb) {} | ||
1588 | #endif /* CONFIG_NVM */ | ||
1589 | |||
1590 | static void null_del_dev(struct nullb *nullb) | 1453 | static void null_del_dev(struct nullb *nullb) |
1591 | { | 1454 | { |
1592 | struct nullb_device *dev = nullb->dev; | 1455 | struct nullb_device *dev = nullb->dev; |
@@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb) | |||
1595 | 1458 | ||
1596 | list_del_init(&nullb->list); | 1459 | list_del_init(&nullb->list); |
1597 | 1460 | ||
1598 | if (dev->use_lightnvm) | 1461 | del_gendisk(nullb->disk); |
1599 | null_nvm_unregister(nullb); | ||
1600 | else | ||
1601 | del_gendisk(nullb->disk); | ||
1602 | 1462 | ||
1603 | if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { | 1463 | if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { |
1604 | hrtimer_cancel(&nullb->bw_timer); | 1464 | hrtimer_cancel(&nullb->bw_timer); |
@@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb) | |||
1610 | if (dev->queue_mode == NULL_Q_MQ && | 1470 | if (dev->queue_mode == NULL_Q_MQ && |
1611 | nullb->tag_set == &nullb->__tag_set) | 1471 | nullb->tag_set == &nullb->__tag_set) |
1612 | blk_mq_free_tag_set(nullb->tag_set); | 1472 | blk_mq_free_tag_set(nullb->tag_set); |
1613 | if (!dev->use_lightnvm) | 1473 | put_disk(nullb->disk); |
1614 | put_disk(nullb->disk); | ||
1615 | cleanup_queues(nullb); | 1474 | cleanup_queues(nullb); |
1616 | if (null_cache_active(nullb)) | 1475 | if (null_cache_active(nullb)) |
1617 | null_free_device_storage(nullb->dev, true); | 1476 | null_free_device_storage(nullb->dev, true); |
@@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev) | |||
1775 | { | 1634 | { |
1776 | dev->blocksize = round_down(dev->blocksize, 512); | 1635 | dev->blocksize = round_down(dev->blocksize, 512); |
1777 | dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); | 1636 | dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); |
1778 | if (dev->use_lightnvm && dev->blocksize != 4096) | ||
1779 | dev->blocksize = 4096; | ||
1780 | |||
1781 | if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ) | ||
1782 | dev->queue_mode = NULL_Q_MQ; | ||
1783 | 1637 | ||
1784 | if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { | 1638 | if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { |
1785 | if (dev->submit_queues != nr_online_nodes) | 1639 | if (dev->submit_queues != nr_online_nodes) |
@@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev) | |||
1805 | dev->mbps = 0; | 1659 | dev->mbps = 0; |
1806 | } | 1660 | } |
1807 | 1661 | ||
1662 | static bool null_setup_fault(void) | ||
1663 | { | ||
1664 | #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION | ||
1665 | if (!g_timeout_str[0]) | ||
1666 | return true; | ||
1667 | |||
1668 | if (!setup_fault_attr(&null_timeout_attr, g_timeout_str)) | ||
1669 | return false; | ||
1670 | |||
1671 | null_timeout_attr.verbose = 0; | ||
1672 | #endif | ||
1673 | return true; | ||
1674 | } | ||
1675 | |||
1808 | static int null_add_dev(struct nullb_device *dev) | 1676 | static int null_add_dev(struct nullb_device *dev) |
1809 | { | 1677 | { |
1810 | struct nullb *nullb; | 1678 | struct nullb *nullb; |
@@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev) | |||
1838 | if (rv) | 1706 | if (rv) |
1839 | goto out_cleanup_queues; | 1707 | goto out_cleanup_queues; |
1840 | 1708 | ||
1709 | if (!null_setup_fault()) | ||
1710 | goto out_cleanup_queues; | ||
1711 | |||
1712 | nullb->tag_set->timeout = 5 * HZ; | ||
1841 | nullb->q = blk_mq_init_queue(nullb->tag_set); | 1713 | nullb->q = blk_mq_init_queue(nullb->tag_set); |
1842 | if (IS_ERR(nullb->q)) { | 1714 | if (IS_ERR(nullb->q)) { |
1843 | rv = -ENOMEM; | 1715 | rv = -ENOMEM; |
@@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev) | |||
1861 | rv = -ENOMEM; | 1733 | rv = -ENOMEM; |
1862 | goto out_cleanup_queues; | 1734 | goto out_cleanup_queues; |
1863 | } | 1735 | } |
1736 | |||
1737 | if (!null_setup_fault()) | ||
1738 | goto out_cleanup_blk_queue; | ||
1739 | |||
1864 | blk_queue_prep_rq(nullb->q, null_rq_prep_fn); | 1740 | blk_queue_prep_rq(nullb->q, null_rq_prep_fn); |
1865 | blk_queue_softirq_done(nullb->q, null_softirq_done_fn); | 1741 | blk_queue_softirq_done(nullb->q, null_softirq_done_fn); |
1742 | blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn); | ||
1743 | nullb->q->rq_timeout = 5 * HZ; | ||
1866 | rv = init_driver_queues(nullb); | 1744 | rv = init_driver_queues(nullb); |
1867 | if (rv) | 1745 | if (rv) |
1868 | goto out_cleanup_blk_queue; | 1746 | goto out_cleanup_blk_queue; |
@@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev) | |||
1895 | 1773 | ||
1896 | sprintf(nullb->disk_name, "nullb%d", nullb->index); | 1774 | sprintf(nullb->disk_name, "nullb%d", nullb->index); |
1897 | 1775 | ||
1898 | if (dev->use_lightnvm) | 1776 | rv = null_gendisk_register(nullb); |
1899 | rv = null_nvm_register(nullb); | ||
1900 | else | ||
1901 | rv = null_gendisk_register(nullb); | ||
1902 | |||
1903 | if (rv) | 1777 | if (rv) |
1904 | goto out_cleanup_blk_queue; | 1778 | goto out_cleanup_blk_queue; |
1905 | 1779 | ||
@@ -1938,18 +1812,6 @@ static int __init null_init(void) | |||
1938 | g_bs = PAGE_SIZE; | 1812 | g_bs = PAGE_SIZE; |
1939 | } | 1813 | } |
1940 | 1814 | ||
1941 | if (g_use_lightnvm && g_bs != 4096) { | ||
1942 | pr_warn("null_blk: LightNVM only supports 4k block size\n"); | ||
1943 | pr_warn("null_blk: defaults block size to 4k\n"); | ||
1944 | g_bs = 4096; | ||
1945 | } | ||
1946 | |||
1947 | if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) { | ||
1948 | pr_warn("null_blk: LightNVM only supported for blk-mq\n"); | ||
1949 | pr_warn("null_blk: defaults queue mode to blk-mq\n"); | ||
1950 | g_queue_mode = NULL_Q_MQ; | ||
1951 | } | ||
1952 | |||
1953 | if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { | 1815 | if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { |
1954 | if (g_submit_queues != nr_online_nodes) { | 1816 | if (g_submit_queues != nr_online_nodes) { |
1955 | pr_warn("null_blk: submit_queues param is set to %u.\n", | 1817 | pr_warn("null_blk: submit_queues param is set to %u.\n", |
@@ -1982,16 +1844,6 @@ static int __init null_init(void) | |||
1982 | goto err_conf; | 1844 | goto err_conf; |
1983 | } | 1845 | } |
1984 | 1846 | ||
1985 | if (g_use_lightnvm) { | ||
1986 | ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), | ||
1987 | 0, 0, NULL); | ||
1988 | if (!ppa_cache) { | ||
1989 | pr_err("null_blk: unable to create ppa cache\n"); | ||
1990 | ret = -ENOMEM; | ||
1991 | goto err_ppa; | ||
1992 | } | ||
1993 | } | ||
1994 | |||
1995 | for (i = 0; i < nr_devices; i++) { | 1847 | for (i = 0; i < nr_devices; i++) { |
1996 | dev = null_alloc_dev(); | 1848 | dev = null_alloc_dev(); |
1997 | if (!dev) { | 1849 | if (!dev) { |
@@ -2015,8 +1867,6 @@ err_dev: | |||
2015 | null_del_dev(nullb); | 1867 | null_del_dev(nullb); |
2016 | null_free_dev(dev); | 1868 | null_free_dev(dev); |
2017 | } | 1869 | } |
2018 | kmem_cache_destroy(ppa_cache); | ||
2019 | err_ppa: | ||
2020 | unregister_blkdev(null_major, "nullb"); | 1870 | unregister_blkdev(null_major, "nullb"); |
2021 | err_conf: | 1871 | err_conf: |
2022 | configfs_unregister_subsystem(&nullb_subsys); | 1872 | configfs_unregister_subsystem(&nullb_subsys); |
@@ -2047,8 +1897,6 @@ static void __exit null_exit(void) | |||
2047 | 1897 | ||
2048 | if (g_queue_mode == NULL_Q_MQ && shared_tags) | 1898 | if (g_queue_mode == NULL_Q_MQ && shared_tags) |
2049 | blk_mq_free_tag_set(&tag_set); | 1899 | blk_mq_free_tag_set(&tag_set); |
2050 | |||
2051 | kmem_cache_destroy(ppa_cache); | ||
2052 | } | 1900 | } |
2053 | 1901 | ||
2054 | module_init(null_init); | 1902 | module_init(null_init); |
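The null_blk changes add a timeout fault-injection hook, gated by the CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION option from the Kconfig hunk earlier: a module parameter string is parsed with setup_fault_attr(), should_fail() then decides per request whether to swallow it, and the new timeout handlers (null_timeout_rq() for blk-mq, null_rq_timed_out_fn() for the legacy path, with rq_timeout set to 5*HZ) complete the request once the block layer notices. A minimal sketch of the same fault-attr plumbing outside null_blk — the names are illustrative, and the parameter string follows the standard "interval,probability,space,times" fault-injection format:

#include <linux/fault-inject.h>
#include <linux/module.h>

static DECLARE_FAULT_ATTR(example_fail_attr);

/* e.g. modprobe <module> fail="1,100,0,-1" to fail every request */
static char example_fail_str[80];
module_param_string(fail, example_fail_str, sizeof(example_fail_str), 0444);

static bool example_setup_fault(void)
{
	if (!example_fail_str[0])
		return true;		/* fault injection stays disabled */
	return setup_fault_attr(&example_fail_attr, example_fail_str);
}

static bool example_should_fail(void)
{
	if (!example_fail_str[0])
		return false;
	return should_fail(&example_fail_attr, 1);
}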
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 67974796c350..531a0915066b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c | |||
@@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) | |||
2579 | bdev = bdget(dev); | 2579 | bdev = bdget(dev); |
2580 | if (!bdev) | 2580 | if (!bdev) |
2581 | return -ENOMEM; | 2581 | return -ENOMEM; |
2582 | ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); | ||
2583 | if (ret) | ||
2584 | return ret; | ||
2582 | if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { | 2585 | if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { |
2583 | WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); | 2586 | WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); |
2584 | bdput(bdev); | 2587 | blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); |
2585 | return -EINVAL; | 2588 | return -EINVAL; |
2586 | } | 2589 | } |
2587 | ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); | ||
2588 | if (ret) | ||
2589 | return ret; | ||
2590 | 2590 | ||
2591 | /* This is safe, since we have a reference from open(). */ | 2591 | /* This is safe, since we have a reference from open(). */ |
2592 | __module_get(THIS_MODULE); | 2592 | __module_get(THIS_MODULE); |
@@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) | |||
2745 | pd->pkt_dev = MKDEV(pktdev_major, idx); | 2745 | pd->pkt_dev = MKDEV(pktdev_major, idx); |
2746 | ret = pkt_new_dev(pd, dev); | 2746 | ret = pkt_new_dev(pd, dev); |
2747 | if (ret) | 2747 | if (ret) |
2748 | goto out_new_dev; | 2748 | goto out_mem2; |
2749 | 2749 | ||
2750 | /* inherit events of the host device */ | 2750 | /* inherit events of the host device */ |
2751 | disk->events = pd->bdev->bd_disk->events; | 2751 | disk->events = pd->bdev->bd_disk->events; |
@@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) | |||
2763 | mutex_unlock(&ctl_mutex); | 2763 | mutex_unlock(&ctl_mutex); |
2764 | return 0; | 2764 | return 0; |
2765 | 2765 | ||
2766 | out_new_dev: | ||
2767 | blk_cleanup_queue(disk->queue); | ||
2768 | out_mem2: | 2766 | out_mem2: |
2769 | put_disk(disk); | 2767 | put_disk(disk); |
2770 | out_mem: | 2768 | out_mem: |
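The pktcdvd hunks adjust two error paths: pkt_new_dev() appears to take the blkdev_get() reference before the SCSI-passthrough queue check and releases it with blkdev_put() (not bdput()) if the check fails, and pkt_setup_dev() now unwinds through the existing out_mem2 label instead of a dedicated blk_cleanup_queue() step. The get/put pairing in isolation, with an illustrative sanity check standing in for the real one:

#include <linux/fs.h>
#include <linux/blkdev.h>

static int example_open_bdev(struct block_device *bdev)
{
	const fmode_t mode = FMODE_READ | FMODE_NDELAY;
	int ret;

	ret = blkdev_get(bdev, mode, NULL);
	if (ret)
		return ret;

	if (!bdev_get_queue(bdev)) {		/* stand-in for the real check */
		blkdev_put(bdev, mode);		/* undo blkdev_get(), not bdput() */
		return -EINVAL;
	}
	return 0;
}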
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h deleted file mode 100644 index e5565fbaeb30..000000000000 --- a/drivers/block/smart1,2.h +++ /dev/null | |||
@@ -1,278 +0,0 @@ | |||
1 | /* | ||
2 | * Disk Array driver for Compaq SMART2 Controllers | ||
3 | * Copyright 1998 Compaq Computer Corporation | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
13 | * NON INFRINGEMENT. See the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | * | ||
19 | * Questions/Comments/Bugfixes to iss_storagedev@hp.com | ||
20 | * | ||
21 | * If you want to make changes, improve or add functionality to this | ||
22 | * driver, you'll probably need the Compaq Array Controller Interface | ||
23 | * Specificiation (Document number ECG086/1198) | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * This file contains the controller communication implementation for | ||
28 | * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge, | ||
29 | * this should support: | ||
30 | * | ||
31 | * PCI: | ||
32 | * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200 | ||
33 | * Integerated SMART Array Controller, SMART-4200, SMART-4250ES | ||
34 | * | ||
35 | * EISA: | ||
36 | * SMART-2/E, SMART, IAES, IDA-2, IDA | ||
37 | */ | ||
38 | |||
39 | /* | ||
40 | * Memory mapped FIFO interface (SMART 42xx cards) | ||
41 | */ | ||
42 | static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c) | ||
43 | { | ||
44 | writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET); | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * This card is the opposite of the other cards. | ||
49 | * 0 turns interrupts on... | ||
50 | * 0x08 turns them off... | ||
51 | */ | ||
52 | static void smart4_intr_mask(ctlr_info_t *h, unsigned long val) | ||
53 | { | ||
54 | if (val) | ||
55 | { /* Turn interrupts on */ | ||
56 | writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); | ||
57 | } else /* Turn them off */ | ||
58 | { | ||
59 | writel( S42XX_INTR_OFF, | ||
60 | h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); | ||
61 | } | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * For older cards FIFO Full = 0. | ||
66 | * On this card 0 means there is room, anything else FIFO Full. | ||
67 | * | ||
68 | */ | ||
69 | static unsigned long smart4_fifo_full(ctlr_info_t *h) | ||
70 | { | ||
71 | |||
72 | return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET)); | ||
73 | } | ||
74 | |||
75 | /* This type of controller returns -1 if the fifo is empty, | ||
76 | * Not 0 like the others. | ||
77 | * And we need to let it know we read a value out | ||
78 | */ | ||
79 | static unsigned long smart4_completed(ctlr_info_t *h) | ||
80 | { | ||
81 | long register_value | ||
82 | = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET); | ||
83 | |||
84 | /* Fifo is empty */ | ||
85 | if( register_value == 0xffffffff) | ||
86 | return 0; | ||
87 | |||
88 | /* Need to let it know we got the reply */ | ||
89 | /* We do this by writing a 0 to the port we just read from */ | ||
90 | writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET); | ||
91 | |||
92 | return ((unsigned long) register_value); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * This hardware returns interrupt pending at a different place and | ||
97 | * it does not tell us if the fifo is empty, we will have check | ||
98 | * that by getting a 0 back from the command_completed call. | ||
99 | */ | ||
100 | static unsigned long smart4_intr_pending(ctlr_info_t *h) | ||
101 | { | ||
102 | unsigned long register_value = | ||
103 | readl(h->vaddr + S42XX_INTR_STATUS); | ||
104 | |||
105 | if( register_value & S42XX_INTR_PENDING) | ||
106 | return FIFO_NOT_EMPTY; | ||
107 | return 0 ; | ||
108 | } | ||
109 | |||
110 | static struct access_method smart4_access = { | ||
111 | smart4_submit_command, | ||
112 | smart4_intr_mask, | ||
113 | smart4_fifo_full, | ||
114 | smart4_intr_pending, | ||
115 | smart4_completed, | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards) | ||
120 | */ | ||
121 | static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c) | ||
122 | { | ||
123 | writel(c->busaddr, h->vaddr + COMMAND_FIFO); | ||
124 | } | ||
125 | |||
126 | static void smart2_intr_mask(ctlr_info_t *h, unsigned long val) | ||
127 | { | ||
128 | writel(val, h->vaddr + INTR_MASK); | ||
129 | } | ||
130 | |||
131 | static unsigned long smart2_fifo_full(ctlr_info_t *h) | ||
132 | { | ||
133 | return readl(h->vaddr + COMMAND_FIFO); | ||
134 | } | ||
135 | |||
136 | static unsigned long smart2_completed(ctlr_info_t *h) | ||
137 | { | ||
138 | return readl(h->vaddr + COMMAND_COMPLETE_FIFO); | ||
139 | } | ||
140 | |||
141 | static unsigned long smart2_intr_pending(ctlr_info_t *h) | ||
142 | { | ||
143 | return readl(h->vaddr + INTR_PENDING); | ||
144 | } | ||
145 | |||
146 | static struct access_method smart2_access = { | ||
147 | smart2_submit_command, | ||
148 | smart2_intr_mask, | ||
149 | smart2_fifo_full, | ||
150 | smart2_intr_pending, | ||
151 | smart2_completed, | ||
152 | }; | ||
153 | |||
154 | /* | ||
155 | * IO access for SMART-2/E cards | ||
156 | */ | ||
157 | static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c) | ||
158 | { | ||
159 | outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO); | ||
160 | } | ||
161 | |||
162 | static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val) | ||
163 | { | ||
164 | outl(val, h->io_mem_addr + INTR_MASK); | ||
165 | } | ||
166 | |||
167 | static unsigned long smart2e_fifo_full(ctlr_info_t *h) | ||
168 | { | ||
169 | return inl(h->io_mem_addr + COMMAND_FIFO); | ||
170 | } | ||
171 | |||
172 | static unsigned long smart2e_completed(ctlr_info_t *h) | ||
173 | { | ||
174 | return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO); | ||
175 | } | ||
176 | |||
177 | static unsigned long smart2e_intr_pending(ctlr_info_t *h) | ||
178 | { | ||
179 | return inl(h->io_mem_addr + INTR_PENDING); | ||
180 | } | ||
181 | |||
182 | static struct access_method smart2e_access = { | ||
183 | smart2e_submit_command, | ||
184 | smart2e_intr_mask, | ||
185 | smart2e_fifo_full, | ||
186 | smart2e_intr_pending, | ||
187 | smart2e_completed, | ||
188 | }; | ||
189 | |||
190 | /* | ||
191 | * IO access for older SMART-1 type cards | ||
192 | */ | ||
193 | #define SMART1_SYSTEM_MASK 0xC8E | ||
194 | #define SMART1_SYSTEM_DOORBELL 0xC8F | ||
195 | #define SMART1_LOCAL_MASK 0xC8C | ||
196 | #define SMART1_LOCAL_DOORBELL 0xC8D | ||
197 | #define SMART1_INTR_MASK 0xC89 | ||
198 | #define SMART1_LISTADDR 0xC90 | ||
199 | #define SMART1_LISTLEN 0xC94 | ||
200 | #define SMART1_TAG 0xC97 | ||
201 | #define SMART1_COMPLETE_ADDR 0xC98 | ||
202 | #define SMART1_LISTSTATUS 0xC9E | ||
203 | |||
204 | #define CHANNEL_BUSY 0x01 | ||
205 | #define CHANNEL_CLEAR 0x02 | ||
206 | |||
207 | static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c) | ||
208 | { | ||
209 | /* | ||
210 | * This __u16 is actually a bunch of control flags on SMART | ||
211 | * and below. We want them all to be zero. | ||
212 | */ | ||
213 | c->hdr.size = 0; | ||
214 | |||
215 | outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); | ||
216 | |||
217 | outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR); | ||
218 | outw(c->size, h->io_mem_addr + SMART1_LISTLEN); | ||
219 | |||
220 | outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); | ||
221 | } | ||
222 | |||
223 | static void smart1_intr_mask(ctlr_info_t *h, unsigned long val) | ||
224 | { | ||
225 | if (val == 1) { | ||
226 | outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); | ||
227 | outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); | ||
228 | outb(0x01, h->io_mem_addr + SMART1_INTR_MASK); | ||
229 | outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK); | ||
230 | } else { | ||
231 | outb(0, h->io_mem_addr + 0xC8E); | ||
232 | } | ||
233 | } | ||
234 | |||
235 | static unsigned long smart1_fifo_full(ctlr_info_t *h) | ||
236 | { | ||
237 | unsigned char chan; | ||
238 | chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR; | ||
239 | return chan; | ||
240 | } | ||
241 | |||
242 | static unsigned long smart1_completed(ctlr_info_t *h) | ||
243 | { | ||
244 | unsigned char status; | ||
245 | unsigned long cmd; | ||
246 | |||
247 | if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) { | ||
248 | outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); | ||
249 | |||
250 | cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR); | ||
251 | status = inb(h->io_mem_addr + SMART1_LISTSTATUS); | ||
252 | |||
253 | outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL); | ||
254 | |||
255 | /* | ||
256 | * this is x86 (actually compaq x86) only, so it's ok | ||
257 | */ | ||
258 | if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status; | ||
259 | } else { | ||
260 | cmd = 0; | ||
261 | } | ||
262 | return cmd; | ||
263 | } | ||
264 | |||
265 | static unsigned long smart1_intr_pending(ctlr_info_t *h) | ||
266 | { | ||
267 | unsigned char chan; | ||
268 | chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY; | ||
269 | return chan; | ||
270 | } | ||
271 | |||
272 | static struct access_method smart1_access = { | ||
273 | smart1_submit_command, | ||
274 | smart1_intr_mask, | ||
275 | smart1_fifo_full, | ||
276 | smart1_intr_pending, | ||
277 | smart1_completed, | ||
278 | }; | ||
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d70eba30003a..0afa6c8c3857 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) | |||
430 | 430 | ||
431 | static void zram_page_end_io(struct bio *bio) | 431 | static void zram_page_end_io(struct bio *bio) |
432 | { | 432 | { |
433 | struct page *page = bio->bi_io_vec[0].bv_page; | 433 | struct page *page = bio_first_page_all(bio); |
434 | 434 | ||
435 | page_endio(page, op_is_write(bio_op(bio)), | 435 | page_endio(page, op_is_write(bio_op(bio)), |
436 | blk_status_to_errno(bio->bi_status)); | 436 | blk_status_to_errno(bio->bi_status)); |
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 2a953efec4e1..10c08982185a 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig | |||
@@ -27,13 +27,6 @@ config NVM_DEBUG | |||
27 | 27 | ||
28 | It is required to create/remove targets without IOCTLs. | 28 | It is required to create/remove targets without IOCTLs. |
29 | 29 | ||
30 | config NVM_RRPC | ||
31 | tristate "Round-robin Hybrid Open-Channel SSD target" | ||
32 | ---help--- | ||
33 | Allows an open-channel SSD to be exposed as a block device to the | ||
34 | host. The target is implemented using a linear mapping table and | ||
35 | cost-based garbage collection. It is optimized for 4K IO sizes. | ||
36 | |||
37 | config NVM_PBLK | 30 | config NVM_PBLK |
38 | tristate "Physical Block Device Open-Channel SSD target" | 31 | tristate "Physical Block Device Open-Channel SSD target" |
39 | ---help--- | 32 | ---help--- |
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 2c3fd9d2c08c..97d9d7c71550 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile | |||
@@ -4,7 +4,6 @@ | |||
4 | # | 4 | # |
5 | 5 | ||
6 | obj-$(CONFIG_NVM) := core.o | 6 | obj-$(CONFIG_NVM) := core.o |
7 | obj-$(CONFIG_NVM_RRPC) += rrpc.o | ||
8 | obj-$(CONFIG_NVM_PBLK) += pblk.o | 7 | obj-$(CONFIG_NVM_PBLK) += pblk.o |
9 | pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ | 8 | pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ |
10 | pblk-write.o pblk-cache.o pblk-read.o \ | 9 | pblk-write.o pblk-cache.o pblk-read.o \ |
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 83249b43dd06..dcc9e621e651 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c | |||
@@ -45,12 +45,6 @@ struct nvm_dev_map { | |||
45 | int nr_chnls; | 45 | int nr_chnls; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | struct nvm_area { | ||
49 | struct list_head list; | ||
50 | sector_t begin; | ||
51 | sector_t end; /* end is excluded */ | ||
52 | }; | ||
53 | |||
54 | static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) | 48 | static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) |
55 | { | 49 | { |
56 | struct nvm_target *tgt; | 50 | struct nvm_target *tgt; |
@@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) | |||
62 | return NULL; | 56 | return NULL; |
63 | } | 57 | } |
64 | 58 | ||
59 | static bool nvm_target_exists(const char *name) | ||
60 | { | ||
61 | struct nvm_dev *dev; | ||
62 | struct nvm_target *tgt; | ||
63 | bool ret = false; | ||
64 | |||
65 | down_write(&nvm_lock); | ||
66 | list_for_each_entry(dev, &nvm_devices, devices) { | ||
67 | mutex_lock(&dev->mlock); | ||
68 | list_for_each_entry(tgt, &dev->targets, list) { | ||
69 | if (!strcmp(name, tgt->disk->disk_name)) { | ||
70 | ret = true; | ||
71 | mutex_unlock(&dev->mlock); | ||
72 | goto out; | ||
73 | } | ||
74 | } | ||
75 | mutex_unlock(&dev->mlock); | ||
76 | } | ||
77 | |||
78 | out: | ||
79 | up_write(&nvm_lock); | ||
80 | return ret; | ||
81 | } | ||
82 | |||
65 | static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) | 83 | static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) |
66 | { | 84 | { |
67 | int i; | 85 | int i; |
@@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) | |||
104 | if (clear) { | 122 | if (clear) { |
105 | for (j = 0; j < ch_map->nr_luns; j++) { | 123 | for (j = 0; j < ch_map->nr_luns; j++) { |
106 | int lun = j + lun_offs[j]; | 124 | int lun = j + lun_offs[j]; |
107 | int lunid = (ch * dev->geo.luns_per_chnl) + lun; | 125 | int lunid = (ch * dev->geo.nr_luns) + lun; |
108 | 126 | ||
109 | WARN_ON(!test_and_clear_bit(lunid, | 127 | WARN_ON(!test_and_clear_bit(lunid, |
110 | dev->lun_map)); | 128 | dev->lun_map)); |
@@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) | |||
122 | } | 140 | } |
123 | 141 | ||
124 | static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, | 142 | static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, |
125 | int lun_begin, int lun_end) | 143 | u16 lun_begin, u16 lun_end, |
144 | u16 op) | ||
126 | { | 145 | { |
127 | struct nvm_tgt_dev *tgt_dev = NULL; | 146 | struct nvm_tgt_dev *tgt_dev = NULL; |
128 | struct nvm_dev_map *dev_rmap = dev->rmap; | 147 | struct nvm_dev_map *dev_rmap = dev->rmap; |
@@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, | |||
130 | struct ppa_addr *luns; | 149 | struct ppa_addr *luns; |
131 | int nr_luns = lun_end - lun_begin + 1; | 150 | int nr_luns = lun_end - lun_begin + 1; |
132 | int luns_left = nr_luns; | 151 | int luns_left = nr_luns; |
133 | int nr_chnls = nr_luns / dev->geo.luns_per_chnl; | 152 | int nr_chnls = nr_luns / dev->geo.nr_luns; |
134 | int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; | 153 | int nr_chnls_mod = nr_luns % dev->geo.nr_luns; |
135 | int bch = lun_begin / dev->geo.luns_per_chnl; | 154 | int bch = lun_begin / dev->geo.nr_luns; |
136 | int blun = lun_begin % dev->geo.luns_per_chnl; | 155 | int blun = lun_begin % dev->geo.nr_luns; |
137 | int lunid = 0; | 156 | int lunid = 0; |
138 | int lun_balanced = 1; | 157 | int lun_balanced = 1; |
139 | int prev_nr_luns; | 158 | int prev_nr_luns; |
@@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, | |||
154 | if (!luns) | 173 | if (!luns) |
155 | goto err_luns; | 174 | goto err_luns; |
156 | 175 | ||
157 | prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? | 176 | prev_nr_luns = (luns_left > dev->geo.nr_luns) ? |
158 | dev->geo.luns_per_chnl : luns_left; | 177 | dev->geo.nr_luns : luns_left; |
159 | for (i = 0; i < nr_chnls; i++) { | 178 | for (i = 0; i < nr_chnls; i++) { |
160 | struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; | 179 | struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; |
161 | int *lun_roffs = ch_rmap->lun_offs; | 180 | int *lun_roffs = ch_rmap->lun_offs; |
162 | struct nvm_ch_map *ch_map = &dev_map->chnls[i]; | 181 | struct nvm_ch_map *ch_map = &dev_map->chnls[i]; |
163 | int *lun_offs; | 182 | int *lun_offs; |
164 | int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? | 183 | int luns_in_chnl = (luns_left > dev->geo.nr_luns) ? |
165 | dev->geo.luns_per_chnl : luns_left; | 184 | dev->geo.nr_luns : luns_left; |
166 | 185 | ||
167 | if (lun_balanced && prev_nr_luns != luns_in_chnl) | 186 | if (lun_balanced && prev_nr_luns != luns_in_chnl) |
168 | lun_balanced = 0; | 187 | lun_balanced = 0; |
@@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, | |||
199 | memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); | 218 | memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); |
200 | /* Target device only owns a portion of the physical device */ | 219 | /* Target device only owns a portion of the physical device */ |
201 | tgt_dev->geo.nr_chnls = nr_chnls; | 220 | tgt_dev->geo.nr_chnls = nr_chnls; |
202 | tgt_dev->geo.nr_luns = nr_luns; | 221 | tgt_dev->geo.all_luns = nr_luns; |
203 | tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; | 222 | tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1; |
223 | tgt_dev->geo.op = op; | ||
204 | tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; | 224 | tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; |
205 | tgt_dev->q = dev->q; | 225 | tgt_dev->q = dev->q; |
206 | tgt_dev->map = dev_map; | 226 | tgt_dev->map = dev_map; |
@@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = { | |||
226 | .owner = THIS_MODULE, | 246 | .owner = THIS_MODULE, |
227 | }; | 247 | }; |
228 | 248 | ||
229 | static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) | 249 | static struct nvm_tgt_type *__nvm_find_target_type(const char *name) |
230 | { | 250 | { |
231 | struct nvm_tgt_type *tmp, *tt = NULL; | 251 | struct nvm_tgt_type *tt; |
232 | 252 | ||
233 | if (lock) | 253 | list_for_each_entry(tt, &nvm_tgt_types, list) |
234 | down_write(&nvm_tgtt_lock); | 254 | if (!strcmp(name, tt->name)) |
255 | return tt; | ||
235 | 256 | ||
236 | list_for_each_entry(tmp, &nvm_tgt_types, list) | 257 | return NULL; |
237 | if (!strcmp(name, tmp->name)) { | 258 | } |
238 | tt = tmp; | 259 | |
239 | break; | 260 | static struct nvm_tgt_type *nvm_find_target_type(const char *name) |
240 | } | 261 | { |
262 | struct nvm_tgt_type *tt; | ||
263 | |||
264 | down_write(&nvm_tgtt_lock); | ||
265 | tt = __nvm_find_target_type(name); | ||
266 | up_write(&nvm_tgtt_lock); | ||
241 | 267 | ||
242 | if (lock) | ||
243 | up_write(&nvm_tgtt_lock); | ||
244 | return tt; | 268 | return tt; |
245 | } | 269 | } |
246 | 270 | ||
271 | static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, | ||
272 | int lun_end) | ||
273 | { | ||
274 | if (lun_begin > lun_end || lun_end >= geo->all_luns) { | ||
275 | pr_err("nvm: lun out of bound (%u:%u > %u)\n", | ||
276 | lun_begin, lun_end, geo->all_luns - 1); | ||
277 | return -EINVAL; | ||
278 | } | ||
279 | |||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | static int __nvm_config_simple(struct nvm_dev *dev, | ||
284 | struct nvm_ioctl_create_simple *s) | ||
285 | { | ||
286 | struct nvm_geo *geo = &dev->geo; | ||
287 | |||
288 | if (s->lun_begin == -1 && s->lun_end == -1) { | ||
289 | s->lun_begin = 0; | ||
290 | s->lun_end = geo->all_luns - 1; | ||
291 | } | ||
292 | |||
293 | return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); | ||
294 | } | ||
295 | |||
296 | static int __nvm_config_extended(struct nvm_dev *dev, | ||
297 | struct nvm_ioctl_create_extended *e) | ||
298 | { | ||
299 | struct nvm_geo *geo = &dev->geo; | ||
300 | |||
301 | if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { | ||
302 | e->lun_begin = 0; | ||
303 | e->lun_end = dev->geo.all_luns - 1; | ||
304 | } | ||
305 | |||
306 | /* op not set falls into target's default */ | ||
307 | if (e->op == 0xFFFF) | ||
308 | e->op = NVM_TARGET_DEFAULT_OP; | ||
309 | |||
310 | if (e->op < NVM_TARGET_MIN_OP || | ||
311 | e->op > NVM_TARGET_MAX_OP) { | ||
312 | pr_err("nvm: invalid over provisioning value\n"); | ||
313 | return -EINVAL; | ||
314 | } | ||
315 | |||
316 | return nvm_config_check_luns(geo, e->lun_begin, e->lun_end); | ||
317 | } | ||
318 | |||
247 | static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) | 319 | static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) |
248 | { | 320 | { |
249 | struct nvm_ioctl_create_simple *s = &create->conf.s; | 321 | struct nvm_ioctl_create_extended e; |
250 | struct request_queue *tqueue; | 322 | struct request_queue *tqueue; |
251 | struct gendisk *tdisk; | 323 | struct gendisk *tdisk; |
252 | struct nvm_tgt_type *tt; | 324 | struct nvm_tgt_type *tt; |
@@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) | |||
255 | void *targetdata; | 327 | void *targetdata; |
256 | int ret; | 328 | int ret; |
257 | 329 | ||
258 | tt = nvm_find_target_type(create->tgttype, 1); | 330 | switch (create->conf.type) { |
331 | case NVM_CONFIG_TYPE_SIMPLE: | ||
332 | ret = __nvm_config_simple(dev, &create->conf.s); | ||
333 | if (ret) | ||
334 | return ret; | ||
335 | |||
336 | e.lun_begin = create->conf.s.lun_begin; | ||
337 | e.lun_end = create->conf.s.lun_end; | ||
338 | e.op = NVM_TARGET_DEFAULT_OP; | ||
339 | break; | ||
340 | case NVM_CONFIG_TYPE_EXTENDED: | ||
341 | ret = __nvm_config_extended(dev, &create->conf.e); | ||
342 | if (ret) | ||
343 | return ret; | ||
344 | |||
345 | e = create->conf.e; | ||
346 | break; | ||
347 | default: | ||
348 | pr_err("nvm: config type not valid\n"); | ||
349 | return -EINVAL; | ||
350 | } | ||
351 | |||
352 | tt = nvm_find_target_type(create->tgttype); | ||
259 | if (!tt) { | 353 | if (!tt) { |
260 | pr_err("nvm: target type %s not found\n", create->tgttype); | 354 | pr_err("nvm: target type %s not found\n", create->tgttype); |
261 | return -EINVAL; | 355 | return -EINVAL; |
262 | } | 356 | } |
263 | 357 | ||
264 | mutex_lock(&dev->mlock); | 358 | if (nvm_target_exists(create->tgtname)) { |
265 | t = nvm_find_target(dev, create->tgtname); | 359 | pr_err("nvm: target name already exists (%s)\n", |
266 | if (t) { | 360 | create->tgtname); |
267 | pr_err("nvm: target name already exists.\n"); | ||
268 | mutex_unlock(&dev->mlock); | ||
269 | return -EINVAL; | 361 | return -EINVAL; |
270 | } | 362 | } |
271 | mutex_unlock(&dev->mlock); | ||
272 | 363 | ||
273 | ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); | 364 | ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); |
274 | if (ret) | 365 | if (ret) |
275 | return ret; | 366 | return ret; |
276 | 367 | ||
@@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) | |||
280 | goto err_reserve; | 371 | goto err_reserve; |
281 | } | 372 | } |
282 | 373 | ||
283 | tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); | 374 | tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); |
284 | if (!tgt_dev) { | 375 | if (!tgt_dev) { |
285 | pr_err("nvm: could not create target device\n"); | 376 | pr_err("nvm: could not create target device\n"); |
286 | ret = -ENOMEM; | 377 | ret = -ENOMEM; |
@@ -350,7 +441,7 @@ err_dev: | |||
350 | err_t: | 441 | err_t: |
351 | kfree(t); | 442 | kfree(t); |
352 | err_reserve: | 443 | err_reserve: |
353 | nvm_release_luns_err(dev, s->lun_begin, s->lun_end); | 444 | nvm_release_luns_err(dev, e.lun_begin, e.lun_end); |
354 | return ret; | 445 | return ret; |
355 | } | 446 | } |
356 | 447 | ||
@@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev) | |||
420 | for (i = 0; i < dev->geo.nr_chnls; i++) { | 511 | for (i = 0; i < dev->geo.nr_chnls; i++) { |
421 | struct nvm_ch_map *ch_rmap; | 512 | struct nvm_ch_map *ch_rmap; |
422 | int *lun_roffs; | 513 | int *lun_roffs; |
423 | int luns_in_chnl = dev->geo.luns_per_chnl; | 514 | int luns_in_chnl = dev->geo.nr_luns; |
424 | 515 | ||
425 | ch_rmap = &rmap->chnls[i]; | 516 | ch_rmap = &rmap->chnls[i]; |
426 | 517 | ||
@@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) | |||
524 | nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); | 615 | nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); |
525 | } | 616 | } |
526 | 617 | ||
527 | void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, | ||
528 | int len) | ||
529 | { | ||
530 | struct nvm_geo *geo = &dev->geo; | ||
531 | struct nvm_dev_map *dev_rmap = dev->rmap; | ||
532 | u64 i; | ||
533 | |||
534 | for (i = 0; i < len; i++) { | ||
535 | struct nvm_ch_map *ch_rmap; | ||
536 | int *lun_roffs; | ||
537 | struct ppa_addr gaddr; | ||
538 | u64 pba = le64_to_cpu(entries[i]); | ||
539 | u64 diff; | ||
540 | |||
541 | if (!pba) | ||
542 | continue; | ||
543 | |||
544 | gaddr = linear_to_generic_addr(geo, pba); | ||
545 | ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; | ||
546 | lun_roffs = ch_rmap->lun_offs; | ||
547 | |||
548 | diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + | ||
549 | (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; | ||
550 | |||
551 | entries[i] -= cpu_to_le64(diff); | ||
552 | } | ||
553 | } | ||
554 | EXPORT_SYMBOL(nvm_part_to_tgt); | ||
555 | |||
556 | int nvm_register_tgt_type(struct nvm_tgt_type *tt) | 618 | int nvm_register_tgt_type(struct nvm_tgt_type *tt) |
557 | { | 619 | { |
558 | int ret = 0; | 620 | int ret = 0; |
559 | 621 | ||
560 | down_write(&nvm_tgtt_lock); | 622 | down_write(&nvm_tgtt_lock); |
561 | if (nvm_find_target_type(tt->name, 0)) | 623 | if (__nvm_find_target_type(tt->name)) |
562 | ret = -EEXIST; | 624 | ret = -EEXIST; |
563 | else | 625 | else |
564 | list_add(&tt->list, &nvm_tgt_types); | 626 | list_add(&tt->list, &nvm_tgt_types); |
@@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) | |||
726 | } | 788 | } |
727 | EXPORT_SYMBOL(nvm_submit_io_sync); | 789 | EXPORT_SYMBOL(nvm_submit_io_sync); |
728 | 790 | ||
729 | int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, | ||
730 | int nr_ppas) | ||
731 | { | ||
732 | struct nvm_geo *geo = &tgt_dev->geo; | ||
733 | struct nvm_rq rqd; | ||
734 | int ret; | ||
735 | |||
736 | memset(&rqd, 0, sizeof(struct nvm_rq)); | ||
737 | |||
738 | rqd.opcode = NVM_OP_ERASE; | ||
739 | rqd.flags = geo->plane_mode >> 1; | ||
740 | |||
741 | ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); | ||
742 | if (ret) | ||
743 | return ret; | ||
744 | |||
745 | ret = nvm_submit_io_sync(tgt_dev, &rqd); | ||
746 | if (ret) { | ||
747 | pr_err("rrpr: erase I/O submission failed: %d\n", ret); | ||
748 | goto free_ppa_list; | ||
749 | } | ||
750 | |||
751 | free_ppa_list: | ||
752 | nvm_free_rqd_ppalist(tgt_dev, &rqd); | ||
753 | |||
754 | return ret; | ||
755 | } | ||
756 | EXPORT_SYMBOL(nvm_erase_sync); | ||
757 | |||
758 | int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb, | ||
759 | nvm_l2p_update_fn *update_l2p, void *priv) | ||
760 | { | ||
761 | struct nvm_dev *dev = tgt_dev->parent; | ||
762 | |||
763 | if (!dev->ops->get_l2p_tbl) | ||
764 | return 0; | ||
765 | |||
766 | return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv); | ||
767 | } | ||
768 | EXPORT_SYMBOL(nvm_get_l2p_tbl); | ||
769 | |||
770 | int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len) | ||
771 | { | ||
772 | struct nvm_dev *dev = tgt_dev->parent; | ||
773 | struct nvm_geo *geo = &dev->geo; | ||
774 | struct nvm_area *area, *prev, *next; | ||
775 | sector_t begin = 0; | ||
776 | sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; | ||
777 | |||
778 | if (len > max_sectors) | ||
779 | return -EINVAL; | ||
780 | |||
781 | area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL); | ||
782 | if (!area) | ||
783 | return -ENOMEM; | ||
784 | |||
785 | prev = NULL; | ||
786 | |||
787 | spin_lock(&dev->lock); | ||
788 | list_for_each_entry(next, &dev->area_list, list) { | ||
789 | if (begin + len > next->begin) { | ||
790 | begin = next->end; | ||
791 | prev = next; | ||
792 | continue; | ||
793 | } | ||
794 | break; | ||
795 | } | ||
796 | |||
797 | if ((begin + len) > max_sectors) { | ||
798 | spin_unlock(&dev->lock); | ||
799 | kfree(area); | ||
800 | return -EINVAL; | ||
801 | } | ||
802 | |||
803 | area->begin = *lba = begin; | ||
804 | area->end = begin + len; | ||
805 | |||
806 | if (prev) /* insert into sorted order */ | ||
807 | list_add(&area->list, &prev->list); | ||
808 | else | ||
809 | list_add(&area->list, &dev->area_list); | ||
810 | spin_unlock(&dev->lock); | ||
811 | |||
812 | return 0; | ||
813 | } | ||
814 | EXPORT_SYMBOL(nvm_get_area); | ||
815 | |||
816 | void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) | ||
817 | { | ||
818 | struct nvm_dev *dev = tgt_dev->parent; | ||
819 | struct nvm_area *area; | ||
820 | |||
821 | spin_lock(&dev->lock); | ||
822 | list_for_each_entry(area, &dev->area_list, list) { | ||
823 | if (area->begin != begin) | ||
824 | continue; | ||
825 | |||
826 | list_del(&area->list); | ||
827 | spin_unlock(&dev->lock); | ||
828 | kfree(area); | ||
829 | return; | ||
830 | } | ||
831 | spin_unlock(&dev->lock); | ||
832 | } | ||
833 | EXPORT_SYMBOL(nvm_put_area); | ||
834 | |||
835 | void nvm_end_io(struct nvm_rq *rqd) | 791 | void nvm_end_io(struct nvm_rq *rqd) |
836 | { | 792 | { |
837 | struct nvm_tgt_dev *tgt_dev = rqd->dev; | 793 | struct nvm_tgt_dev *tgt_dev = rqd->dev; |
@@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) | |||
858 | struct nvm_geo *geo = &dev->geo; | 814 | struct nvm_geo *geo = &dev->geo; |
859 | int blk, offset, pl, blktype; | 815 | int blk, offset, pl, blktype; |
860 | 816 | ||
861 | if (nr_blks != geo->blks_per_lun * geo->plane_mode) | 817 | if (nr_blks != geo->nr_chks * geo->plane_mode) |
862 | return -EINVAL; | 818 | return -EINVAL; |
863 | 819 | ||
864 | for (blk = 0; blk < geo->blks_per_lun; blk++) { | 820 | for (blk = 0; blk < geo->nr_chks; blk++) { |
865 | offset = blk * geo->plane_mode; | 821 | offset = blk * geo->plane_mode; |
866 | blktype = blks[offset]; | 822 | blktype = blks[offset]; |
867 | 823 | ||
@@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) | |||
877 | blks[blk] = blktype; | 833 | blks[blk] = blktype; |
878 | } | 834 | } |
879 | 835 | ||
880 | return geo->blks_per_lun; | 836 | return geo->nr_chks; |
881 | } | 837 | } |
882 | EXPORT_SYMBOL(nvm_bb_tbl_fold); | 838 | EXPORT_SYMBOL(nvm_bb_tbl_fold); |
883 | 839 | ||
@@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, | |||
892 | } | 848 | } |
893 | EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); | 849 | EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); |
894 | 850 | ||
895 | static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) | ||
896 | { | ||
897 | struct nvm_geo *geo = &dev->geo; | ||
898 | int i; | ||
899 | |||
900 | dev->lps_per_blk = geo->pgs_per_blk; | ||
901 | dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); | ||
902 | if (!dev->lptbl) | ||
903 | return -ENOMEM; | ||
904 | |||
905 | /* Just a linear array */ | ||
906 | for (i = 0; i < dev->lps_per_blk; i++) | ||
907 | dev->lptbl[i] = i; | ||
908 | |||
909 | return 0; | ||
910 | } | ||
911 | |||
912 | static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) | ||
913 | { | ||
914 | int i, p; | ||
915 | struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; | ||
916 | |||
917 | if (!mlc->num_pairs) | ||
918 | return 0; | ||
919 | |||
920 | dev->lps_per_blk = mlc->num_pairs; | ||
921 | dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); | ||
922 | if (!dev->lptbl) | ||
923 | return -ENOMEM; | ||
924 | |||
925 | /* The lower page table encoding consists of a list of bytes, where each | ||
926 | * has a lower and an upper half. The first half byte maintains the | ||
927 | * increment value and every value after is an offset added to the | ||
928 | * previous incrementation value | ||
929 | */ | ||
930 | dev->lptbl[0] = mlc->pairs[0] & 0xF; | ||
931 | for (i = 1; i < dev->lps_per_blk; i++) { | ||
932 | p = mlc->pairs[i >> 1]; | ||
933 | if (i & 0x1) /* upper */ | ||
934 | dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); | ||
935 | else /* lower */ | ||
936 | dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); | ||
937 | } | ||
938 | |||
939 | return 0; | ||
940 | } | ||
941 | |||
942 | static int nvm_core_init(struct nvm_dev *dev) | 851 | static int nvm_core_init(struct nvm_dev *dev) |
943 | { | 852 | { |
944 | struct nvm_id *id = &dev->identity; | 853 | struct nvm_id *id = &dev->identity; |
@@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev) | |||
946 | struct nvm_geo *geo = &dev->geo; | 855 | struct nvm_geo *geo = &dev->geo; |
947 | int ret; | 856 | int ret; |
948 | 857 | ||
858 | memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); | ||
859 | |||
860 | if (grp->mtype != 0) { | ||
861 | pr_err("nvm: memory type not supported\n"); | ||
862 | return -EINVAL; | ||
863 | } | ||
864 | |||
949 | /* Whole device values */ | 865 | /* Whole device values */ |
950 | geo->nr_chnls = grp->num_ch; | 866 | geo->nr_chnls = grp->num_ch; |
951 | geo->luns_per_chnl = grp->num_lun; | 867 | geo->nr_luns = grp->num_lun; |
952 | 868 | ||
953 | /* Generic device values */ | 869 | /* Generic device geometry values */ |
954 | geo->pgs_per_blk = grp->num_pg; | 870 | geo->ws_min = grp->ws_min; |
955 | geo->blks_per_lun = grp->num_blk; | 871 | geo->ws_opt = grp->ws_opt; |
956 | geo->nr_planes = grp->num_pln; | 872 | geo->ws_seq = grp->ws_seq; |
957 | geo->fpg_size = grp->fpg_sz; | 873 | geo->ws_per_chk = grp->ws_per_chk; |
958 | geo->pfpg_size = grp->fpg_sz * grp->num_pln; | 874 | geo->nr_chks = grp->num_chk; |
959 | geo->sec_size = grp->csecs; | 875 | geo->sec_size = grp->csecs; |
960 | geo->oob_size = grp->sos; | 876 | geo->oob_size = grp->sos; |
961 | geo->sec_per_pg = grp->fpg_sz / grp->csecs; | ||
962 | geo->mccap = grp->mccap; | 877 | geo->mccap = grp->mccap; |
963 | memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); | ||
964 | |||
965 | geo->plane_mode = NVM_PLANE_SINGLE; | ||
966 | geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; | 878 | geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; |
967 | 879 | ||
968 | if (grp->mpos & 0x020202) | 880 | geo->sec_per_chk = grp->clba; |
969 | geo->plane_mode = NVM_PLANE_DOUBLE; | 881 | geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks; |
970 | if (grp->mpos & 0x040404) | 882 | geo->all_luns = geo->nr_luns * geo->nr_chnls; |
971 | geo->plane_mode = NVM_PLANE_QUAD; | ||
972 | 883 | ||
973 | if (grp->mtype != 0) { | 884 | /* 1.2 spec device geometry values */ |
974 | pr_err("nvm: memory type not supported\n"); | 885 | geo->plane_mode = 1 << geo->ws_seq; |
975 | return -EINVAL; | 886 | geo->nr_planes = geo->ws_opt / geo->ws_min; |
976 | } | 887 | geo->sec_per_pg = geo->ws_min; |
977 | |||
978 | /* calculated values */ | ||
979 | geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; | 888 | geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; |
980 | geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk; | ||
981 | geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun; | ||
982 | geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls; | ||
983 | 889 | ||
984 | dev->total_secs = geo->nr_luns * geo->sec_per_lun; | 890 | dev->total_secs = geo->all_luns * geo->sec_per_lun; |
985 | dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), | 891 | dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), |
986 | sizeof(unsigned long), GFP_KERNEL); | 892 | sizeof(unsigned long), GFP_KERNEL); |
987 | if (!dev->lun_map) | 893 | if (!dev->lun_map) |
988 | return -ENOMEM; | 894 | return -ENOMEM; |
989 | 895 | ||
990 | switch (grp->fmtype) { | ||
991 | case NVM_ID_FMTYPE_SLC: | ||
992 | if (nvm_init_slc_tbl(dev, grp)) { | ||
993 | ret = -ENOMEM; | ||
994 | goto err_fmtype; | ||
995 | } | ||
996 | break; | ||
997 | case NVM_ID_FMTYPE_MLC: | ||
998 | if (nvm_init_mlc_tbl(dev, grp)) { | ||
999 | ret = -ENOMEM; | ||
1000 | goto err_fmtype; | ||
1001 | } | ||
1002 | break; | ||
1003 | default: | ||
1004 | pr_err("nvm: flash type not supported\n"); | ||
1005 | ret = -EINVAL; | ||
1006 | goto err_fmtype; | ||
1007 | } | ||
1008 | |||
1009 | INIT_LIST_HEAD(&dev->area_list); | 896 | INIT_LIST_HEAD(&dev->area_list); |
1010 | INIT_LIST_HEAD(&dev->targets); | 897 | INIT_LIST_HEAD(&dev->targets); |
1011 | mutex_init(&dev->mlock); | 898 | mutex_init(&dev->mlock); |
@@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev) | |||
1031 | dev->ops->destroy_dma_pool(dev->dma_pool); | 918 | dev->ops->destroy_dma_pool(dev->dma_pool); |
1032 | 919 | ||
1033 | nvm_unregister_map(dev); | 920 | nvm_unregister_map(dev); |
1034 | kfree(dev->lptbl); | ||
1035 | kfree(dev->lun_map); | 921 | kfree(dev->lun_map); |
1036 | kfree(dev); | 922 | kfree(dev); |
1037 | } | 923 | } |
@@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev) | |||
1062 | 948 | ||
1063 | pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", | 949 | pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", |
1064 | dev->name, geo->sec_per_pg, geo->nr_planes, | 950 | dev->name, geo->sec_per_pg, geo->nr_planes, |
1065 | geo->pgs_per_blk, geo->blks_per_lun, | 951 | geo->ws_per_chk, geo->nr_chks, |
1066 | geo->nr_luns, geo->nr_chnls); | 952 | geo->all_luns, geo->nr_chnls); |
1067 | return 0; | 953 | return 0; |
1068 | err: | 954 | err: |
1069 | pr_err("nvm: failed to initialize nvm\n"); | 955 | pr_err("nvm: failed to initialize nvm\n"); |
@@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister); | |||
1135 | static int __nvm_configure_create(struct nvm_ioctl_create *create) | 1021 | static int __nvm_configure_create(struct nvm_ioctl_create *create) |
1136 | { | 1022 | { |
1137 | struct nvm_dev *dev; | 1023 | struct nvm_dev *dev; |
1138 | struct nvm_ioctl_create_simple *s; | ||
1139 | 1024 | ||
1140 | down_write(&nvm_lock); | 1025 | down_write(&nvm_lock); |
1141 | dev = nvm_find_nvm_dev(create->dev); | 1026 | dev = nvm_find_nvm_dev(create->dev); |
@@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) | |||
1146 | return -EINVAL; | 1031 | return -EINVAL; |
1147 | } | 1032 | } |
1148 | 1033 | ||
1149 | if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { | ||
1150 | pr_err("nvm: config type not valid\n"); | ||
1151 | return -EINVAL; | ||
1152 | } | ||
1153 | s = &create->conf.s; | ||
1154 | |||
1155 | if (s->lun_begin == -1 && s->lun_end == -1) { | ||
1156 | s->lun_begin = 0; | ||
1157 | s->lun_end = dev->geo.nr_luns - 1; | ||
1158 | } | ||
1159 | |||
1160 | if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { | ||
1161 | pr_err("nvm: lun out of bound (%u:%u > %u)\n", | ||
1162 | s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); | ||
1163 | return -EINVAL; | ||
1164 | } | ||
1165 | |||
1166 | return nvm_create_tgt(dev, create); | 1034 | return nvm_create_tgt(dev, create); |
1167 | } | 1035 | } |
1168 | 1036 | ||
@@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) | |||
1262 | if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) | 1130 | if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) |
1263 | return -EFAULT; | 1131 | return -EFAULT; |
1264 | 1132 | ||
1133 | if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && | ||
1134 | create.conf.e.rsv != 0) { | ||
1135 | pr_err("nvm: reserved config field in use\n"); | ||
1136 | return -EINVAL; | ||
1137 | } | ||
1138 | |||
1265 | create.dev[DISK_NAME_LEN - 1] = '\0'; | 1139 | create.dev[DISK_NAME_LEN - 1] = '\0'; |
1266 | create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; | 1140 | create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; |
1267 | create.tgtname[DISK_NAME_LEN - 1] = '\0'; | 1141 | create.tgtname[DISK_NAME_LEN - 1] = '\0'; |
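Note on the core.c changes above: the create ioctl now accepts an extended configuration that carries an explicit over-provisioning percentage next to the LUN range, with 0xFFFF acting as a "not set" sentinel for the LUN bounds and the op field, and a reserved field that must stay zero. The standalone sketch below mirrors those validation rules; OP_DEFAULT/OP_MIN/OP_MAX are illustrative placeholders for the NVM_TARGET_*_OP constants, which are defined outside this hunk.

#include <stdint.h>
#include <stdio.h>

/* Illustrative placeholders; the real NVM_TARGET_*_OP constants are not
 * part of this hunk. */
#define OP_DEFAULT 11
#define OP_MIN      3
#define OP_MAX     80
#define NOT_SET 0xFFFF

struct create_extended {		/* mirrors nvm_ioctl_create_extended */
	uint16_t lun_begin;
	uint16_t lun_end;
	uint16_t op;			/* over-provisioning, percent */
	uint16_t rsv;			/* must be zero */
};

/* Same checks as nvm_config_check_luns() plus __nvm_config_extended();
 * the kernel returns -EINVAL where this sketch returns -1. */
static int check_extended(struct create_extended *e, unsigned all_luns)
{
	if (e->rsv)
		return -1;			/* reserved config field in use */

	if (e->lun_begin == NOT_SET && e->lun_end == NOT_SET) {
		e->lun_begin = 0;		/* default: whole device */
		e->lun_end = all_luns - 1;
	}

	if (e->op == NOT_SET)
		e->op = OP_DEFAULT;		/* fall back to target default */

	if (e->op < OP_MIN || e->op > OP_MAX)
		return -1;			/* invalid over-provisioning */

	if (e->lun_begin > e->lun_end || e->lun_end >= all_luns)
		return -1;			/* LUN range out of bounds */

	return 0;
}

int main(void)
{
	struct create_extended e = { NOT_SET, NOT_SET, NOT_SET, 0 };

	if (!check_extended(&e, 128))
		printf("luns %u-%u, op %u%%\n", (unsigned)e.lun_begin,
		       (unsigned)e.lun_end, (unsigned)e.op);
	return 0;
}

With everything left at the sentinel value, the request expands to the full LUN range and the default over-provisioning budget, which is what the new in-kernel helpers do before reserving LUNs.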
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 0d227ef7d1b9..000fcad38136 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -19,12 +19,16 @@ | |||
19 | 19 | ||
20 | int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) | 20 | int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) |
21 | { | 21 | { |
22 | struct request_queue *q = pblk->dev->q; | ||
22 | struct pblk_w_ctx w_ctx; | 23 | struct pblk_w_ctx w_ctx; |
23 | sector_t lba = pblk_get_lba(bio); | 24 | sector_t lba = pblk_get_lba(bio); |
25 | unsigned long start_time = jiffies; | ||
24 | unsigned int bpos, pos; | 26 | unsigned int bpos, pos; |
25 | int nr_entries = pblk_get_secs(bio); | 27 | int nr_entries = pblk_get_secs(bio); |
26 | int i, ret; | 28 | int i, ret; |
27 | 29 | ||
30 | generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); | ||
31 | |||
28 | /* Update the write buffer head (mem) with the entries that we can | 32 | /* Update the write buffer head (mem) with the entries that we can |
29 | * write. The write in itself cannot fail, so there is no need to | 33 | * write. The write in itself cannot fail, so there is no need to |
30 | * rollback from here on. | 34 | * rollback from here on. |
@@ -67,6 +71,7 @@ retry: | |||
67 | pblk_rl_inserted(&pblk->rl, nr_entries); | 71 | pblk_rl_inserted(&pblk->rl, nr_entries); |
68 | 72 | ||
69 | out: | 73 | out: |
74 | generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); | ||
70 | pblk_write_should_kick(pblk); | 75 | pblk_write_should_kick(pblk); |
71 | return ret; | 76 | return ret; |
72 | } | 77 | } |
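The pblk-cache.c hunk above wraps the buffered write path in the block layer's generic accounting helpers, so writes absorbed by the ring buffer still show up in the disk's I/O statistics even though no request reaches a lower queue. A minimal sketch of the same pattern for a hypothetical bio-based driver, using the helper signatures exactly as the hunk uses them:

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/bio.h>
#include <linux/jiffies.h>

/* Sketch only: account a bio that is completed internally (for example,
 * absorbed by a write buffer) so it remains visible in /proc/diskstats. */
static void mydrv_buffer_write(struct request_queue *q, struct gendisk *disk,
			       struct bio *bio)
{
	unsigned long start_time = jiffies;

	generic_start_io_acct(q, WRITE, bio_sectors(bio), &disk->part0);

	/* ... copy the bio payload into the driver's write buffer ... */

	generic_end_io_acct(q, WRITE, &disk->part0, start_time);
	bio_endio(bio);
}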
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 76516ee84e9a..0487b9340c1d 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work) | |||
32 | struct pblk_line *line; | 32 | struct pblk_line *line; |
33 | int pos; | 33 | int pos; |
34 | 34 | ||
35 | line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; | 35 | line = &pblk->lines[pblk_ppa_to_line(*ppa)]; |
36 | pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); | 36 | pos = pblk_ppa_to_pos(&dev->geo, *ppa); |
37 | 37 | ||
38 | pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", | 38 | pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", |
39 | line->id, pos); | 39 | line->id, pos); |
@@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, | |||
48 | { | 48 | { |
49 | struct nvm_tgt_dev *dev = pblk->dev; | 49 | struct nvm_tgt_dev *dev = pblk->dev; |
50 | struct nvm_geo *geo = &dev->geo; | 50 | struct nvm_geo *geo = &dev->geo; |
51 | int pos = pblk_dev_ppa_to_pos(geo, *ppa); | 51 | int pos = pblk_ppa_to_pos(geo, *ppa); |
52 | 52 | ||
53 | pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); | 53 | pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); |
54 | atomic_long_inc(&pblk->erase_failed); | 54 | atomic_long_inc(&pblk->erase_failed); |
@@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) | |||
66 | { | 66 | { |
67 | struct pblk_line *line; | 67 | struct pblk_line *line; |
68 | 68 | ||
69 | line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; | 69 | line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)]; |
70 | atomic_dec(&line->left_seblks); | 70 | atomic_dec(&line->left_seblks); |
71 | 71 | ||
72 | if (rqd->error) { | 72 | if (rqd->error) { |
@@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) | |||
144 | BUG_ON(pblk_ppa_empty(ppa)); | 144 | BUG_ON(pblk_ppa_empty(ppa)); |
145 | #endif | 145 | #endif |
146 | 146 | ||
147 | line_id = pblk_tgt_ppa_to_line(ppa); | 147 | line_id = pblk_ppa_to_line(ppa); |
148 | line = &pblk->lines[line_id]; | 148 | line = &pblk->lines[line_id]; |
149 | paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); | 149 | paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); |
150 | 150 | ||
@@ -650,7 +650,7 @@ next_rq: | |||
650 | } else { | 650 | } else { |
651 | for (i = 0; i < rqd.nr_ppas; ) { | 651 | for (i = 0; i < rqd.nr_ppas; ) { |
652 | struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); | 652 | struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); |
653 | int pos = pblk_dev_ppa_to_pos(geo, ppa); | 653 | int pos = pblk_ppa_to_pos(geo, ppa); |
654 | int read_type = PBLK_READ_RANDOM; | 654 | int read_type = PBLK_READ_RANDOM; |
655 | 655 | ||
656 | if (pblk_io_aligned(pblk, rq_ppas)) | 656 | if (pblk_io_aligned(pblk, rq_ppas)) |
@@ -668,7 +668,7 @@ next_rq: | |||
668 | } | 668 | } |
669 | 669 | ||
670 | ppa = addr_to_gen_ppa(pblk, paddr, id); | 670 | ppa = addr_to_gen_ppa(pblk, paddr, id); |
671 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 671 | pos = pblk_ppa_to_pos(geo, ppa); |
672 | } | 672 | } |
673 | 673 | ||
674 | if (pblk_boundary_paddr_checks(pblk, paddr + min)) { | 674 | if (pblk_boundary_paddr_checks(pblk, paddr + min)) { |
@@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
742 | cmd_op = NVM_OP_PWRITE; | 742 | cmd_op = NVM_OP_PWRITE; |
743 | flags = pblk_set_progr_mode(pblk, PBLK_WRITE); | 743 | flags = pblk_set_progr_mode(pblk, PBLK_WRITE); |
744 | lba_list = emeta_to_lbas(pblk, line->emeta->buf); | 744 | lba_list = emeta_to_lbas(pblk, line->emeta->buf); |
745 | } else if (dir == PBLK_READ) { | 745 | } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) { |
746 | bio_op = REQ_OP_READ; | 746 | bio_op = REQ_OP_READ; |
747 | cmd_op = NVM_OP_PREAD; | 747 | cmd_op = NVM_OP_PREAD; |
748 | flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); | 748 | flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); |
@@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
802 | if (rqd.error) { | 802 | if (rqd.error) { |
803 | if (dir == PBLK_WRITE) | 803 | if (dir == PBLK_WRITE) |
804 | pblk_log_write_err(pblk, &rqd); | 804 | pblk_log_write_err(pblk, &rqd); |
805 | else | 805 | else if (dir == PBLK_READ) |
806 | pblk_log_read_err(pblk, &rqd); | 806 | pblk_log_read_err(pblk, &rqd); |
807 | } | 807 | } |
808 | 808 | ||
@@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) | |||
816 | { | 816 | { |
817 | u64 bpaddr = pblk_line_smeta_start(pblk, line); | 817 | u64 bpaddr = pblk_line_smeta_start(pblk, line); |
818 | 818 | ||
819 | return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); | 819 | return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV); |
820 | } | 820 | } |
821 | 821 | ||
822 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, | 822 | int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, |
@@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) | |||
854 | struct nvm_geo *geo = &dev->geo; | 854 | struct nvm_geo *geo = &dev->geo; |
855 | 855 | ||
856 | pr_err("pblk: could not sync erase line:%d,blk:%d\n", | 856 | pr_err("pblk: could not sync erase line:%d,blk:%d\n", |
857 | pblk_dev_ppa_to_line(ppa), | 857 | pblk_ppa_to_line(ppa), |
858 | pblk_dev_ppa_to_pos(geo, ppa)); | 858 | pblk_ppa_to_pos(geo, ppa)); |
859 | 859 | ||
860 | rqd.error = ret; | 860 | rqd.error = ret; |
861 | goto out; | 861 | goto out; |
@@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, | |||
979 | 979 | ||
980 | /* Start metadata */ | 980 | /* Start metadata */ |
981 | smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); | 981 | smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); |
982 | smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); | 982 | smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); |
983 | 983 | ||
984 | /* Fill metadata among lines */ | 984 | /* Fill metadata among lines */ |
985 | if (cur) { | 985 | if (cur) { |
@@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, | |||
1032 | lm->sec_per_line); | 1032 | lm->sec_per_line); |
1033 | bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, | 1033 | bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, |
1034 | lm->sec_per_line); | 1034 | lm->sec_per_line); |
1035 | line->sec_in_line -= geo->sec_per_blk; | 1035 | line->sec_in_line -= geo->sec_per_chk; |
1036 | if (bit >= lm->emeta_bb) | 1036 | if (bit >= lm->emeta_bb) |
1037 | nr_bb++; | 1037 | nr_bb++; |
1038 | } | 1038 | } |
@@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) | |||
1145 | } | 1145 | } |
1146 | spin_unlock(&l_mg->free_lock); | 1146 | spin_unlock(&l_mg->free_lock); |
1147 | 1147 | ||
1148 | pblk_rl_free_lines_dec(&pblk->rl, line); | 1148 | pblk_rl_free_lines_dec(&pblk->rl, line, true); |
1149 | 1149 | ||
1150 | if (!pblk_line_init_bb(pblk, line, 0)) { | 1150 | if (!pblk_line_init_bb(pblk, line, 0)) { |
1151 | list_add(&line->list, &l_mg->free_list); | 1151 | list_add(&line->list, &l_mg->free_list); |
@@ -1233,7 +1233,7 @@ retry: | |||
1233 | l_mg->data_line = retry_line; | 1233 | l_mg->data_line = retry_line; |
1234 | spin_unlock(&l_mg->free_lock); | 1234 | spin_unlock(&l_mg->free_lock); |
1235 | 1235 | ||
1236 | pblk_rl_free_lines_dec(&pblk->rl, retry_line); | 1236 | pblk_rl_free_lines_dec(&pblk->rl, line, false); |
1237 | 1237 | ||
1238 | if (pblk_line_erase(pblk, retry_line)) | 1238 | if (pblk_line_erase(pblk, retry_line)) |
1239 | goto retry; | 1239 | goto retry; |
@@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) | |||
1252 | { | 1252 | { |
1253 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 1253 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
1254 | struct pblk_line *line; | 1254 | struct pblk_line *line; |
1255 | int is_next = 0; | ||
1256 | 1255 | ||
1257 | spin_lock(&l_mg->free_lock); | 1256 | spin_lock(&l_mg->free_lock); |
1258 | line = pblk_line_get(pblk); | 1257 | line = pblk_line_get(pblk); |
@@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) | |||
1280 | } else { | 1279 | } else { |
1281 | l_mg->data_next->seq_nr = l_mg->d_seq_nr++; | 1280 | l_mg->data_next->seq_nr = l_mg->d_seq_nr++; |
1282 | l_mg->data_next->type = PBLK_LINETYPE_DATA; | 1281 | l_mg->data_next->type = PBLK_LINETYPE_DATA; |
1283 | is_next = 1; | ||
1284 | } | 1282 | } |
1285 | spin_unlock(&l_mg->free_lock); | 1283 | spin_unlock(&l_mg->free_lock); |
1286 | 1284 | ||
@@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) | |||
1290 | return NULL; | 1288 | return NULL; |
1291 | } | 1289 | } |
1292 | 1290 | ||
1293 | pblk_rl_free_lines_dec(&pblk->rl, line); | ||
1294 | if (is_next) | ||
1295 | pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); | ||
1296 | |||
1297 | retry_setup: | 1291 | retry_setup: |
1298 | if (!pblk_line_init_metadata(pblk, line, NULL)) { | 1292 | if (!pblk_line_init_metadata(pblk, line, NULL)) { |
1299 | line = pblk_line_retry(pblk, line); | 1293 | line = pblk_line_retry(pblk, line); |
@@ -1311,6 +1305,8 @@ retry_setup: | |||
1311 | goto retry_setup; | 1305 | goto retry_setup; |
1312 | } | 1306 | } |
1313 | 1307 | ||
1308 | pblk_rl_free_lines_dec(&pblk->rl, line, true); | ||
1309 | |||
1314 | return line; | 1310 | return line; |
1315 | } | 1311 | } |
1316 | 1312 | ||
@@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk) | |||
1395 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | 1391 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; |
1396 | struct pblk_line *cur, *new = NULL; | 1392 | struct pblk_line *cur, *new = NULL; |
1397 | unsigned int left_seblks; | 1393 | unsigned int left_seblks; |
1398 | int is_next = 0; | ||
1399 | 1394 | ||
1400 | cur = l_mg->data_line; | 1395 | cur = l_mg->data_line; |
1401 | new = l_mg->data_next; | 1396 | new = l_mg->data_next; |
@@ -1444,6 +1439,8 @@ retry_setup: | |||
1444 | goto retry_setup; | 1439 | goto retry_setup; |
1445 | } | 1440 | } |
1446 | 1441 | ||
1442 | pblk_rl_free_lines_dec(&pblk->rl, new, true); | ||
1443 | |||
1447 | /* Allocate next line for preparation */ | 1444 | /* Allocate next line for preparation */ |
1448 | spin_lock(&l_mg->free_lock); | 1445 | spin_lock(&l_mg->free_lock); |
1449 | l_mg->data_next = pblk_line_get(pblk); | 1446 | l_mg->data_next = pblk_line_get(pblk); |
@@ -1457,13 +1454,9 @@ retry_setup: | |||
1457 | } else { | 1454 | } else { |
1458 | l_mg->data_next->seq_nr = l_mg->d_seq_nr++; | 1455 | l_mg->data_next->seq_nr = l_mg->d_seq_nr++; |
1459 | l_mg->data_next->type = PBLK_LINETYPE_DATA; | 1456 | l_mg->data_next->type = PBLK_LINETYPE_DATA; |
1460 | is_next = 1; | ||
1461 | } | 1457 | } |
1462 | spin_unlock(&l_mg->free_lock); | 1458 | spin_unlock(&l_mg->free_lock); |
1463 | 1459 | ||
1464 | if (is_next) | ||
1465 | pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); | ||
1466 | |||
1467 | out: | 1460 | out: |
1468 | return new; | 1461 | return new; |
1469 | } | 1462 | } |
@@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) | |||
1561 | struct nvm_geo *geo = &dev->geo; | 1554 | struct nvm_geo *geo = &dev->geo; |
1562 | 1555 | ||
1563 | pr_err("pblk: could not async erase line:%d,blk:%d\n", | 1556 | pr_err("pblk: could not async erase line:%d,blk:%d\n", |
1564 | pblk_dev_ppa_to_line(ppa), | 1557 | pblk_ppa_to_line(ppa), |
1565 | pblk_dev_ppa_to_pos(geo, ppa)); | 1558 | pblk_ppa_to_pos(geo, ppa)); |
1566 | } | 1559 | } |
1567 | 1560 | ||
1568 | return err; | 1561 | return err; |
@@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, | |||
1746 | struct nvm_tgt_dev *dev = pblk->dev; | 1739 | struct nvm_tgt_dev *dev = pblk->dev; |
1747 | struct nvm_geo *geo = &dev->geo; | 1740 | struct nvm_geo *geo = &dev->geo; |
1748 | struct pblk_lun *rlun; | 1741 | struct pblk_lun *rlun; |
1749 | int nr_luns = geo->nr_luns; | 1742 | int nr_luns = geo->all_luns; |
1750 | int bit = -1; | 1743 | int bit = -1; |
1751 | 1744 | ||
1752 | while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { | 1745 | while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { |
@@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, | |||
1884 | 1877 | ||
1885 | /* If the L2P entry maps to a line, the reference is valid */ | 1878 | /* If the L2P entry maps to a line, the reference is valid */ |
1886 | if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { | 1879 | if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { |
1887 | int line_id = pblk_dev_ppa_to_line(ppa); | 1880 | int line_id = pblk_ppa_to_line(ppa); |
1888 | struct pblk_line *line = &pblk->lines[line_id]; | 1881 | struct pblk_line *line = &pblk->lines[line_id]; |
1889 | 1882 | ||
1890 | kref_get(&line->ref); | 1883 | kref_get(&line->ref); |
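Most of the pblk-core.c churn above folds the duplicated dev/tgt address helpers into single pblk_ppa_to_line() and pblk_ppa_to_pos() accessors. Conceptually, a pblk line is the same chunk index across every LUN, so the chunk field of a physical address names the line while the (lun, channel) pair names the position inside it. The sketch below illustrates that decomposition on a made-up geometry; the real helpers operate on struct ppa_addr bitfields in pblk.h, which this diff does not show.

#include <stdio.h>

/* Hypothetical geometry, for illustration only. */
#define NR_CHNLS 16
#define NR_LUNS   8	/* per channel */

struct gen_ppa {	/* stand-in for the generic ppa_addr fields */
	unsigned ch;
	unsigned lun;
	unsigned blk;
};

/* A line spans the same block across every LUN, so the block field names
 * the line and (lun, ch) names the position within it. */
static unsigned ppa_to_line(struct gen_ppa p) { return p.blk; }
static unsigned ppa_to_pos(struct gen_ppa p)  { return p.lun * NR_CHNLS + p.ch; }

int main(void)
{
	struct gen_ppa p = { .ch = 3, .lun = 2, .blk = 117 };

	printf("line %u, pos %u of %u\n",
	       ppa_to_line(p), ppa_to_pos(p), NR_CHNLS * NR_LUNS);
	return 0;
}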
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 9c8e114c8a54..3d899383666e 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) | |||
169 | * the line untouched. TODO: Implement a recovery routine that scans and | 169 | * the line untouched. TODO: Implement a recovery routine that scans and |
170 | * moves all sectors on the line. | 170 | * moves all sectors on the line. |
171 | */ | 171 | */ |
172 | lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); | 172 | |
173 | ret = pblk_recov_check_emeta(pblk, emeta_buf); | ||
174 | if (ret) { | ||
175 | pr_err("pblk: inconsistent emeta (line %d)\n", line->id); | ||
176 | goto fail_free_emeta; | ||
177 | } | ||
178 | |||
179 | lba_list = emeta_to_lbas(pblk, emeta_buf); | ||
173 | if (!lba_list) { | 180 | if (!lba_list) { |
174 | pr_err("pblk: could not interpret emeta (line %d)\n", line->id); | 181 | pr_err("pblk: could not interpret emeta (line %d)\n", line->id); |
175 | goto fail_free_emeta; | 182 | goto fail_free_emeta; |
@@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk) | |||
519 | } | 526 | } |
520 | } | 527 | } |
521 | 528 | ||
522 | /* | ||
523 | * If flush_wq == 1 then no lock should be held by the caller since | ||
524 | * flush_workqueue can sleep | ||
525 | */ | ||
526 | static void pblk_gc_stop(struct pblk *pblk, int flush_wq) | ||
527 | { | ||
528 | pblk->gc.gc_active = 0; | ||
529 | pr_debug("pblk: gc stop\n"); | ||
530 | } | ||
531 | |||
532 | void pblk_gc_should_stop(struct pblk *pblk) | 529 | void pblk_gc_should_stop(struct pblk *pblk) |
533 | { | 530 | { |
534 | struct pblk_gc *gc = &pblk->gc; | 531 | struct pblk_gc *gc = &pblk->gc; |
535 | 532 | ||
536 | if (gc->gc_active && !gc->gc_forced) | 533 | if (gc->gc_active && !gc->gc_forced) |
537 | pblk_gc_stop(pblk, 0); | 534 | gc->gc_active = 0; |
538 | } | 535 | } |
539 | 536 | ||
540 | void pblk_gc_should_kick(struct pblk *pblk) | 537 | void pblk_gc_should_kick(struct pblk *pblk) |
@@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk) | |||
660 | 657 | ||
661 | gc->gc_enabled = 0; | 658 | gc->gc_enabled = 0; |
662 | del_timer_sync(&gc->gc_timer); | 659 | del_timer_sync(&gc->gc_timer); |
663 | pblk_gc_stop(pblk, 1); | 660 | gc->gc_active = 0; |
664 | 661 | ||
665 | if (gc->gc_ts) | 662 | if (gc->gc_ts) |
666 | kthread_stop(gc->gc_ts); | 663 | kthread_stop(gc->gc_ts); |
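The GC hunk above makes the line-preparation worker validate end-of-line metadata with pblk_recov_check_emeta() before interpreting it through emeta_to_lbas(), rather than trusting the buffer unconditionally. The sketch below shows the same check-before-parse shape, with a trivial additive checksum standing in for whatever integrity check the real helper performs (its implementation is not part of this diff):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct emeta {			/* illustrative layout, not pblk's */
	uint32_t csum;		/* checksum over lbas[] */
	uint32_t nr_lbas;
	uint64_t lbas[64];
};

static uint32_t sum32(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t s = 0;

	while (len--)
		s += *p++;
	return s;
}

/* Check, then parse: never hand out the lba list from a corrupt buffer. */
static const uint64_t *emeta_lbas(struct emeta *em)
{
	if (em->csum != sum32(em->lbas, sizeof(em->lbas)))
		return NULL;	/* inconsistent emeta: caller must bail out */
	return em->lbas;
}

int main(void)
{
	struct emeta em = { .nr_lbas = 2, .lbas = { 10, 11 } };

	em.csum = sum32(em.lbas, sizeof(em.lbas));
	printf("emeta %s\n", emeta_lbas(&em) ? "valid" : "corrupt");
	return 0;
}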
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 695826a06b5d..93d671ca518e 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk) | |||
169 | } | 169 | } |
170 | ppaf.ch_len = power_len; | 170 | ppaf.ch_len = power_len; |
171 | 171 | ||
172 | power_len = get_count_order(geo->luns_per_chnl); | 172 | power_len = get_count_order(geo->nr_luns); |
173 | if (1 << power_len != geo->luns_per_chnl) { | 173 | if (1 << power_len != geo->nr_luns) { |
174 | pr_err("pblk: supports only power-of-two LUN config.\n"); | 174 | pr_err("pblk: supports only power-of-two LUN config.\n"); |
175 | return -EINVAL; | 175 | return -EINVAL; |
176 | } | 176 | } |
@@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk) | |||
254 | struct nvm_geo *geo = &dev->geo; | 254 | struct nvm_geo *geo = &dev->geo; |
255 | 255 | ||
256 | pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * | 256 | pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * |
257 | geo->nr_planes * geo->nr_luns; | 257 | geo->nr_planes * geo->all_luns; |
258 | 258 | ||
259 | if (pblk_init_global_caches(pblk)) | 259 | if (pblk_init_global_caches(pblk)) |
260 | return -ENOMEM; | 260 | return -ENOMEM; |
@@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk) | |||
270 | if (!pblk->gen_ws_pool) | 270 | if (!pblk->gen_ws_pool) |
271 | goto free_page_bio_pool; | 271 | goto free_page_bio_pool; |
272 | 272 | ||
273 | pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); | 273 | pblk->rec_pool = mempool_create_slab_pool(geo->all_luns, |
274 | pblk_rec_cache); | ||
274 | if (!pblk->rec_pool) | 275 | if (!pblk->rec_pool) |
275 | goto free_gen_ws_pool; | 276 | goto free_gen_ws_pool; |
276 | 277 | ||
277 | pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, | 278 | pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns, |
278 | pblk_g_rq_cache); | 279 | pblk_g_rq_cache); |
279 | if (!pblk->r_rq_pool) | 280 | if (!pblk->r_rq_pool) |
280 | goto free_rec_pool; | 281 | goto free_rec_pool; |
281 | 282 | ||
282 | pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, | 283 | pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns, |
283 | pblk_g_rq_cache); | 284 | pblk_g_rq_cache); |
284 | if (!pblk->e_rq_pool) | 285 | if (!pblk->e_rq_pool) |
285 | goto free_r_rq_pool; | 286 | goto free_r_rq_pool; |
286 | 287 | ||
287 | pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, | 288 | pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns, |
288 | pblk_w_rq_cache); | 289 | pblk_w_rq_cache); |
289 | if (!pblk->w_rq_pool) | 290 | if (!pblk->w_rq_pool) |
290 | goto free_e_rq_pool; | 291 | goto free_e_rq_pool; |
@@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk) | |||
354 | mempool_destroy(pblk->e_rq_pool); | 355 | mempool_destroy(pblk->e_rq_pool); |
355 | mempool_destroy(pblk->w_rq_pool); | 356 | mempool_destroy(pblk->w_rq_pool); |
356 | 357 | ||
358 | pblk_rwb_free(pblk); | ||
359 | |||
357 | pblk_free_global_caches(pblk); | 360 | pblk_free_global_caches(pblk); |
358 | } | 361 | } |
359 | 362 | ||
@@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun) | |||
409 | u8 *blks; | 412 | u8 *blks; |
410 | int nr_blks, ret; | 413 | int nr_blks, ret; |
411 | 414 | ||
412 | nr_blks = geo->blks_per_lun * geo->plane_mode; | 415 | nr_blks = geo->nr_chks * geo->plane_mode; |
413 | blks = kmalloc(nr_blks, GFP_KERNEL); | 416 | blks = kmalloc(nr_blks, GFP_KERNEL); |
414 | if (!blks) | 417 | if (!blks) |
415 | return -ENOMEM; | 418 | return -ENOMEM; |
@@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) | |||
482 | int i, ret; | 485 | int i, ret; |
483 | 486 | ||
484 | /* TODO: Implement unbalanced LUN support */ | 487 | /* TODO: Implement unbalanced LUN support */ |
485 | if (geo->luns_per_chnl < 0) { | 488 | if (geo->nr_luns < 0) { |
486 | pr_err("pblk: unbalanced LUN config.\n"); | 489 | pr_err("pblk: unbalanced LUN config.\n"); |
487 | return -EINVAL; | 490 | return -EINVAL; |
488 | } | 491 | } |
489 | 492 | ||
490 | pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); | 493 | pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), |
494 | GFP_KERNEL); | ||
491 | if (!pblk->luns) | 495 | if (!pblk->luns) |
492 | return -ENOMEM; | 496 | return -ENOMEM; |
493 | 497 | ||
494 | for (i = 0; i < geo->nr_luns; i++) { | 498 | for (i = 0; i < geo->all_luns; i++) { |
495 | /* Stripe across channels */ | 499 | /* Stripe across channels */ |
496 | int ch = i % geo->nr_chnls; | 500 | int ch = i % geo->nr_chnls; |
497 | int lun_raw = i / geo->nr_chnls; | 501 | int lun_raw = i / geo->nr_chnls; |
498 | int lunid = lun_raw + ch * geo->luns_per_chnl; | 502 | int lunid = lun_raw + ch * geo->nr_luns; |
499 | 503 | ||
500 | rlun = &pblk->luns[i]; | 504 | rlun = &pblk->luns[i]; |
501 | rlun->bppa = luns[lunid]; | 505 | rlun->bppa = luns[lunid]; |
@@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk) | |||
577 | static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) | 581 | static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) |
578 | { | 582 | { |
579 | struct nvm_tgt_dev *dev = pblk->dev; | 583 | struct nvm_tgt_dev *dev = pblk->dev; |
584 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
585 | struct pblk_line_meta *lm = &pblk->lm; | ||
580 | struct nvm_geo *geo = &dev->geo; | 586 | struct nvm_geo *geo = &dev->geo; |
581 | sector_t provisioned; | 587 | sector_t provisioned; |
588 | int sec_meta, blk_meta; | ||
582 | 589 | ||
583 | pblk->over_pct = 20; | 590 | if (geo->op == NVM_TARGET_DEFAULT_OP) |
591 | pblk->op = PBLK_DEFAULT_OP; | ||
592 | else | ||
593 | pblk->op = geo->op; | ||
584 | 594 | ||
585 | provisioned = nr_free_blks; | 595 | provisioned = nr_free_blks; |
586 | provisioned *= (100 - pblk->over_pct); | 596 | provisioned *= (100 - pblk->op); |
587 | sector_div(provisioned, 100); | 597 | sector_div(provisioned, 100); |
588 | 598 | ||
599 | pblk->op_blks = nr_free_blks - provisioned; | ||
600 | |||
589 | /* Internally pblk manages all free blocks, but all calculations based | 601 | /* Internally pblk manages all free blocks, but all calculations based |
590 | * on user capacity consider only provisioned blocks | 602 | * on user capacity consider only provisioned blocks |
591 | */ | 603 | */ |
592 | pblk->rl.total_blocks = nr_free_blks; | 604 | pblk->rl.total_blocks = nr_free_blks; |
593 | pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; | 605 | pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk; |
594 | pblk->capacity = provisioned * geo->sec_per_blk; | 606 | |
607 | /* Consider sectors used for metadata */ | ||
608 | sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; | ||
609 | blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk); | ||
610 | |||
611 | pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk; | ||
612 | |||
595 | atomic_set(&pblk->rl.free_blocks, nr_free_blks); | 613 | atomic_set(&pblk->rl.free_blocks, nr_free_blks); |
614 | atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); | ||
596 | } | 615 | } |
597 | 616 | ||
598 | static int pblk_lines_alloc_metadata(struct pblk *pblk) | 617 | static int pblk_lines_alloc_metadata(struct pblk *pblk) |
@@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk) | |||
683 | int i, ret; | 702 | int i, ret; |
684 | 703 | ||
685 | pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); | 704 | pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); |
686 | max_write_ppas = pblk->min_write_pgs * geo->nr_luns; | 705 | max_write_ppas = pblk->min_write_pgs * geo->all_luns; |
687 | pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? | 706 | pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? |
688 | max_write_ppas : nvm_max_phys_sects(dev); | 707 | max_write_ppas : nvm_max_phys_sects(dev); |
689 | pblk_set_sec_per_write(pblk, pblk->min_write_pgs); | 708 | pblk_set_sec_per_write(pblk, pblk->min_write_pgs); |
@@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk) | |||
693 | return -EINVAL; | 712 | return -EINVAL; |
694 | } | 713 | } |
695 | 714 | ||
696 | div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); | 715 | div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod); |
697 | if (mod) { | 716 | if (mod) { |
698 | pr_err("pblk: bad configuration of sectors/pages\n"); | 717 | pr_err("pblk: bad configuration of sectors/pages\n"); |
699 | return -EINVAL; | 718 | return -EINVAL; |
700 | } | 719 | } |
701 | 720 | ||
702 | l_mg->nr_lines = geo->blks_per_lun; | 721 | l_mg->nr_lines = geo->nr_chks; |
703 | l_mg->log_line = l_mg->data_line = NULL; | 722 | l_mg->log_line = l_mg->data_line = NULL; |
704 | l_mg->l_seq_nr = l_mg->d_seq_nr = 0; | 723 | l_mg->l_seq_nr = l_mg->d_seq_nr = 0; |
705 | l_mg->nr_free_lines = 0; | 724 | l_mg->nr_free_lines = 0; |
706 | bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); | 725 | bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); |
707 | 726 | ||
708 | lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; | 727 | lm->sec_per_line = geo->sec_per_chk * geo->all_luns; |
709 | lm->blk_per_line = geo->nr_luns; | 728 | lm->blk_per_line = geo->all_luns; |
710 | lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); | 729 | lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); |
711 | lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); | 730 | lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); |
712 | lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); | 731 | lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); |
713 | lm->mid_thrs = lm->sec_per_line / 2; | 732 | lm->mid_thrs = lm->sec_per_line / 2; |
714 | lm->high_thrs = lm->sec_per_line / 4; | 733 | lm->high_thrs = lm->sec_per_line / 4; |
715 | lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; | 734 | lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; |
716 | 735 | ||
717 | /* Calculate necessary pages for smeta. See comment over struct | 736 | /* Calculate necessary pages for smeta. See comment over struct |
718 | * line_smeta definition | 737 | * line_smeta definition |
@@ -742,12 +761,12 @@ add_emeta_page: | |||
742 | goto add_emeta_page; | 761 | goto add_emeta_page; |
743 | } | 762 | } |
744 | 763 | ||
745 | lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; | 764 | lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; |
746 | 765 | ||
747 | lm->min_blk_line = 1; | 766 | lm->min_blk_line = 1; |
748 | if (geo->nr_luns > 1) | 767 | if (geo->all_luns > 1) |
749 | lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + | 768 | lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + |
750 | lm->emeta_sec[0], geo->sec_per_blk); | 769 | lm->emeta_sec[0], geo->sec_per_chk); |
751 | 770 | ||
752 | if (lm->min_blk_line > lm->blk_per_line) { | 771 | if (lm->min_blk_line > lm->blk_per_line) { |
753 | pr_err("pblk: config. not supported. Min. LUN in line:%d\n", | 772 | pr_err("pblk: config. not supported. Min. LUN in line:%d\n", |
@@ -772,7 +791,7 @@ add_emeta_page: | |||
772 | goto fail_free_bb_template; | 791 | goto fail_free_bb_template; |
773 | } | 792 | } |
774 | 793 | ||
775 | bb_distance = (geo->nr_luns) * geo->sec_per_pl; | 794 | bb_distance = (geo->all_luns) * geo->sec_per_pl; |
776 | for (i = 0; i < lm->sec_per_line; i += bb_distance) | 795 | for (i = 0; i < lm->sec_per_line; i += bb_distance) |
777 | bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); | 796 | bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); |
778 | 797 | ||
@@ -844,7 +863,7 @@ add_emeta_page: | |||
844 | pblk_set_provision(pblk, nr_free_blks); | 863 | pblk_set_provision(pblk, nr_free_blks); |
845 | 864 | ||
846 | /* Cleanup per-LUN bad block lists - managed within lines on run-time */ | 865 | /* Cleanup per-LUN bad block lists - managed within lines on run-time */ |
847 | for (i = 0; i < geo->nr_luns; i++) | 866 | for (i = 0; i < geo->all_luns; i++) |
848 | kfree(pblk->luns[i].bb_list); | 867 | kfree(pblk->luns[i].bb_list); |
849 | 868 | ||
850 | return 0; | 869 | return 0; |
@@ -858,7 +877,7 @@ fail_free_bb_template: | |||
858 | fail_free_meta: | 877 | fail_free_meta: |
859 | pblk_line_meta_free(pblk); | 878 | pblk_line_meta_free(pblk); |
860 | fail: | 879 | fail: |
861 | for (i = 0; i < geo->nr_luns; i++) | 880 | for (i = 0; i < geo->all_luns; i++) |
862 | kfree(pblk->luns[i].bb_list); | 881 | kfree(pblk->luns[i].bb_list); |
863 | 882 | ||
864 | return ret; | 883 | return ret; |
@@ -866,15 +885,19 @@ fail: | |||
866 | 885 | ||
867 | static int pblk_writer_init(struct pblk *pblk) | 886 | static int pblk_writer_init(struct pblk *pblk) |
868 | { | 887 | { |
869 | timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); | ||
870 | mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); | ||
871 | |||
872 | pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); | 888 | pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); |
873 | if (IS_ERR(pblk->writer_ts)) { | 889 | if (IS_ERR(pblk->writer_ts)) { |
874 | pr_err("pblk: could not allocate writer kthread\n"); | 890 | int err = PTR_ERR(pblk->writer_ts); |
875 | return PTR_ERR(pblk->writer_ts); | 891 | |
892 | if (err != -EINTR) | ||
893 | pr_err("pblk: could not allocate writer kthread (%d)\n", | ||
894 | err); | ||
895 | return err; | ||
876 | } | 896 | } |
877 | 897 | ||
898 | timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); | ||
899 | mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); | ||
900 | |||
878 | return 0; | 901 | return 0; |
879 | } | 902 | } |
880 | 903 | ||
@@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk) | |||
910 | pblk_pipeline_stop(pblk); | 933 | pblk_pipeline_stop(pblk); |
911 | pblk_writer_stop(pblk); | 934 | pblk_writer_stop(pblk); |
912 | pblk_rb_sync_l2p(&pblk->rwb); | 935 | pblk_rb_sync_l2p(&pblk->rwb); |
913 | pblk_rwb_free(pblk); | ||
914 | pblk_rl_free(&pblk->rl); | 936 | pblk_rl_free(&pblk->rl); |
915 | 937 | ||
916 | pr_debug("pblk: consistent tear down\n"); | 938 | pr_debug("pblk: consistent tear down\n"); |
@@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
1025 | 1047 | ||
1026 | ret = pblk_writer_init(pblk); | 1048 | ret = pblk_writer_init(pblk); |
1027 | if (ret) { | 1049 | if (ret) { |
1028 | pr_err("pblk: could not initialize write thread\n"); | 1050 | if (ret != -EINTR) |
1051 | pr_err("pblk: could not initialize write thread\n"); | ||
1029 | goto fail_free_lines; | 1052 | goto fail_free_lines; |
1030 | } | 1053 | } |
1031 | 1054 | ||
@@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
1041 | 1064 | ||
1042 | blk_queue_write_cache(tqueue, true, false); | 1065 | blk_queue_write_cache(tqueue, true, false); |
1043 | 1066 | ||
1044 | tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; | 1067 | tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size; |
1045 | tqueue->limits.discard_alignment = 0; | 1068 | tqueue->limits.discard_alignment = 0; |
1046 | blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); | 1069 | blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); |
1047 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); | 1070 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); |
1048 | 1071 | ||
1049 | pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", | 1072 | pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n", |
1050 | geo->nr_luns, pblk->l_mg.nr_lines, | 1073 | tdisk->disk_name, |
1074 | geo->all_luns, pblk->l_mg.nr_lines, | ||
1051 | (unsigned long long)pblk->rl.nr_secs, | 1075 | (unsigned long long)pblk->rl.nr_secs, |
1052 | pblk->rwb.nr_entries); | 1076 | pblk->rwb.nr_entries); |
1053 | 1077 | ||
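The pblk-init.c rework above derives the exported capacity from a configurable over-provisioning percentage (geo->op, falling back to PBLK_DEFAULT_OP) and then subtracts the blocks that per-line smeta/emeta will consume. A standalone sketch of that arithmetic, with made-up geometry numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Made-up geometry, for illustration only. */
	long nr_free_blks  = 8192;	/* free chunks across all LUNs    */
	long sec_per_chk   = 4096;	/* sectors per chunk              */
	long nr_free_lines = 64;	/* lines available to the target  */
	long smeta_sec     = 8;		/* start-of-line metadata sectors */
	long emeta_sec     = 64;	/* end-of-line metadata sectors   */
	int  op            = 11;	/* over-provisioning percent      */

	/* Same arithmetic as the reworked pblk_set_provision(). */
	long provisioned = nr_free_blks * (100 - op) / 100;
	long op_blks     = nr_free_blks - provisioned;
	long sec_meta    = (smeta_sec + emeta_sec) * nr_free_lines;
	long blk_meta    = DIV_ROUND_UP(sec_meta, sec_per_chk);
	long capacity    = (provisioned - blk_meta) * sec_per_chk;

	printf("op blocks: %ld, metadata blocks: %ld, user sectors: %ld\n",
	       op_blks, blk_meta, capacity);
	return 0;
}

With these inputs, about 11% of the free chunks are withheld for over-provisioning and a further handful is reserved for line metadata before the user-visible sector count is computed.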
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6f3ecde2140f..7445e6430c52 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, | |||
146 | return; | 146 | return; |
147 | 147 | ||
148 | /* Erase blocks that are bad in this line but might not be in next */ | 148 | /* Erase blocks that are bad in this line but might not be in next */ |
149 | if (unlikely(ppa_empty(*erase_ppa)) && | 149 | if (unlikely(pblk_ppa_empty(*erase_ppa)) && |
150 | bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { | 150 | bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { |
151 | int bit = -1; | 151 | int bit = -1; |
152 | 152 | ||
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b8f78e401482..ec8fc314646b 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, | |||
54 | rb->seg_size = (1 << power_seg_sz); | 54 | rb->seg_size = (1 << power_seg_sz); |
55 | rb->nr_entries = (1 << power_size); | 55 | rb->nr_entries = (1 << power_size); |
56 | rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; | 56 | rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; |
57 | rb->sync_point = EMPTY_ENTRY; | 57 | rb->flush_point = EMPTY_ENTRY; |
58 | 58 | ||
59 | spin_lock_init(&rb->w_lock); | 59 | spin_lock_init(&rb->w_lock); |
60 | spin_lock_init(&rb->s_lock); | 60 | spin_lock_init(&rb->s_lock); |
@@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, | |||
112 | up_write(&pblk_rb_lock); | 112 | up_write(&pblk_rb_lock); |
113 | 113 | ||
114 | #ifdef CONFIG_NVM_DEBUG | 114 | #ifdef CONFIG_NVM_DEBUG |
115 | atomic_set(&rb->inflight_sync_point, 0); | 115 | atomic_set(&rb->inflight_flush_point, 0); |
116 | #endif | 116 | #endif |
117 | 117 | ||
118 | /* | 118 | /* |
@@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) | |||
226 | pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, | 226 | pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, |
227 | entry->cacheline); | 227 | entry->cacheline); |
228 | 228 | ||
229 | line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; | 229 | line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; |
230 | kref_put(&line->ref, pblk_line_put); | 230 | kref_put(&line->ref, pblk_line_put); |
231 | clean_wctx(w_ctx); | 231 | clean_wctx(w_ctx); |
232 | rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); | 232 | rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); |
@@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, | |||
349 | smp_store_release(&entry->w_ctx.flags, flags); | 349 | smp_store_release(&entry->w_ctx.flags, flags); |
350 | } | 350 | } |
351 | 351 | ||
352 | static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, | 352 | static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, |
353 | unsigned int pos) | 353 | unsigned int pos) |
354 | { | 354 | { |
355 | struct pblk_rb_entry *entry; | 355 | struct pblk_rb_entry *entry; |
356 | unsigned int subm, sync_point; | 356 | unsigned int sync, flush_point; |
357 | 357 | ||
358 | subm = READ_ONCE(rb->subm); | 358 | sync = READ_ONCE(rb->sync); |
359 | |||
360 | if (pos == sync) | ||
361 | return 0; | ||
359 | 362 | ||
360 | #ifdef CONFIG_NVM_DEBUG | 363 | #ifdef CONFIG_NVM_DEBUG |
361 | atomic_inc(&rb->inflight_sync_point); | 364 | atomic_inc(&rb->inflight_flush_point); |
362 | #endif | 365 | #endif |
363 | 366 | ||
364 | if (pos == subm) | 367 | flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); |
365 | return 0; | 368 | entry = &rb->entries[flush_point]; |
366 | 369 | ||
367 | sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); | 370 | pblk_rb_sync_init(rb, NULL); |
368 | entry = &rb->entries[sync_point]; | ||
369 | 371 | ||
370 | /* Protect syncs */ | 372 | /* Protect flush points */ |
371 | smp_store_release(&rb->sync_point, sync_point); | 373 | smp_store_release(&rb->flush_point, flush_point); |
372 | 374 | ||
373 | if (!bio) | 375 | if (bio) |
374 | return 0; | 376 | bio_list_add(&entry->w_ctx.bios, bio); |
375 | 377 | ||
376 | spin_lock_irq(&rb->s_lock); | 378 | pblk_rb_sync_end(rb, NULL); |
377 | bio_list_add(&entry->w_ctx.bios, bio); | ||
378 | spin_unlock_irq(&rb->s_lock); | ||
379 | 379 | ||
380 | return 1; | 380 | return bio ? 1 : 0; |
381 | } | 381 | } |
382 | 382 | ||
383 | static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, | 383 | static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, |
@@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb) | |||
416 | struct pblk *pblk = container_of(rb, struct pblk, rwb); | 416 | struct pblk *pblk = container_of(rb, struct pblk, rwb); |
417 | unsigned int mem = READ_ONCE(rb->mem); | 417 | unsigned int mem = READ_ONCE(rb->mem); |
418 | 418 | ||
419 | if (pblk_rb_sync_point_set(rb, NULL, mem)) | 419 | if (pblk_rb_flush_point_set(rb, NULL, mem)) |
420 | return; | 420 | return; |
421 | 421 | ||
422 | pblk_write_should_kick(pblk); | 422 | pblk_write_should_kick(pblk); |
@@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, | |||
440 | #ifdef CONFIG_NVM_DEBUG | 440 | #ifdef CONFIG_NVM_DEBUG |
441 | atomic_long_inc(&pblk->nr_flush); | 441 | atomic_long_inc(&pblk->nr_flush); |
442 | #endif | 442 | #endif |
443 | if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) | 443 | if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) |
444 | *io_ret = NVM_IO_OK; | 444 | *io_ret = NVM_IO_OK; |
445 | } | 445 | } |
446 | 446 | ||
@@ -606,21 +606,6 @@ try: | |||
606 | return NVM_IO_ERR; | 606 | return NVM_IO_ERR; |
607 | } | 607 | } |
608 | 608 | ||
609 | if (flags & PBLK_FLUSH_ENTRY) { | ||
610 | unsigned int sync_point; | ||
611 | |||
612 | sync_point = READ_ONCE(rb->sync_point); | ||
613 | if (sync_point == pos) { | ||
614 | /* Protect syncs */ | ||
615 | smp_store_release(&rb->sync_point, EMPTY_ENTRY); | ||
616 | } | ||
617 | |||
618 | flags &= ~PBLK_FLUSH_ENTRY; | ||
619 | #ifdef CONFIG_NVM_DEBUG | ||
620 | atomic_dec(&rb->inflight_sync_point); | ||
621 | #endif | ||
622 | } | ||
623 | |||
624 | flags &= ~PBLK_WRITTEN_DATA; | 609 | flags &= ~PBLK_WRITTEN_DATA; |
625 | flags |= PBLK_SUBMITTED_ENTRY; | 610 | flags |= PBLK_SUBMITTED_ENTRY; |
626 | 611 | ||
@@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) | |||
730 | 715 | ||
731 | unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) | 716 | unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) |
732 | { | 717 | { |
733 | unsigned int sync; | 718 | unsigned int sync, flush_point; |
734 | unsigned int i; | ||
735 | |||
736 | lockdep_assert_held(&rb->s_lock); | 719 | lockdep_assert_held(&rb->s_lock); |
737 | 720 | ||
738 | sync = READ_ONCE(rb->sync); | 721 | sync = READ_ONCE(rb->sync); |
722 | flush_point = READ_ONCE(rb->flush_point); | ||
739 | 723 | ||
740 | for (i = 0; i < nr_entries; i++) | 724 | if (flush_point != EMPTY_ENTRY) { |
741 | sync = (sync + 1) & (rb->nr_entries - 1); | 725 | unsigned int secs_to_flush; |
726 | |||
727 | secs_to_flush = pblk_rb_ring_count(flush_point, sync, | ||
728 | rb->nr_entries); | ||
729 | if (secs_to_flush < nr_entries) { | ||
730 | /* Protect flush points */ | ||
731 | smp_store_release(&rb->flush_point, EMPTY_ENTRY); | ||
732 | } | ||
733 | } | ||
734 | |||
735 | sync = (sync + nr_entries) & (rb->nr_entries - 1); | ||
742 | 736 | ||
743 | /* Protect from counts */ | 737 | /* Protect from counts */ |
744 | smp_store_release(&rb->sync, sync); | 738 | smp_store_release(&rb->sync, sync); |
@@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) | |||
746 | return sync; | 740 | return sync; |
747 | } | 741 | } |
748 | 742 | ||
749 | unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) | 743 | /* Calculate how many sectors to submit up to the current flush point. */ |
744 | unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) | ||
750 | { | 745 | { |
751 | unsigned int subm, sync_point; | 746 | unsigned int subm, sync, flush_point; |
752 | unsigned int count; | 747 | unsigned int submitted, to_flush; |
753 | 748 | ||
754 | /* Protect syncs */ | 749 | /* Protect flush points */ |
755 | sync_point = smp_load_acquire(&rb->sync_point); | 750 | flush_point = smp_load_acquire(&rb->flush_point); |
756 | if (sync_point == EMPTY_ENTRY) | 751 | if (flush_point == EMPTY_ENTRY) |
757 | return 0; | 752 | return 0; |
758 | 753 | ||
754 | /* Protect syncs */ | ||
755 | sync = smp_load_acquire(&rb->sync); | ||
756 | |||
759 | subm = READ_ONCE(rb->subm); | 757 | subm = READ_ONCE(rb->subm); |
758 | submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); | ||
760 | 759 | ||
761 | /* The sync point itself counts as a sector to sync */ | 760 | /* The sync point itself counts as a sector to sync */ |
762 | count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; | 761 | to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; |
763 | 762 | ||
764 | return count; | 763 | return (submitted < to_flush) ? (to_flush - submitted) : 0; |
765 | } | 764 | } |
766 | 765 | ||
767 | /* | 766 | /* |
@@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb) | |||
801 | 800 | ||
802 | if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && | 801 | if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && |
803 | (rb->sync == rb->l2p_update) && | 802 | (rb->sync == rb->l2p_update) && |
804 | (rb->sync_point == EMPTY_ENTRY)) { | 803 | (rb->flush_point == EMPTY_ENTRY)) { |
805 | goto out; | 804 | goto out; |
806 | } | 805 | } |
807 | 806 | ||
@@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) | |||
848 | queued_entries++; | 847 | queued_entries++; |
849 | spin_unlock_irq(&rb->s_lock); | 848 | spin_unlock_irq(&rb->s_lock); |
850 | 849 | ||
851 | if (rb->sync_point != EMPTY_ENTRY) | 850 | if (rb->flush_point != EMPTY_ENTRY) |
852 | offset = scnprintf(buf, PAGE_SIZE, | 851 | offset = scnprintf(buf, PAGE_SIZE, |
853 | "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", | 852 | "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", |
854 | rb->nr_entries, | 853 | rb->nr_entries, |
@@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) | |||
857 | rb->sync, | 856 | rb->sync, |
858 | rb->l2p_update, | 857 | rb->l2p_update, |
859 | #ifdef CONFIG_NVM_DEBUG | 858 | #ifdef CONFIG_NVM_DEBUG |
860 | atomic_read(&rb->inflight_sync_point), | 859 | atomic_read(&rb->inflight_flush_point), |
861 | #else | 860 | #else |
862 | 0, | 861 | 0, |
863 | #endif | 862 | #endif |
864 | rb->sync_point, | 863 | rb->flush_point, |
865 | pblk_rb_read_count(rb), | 864 | pblk_rb_read_count(rb), |
866 | pblk_rb_space(rb), | 865 | pblk_rb_space(rb), |
867 | pblk_rb_sync_point_count(rb), | 866 | pblk_rb_flush_point_count(rb), |
868 | queued_entries); | 867 | queued_entries); |
869 | else | 868 | else |
870 | offset = scnprintf(buf, PAGE_SIZE, | 869 | offset = scnprintf(buf, PAGE_SIZE, |
@@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) | |||
875 | rb->sync, | 874 | rb->sync, |
876 | rb->l2p_update, | 875 | rb->l2p_update, |
877 | #ifdef CONFIG_NVM_DEBUG | 876 | #ifdef CONFIG_NVM_DEBUG |
878 | atomic_read(&rb->inflight_sync_point), | 877 | atomic_read(&rb->inflight_flush_point), |
879 | #else | 878 | #else |
880 | 0, | 879 | 0, |
881 | #endif | 880 | #endif |
882 | pblk_rb_read_count(rb), | 881 | pblk_rb_read_count(rb), |
883 | pblk_rb_space(rb), | 882 | pblk_rb_space(rb), |
884 | pblk_rb_sync_point_count(rb), | 883 | pblk_rb_flush_point_count(rb), |
885 | queued_entries); | 884 | queued_entries); |
886 | 885 | ||
887 | return offset; | 886 | return offset; |
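The pblk-rb.c changes above rename the ring buffer's sync point to a flush point, set it relative to the sync pointer rather than the submit pointer, clear it from pblk_rb_sync_advance() once enough entries have been persisted, and make pblk_rb_flush_point_count() return only the sectors that still have to be submitted to satisfy it. A minimal standalone sketch of that index arithmetic, assuming a power-of-two ring and CIRC_CNT-style masking; the function and variable names below are illustrative, not the driver's:

#include <stdio.h>

/* Entries between tail and head in a power-of-two ring (CIRC_CNT-style). */
static unsigned int ring_count(unsigned int head, unsigned int tail,
                               unsigned int nr_entries)
{
        return (head - tail) & (nr_entries - 1);
}

/*
 * Sectors still to be submitted to reach the flush point, mirroring the
 * reworked pblk_rb_flush_point_count(): everything between sync and the
 * flush point (inclusive) must be persisted, and whatever already sits
 * between sync and subm is in flight.
 */
static unsigned int flush_point_count(unsigned int flush_point,
                                      unsigned int sync, unsigned int subm,
                                      unsigned int nr_entries)
{
        unsigned int submitted = ring_count(subm, sync, nr_entries);
        unsigned int to_flush = ring_count(flush_point, sync, nr_entries) + 1;

        return (submitted < to_flush) ? (to_flush - submitted) : 0;
}

int main(void)
{
        unsigned int nr_entries = 16;   /* must be a power of two */

        /* flush point at 7, sync at 4, submit pointer at 5 */
        printf("%u\n", flush_point_count(7, 4, 5, nr_entries));  /* 3 */

        /* wrap-around: flush point numerically behind the sync index */
        printf("%u\n", flush_point_count(1, 14, 0, nr_entries)); /* 2 */

        return 0;
}

Counting against the sync pointer instead of the submit pointer means a flush only counts as served once its entries are actually persisted, which is what lets the flag bookkeeping move into the write-completion path (see the pblk-write.c hunks further down).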
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index ca79d8fb3e60..2f761283f43e 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c | |||
@@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) | |||
141 | struct ppa_addr ppa = ppa_list[i]; | 141 | struct ppa_addr ppa = ppa_list[i]; |
142 | struct pblk_line *line; | 142 | struct pblk_line *line; |
143 | 143 | ||
144 | line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; | 144 | line = &pblk->lines[pblk_ppa_to_line(ppa)]; |
145 | kref_put(&line->ref, pblk_line_put_wq); | 145 | kref_put(&line->ref, pblk_line_put_wq); |
146 | } | 146 | } |
147 | } | 147 | } |
@@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio) | |||
158 | static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, | 158 | static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, |
159 | bool put_line) | 159 | bool put_line) |
160 | { | 160 | { |
161 | struct nvm_tgt_dev *dev = pblk->dev; | ||
161 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); | 162 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); |
162 | struct bio *bio = rqd->bio; | 163 | struct bio *bio = rqd->bio; |
164 | unsigned long start_time = r_ctx->start_time; | ||
165 | |||
166 | generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); | ||
163 | 167 | ||
164 | if (rqd->error) | 168 | if (rqd->error) |
165 | pblk_log_read_err(pblk, rqd); | 169 | pblk_log_read_err(pblk, rqd); |
@@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd) | |||
193 | __pblk_end_io_read(pblk, rqd, true); | 197 | __pblk_end_io_read(pblk, rqd, true); |
194 | } | 198 | } |
195 | 199 | ||
196 | static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | 200 | static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, |
197 | unsigned int bio_init_idx, | 201 | unsigned int bio_init_idx, |
198 | unsigned long *read_bitmap) | 202 | unsigned long *read_bitmap) |
199 | { | 203 | { |
200 | struct bio *new_bio, *bio = rqd->bio; | 204 | struct bio *new_bio, *bio = rqd->bio; |
201 | struct pblk_sec_meta *meta_list = rqd->meta_list; | 205 | struct pblk_sec_meta *meta_list = rqd->meta_list; |
@@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
270 | i = 0; | 274 | i = 0; |
271 | hole = find_first_zero_bit(read_bitmap, nr_secs); | 275 | hole = find_first_zero_bit(read_bitmap, nr_secs); |
272 | do { | 276 | do { |
273 | int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); | 277 | int line_id = pblk_ppa_to_line(rqd->ppa_list[i]); |
274 | struct pblk_line *line = &pblk->lines[line_id]; | 278 | struct pblk_line *line = &pblk->lines[line_id]; |
275 | 279 | ||
276 | kref_put(&line->ref, pblk_line_put); | 280 | kref_put(&line->ref, pblk_line_put); |
@@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
306 | return NVM_IO_OK; | 310 | return NVM_IO_OK; |
307 | 311 | ||
308 | err: | 312 | err: |
313 | pr_err("pblk: failed to perform partial read\n"); | ||
314 | |||
309 | /* Free allocated pages in new bio */ | 315 | /* Free allocated pages in new bio */ |
310 | pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); | 316 | pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); |
311 | __pblk_end_io_read(pblk, rqd, false); | 317 | __pblk_end_io_read(pblk, rqd, false); |
@@ -357,6 +363,7 @@ retry: | |||
357 | int pblk_submit_read(struct pblk *pblk, struct bio *bio) | 363 | int pblk_submit_read(struct pblk *pblk, struct bio *bio) |
358 | { | 364 | { |
359 | struct nvm_tgt_dev *dev = pblk->dev; | 365 | struct nvm_tgt_dev *dev = pblk->dev; |
366 | struct request_queue *q = dev->q; | ||
360 | sector_t blba = pblk_get_lba(bio); | 367 | sector_t blba = pblk_get_lba(bio); |
361 | unsigned int nr_secs = pblk_get_secs(bio); | 368 | unsigned int nr_secs = pblk_get_secs(bio); |
362 | struct pblk_g_ctx *r_ctx; | 369 | struct pblk_g_ctx *r_ctx; |
@@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
372 | return NVM_IO_ERR; | 379 | return NVM_IO_ERR; |
373 | } | 380 | } |
374 | 381 | ||
382 | generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); | ||
383 | |||
375 | bitmap_zero(&read_bitmap, nr_secs); | 384 | bitmap_zero(&read_bitmap, nr_secs); |
376 | 385 | ||
377 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); | 386 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); |
@@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
383 | rqd->end_io = pblk_end_io_read; | 392 | rqd->end_io = pblk_end_io_read; |
384 | 393 | ||
385 | r_ctx = nvm_rq_to_pdu(rqd); | 394 | r_ctx = nvm_rq_to_pdu(rqd); |
395 | r_ctx->start_time = jiffies; | ||
386 | r_ctx->lba = blba; | 396 | r_ctx->lba = blba; |
387 | 397 | ||
388 | /* Save the index for this bio's start. This is needed in case | 398 | /* Save the index for this bio's start. This is needed in case |
@@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
422 | int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); | 432 | int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); |
423 | if (!int_bio) { | 433 | if (!int_bio) { |
424 | pr_err("pblk: could not clone read bio\n"); | 434 | pr_err("pblk: could not clone read bio\n"); |
425 | return NVM_IO_ERR; | 435 | goto fail_end_io; |
426 | } | 436 | } |
427 | 437 | ||
428 | rqd->bio = int_bio; | 438 | rqd->bio = int_bio; |
@@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
433 | pr_err("pblk: read IO submission failed\n"); | 443 | pr_err("pblk: read IO submission failed\n"); |
434 | if (int_bio) | 444 | if (int_bio) |
435 | bio_put(int_bio); | 445 | bio_put(int_bio); |
436 | return ret; | 446 | goto fail_end_io; |
437 | } | 447 | } |
438 | 448 | ||
439 | return NVM_IO_OK; | 449 | return NVM_IO_OK; |
@@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
442 | /* The read bio request could be partially filled by the write buffer, | 452 | /* The read bio request could be partially filled by the write buffer, |
443 | * but there are some holes that need to be read from the drive. | 453 | * but there are some holes that need to be read from the drive. |
444 | */ | 454 | */ |
445 | ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); | 455 | return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); |
446 | if (ret) { | ||
447 | pr_err("pblk: failed to perform partial read\n"); | ||
448 | return ret; | ||
449 | } | ||
450 | |||
451 | return NVM_IO_OK; | ||
452 | 456 | ||
453 | fail_rqd_free: | 457 | fail_rqd_free: |
454 | pblk_free_rqd(pblk, rqd, PBLK_READ); | 458 | pblk_free_rqd(pblk, rqd, PBLK_READ); |
455 | return ret; | 459 | return ret; |
460 | fail_end_io: | ||
461 | __pblk_end_io_read(pblk, rqd, false); | ||
462 | return ret; | ||
456 | } | 463 | } |
457 | 464 | ||
458 | static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | 465 | static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, |
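The pblk-read.c hunks add iostat accounting to the read path: the submission time is stored in the per-request context (r_ctx->start_time = jiffies) and generic_start_io_acct()/generic_end_io_acct() bracket the I/O, while the error paths now unwind through __pblk_end_io_read(). A rough userspace analogue of that accounting pattern, with a monotonic clock standing in for jiffies and made-up counter and helper names:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Per-request context; the kernel version stashes jiffies in pblk_g_ctx. */
struct req_ctx {
        uint64_t start_ns;
        uint64_t lba;
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Analogue of generic_start_io_acct(): stamp the request at submission. */
static void start_io_acct(struct req_ctx *ctx)
{
        ctx->start_ns = now_ns();
}

/* Analogue of generic_end_io_acct(): add the elapsed time at completion. */
static void end_io_acct(const struct req_ctx *ctx, uint64_t *total_ns,
                        unsigned long *nr_ios)
{
        *total_ns += now_ns() - ctx->start_ns;
        (*nr_ios)++;
}

int main(void)
{
        uint64_t total_ns = 0;
        unsigned long nr_ios = 0;
        struct req_ctx ctx = { .lba = 42 };

        start_io_acct(&ctx);
        /* ... submit the read and wait for it to complete ... */
        end_io_acct(&ctx, &total_ns, &nr_ios);

        printf("ios=%lu total=%llu ns\n", nr_ios,
               (unsigned long long)total_ns);
        return 0;
}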
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index eadb3eb5d4dc..1d5e961bf5e0 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c | |||
@@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, | |||
111 | return 0; | 111 | return 0; |
112 | } | 112 | } |
113 | 113 | ||
114 | __le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) | 114 | int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) |
115 | { | 115 | { |
116 | u32 crc; | 116 | u32 crc; |
117 | 117 | ||
118 | crc = pblk_calc_emeta_crc(pblk, emeta_buf); | 118 | crc = pblk_calc_emeta_crc(pblk, emeta_buf); |
119 | if (le32_to_cpu(emeta_buf->crc) != crc) | 119 | if (le32_to_cpu(emeta_buf->crc) != crc) |
120 | return NULL; | 120 | return 1; |
121 | 121 | ||
122 | if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) | 122 | if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) |
123 | return NULL; | 123 | return 1; |
124 | 124 | ||
125 | return emeta_to_lbas(pblk, emeta_buf); | 125 | return 0; |
126 | } | 126 | } |
127 | 127 | ||
128 | static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | 128 | static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) |
@@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | |||
137 | u64 nr_valid_lbas, nr_lbas = 0; | 137 | u64 nr_valid_lbas, nr_lbas = 0; |
138 | u64 i; | 138 | u64 i; |
139 | 139 | ||
140 | lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); | 140 | lba_list = emeta_to_lbas(pblk, emeta_buf); |
141 | if (!lba_list) | 141 | if (!lba_list) |
142 | return 1; | 142 | return 1; |
143 | 143 | ||
@@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | |||
149 | struct ppa_addr ppa; | 149 | struct ppa_addr ppa; |
150 | int pos; | 150 | int pos; |
151 | 151 | ||
152 | ppa = addr_to_pblk_ppa(pblk, i, line->id); | 152 | ppa = addr_to_gen_ppa(pblk, i, line->id); |
153 | pos = pblk_ppa_to_pos(geo, ppa); | 153 | pos = pblk_ppa_to_pos(geo, ppa); |
154 | 154 | ||
155 | /* Do not update bad blocks */ | 155 | /* Do not update bad blocks */ |
@@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) | |||
188 | int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); | 188 | int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); |
189 | 189 | ||
190 | return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - | 190 | return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - |
191 | nr_bb * geo->sec_per_blk; | 191 | nr_bb * geo->sec_per_chk; |
192 | } | 192 | } |
193 | 193 | ||
194 | struct pblk_recov_alloc { | 194 | struct pblk_recov_alloc { |
@@ -263,12 +263,12 @@ next_read_rq: | |||
263 | int pos; | 263 | int pos; |
264 | 264 | ||
265 | ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); | 265 | ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); |
266 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 266 | pos = pblk_ppa_to_pos(geo, ppa); |
267 | 267 | ||
268 | while (test_bit(pos, line->blk_bitmap)) { | 268 | while (test_bit(pos, line->blk_bitmap)) { |
269 | r_ptr_int += pblk->min_write_pgs; | 269 | r_ptr_int += pblk->min_write_pgs; |
270 | ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); | 270 | ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); |
271 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 271 | pos = pblk_ppa_to_pos(geo, ppa); |
272 | } | 272 | } |
273 | 273 | ||
274 | for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) | 274 | for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) |
@@ -288,7 +288,7 @@ next_read_rq: | |||
288 | /* At this point, the read should not fail. If it does, it is a problem | 288 | /* At this point, the read should not fail. If it does, it is a problem |
289 | * we cannot recover from here. Need FTL log. | 289 | * we cannot recover from here. Need FTL log. |
290 | */ | 290 | */ |
291 | if (rqd->error) { | 291 | if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { |
292 | pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); | 292 | pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); |
293 | return -EINTR; | 293 | return -EINTR; |
294 | } | 294 | } |
@@ -411,12 +411,12 @@ next_pad_rq: | |||
411 | int pos; | 411 | int pos; |
412 | 412 | ||
413 | w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); | 413 | w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); |
414 | ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); | 414 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); |
415 | pos = pblk_ppa_to_pos(geo, ppa); | 415 | pos = pblk_ppa_to_pos(geo, ppa); |
416 | 416 | ||
417 | while (test_bit(pos, line->blk_bitmap)) { | 417 | while (test_bit(pos, line->blk_bitmap)) { |
418 | w_ptr += pblk->min_write_pgs; | 418 | w_ptr += pblk->min_write_pgs; |
419 | ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); | 419 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); |
420 | pos = pblk_ppa_to_pos(geo, ppa); | 420 | pos = pblk_ppa_to_pos(geo, ppa); |
421 | } | 421 | } |
422 | 422 | ||
@@ -541,12 +541,12 @@ next_rq: | |||
541 | 541 | ||
542 | w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); | 542 | w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); |
543 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); | 543 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); |
544 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 544 | pos = pblk_ppa_to_pos(geo, ppa); |
545 | 545 | ||
546 | while (test_bit(pos, line->blk_bitmap)) { | 546 | while (test_bit(pos, line->blk_bitmap)) { |
547 | w_ptr += pblk->min_write_pgs; | 547 | w_ptr += pblk->min_write_pgs; |
548 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); | 548 | ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); |
549 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 549 | pos = pblk_ppa_to_pos(geo, ppa); |
550 | } | 550 | } |
551 | 551 | ||
552 | for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) | 552 | for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) |
@@ -672,12 +672,12 @@ next_rq: | |||
672 | 672 | ||
673 | paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); | 673 | paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); |
674 | ppa = addr_to_gen_ppa(pblk, paddr, line->id); | 674 | ppa = addr_to_gen_ppa(pblk, paddr, line->id); |
675 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 675 | pos = pblk_ppa_to_pos(geo, ppa); |
676 | 676 | ||
677 | while (test_bit(pos, line->blk_bitmap)) { | 677 | while (test_bit(pos, line->blk_bitmap)) { |
678 | paddr += pblk->min_write_pgs; | 678 | paddr += pblk->min_write_pgs; |
679 | ppa = addr_to_gen_ppa(pblk, paddr, line->id); | 679 | ppa = addr_to_gen_ppa(pblk, paddr, line->id); |
680 | pos = pblk_dev_ppa_to_pos(geo, ppa); | 680 | pos = pblk_ppa_to_pos(geo, ppa); |
681 | } | 681 | } |
682 | 682 | ||
683 | for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) | 683 | for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) |
@@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) | |||
817 | 817 | ||
818 | while (emeta_secs) { | 818 | while (emeta_secs) { |
819 | emeta_start--; | 819 | emeta_start--; |
820 | ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); | 820 | ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); |
821 | pos = pblk_ppa_to_pos(geo, ppa); | 821 | pos = pblk_ppa_to_pos(geo, ppa); |
822 | if (!test_bit(pos, line->blk_bitmap)) | 822 | if (!test_bit(pos, line->blk_bitmap)) |
823 | emeta_secs--; | 823 | emeta_secs--; |
@@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
938 | goto next; | 938 | goto next; |
939 | } | 939 | } |
940 | 940 | ||
941 | if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { | ||
942 | pblk_recov_l2p_from_oob(pblk, line); | ||
943 | goto next; | ||
944 | } | ||
945 | |||
941 | if (pblk_recov_l2p_from_emeta(pblk, line)) | 946 | if (pblk_recov_l2p_from_emeta(pblk, line)) |
942 | pblk_recov_l2p_from_oob(pblk, line); | 947 | pblk_recov_l2p_from_oob(pblk, line); |
943 | 948 | ||
@@ -984,10 +989,8 @@ next: | |||
984 | } | 989 | } |
985 | spin_unlock(&l_mg->free_lock); | 990 | spin_unlock(&l_mg->free_lock); |
986 | 991 | ||
987 | if (is_next) { | 992 | if (is_next) |
988 | pblk_line_erase(pblk, l_mg->data_next); | 993 | pblk_line_erase(pblk, l_mg->data_next); |
989 | pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); | ||
990 | } | ||
991 | 994 | ||
992 | out: | 995 | out: |
993 | if (found_lines != recovered_lines) | 996 | if (found_lines != recovered_lines) |
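In pblk-recovery.c, pblk_recov_get_lba_list() becomes pblk_recov_check_emeta(): it now only validates the emeta CRC and magic (0 on success, non-zero otherwise), and pblk_recov_l2p() falls back to an OOB scan as soon as that check fails. A toy sketch of that check-then-fall-back shape, using a stand-in checksum rather than pblk_calc_emeta_crc() and invented helper names:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define EMETA_MAGIC 0x70626c6bU         /* toy stand-in for PBLK_MAGIC */

struct emeta_buf {
        uint32_t magic;
        uint32_t crc;
        uint8_t data[32];
};

/* Toy checksum standing in for pblk_calc_emeta_crc(). */
static uint32_t calc_crc(const struct emeta_buf *buf)
{
        uint32_t crc = 0;
        size_t i;

        for (i = 0; i < sizeof(buf->data); i++)
                crc = crc * 31 + buf->data[i];
        return crc;
}

/* Same contract as the new pblk_recov_check_emeta(): 0 = usable, 1 = not. */
static int check_emeta(const struct emeta_buf *buf)
{
        if (buf->crc != calc_crc(buf))
                return 1;
        if (buf->magic != EMETA_MAGIC)
                return 1;
        return 0;
}

static void recover_from_emeta(void) { puts("recovering L2P from emeta"); }
static void recover_from_oob(void)   { puts("falling back to OOB scan"); }

int main(void)
{
        struct emeta_buf buf = { .magic = EMETA_MAGIC };

        memset(buf.data, 0xab, sizeof(buf.data));
        buf.crc = calc_crc(&buf);

        /* Mirrors the new flow in pblk_recov_l2p(): check first, then decide. */
        if (check_emeta(&buf))
                recover_from_oob();
        else
                recover_from_emeta();

        return 0;
}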
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index dacc71922260..0d457b162f23 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c | |||
@@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) | |||
89 | return atomic_read(&rl->free_blocks); | 89 | return atomic_read(&rl->free_blocks); |
90 | } | 90 | } |
91 | 91 | ||
92 | /* | 92 | unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) |
93 | * We check for (i) the number of free blocks in the current LUN and (ii) the | 93 | { |
94 | * total number of free blocks in the pblk instance. This is to even out the | 94 | return atomic_read(&rl->free_user_blocks); |
95 | * number of free blocks on each LUN when GC kicks in. | 95 | } |
96 | * | 96 | |
97 | * Only the total number of free blocks is used to configure the rate limiter. | 97 | static void __pblk_rl_update_rates(struct pblk_rl *rl, |
98 | */ | 98 | unsigned long free_blocks) |
99 | void pblk_rl_update_rates(struct pblk_rl *rl) | ||
100 | { | 99 | { |
101 | struct pblk *pblk = container_of(rl, struct pblk, rl); | 100 | struct pblk *pblk = container_of(rl, struct pblk, rl); |
102 | unsigned long free_blocks = pblk_rl_nr_free_blks(rl); | ||
103 | int max = rl->rb_budget; | 101 | int max = rl->rb_budget; |
104 | 102 | ||
105 | if (free_blocks >= rl->high) { | 103 | if (free_blocks >= rl->high) { |
@@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl) | |||
132 | pblk_gc_should_stop(pblk); | 130 | pblk_gc_should_stop(pblk); |
133 | } | 131 | } |
134 | 132 | ||
133 | void pblk_rl_update_rates(struct pblk_rl *rl) | ||
134 | { | ||
135 | __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); | ||
136 | } | ||
137 | |||
135 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) | 138 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) |
136 | { | 139 | { |
137 | int blk_in_line = atomic_read(&line->blk_in_line); | 140 | int blk_in_line = atomic_read(&line->blk_in_line); |
141 | int free_blocks; | ||
138 | 142 | ||
139 | atomic_add(blk_in_line, &rl->free_blocks); | 143 | atomic_add(blk_in_line, &rl->free_blocks); |
140 | pblk_rl_update_rates(rl); | 144 | free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); |
145 | |||
146 | __pblk_rl_update_rates(rl, free_blocks); | ||
141 | } | 147 | } |
142 | 148 | ||
143 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) | 149 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, |
150 | bool used) | ||
144 | { | 151 | { |
145 | int blk_in_line = atomic_read(&line->blk_in_line); | 152 | int blk_in_line = atomic_read(&line->blk_in_line); |
153 | int free_blocks; | ||
146 | 154 | ||
147 | atomic_sub(blk_in_line, &rl->free_blocks); | 155 | atomic_sub(blk_in_line, &rl->free_blocks); |
148 | pblk_rl_update_rates(rl); | 156 | |
157 | if (used) | ||
158 | free_blocks = atomic_sub_return(blk_in_line, | ||
159 | &rl->free_user_blocks); | ||
160 | else | ||
161 | free_blocks = atomic_read(&rl->free_user_blocks); | ||
162 | |||
163 | __pblk_rl_update_rates(rl, free_blocks); | ||
149 | } | 164 | } |
150 | 165 | ||
151 | int pblk_rl_high_thrs(struct pblk_rl *rl) | 166 | int pblk_rl_high_thrs(struct pblk_rl *rl) |
@@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl) | |||
174 | void pblk_rl_init(struct pblk_rl *rl, int budget) | 189 | void pblk_rl_init(struct pblk_rl *rl, int budget) |
175 | { | 190 | { |
176 | struct pblk *pblk = container_of(rl, struct pblk, rl); | 191 | struct pblk *pblk = container_of(rl, struct pblk, rl); |
192 | struct nvm_tgt_dev *dev = pblk->dev; | ||
193 | struct nvm_geo *geo = &dev->geo; | ||
194 | struct pblk_line_mgmt *l_mg = &pblk->l_mg; | ||
177 | struct pblk_line_meta *lm = &pblk->lm; | 195 | struct pblk_line_meta *lm = &pblk->lm; |
178 | int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; | 196 | int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; |
197 | int sec_meta, blk_meta; | ||
198 | |||
179 | unsigned int rb_windows; | 199 | unsigned int rb_windows; |
180 | 200 | ||
181 | rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; | 201 | /* Consider sectors used for metadata */ |
182 | rl->high_pw = get_count_order(rl->high); | 202 | sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; |
203 | blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk); | ||
183 | 204 | ||
184 | rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; | 205 | rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; |
185 | if (rl->low < min_blocks) | 206 | rl->high_pw = get_count_order(rl->high); |
186 | rl->low = min_blocks; | ||
187 | 207 | ||
188 | rl->rsv_blocks = min_blocks; | 208 | rl->rsv_blocks = min_blocks; |
189 | 209 | ||
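The rate-limiter rework above splits free_blocks (all free blocks, including over-provisioned ones) from free_user_blocks and drives the limiter from the latter, and pblk_rl_init() now derives the high watermark from the over-provisioning budget minus the blocks consumed by per-line metadata. A small sketch of that arithmetic; the geometry numbers (sec_per_chk, smeta/emeta sizes, op_blks) are invented for illustration:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        /* Illustrative geometry, not taken from any real device. */
        unsigned int sec_per_chk = 4096;        /* sectors per chunk/block */
        unsigned int smeta_sec = 8;             /* smeta sectors per line  */
        unsigned int emeta_sec0 = 64;           /* emeta sectors per line  */
        unsigned int blk_per_line = 64;
        unsigned int nr_free_lines = 1000;
        unsigned int op_blks = 7040;            /* blocks reserved as OP   */

        /* Sectors and blocks eaten by per-line metadata across all lines. */
        unsigned int sec_meta = (smeta_sec + emeta_sec0) * nr_free_lines;
        unsigned int blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);

        /*
         * High watermark for the rate limiter, as in the reworked
         * pblk_rl_init(): the OP budget minus metadata blocks and one
         * line's worth of headroom.
         */
        unsigned int high = op_blks - blk_meta - blk_per_line;

        printf("sec_meta=%u blk_meta=%u high=%u\n", sec_meta, blk_meta, high);
        return 0;
}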
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index cd49e8875d4e..620bab853579 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c | |||
@@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) | |||
28 | ssize_t sz = 0; | 28 | ssize_t sz = 0; |
29 | int i; | 29 | int i; |
30 | 30 | ||
31 | for (i = 0; i < geo->nr_luns; i++) { | 31 | for (i = 0; i < geo->all_luns; i++) { |
32 | int active = 1; | 32 | int active = 1; |
33 | 33 | ||
34 | rlun = &pblk->luns[i]; | 34 | rlun = &pblk->luns[i]; |
@@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) | |||
49 | 49 | ||
50 | static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) | 50 | static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) |
51 | { | 51 | { |
52 | int free_blocks, total_blocks; | 52 | int free_blocks, free_user_blocks, total_blocks; |
53 | int rb_user_max, rb_user_cnt; | 53 | int rb_user_max, rb_user_cnt; |
54 | int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; | 54 | int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; |
55 | 55 | ||
56 | free_blocks = atomic_read(&pblk->rl.free_blocks); | 56 | free_blocks = pblk_rl_nr_free_blks(&pblk->rl); |
57 | free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); | ||
57 | rb_user_max = pblk->rl.rb_user_max; | 58 | rb_user_max = pblk->rl.rb_user_max; |
58 | rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); | 59 | rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); |
59 | rb_gc_max = pblk->rl.rb_gc_max; | 60 | rb_gc_max = pblk->rl.rb_gc_max; |
@@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) | |||
64 | total_blocks = pblk->rl.total_blocks; | 65 | total_blocks = pblk->rl.total_blocks; |
65 | 66 | ||
66 | return snprintf(page, PAGE_SIZE, | 67 | return snprintf(page, PAGE_SIZE, |
67 | "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", | 68 | "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", |
68 | rb_user_cnt, | 69 | rb_user_cnt, |
69 | rb_user_max, | 70 | rb_user_max, |
70 | rb_gc_cnt, | 71 | rb_gc_cnt, |
71 | rb_gc_max, | 72 | rb_gc_max, |
72 | rb_state, | 73 | rb_state, |
73 | rb_budget, | 74 | rb_budget, |
74 | pblk->rl.low, | ||
75 | pblk->rl.high, | 75 | pblk->rl.high, |
76 | free_blocks, | 76 | free_blocks, |
77 | free_user_blocks, | ||
77 | total_blocks, | 78 | total_blocks, |
78 | READ_ONCE(pblk->rl.rb_user_active)); | 79 | READ_ONCE(pblk->rl.rb_user_active)); |
79 | } | 80 | } |
@@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) | |||
238 | 239 | ||
239 | sz = snprintf(page, PAGE_SIZE - sz, | 240 | sz = snprintf(page, PAGE_SIZE - sz, |
240 | "line: nluns:%d, nblks:%d, nsecs:%d\n", | 241 | "line: nluns:%d, nblks:%d, nsecs:%d\n", |
241 | geo->nr_luns, lm->blk_per_line, lm->sec_per_line); | 242 | geo->all_luns, lm->blk_per_line, lm->sec_per_line); |
242 | 243 | ||
243 | sz += snprintf(page + sz, PAGE_SIZE - sz, | 244 | sz += snprintf(page + sz, PAGE_SIZE - sz, |
244 | "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", | 245 | "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", |
@@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) | |||
287 | "blk_line:%d, sec_line:%d, sec_blk:%d\n", | 288 | "blk_line:%d, sec_line:%d, sec_blk:%d\n", |
288 | lm->blk_per_line, | 289 | lm->blk_per_line, |
289 | lm->sec_per_line, | 290 | lm->sec_per_line, |
290 | geo->sec_per_blk); | 291 | geo->sec_per_chk); |
291 | 292 | ||
292 | return sz; | 293 | return sz; |
293 | } | 294 | } |
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 6c1cafafef53..aae86ed60b98 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c | |||
@@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
21 | struct pblk_c_ctx *c_ctx) | 21 | struct pblk_c_ctx *c_ctx) |
22 | { | 22 | { |
23 | struct bio *original_bio; | 23 | struct bio *original_bio; |
24 | struct pblk_rb *rwb = &pblk->rwb; | ||
24 | unsigned long ret; | 25 | unsigned long ret; |
25 | int i; | 26 | int i; |
26 | 27 | ||
27 | for (i = 0; i < c_ctx->nr_valid; i++) { | 28 | for (i = 0; i < c_ctx->nr_valid; i++) { |
28 | struct pblk_w_ctx *w_ctx; | 29 | struct pblk_w_ctx *w_ctx; |
30 | int pos = c_ctx->sentry + i; | ||
31 | int flags; | ||
32 | |||
33 | w_ctx = pblk_rb_w_ctx(rwb, pos); | ||
34 | flags = READ_ONCE(w_ctx->flags); | ||
35 | |||
36 | if (flags & PBLK_FLUSH_ENTRY) { | ||
37 | flags &= ~PBLK_FLUSH_ENTRY; | ||
38 | /* Release flags on context. Protect from writes */ | ||
39 | smp_store_release(&w_ctx->flags, flags); | ||
40 | |||
41 | #ifdef CONFIG_NVM_DEBUG | ||
42 | atomic_dec(&rwb->inflight_flush_point); | ||
43 | #endif | ||
44 | } | ||
29 | 45 | ||
30 | w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i); | ||
31 | while ((original_bio = bio_list_pop(&w_ctx->bios))) | 46 | while ((original_bio = bio_list_pop(&w_ctx->bios))) |
32 | bio_endio(original_bio); | 47 | bio_endio(original_bio); |
33 | } | 48 | } |
@@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
439 | struct pblk_line *meta_line; | 454 | struct pblk_line *meta_line; |
440 | int err; | 455 | int err; |
441 | 456 | ||
442 | ppa_set_empty(&erase_ppa); | 457 | pblk_ppa_set_empty(&erase_ppa); |
443 | 458 | ||
444 | /* Assign lbas to ppas and populate request structure */ | 459 | /* Assign lbas to ppas and populate request structure */ |
445 | err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); | 460 | err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); |
@@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
457 | return NVM_IO_ERR; | 472 | return NVM_IO_ERR; |
458 | } | 473 | } |
459 | 474 | ||
460 | if (!ppa_empty(erase_ppa)) { | 475 | if (!pblk_ppa_empty(erase_ppa)) { |
461 | /* Submit erase for next data line */ | 476 | /* Submit erase for next data line */ |
462 | if (pblk_blk_erase_async(pblk, erase_ppa)) { | 477 | if (pblk_blk_erase_async(pblk, erase_ppa)) { |
463 | struct pblk_line *e_line = pblk_line_get_erase(pblk); | 478 | struct pblk_line *e_line = pblk_line_get_erase(pblk); |
@@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk) | |||
508 | if (!secs_avail) | 523 | if (!secs_avail) |
509 | return 1; | 524 | return 1; |
510 | 525 | ||
511 | secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb); | 526 | secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); |
512 | if (!secs_to_flush && secs_avail < pblk->min_write_pgs) | 527 | if (!secs_to_flush && secs_avail < pblk->min_write_pgs) |
513 | return 1; | 528 | return 1; |
514 | 529 | ||
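The pblk-write.c change moves the PBLK_FLUSH_ENTRY bookkeeping out of the ring-buffer read-out path and into pblk_end_w_bio(), clearing the flag with a release store once the entry has been persisted and before its queued bios are ended. A userspace sketch of that flag handling using C11 atomics in place of the kernel's smp_store_release(); the flag values and helper names are illustrative:

#include <stdatomic.h>
#include <stdio.h>

#define PBLK_FLUSH_ENTRY        (1 << 0)        /* illustrative flag bits */
#define PBLK_WRITTEN_DATA       (1 << 1)

/* Write-buffer entry context with atomically published flags. */
struct w_ctx {
        _Atomic int flags;
};

/*
 * On write completion, drop the flush marker from the entry before its
 * queued bios would be ended, mirroring the flag handling the patch
 * moves into pblk_end_w_bio().
 */
static void complete_entry(struct w_ctx *w_ctx)
{
        int flags = atomic_load_explicit(&w_ctx->flags, memory_order_acquire);

        if (flags & PBLK_FLUSH_ENTRY) {
                flags &= ~PBLK_FLUSH_ENTRY;
                /* Release: readers of flags also see the prior writes. */
                atomic_store_explicit(&w_ctx->flags, flags,
                                      memory_order_release);
        }
}

int main(void)
{
        struct w_ctx ctx;

        atomic_init(&ctx.flags, PBLK_FLUSH_ENTRY | PBLK_WRITTEN_DATA);
        complete_entry(&ctx);
        printf("flags=%#x\n", (unsigned int)atomic_load(&ctx.flags)); /* 0x2 */
        return 0;
}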
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 59a64d461a5d..8c357fb6538e 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h | |||
@@ -51,17 +51,16 @@ | |||
51 | 51 | ||
52 | #define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) | 52 | #define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) |
53 | 53 | ||
54 | #define pblk_for_each_lun(pblk, rlun, i) \ | ||
55 | for ((i) = 0, rlun = &(pblk)->luns[0]; \ | ||
56 | (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) | ||
57 | |||
58 | /* Static pool sizes */ | 54 | /* Static pool sizes */ |
59 | #define PBLK_GEN_WS_POOL_SIZE (2) | 55 | #define PBLK_GEN_WS_POOL_SIZE (2) |
60 | 56 | ||
57 | #define PBLK_DEFAULT_OP (11) | ||
58 | |||
61 | enum { | 59 | enum { |
62 | PBLK_READ = READ, | 60 | PBLK_READ = READ, |
63 | PBLK_WRITE = WRITE,/* Write from write buffer */ | 61 | PBLK_WRITE = WRITE,/* Write from write buffer */ |
64 | PBLK_WRITE_INT, /* Internal write - no write buffer */ | 62 | PBLK_WRITE_INT, /* Internal write - no write buffer */ |
63 | PBLK_READ_RECOV, /* Recovery read - errors allowed */ | ||
65 | PBLK_ERASE, | 64 | PBLK_ERASE, |
66 | }; | 65 | }; |
67 | 66 | ||
@@ -114,6 +113,7 @@ struct pblk_c_ctx { | |||
114 | /* read context */ | 113 | /* read context */ |
115 | struct pblk_g_ctx { | 114 | struct pblk_g_ctx { |
116 | void *private; | 115 | void *private; |
116 | unsigned long start_time; | ||
117 | u64 lba; | 117 | u64 lba; |
118 | }; | 118 | }; |
119 | 119 | ||
@@ -170,7 +170,7 @@ struct pblk_rb { | |||
170 | * the last submitted entry that has | 170 | * the last submitted entry that has |
171 | * been successfully persisted to media | 171 | * been successfully persisted to media |
172 | */ | 172 | */ |
173 | unsigned int sync_point; /* Sync point - last entry that must be | 173 | unsigned int flush_point; /* Sync point - last entry that must be |
174 | * flushed to the media. Used with | 174 | * flushed to the media. Used with |
175 | * REQ_FLUSH and REQ_FUA | 175 | * REQ_FLUSH and REQ_FUA |
176 | */ | 176 | */ |
@@ -193,7 +193,7 @@ struct pblk_rb { | |||
193 | spinlock_t s_lock; /* Sync lock */ | 193 | spinlock_t s_lock; /* Sync lock */ |
194 | 194 | ||
195 | #ifdef CONFIG_NVM_DEBUG | 195 | #ifdef CONFIG_NVM_DEBUG |
196 | atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */ | 196 | atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ |
197 | #endif | 197 | #endif |
198 | }; | 198 | }; |
199 | 199 | ||
@@ -256,9 +256,6 @@ struct pblk_rl { | |||
256 | unsigned int high; /* Upper threshold for rate limiter (free run - | 256 | unsigned int high; /* Upper threshold for rate limiter (free run - |
257 | * user I/O rate limiter | 257 | * user I/O rate limiter |
258 | */ | 258 | */ |
259 | unsigned int low; /* Lower threshold for rate limiter (user I/O | ||
260 | * rate limiter - stall) | ||
261 | */ | ||
262 | unsigned int high_pw; /* High rounded up as a power of 2 */ | 259 | unsigned int high_pw; /* High rounded up as a power of 2 */ |
263 | 260 | ||
264 | #define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ | 261 | #define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ |
@@ -292,7 +289,9 @@ struct pblk_rl { | |||
292 | 289 | ||
293 | unsigned long long nr_secs; | 290 | unsigned long long nr_secs; |
294 | unsigned long total_blocks; | 291 | unsigned long total_blocks; |
295 | atomic_t free_blocks; | 292 | |
293 | atomic_t free_blocks; /* Total number of free blocks (+ OP) */ | ||
294 | atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ | ||
296 | }; | 295 | }; |
297 | 296 | ||
298 | #define PBLK_LINE_EMPTY (~0U) | 297 | #define PBLK_LINE_EMPTY (~0U) |
@@ -583,7 +582,9 @@ struct pblk { | |||
583 | */ | 582 | */ |
584 | 583 | ||
585 | sector_t capacity; /* Device capacity when bad blocks are subtracted */ | 584 | sector_t capacity; /* Device capacity when bad blocks are subtracted */ |
586 | int over_pct; /* Percentage of device used for over-provisioning */ | 585 | |
586 | int op; /* Percentage of device used for over-provisioning */ | ||
587 | int op_blks; /* Number of blocks used for over-provisioning */ | ||
587 | 588 | ||
588 | /* pblk provisioning values. Used by rate limiter */ | 589 | /* pblk provisioning values. Used by rate limiter */ |
589 | struct pblk_rl rl; | 590 | struct pblk_rl rl; |
@@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); | |||
691 | struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, | 692 | struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, |
692 | struct ppa_addr *ppa); | 693 | struct ppa_addr *ppa); |
693 | void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); | 694 | void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); |
694 | unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); | 695 | unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); |
695 | 696 | ||
696 | unsigned int pblk_rb_read_count(struct pblk_rb *rb); | 697 | unsigned int pblk_rb_read_count(struct pblk_rb *rb); |
697 | unsigned int pblk_rb_sync_count(struct pblk_rb *rb); | 698 | unsigned int pblk_rb_sync_count(struct pblk_rb *rb); |
@@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); | |||
812 | void pblk_submit_rec(struct work_struct *work); | 813 | void pblk_submit_rec(struct work_struct *work); |
813 | struct pblk_line *pblk_recov_l2p(struct pblk *pblk); | 814 | struct pblk_line *pblk_recov_l2p(struct pblk *pblk); |
814 | int pblk_recov_pad(struct pblk *pblk); | 815 | int pblk_recov_pad(struct pblk *pblk); |
815 | __le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); | 816 | int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); |
816 | int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, | 817 | int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, |
817 | struct pblk_rec_ctx *recovery, u64 *comp_bits, | 818 | struct pblk_rec_ctx *recovery, u64 *comp_bits, |
818 | unsigned int comp); | 819 | unsigned int comp); |
@@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl); | |||
843 | void pblk_rl_update_rates(struct pblk_rl *rl); | 844 | void pblk_rl_update_rates(struct pblk_rl *rl); |
844 | int pblk_rl_high_thrs(struct pblk_rl *rl); | 845 | int pblk_rl_high_thrs(struct pblk_rl *rl); |
845 | unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); | 846 | unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); |
847 | unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); | ||
846 | int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); | 848 | int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); |
847 | void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); | 849 | void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); |
848 | void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); | 850 | void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); |
@@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); | |||
851 | void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); | 853 | void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); |
852 | int pblk_rl_max_io(struct pblk_rl *rl); | 854 | int pblk_rl_max_io(struct pblk_rl *rl); |
853 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); | 855 | void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); |
854 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); | 856 | void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, |
857 | bool used); | ||
855 | int pblk_rl_is_limit(struct pblk_rl *rl); | 858 | int pblk_rl_is_limit(struct pblk_rl *rl); |
856 | 859 | ||
857 | /* | 860 | /* |
@@ -907,28 +910,47 @@ static inline int pblk_pad_distance(struct pblk *pblk) | |||
907 | struct nvm_tgt_dev *dev = pblk->dev; | 910 | struct nvm_tgt_dev *dev = pblk->dev; |
908 | struct nvm_geo *geo = &dev->geo; | 911 | struct nvm_geo *geo = &dev->geo; |
909 | 912 | ||
910 | return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl; | 913 | return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl; |
911 | } | 914 | } |
912 | 915 | ||
913 | static inline int pblk_dev_ppa_to_line(struct ppa_addr p) | 916 | static inline int pblk_ppa_to_line(struct ppa_addr p) |
914 | { | 917 | { |
915 | return p.g.blk; | 918 | return p.g.blk; |
916 | } | 919 | } |
917 | 920 | ||
918 | static inline int pblk_tgt_ppa_to_line(struct ppa_addr p) | 921 | static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) |
919 | { | 922 | { |
920 | return p.g.blk; | 923 | return p.g.lun * geo->nr_chnls + p.g.ch; |
921 | } | 924 | } |
922 | 925 | ||
923 | static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) | 926 | static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, |
927 | u64 line_id) | ||
924 | { | 928 | { |
925 | return p.g.lun * geo->nr_chnls + p.g.ch; | 929 | struct ppa_addr ppa; |
930 | |||
931 | ppa.ppa = 0; | ||
932 | ppa.g.blk = line_id; | ||
933 | ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset; | ||
934 | ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset; | ||
935 | ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset; | ||
936 | ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset; | ||
937 | ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset; | ||
938 | |||
939 | return ppa; | ||
926 | } | 940 | } |
927 | 941 | ||
928 | /* A block within a line corresponds to the lun */ | 942 | static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, |
929 | static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) | 943 | struct ppa_addr p) |
930 | { | 944 | { |
931 | return p.g.lun * geo->nr_chnls + p.g.ch; | 945 | u64 paddr; |
946 | |||
947 | paddr = (u64)p.g.pg << pblk->ppaf.pg_offset; | ||
948 | paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset; | ||
949 | paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset; | ||
950 | paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset; | ||
951 | paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset; | ||
952 | |||
953 | return paddr; | ||
932 | } | 954 | } |
933 | 955 | ||
934 | static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) | 956 | static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) |
@@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) | |||
960 | return ppa64; | 982 | return ppa64; |
961 | } | 983 | } |
962 | 984 | ||
963 | static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, | ||
964 | sector_t lba) | ||
965 | { | ||
966 | struct ppa_addr ppa; | ||
967 | |||
968 | if (pblk->ppaf_bitsize < 32) { | ||
969 | u32 *map = (u32 *)pblk->trans_map; | ||
970 | |||
971 | ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); | ||
972 | } else { | ||
973 | struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; | ||
974 | |||
975 | ppa = map[lba]; | ||
976 | } | ||
977 | |||
978 | return ppa; | ||
979 | } | ||
980 | |||
981 | static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) | 985 | static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) |
982 | { | 986 | { |
983 | u32 ppa32 = 0; | 987 | u32 ppa32 = 0; |
@@ -999,33 +1003,36 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) | |||
999 | return ppa32; | 1003 | return ppa32; |
1000 | } | 1004 | } |
1001 | 1005 | ||
1002 | static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, | 1006 | static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, |
1003 | struct ppa_addr ppa) | 1007 | sector_t lba) |
1004 | { | 1008 | { |
1009 | struct ppa_addr ppa; | ||
1010 | |||
1005 | if (pblk->ppaf_bitsize < 32) { | 1011 | if (pblk->ppaf_bitsize < 32) { |
1006 | u32 *map = (u32 *)pblk->trans_map; | 1012 | u32 *map = (u32 *)pblk->trans_map; |
1007 | 1013 | ||
1008 | map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); | 1014 | ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); |
1009 | } else { | 1015 | } else { |
1010 | u64 *map = (u64 *)pblk->trans_map; | 1016 | struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; |
1011 | 1017 | ||
1012 | map[lba] = ppa.ppa; | 1018 | ppa = map[lba]; |
1013 | } | 1019 | } |
1020 | |||
1021 | return ppa; | ||
1014 | } | 1022 | } |
1015 | 1023 | ||
1016 | static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, | 1024 | static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, |
1017 | struct ppa_addr p) | 1025 | struct ppa_addr ppa) |
1018 | { | 1026 | { |
1019 | u64 paddr; | 1027 | if (pblk->ppaf_bitsize < 32) { |
1028 | u32 *map = (u32 *)pblk->trans_map; | ||
1020 | 1029 | ||
1021 | paddr = 0; | 1030 | map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); |
1022 | paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset; | 1031 | } else { |
1023 | paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset; | 1032 | u64 *map = (u64 *)pblk->trans_map; |
1024 | paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset; | ||
1025 | paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset; | ||
1026 | paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset; | ||
1027 | 1033 | ||
1028 | return paddr; | 1034 | map[lba] = ppa.ppa; |
1035 | } | ||
1029 | } | 1036 | } |
1030 | 1037 | ||
1031 | static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) | 1038 | static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) |
@@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) | |||
1040 | 1047 | ||
1041 | static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) | 1048 | static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) |
1042 | { | 1049 | { |
1043 | if (lppa.ppa == rppa.ppa) | 1050 | return (lppa.ppa == rppa.ppa); |
1044 | return true; | ||
1045 | |||
1046 | return false; | ||
1047 | } | 1051 | } |
1048 | 1052 | ||
1049 | static inline int pblk_addr_in_cache(struct ppa_addr ppa) | 1053 | static inline int pblk_addr_in_cache(struct ppa_addr ppa) |
@@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr) | |||
1066 | return p; | 1070 | return p; |
1067 | } | 1071 | } |
1068 | 1072 | ||
1069 | static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, | ||
1070 | u64 line_id) | ||
1071 | { | ||
1072 | struct ppa_addr ppa; | ||
1073 | |||
1074 | ppa.ppa = 0; | ||
1075 | ppa.g.blk = line_id; | ||
1076 | ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset; | ||
1077 | ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset; | ||
1078 | ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset; | ||
1079 | ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset; | ||
1080 | ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset; | ||
1081 | |||
1082 | return ppa; | ||
1083 | } | ||
1084 | |||
1085 | static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr, | ||
1086 | u64 line_id) | ||
1087 | { | ||
1088 | struct ppa_addr ppa; | ||
1089 | |||
1090 | ppa = addr_to_gen_ppa(pblk, paddr, line_id); | ||
1091 | |||
1092 | return ppa; | ||
1093 | } | ||
1094 | |||
1095 | static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, | 1073 | static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, |
1096 | struct line_header *header) | 1074 | struct line_header *header) |
1097 | { | 1075 | { |
@@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, | |||
1212 | 1190 | ||
1213 | if (!ppa->c.is_cached && | 1191 | if (!ppa->c.is_cached && |
1214 | ppa->g.ch < geo->nr_chnls && | 1192 | ppa->g.ch < geo->nr_chnls && |
1215 | ppa->g.lun < geo->luns_per_chnl && | 1193 | ppa->g.lun < geo->nr_luns && |
1216 | ppa->g.pl < geo->nr_planes && | 1194 | ppa->g.pl < geo->nr_planes && |
1217 | ppa->g.blk < geo->blks_per_lun && | 1195 | ppa->g.blk < geo->nr_chks && |
1218 | ppa->g.pg < geo->pgs_per_blk && | 1196 | ppa->g.pg < geo->ws_per_chk && |
1219 | ppa->g.sec < geo->sec_per_pg) | 1197 | ppa->g.sec < geo->sec_per_pg) |
1220 | continue; | 1198 | continue; |
1221 | 1199 | ||
@@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) | |||
1245 | 1223 | ||
1246 | for (i = 0; i < rqd->nr_ppas; i++) { | 1224 | for (i = 0; i < rqd->nr_ppas; i++) { |
1247 | ppa = ppa_list[i]; | 1225 | ppa = ppa_list[i]; |
1248 | line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; | 1226 | line = &pblk->lines[pblk_ppa_to_line(ppa)]; |
1249 | 1227 | ||
1250 | spin_lock(&line->lock); | 1228 | spin_lock(&line->lock); |
1251 | if (line->state != PBLK_LINESTATE_OPEN) { | 1229 | if (line->state != PBLK_LINESTATE_OPEN) { |
@@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio) | |||
1288 | return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; | 1266 | return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; |
1289 | } | 1267 | } |
1290 | 1268 | ||
1291 | static inline sector_t pblk_get_sector(sector_t lba) | ||
1292 | { | ||
1293 | return lba * NR_PHY_IN_LOG; | ||
1294 | } | ||
1295 | |||
1296 | static inline void pblk_setup_uuid(struct pblk *pblk) | 1269 | static inline void pblk_setup_uuid(struct pblk *pblk) |
1297 | { | 1270 | { |
1298 | uuid_le uuid; | 1271 | uuid_le uuid; |
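The pblk.h hunks collapse the duplicated PPA helpers into a single pblk_ppa_to_line()/pblk_ppa_to_pos() pair and keep one addr_to_gen_ppa()/pblk_dev_ppa_to_line_addr() couple that packs and unpacks a line-local address through the ppaf masks and offsets. A standalone sketch of that mask/offset round trip; the field widths below are made up, since the real layout is derived from the device geometry:

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative address format: sec:2 pl:1 ch:3 lun:2 pg:8 bits.
 * The real offsets/masks come from the device geometry; these values
 * are invented for the example.
 */
#define SEC_OFFSET      0
#define SEC_MASK        (0x3ull << SEC_OFFSET)
#define PLN_OFFSET      2
#define PLN_MASK        (0x1ull << PLN_OFFSET)
#define CH_OFFSET       3
#define CH_MASK         (0x7ull << CH_OFFSET)
#define LUN_OFFSET      6
#define LUN_MASK        (0x3ull << LUN_OFFSET)
#define PG_OFFSET       8
#define PG_MASK         (0xffull << PG_OFFSET)

struct gen_ppa {
        unsigned int pg, lun, ch, pl, sec;
};

/* Like addr_to_gen_ppa(): split a line-local address into PPA fields. */
static struct gen_ppa addr_to_gen_ppa(uint64_t paddr)
{
        struct gen_ppa p;

        p.pg  = (paddr & PG_MASK)  >> PG_OFFSET;
        p.lun = (paddr & LUN_MASK) >> LUN_OFFSET;
        p.ch  = (paddr & CH_MASK)  >> CH_OFFSET;
        p.pl  = (paddr & PLN_MASK) >> PLN_OFFSET;
        p.sec = (paddr & SEC_MASK) >> SEC_OFFSET;
        return p;
}

/* Like pblk_dev_ppa_to_line_addr(): pack the fields back into an address. */
static uint64_t gen_ppa_to_addr(struct gen_ppa p)
{
        uint64_t paddr = 0;

        paddr |= (uint64_t)p.pg  << PG_OFFSET;
        paddr |= (uint64_t)p.lun << LUN_OFFSET;
        paddr |= (uint64_t)p.ch  << CH_OFFSET;
        paddr |= (uint64_t)p.pl  << PLN_OFFSET;
        paddr |= (uint64_t)p.sec << SEC_OFFSET;
        return paddr;
}

int main(void)
{
        uint64_t paddr = 0x2d7;         /* arbitrary line-local address */
        struct gen_ppa p = addr_to_gen_ppa(paddr);

        printf("pg=%u lun=%u ch=%u pl=%u sec=%u round-trip=%#llx\n",
               p.pg, p.lun, p.ch, p.pl, p.sec,
               (unsigned long long)gen_ppa_to_addr(p));
        return 0;
}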
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c deleted file mode 100644 index 0993c14be860..000000000000 --- a/drivers/lightnvm/rrpc.c +++ /dev/null | |||
@@ -1,1625 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2015 IT University of Copenhagen | ||
3 | * Initial release: Matias Bjorling <m@bjorling.me> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License version | ||
7 | * 2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, but | ||
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs. | ||
15 | */ | ||
16 | |||
17 | #include "rrpc.h" | ||
18 | |||
19 | static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache; | ||
20 | static DECLARE_RWSEM(rrpc_lock); | ||
21 | |||
22 | static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, | ||
23 | struct nvm_rq *rqd, unsigned long flags); | ||
24 | |||
25 | #define rrpc_for_each_lun(rrpc, rlun, i) \ | ||
26 | for ((i) = 0, rlun = &(rrpc)->luns[0]; \ | ||
27 | (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)]) | ||
28 | |||
29 | static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a) | ||
30 | { | ||
31 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
32 | struct rrpc_block *rblk = a->rblk; | ||
33 | unsigned int pg_offset; | ||
34 | |||
35 | lockdep_assert_held(&rrpc->rev_lock); | ||
36 | |||
37 | if (a->addr == ADDR_EMPTY || !rblk) | ||
38 | return; | ||
39 | |||
40 | spin_lock(&rblk->lock); | ||
41 | |||
42 | div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset); | ||
43 | WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages)); | ||
44 | rblk->nr_invalid_pages++; | ||
45 | |||
46 | spin_unlock(&rblk->lock); | ||
47 | |||
48 | rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY; | ||
49 | } | ||
50 | |||
51 | static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba, | ||
52 | unsigned int len) | ||
53 | { | ||
54 | sector_t i; | ||
55 | |||
56 | spin_lock(&rrpc->rev_lock); | ||
57 | for (i = slba; i < slba + len; i++) { | ||
58 | struct rrpc_addr *gp = &rrpc->trans_map[i]; | ||
59 | |||
60 | rrpc_page_invalidate(rrpc, gp); | ||
61 | gp->rblk = NULL; | ||
62 | } | ||
63 | spin_unlock(&rrpc->rev_lock); | ||
64 | } | ||
65 | |||
66 | static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc, | ||
67 | sector_t laddr, unsigned int pages) | ||
68 | { | ||
69 | struct nvm_rq *rqd; | ||
70 | struct rrpc_inflight_rq *inf; | ||
71 | |||
72 | rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC); | ||
73 | if (!rqd) | ||
74 | return ERR_PTR(-ENOMEM); | ||
75 | |||
76 | inf = rrpc_get_inflight_rq(rqd); | ||
77 | if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) { | ||
78 | mempool_free(rqd, rrpc->rq_pool); | ||
79 | return NULL; | ||
80 | } | ||
81 | |||
82 | return rqd; | ||
83 | } | ||
84 | |||
85 | static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq *rqd) | ||
86 | { | ||
87 | struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd); | ||
88 | |||
89 | rrpc_unlock_laddr(rrpc, inf); | ||
90 | |||
91 | mempool_free(rqd, rrpc->rq_pool); | ||
92 | } | ||
93 | |||
94 | static void rrpc_discard(struct rrpc *rrpc, struct bio *bio) | ||
95 | { | ||
96 | sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG; | ||
97 | sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; | ||
98 | struct nvm_rq *rqd; | ||
99 | |||
100 | while (1) { | ||
101 | rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len); | ||
102 | if (rqd) | ||
103 | break; | ||
104 | |||
105 | schedule(); | ||
106 | } | ||
107 | |||
108 | if (IS_ERR(rqd)) { | ||
109 | pr_err("rrpc: unable to acquire inflight IO\n"); | ||
110 | bio_io_error(bio); | ||
111 | return; | ||
112 | } | ||
113 | |||
114 | rrpc_invalidate_range(rrpc, slba, len); | ||
115 | rrpc_inflight_laddr_release(rrpc, rqd); | ||
116 | } | ||
117 | |||
118 | static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
119 | { | ||
120 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
121 | |||
122 | return (rblk->next_page == dev->geo.sec_per_blk); | ||
123 | } | ||
124 | |||
125 | /* Calculate relative addr for the given block, considering instantiated LUNs */ | ||
126 | static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
127 | { | ||
128 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
129 | struct rrpc_lun *rlun = rblk->rlun; | ||
130 | |||
131 | return rlun->id * dev->geo.sec_per_blk; | ||
132 | } | ||
133 | |||
134 | static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev, | ||
135 | struct rrpc_addr *gp) | ||
136 | { | ||
137 | struct rrpc_block *rblk = gp->rblk; | ||
138 | struct rrpc_lun *rlun = rblk->rlun; | ||
139 | u64 addr = gp->addr; | ||
140 | struct ppa_addr paddr; | ||
141 | |||
142 | paddr.ppa = addr; | ||
143 | paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr); | ||
144 | paddr.g.ch = rlun->bppa.g.ch; | ||
145 | paddr.g.lun = rlun->bppa.g.lun; | ||
146 | paddr.g.blk = rblk->id; | ||
147 | |||
148 | return paddr; | ||
149 | } | ||
150 | |||
151 | /* requires lun->lock taken */ | ||
152 | static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk, | ||
153 | struct rrpc_block **cur_rblk) | ||
154 | { | ||
155 | struct rrpc *rrpc = rlun->rrpc; | ||
156 | |||
157 | if (*cur_rblk) { | ||
158 | spin_lock(&(*cur_rblk)->lock); | ||
159 | WARN_ON(!block_is_full(rrpc, *cur_rblk)); | ||
160 | spin_unlock(&(*cur_rblk)->lock); | ||
161 | } | ||
162 | *cur_rblk = new_rblk; | ||
163 | } | ||
164 | |||
165 | static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc, | ||
166 | struct rrpc_lun *rlun) | ||
167 | { | ||
168 | struct rrpc_block *rblk = NULL; | ||
169 | |||
170 | if (list_empty(&rlun->free_list)) | ||
171 | goto out; | ||
172 | |||
173 | rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list); | ||
174 | |||
175 | list_move_tail(&rblk->list, &rlun->used_list); | ||
176 | rblk->state = NVM_BLK_ST_TGT; | ||
177 | rlun->nr_free_blocks--; | ||
178 | |||
179 | out: | ||
180 | return rblk; | ||
181 | } | ||
182 | |||
183 | static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, | ||
184 | unsigned long flags) | ||
185 | { | ||
186 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
187 | struct rrpc_block *rblk; | ||
188 | int is_gc = flags & NVM_IOTYPE_GC; | ||
189 | |||
190 | spin_lock(&rlun->lock); | ||
191 | if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) { | ||
192 | pr_err("nvm: rrpc: cannot give block to non GC request\n"); | ||
193 | spin_unlock(&rlun->lock); | ||
194 | return NULL; | ||
195 | } | ||
196 | |||
197 | rblk = __rrpc_get_blk(rrpc, rlun); | ||
198 | if (!rblk) { | ||
199 | pr_err("nvm: rrpc: cannot get new block\n"); | ||
200 | spin_unlock(&rlun->lock); | ||
201 | return NULL; | ||
202 | } | ||
203 | spin_unlock(&rlun->lock); | ||
204 | |||
205 | bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk); | ||
206 | rblk->next_page = 0; | ||
207 | rblk->nr_invalid_pages = 0; | ||
208 | atomic_set(&rblk->data_cmnt_size, 0); | ||
209 | |||
210 | return rblk; | ||
211 | } | ||
212 | |||
213 | static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
214 | { | ||
215 | struct rrpc_lun *rlun = rblk->rlun; | ||
216 | |||
217 | spin_lock(&rlun->lock); | ||
218 | if (rblk->state & NVM_BLK_ST_TGT) { | ||
219 | list_move_tail(&rblk->list, &rlun->free_list); | ||
220 | rlun->nr_free_blocks++; | ||
221 | rblk->state = NVM_BLK_ST_FREE; | ||
222 | } else if (rblk->state & NVM_BLK_ST_BAD) { | ||
223 | list_move_tail(&rblk->list, &rlun->bb_list); | ||
224 | rblk->state = NVM_BLK_ST_BAD; | ||
225 | } else { | ||
226 | WARN_ON_ONCE(1); | ||
227 | pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n", | ||
228 | rlun->bppa.g.ch, rlun->bppa.g.lun, | ||
229 | rblk->id, rblk->state); | ||
230 | list_move_tail(&rblk->list, &rlun->bb_list); | ||
231 | } | ||
232 | spin_unlock(&rlun->lock); | ||
233 | } | ||
234 | |||
235 | static void rrpc_put_blks(struct rrpc *rrpc) | ||
236 | { | ||
237 | struct rrpc_lun *rlun; | ||
238 | int i; | ||
239 | |||
240 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
241 | rlun = &rrpc->luns[i]; | ||
242 | if (rlun->cur) | ||
243 | rrpc_put_blk(rrpc, rlun->cur); | ||
244 | if (rlun->gc_cur) | ||
245 | rrpc_put_blk(rrpc, rlun->gc_cur); | ||
246 | } | ||
247 | } | ||
248 | |||
249 | static struct rrpc_lun *get_next_lun(struct rrpc *rrpc) | ||
250 | { | ||
251 | int next = atomic_inc_return(&rrpc->next_lun); | ||
252 | |||
253 | return &rrpc->luns[next % rrpc->nr_luns]; | ||
254 | } | ||
255 | |||
256 | static void rrpc_gc_kick(struct rrpc *rrpc) | ||
257 | { | ||
258 | struct rrpc_lun *rlun; | ||
259 | unsigned int i; | ||
260 | |||
261 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
262 | rlun = &rrpc->luns[i]; | ||
263 | queue_work(rrpc->krqd_wq, &rlun->ws_gc); | ||
264 | } | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * timed GC every interval. | ||
269 | */ | ||
270 | static void rrpc_gc_timer(struct timer_list *t) | ||
271 | { | ||
272 | struct rrpc *rrpc = from_timer(rrpc, t, gc_timer); | ||
273 | |||
274 | rrpc_gc_kick(rrpc); | ||
275 | mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10)); | ||
276 | } | ||
277 | |||
278 | static void rrpc_end_sync_bio(struct bio *bio) | ||
279 | { | ||
280 | struct completion *waiting = bio->bi_private; | ||
281 | |||
282 | if (bio->bi_status) | ||
283 | pr_err("nvm: gc request failed (%u).\n", bio->bi_status); | ||
284 | |||
285 | complete(waiting); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * rrpc_move_valid_pages -- migrate live data off the block | ||
290 | * @rrpc: the 'rrpc' structure | ||
291 | * @rblk: the block from which to migrate live pages | ||
292 | * | ||
293 | * Description: | ||
294 | * GC algorithms may call this function to migrate remaining live | ||
295 | * pages off the block prior to erasing it. This function blocks | ||
296 | * further execution until the operation is complete. | ||
297 | */ | ||
298 | static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
299 | { | ||
300 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
301 | struct request_queue *q = dev->q; | ||
302 | struct rrpc_rev_addr *rev; | ||
303 | struct nvm_rq *rqd; | ||
304 | struct bio *bio; | ||
305 | struct page *page; | ||
306 | int slot; | ||
307 | int nr_sec_per_blk = dev->geo.sec_per_blk; | ||
308 | u64 phys_addr; | ||
309 | DECLARE_COMPLETION_ONSTACK(wait); | ||
310 | |||
311 | if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) | ||
312 | return 0; | ||
313 | |||
314 | bio = bio_alloc(GFP_NOIO, 1); | ||
315 | if (!bio) { | ||
316 | pr_err("nvm: could not alloc bio to gc\n"); | ||
317 | return -ENOMEM; | ||
318 | } | ||
319 | |||
320 | page = mempool_alloc(rrpc->page_pool, GFP_NOIO); | ||
321 | |||
322 | while ((slot = find_first_zero_bit(rblk->invalid_pages, | ||
323 | nr_sec_per_blk)) < nr_sec_per_blk) { | ||
324 | |||
325 | /* Lock laddr */ | ||
326 | phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot; | ||
327 | |||
328 | try: | ||
329 | spin_lock(&rrpc->rev_lock); | ||
330 | /* Get logical address from physical to logical table */ | ||
331 | rev = &rrpc->rev_trans_map[phys_addr]; | ||
332 | /* already updated by previous regular write */ | ||
333 | if (rev->addr == ADDR_EMPTY) { | ||
334 | spin_unlock(&rrpc->rev_lock); | ||
335 | continue; | ||
336 | } | ||
337 | |||
338 | rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1); | ||
339 | if (IS_ERR_OR_NULL(rqd)) { | ||
340 | spin_unlock(&rrpc->rev_lock); | ||
341 | schedule(); | ||
342 | goto try; | ||
343 | } | ||
344 | |||
345 | spin_unlock(&rrpc->rev_lock); | ||
346 | |||
347 | /* Perform read to do GC */ | ||
348 | bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr); | ||
349 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | ||
350 | bio->bi_private = &wait; | ||
351 | bio->bi_end_io = rrpc_end_sync_bio; | ||
352 | |||
353 | /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */ | ||
354 | bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0); | ||
355 | |||
356 | if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) { | ||
357 | pr_err("rrpc: gc read failed.\n"); | ||
358 | rrpc_inflight_laddr_release(rrpc, rqd); | ||
359 | goto finished; | ||
360 | } | ||
361 | wait_for_completion_io(&wait); | ||
362 | if (bio->bi_status) { | ||
363 | rrpc_inflight_laddr_release(rrpc, rqd); | ||
364 | goto finished; | ||
365 | } | ||
366 | |||
367 | bio_reset(bio); | ||
368 | reinit_completion(&wait); | ||
369 | |||
370 | bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr); | ||
371 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | ||
372 | bio->bi_private = &wait; | ||
373 | bio->bi_end_io = rrpc_end_sync_bio; | ||
374 | |||
375 | bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0); | ||
376 | |||
377 | /* turn the command around and write the data back to a new | ||
378 | * address | ||
379 | */ | ||
380 | if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) { | ||
381 | pr_err("rrpc: gc write failed.\n"); | ||
382 | rrpc_inflight_laddr_release(rrpc, rqd); | ||
383 | goto finished; | ||
384 | } | ||
385 | wait_for_completion_io(&wait); | ||
386 | |||
387 | rrpc_inflight_laddr_release(rrpc, rqd); | ||
388 | if (bio->bi_status) | ||
389 | goto finished; | ||
390 | |||
391 | bio_reset(bio); | ||
392 | } | ||
393 | |||
394 | finished: | ||
395 | mempool_free(page, rrpc->page_pool); | ||
396 | bio_put(bio); | ||
397 | |||
398 | if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) { | ||
399 | pr_err("nvm: failed to garbage collect block\n"); | ||
400 | return -EIO; | ||
401 | } | ||
402 | |||
403 | return 0; | ||
404 | } | ||
405 | |||
406 | static void rrpc_block_gc(struct work_struct *work) | ||
407 | { | ||
408 | struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc, | ||
409 | ws_gc); | ||
410 | struct rrpc *rrpc = gcb->rrpc; | ||
411 | struct rrpc_block *rblk = gcb->rblk; | ||
412 | struct rrpc_lun *rlun = rblk->rlun; | ||
413 | struct ppa_addr ppa; | ||
414 | |||
415 | mempool_free(gcb, rrpc->gcb_pool); | ||
416 | pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n", | ||
417 | rlun->bppa.g.ch, rlun->bppa.g.lun, | ||
418 | rblk->id); | ||
419 | |||
420 | if (rrpc_move_valid_pages(rrpc, rblk)) | ||
421 | goto put_back; | ||
422 | |||
423 | ppa.ppa = 0; | ||
424 | ppa.g.ch = rlun->bppa.g.ch; | ||
425 | ppa.g.lun = rlun->bppa.g.lun; | ||
426 | ppa.g.blk = rblk->id; | ||
427 | |||
428 | if (nvm_erase_sync(rrpc->dev, &ppa, 1)) | ||
429 | goto put_back; | ||
430 | |||
431 | rrpc_put_blk(rrpc, rblk); | ||
432 | |||
433 | return; | ||
434 | |||
435 | put_back: | ||
436 | spin_lock(&rlun->lock); | ||
437 | list_add_tail(&rblk->prio, &rlun->prio_list); | ||
438 | spin_unlock(&rlun->lock); | ||
439 | } | ||
440 | |||
441 | /* The block with the highest number of invalid pages will be at the | ||
442 | * beginning of the list. | ||
443 | */ | ||
444 | static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra, | ||
445 | struct rrpc_block *rb) | ||
446 | { | ||
447 | if (ra->nr_invalid_pages == rb->nr_invalid_pages) | ||
448 | return ra; | ||
449 | |||
450 | return (ra->nr_invalid_pages < rb->nr_invalid_pages) ? rb : ra; | ||
451 | } | ||
452 | |||
453 | /* Linearly find the block with the highest number of invalid pages. | ||
454 | * Requires lun->lock. | ||
455 | */ | ||
456 | static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun) | ||
457 | { | ||
458 | struct list_head *prio_list = &rlun->prio_list; | ||
459 | struct rrpc_block *rblk, *max; | ||
460 | |||
461 | BUG_ON(list_empty(prio_list)); | ||
462 | |||
463 | max = list_first_entry(prio_list, struct rrpc_block, prio); | ||
464 | list_for_each_entry(rblk, prio_list, prio) | ||
465 | max = rblk_max_invalid(max, rblk); | ||
466 | |||
467 | return max; | ||
468 | } | ||
469 | |||
470 | static void rrpc_lun_gc(struct work_struct *work) | ||
471 | { | ||
472 | struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc); | ||
473 | struct rrpc *rrpc = rlun->rrpc; | ||
474 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
475 | struct rrpc_block_gc *gcb; | ||
476 | unsigned int nr_blocks_need; | ||
477 | |||
478 | nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE; | ||
479 | |||
480 | if (nr_blocks_need < rrpc->nr_luns) | ||
481 | nr_blocks_need = rrpc->nr_luns; | ||
482 | |||
483 | spin_lock(&rlun->lock); | ||
484 | while (nr_blocks_need > rlun->nr_free_blocks && | ||
485 | !list_empty(&rlun->prio_list)) { | ||
486 | struct rrpc_block *rblk = block_prio_find_max(rlun); | ||
487 | |||
488 | if (!rblk->nr_invalid_pages) | ||
489 | break; | ||
490 | |||
491 | gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); | ||
492 | if (!gcb) | ||
493 | break; | ||
494 | |||
495 | list_del_init(&rblk->prio); | ||
496 | |||
497 | WARN_ON(!block_is_full(rrpc, rblk)); | ||
498 | |||
499 | pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n", | ||
500 | rlun->bppa.g.ch, rlun->bppa.g.lun, | ||
501 | rblk->id); | ||
502 | |||
503 | gcb->rrpc = rrpc; | ||
504 | gcb->rblk = rblk; | ||
505 | INIT_WORK(&gcb->ws_gc, rrpc_block_gc); | ||
506 | |||
507 | queue_work(rrpc->kgc_wq, &gcb->ws_gc); | ||
508 | |||
509 | nr_blocks_need--; | ||
510 | } | ||
511 | spin_unlock(&rlun->lock); | ||
512 | |||
513 | /* TODO: Hint that request queue can be started again */ | ||
514 | } | ||
515 | |||
516 | static void rrpc_gc_queue(struct work_struct *work) | ||
517 | { | ||
518 | struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc, | ||
519 | ws_gc); | ||
520 | struct rrpc *rrpc = gcb->rrpc; | ||
521 | struct rrpc_block *rblk = gcb->rblk; | ||
522 | struct rrpc_lun *rlun = rblk->rlun; | ||
523 | |||
524 | spin_lock(&rlun->lock); | ||
525 | list_add_tail(&rblk->prio, &rlun->prio_list); | ||
526 | spin_unlock(&rlun->lock); | ||
527 | |||
528 | mempool_free(gcb, rrpc->gcb_pool); | ||
529 | pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n", | ||
530 | rlun->bppa.g.ch, rlun->bppa.g.lun, | ||
531 | rblk->id); | ||
532 | } | ||
533 | |||
534 | static const struct block_device_operations rrpc_fops = { | ||
535 | .owner = THIS_MODULE, | ||
536 | }; | ||
537 | |||
538 | static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc) | ||
539 | { | ||
540 | unsigned int i; | ||
541 | struct rrpc_lun *rlun, *max_free; | ||
542 | |||
543 | if (!is_gc) | ||
544 | return get_next_lun(rrpc); | ||
545 | |||
546 | /* During GC we don't care about the RR order; instead we want to | ||
547 | * maintain evenness between the block luns. | ||
548 | */ | ||
549 | max_free = &rrpc->luns[0]; | ||
550 | /* Prevent a GC-ing lun from devouring pages of a lun with | ||
551 | * few free blocks. We don't take the lock as we only need an | ||
552 | * estimate. | ||
553 | */ | ||
554 | rrpc_for_each_lun(rrpc, rlun, i) { | ||
555 | if (rlun->nr_free_blocks > max_free->nr_free_blocks) | ||
556 | max_free = rlun; | ||
557 | } | ||
558 | |||
559 | return max_free; | ||
560 | } | ||
561 | |||
562 | static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr, | ||
563 | struct rrpc_block *rblk, u64 paddr) | ||
564 | { | ||
565 | struct rrpc_addr *gp; | ||
566 | struct rrpc_rev_addr *rev; | ||
567 | |||
568 | BUG_ON(laddr >= rrpc->nr_sects); | ||
569 | |||
570 | gp = &rrpc->trans_map[laddr]; | ||
571 | spin_lock(&rrpc->rev_lock); | ||
572 | if (gp->rblk) | ||
573 | rrpc_page_invalidate(rrpc, gp); | ||
574 | |||
575 | gp->addr = paddr; | ||
576 | gp->rblk = rblk; | ||
577 | |||
578 | rev = &rrpc->rev_trans_map[gp->addr]; | ||
579 | rev->addr = laddr; | ||
580 | spin_unlock(&rrpc->rev_lock); | ||
581 | |||
582 | return gp; | ||
583 | } | ||
584 | |||
585 | static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
586 | { | ||
587 | u64 addr = ADDR_EMPTY; | ||
588 | |||
589 | spin_lock(&rblk->lock); | ||
590 | if (block_is_full(rrpc, rblk)) | ||
591 | goto out; | ||
592 | |||
593 | addr = rblk->next_page; | ||
594 | |||
595 | rblk->next_page++; | ||
596 | out: | ||
597 | spin_unlock(&rblk->lock); | ||
598 | return addr; | ||
599 | } | ||
600 | |||
601 | /* Map a logical address to a physical page. The mapping implements a round | ||
602 | * robin approach and allocates a page from the next available lun. | ||
603 | * | ||
604 | * Returns a generic ppa_addr for the mapped page. Returns an address set to | ||
605 | * ADDR_EMPTY if no blocks in the next rlun are available. | ||
606 | */ | ||
607 | static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr, | ||
608 | int is_gc) | ||
609 | { | ||
610 | struct nvm_tgt_dev *tgt_dev = rrpc->dev; | ||
611 | struct rrpc_lun *rlun; | ||
612 | struct rrpc_block *rblk, **cur_rblk; | ||
613 | struct rrpc_addr *p; | ||
614 | struct ppa_addr ppa; | ||
615 | u64 paddr; | ||
616 | int gc_force = 0; | ||
617 | |||
618 | ppa.ppa = ADDR_EMPTY; | ||
619 | rlun = rrpc_get_lun_rr(rrpc, is_gc); | ||
620 | |||
621 | if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4) | ||
622 | return ppa; | ||
623 | |||
624 | /* | ||
625 | * Page allocation steps: | ||
626 | * 1. Try to allocate a new page from the current rblk. | ||
627 | * 2a. If that succeeds, proceed to map it in and return. | ||
628 | * 2b. If it fails, first try to allocate a new block from the media | ||
629 | * manager, and then retry step 1. Retry until the normal block | ||
630 | * pool is exhausted. | ||
631 | * 3. If exhausted, and the garbage collector is requesting the block, | ||
632 | * go to the reserved block and retry step 1. | ||
633 | * If this fails as well, or the request did not come from GC, | ||
634 | * report that no block could be retrieved and let the | ||
635 | * caller handle further processing. | ||
636 | */ | ||
637 | |||
638 | spin_lock(&rlun->lock); | ||
639 | cur_rblk = &rlun->cur; | ||
640 | rblk = rlun->cur; | ||
641 | retry: | ||
642 | paddr = rrpc_alloc_addr(rrpc, rblk); | ||
643 | |||
644 | if (paddr != ADDR_EMPTY) | ||
645 | goto done; | ||
646 | |||
647 | if (!list_empty(&rlun->wblk_list)) { | ||
648 | new_blk: | ||
649 | rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block, | ||
650 | prio); | ||
651 | rrpc_set_lun_cur(rlun, rblk, cur_rblk); | ||
652 | list_del(&rblk->prio); | ||
653 | goto retry; | ||
654 | } | ||
655 | spin_unlock(&rlun->lock); | ||
656 | |||
657 | rblk = rrpc_get_blk(rrpc, rlun, gc_force); | ||
658 | if (rblk) { | ||
659 | spin_lock(&rlun->lock); | ||
660 | list_add_tail(&rblk->prio, &rlun->wblk_list); | ||
661 | /* | ||
662 | * Another thread might already have added a new block; | ||
663 | * therefore, make sure that one is used instead of the | ||
664 | * one just added. | ||
665 | */ | ||
666 | goto new_blk; | ||
667 | } | ||
668 | |||
669 | if (unlikely(is_gc) && !gc_force) { | ||
670 | /* retry from emergency gc block */ | ||
671 | cur_rblk = &rlun->gc_cur; | ||
672 | rblk = rlun->gc_cur; | ||
673 | gc_force = 1; | ||
674 | spin_lock(&rlun->lock); | ||
675 | goto retry; | ||
676 | } | ||
677 | |||
678 | pr_err("rrpc: failed to allocate new block\n"); | ||
679 | return ppa; | ||
680 | done: | ||
681 | spin_unlock(&rlun->lock); | ||
682 | p = rrpc_update_map(rrpc, laddr, rblk, paddr); | ||
683 | if (!p) | ||
684 | return ppa; | ||
685 | |||
686 | /* return global address */ | ||
687 | return rrpc_ppa_to_gaddr(tgt_dev, p); | ||
688 | } | ||
689 | |||
690 | static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
691 | { | ||
692 | struct rrpc_block_gc *gcb; | ||
693 | |||
694 | gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); | ||
695 | if (!gcb) { | ||
696 | pr_err("rrpc: unable to queue block for gc.\n"); | ||
697 | return; | ||
698 | } | ||
699 | |||
700 | gcb->rrpc = rrpc; | ||
701 | gcb->rblk = rblk; | ||
702 | |||
703 | INIT_WORK(&gcb->ws_gc, rrpc_gc_queue); | ||
704 | queue_work(rrpc->kgc_wq, &gcb->ws_gc); | ||
705 | } | ||
706 | |||
707 | static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p) | ||
708 | { | ||
709 | struct rrpc_lun *rlun = NULL; | ||
710 | int i; | ||
711 | |||
712 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
713 | if (rrpc->luns[i].bppa.g.ch == p.g.ch && | ||
714 | rrpc->luns[i].bppa.g.lun == p.g.lun) { | ||
715 | rlun = &rrpc->luns[i]; | ||
716 | break; | ||
717 | } | ||
718 | } | ||
719 | |||
720 | return rlun; | ||
721 | } | ||
722 | |||
723 | static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa) | ||
724 | { | ||
725 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
726 | struct rrpc_lun *rlun; | ||
727 | struct rrpc_block *rblk; | ||
728 | |||
729 | rlun = rrpc_ppa_to_lun(rrpc, ppa); | ||
730 | rblk = &rlun->blocks[ppa.g.blk]; | ||
731 | rblk->state = NVM_BLK_ST_BAD; | ||
732 | |||
733 | nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD); | ||
734 | } | ||
735 | |||
736 | static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd) | ||
737 | { | ||
738 | void *comp_bits = &rqd->ppa_status; | ||
739 | struct ppa_addr ppa, prev_ppa; | ||
740 | int nr_ppas = rqd->nr_ppas; | ||
741 | int bit; | ||
742 | |||
743 | if (rqd->nr_ppas == 1) | ||
744 | __rrpc_mark_bad_block(rrpc, rqd->ppa_addr); | ||
745 | |||
746 | ppa_set_empty(&prev_ppa); | ||
747 | bit = -1; | ||
748 | while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) { | ||
749 | ppa = rqd->ppa_list[bit]; | ||
750 | if (ppa_cmp_blk(ppa, prev_ppa)) | ||
751 | continue; | ||
752 | |||
753 | __rrpc_mark_bad_block(rrpc, ppa); | ||
754 | } | ||
755 | } | ||
756 | |||
757 | static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, | ||
758 | sector_t laddr, uint8_t npages) | ||
759 | { | ||
760 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
761 | struct rrpc_addr *p; | ||
762 | struct rrpc_block *rblk; | ||
763 | int cmnt_size, i; | ||
764 | |||
765 | for (i = 0; i < npages; i++) { | ||
766 | p = &rrpc->trans_map[laddr + i]; | ||
767 | rblk = p->rblk; | ||
768 | |||
769 | cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); | ||
770 | if (unlikely(cmnt_size == dev->geo.sec_per_blk)) | ||
771 | rrpc_run_gc(rrpc, rblk); | ||
772 | } | ||
773 | } | ||
774 | |||
775 | static void rrpc_end_io(struct nvm_rq *rqd) | ||
776 | { | ||
777 | struct rrpc *rrpc = rqd->private; | ||
778 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
779 | struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); | ||
780 | uint8_t npages = rqd->nr_ppas; | ||
781 | sector_t laddr = rrpc_get_laddr(rqd->bio) - npages; | ||
782 | |||
783 | if (bio_data_dir(rqd->bio) == WRITE) { | ||
784 | if (rqd->error == NVM_RSP_ERR_FAILWRITE) | ||
785 | rrpc_mark_bad_block(rrpc, rqd); | ||
786 | |||
787 | rrpc_end_io_write(rrpc, rrqd, laddr, npages); | ||
788 | } | ||
789 | |||
790 | bio_put(rqd->bio); | ||
791 | |||
792 | if (rrqd->flags & NVM_IOTYPE_GC) | ||
793 | return; | ||
794 | |||
795 | rrpc_unlock_rq(rrpc, rqd); | ||
796 | |||
797 | if (npages > 1) | ||
798 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); | ||
799 | |||
800 | mempool_free(rqd, rrpc->rq_pool); | ||
801 | } | ||
802 | |||
803 | static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, | ||
804 | struct nvm_rq *rqd, unsigned long flags, int npages) | ||
805 | { | ||
806 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
807 | struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); | ||
808 | struct rrpc_addr *gp; | ||
809 | sector_t laddr = rrpc_get_laddr(bio); | ||
810 | int is_gc = flags & NVM_IOTYPE_GC; | ||
811 | int i; | ||
812 | |||
813 | if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) { | ||
814 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); | ||
815 | return NVM_IO_REQUEUE; | ||
816 | } | ||
817 | |||
818 | for (i = 0; i < npages; i++) { | ||
819 | /* We assume that mapping occurs at 4KB granularity */ | ||
820 | BUG_ON(!(laddr + i < rrpc->nr_sects)); | ||
821 | gp = &rrpc->trans_map[laddr + i]; | ||
822 | |||
823 | if (gp->rblk) { | ||
824 | rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp); | ||
825 | } else { | ||
826 | BUG_ON(is_gc); | ||
827 | rrpc_unlock_laddr(rrpc, r); | ||
828 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, | ||
829 | rqd->dma_ppa_list); | ||
830 | return NVM_IO_DONE; | ||
831 | } | ||
832 | } | ||
833 | |||
834 | rqd->opcode = NVM_OP_HBREAD; | ||
835 | |||
836 | return NVM_IO_OK; | ||
837 | } | ||
838 | |||
839 | static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd, | ||
840 | unsigned long flags) | ||
841 | { | ||
842 | int is_gc = flags & NVM_IOTYPE_GC; | ||
843 | sector_t laddr = rrpc_get_laddr(bio); | ||
844 | struct rrpc_addr *gp; | ||
845 | |||
846 | if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) | ||
847 | return NVM_IO_REQUEUE; | ||
848 | |||
849 | BUG_ON(!(laddr < rrpc->nr_sects)); | ||
850 | gp = &rrpc->trans_map[laddr]; | ||
851 | |||
852 | if (gp->rblk) { | ||
853 | rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp); | ||
854 | } else { | ||
855 | BUG_ON(is_gc); | ||
856 | rrpc_unlock_rq(rrpc, rqd); | ||
857 | return NVM_IO_DONE; | ||
858 | } | ||
859 | |||
860 | rqd->opcode = NVM_OP_HBREAD; | ||
861 | |||
862 | return NVM_IO_OK; | ||
863 | } | ||
864 | |||
865 | static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio, | ||
866 | struct nvm_rq *rqd, unsigned long flags, int npages) | ||
867 | { | ||
868 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
869 | struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); | ||
870 | struct ppa_addr p; | ||
871 | sector_t laddr = rrpc_get_laddr(bio); | ||
872 | int is_gc = flags & NVM_IOTYPE_GC; | ||
873 | int i; | ||
874 | |||
875 | if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) { | ||
876 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); | ||
877 | return NVM_IO_REQUEUE; | ||
878 | } | ||
879 | |||
880 | for (i = 0; i < npages; i++) { | ||
881 | /* We assume that mapping occurs at 4KB granularity */ | ||
882 | p = rrpc_map_page(rrpc, laddr + i, is_gc); | ||
883 | if (p.ppa == ADDR_EMPTY) { | ||
884 | BUG_ON(is_gc); | ||
885 | rrpc_unlock_laddr(rrpc, r); | ||
886 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, | ||
887 | rqd->dma_ppa_list); | ||
888 | rrpc_gc_kick(rrpc); | ||
889 | return NVM_IO_REQUEUE; | ||
890 | } | ||
891 | |||
892 | rqd->ppa_list[i] = p; | ||
893 | } | ||
894 | |||
895 | rqd->opcode = NVM_OP_HBWRITE; | ||
896 | |||
897 | return NVM_IO_OK; | ||
898 | } | ||
899 | |||
900 | static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio, | ||
901 | struct nvm_rq *rqd, unsigned long flags) | ||
902 | { | ||
903 | struct ppa_addr p; | ||
904 | int is_gc = flags & NVM_IOTYPE_GC; | ||
905 | sector_t laddr = rrpc_get_laddr(bio); | ||
906 | |||
907 | if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) | ||
908 | return NVM_IO_REQUEUE; | ||
909 | |||
910 | p = rrpc_map_page(rrpc, laddr, is_gc); | ||
911 | if (p.ppa == ADDR_EMPTY) { | ||
912 | BUG_ON(is_gc); | ||
913 | rrpc_unlock_rq(rrpc, rqd); | ||
914 | rrpc_gc_kick(rrpc); | ||
915 | return NVM_IO_REQUEUE; | ||
916 | } | ||
917 | |||
918 | rqd->ppa_addr = p; | ||
919 | rqd->opcode = NVM_OP_HBWRITE; | ||
920 | |||
921 | return NVM_IO_OK; | ||
922 | } | ||
923 | |||
924 | static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio, | ||
925 | struct nvm_rq *rqd, unsigned long flags, uint8_t npages) | ||
926 | { | ||
927 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
928 | |||
929 | if (npages > 1) { | ||
930 | rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, | ||
931 | &rqd->dma_ppa_list); | ||
932 | if (!rqd->ppa_list) { | ||
933 | pr_err("rrpc: not able to allocate ppa list\n"); | ||
934 | return NVM_IO_ERR; | ||
935 | } | ||
936 | |||
937 | if (bio_op(bio) == REQ_OP_WRITE) | ||
938 | return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags, | ||
939 | npages); | ||
940 | |||
941 | return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages); | ||
942 | } | ||
943 | |||
944 | if (bio_op(bio) == REQ_OP_WRITE) | ||
945 | return rrpc_write_rq(rrpc, bio, rqd, flags); | ||
946 | |||
947 | return rrpc_read_rq(rrpc, bio, rqd, flags); | ||
948 | } | ||
949 | |||
950 | static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, | ||
951 | struct nvm_rq *rqd, unsigned long flags) | ||
952 | { | ||
953 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
954 | struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd); | ||
955 | uint8_t nr_pages = rrpc_get_pages(bio); | ||
956 | int bio_size = bio_sectors(bio) << 9; | ||
957 | int err; | ||
958 | |||
959 | if (bio_size < dev->geo.sec_size) | ||
960 | return NVM_IO_ERR; | ||
961 | else if (bio_size > dev->geo.max_rq_size) | ||
962 | return NVM_IO_ERR; | ||
963 | |||
964 | err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages); | ||
965 | if (err) | ||
966 | return err; | ||
967 | |||
968 | bio_get(bio); | ||
969 | rqd->bio = bio; | ||
970 | rqd->private = rrpc; | ||
971 | rqd->nr_ppas = nr_pages; | ||
972 | rqd->end_io = rrpc_end_io; | ||
973 | rrq->flags = flags; | ||
974 | |||
975 | err = nvm_submit_io(dev, rqd); | ||
976 | if (err) { | ||
977 | pr_err("rrpc: I/O submission failed: %d\n", err); | ||
978 | bio_put(bio); | ||
979 | if (!(flags & NVM_IOTYPE_GC)) { | ||
980 | rrpc_unlock_rq(rrpc, rqd); | ||
981 | if (rqd->nr_ppas > 1) | ||
982 | nvm_dev_dma_free(dev->parent, rqd->ppa_list, | ||
983 | rqd->dma_ppa_list); | ||
984 | } | ||
985 | return NVM_IO_ERR; | ||
986 | } | ||
987 | |||
988 | return NVM_IO_OK; | ||
989 | } | ||
990 | |||
991 | static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio) | ||
992 | { | ||
993 | struct rrpc *rrpc = q->queuedata; | ||
994 | struct nvm_rq *rqd; | ||
995 | int err; | ||
996 | |||
997 | blk_queue_split(q, &bio); | ||
998 | |||
999 | if (bio_op(bio) == REQ_OP_DISCARD) { | ||
1000 | rrpc_discard(rrpc, bio); | ||
1001 | return BLK_QC_T_NONE; | ||
1002 | } | ||
1003 | |||
1004 | rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL); | ||
1005 | memset(rqd, 0, sizeof(struct nvm_rq)); | ||
1006 | |||
1007 | err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE); | ||
1008 | switch (err) { | ||
1009 | case NVM_IO_OK: | ||
1010 | return BLK_QC_T_NONE; | ||
1011 | case NVM_IO_ERR: | ||
1012 | bio_io_error(bio); | ||
1013 | break; | ||
1014 | case NVM_IO_DONE: | ||
1015 | bio_endio(bio); | ||
1016 | break; | ||
1017 | case NVM_IO_REQUEUE: | ||
1018 | spin_lock(&rrpc->bio_lock); | ||
1019 | bio_list_add(&rrpc->requeue_bios, bio); | ||
1020 | spin_unlock(&rrpc->bio_lock); | ||
1021 | queue_work(rrpc->kgc_wq, &rrpc->ws_requeue); | ||
1022 | break; | ||
1023 | } | ||
1024 | |||
1025 | mempool_free(rqd, rrpc->rq_pool); | ||
1026 | return BLK_QC_T_NONE; | ||
1027 | } | ||
1028 | |||
1029 | static void rrpc_requeue(struct work_struct *work) | ||
1030 | { | ||
1031 | struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue); | ||
1032 | struct bio_list bios; | ||
1033 | struct bio *bio; | ||
1034 | |||
1035 | bio_list_init(&bios); | ||
1036 | |||
1037 | spin_lock(&rrpc->bio_lock); | ||
1038 | bio_list_merge(&bios, &rrpc->requeue_bios); | ||
1039 | bio_list_init(&rrpc->requeue_bios); | ||
1040 | spin_unlock(&rrpc->bio_lock); | ||
1041 | |||
1042 | while ((bio = bio_list_pop(&bios))) | ||
1043 | rrpc_make_rq(rrpc->disk->queue, bio); | ||
1044 | } | ||
1045 | |||
1046 | static void rrpc_gc_free(struct rrpc *rrpc) | ||
1047 | { | ||
1048 | if (rrpc->krqd_wq) | ||
1049 | destroy_workqueue(rrpc->krqd_wq); | ||
1050 | |||
1051 | if (rrpc->kgc_wq) | ||
1052 | destroy_workqueue(rrpc->kgc_wq); | ||
1053 | } | ||
1054 | |||
1055 | static int rrpc_gc_init(struct rrpc *rrpc) | ||
1056 | { | ||
1057 | rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND, | ||
1058 | rrpc->nr_luns); | ||
1059 | if (!rrpc->krqd_wq) | ||
1060 | return -ENOMEM; | ||
1061 | |||
1062 | rrpc->kgc_wq = alloc_workqueue("rrpc-bg", WQ_MEM_RECLAIM, 1); | ||
1063 | if (!rrpc->kgc_wq) | ||
1064 | return -ENOMEM; | ||
1065 | |||
1066 | timer_setup(&rrpc->gc_timer, rrpc_gc_timer, 0); | ||
1067 | |||
1068 | return 0; | ||
1069 | } | ||
1070 | |||
1071 | static void rrpc_map_free(struct rrpc *rrpc) | ||
1072 | { | ||
1073 | vfree(rrpc->rev_trans_map); | ||
1074 | vfree(rrpc->trans_map); | ||
1075 | } | ||
1076 | |||
1077 | static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private) | ||
1078 | { | ||
1079 | struct rrpc *rrpc = (struct rrpc *)private; | ||
1080 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1081 | struct rrpc_addr *addr = rrpc->trans_map + slba; | ||
1082 | struct rrpc_rev_addr *raddr = rrpc->rev_trans_map; | ||
1083 | struct rrpc_lun *rlun; | ||
1084 | struct rrpc_block *rblk; | ||
1085 | u64 i; | ||
1086 | |||
1087 | for (i = 0; i < nlb; i++) { | ||
1088 | struct ppa_addr gaddr; | ||
1089 | u64 pba = le64_to_cpu(entries[i]); | ||
1090 | unsigned int mod; | ||
1091 | |||
1092 | /* LNVM treats address spaces as silos: LBA and PBA are | ||
1093 | * equally large and zero-indexed. | ||
1094 | */ | ||
1095 | if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { | ||
1096 | pr_err("nvm: L2P data entry is out of bounds!\n"); | ||
1097 | pr_err("nvm: Maybe loaded an old target L2P\n"); | ||
1098 | return -EINVAL; | ||
1099 | } | ||
1100 | |||
1101 | /* Address zero is a special one. The first page on a disk is | ||
1102 | * protected, as it often holds internal device boot | ||
1103 | * information. | ||
1104 | */ | ||
1105 | if (!pba) | ||
1106 | continue; | ||
1107 | |||
1108 | div_u64_rem(pba, rrpc->nr_sects, &mod); | ||
1109 | |||
1110 | gaddr = rrpc_recov_addr(dev, pba); | ||
1111 | rlun = rrpc_ppa_to_lun(rrpc, gaddr); | ||
1112 | if (!rlun) { | ||
1113 | pr_err("rrpc: l2p corruption on lba %llu\n", | ||
1114 | slba + i); | ||
1115 | return -EINVAL; | ||
1116 | } | ||
1117 | |||
1118 | rblk = &rlun->blocks[gaddr.g.blk]; | ||
1119 | if (!rblk->state) { | ||
1120 | /* at this point, we don't know anything about the | ||
1121 | * block. It's up to the FTL on top to re-establish the | ||
1122 | * block state. The block is assumed to be open. | ||
1123 | */ | ||
1124 | list_move_tail(&rblk->list, &rlun->used_list); | ||
1125 | rblk->state = NVM_BLK_ST_TGT; | ||
1126 | rlun->nr_free_blocks--; | ||
1127 | } | ||
1128 | |||
1129 | addr[i].addr = pba; | ||
1130 | addr[i].rblk = rblk; | ||
1131 | raddr[mod].addr = slba + i; | ||
1132 | } | ||
1133 | |||
1134 | return 0; | ||
1135 | } | ||
1136 | |||
1137 | static int rrpc_map_init(struct rrpc *rrpc) | ||
1138 | { | ||
1139 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1140 | sector_t i; | ||
1141 | int ret; | ||
1142 | |||
1143 | rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects); | ||
1144 | if (!rrpc->trans_map) | ||
1145 | return -ENOMEM; | ||
1146 | |||
1147 | rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr) | ||
1148 | * rrpc->nr_sects); | ||
1149 | if (!rrpc->rev_trans_map) | ||
1150 | return -ENOMEM; | ||
1151 | |||
1152 | for (i = 0; i < rrpc->nr_sects; i++) { | ||
1153 | struct rrpc_addr *p = &rrpc->trans_map[i]; | ||
1154 | struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i]; | ||
1155 | |||
1156 | p->addr = ADDR_EMPTY; | ||
1157 | r->addr = ADDR_EMPTY; | ||
1158 | } | ||
1159 | |||
1160 | /* Bring up the mapping table from device */ | ||
1161 | ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects, | ||
1162 | rrpc_l2p_update, rrpc); | ||
1163 | if (ret) { | ||
1164 | pr_err("nvm: rrpc: could not read L2P table.\n"); | ||
1165 | return -EINVAL; | ||
1166 | } | ||
1167 | |||
1168 | return 0; | ||
1169 | } | ||
1170 | |||
1171 | /* Minimum pages needed within a lun */ | ||
1172 | #define PAGE_POOL_SIZE 16 | ||
1173 | #define ADDR_POOL_SIZE 64 | ||
1174 | |||
1175 | static int rrpc_core_init(struct rrpc *rrpc) | ||
1176 | { | ||
1177 | down_write(&rrpc_lock); | ||
1178 | if (!rrpc_gcb_cache) { | ||
1179 | rrpc_gcb_cache = kmem_cache_create("rrpc_gcb", | ||
1180 | sizeof(struct rrpc_block_gc), 0, 0, NULL); | ||
1181 | if (!rrpc_gcb_cache) { | ||
1182 | up_write(&rrpc_lock); | ||
1183 | return -ENOMEM; | ||
1184 | } | ||
1185 | |||
1186 | rrpc_rq_cache = kmem_cache_create("rrpc_rq", | ||
1187 | sizeof(struct nvm_rq) + sizeof(struct rrpc_rq), | ||
1188 | 0, 0, NULL); | ||
1189 | if (!rrpc_rq_cache) { | ||
1190 | kmem_cache_destroy(rrpc_gcb_cache); | ||
1191 | up_write(&rrpc_lock); | ||
1192 | return -ENOMEM; | ||
1193 | } | ||
1194 | } | ||
1195 | up_write(&rrpc_lock); | ||
1196 | |||
1197 | rrpc->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); | ||
1198 | if (!rrpc->page_pool) | ||
1199 | return -ENOMEM; | ||
1200 | |||
1201 | rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns, | ||
1202 | rrpc_gcb_cache); | ||
1203 | if (!rrpc->gcb_pool) | ||
1204 | return -ENOMEM; | ||
1205 | |||
1206 | rrpc->rq_pool = mempool_create_slab_pool(64, rrpc_rq_cache); | ||
1207 | if (!rrpc->rq_pool) | ||
1208 | return -ENOMEM; | ||
1209 | |||
1210 | spin_lock_init(&rrpc->inflights.lock); | ||
1211 | INIT_LIST_HEAD(&rrpc->inflights.reqs); | ||
1212 | |||
1213 | return 0; | ||
1214 | } | ||
1215 | |||
1216 | static void rrpc_core_free(struct rrpc *rrpc) | ||
1217 | { | ||
1218 | mempool_destroy(rrpc->page_pool); | ||
1219 | mempool_destroy(rrpc->gcb_pool); | ||
1220 | mempool_destroy(rrpc->rq_pool); | ||
1221 | } | ||
1222 | |||
1223 | static void rrpc_luns_free(struct rrpc *rrpc) | ||
1224 | { | ||
1225 | struct rrpc_lun *rlun; | ||
1226 | int i; | ||
1227 | |||
1228 | if (!rrpc->luns) | ||
1229 | return; | ||
1230 | |||
1231 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
1232 | rlun = &rrpc->luns[i]; | ||
1233 | vfree(rlun->blocks); | ||
1234 | } | ||
1235 | |||
1236 | kfree(rrpc->luns); | ||
1237 | } | ||
1238 | |||
1239 | static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun) | ||
1240 | { | ||
1241 | struct nvm_geo *geo = &dev->geo; | ||
1242 | struct rrpc_block *rblk; | ||
1243 | struct ppa_addr ppa; | ||
1244 | u8 *blks; | ||
1245 | int nr_blks; | ||
1246 | int i; | ||
1247 | int ret; | ||
1248 | |||
1249 | if (!dev->parent->ops->get_bb_tbl) | ||
1250 | return 0; | ||
1251 | |||
1252 | nr_blks = geo->blks_per_lun * geo->plane_mode; | ||
1253 | blks = kmalloc(nr_blks, GFP_KERNEL); | ||
1254 | if (!blks) | ||
1255 | return -ENOMEM; | ||
1256 | |||
1257 | ppa.ppa = 0; | ||
1258 | ppa.g.ch = rlun->bppa.g.ch; | ||
1259 | ppa.g.lun = rlun->bppa.g.lun; | ||
1260 | |||
1261 | ret = nvm_get_tgt_bb_tbl(dev, ppa, blks); | ||
1262 | if (ret) { | ||
1263 | pr_err("rrpc: could not get BB table\n"); | ||
1264 | goto out; | ||
1265 | } | ||
1266 | |||
1267 | nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks); | ||
1268 | if (nr_blks < 0) { | ||
1269 | ret = nr_blks; | ||
1270 | goto out; | ||
1271 | } | ||
1272 | |||
1273 | for (i = 0; i < nr_blks; i++) { | ||
1274 | if (blks[i] == NVM_BLK_T_FREE) | ||
1275 | continue; | ||
1276 | |||
1277 | rblk = &rlun->blocks[i]; | ||
1278 | list_move_tail(&rblk->list, &rlun->bb_list); | ||
1279 | rblk->state = NVM_BLK_ST_BAD; | ||
1280 | rlun->nr_free_blocks--; | ||
1281 | } | ||
1282 | |||
1283 | out: | ||
1284 | kfree(blks); | ||
1285 | return ret; | ||
1286 | } | ||
1287 | |||
1288 | static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa) | ||
1289 | { | ||
1290 | rlun->bppa.ppa = 0; | ||
1291 | rlun->bppa.g.ch = ppa.g.ch; | ||
1292 | rlun->bppa.g.lun = ppa.g.lun; | ||
1293 | } | ||
1294 | |||
1295 | static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns) | ||
1296 | { | ||
1297 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1298 | struct nvm_geo *geo = &dev->geo; | ||
1299 | struct rrpc_lun *rlun; | ||
1300 | int i, j, ret = -EINVAL; | ||
1301 | |||
1302 | if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { | ||
1303 | pr_err("rrpc: number of pages per block too high."); | ||
1304 | return -EINVAL; | ||
1305 | } | ||
1306 | |||
1307 | spin_lock_init(&rrpc->rev_lock); | ||
1308 | |||
1309 | rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun), | ||
1310 | GFP_KERNEL); | ||
1311 | if (!rrpc->luns) | ||
1312 | return -ENOMEM; | ||
1313 | |||
1314 | /* 1:1 mapping */ | ||
1315 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
1316 | rlun = &rrpc->luns[i]; | ||
1317 | rlun->id = i; | ||
1318 | rrpc_set_lun_ppa(rlun, luns[i]); | ||
1319 | rlun->blocks = vzalloc(sizeof(struct rrpc_block) * | ||
1320 | geo->blks_per_lun); | ||
1321 | if (!rlun->blocks) { | ||
1322 | ret = -ENOMEM; | ||
1323 | goto err; | ||
1324 | } | ||
1325 | |||
1326 | INIT_LIST_HEAD(&rlun->free_list); | ||
1327 | INIT_LIST_HEAD(&rlun->used_list); | ||
1328 | INIT_LIST_HEAD(&rlun->bb_list); | ||
1329 | |||
1330 | for (j = 0; j < geo->blks_per_lun; j++) { | ||
1331 | struct rrpc_block *rblk = &rlun->blocks[j]; | ||
1332 | |||
1333 | rblk->id = j; | ||
1334 | rblk->rlun = rlun; | ||
1335 | rblk->state = NVM_BLK_T_FREE; | ||
1336 | INIT_LIST_HEAD(&rblk->prio); | ||
1337 | INIT_LIST_HEAD(&rblk->list); | ||
1338 | spin_lock_init(&rblk->lock); | ||
1339 | |||
1340 | list_add_tail(&rblk->list, &rlun->free_list); | ||
1341 | } | ||
1342 | |||
1343 | rlun->rrpc = rrpc; | ||
1344 | rlun->nr_free_blocks = geo->blks_per_lun; | ||
1345 | rlun->reserved_blocks = 2; /* for GC only */ | ||
1346 | |||
1347 | INIT_LIST_HEAD(&rlun->prio_list); | ||
1348 | INIT_LIST_HEAD(&rlun->wblk_list); | ||
1349 | |||
1350 | INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); | ||
1351 | spin_lock_init(&rlun->lock); | ||
1352 | |||
1353 | if (rrpc_bb_discovery(dev, rlun)) | ||
1354 | goto err; | ||
1355 | |||
1356 | } | ||
1357 | |||
1358 | return 0; | ||
1359 | err: | ||
1360 | return ret; | ||
1361 | } | ||
1362 | |||
1363 | /* returns 0 on success and stores the beginning address in *begin */ | ||
1364 | static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin) | ||
1365 | { | ||
1366 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1367 | sector_t size = rrpc->nr_sects * dev->geo.sec_size; | ||
1368 | int ret; | ||
1369 | |||
1370 | size >>= 9; | ||
1371 | |||
1372 | ret = nvm_get_area(dev, begin, size); | ||
1373 | if (!ret) | ||
1374 | *begin >>= (ilog2(dev->geo.sec_size) - 9); | ||
1375 | |||
1376 | return ret; | ||
1377 | } | ||
1378 | |||
1379 | static void rrpc_area_free(struct rrpc *rrpc) | ||
1380 | { | ||
1381 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1382 | sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9); | ||
1383 | |||
1384 | nvm_put_area(dev, begin); | ||
1385 | } | ||
1386 | |||
1387 | static void rrpc_free(struct rrpc *rrpc) | ||
1388 | { | ||
1389 | rrpc_gc_free(rrpc); | ||
1390 | rrpc_map_free(rrpc); | ||
1391 | rrpc_core_free(rrpc); | ||
1392 | rrpc_luns_free(rrpc); | ||
1393 | rrpc_area_free(rrpc); | ||
1394 | |||
1395 | kfree(rrpc); | ||
1396 | } | ||
1397 | |||
1398 | static void rrpc_exit(void *private) | ||
1399 | { | ||
1400 | struct rrpc *rrpc = private; | ||
1401 | |||
1402 | del_timer(&rrpc->gc_timer); | ||
1403 | |||
1404 | flush_workqueue(rrpc->krqd_wq); | ||
1405 | flush_workqueue(rrpc->kgc_wq); | ||
1406 | |||
1407 | rrpc_free(rrpc); | ||
1408 | } | ||
1409 | |||
1410 | static sector_t rrpc_capacity(void *private) | ||
1411 | { | ||
1412 | struct rrpc *rrpc = private; | ||
1413 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1414 | sector_t reserved, provisioned; | ||
1415 | |||
1416 | /* cur, gc, and two emergency blocks for each lun */ | ||
1417 | reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4; | ||
1418 | provisioned = rrpc->nr_sects - reserved; | ||
1419 | |||
1420 | if (reserved > rrpc->nr_sects) { | ||
1421 | pr_err("rrpc: not enough space available to expose storage.\n"); | ||
1422 | return 0; | ||
1423 | } | ||
1424 | |||
1425 | sector_div(provisioned, 10); | ||
1426 | return provisioned * 9 * NR_PHY_IN_LOG; | ||
1427 | } | ||
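/* Illustrative capacity math (hypothetical geometry, not taken from any
 * specific device): with 4 luns, sec_per_blk = 4096 and nr_sects = 1,048,576
 * 4K sectors, reserved = 4 * 4096 * 4 = 65,536 and provisioned = 983,040.
 * The target then exposes 9/10 of that, 884,736 4K pages, returned as
 * 884,736 * NR_PHY_IN_LOG = 7,077,888 512-byte sectors (~3.4 GiB).
 */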
1428 | |||
1429 | /* | ||
1430 | * Looks up the logical address in the reverse translation map and checks if it | ||
1431 | * is still valid by comparing the logical-to-physical mapping with the physical | ||
1432 | * address. Valid pages are re-linked to the block; stale pages are marked invalid. | ||
1433 | */ | ||
1434 | static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
1435 | { | ||
1436 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1437 | int offset; | ||
1438 | struct rrpc_addr *laddr; | ||
1439 | u64 bpaddr, paddr, pladdr; | ||
1440 | |||
1441 | bpaddr = block_to_rel_addr(rrpc, rblk); | ||
1442 | for (offset = 0; offset < dev->geo.sec_per_blk; offset++) { | ||
1443 | paddr = bpaddr + offset; | ||
1444 | |||
1445 | pladdr = rrpc->rev_trans_map[paddr].addr; | ||
1446 | if (pladdr == ADDR_EMPTY) | ||
1447 | continue; | ||
1448 | |||
1449 | laddr = &rrpc->trans_map[pladdr]; | ||
1450 | |||
1451 | if (paddr == laddr->addr) { | ||
1452 | laddr->rblk = rblk; | ||
1453 | } else { | ||
1454 | set_bit(offset, rblk->invalid_pages); | ||
1455 | rblk->nr_invalid_pages++; | ||
1456 | } | ||
1457 | } | ||
1458 | } | ||
1459 | |||
1460 | static int rrpc_blocks_init(struct rrpc *rrpc) | ||
1461 | { | ||
1462 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
1463 | struct rrpc_lun *rlun; | ||
1464 | struct rrpc_block *rblk; | ||
1465 | int lun_iter, blk_iter; | ||
1466 | |||
1467 | for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) { | ||
1468 | rlun = &rrpc->luns[lun_iter]; | ||
1469 | |||
1470 | for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun; | ||
1471 | blk_iter++) { | ||
1472 | rblk = &rlun->blocks[blk_iter]; | ||
1473 | rrpc_block_map_update(rrpc, rblk); | ||
1474 | } | ||
1475 | } | ||
1476 | |||
1477 | return 0; | ||
1478 | } | ||
1479 | |||
1480 | static int rrpc_luns_configure(struct rrpc *rrpc) | ||
1481 | { | ||
1482 | struct rrpc_lun *rlun; | ||
1483 | struct rrpc_block *rblk; | ||
1484 | int i; | ||
1485 | |||
1486 | for (i = 0; i < rrpc->nr_luns; i++) { | ||
1487 | rlun = &rrpc->luns[i]; | ||
1488 | |||
1489 | rblk = rrpc_get_blk(rrpc, rlun, 0); | ||
1490 | if (!rblk) | ||
1491 | goto err; | ||
1492 | rrpc_set_lun_cur(rlun, rblk, &rlun->cur); | ||
1493 | |||
1494 | /* Emergency gc block */ | ||
1495 | rblk = rrpc_get_blk(rrpc, rlun, 1); | ||
1496 | if (!rblk) | ||
1497 | goto err; | ||
1498 | rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur); | ||
1499 | } | ||
1500 | |||
1501 | return 0; | ||
1502 | err: | ||
1503 | rrpc_put_blks(rrpc); | ||
1504 | return -EINVAL; | ||
1505 | } | ||
1506 | |||
1507 | static struct nvm_tgt_type tt_rrpc; | ||
1508 | |||
1509 | static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | ||
1510 | int flags) | ||
1511 | { | ||
1512 | struct request_queue *bqueue = dev->q; | ||
1513 | struct request_queue *tqueue = tdisk->queue; | ||
1514 | struct nvm_geo *geo = &dev->geo; | ||
1515 | struct rrpc *rrpc; | ||
1516 | sector_t soffset; | ||
1517 | int ret; | ||
1518 | |||
1519 | if (!(dev->identity.dom & NVM_RSP_L2P)) { | ||
1520 | pr_err("nvm: rrpc: device does not support l2p (%x)\n", | ||
1521 | dev->identity.dom); | ||
1522 | return ERR_PTR(-EINVAL); | ||
1523 | } | ||
1524 | |||
1525 | rrpc = kzalloc(sizeof(struct rrpc), GFP_KERNEL); | ||
1526 | if (!rrpc) | ||
1527 | return ERR_PTR(-ENOMEM); | ||
1528 | |||
1529 | rrpc->dev = dev; | ||
1530 | rrpc->disk = tdisk; | ||
1531 | |||
1532 | bio_list_init(&rrpc->requeue_bios); | ||
1533 | spin_lock_init(&rrpc->bio_lock); | ||
1534 | INIT_WORK(&rrpc->ws_requeue, rrpc_requeue); | ||
1535 | |||
1536 | rrpc->nr_luns = geo->nr_luns; | ||
1537 | rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns; | ||
1538 | |||
1539 | /* simple round-robin strategy */ | ||
1540 | atomic_set(&rrpc->next_lun, -1); | ||
1541 | |||
1542 | ret = rrpc_area_init(rrpc, &soffset); | ||
1543 | if (ret < 0) { | ||
1544 | pr_err("nvm: rrpc: could not initialize area\n"); | ||
1545 | return ERR_PTR(ret); | ||
1546 | } | ||
1547 | rrpc->soffset = soffset; | ||
1548 | |||
1549 | ret = rrpc_luns_init(rrpc, dev->luns); | ||
1550 | if (ret) { | ||
1551 | pr_err("nvm: rrpc: could not initialize luns\n"); | ||
1552 | goto err; | ||
1553 | } | ||
1554 | |||
1555 | ret = rrpc_core_init(rrpc); | ||
1556 | if (ret) { | ||
1557 | pr_err("nvm: rrpc: could not initialize core\n"); | ||
1558 | goto err; | ||
1559 | } | ||
1560 | |||
1561 | ret = rrpc_map_init(rrpc); | ||
1562 | if (ret) { | ||
1563 | pr_err("nvm: rrpc: could not initialize maps\n"); | ||
1564 | goto err; | ||
1565 | } | ||
1566 | |||
1567 | ret = rrpc_blocks_init(rrpc); | ||
1568 | if (ret) { | ||
1569 | pr_err("nvm: rrpc: could not initialize state for blocks\n"); | ||
1570 | goto err; | ||
1571 | } | ||
1572 | |||
1573 | ret = rrpc_luns_configure(rrpc); | ||
1574 | if (ret) { | ||
1575 | pr_err("nvm: rrpc: not enough blocks available in LUNs.\n"); | ||
1576 | goto err; | ||
1577 | } | ||
1578 | |||
1579 | ret = rrpc_gc_init(rrpc); | ||
1580 | if (ret) { | ||
1581 | pr_err("nvm: rrpc: could not initialize gc\n"); | ||
1582 | goto err; | ||
1583 | } | ||
1584 | |||
1585 | /* inherit the size from the underlying device */ | ||
1586 | blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); | ||
1587 | blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); | ||
1588 | |||
1589 | pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n", | ||
1590 | rrpc->nr_luns, (unsigned long long)rrpc->nr_sects); | ||
1591 | |||
1592 | mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10)); | ||
1593 | |||
1594 | return rrpc; | ||
1595 | err: | ||
1596 | rrpc_free(rrpc); | ||
1597 | return ERR_PTR(ret); | ||
1598 | } | ||
1599 | |||
1600 | /* round robin, page-based FTL, and cost-based GC */ | ||
1601 | static struct nvm_tgt_type tt_rrpc = { | ||
1602 | .name = "rrpc", | ||
1603 | .version = {1, 0, 0}, | ||
1604 | |||
1605 | .make_rq = rrpc_make_rq, | ||
1606 | .capacity = rrpc_capacity, | ||
1607 | |||
1608 | .init = rrpc_init, | ||
1609 | .exit = rrpc_exit, | ||
1610 | }; | ||
1611 | |||
1612 | static int __init rrpc_module_init(void) | ||
1613 | { | ||
1614 | return nvm_register_tgt_type(&tt_rrpc); | ||
1615 | } | ||
1616 | |||
1617 | static void rrpc_module_exit(void) | ||
1618 | { | ||
1619 | nvm_unregister_tgt_type(&tt_rrpc); | ||
1620 | } | ||
1621 | |||
1622 | module_init(rrpc_module_init); | ||
1623 | module_exit(rrpc_module_exit); | ||
1624 | MODULE_LICENSE("GPL v2"); | ||
1625 | MODULE_DESCRIPTION("Block-Device Target for Open-Channel SSDs"); | ||
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
deleted file mode 100644
index fdb6ff902903..000000000000
--- a/drivers/lightnvm/rrpc.h
+++ /dev/null
@@ -1,290 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2015 IT University of Copenhagen | ||
3 | * Initial release: Matias Bjorling <m@bjorling.me> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License version | ||
7 | * 2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, but | ||
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs. | ||
15 | */ | ||
16 | |||
17 | #ifndef RRPC_H_ | ||
18 | #define RRPC_H_ | ||
19 | |||
20 | #include <linux/blkdev.h> | ||
21 | #include <linux/blk-mq.h> | ||
22 | #include <linux/bio.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/kthread.h> | ||
25 | #include <linux/vmalloc.h> | ||
26 | |||
27 | #include <linux/lightnvm.h> | ||
28 | |||
29 | /* Run only GC if less than 1/X blocks are free */ | ||
30 | #define GC_LIMIT_INVERSE 10 | ||
31 | #define GC_TIME_SECS 100 | ||
32 | |||
33 | #define RRPC_SECTOR (512) | ||
34 | #define RRPC_EXPOSED_PAGE_SIZE (4096) | ||
35 | |||
36 | #define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR) | ||
37 | |||
38 | struct rrpc_inflight { | ||
39 | struct list_head reqs; | ||
40 | spinlock_t lock; | ||
41 | }; | ||
42 | |||
43 | struct rrpc_inflight_rq { | ||
44 | struct list_head list; | ||
45 | sector_t l_start; | ||
46 | sector_t l_end; | ||
47 | }; | ||
48 | |||
49 | struct rrpc_rq { | ||
50 | struct rrpc_inflight_rq inflight_rq; | ||
51 | unsigned long flags; | ||
52 | }; | ||
53 | |||
54 | struct rrpc_block { | ||
55 | int id; /* id inside of LUN */ | ||
56 | struct rrpc_lun *rlun; | ||
57 | |||
58 | struct list_head prio; /* LUN CG list */ | ||
59 | struct list_head list; /* LUN free, used, bb list */ | ||
60 | |||
61 | #define MAX_INVALID_PAGES_STORAGE 8 | ||
62 | /* Bitmap for invalid page entries */ | ||
63 | unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE]; | ||
64 | /* points to the next writable page within a block */ | ||
65 | unsigned int next_page; | ||
66 | /* number of pages that are invalid, wrt host page size */ | ||
67 | unsigned int nr_invalid_pages; | ||
68 | |||
69 | int state; | ||
70 | |||
71 | spinlock_t lock; | ||
72 | atomic_t data_cmnt_size; /* data pages committed to stable storage */ | ||
73 | }; | ||
74 | |||
75 | struct rrpc_lun { | ||
76 | struct rrpc *rrpc; | ||
77 | |||
78 | int id; | ||
79 | struct ppa_addr bppa; | ||
80 | |||
81 | struct rrpc_block *cur, *gc_cur; | ||
82 | struct rrpc_block *blocks; /* Reference to block allocation */ | ||
83 | |||
84 | struct list_head prio_list; /* Blocks that may be GC'ed */ | ||
85 | struct list_head wblk_list; /* Queued blocks to be written to */ | ||
86 | |||
87 | /* lun block lists */ | ||
88 | struct list_head used_list; /* In-use blocks */ | ||
89 | struct list_head free_list; /* Not used blocks i.e. released | ||
90 | * and ready for use | ||
91 | */ | ||
92 | struct list_head bb_list; /* Bad blocks. Mutually exclusive with | ||
93 | * free_list and used_list | ||
94 | */ | ||
95 | unsigned int nr_free_blocks; /* Number of unused blocks */ | ||
96 | |||
97 | struct work_struct ws_gc; | ||
98 | |||
99 | int reserved_blocks; | ||
100 | |||
101 | spinlock_t lock; | ||
102 | }; | ||
103 | |||
104 | struct rrpc { | ||
105 | struct nvm_tgt_dev *dev; | ||
106 | struct gendisk *disk; | ||
107 | |||
108 | sector_t soffset; /* logical sector offset */ | ||
109 | |||
110 | int nr_luns; | ||
111 | struct rrpc_lun *luns; | ||
112 | |||
113 | /* calculated values */ | ||
114 | unsigned long long nr_sects; | ||
115 | |||
116 | /* Write strategy variables. Move these into a separate structure for | ||
117 | * each strategy. | ||
118 | */ | ||
119 | atomic_t next_lun; /* Whenever a page is written, this is updated | ||
120 | * to point to the next write lun | ||
121 | */ | ||
122 | |||
123 | spinlock_t bio_lock; | ||
124 | struct bio_list requeue_bios; | ||
125 | struct work_struct ws_requeue; | ||
126 | |||
127 | /* Simple translation map of logical addresses to physical addresses. | ||
128 | * The logical addresses are known by the host system, while the physical | ||
129 | * addresses are used when writing to the disk block device. | ||
130 | */ | ||
131 | struct rrpc_addr *trans_map; | ||
132 | /* also store a reverse map for garbage collection */ | ||
133 | struct rrpc_rev_addr *rev_trans_map; | ||
134 | spinlock_t rev_lock; | ||
135 | |||
136 | struct rrpc_inflight inflights; | ||
137 | |||
138 | mempool_t *addr_pool; | ||
139 | mempool_t *page_pool; | ||
140 | mempool_t *gcb_pool; | ||
141 | mempool_t *rq_pool; | ||
142 | |||
143 | struct timer_list gc_timer; | ||
144 | struct workqueue_struct *krqd_wq; | ||
145 | struct workqueue_struct *kgc_wq; | ||
146 | }; | ||
147 | |||
148 | struct rrpc_block_gc { | ||
149 | struct rrpc *rrpc; | ||
150 | struct rrpc_block *rblk; | ||
151 | struct work_struct ws_gc; | ||
152 | }; | ||
153 | |||
154 | /* Logical to physical mapping */ | ||
155 | struct rrpc_addr { | ||
156 | u64 addr; | ||
157 | struct rrpc_block *rblk; | ||
158 | }; | ||
159 | |||
160 | /* Physical to logical mapping */ | ||
161 | struct rrpc_rev_addr { | ||
162 | u64 addr; | ||
163 | }; | ||
164 | |||
165 | static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo, | ||
166 | struct ppa_addr r) | ||
167 | { | ||
168 | struct ppa_addr l; | ||
169 | int secs, pgs; | ||
170 | sector_t ppa = r.ppa; | ||
171 | |||
172 | l.ppa = 0; | ||
173 | |||
174 | div_u64_rem(ppa, geo->sec_per_pg, &secs); | ||
175 | l.g.sec = secs; | ||
176 | |||
177 | sector_div(ppa, geo->sec_per_pg); | ||
178 | div_u64_rem(ppa, geo->pgs_per_blk, &pgs); | ||
179 | l.g.pg = pgs; | ||
180 | |||
181 | return l; | ||
182 | } | ||
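/* Worked example with assumed geometry (values for illustration only): with
 * sec_per_pg = 4 and pgs_per_blk = 256, a linear address r.ppa = 1029 yields
 * sec = 1029 % 4 = 1 and pg = (1029 / 4) % 256 = 257 % 256 = 1. The caller
 * (e.g. rrpc_ppa_to_gaddr) then fills in ch, lun and blk.
 */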
183 | |||
184 | static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba) | ||
185 | { | ||
186 | return linear_to_generic_addr(&dev->geo, pba); | ||
187 | } | ||
188 | |||
189 | static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk) | ||
190 | { | ||
191 | struct nvm_tgt_dev *dev = rrpc->dev; | ||
192 | struct nvm_geo *geo = &dev->geo; | ||
193 | struct rrpc_lun *rlun = rblk->rlun; | ||
194 | |||
195 | return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk); | ||
196 | } | ||
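/* Example with assumed geometry: if sec_per_blk = 4096 and
 * sec_per_lun = 4,194,304 (hypothetical values), then lun id 2, block id 5
 * starts at linear sector 2 * 4,194,304 + 5 * 4096 = 8,409,088.
 */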
197 | |||
198 | static inline sector_t rrpc_get_laddr(struct bio *bio) | ||
199 | { | ||
200 | return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; | ||
201 | } | ||
202 | |||
203 | static inline unsigned int rrpc_get_pages(struct bio *bio) | ||
204 | { | ||
205 | return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; | ||
206 | } | ||
207 | |||
208 | static inline sector_t rrpc_get_sector(sector_t laddr) | ||
209 | { | ||
210 | return laddr * NR_PHY_IN_LOG; | ||
211 | } | ||
212 | |||
213 | static inline int request_intersects(struct rrpc_inflight_rq *r, | ||
214 | sector_t laddr_start, sector_t laddr_end) | ||
215 | { | ||
216 | return (laddr_end >= r->l_start) && (laddr_start <= r->l_end); | ||
217 | } | ||
218 | |||
219 | static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, | ||
220 | unsigned int pages, struct rrpc_inflight_rq *r) | ||
221 | { | ||
222 | sector_t laddr_end = laddr + pages - 1; | ||
223 | struct rrpc_inflight_rq *rtmp; | ||
224 | |||
225 | WARN_ON(irqs_disabled()); | ||
226 | |||
227 | spin_lock_irq(&rrpc->inflights.lock); | ||
228 | list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) { | ||
229 | if (unlikely(request_intersects(rtmp, laddr, laddr_end))) { | ||
230 | /* existing, overlapping request, come back later */ | ||
231 | spin_unlock_irq(&rrpc->inflights.lock); | ||
232 | return 1; | ||
233 | } | ||
234 | } | ||
235 | |||
236 | r->l_start = laddr; | ||
237 | r->l_end = laddr_end; | ||
238 | |||
239 | list_add_tail(&r->list, &rrpc->inflights.reqs); | ||
240 | spin_unlock_irq(&rrpc->inflights.lock); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, | ||
245 | unsigned int pages, | ||
246 | struct rrpc_inflight_rq *r) | ||
247 | { | ||
248 | BUG_ON((laddr + pages) > rrpc->nr_sects); | ||
249 | |||
250 | return __rrpc_lock_laddr(rrpc, laddr, pages, r); | ||
251 | } | ||
252 | |||
253 | static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd) | ||
254 | { | ||
255 | struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); | ||
256 | |||
257 | return &rrqd->inflight_rq; | ||
258 | } | ||
259 | |||
260 | static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio, | ||
261 | struct nvm_rq *rqd) | ||
262 | { | ||
263 | sector_t laddr = rrpc_get_laddr(bio); | ||
264 | unsigned int pages = rrpc_get_pages(bio); | ||
265 | struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); | ||
266 | |||
267 | return rrpc_lock_laddr(rrpc, laddr, pages, r); | ||
268 | } | ||
269 | |||
270 | static inline void rrpc_unlock_laddr(struct rrpc *rrpc, | ||
271 | struct rrpc_inflight_rq *r) | ||
272 | { | ||
273 | unsigned long flags; | ||
274 | |||
275 | spin_lock_irqsave(&rrpc->inflights.lock, flags); | ||
276 | list_del_init(&r->list); | ||
277 | spin_unlock_irqrestore(&rrpc->inflights.lock, flags); | ||
278 | } | ||
279 | |||
280 | static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd) | ||
281 | { | ||
282 | struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); | ||
283 | uint8_t pages = rqd->nr_ppas; | ||
284 | |||
285 | BUG_ON((r->l_start + pages) > rrpc->nr_sects); | ||
286 | |||
287 | rrpc_unlock_laddr(rrpc, r); | ||
288 | } | ||
289 | |||
290 | #endif /* RRPC_H_ */ | ||
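The rrpc.h helpers above pair a per-range lock (rrpc_lock_rq) with an unlock on completion (rrpc_unlock_rq). As a quick orientation, here is a minimal sketch of that pattern; example_submit() and its -EBUSY return are invented for illustration and are not code from the driver:

    /*
     * Hypothetical wrapper showing how the inflight-range helpers are meant to
     * be paired: lock the logical range before issuing, requeue the bio if an
     * overlapping request is already in flight, and rely on rrpc_unlock_rq()
     * being called on the completion path.
     */
    static int example_submit(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd)
    {
            if (rrpc_lock_rq(rrpc, bio, rqd)) {
                    /* overlapping request in flight: park the bio and retry later */
                    spin_lock(&rrpc->bio_lock);
                    bio_list_add(&rrpc->requeue_bios, bio);
                    spin_unlock(&rrpc->bio_lock);
                    queue_work(rrpc->krqd_wq, &rrpc->ws_requeue);
                    return -EBUSY;
            }

            /* ... translate the laddr via rrpc->trans_map and issue the request;
             * the completion path calls rrpc_unlock_rq(rrpc, rqd) to release
             * the range ... */
            return 0;
    }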
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a0cc1bc6d884..6cc6c0f9c3a9 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c | |||
@@ -525,15 +525,21 @@ struct open_bucket { | |||
525 | 525 | ||
526 | /* | 526 | /* |
527 | * We keep multiple buckets open for writes, and try to segregate different | 527 | * We keep multiple buckets open for writes, and try to segregate different |
528 | * write streams for better cache utilization: first we look for a bucket where | 528 | * write streams for better cache utilization: first we try to segregate flash |
529 | * the last write to it was sequential with the current write, and failing that | 529 | * only volume write streams from cached devices, secondly we look for a bucket |
530 | * we look for a bucket that was last used by the same task. | 530 | * where the last write to it was sequential with the current write, and |
531 | * failing that we look for a bucket that was last used by the same task. | ||
531 | * | 532 | * |
532 | * The idea is if you've got multiple tasks pulling data into the cache at the | 533 | * The idea is if you've got multiple tasks pulling data into the cache at the |
533 | * same time, you'll get better cache utilization if you try to segregate their | 534 | * same time, you'll get better cache utilization if you try to segregate their |
534 | * data and preserve locality. | 535 | * data and preserve locality. |
535 | * | 536 | * |
536 | * For example, say you've started Firefox at the same time you're copying a | 537 | * For example, dirty sectors of a flash only volume are not reclaimable; if |
538 | * they are mixed with dirty sectors of a cached device, such buckets will | ||
539 | * be marked as dirty and won't be reclaimed, even though the dirty data of the | ||
540 | * cached device has already been written back to the backing device. | ||
541 | * | ||
542 | * And say you've started Firefox at the same time you're copying a | ||
537 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | 543 | * bunch of files. Firefox will likely end up being fairly hot and stay in the |
538 | * cache awhile, but the data you copied might not be; if you wrote all that | 544 | * cache awhile, but the data you copied might not be; if you wrote all that |
539 | * data to the same buckets it'd get invalidated at the same time. | 545 | * data to the same buckets it'd get invalidated at the same time. |
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, | |||
550 | struct open_bucket *ret, *ret_task = NULL; | 556 | struct open_bucket *ret, *ret_task = NULL; |
551 | 557 | ||
552 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | 558 | list_for_each_entry_reverse(ret, &c->data_buckets, list) |
553 | if (!bkey_cmp(&ret->key, search)) | 559 | if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) != |
560 | UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)])) | ||
561 | continue; | ||
562 | else if (!bkey_cmp(&ret->key, search)) | ||
554 | goto found; | 563 | goto found; |
555 | else if (ret->last_write_point == write_point) | 564 | else if (ret->last_write_point == write_point) |
556 | ret_task = ret; | 565 | ret_task = ret; |
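Taken together, the new comment and the pick_data_bucket() hunk describe a three-step preference: never mix flash-only and cached-device write streams, then prefer the bucket whose last write was sequential with this one, then fall back to the bucket last used by the same task. A simplified restatement, with same_backing_type() standing in for the UUID_FLASH_ONLY() comparison and all locking and allocation elided:

    /* Illustrative restatement of the bucket-picking priority; not a drop-in
     * replacement for pick_data_bucket(). */
    static struct open_bucket *pick_bucket_sketch(struct list_head *buckets,
                                                  const struct bkey *search,
                                                  unsigned write_point)
    {
            struct open_bucket *ret, *ret_task = NULL;

            list_for_each_entry_reverse(ret, buckets, list) {
                    if (!same_backing_type(ret, search))
                            continue;       /* keep flash-only and cached-dev streams apart */
                    else if (!bkey_cmp(&ret->key, search))
                            return ret;     /* sequential with the last write to this bucket */
                    else if (ret->last_write_point == write_point)
                            ret_task = ret; /* same task wrote here last */
            }
            return ret_task;                /* may be NULL: caller allocates a fresh bucket */
    }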
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 843877e017e1..5e2d4e80198e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -320,14 +320,15 @@ struct cached_dev { | |||
320 | */ | 320 | */ |
321 | atomic_t has_dirty; | 321 | atomic_t has_dirty; |
322 | 322 | ||
323 | struct bch_ratelimit writeback_rate; | ||
324 | struct delayed_work writeback_rate_update; | ||
325 | |||
326 | /* | 323 | /* |
327 | * Internal to the writeback code, so read_dirty() can keep track of | 324 | * Set to zero by things that touch the backing volume-- except |
328 | * where it's at. | 325 | * writeback. Incremented by writeback. Used to determine when to |
326 | * accelerate idle writeback. | ||
329 | */ | 327 | */ |
330 | sector_t last_read; | 328 | atomic_t backing_idle; |
329 | |||
330 | struct bch_ratelimit writeback_rate; | ||
331 | struct delayed_work writeback_rate_update; | ||
331 | 332 | ||
332 | /* Limit number of writeback bios in flight */ | 333 | /* Limit number of writeback bios in flight */ |
333 | struct semaphore in_flight; | 334 | struct semaphore in_flight; |
@@ -336,6 +337,14 @@ struct cached_dev { | |||
336 | 337 | ||
337 | struct keybuf writeback_keys; | 338 | struct keybuf writeback_keys; |
338 | 339 | ||
340 | /* | ||
341 | * Order the write-half of writeback operations strongly in dispatch | ||
342 | * order. (Maintain LBA order; don't allow reads completing out of | ||
343 | * order to re-order the writes...) | ||
344 | */ | ||
345 | struct closure_waitlist writeback_ordering_wait; | ||
346 | atomic_t writeback_sequence_next; | ||
347 | |||
339 | /* For tracking sequential IO */ | 348 | /* For tracking sequential IO */ |
340 | #define RECENT_IO_BITS 7 | 349 | #define RECENT_IO_BITS 7 |
341 | #define RECENT_IO (1 << RECENT_IO_BITS) | 350 | #define RECENT_IO (1 << RECENT_IO_BITS) |
@@ -488,6 +497,7 @@ struct cache_set { | |||
488 | int caches_loaded; | 497 | int caches_loaded; |
489 | 498 | ||
490 | struct bcache_device **devices; | 499 | struct bcache_device **devices; |
500 | unsigned devices_max_used; | ||
491 | struct list_head cached_devs; | 501 | struct list_head cached_devs; |
492 | uint64_t cached_dev_sectors; | 502 | uint64_t cached_dev_sectors; |
493 | struct closure caching; | 503 | struct closure caching; |
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c) | |||
852 | 862 | ||
853 | /* Forward declarations */ | 863 | /* Forward declarations */ |
854 | 864 | ||
855 | void bch_count_io_errors(struct cache *, blk_status_t, const char *); | 865 | void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); |
856 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, | 866 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, |
857 | blk_status_t, const char *); | 867 | blk_status_t, const char *); |
858 | void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, | 868 | void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, |
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81e8dc3dbe5e..bf3a48aa9a9a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) | |||
419 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + | 419 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + |
420 | bset_sector_offset(&b->keys, i)); | 420 | bset_sector_offset(&b->keys, i)); |
421 | 421 | ||
422 | if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { | 422 | if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { |
423 | int j; | 423 | int j; |
424 | struct bio_vec *bv; | 424 | struct bio_vec *bv; |
425 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 425 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b) | |||
432 | 432 | ||
433 | continue_at(cl, btree_node_write_done, NULL); | 433 | continue_at(cl, btree_node_write_done, NULL); |
434 | } else { | 434 | } else { |
435 | /* No problem for multipage bvec since the bio is just allocated */ | ||
435 | b->bio->bi_vcnt = 0; | 436 | b->bio->bi_vcnt = 0; |
436 | bch_bio_map(b->bio, i); | 437 | bch_bio_map(b->bio, i); |
437 | 438 | ||
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c) | |||
1678 | 1679 | ||
1679 | /* don't reclaim buckets to which writeback keys point */ | 1680 | /* don't reclaim buckets to which writeback keys point */ |
1680 | rcu_read_lock(); | 1681 | rcu_read_lock(); |
1681 | for (i = 0; i < c->nr_uuids; i++) { | 1682 | for (i = 0; i < c->devices_max_used; i++) { |
1682 | struct bcache_device *d = c->devices[i]; | 1683 | struct bcache_device *d = c->devices[i]; |
1683 | struct cached_dev *dc; | 1684 | struct cached_dev *dc; |
1684 | struct keybuf_key *w, *n; | 1685 | struct keybuf_key *w, *n; |
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg) | |||
1803 | int bch_gc_thread_start(struct cache_set *c) | 1804 | int bch_gc_thread_start(struct cache_set *c) |
1804 | { | 1805 | { |
1805 | c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); | 1806 | c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); |
1806 | if (IS_ERR(c->gc_thread)) | 1807 | return PTR_ERR_OR_ZERO(c->gc_thread); |
1807 | return PTR_ERR(c->gc_thread); | ||
1808 | |||
1809 | return 0; | ||
1810 | } | 1808 | } |
1811 | 1809 | ||
1812 | /* Initial partial gc */ | 1810 | /* Initial partial gc */ |
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 1841d0359bac..7f12920c14f7 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/debugfs.h> | 8 | #include <linux/debugfs.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/seq_file.h> | 10 | #include <linux/seq_file.h> |
11 | #include <linux/sched/debug.h> | ||
11 | 12 | ||
12 | #include "closure.h" | 13 | #include "closure.h" |
13 | 14 | ||
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
18 | BUG_ON(flags & CLOSURE_GUARD_MASK); | 19 | BUG_ON(flags & CLOSURE_GUARD_MASK); |
19 | BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); | 20 | BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); |
20 | 21 | ||
21 | /* Must deliver precisely one wakeup */ | ||
22 | if (r == 1 && (flags & CLOSURE_SLEEPING)) | ||
23 | wake_up_process(cl->task); | ||
24 | |||
25 | if (!r) { | 22 | if (!r) { |
26 | if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { | 23 | if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { |
27 | atomic_set(&cl->remaining, | 24 | atomic_set(&cl->remaining, |
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) | |||
100 | } | 97 | } |
101 | EXPORT_SYMBOL(closure_wait); | 98 | EXPORT_SYMBOL(closure_wait); |
102 | 99 | ||
103 | /** | 100 | struct closure_syncer { |
104 | * closure_sync - sleep until a closure has nothing left to wait on | 101 | struct task_struct *task; |
105 | * | 102 | int done; |
106 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns | 103 | }; |
107 | * the last refcount. | 104 | |
108 | */ | 105 | static void closure_sync_fn(struct closure *cl) |
109 | void closure_sync(struct closure *cl) | ||
110 | { | 106 | { |
111 | while (1) { | 107 | cl->s->done = 1; |
112 | __closure_start_sleep(cl); | 108 | wake_up_process(cl->s->task); |
113 | closure_set_ret_ip(cl); | 109 | } |
114 | 110 | ||
115 | if ((atomic_read(&cl->remaining) & | 111 | void __sched __closure_sync(struct closure *cl) |
116 | CLOSURE_REMAINING_MASK) == 1) | 112 | { |
117 | break; | 113 | struct closure_syncer s = { .task = current }; |
118 | 114 | ||
115 | cl->s = &s; | ||
116 | continue_at(cl, closure_sync_fn, NULL); | ||
117 | |||
118 | while (1) { | ||
119 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
120 | if (s.done) | ||
121 | break; | ||
119 | schedule(); | 122 | schedule(); |
120 | } | 123 | } |
121 | 124 | ||
122 | __closure_end_sleep(cl); | 125 | __set_current_state(TASK_RUNNING); |
123 | } | 126 | } |
124 | EXPORT_SYMBOL(closure_sync); | 127 | EXPORT_SYMBOL(__closure_sync); |
125 | 128 | ||
126 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 129 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
127 | 130 | ||
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data) | |||
168 | cl, (void *) cl->ip, cl->fn, cl->parent, | 171 | cl, (void *) cl->ip, cl->fn, cl->parent, |
169 | r & CLOSURE_REMAINING_MASK); | 172 | r & CLOSURE_REMAINING_MASK); |
170 | 173 | ||
171 | seq_printf(f, "%s%s%s%s\n", | 174 | seq_printf(f, "%s%s\n", |
172 | test_bit(WORK_STRUCT_PENDING_BIT, | 175 | test_bit(WORK_STRUCT_PENDING_BIT, |
173 | work_data_bits(&cl->work)) ? "Q" : "", | 176 | work_data_bits(&cl->work)) ? "Q" : "", |
174 | r & CLOSURE_RUNNING ? "R" : "", | 177 | r & CLOSURE_RUNNING ? "R" : ""); |
175 | r & CLOSURE_STACK ? "S" : "", | ||
176 | r & CLOSURE_SLEEPING ? "Sl" : ""); | ||
177 | 178 | ||
178 | if (r & CLOSURE_WAITING) | 179 | if (r & CLOSURE_WAITING) |
179 | seq_printf(f, " W %pF\n", | 180 | seq_printf(f, " W %pF\n", |
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index ccfbea6f9f6b..3b9dfc9962ad 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h | |||
@@ -103,6 +103,7 @@ | |||
103 | */ | 103 | */ |
104 | 104 | ||
105 | struct closure; | 105 | struct closure; |
106 | struct closure_syncer; | ||
106 | typedef void (closure_fn) (struct closure *); | 107 | typedef void (closure_fn) (struct closure *); |
107 | 108 | ||
108 | struct closure_waitlist { | 109 | struct closure_waitlist { |
@@ -115,10 +116,6 @@ enum closure_state { | |||
115 | * the thread that owns the closure, and cleared by the thread that's | 116 | * the thread that owns the closure, and cleared by the thread that's |
116 | * waking up the closure. | 117 | * waking up the closure. |
117 | * | 118 | * |
118 | * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep | ||
119 | * - indicates that cl->task is valid and closure_put() may wake it up. | ||
120 | * Only set or cleared by the thread that owns the closure. | ||
121 | * | ||
122 | * The rest are for debugging and don't affect behaviour: | 119 | * The rest are for debugging and don't affect behaviour: |
123 | * | 120 | * |
124 | * CLOSURE_RUNNING: Set when a closure is running (i.e. by | 121 | * CLOSURE_RUNNING: Set when a closure is running (i.e. by |
@@ -128,22 +125,16 @@ enum closure_state { | |||
128 | * continue_at() and closure_return() clear it for you, if you're doing | 125 | * continue_at() and closure_return() clear it for you, if you're doing |
129 | * something unusual you can use closure_set_dead() which also helps | 126 | * something unusual you can use closure_set_dead() which also helps |
130 | * annotate where references are being transferred. | 127 | * annotate where references are being transferred. |
131 | * | ||
132 | * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a | ||
133 | * closure with this flag set | ||
134 | */ | 128 | */ |
135 | 129 | ||
136 | CLOSURE_BITS_START = (1 << 23), | 130 | CLOSURE_BITS_START = (1U << 26), |
137 | CLOSURE_DESTRUCTOR = (1 << 23), | 131 | CLOSURE_DESTRUCTOR = (1U << 26), |
138 | CLOSURE_WAITING = (1 << 25), | 132 | CLOSURE_WAITING = (1U << 28), |
139 | CLOSURE_SLEEPING = (1 << 27), | 133 | CLOSURE_RUNNING = (1U << 30), |
140 | CLOSURE_RUNNING = (1 << 29), | ||
141 | CLOSURE_STACK = (1 << 31), | ||
142 | }; | 134 | }; |
143 | 135 | ||
144 | #define CLOSURE_GUARD_MASK \ | 136 | #define CLOSURE_GUARD_MASK \ |
145 | ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ | 137 | ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) |
146 | CLOSURE_RUNNING|CLOSURE_STACK) << 1) | ||
147 | 138 | ||
148 | #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) | 139 | #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) |
149 | #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) | 140 | #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) |
@@ -152,7 +143,7 @@ struct closure { | |||
152 | union { | 143 | union { |
153 | struct { | 144 | struct { |
154 | struct workqueue_struct *wq; | 145 | struct workqueue_struct *wq; |
155 | struct task_struct *task; | 146 | struct closure_syncer *s; |
156 | struct llist_node list; | 147 | struct llist_node list; |
157 | closure_fn *fn; | 148 | closure_fn *fn; |
158 | }; | 149 | }; |
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v); | |||
178 | void closure_put(struct closure *cl); | 169 | void closure_put(struct closure *cl); |
179 | void __closure_wake_up(struct closure_waitlist *list); | 170 | void __closure_wake_up(struct closure_waitlist *list); |
180 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | 171 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); |
181 | void closure_sync(struct closure *cl); | 172 | void __closure_sync(struct closure *cl); |
173 | |||
174 | /** | ||
175 | * closure_sync - sleep until a closure has nothing left to wait on | ||
176 | * | ||
177 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns | ||
178 | * the last refcount. | ||
179 | */ | ||
180 | static inline void closure_sync(struct closure *cl) | ||
181 | { | ||
182 | if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) | ||
183 | __closure_sync(cl); | ||
184 | } | ||
182 | 185 | ||
183 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 186 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
184 | 187 | ||
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f) | |||
215 | #endif | 218 | #endif |
216 | } | 219 | } |
217 | 220 | ||
218 | static inline void __closure_end_sleep(struct closure *cl) | ||
219 | { | ||
220 | __set_current_state(TASK_RUNNING); | ||
221 | |||
222 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | ||
223 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | ||
224 | } | ||
225 | |||
226 | static inline void __closure_start_sleep(struct closure *cl) | ||
227 | { | ||
228 | closure_set_ip(cl); | ||
229 | cl->task = current; | ||
230 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
231 | |||
232 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
233 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
234 | } | ||
235 | |||
236 | static inline void closure_set_stopped(struct closure *cl) | 221 | static inline void closure_set_stopped(struct closure *cl) |
237 | { | 222 | { |
238 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); | 223 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); |
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl) | |||
241 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | 226 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, |
242 | struct workqueue_struct *wq) | 227 | struct workqueue_struct *wq) |
243 | { | 228 | { |
244 | BUG_ON(object_is_on_stack(cl)); | ||
245 | closure_set_ip(cl); | 229 | closure_set_ip(cl); |
246 | cl->fn = fn; | 230 | cl->fn = fn; |
247 | cl->wq = wq; | 231 | cl->wq = wq; |
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent) | |||
300 | static inline void closure_init_stack(struct closure *cl) | 284 | static inline void closure_init_stack(struct closure *cl) |
301 | { | 285 | { |
302 | memset(cl, 0, sizeof(struct closure)); | 286 | memset(cl, 0, sizeof(struct closure)); |
303 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); | 287 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); |
304 | } | 288 | } |
305 | 289 | ||
306 | /** | 290 | /** |
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list) | |||
322 | * This is because after calling continue_at() you no longer have a ref on @cl, | 306 | * This is because after calling continue_at() you no longer have a ref on @cl, |
323 | * and whatever @cl owns may be freed out from under you - a running closure fn | 307 | * and whatever @cl owns may be freed out from under you - a running closure fn |
324 | * has a ref on its own closure which continue_at() drops. | 308 | * has a ref on its own closure which continue_at() drops. |
309 | * | ||
310 | * Note you are expected to immediately return after using this macro. | ||
325 | */ | 311 | */ |
326 | #define continue_at(_cl, _fn, _wq) \ | 312 | #define continue_at(_cl, _fn, _wq) \ |
327 | do { \ | 313 | do { \ |
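For reference, the caller-side pattern is unchanged by the rework: a stack closure plus closure_sync(). A minimal sketch, with the async side elided; example_wait_for_ios() is not a function from this series:

    static void example_wait_for_ios(void)
    {
            struct closure cl;

            closure_init_stack(&cl);

            /* ... each async submission takes a reference, e.g. via closure_get(&cl),
             * closure_call(), or closure_bio_submit(), and drops it on completion ... */

            /* Fast path returns immediately when remaining == 1; otherwise
             * __closure_sync() parks the task on a closure_syncer until the
             * last outstanding reference is put. */
            closure_sync(&cl);
    }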
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4900da..af89408befe8 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) | |||
116 | return; | 116 | return; |
117 | check->bi_opf = REQ_OP_READ; | 117 | check->bi_opf = REQ_OP_READ; |
118 | 118 | ||
119 | if (bio_alloc_pages(check, GFP_NOIO)) | 119 | if (bch_bio_alloc_pages(check, GFP_NOIO)) |
120 | goto out_put; | 120 | goto out_put; |
121 | 121 | ||
122 | submit_bio_wait(check); | 122 | submit_bio_wait(check); |
@@ -251,8 +251,7 @@ void bch_debug_exit(void) | |||
251 | 251 | ||
252 | int __init bch_debug_init(struct kobject *kobj) | 252 | int __init bch_debug_init(struct kobject *kobj) |
253 | { | 253 | { |
254 | int ret = 0; | ||
255 | |||
256 | debug = debugfs_create_dir("bcache", NULL); | 254 | debug = debugfs_create_dir("bcache", NULL); |
257 | return ret; | 255 | |
256 | return IS_ERR_OR_NULL(debug); | ||
258 | } | 257 | } |
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fac97ec2d0e2..a783c5a41ff1 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c | |||
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, | |||
51 | 51 | ||
52 | /* IO errors */ | 52 | /* IO errors */ |
53 | 53 | ||
54 | void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) | 54 | void bch_count_io_errors(struct cache *ca, |
55 | blk_status_t error, | ||
56 | int is_read, | ||
57 | const char *m) | ||
55 | { | 58 | { |
56 | /* | 59 | /* |
57 | * The halflife of an error is: | 60 | * The halflife of an error is: |
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) | |||
94 | errors >>= IO_ERROR_SHIFT; | 97 | errors >>= IO_ERROR_SHIFT; |
95 | 98 | ||
96 | if (errors < ca->set->error_limit) | 99 | if (errors < ca->set->error_limit) |
97 | pr_err("%s: IO error on %s, recovering", | 100 | pr_err("%s: IO error on %s%s", |
98 | bdevname(ca->bdev, buf), m); | 101 | bdevname(ca->bdev, buf), m, |
102 | is_read ? ", recovering." : "."); | ||
99 | else | 103 | else |
100 | bch_cache_set_error(ca->set, | 104 | bch_cache_set_error(ca->set, |
101 | "%s: too many IO errors %s", | 105 | "%s: too many IO errors %s", |
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, | |||
108 | { | 112 | { |
109 | struct bbio *b = container_of(bio, struct bbio, bio); | 113 | struct bbio *b = container_of(bio, struct bbio, bio); |
110 | struct cache *ca = PTR_CACHE(c, &b->key, 0); | 114 | struct cache *ca = PTR_CACHE(c, &b->key, 0); |
115 | int is_read = (bio_data_dir(bio) == READ ? 1 : 0); | ||
111 | 116 | ||
112 | unsigned threshold = op_is_write(bio_op(bio)) | 117 | unsigned threshold = op_is_write(bio_op(bio)) |
113 | ? c->congested_write_threshold_us | 118 | ? c->congested_write_threshold_us |
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, | |||
129 | atomic_inc(&c->congested); | 134 | atomic_inc(&c->congested); |
130 | } | 135 | } |
131 | 136 | ||
132 | bch_count_io_errors(ca, error, m); | 137 | bch_count_io_errors(ca, error, is_read, m); |
133 | } | 138 | } |
134 | 139 | ||
135 | void bch_bbio_endio(struct cache_set *c, struct bio *bio, | 140 | void bch_bbio_endio(struct cache_set *c, struct bio *bio, |
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c97da68..a24c3a95b2c0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) | |||
162 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 162 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
163 | bio->bi_end_io = read_moving_endio; | 163 | bio->bi_end_io = read_moving_endio; |
164 | 164 | ||
165 | if (bio_alloc_pages(bio, GFP_KERNEL)) | 165 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) |
166 | goto err; | 166 | goto err; |
167 | 167 | ||
168 | trace_bcache_gc_copy(&w->key); | 168 | trace_bcache_gc_copy(&w->key); |
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021624f..1a46b41dac70 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl) | |||
576 | { | 576 | { |
577 | struct search *s = container_of(cl, struct search, iop.cl); | 577 | struct search *s = container_of(cl, struct search, iop.cl); |
578 | struct bio *bio = &s->bio.bio; | 578 | struct bio *bio = &s->bio.bio; |
579 | struct cached_dev *dc; | ||
579 | int ret; | 580 | int ret; |
580 | 581 | ||
581 | bch_btree_op_init(&s->op, -1); | 582 | bch_btree_op_init(&s->op, -1); |
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl) | |||
588 | return; | 589 | return; |
589 | } | 590 | } |
590 | 591 | ||
592 | /* | ||
593 | * We might hit an error when searching the btree. If that happens, ret | ||
594 | * will be negative; in this scenario we should not recover data from the | ||
595 | * backing device (when the cache device is dirty) because we don't know | ||
596 | * whether the bkeys covered by the read request are all clean. | ||
597 | * | ||
598 | * If that happens, s->iop.status still holds its initial value from | ||
599 | * before we submitted s->bio.bio. | ||
600 | */ | ||
601 | if (ret < 0) { | ||
602 | BUG_ON(ret == -EINTR); | ||
603 | if (s->d && s->d->c && | ||
604 | !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) { | ||
605 | dc = container_of(s->d, struct cached_dev, disk); | ||
606 | if (dc && atomic_read(&dc->has_dirty)) | ||
607 | s->recoverable = false; | ||
608 | } | ||
609 | if (!s->iop.status) | ||
610 | s->iop.status = BLK_STS_IOERR; | ||
611 | } | ||
612 | |||
591 | closure_return(cl); | 613 | closure_return(cl); |
592 | } | 614 | } |
593 | 615 | ||
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio) | |||
611 | static void bio_complete(struct search *s) | 633 | static void bio_complete(struct search *s) |
612 | { | 634 | { |
613 | if (s->orig_bio) { | 635 | if (s->orig_bio) { |
614 | struct request_queue *q = s->orig_bio->bi_disk->queue; | 636 | generic_end_io_acct(s->d->disk->queue, |
615 | generic_end_io_acct(q, bio_data_dir(s->orig_bio), | 637 | bio_data_dir(s->orig_bio), |
616 | &s->d->disk->part0, s->start_time); | 638 | &s->d->disk->part0, s->start_time); |
617 | 639 | ||
618 | trace_bcache_request_end(s->d, s->orig_bio); | 640 | trace_bcache_request_end(s->d, s->orig_bio); |
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
841 | cache_bio->bi_private = &s->cl; | 863 | cache_bio->bi_private = &s->cl; |
842 | 864 | ||
843 | bch_bio_map(cache_bio, NULL); | 865 | bch_bio_map(cache_bio, NULL); |
844 | if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) | 866 | if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) |
845 | goto out_put; | 867 | goto out_put; |
846 | 868 | ||
847 | if (reada) | 869 | if (reada) |
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, | |||
974 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 996 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
975 | int rw = bio_data_dir(bio); | 997 | int rw = bio_data_dir(bio); |
976 | 998 | ||
999 | atomic_set(&dc->backing_idle, 0); | ||
977 | generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); | 1000 | generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); |
978 | 1001 | ||
979 | bio_set_dev(bio, dc->bdev); | 1002 | bio_set_dev(bio, dc->bdev); |
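The new error branch in cache_lookup() boils down to a small rule: after a btree lookup error, falling back to the backing device is only safe when no dirty data could be hiding in the cache. An illustrative restatement of that rule (example_may_recover_from_backing() is not a helper from the patch):

    static bool example_may_recover_from_backing(struct bcache_device *d)
    {
            struct cached_dev *dc;

            if (!d || !d->c)
                    return true;    /* nothing known about the cache set */
            if (UUID_FLASH_ONLY(&d->c->uuids[d->id]))
                    return true;    /* flash-only volume: no backing device data to mismatch */

            dc = container_of(d, struct cached_dev, disk);
            return !atomic_read(&dc->has_dirty);    /* dirty cache: backing data may be stale */
    }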
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b4d28928dec5..133b81225ea9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio) | |||
211 | 211 | ||
212 | static void __write_super(struct cache_sb *sb, struct bio *bio) | 212 | static void __write_super(struct cache_sb *sb, struct bio *bio) |
213 | { | 213 | { |
214 | struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); | 214 | struct cache_sb *out = page_address(bio_first_page_all(bio)); |
215 | unsigned i; | 215 | unsigned i; |
216 | 216 | ||
217 | bio->bi_iter.bi_sector = SB_SECTOR; | 217 | bio->bi_iter.bi_sector = SB_SECTOR; |
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio) | |||
274 | { | 274 | { |
275 | struct cache *ca = bio->bi_private; | 275 | struct cache *ca = bio->bi_private; |
276 | 276 | ||
277 | bch_count_io_errors(ca, bio->bi_status, "writing superblock"); | 277 | /* is_read = 0 */ |
278 | bch_count_io_errors(ca, bio->bi_status, 0, | ||
279 | "writing superblock"); | ||
278 | closure_put(&ca->set->sb_write); | 280 | closure_put(&ca->set->sb_write); |
279 | } | 281 | } |
280 | 282 | ||
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, | |||
721 | d->c = c; | 723 | d->c = c; |
722 | c->devices[id] = d; | 724 | c->devices[id] = d; |
723 | 725 | ||
726 | if (id >= c->devices_max_used) | ||
727 | c->devices_max_used = id + 1; | ||
728 | |||
724 | closure_get(&c->caching); | 729 | closure_get(&c->caching); |
725 | } | 730 | } |
726 | 731 | ||
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w) | |||
906 | 911 | ||
907 | mutex_lock(&bch_register_lock); | 912 | mutex_lock(&bch_register_lock); |
908 | 913 | ||
914 | cancel_delayed_work_sync(&dc->writeback_rate_update); | ||
915 | if (!IS_ERR_OR_NULL(dc->writeback_thread)) { | ||
916 | kthread_stop(dc->writeback_thread); | ||
917 | dc->writeback_thread = NULL; | ||
918 | } | ||
919 | |||
909 | memset(&dc->sb.set_uuid, 0, 16); | 920 | memset(&dc->sb.set_uuid, 0, 16); |
910 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); | 921 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); |
911 | 922 | ||
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, | |||
1166 | dc->bdev->bd_holder = dc; | 1177 | dc->bdev->bd_holder = dc; |
1167 | 1178 | ||
1168 | bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); | 1179 | bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); |
1169 | dc->sb_bio.bi_io_vec[0].bv_page = sb_page; | 1180 | bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; |
1170 | get_page(sb_page); | 1181 | get_page(sb_page); |
1171 | 1182 | ||
1172 | if (cached_dev_init(dc, sb->block_size << 9)) | 1183 | if (cached_dev_init(dc, sb->block_size << 9)) |
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c) | |||
1261 | struct uuid_entry *u; | 1272 | struct uuid_entry *u; |
1262 | 1273 | ||
1263 | for (u = c->uuids; | 1274 | for (u = c->uuids; |
1264 | u < c->uuids + c->nr_uuids && !ret; | 1275 | u < c->uuids + c->devices_max_used && !ret; |
1265 | u++) | 1276 | u++) |
1266 | if (UUID_FLASH_ONLY(u)) | 1277 | if (UUID_FLASH_ONLY(u)) |
1267 | ret = flash_dev_run(c, u); | 1278 | ret = flash_dev_run(c, u); |
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl) | |||
1427 | 1438 | ||
1428 | mutex_lock(&bch_register_lock); | 1439 | mutex_lock(&bch_register_lock); |
1429 | 1440 | ||
1430 | for (i = 0; i < c->nr_uuids; i++) | 1441 | for (i = 0; i < c->devices_max_used; i++) |
1431 | if (c->devices[i]) { | 1442 | if (c->devices[i]) { |
1432 | if (!UUID_FLASH_ONLY(&c->uuids[i]) && | 1443 | if (!UUID_FLASH_ONLY(&c->uuids[i]) && |
1433 | test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { | 1444 | test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { |
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1490 | c->bucket_bits = ilog2(sb->bucket_size); | 1501 | c->bucket_bits = ilog2(sb->bucket_size); |
1491 | c->block_bits = ilog2(sb->block_size); | 1502 | c->block_bits = ilog2(sb->block_size); |
1492 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); | 1503 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); |
1493 | 1504 | c->devices_max_used = 0; | |
1494 | c->btree_pages = bucket_pages(c); | 1505 | c->btree_pages = bucket_pages(c); |
1495 | if (c->btree_pages > BTREE_MAX_PAGES) | 1506 | if (c->btree_pages > BTREE_MAX_PAGES) |
1496 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1507 | c->btree_pages = max_t(int, c->btree_pages / 4, |
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj) | |||
1810 | free_fifo(&ca->free[i]); | 1821 | free_fifo(&ca->free[i]); |
1811 | 1822 | ||
1812 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) | 1823 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) |
1813 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); | 1824 | put_page(bio_first_page_all(&ca->sb_bio)); |
1814 | 1825 | ||
1815 | if (!IS_ERR_OR_NULL(ca->bdev)) | 1826 | if (!IS_ERR_OR_NULL(ca->bdev)) |
1816 | blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | 1827 | blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, | |||
1864 | ca->bdev->bd_holder = ca; | 1875 | ca->bdev->bd_holder = ca; |
1865 | 1876 | ||
1866 | bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); | 1877 | bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); |
1867 | ca->sb_bio.bi_io_vec[0].bv_page = sb_page; | 1878 | bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; |
1868 | get_page(sb_page); | 1879 | get_page(sb_page); |
1869 | 1880 | ||
1870 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) | 1881 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) |
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index e548b8b51322..a23cd6a14b74 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) | |||
249 | : 0; | 249 | : 0; |
250 | } | 250 | } |
251 | 251 | ||
252 | /* | ||
253 | * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly, | ||
254 | * the preferred way is bio_add_page, but in this case, bch_bio_map() | ||
255 | * supposes that the bvec table is empty, so it is safe to access | ||
256 | * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is | ||
257 | * supported. | ||
258 | */ | ||
252 | void bch_bio_map(struct bio *bio, void *base) | 259 | void bch_bio_map(struct bio *bio, void *base) |
253 | { | 260 | { |
254 | size_t size = bio->bi_iter.bi_size; | 261 | size_t size = bio->bi_iter.bi_size; |
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, | |||
276 | } | 283 | } |
277 | } | 284 | } |
278 | 285 | ||
286 | /** | ||
287 | * bch_bio_alloc_pages - allocates a single page for each bvec in a bio | ||
288 | * @bio: bio to allocate pages for | ||
289 | * @gfp_mask: flags for allocation | ||
290 | * | ||
291 | * Allocates pages up to @bio->bi_vcnt. | ||
292 | * | ||
293 | * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are | ||
294 | * freed. | ||
295 | */ | ||
296 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) | ||
297 | { | ||
298 | int i; | ||
299 | struct bio_vec *bv; | ||
300 | |||
301 | bio_for_each_segment_all(bv, bio, i) { | ||
302 | bv->bv_page = alloc_page(gfp_mask); | ||
303 | if (!bv->bv_page) { | ||
304 | while (--bv >= bio->bi_io_vec) | ||
305 | __free_page(bv->bv_page); | ||
306 | return -ENOMEM; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | return 0; | ||
311 | } | ||
312 | |||
279 | /* | 313 | /* |
280 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | 314 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any |
281 | * use permitted, subject to terms of PostgreSQL license; see.) | 315 | * use permitted, subject to terms of PostgreSQL license; see.) |
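For callers, the conversion in this series keeps the established two-step pattern: size the bio first, then back it with pages. A small sketch (not a call site from the series; it assumes bio->bi_iter.bi_size was already set by the caller):

    static int example_alloc_data_bio(struct bio *bio)
    {
            /* Fill in bv_len/bv_offset and bi_vcnt from bi_iter.bi_size;
             * with a NULL base no pages are assigned yet. */
            bch_bio_map(bio, NULL);

            /* Back every bvec with a freshly allocated page; on failure the
             * helper has already freed any partial allocation. */
            if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO))
                    return -ENOMEM;

            return 0;
    }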
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a412eb8..4df4c5c1cab2 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) | |||
558 | } | 558 | } |
559 | 559 | ||
560 | void bch_bio_map(struct bio *bio, void *base); | 560 | void bch_bio_map(struct bio *bio, void *base); |
561 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); | ||
561 | 562 | ||
562 | static inline sector_t bdev_sectors(struct block_device *bdev) | 563 | static inline sector_t bdev_sectors(struct block_device *bdev) |
563 | { | 564 | { |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a37884ca8b..51306a19ab03 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -18,17 +18,39 @@ | |||
18 | #include <trace/events/bcache.h> | 18 | #include <trace/events/bcache.h> |
19 | 19 | ||
20 | /* Rate limiting */ | 20 | /* Rate limiting */ |
21 | 21 | static uint64_t __calc_target_rate(struct cached_dev *dc) | |
22 | static void __update_writeback_rate(struct cached_dev *dc) | ||
23 | { | 22 | { |
24 | struct cache_set *c = dc->disk.c; | 23 | struct cache_set *c = dc->disk.c; |
24 | |||
25 | /* | ||
26 | * This is the size of the cache, minus the amount used for | ||
27 | * flash-only devices | ||
28 | */ | ||
25 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - | 29 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - |
26 | bcache_flash_devs_sectors_dirty(c); | 30 | bcache_flash_devs_sectors_dirty(c); |
31 | |||
32 | /* | ||
33 | * Unfortunately there is no control of global dirty data. If the | ||
34 | * user states that they want 10% dirty data in the cache, and has, | ||
35 | * e.g., 5 backing volumes of equal size, we try and ensure each | ||
36 | * backing volume uses about 2% of the cache for dirty data. | ||
37 | */ | ||
38 | uint32_t bdev_share = | ||
39 | div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, | ||
40 | c->cached_dev_sectors); | ||
41 | |||
27 | uint64_t cache_dirty_target = | 42 | uint64_t cache_dirty_target = |
28 | div_u64(cache_sectors * dc->writeback_percent, 100); | 43 | div_u64(cache_sectors * dc->writeback_percent, 100); |
29 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | ||
30 | c->cached_dev_sectors); | ||
31 | 44 | ||
45 | /* Ensure each backing dev gets at least one dirty share */ | ||
46 | if (bdev_share < 1) | ||
47 | bdev_share = 1; | ||
48 | |||
49 | return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; | ||
50 | } | ||
51 | |||
52 | static void __update_writeback_rate(struct cached_dev *dc) | ||
53 | { | ||
32 | /* | 54 | /* |
33 | * PI controller: | 55 | * PI controller: |
34 | * Figures out the amount that should be written per second. | 56 | * Figures out the amount that should be written per second. |
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
49 | * This acts as a slow, long-term average that is not subject to | 71 | * This acts as a slow, long-term average that is not subject to |
50 | * variations in usage like the p term. | 72 | * variations in usage like the p term. |
51 | */ | 73 | */ |
74 | int64_t target = __calc_target_rate(dc); | ||
52 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); | 75 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
53 | int64_t error = dirty - target; | 76 | int64_t error = dirty - target; |
54 | int64_t proportional_scaled = | 77 | int64_t proportional_scaled = |
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | |||
116 | struct dirty_io { | 139 | struct dirty_io { |
117 | struct closure cl; | 140 | struct closure cl; |
118 | struct cached_dev *dc; | 141 | struct cached_dev *dc; |
142 | uint16_t sequence; | ||
119 | struct bio bio; | 143 | struct bio bio; |
120 | }; | 144 | }; |
121 | 145 | ||
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl) | |||
194 | { | 218 | { |
195 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 219 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
196 | struct keybuf_key *w = io->bio.bi_private; | 220 | struct keybuf_key *w = io->bio.bi_private; |
221 | struct cached_dev *dc = io->dc; | ||
222 | |||
223 | uint16_t next_sequence; | ||
224 | |||
225 | if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { | ||
226 | /* Not our turn to write; wait for a write to complete */ | ||
227 | closure_wait(&dc->writeback_ordering_wait, cl); | ||
228 | |||
229 | if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { | ||
230 | /* | ||
231 | * Edge case: the sequence advanced in indeterminate order | ||
232 | * relative to when we were added to the wait list. | ||
233 | */ | ||
234 | closure_wake_up(&dc->writeback_ordering_wait); | ||
235 | } | ||
236 | |||
237 | continue_at(cl, write_dirty, io->dc->writeback_write_wq); | ||
238 | return; | ||
239 | } | ||
240 | |||
241 | next_sequence = io->sequence + 1; | ||
197 | 242 | ||
198 | /* | 243 | /* |
199 | * IO errors are signalled using the dirty bit on the key. | 244 | * IO errors are signalled using the dirty bit on the key. |
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl) | |||
211 | closure_bio_submit(&io->bio, cl); | 256 | closure_bio_submit(&io->bio, cl); |
212 | } | 257 | } |
213 | 258 | ||
259 | atomic_set(&dc->writeback_sequence_next, next_sequence); | ||
260 | closure_wake_up(&dc->writeback_ordering_wait); | ||
261 | |||
214 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); | 262 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
215 | } | 263 | } |
216 | 264 | ||
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio) | |||
219 | struct keybuf_key *w = bio->bi_private; | 267 | struct keybuf_key *w = bio->bi_private; |
220 | struct dirty_io *io = w->private; | 268 | struct dirty_io *io = w->private; |
221 | 269 | ||
270 | /* is_read = 1 */ | ||
222 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | 271 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), |
223 | bio->bi_status, "reading dirty data from cache"); | 272 | bio->bi_status, 1, |
273 | "reading dirty data from cache"); | ||
224 | 274 | ||
225 | dirty_endio(bio); | 275 | dirty_endio(bio); |
226 | } | 276 | } |
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl) | |||
237 | static void read_dirty(struct cached_dev *dc) | 287 | static void read_dirty(struct cached_dev *dc) |
238 | { | 288 | { |
239 | unsigned delay = 0; | 289 | unsigned delay = 0; |
240 | struct keybuf_key *w; | 290 | struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; |
291 | size_t size; | ||
292 | int nk, i; | ||
241 | struct dirty_io *io; | 293 | struct dirty_io *io; |
242 | struct closure cl; | 294 | struct closure cl; |
295 | uint16_t sequence = 0; | ||
243 | 296 | ||
297 | BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); | ||
298 | atomic_set(&dc->writeback_sequence_next, sequence); | ||
244 | closure_init_stack(&cl); | 299 | closure_init_stack(&cl); |
245 | 300 | ||
246 | /* | 301 | /* |
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc) | |||
248 | * mempools. | 303 | * mempools. |
249 | */ | 304 | */ |
250 | 305 | ||
251 | while (!kthread_should_stop()) { | 306 | next = bch_keybuf_next(&dc->writeback_keys); |
252 | 307 | ||
253 | w = bch_keybuf_next(&dc->writeback_keys); | 308 | while (!kthread_should_stop() && next) { |
254 | if (!w) | 309 | size = 0; |
255 | break; | 310 | nk = 0; |
256 | 311 | ||
257 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | 312 | do { |
258 | 313 | BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); | |
259 | if (KEY_START(&w->key) != dc->last_read || | 314 | |
260 | jiffies_to_msecs(delay) > 50) | 315 | /* |
261 | while (!kthread_should_stop() && delay) | 316 | * Don't combine too many operations, even if they |
262 | delay = schedule_timeout_interruptible(delay); | 317 | * are all small. |
263 | 318 | */ | |
264 | dc->last_read = KEY_OFFSET(&w->key); | 319 | if (nk >= MAX_WRITEBACKS_IN_PASS) |
265 | 320 | break; | |
266 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | 321 | |
267 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | 322 | /* |
268 | GFP_KERNEL); | 323 | * If the current operation is very large, don't |
269 | if (!io) | 324 | * further combine operations. |
270 | goto err; | 325 | */ |
271 | 326 | if (size >= MAX_WRITESIZE_IN_PASS) | |
272 | w->private = io; | 327 | break; |
273 | io->dc = dc; | 328 | |
274 | 329 | /* | |
275 | dirty_init(w); | 330 | * Operations are only eligible to be combined |
276 | bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); | 331 | * if they are contiguous. |
277 | io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); | 332 | * |
278 | bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); | 333 | * TODO: add a heuristic willing to fire a |
279 | io->bio.bi_end_io = read_dirty_endio; | 334 | * certain amount of non-contiguous IO per pass, |
280 | 335 | * so that we can benefit from backing device | |
281 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) | 336 | * command queueing. |
282 | goto err_free; | 337 | */ |
338 | if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, | ||
339 | &START_KEY(&next->key))) | ||
340 | break; | ||
341 | |||
342 | size += KEY_SIZE(&next->key); | ||
343 | keys[nk++] = next; | ||
344 | } while ((next = bch_keybuf_next(&dc->writeback_keys))); | ||
345 | |||
346 | /* Now we have gathered a set of 1..5 keys to write back. */ | ||
347 | for (i = 0; i < nk; i++) { | ||
348 | w = keys[i]; | ||
349 | |||
350 | io = kzalloc(sizeof(struct dirty_io) + | ||
351 | sizeof(struct bio_vec) * | ||
352 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | ||
353 | GFP_KERNEL); | ||
354 | if (!io) | ||
355 | goto err; | ||
356 | |||
357 | w->private = io; | ||
358 | io->dc = dc; | ||
359 | io->sequence = sequence++; | ||
360 | |||
361 | dirty_init(w); | ||
362 | bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); | ||
363 | io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); | ||
364 | bio_set_dev(&io->bio, | ||
365 | PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); | ||
366 | io->bio.bi_end_io = read_dirty_endio; | ||
367 | |||
368 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | ||
369 | goto err_free; | ||
370 | |||
371 | trace_bcache_writeback(&w->key); | ||
372 | |||
373 | down(&dc->in_flight); | ||
374 | |||
375 | /* We've acquired a semaphore for the maximum | ||
376 | * simultaneous number of writebacks; from here | ||
377 | * everything happens asynchronously. | ||
378 | */ | ||
379 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); | ||
380 | } | ||
283 | 381 | ||
284 | trace_bcache_writeback(&w->key); | 382 | delay = writeback_delay(dc, size); |
285 | 383 | ||
286 | down(&dc->in_flight); | 384 | /* If the control system would wait for at least half a |
287 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); | 385 | * second, and there's been no reqs hitting the backing disk |
386 | * for awhile: use an alternate mode where we have at most | ||
387 | * one contiguous set of writebacks in flight at a time. If | ||
388 | * someone wants to do IO it will be quick, as it will only | ||
389 | * have to contend with one operation in flight, and we'll | ||
390 | * be round-tripping data to the backing disk as quickly as | ||
391 | * it can accept it. | ||
392 | */ | ||
393 | if (delay >= HZ / 2) { | ||
394 | /* 3 means at least 1.5 seconds, up to 7.5 if we | ||
395 | * have slowed way down. | ||
396 | */ | ||
397 | if (atomic_inc_return(&dc->backing_idle) >= 3) { | ||
398 | /* Wait for current I/Os to finish */ | ||
399 | closure_sync(&cl); | ||
400 | /* And immediately launch a new set. */ | ||
401 | delay = 0; | ||
402 | } | ||
403 | } | ||
288 | 404 | ||
289 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | 405 | while (!kthread_should_stop() && delay) { |
406 | schedule_timeout_interruptible(delay); | ||
407 | delay = writeback_delay(dc, 0); | ||
408 | } | ||
290 | } | 409 | } |
291 | 410 | ||
292 | if (0) { | 411 | if (0) { |
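One detail of the batching loop that is easy to miss: bcache extent keys are addressed by their end offset, so the bkey_cmp() against START_KEY(&next->key) is a pure contiguity test. A named restatement of just that check (example_keys_contiguous() is illustrative, not a helper from the patch):

    /* Two writeback keys may be combined only when the previous extent ends
     * exactly where the next one starts (same inode, adjacent offsets). */
    static bool example_keys_contiguous(struct keybuf_key *prev, struct keybuf_key *next)
    {
            /* START_KEY() rewinds a key by KEY_SIZE(), i.e. gives its start offset */
            return !bkey_cmp(&prev->key, &START_KEY(&next->key));
    }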
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index a9e3ffb4b03c..66f1c527fa24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h | |||
@@ -5,6 +5,16 @@ | |||
5 | #define CUTOFF_WRITEBACK 40 | 5 | #define CUTOFF_WRITEBACK 40 |
6 | #define CUTOFF_WRITEBACK_SYNC 70 | 6 | #define CUTOFF_WRITEBACK_SYNC 70 |
7 | 7 | ||
8 | #define MAX_WRITEBACKS_IN_PASS 5 | ||
9 | #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ | ||
10 | |||
11 | /* | ||
12 | * 14 (i.e. shares expressed in 16384ths) is chosen so that each backing | ||
13 | * device gets a reasonable fraction of the share, and the arithmetic | ||
14 | * doesn't blow up until individual backing devices reach a petabyte. | ||
15 | */ | ||
16 | #define WRITEBACK_SHARE_SHIFT 14 | ||
17 | |||
8 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | 18 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) |
9 | { | 19 | { |
10 | uint64_t i, ret = 0; | 20 | uint64_t i, ret = 0; |
@@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) | |||
21 | 31 | ||
22 | mutex_lock(&bch_register_lock); | 32 | mutex_lock(&bch_register_lock); |
23 | 33 | ||
24 | for (i = 0; i < c->nr_uuids; i++) { | 34 | for (i = 0; i < c->devices_max_used; i++) { |
25 | struct bcache_device *d = c->devices[i]; | 35 | struct bcache_device *d = c->devices[i]; |
26 | 36 | ||
27 | if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) | 37 | if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) |
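To make the fixed-point share in __calc_target_rate() concrete, assume five equal-size backing volumes behind one cache and writeback_percent set to 10 (numbers chosen for illustration only):

    bdev_share         = (bdev_sectors << 14) / cached_dev_sectors
                       = 16384 / 5 = 3276              (about 1/5, counted in 16384ths)
    cache_dirty_target = cache_sectors * 10 / 100
    per-device target  = (cache_dirty_target * 3276) >> 14
                       ≈ 0.2 * cache_dirty_target      (about 2% of the cache each)

The clamp to at least one share only kicks in once a single backing volume is smaller than 1/16384 of the total backing space.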
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 554d60394c06..2ad429100d25 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) | |||
1446 | bio_for_each_segment_all(bv, clone, i) { | 1446 | bio_for_each_segment_all(bv, clone, i) { |
1447 | BUG_ON(!bv->bv_page); | 1447 | BUG_ON(!bv->bv_page); |
1448 | mempool_free(bv->bv_page, cc->page_pool); | 1448 | mempool_free(bv->bv_page, cc->page_pool); |
1449 | bv->bv_page = NULL; | ||
1450 | } | 1449 | } |
1451 | } | 1450 | } |
1452 | 1451 | ||
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f7810cc869ac..ef57c6d1c887 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work) | |||
1475 | activate_or_offline_path(pgpath); | 1475 | activate_or_offline_path(pgpath); |
1476 | } | 1476 | } |
1477 | 1477 | ||
1478 | static int noretry_error(blk_status_t error) | ||
1479 | { | ||
1480 | switch (error) { | ||
1481 | case BLK_STS_NOTSUPP: | ||
1482 | case BLK_STS_NOSPC: | ||
1483 | case BLK_STS_TARGET: | ||
1484 | case BLK_STS_NEXUS: | ||
1485 | case BLK_STS_MEDIUM: | ||
1486 | return 1; | ||
1487 | } | ||
1488 | |||
1489 | /* Anything else could be a path failure, so should be retried */ | ||
1490 | return 0; | ||
1491 | } | ||
1492 | |||
1493 | static int multipath_end_io(struct dm_target *ti, struct request *clone, | 1478 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
1494 | blk_status_t error, union map_info *map_context) | 1479 | blk_status_t error, union map_info *map_context) |
1495 | { | 1480 | { |
@@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, | |||
1508 | * request into dm core, which will remake a clone request and | 1493 | * request into dm core, which will remake a clone request and |
1509 | * clone bios for it and resubmit it later. | 1494 | * clone bios for it and resubmit it later. |
1510 | */ | 1495 | */ |
1511 | if (error && !noretry_error(error)) { | 1496 | if (error && blk_path_error(error)) { |
1512 | struct multipath *m = ti->private; | 1497 | struct multipath *m = ti->private; |
1513 | 1498 | ||
1514 | r = DM_ENDIO_REQUEUE; | 1499 | r = DM_ENDIO_REQUEUE; |
@@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, | |||
1544 | unsigned long flags; | 1529 | unsigned long flags; |
1545 | int r = DM_ENDIO_DONE; | 1530 | int r = DM_ENDIO_DONE; |
1546 | 1531 | ||
1547 | if (!*error || noretry_error(*error)) | 1532 | if (!*error || !blk_path_error(*error)) |
1548 | goto done; | 1533 | goto done; |
1549 | 1534 | ||
1550 | if (pgpath) | 1535 | if (pgpath) |
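Both hunks replace the local noretry_error() with blk_path_error() and invert the test, so the new helper has to return true exactly where the old one returned 0. A sketch of the semantics dm-mpath now relies on, derived from the removed function rather than quoted from the block layer:

    /* true means the failure may be path-related and is worth retrying on
     * another path; the listed statuses are target-side errors where a retry
     * elsewhere cannot help. */
    static inline bool example_path_error(blk_status_t error)
    {
            switch (error) {
            case BLK_STS_NOTSUPP:
            case BLK_STS_NOSPC:
            case BLK_STS_TARGET:
            case BLK_STS_NEXUS:
            case BLK_STS_MEDIUM:
                    return false;
            default:
                    return true;
            }
    }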
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9d32f25489c2..b7d175e94a02 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c | |||
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error) | |||
395 | dm_complete_request(tio->orig, error); | 395 | dm_complete_request(tio->orig, error); |
396 | } | 396 | } |
397 | 397 | ||
398 | static void dm_dispatch_clone_request(struct request *clone, struct request *rq) | 398 | static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq) |
399 | { | 399 | { |
400 | blk_status_t r; | 400 | blk_status_t r; |
401 | 401 | ||
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) | |||
404 | 404 | ||
405 | clone->start_time = jiffies; | 405 | clone->start_time = jiffies; |
406 | r = blk_insert_cloned_request(clone->q, clone); | 406 | r = blk_insert_cloned_request(clone->q, clone); |
407 | if (r) | 407 | if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) |
408 | /* must complete clone in terms of original request */ | 408 | /* must complete clone in terms of original request */ |
409 | dm_complete_request(rq, r); | 409 | dm_complete_request(rq, r); |
410 | return r; | ||
410 | } | 411 | } |
411 | 412 | ||
412 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | 413 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, |
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio) | |||
476 | struct mapped_device *md = tio->md; | 477 | struct mapped_device *md = tio->md; |
477 | struct request *rq = tio->orig; | 478 | struct request *rq = tio->orig; |
478 | struct request *clone = NULL; | 479 | struct request *clone = NULL; |
480 | blk_status_t ret; | ||
479 | 481 | ||
480 | r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); | 482 | r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); |
483 | check_again: | ||
481 | switch (r) { | 484 | switch (r) { |
482 | case DM_MAPIO_SUBMITTED: | 485 | case DM_MAPIO_SUBMITTED: |
483 | /* The target has taken the I/O to submit by itself later */ | 486 | /* The target has taken the I/O to submit by itself later */ |
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio) | |||
492 | /* The target has remapped the I/O so dispatch it */ | 495 | /* The target has remapped the I/O so dispatch it */ |
493 | trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), | 496 | trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), |
494 | blk_rq_pos(rq)); | 497 | blk_rq_pos(rq)); |
495 | dm_dispatch_clone_request(clone, rq); | 498 | ret = dm_dispatch_clone_request(clone, rq); |
499 | if (ret == BLK_STS_RESOURCE) { | ||
500 | blk_rq_unprep_clone(clone); | ||
501 | tio->ti->type->release_clone_rq(clone); | ||
502 | tio->clone = NULL; | ||
503 | if (!rq->q->mq_ops) | ||
504 | r = DM_MAPIO_DELAY_REQUEUE; | ||
505 | else | ||
506 | r = DM_MAPIO_REQUEUE; | ||
507 | goto check_again; | ||
508 | } | ||
496 | break; | 509 | break; |
497 | case DM_MAPIO_REQUEUE: | 510 | case DM_MAPIO_REQUEUE: |
498 | /* The target wants to requeue the I/O */ | 511 | /* The target wants to requeue the I/O */ |
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) | |||
713 | return error; | 726 | return error; |
714 | } | 727 | } |
715 | 728 | ||
716 | elv_register_queue(md->queue); | ||
717 | |||
718 | return 0; | 729 | return 0; |
719 | } | 730 | } |
720 | 731 | ||
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) | |||
812 | } | 823 | } |
813 | dm_init_md_queue(md); | 824 | dm_init_md_queue(md); |
814 | 825 | ||
815 | /* backfill 'mq' sysfs registration normally done in blk_register_queue */ | ||
816 | err = blk_mq_register_dev(disk_to_dev(md->disk), q); | ||
817 | if (err) | ||
818 | goto out_cleanup_queue; | ||
819 | |||
820 | return 0; | 826 | return 0; |
821 | 827 | ||
822 | out_cleanup_queue: | ||
823 | blk_cleanup_queue(q); | ||
824 | out_tag_set: | 828 | out_tag_set: |
825 | blk_mq_free_tag_set(md->tag_set); | 829 | blk_mq_free_tag_set(md->tag_set); |
826 | out_kfree_tag_set: | 830 | out_kfree_tag_set: |
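The dm-rq hunks above make dm_dispatch_clone_request() hand the blk_insert_cloned_request() status back to map_request(), which on BLK_STS_RESOURCE tears the clone down and re-runs the switch as a requeue (delayed for the legacy request_fn path, immediate for blk-mq). A userspace sketch of that control flow, with simplified stand-ins for the dm-rq names:

#include <stdbool.h>
#include <stdio.h>

enum mapio { MAPIO_REMAPPED, MAPIO_REQUEUE, MAPIO_DELAY_REQUEUE };
enum sts   { STS_OK, STS_RESOURCE };

/* pretend the first dispatch hits a resource shortage, later ones succeed */
static enum sts dispatch_clone(void)
{
	static int attempts;

	return attempts++ ? STS_OK : STS_RESOURCE;
}

static void map_request(bool is_mq)
{
	enum mapio r = MAPIO_REMAPPED;

check_again:
	switch (r) {
	case MAPIO_REMAPPED:
		if (dispatch_clone() == STS_RESOURCE) {
			/* release the clone, then treat it as a requeue */
			r = is_mq ? MAPIO_REQUEUE : MAPIO_DELAY_REQUEUE;
			goto check_again;
		}
		printf("dispatched\n");
		break;
	case MAPIO_REQUEUE:
		printf("requeue now\n");
		break;
	case MAPIO_DELAY_REQUEUE:
		printf("requeue after a delay\n");
		break;
	}
}

int main(void)
{
	map_request(true);	/* first pass: short on resources, requeued */
	map_request(true);	/* the retry finds resources and dispatches */
	return 0;
}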
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de17b7193299..8c26bfc35335 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) | |||
920 | return -EINVAL; | 920 | return -EINVAL; |
921 | } | 921 | } |
922 | 922 | ||
923 | ti->max_io_len = (uint32_t) len; | 923 | /* |
924 | * BIO based queue uses its own splitting. When multipage bvecs | ||
925 | * is switched on, size of the incoming bio may be too big to | ||
926 | * be handled in some targets, such as crypt. | ||
927 | * | ||
928 | * When these targets are ready for the big bio, we can remove | ||
929 | * the limit. | ||
930 | */ | ||
931 | ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE); | ||
924 | 932 | ||
925 | return 0; | 933 | return 0; |
926 | } | 934 | } |
@@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1753 | goto bad; | 1761 | goto bad; |
1754 | md->dax_dev = dax_dev; | 1762 | md->dax_dev = dax_dev; |
1755 | 1763 | ||
1756 | add_disk(md->disk); | 1764 | add_disk_no_queue_reg(md->disk); |
1757 | format_dev_t(md->name, MKDEV(_major, minor)); | 1765 | format_dev_t(md->name, MKDEV(_major, minor)); |
1758 | 1766 | ||
1759 | md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); | 1767 | md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); |
@@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); | |||
2013 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) | 2021 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) |
2014 | { | 2022 | { |
2015 | int r; | 2023 | int r; |
2024 | struct queue_limits limits; | ||
2016 | enum dm_queue_mode type = dm_get_md_type(md); | 2025 | enum dm_queue_mode type = dm_get_md_type(md); |
2017 | 2026 | ||
2018 | switch (type) { | 2027 | switch (type) { |
@@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) | |||
2049 | break; | 2058 | break; |
2050 | } | 2059 | } |
2051 | 2060 | ||
2061 | r = dm_calculate_queue_limits(t, &limits); | ||
2062 | if (r) { | ||
2063 | DMERR("Cannot calculate initial queue limits"); | ||
2064 | return r; | ||
2065 | } | ||
2066 | dm_table_set_restrictions(t, md->queue, &limits); | ||
2067 | blk_register_queue(md->disk); | ||
2068 | |||
2052 | return 0; | 2069 | return 0; |
2053 | } | 2070 | } |
2054 | 2071 | ||
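The dm.c change above caps ti->max_io_len with min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE) until bio-based targets such as dm-crypt can handle multipage-bvec sized bios. With the usual BIO_MAX_PAGES of 256 and 4 KiB pages the ceiling evaluates to 1048576; a quick standalone check of the clamp (both constants are assumptions about the build, not taken from this diff):

#include <stdint.h>
#include <stdio.h>

#define BIO_MAX_PAGES	256U	/* typical kernel value, assumed here */
#define PAGE_SIZE	4096U	/* 4 KiB pages, assumed here */

static uint32_t cap_max_io_len(uint32_t len)
{
	uint32_t limit = BIO_MAX_PAGES * PAGE_SIZE;	/* 1048576 */

	return len < limit ? len : limit;	/* what min_t(uint32_t, ...) does */
}

int main(void)
{
	printf("len %u -> %u\n", 1U << 23, cap_max_io_len(1U << 23));	/* capped */
	printf("len %u -> %u\n", 1U << 16, cap_max_io_len(1U << 16));	/* unchanged */
	return 0;
}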
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a25fd43650ad..441e67e3a9d7 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile | |||
@@ -1,4 +1,7 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
2 | |||
3 | ccflags-y += -I$(src) | ||
4 | |||
2 | obj-$(CONFIG_NVME_CORE) += nvme-core.o | 5 | obj-$(CONFIG_NVME_CORE) += nvme-core.o |
3 | obj-$(CONFIG_BLK_DEV_NVME) += nvme.o | 6 | obj-$(CONFIG_BLK_DEV_NVME) += nvme.o |
4 | obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o | 7 | obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o |
@@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o | |||
6 | obj-$(CONFIG_NVME_FC) += nvme-fc.o | 9 | obj-$(CONFIG_NVME_FC) += nvme-fc.o |
7 | 10 | ||
8 | nvme-core-y := core.o | 11 | nvme-core-y := core.o |
12 | nvme-core-$(CONFIG_TRACING) += trace.o | ||
9 | nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o | 13 | nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o |
10 | nvme-core-$(CONFIG_NVM) += lightnvm.o | 14 | nvme-core-$(CONFIG_NVM) += lightnvm.o |
11 | 15 | ||
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 839650e0926a..e8104871cbbf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c | |||
@@ -29,6 +29,9 @@ | |||
29 | #include <linux/pm_qos.h> | 29 | #include <linux/pm_qos.h> |
30 | #include <asm/unaligned.h> | 30 | #include <asm/unaligned.h> |
31 | 31 | ||
32 | #define CREATE_TRACE_POINTS | ||
33 | #include "trace.h" | ||
34 | |||
32 | #include "nvme.h" | 35 | #include "nvme.h" |
33 | #include "fabrics.h" | 36 | #include "fabrics.h" |
34 | 37 | ||
@@ -65,9 +68,26 @@ static bool streams; | |||
65 | module_param(streams, bool, 0644); | 68 | module_param(streams, bool, 0644); |
66 | MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); | 69 | MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); |
67 | 70 | ||
71 | /* | ||
72 | * nvme_wq - hosts nvme related works that are not reset or delete | ||
73 | * nvme_reset_wq - hosts nvme reset works | ||
74 | * nvme_delete_wq - hosts nvme delete works | ||
75 | * | ||
76 | * nvme_wq will host works such are scan, aen handling, fw activation, | ||
77 | * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq | ||
78 | * runs reset works which also flush works hosted on nvme_wq for | ||
79 | * serialization purposes. nvme_delete_wq host controller deletion | ||
80 | * works which flush reset works for serialization. | ||
81 | */ | ||
68 | struct workqueue_struct *nvme_wq; | 82 | struct workqueue_struct *nvme_wq; |
69 | EXPORT_SYMBOL_GPL(nvme_wq); | 83 | EXPORT_SYMBOL_GPL(nvme_wq); |
70 | 84 | ||
85 | struct workqueue_struct *nvme_reset_wq; | ||
86 | EXPORT_SYMBOL_GPL(nvme_reset_wq); | ||
87 | |||
88 | struct workqueue_struct *nvme_delete_wq; | ||
89 | EXPORT_SYMBOL_GPL(nvme_delete_wq); | ||
90 | |||
71 | static DEFINE_IDA(nvme_subsystems_ida); | 91 | static DEFINE_IDA(nvme_subsystems_ida); |
72 | static LIST_HEAD(nvme_subsystems); | 92 | static LIST_HEAD(nvme_subsystems); |
73 | static DEFINE_MUTEX(nvme_subsystems_lock); | 93 | static DEFINE_MUTEX(nvme_subsystems_lock); |
@@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) | |||
89 | { | 109 | { |
90 | if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) | 110 | if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) |
91 | return -EBUSY; | 111 | return -EBUSY; |
92 | if (!queue_work(nvme_wq, &ctrl->reset_work)) | 112 | if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) |
93 | return -EBUSY; | 113 | return -EBUSY; |
94 | return 0; | 114 | return 0; |
95 | } | 115 | } |
96 | EXPORT_SYMBOL_GPL(nvme_reset_ctrl); | 116 | EXPORT_SYMBOL_GPL(nvme_reset_ctrl); |
97 | 117 | ||
98 | static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) | 118 | int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) |
99 | { | 119 | { |
100 | int ret; | 120 | int ret; |
101 | 121 | ||
@@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) | |||
104 | flush_work(&ctrl->reset_work); | 124 | flush_work(&ctrl->reset_work); |
105 | return ret; | 125 | return ret; |
106 | } | 126 | } |
127 | EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); | ||
107 | 128 | ||
108 | static void nvme_delete_ctrl_work(struct work_struct *work) | 129 | static void nvme_delete_ctrl_work(struct work_struct *work) |
109 | { | 130 | { |
@@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) | |||
122 | { | 143 | { |
123 | if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) | 144 | if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) |
124 | return -EBUSY; | 145 | return -EBUSY; |
125 | if (!queue_work(nvme_wq, &ctrl->delete_work)) | 146 | if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) |
126 | return -EBUSY; | 147 | return -EBUSY; |
127 | return 0; | 148 | return 0; |
128 | } | 149 | } |
@@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req) | |||
157 | return BLK_STS_OK; | 178 | return BLK_STS_OK; |
158 | case NVME_SC_CAP_EXCEEDED: | 179 | case NVME_SC_CAP_EXCEEDED: |
159 | return BLK_STS_NOSPC; | 180 | return BLK_STS_NOSPC; |
181 | case NVME_SC_LBA_RANGE: | ||
182 | return BLK_STS_TARGET; | ||
183 | case NVME_SC_BAD_ATTRIBUTES: | ||
160 | case NVME_SC_ONCS_NOT_SUPPORTED: | 184 | case NVME_SC_ONCS_NOT_SUPPORTED: |
185 | case NVME_SC_INVALID_OPCODE: | ||
186 | case NVME_SC_INVALID_FIELD: | ||
187 | case NVME_SC_INVALID_NS: | ||
161 | return BLK_STS_NOTSUPP; | 188 | return BLK_STS_NOTSUPP; |
162 | case NVME_SC_WRITE_FAULT: | 189 | case NVME_SC_WRITE_FAULT: |
163 | case NVME_SC_READ_ERROR: | 190 | case NVME_SC_READ_ERROR: |
164 | case NVME_SC_UNWRITTEN_BLOCK: | 191 | case NVME_SC_UNWRITTEN_BLOCK: |
165 | case NVME_SC_ACCESS_DENIED: | 192 | case NVME_SC_ACCESS_DENIED: |
166 | case NVME_SC_READ_ONLY: | 193 | case NVME_SC_READ_ONLY: |
194 | case NVME_SC_COMPARE_FAILED: | ||
167 | return BLK_STS_MEDIUM; | 195 | return BLK_STS_MEDIUM; |
168 | case NVME_SC_GUARD_CHECK: | 196 | case NVME_SC_GUARD_CHECK: |
169 | case NVME_SC_APPTAG_CHECK: | 197 | case NVME_SC_APPTAG_CHECK: |
@@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req) | |||
190 | 218 | ||
191 | void nvme_complete_rq(struct request *req) | 219 | void nvme_complete_rq(struct request *req) |
192 | { | 220 | { |
193 | if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { | 221 | blk_status_t status = nvme_error_status(req); |
194 | if (nvme_req_needs_failover(req)) { | 222 | |
223 | trace_nvme_complete_rq(req); | ||
224 | |||
225 | if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { | ||
226 | if (nvme_req_needs_failover(req, status)) { | ||
195 | nvme_failover_req(req); | 227 | nvme_failover_req(req); |
196 | return; | 228 | return; |
197 | } | 229 | } |
@@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req) | |||
202 | return; | 234 | return; |
203 | } | 235 | } |
204 | } | 236 | } |
205 | 237 | blk_mq_end_request(req, status); | |
206 | blk_mq_end_request(req, nvme_error_status(req)); | ||
207 | } | 238 | } |
208 | EXPORT_SYMBOL_GPL(nvme_complete_rq); | 239 | EXPORT_SYMBOL_GPL(nvme_complete_rq); |
209 | 240 | ||
@@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | |||
232 | 263 | ||
233 | old_state = ctrl->state; | 264 | old_state = ctrl->state; |
234 | switch (new_state) { | 265 | switch (new_state) { |
266 | case NVME_CTRL_ADMIN_ONLY: | ||
267 | switch (old_state) { | ||
268 | case NVME_CTRL_RECONNECTING: | ||
269 | changed = true; | ||
270 | /* FALLTHRU */ | ||
271 | default: | ||
272 | break; | ||
273 | } | ||
274 | break; | ||
235 | case NVME_CTRL_LIVE: | 275 | case NVME_CTRL_LIVE: |
236 | switch (old_state) { | 276 | switch (old_state) { |
237 | case NVME_CTRL_NEW: | 277 | case NVME_CTRL_NEW: |
@@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | |||
247 | switch (old_state) { | 287 | switch (old_state) { |
248 | case NVME_CTRL_NEW: | 288 | case NVME_CTRL_NEW: |
249 | case NVME_CTRL_LIVE: | 289 | case NVME_CTRL_LIVE: |
290 | case NVME_CTRL_ADMIN_ONLY: | ||
250 | changed = true; | 291 | changed = true; |
251 | /* FALLTHRU */ | 292 | /* FALLTHRU */ |
252 | default: | 293 | default: |
@@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, | |||
266 | case NVME_CTRL_DELETING: | 307 | case NVME_CTRL_DELETING: |
267 | switch (old_state) { | 308 | switch (old_state) { |
268 | case NVME_CTRL_LIVE: | 309 | case NVME_CTRL_LIVE: |
310 | case NVME_CTRL_ADMIN_ONLY: | ||
269 | case NVME_CTRL_RESETTING: | 311 | case NVME_CTRL_RESETTING: |
270 | case NVME_CTRL_RECONNECTING: | 312 | case NVME_CTRL_RECONNECTING: |
271 | changed = true; | 313 | changed = true; |
@@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, | |||
591 | } | 633 | } |
592 | 634 | ||
593 | cmd->common.command_id = req->tag; | 635 | cmd->common.command_id = req->tag; |
636 | if (ns) | ||
637 | trace_nvme_setup_nvm_cmd(req->q->id, cmd); | ||
638 | else | ||
639 | trace_nvme_setup_admin_cmd(cmd); | ||
594 | return ret; | 640 | return ret; |
595 | } | 641 | } |
596 | EXPORT_SYMBOL_GPL(nvme_setup_cmd); | 642 | EXPORT_SYMBOL_GPL(nvme_setup_cmd); |
@@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) | |||
1217 | #ifdef CONFIG_NVME_MULTIPATH | 1263 | #ifdef CONFIG_NVME_MULTIPATH |
1218 | /* should never be called due to GENHD_FL_HIDDEN */ | 1264 | /* should never be called due to GENHD_FL_HIDDEN */ |
1219 | if (WARN_ON_ONCE(ns->head->disk)) | 1265 | if (WARN_ON_ONCE(ns->head->disk)) |
1220 | return -ENXIO; | 1266 | goto fail; |
1221 | #endif | 1267 | #endif |
1222 | if (!kref_get_unless_zero(&ns->kref)) | 1268 | if (!kref_get_unless_zero(&ns->kref)) |
1223 | return -ENXIO; | 1269 | goto fail; |
1270 | if (!try_module_get(ns->ctrl->ops->module)) | ||
1271 | goto fail_put_ns; | ||
1272 | |||
1224 | return 0; | 1273 | return 0; |
1274 | |||
1275 | fail_put_ns: | ||
1276 | nvme_put_ns(ns); | ||
1277 | fail: | ||
1278 | return -ENXIO; | ||
1225 | } | 1279 | } |
1226 | 1280 | ||
1227 | static void nvme_release(struct gendisk *disk, fmode_t mode) | 1281 | static void nvme_release(struct gendisk *disk, fmode_t mode) |
1228 | { | 1282 | { |
1229 | nvme_put_ns(disk->private_data); | 1283 | struct nvme_ns *ns = disk->private_data; |
1284 | |||
1285 | module_put(ns->ctrl->ops->module); | ||
1286 | nvme_put_ns(ns); | ||
1230 | } | 1287 | } |
1231 | 1288 | ||
1232 | static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) | 1289 | static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
@@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = { | |||
2052 | NULL, | 2109 | NULL, |
2053 | }; | 2110 | }; |
2054 | 2111 | ||
2112 | static int nvme_active_ctrls(struct nvme_subsystem *subsys) | ||
2113 | { | ||
2114 | int count = 0; | ||
2115 | struct nvme_ctrl *ctrl; | ||
2116 | |||
2117 | mutex_lock(&subsys->lock); | ||
2118 | list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { | ||
2119 | if (ctrl->state != NVME_CTRL_DELETING && | ||
2120 | ctrl->state != NVME_CTRL_DEAD) | ||
2121 | count++; | ||
2122 | } | ||
2123 | mutex_unlock(&subsys->lock); | ||
2124 | |||
2125 | return count; | ||
2126 | } | ||
2127 | |||
2055 | static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | 2128 | static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) |
2056 | { | 2129 | { |
2057 | struct nvme_subsystem *subsys, *found; | 2130 | struct nvme_subsystem *subsys, *found; |
@@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | |||
2090 | * Verify that the subsystem actually supports multiple | 2163 | * Verify that the subsystem actually supports multiple |
2091 | * controllers, else bail out. | 2164 | * controllers, else bail out. |
2092 | */ | 2165 | */ |
2093 | if (!(id->cmic & (1 << 1))) { | 2166 | if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { |
2094 | dev_err(ctrl->device, | 2167 | dev_err(ctrl->device, |
2095 | "ignoring ctrl due to duplicate subnqn (%s).\n", | 2168 | "ignoring ctrl due to duplicate subnqn (%s).\n", |
2096 | found->subnqn); | 2169 | found->subnqn); |
@@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
2257 | shutdown_timeout, 60); | 2330 | shutdown_timeout, 60); |
2258 | 2331 | ||
2259 | if (ctrl->shutdown_timeout != shutdown_timeout) | 2332 | if (ctrl->shutdown_timeout != shutdown_timeout) |
2260 | dev_warn(ctrl->device, | 2333 | dev_info(ctrl->device, |
2261 | "Shutdown timeout set to %u seconds\n", | 2334 | "Shutdown timeout set to %u seconds\n", |
2262 | ctrl->shutdown_timeout); | 2335 | ctrl->shutdown_timeout); |
2263 | } else | 2336 | } else |
@@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file) | |||
2341 | struct nvme_ctrl *ctrl = | 2414 | struct nvme_ctrl *ctrl = |
2342 | container_of(inode->i_cdev, struct nvme_ctrl, cdev); | 2415 | container_of(inode->i_cdev, struct nvme_ctrl, cdev); |
2343 | 2416 | ||
2344 | if (ctrl->state != NVME_CTRL_LIVE) | 2417 | switch (ctrl->state) { |
2418 | case NVME_CTRL_LIVE: | ||
2419 | case NVME_CTRL_ADMIN_ONLY: | ||
2420 | break; | ||
2421 | default: | ||
2345 | return -EWOULDBLOCK; | 2422 | return -EWOULDBLOCK; |
2423 | } | ||
2424 | |||
2346 | file->private_data = ctrl; | 2425 | file->private_data = ctrl; |
2347 | return 0; | 2426 | return 0; |
2348 | } | 2427 | } |
@@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, | |||
2606 | static const char *const state_name[] = { | 2685 | static const char *const state_name[] = { |
2607 | [NVME_CTRL_NEW] = "new", | 2686 | [NVME_CTRL_NEW] = "new", |
2608 | [NVME_CTRL_LIVE] = "live", | 2687 | [NVME_CTRL_LIVE] = "live", |
2688 | [NVME_CTRL_ADMIN_ONLY] = "only-admin", | ||
2609 | [NVME_CTRL_RESETTING] = "resetting", | 2689 | [NVME_CTRL_RESETTING] = "resetting", |
2610 | [NVME_CTRL_RECONNECTING]= "reconnecting", | 2690 | [NVME_CTRL_RECONNECTING]= "reconnecting", |
2611 | [NVME_CTRL_DELETING] = "deleting", | 2691 | [NVME_CTRL_DELETING] = "deleting", |
@@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work) | |||
3079 | if (ctrl->state != NVME_CTRL_LIVE) | 3159 | if (ctrl->state != NVME_CTRL_LIVE) |
3080 | return; | 3160 | return; |
3081 | 3161 | ||
3162 | WARN_ON_ONCE(!ctrl->tagset); | ||
3163 | |||
3082 | if (nvme_identify_ctrl(ctrl, &id)) | 3164 | if (nvme_identify_ctrl(ctrl, &id)) |
3083 | return; | 3165 | return; |
3084 | 3166 | ||
@@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work) | |||
3099 | void nvme_queue_scan(struct nvme_ctrl *ctrl) | 3181 | void nvme_queue_scan(struct nvme_ctrl *ctrl) |
3100 | { | 3182 | { |
3101 | /* | 3183 | /* |
3102 | * Do not queue new scan work when a controller is reset during | 3184 | * Only new queue scan work when admin and IO queues are both alive |
3103 | * removal. | ||
3104 | */ | 3185 | */ |
3105 | if (ctrl->state == NVME_CTRL_LIVE) | 3186 | if (ctrl->state == NVME_CTRL_LIVE) |
3106 | queue_work(nvme_wq, &ctrl->scan_work); | 3187 | queue_work(nvme_wq, &ctrl->scan_work); |
@@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset); | |||
3477 | 3558 | ||
3478 | int __init nvme_core_init(void) | 3559 | int __init nvme_core_init(void) |
3479 | { | 3560 | { |
3480 | int result; | 3561 | int result = -ENOMEM; |
3481 | 3562 | ||
3482 | nvme_wq = alloc_workqueue("nvme-wq", | 3563 | nvme_wq = alloc_workqueue("nvme-wq", |
3483 | WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); | 3564 | WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); |
3484 | if (!nvme_wq) | 3565 | if (!nvme_wq) |
3485 | return -ENOMEM; | 3566 | goto out; |
3567 | |||
3568 | nvme_reset_wq = alloc_workqueue("nvme-reset-wq", | ||
3569 | WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); | ||
3570 | if (!nvme_reset_wq) | ||
3571 | goto destroy_wq; | ||
3572 | |||
3573 | nvme_delete_wq = alloc_workqueue("nvme-delete-wq", | ||
3574 | WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); | ||
3575 | if (!nvme_delete_wq) | ||
3576 | goto destroy_reset_wq; | ||
3486 | 3577 | ||
3487 | result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); | 3578 | result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); |
3488 | if (result < 0) | 3579 | if (result < 0) |
3489 | goto destroy_wq; | 3580 | goto destroy_delete_wq; |
3490 | 3581 | ||
3491 | nvme_class = class_create(THIS_MODULE, "nvme"); | 3582 | nvme_class = class_create(THIS_MODULE, "nvme"); |
3492 | if (IS_ERR(nvme_class)) { | 3583 | if (IS_ERR(nvme_class)) { |
@@ -3505,8 +3596,13 @@ destroy_class: | |||
3505 | class_destroy(nvme_class); | 3596 | class_destroy(nvme_class); |
3506 | unregister_chrdev: | 3597 | unregister_chrdev: |
3507 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); | 3598 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); |
3599 | destroy_delete_wq: | ||
3600 | destroy_workqueue(nvme_delete_wq); | ||
3601 | destroy_reset_wq: | ||
3602 | destroy_workqueue(nvme_reset_wq); | ||
3508 | destroy_wq: | 3603 | destroy_wq: |
3509 | destroy_workqueue(nvme_wq); | 3604 | destroy_workqueue(nvme_wq); |
3605 | out: | ||
3510 | return result; | 3606 | return result; |
3511 | } | 3607 | } |
3512 | 3608 | ||
@@ -3516,6 +3612,8 @@ void nvme_core_exit(void) | |||
3516 | class_destroy(nvme_subsys_class); | 3612 | class_destroy(nvme_subsys_class); |
3517 | class_destroy(nvme_class); | 3613 | class_destroy(nvme_class); |
3518 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); | 3614 | unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); |
3615 | destroy_workqueue(nvme_delete_wq); | ||
3616 | destroy_workqueue(nvme_reset_wq); | ||
3519 | destroy_workqueue(nvme_wq); | 3617 | destroy_workqueue(nvme_wq); |
3520 | } | 3618 | } |
3521 | 3619 | ||
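The core.c hunks above split reset and delete work off nvme_wq onto dedicated nvme_reset_wq and nvme_delete_wq queues, so a work item that flushes another class of work never waits on its own workqueue, and nvme_core_init()/nvme_core_exit() grow the matching setup and teardown. A standalone sketch of that allocate-in-order, unwind-in-reverse pattern, with alloc_wq()/free_wq() standing in for alloc_workqueue()/destroy_workqueue():

#include <stdio.h>
#include <stdlib.h>

static void *alloc_wq(const char *name)
{
	printf("alloc %s\n", name);
	return malloc(1);		/* stands in for alloc_workqueue() */
}

static void free_wq(void *wq, const char *name)
{
	printf("destroy %s\n", name);
	free(wq);			/* stands in for destroy_workqueue() */
}

int main(void)
{
	void *wq = NULL, *reset_wq = NULL, *delete_wq = NULL;
	int result = -1;		/* stands in for -ENOMEM */

	wq = alloc_wq("nvme-wq");
	if (!wq)
		goto out;
	reset_wq = alloc_wq("nvme-reset-wq");
	if (!reset_wq)
		goto destroy_wq;
	delete_wq = alloc_wq("nvme-delete-wq");
	if (!delete_wq)
		goto destroy_reset_wq;

	printf("all three queues up: resets and deletes no longer share nvme-wq\n");
	result = 0;

	/* success falls through the same labels, mirroring nvme_core_exit() */
	free_wq(delete_wq, "nvme-delete-wq");
destroy_reset_wq:
	free_wq(reset_wq, "nvme-reset-wq");
destroy_wq:
	free_wq(wq, "nvme-wq");
out:
	return result;
}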
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 894c2ccb3891..5dd4ceefed8f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c | |||
@@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect); | |||
493 | */ | 493 | */ |
494 | int nvmf_register_transport(struct nvmf_transport_ops *ops) | 494 | int nvmf_register_transport(struct nvmf_transport_ops *ops) |
495 | { | 495 | { |
496 | if (!ops->create_ctrl) | 496 | if (!ops->create_ctrl || !ops->module) |
497 | return -EINVAL; | 497 | return -EINVAL; |
498 | 498 | ||
499 | down_write(&nvmf_transports_rwsem); | 499 | down_write(&nvmf_transports_rwsem); |
@@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, | |||
739 | ret = -ENOMEM; | 739 | ret = -ENOMEM; |
740 | goto out; | 740 | goto out; |
741 | } | 741 | } |
742 | if (uuid_parse(p, &hostid)) { | 742 | ret = uuid_parse(p, &hostid); |
743 | if (ret) { | ||
743 | pr_err("Invalid hostid %s\n", p); | 744 | pr_err("Invalid hostid %s\n", p); |
744 | ret = -EINVAL; | 745 | ret = -EINVAL; |
746 | kfree(p); | ||
745 | goto out; | 747 | goto out; |
746 | } | 748 | } |
749 | kfree(p); | ||
747 | break; | 750 | break; |
748 | case NVMF_OPT_DUP_CONNECT: | 751 | case NVMF_OPT_DUP_CONNECT: |
749 | opts->duplicate_connect = true; | 752 | opts->duplicate_connect = true; |
@@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) | |||
869 | goto out_unlock; | 872 | goto out_unlock; |
870 | } | 873 | } |
871 | 874 | ||
875 | if (!try_module_get(ops->module)) { | ||
876 | ret = -EBUSY; | ||
877 | goto out_unlock; | ||
878 | } | ||
879 | |||
872 | ret = nvmf_check_required_opts(opts, ops->required_opts); | 880 | ret = nvmf_check_required_opts(opts, ops->required_opts); |
873 | if (ret) | 881 | if (ret) |
874 | goto out_unlock; | 882 | goto out_module_put; |
875 | ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | | 883 | ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | |
876 | ops->allowed_opts | ops->required_opts); | 884 | ops->allowed_opts | ops->required_opts); |
877 | if (ret) | 885 | if (ret) |
878 | goto out_unlock; | 886 | goto out_module_put; |
879 | 887 | ||
880 | ctrl = ops->create_ctrl(dev, opts); | 888 | ctrl = ops->create_ctrl(dev, opts); |
881 | if (IS_ERR(ctrl)) { | 889 | if (IS_ERR(ctrl)) { |
882 | ret = PTR_ERR(ctrl); | 890 | ret = PTR_ERR(ctrl); |
883 | goto out_unlock; | 891 | goto out_module_put; |
884 | } | 892 | } |
885 | 893 | ||
886 | if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { | 894 | if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { |
887 | dev_warn(ctrl->device, | 895 | dev_warn(ctrl->device, |
888 | "controller returned incorrect NQN: \"%s\".\n", | 896 | "controller returned incorrect NQN: \"%s\".\n", |
889 | ctrl->subsys->subnqn); | 897 | ctrl->subsys->subnqn); |
898 | module_put(ops->module); | ||
890 | up_read(&nvmf_transports_rwsem); | 899 | up_read(&nvmf_transports_rwsem); |
891 | nvme_delete_ctrl_sync(ctrl); | 900 | nvme_delete_ctrl_sync(ctrl); |
892 | return ERR_PTR(-EINVAL); | 901 | return ERR_PTR(-EINVAL); |
893 | } | 902 | } |
894 | 903 | ||
904 | module_put(ops->module); | ||
895 | up_read(&nvmf_transports_rwsem); | 905 | up_read(&nvmf_transports_rwsem); |
896 | return ctrl; | 906 | return ctrl; |
897 | 907 | ||
908 | out_module_put: | ||
909 | module_put(ops->module); | ||
898 | out_unlock: | 910 | out_unlock: |
899 | up_read(&nvmf_transports_rwsem); | 911 | up_read(&nvmf_transports_rwsem); |
900 | out_free_opts: | 912 | out_free_opts: |
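The fabrics.c hunks above pin the transport module with try_module_get() for the whole of nvmf_create_ctrl() and drop the reference on every exit path, including success once the new controller can stand on its own, so a transport such as nvme-rdma cannot be unloaded while it is still creating a controller. A minimal sketch of that discipline, with a plain counter standing in for the module refcount:

#include <stdbool.h>
#include <stdio.h>

static int transport_refs;	/* stands in for the module refcount */

static bool try_module_get(void) { transport_refs++; return true; }
static void module_put(void)     { transport_refs--; }

/* stands in for ops->create_ctrl(); fails when asked to */
static int create_ctrl(bool fail)
{
	return fail ? -1 : 0;
}

static int nvmf_create(bool fail)
{
	int ret;

	if (!try_module_get())
		return -16;		/* -EBUSY: transport is going away */

	ret = create_ctrl(fail);
	module_put();			/* dropped on success and on failure */
	return ret;
}

int main(void)
{
	nvmf_create(false);
	nvmf_create(true);
	printf("outstanding transport refs: %d (expect 0)\n", transport_refs);
	return 0;
}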
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 9ba614953607..25b19f722f5b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h | |||
@@ -108,6 +108,7 @@ struct nvmf_ctrl_options { | |||
108 | * fabric implementation of NVMe fabrics. | 108 | * fabric implementation of NVMe fabrics. |
109 | * @entry: Used by the fabrics library to add the new | 109 | * @entry: Used by the fabrics library to add the new |
110 | * registration entry to its linked-list internal tree. | 110 | * registration entry to its linked-list internal tree. |
111 | * @module: Transport module reference | ||
111 | * @name: Name of the NVMe fabric driver implementation. | 112 | * @name: Name of the NVMe fabric driver implementation. |
112 | * @required_opts: sysfs command-line options that must be specified | 113 | * @required_opts: sysfs command-line options that must be specified |
113 | * when adding a new NVMe controller. | 114 | * when adding a new NVMe controller. |
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options { | |||
126 | */ | 127 | */ |
127 | struct nvmf_transport_ops { | 128 | struct nvmf_transport_ops { |
128 | struct list_head entry; | 129 | struct list_head entry; |
130 | struct module *module; | ||
129 | const char *name; | 131 | const char *name; |
130 | int required_opts; | 132 | int required_opts; |
131 | int allowed_opts; | 133 | int allowed_opts; |
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 794e66e4aa20..99bf51c7e513 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c | |||
@@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) | |||
2921 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); | 2921 | __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); |
2922 | nvme_fc_free_queue(&ctrl->queues[0]); | 2922 | nvme_fc_free_queue(&ctrl->queues[0]); |
2923 | 2923 | ||
2924 | /* re-enable the admin_q so anything new can fast fail */ | ||
2925 | blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); | ||
2926 | |||
2924 | nvme_fc_ctlr_inactive_on_rport(ctrl); | 2927 | nvme_fc_ctlr_inactive_on_rport(ctrl); |
2925 | } | 2928 | } |
2926 | 2929 | ||
@@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) | |||
2935 | * waiting for io to terminate | 2938 | * waiting for io to terminate |
2936 | */ | 2939 | */ |
2937 | nvme_fc_delete_association(ctrl); | 2940 | nvme_fc_delete_association(ctrl); |
2941 | |||
2942 | /* resume the io queues so that things will fast fail */ | ||
2943 | nvme_start_queues(nctrl); | ||
2938 | } | 2944 | } |
2939 | 2945 | ||
2940 | static void | 2946 | static void |
@@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) | |||
3380 | 3386 | ||
3381 | static struct nvmf_transport_ops nvme_fc_transport = { | 3387 | static struct nvmf_transport_ops nvme_fc_transport = { |
3382 | .name = "fc", | 3388 | .name = "fc", |
3389 | .module = THIS_MODULE, | ||
3383 | .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, | 3390 | .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, |
3384 | .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, | 3391 | .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, |
3385 | .create_ctrl = nvme_fc_create_ctrl, | 3392 | .create_ctrl = nvme_fc_create_ctrl, |
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba3d7f3349e5..50ef71ee3d86 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c | |||
@@ -31,27 +31,10 @@ | |||
31 | 31 | ||
32 | enum nvme_nvm_admin_opcode { | 32 | enum nvme_nvm_admin_opcode { |
33 | nvme_nvm_admin_identity = 0xe2, | 33 | nvme_nvm_admin_identity = 0xe2, |
34 | nvme_nvm_admin_get_l2p_tbl = 0xea, | ||
35 | nvme_nvm_admin_get_bb_tbl = 0xf2, | 34 | nvme_nvm_admin_get_bb_tbl = 0xf2, |
36 | nvme_nvm_admin_set_bb_tbl = 0xf1, | 35 | nvme_nvm_admin_set_bb_tbl = 0xf1, |
37 | }; | 36 | }; |
38 | 37 | ||
39 | struct nvme_nvm_hb_rw { | ||
40 | __u8 opcode; | ||
41 | __u8 flags; | ||
42 | __u16 command_id; | ||
43 | __le32 nsid; | ||
44 | __u64 rsvd2; | ||
45 | __le64 metadata; | ||
46 | __le64 prp1; | ||
47 | __le64 prp2; | ||
48 | __le64 spba; | ||
49 | __le16 length; | ||
50 | __le16 control; | ||
51 | __le32 dsmgmt; | ||
52 | __le64 slba; | ||
53 | }; | ||
54 | |||
55 | struct nvme_nvm_ph_rw { | 38 | struct nvme_nvm_ph_rw { |
56 | __u8 opcode; | 39 | __u8 opcode; |
57 | __u8 flags; | 40 | __u8 flags; |
@@ -80,19 +63,6 @@ struct nvme_nvm_identity { | |||
80 | __u32 rsvd11[5]; | 63 | __u32 rsvd11[5]; |
81 | }; | 64 | }; |
82 | 65 | ||
83 | struct nvme_nvm_l2ptbl { | ||
84 | __u8 opcode; | ||
85 | __u8 flags; | ||
86 | __u16 command_id; | ||
87 | __le32 nsid; | ||
88 | __le32 cdw2[4]; | ||
89 | __le64 prp1; | ||
90 | __le64 prp2; | ||
91 | __le64 slba; | ||
92 | __le32 nlb; | ||
93 | __le16 cdw14[6]; | ||
94 | }; | ||
95 | |||
96 | struct nvme_nvm_getbbtbl { | 66 | struct nvme_nvm_getbbtbl { |
97 | __u8 opcode; | 67 | __u8 opcode; |
98 | __u8 flags; | 68 | __u8 flags; |
@@ -139,9 +109,7 @@ struct nvme_nvm_command { | |||
139 | union { | 109 | union { |
140 | struct nvme_common_command common; | 110 | struct nvme_common_command common; |
141 | struct nvme_nvm_identity identity; | 111 | struct nvme_nvm_identity identity; |
142 | struct nvme_nvm_hb_rw hb_rw; | ||
143 | struct nvme_nvm_ph_rw ph_rw; | 112 | struct nvme_nvm_ph_rw ph_rw; |
144 | struct nvme_nvm_l2ptbl l2p; | ||
145 | struct nvme_nvm_getbbtbl get_bb; | 113 | struct nvme_nvm_getbbtbl get_bb; |
146 | struct nvme_nvm_setbbtbl set_bb; | 114 | struct nvme_nvm_setbbtbl set_bb; |
147 | struct nvme_nvm_erase_blk erase; | 115 | struct nvme_nvm_erase_blk erase; |
@@ -167,7 +135,7 @@ struct nvme_nvm_id_group { | |||
167 | __u8 num_lun; | 135 | __u8 num_lun; |
168 | __u8 num_pln; | 136 | __u8 num_pln; |
169 | __u8 rsvd1; | 137 | __u8 rsvd1; |
170 | __le16 num_blk; | 138 | __le16 num_chk; |
171 | __le16 num_pg; | 139 | __le16 num_pg; |
172 | __le16 fpg_sz; | 140 | __le16 fpg_sz; |
173 | __le16 csecs; | 141 | __le16 csecs; |
@@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl { | |||
234 | static inline void _nvme_nvm_check_size(void) | 202 | static inline void _nvme_nvm_check_size(void) |
235 | { | 203 | { |
236 | BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); | 204 | BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); |
237 | BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); | ||
238 | BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); | 205 | BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); |
239 | BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); | 206 | BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); |
240 | BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); | 207 | BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); |
241 | BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); | ||
242 | BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); | 208 | BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); |
243 | BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); | 209 | BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); |
244 | BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); | 210 | BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); |
@@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void) | |||
249 | static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) | 215 | static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) |
250 | { | 216 | { |
251 | struct nvme_nvm_id_group *src; | 217 | struct nvme_nvm_id_group *src; |
252 | struct nvm_id_group *dst; | 218 | struct nvm_id_group *grp; |
219 | int sec_per_pg, sec_per_pl, pg_per_blk; | ||
253 | 220 | ||
254 | if (nvme_nvm_id->cgrps != 1) | 221 | if (nvme_nvm_id->cgrps != 1) |
255 | return -EINVAL; | 222 | return -EINVAL; |
256 | 223 | ||
257 | src = &nvme_nvm_id->groups[0]; | 224 | src = &nvme_nvm_id->groups[0]; |
258 | dst = &nvm_id->grp; | 225 | grp = &nvm_id->grp; |
259 | 226 | ||
260 | dst->mtype = src->mtype; | 227 | grp->mtype = src->mtype; |
261 | dst->fmtype = src->fmtype; | 228 | grp->fmtype = src->fmtype; |
262 | dst->num_ch = src->num_ch; | 229 | |
263 | dst->num_lun = src->num_lun; | 230 | grp->num_ch = src->num_ch; |
264 | dst->num_pln = src->num_pln; | 231 | grp->num_lun = src->num_lun; |
265 | 232 | ||
266 | dst->num_pg = le16_to_cpu(src->num_pg); | 233 | grp->num_chk = le16_to_cpu(src->num_chk); |
267 | dst->num_blk = le16_to_cpu(src->num_blk); | 234 | grp->csecs = le16_to_cpu(src->csecs); |
268 | dst->fpg_sz = le16_to_cpu(src->fpg_sz); | 235 | grp->sos = le16_to_cpu(src->sos); |
269 | dst->csecs = le16_to_cpu(src->csecs); | 236 | |
270 | dst->sos = le16_to_cpu(src->sos); | 237 | pg_per_blk = le16_to_cpu(src->num_pg); |
271 | 238 | sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs; | |
272 | dst->trdt = le32_to_cpu(src->trdt); | 239 | sec_per_pl = sec_per_pg * src->num_pln; |
273 | dst->trdm = le32_to_cpu(src->trdm); | 240 | grp->clba = sec_per_pl * pg_per_blk; |
274 | dst->tprt = le32_to_cpu(src->tprt); | 241 | grp->ws_per_chk = pg_per_blk; |
275 | dst->tprm = le32_to_cpu(src->tprm); | ||
276 | dst->tbet = le32_to_cpu(src->tbet); | ||
277 | dst->tbem = le32_to_cpu(src->tbem); | ||
278 | dst->mpos = le32_to_cpu(src->mpos); | ||
279 | dst->mccap = le32_to_cpu(src->mccap); | ||
280 | |||
281 | dst->cpar = le16_to_cpu(src->cpar); | ||
282 | |||
283 | if (dst->fmtype == NVM_ID_FMTYPE_MLC) { | ||
284 | memcpy(dst->lptbl.id, src->lptbl.id, 8); | ||
285 | dst->lptbl.mlc.num_pairs = | ||
286 | le16_to_cpu(src->lptbl.mlc.num_pairs); | ||
287 | |||
288 | if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { | ||
289 | pr_err("nvm: number of MLC pairs not supported\n"); | ||
290 | return -EINVAL; | ||
291 | } | ||
292 | 242 | ||
293 | memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, | 243 | grp->mpos = le32_to_cpu(src->mpos); |
294 | dst->lptbl.mlc.num_pairs); | 244 | grp->cpar = le16_to_cpu(src->cpar); |
245 | grp->mccap = le32_to_cpu(src->mccap); | ||
246 | |||
247 | grp->ws_opt = grp->ws_min = sec_per_pg; | ||
248 | grp->ws_seq = NVM_IO_SNGL_ACCESS; | ||
249 | |||
250 | if (grp->mpos & 0x020202) { | ||
251 | grp->ws_seq = NVM_IO_DUAL_ACCESS; | ||
252 | grp->ws_opt <<= 1; | ||
253 | } else if (grp->mpos & 0x040404) { | ||
254 | grp->ws_seq = NVM_IO_QUAD_ACCESS; | ||
255 | grp->ws_opt <<= 2; | ||
295 | } | 256 | } |
296 | 257 | ||
258 | grp->trdt = le32_to_cpu(src->trdt); | ||
259 | grp->trdm = le32_to_cpu(src->trdm); | ||
260 | grp->tprt = le32_to_cpu(src->tprt); | ||
261 | grp->tprm = le32_to_cpu(src->tprm); | ||
262 | grp->tbet = le32_to_cpu(src->tbet); | ||
263 | grp->tbem = le32_to_cpu(src->tbem); | ||
264 | |||
265 | /* 1.2 compatibility */ | ||
266 | grp->num_pln = src->num_pln; | ||
267 | grp->num_pg = le16_to_cpu(src->num_pg); | ||
268 | grp->fpg_sz = le16_to_cpu(src->fpg_sz); | ||
269 | |||
297 | return 0; | 270 | return 0; |
298 | } | 271 | } |
299 | 272 | ||
@@ -332,62 +305,6 @@ out: | |||
332 | return ret; | 305 | return ret; |
333 | } | 306 | } |
334 | 307 | ||
335 | static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, | ||
336 | nvm_l2p_update_fn *update_l2p, void *priv) | ||
337 | { | ||
338 | struct nvme_ns *ns = nvmdev->q->queuedata; | ||
339 | struct nvme_nvm_command c = {}; | ||
340 | u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; | ||
341 | u32 nlb_pr_rq = len / sizeof(u64); | ||
342 | u64 cmd_slba = slba; | ||
343 | void *entries; | ||
344 | int ret = 0; | ||
345 | |||
346 | c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; | ||
347 | c.l2p.nsid = cpu_to_le32(ns->head->ns_id); | ||
348 | entries = kmalloc(len, GFP_KERNEL); | ||
349 | if (!entries) | ||
350 | return -ENOMEM; | ||
351 | |||
352 | while (nlb) { | ||
353 | u32 cmd_nlb = min(nlb_pr_rq, nlb); | ||
354 | u64 elba = slba + cmd_nlb; | ||
355 | |||
356 | c.l2p.slba = cpu_to_le64(cmd_slba); | ||
357 | c.l2p.nlb = cpu_to_le32(cmd_nlb); | ||
358 | |||
359 | ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, | ||
360 | (struct nvme_command *)&c, entries, len); | ||
361 | if (ret) { | ||
362 | dev_err(ns->ctrl->device, | ||
363 | "L2P table transfer failed (%d)\n", ret); | ||
364 | ret = -EIO; | ||
365 | goto out; | ||
366 | } | ||
367 | |||
368 | if (unlikely(elba > nvmdev->total_secs)) { | ||
369 | pr_err("nvm: L2P data from device is out of bounds!\n"); | ||
370 | ret = -EINVAL; | ||
371 | goto out; | ||
372 | } | ||
373 | |||
374 | /* Transform physical address to target address space */ | ||
375 | nvm_part_to_tgt(nvmdev, entries, cmd_nlb); | ||
376 | |||
377 | if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { | ||
378 | ret = -EINTR; | ||
379 | goto out; | ||
380 | } | ||
381 | |||
382 | cmd_slba += cmd_nlb; | ||
383 | nlb -= cmd_nlb; | ||
384 | } | ||
385 | |||
386 | out: | ||
387 | kfree(entries); | ||
388 | return ret; | ||
389 | } | ||
390 | |||
391 | static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, | 308 | static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, |
392 | u8 *blks) | 309 | u8 *blks) |
393 | { | 310 | { |
@@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, | |||
397 | struct nvme_ctrl *ctrl = ns->ctrl; | 314 | struct nvme_ctrl *ctrl = ns->ctrl; |
398 | struct nvme_nvm_command c = {}; | 315 | struct nvme_nvm_command c = {}; |
399 | struct nvme_nvm_bb_tbl *bb_tbl; | 316 | struct nvme_nvm_bb_tbl *bb_tbl; |
400 | int nr_blks = geo->blks_per_lun * geo->plane_mode; | 317 | int nr_blks = geo->nr_chks * geo->plane_mode; |
401 | int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; | 318 | int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; |
402 | int ret = 0; | 319 | int ret = 0; |
403 | 320 | ||
@@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, | |||
438 | goto out; | 355 | goto out; |
439 | } | 356 | } |
440 | 357 | ||
441 | memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); | 358 | memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode); |
442 | out: | 359 | out: |
443 | kfree(bb_tbl); | 360 | kfree(bb_tbl); |
444 | return ret; | 361 | return ret; |
@@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, | |||
474 | c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); | 391 | c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); |
475 | c->ph_rw.control = cpu_to_le16(rqd->flags); | 392 | c->ph_rw.control = cpu_to_le16(rqd->flags); |
476 | c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); | 393 | c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); |
477 | |||
478 | if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) | ||
479 | c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, | ||
480 | rqd->bio->bi_iter.bi_sector)); | ||
481 | } | 394 | } |
482 | 395 | ||
483 | static void nvme_nvm_end_io(struct request *rq, blk_status_t status) | 396 | static void nvme_nvm_end_io(struct request *rq, blk_status_t status) |
@@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr, | |||
597 | static struct nvm_dev_ops nvme_nvm_dev_ops = { | 510 | static struct nvm_dev_ops nvme_nvm_dev_ops = { |
598 | .identity = nvme_nvm_identity, | 511 | .identity = nvme_nvm_identity, |
599 | 512 | ||
600 | .get_l2p_tbl = nvme_nvm_get_l2p_tbl, | ||
601 | |||
602 | .get_bb_tbl = nvme_nvm_get_bb_tbl, | 513 | .get_bb_tbl = nvme_nvm_get_bb_tbl, |
603 | .set_bb_tbl = nvme_nvm_set_bb_tbl, | 514 | .set_bb_tbl = nvme_nvm_set_bb_tbl, |
604 | 515 | ||
@@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, | |||
883 | } else if (strcmp(attr->name, "num_planes") == 0) { | 794 | } else if (strcmp(attr->name, "num_planes") == 0) { |
884 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); | 795 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); |
885 | } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ | 796 | } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ |
886 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); | 797 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk); |
887 | } else if (strcmp(attr->name, "num_pages") == 0) { | 798 | } else if (strcmp(attr->name, "num_pages") == 0) { |
888 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); | 799 | return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); |
889 | } else if (strcmp(attr->name, "page_size") == 0) { | 800 | } else if (strcmp(attr->name, "page_size") == 0) { |
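The lightnvm hunks above delete the 1.2 L2P table plumbing and instead derive the generic geometry fields (clba, ws_min, ws_opt, ws_per_chk) from the per-group identify data. A worked example of that arithmetic, using made-up but plausible 1.2-style values:

#include <stdio.h>

int main(void)
{
	unsigned fpg_sz = 4096;		/* flash page size in bytes */
	unsigned csecs = 512;		/* sector size in bytes */
	unsigned num_pln = 4;		/* planes */
	unsigned num_pg = 512;		/* pages per block */
	unsigned mpos = 0x020202;	/* dual-plane access advertised */

	unsigned sec_per_pg = fpg_sz / csecs;		/* 8 */
	unsigned sec_per_pl = sec_per_pg * num_pln;	/* 32 */
	unsigned clba = sec_per_pl * num_pg;		/* 16384 sectors per chunk */
	unsigned ws_min = sec_per_pg;
	unsigned ws_opt = sec_per_pg;

	if (mpos & 0x020202)
		ws_opt <<= 1;		/* dual access: write two planes at once */
	else if (mpos & 0x040404)
		ws_opt <<= 2;		/* quad access */

	printf("clba=%u ws_min=%u ws_opt=%u ws_per_chk=%u\n",
	       clba, ws_min, ws_opt, num_pg);
	return 0;
}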
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1218a9fca846..3b211d9e58b8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c | |||
@@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req) | |||
33 | kblockd_schedule_work(&ns->head->requeue_work); | 33 | kblockd_schedule_work(&ns->head->requeue_work); |
34 | } | 34 | } |
35 | 35 | ||
36 | bool nvme_req_needs_failover(struct request *req) | 36 | bool nvme_req_needs_failover(struct request *req, blk_status_t error) |
37 | { | 37 | { |
38 | if (!(req->cmd_flags & REQ_NVME_MPATH)) | 38 | if (!(req->cmd_flags & REQ_NVME_MPATH)) |
39 | return false; | 39 | return false; |
40 | 40 | return blk_path_error(error); | |
41 | switch (nvme_req(req)->status & 0x7ff) { | ||
42 | /* | ||
43 | * Generic command status: | ||
44 | */ | ||
45 | case NVME_SC_INVALID_OPCODE: | ||
46 | case NVME_SC_INVALID_FIELD: | ||
47 | case NVME_SC_INVALID_NS: | ||
48 | case NVME_SC_LBA_RANGE: | ||
49 | case NVME_SC_CAP_EXCEEDED: | ||
50 | case NVME_SC_RESERVATION_CONFLICT: | ||
51 | return false; | ||
52 | |||
53 | /* | ||
54 | * I/O command set specific error. Unfortunately these values are | ||
55 | * reused for fabrics commands, but those should never get here. | ||
56 | */ | ||
57 | case NVME_SC_BAD_ATTRIBUTES: | ||
58 | case NVME_SC_INVALID_PI: | ||
59 | case NVME_SC_READ_ONLY: | ||
60 | case NVME_SC_ONCS_NOT_SUPPORTED: | ||
61 | WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == | ||
62 | nvme_fabrics_command); | ||
63 | return false; | ||
64 | |||
65 | /* | ||
66 | * Media and Data Integrity Errors: | ||
67 | */ | ||
68 | case NVME_SC_WRITE_FAULT: | ||
69 | case NVME_SC_READ_ERROR: | ||
70 | case NVME_SC_GUARD_CHECK: | ||
71 | case NVME_SC_APPTAG_CHECK: | ||
72 | case NVME_SC_REFTAG_CHECK: | ||
73 | case NVME_SC_COMPARE_FAILED: | ||
74 | case NVME_SC_ACCESS_DENIED: | ||
75 | case NVME_SC_UNWRITTEN_BLOCK: | ||
76 | return false; | ||
77 | } | ||
78 | |||
79 | /* Everything else could be a path failure, so should be retried */ | ||
80 | return true; | ||
81 | } | 41 | } |
82 | 42 | ||
83 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | 43 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
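With the long per-status switch above removed, failover classification happens in two stages: nvme_error_status() in core.c maps the NVMe status code onto a generic blk_status_t (see the expanded table earlier in this diff), and blk_path_error() then decides whether another path could plausibly succeed. A compressed sketch of that pipeline, using abbreviated stand-ins for the NVME_SC_* and BLK_STS_* constants and only a few sample codes:

#include <stdbool.h>
#include <stdio.h>

enum sts { STS_OK, STS_NOSPC, STS_TARGET, STS_MEDIUM, STS_IOERR };

/* stage 1: device status -> generic block status */
static enum sts nvme_to_blk(unsigned nvme_sc)
{
	switch (nvme_sc) {
	case 0x000: return STS_OK;
	case 0x080: return STS_TARGET;	/* LBA out of range */
	case 0x081: return STS_NOSPC;	/* capacity exceeded */
	case 0x281: return STS_MEDIUM;	/* unrecovered read error */
	default:    return STS_IOERR;	/* everything else, incl. transport */
	}
}

/* stage 2: generic verdict on whether a different path could help */
static bool path_error(enum sts s)
{
	switch (s) {
	case STS_NOSPC:
	case STS_TARGET:
	case STS_MEDIUM:
		return false;	/* the namespace said no; any path agrees */
	default:
		return true;
	}
}

int main(void)
{
	unsigned sc = 0x281;	/* media error reported by the device */
	enum sts s = nvme_to_blk(sc);

	printf("fail over: %s\n", s != STS_OK && path_error(s) ? "yes" : "no");
	return 0;
}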
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a00eabd06427..8e4550fa08f8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h | |||
@@ -32,6 +32,8 @@ extern unsigned int admin_timeout; | |||
32 | #define NVME_KATO_GRACE 10 | 32 | #define NVME_KATO_GRACE 10 |
33 | 33 | ||
34 | extern struct workqueue_struct *nvme_wq; | 34 | extern struct workqueue_struct *nvme_wq; |
35 | extern struct workqueue_struct *nvme_reset_wq; | ||
36 | extern struct workqueue_struct *nvme_delete_wq; | ||
35 | 37 | ||
36 | enum { | 38 | enum { |
37 | NVME_NS_LBA = 0, | 39 | NVME_NS_LBA = 0, |
@@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req) | |||
119 | enum nvme_ctrl_state { | 121 | enum nvme_ctrl_state { |
120 | NVME_CTRL_NEW, | 122 | NVME_CTRL_NEW, |
121 | NVME_CTRL_LIVE, | 123 | NVME_CTRL_LIVE, |
124 | NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ | ||
122 | NVME_CTRL_RESETTING, | 125 | NVME_CTRL_RESETTING, |
123 | NVME_CTRL_RECONNECTING, | 126 | NVME_CTRL_RECONNECTING, |
124 | NVME_CTRL_DELETING, | 127 | NVME_CTRL_DELETING, |
@@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); | |||
393 | void nvme_start_keep_alive(struct nvme_ctrl *ctrl); | 396 | void nvme_start_keep_alive(struct nvme_ctrl *ctrl); |
394 | void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); | 397 | void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); |
395 | int nvme_reset_ctrl(struct nvme_ctrl *ctrl); | 398 | int nvme_reset_ctrl(struct nvme_ctrl *ctrl); |
399 | int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); | ||
396 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl); | 400 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl); |
397 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); | 401 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); |
398 | 402 | ||
@@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops; | |||
401 | 405 | ||
402 | #ifdef CONFIG_NVME_MULTIPATH | 406 | #ifdef CONFIG_NVME_MULTIPATH |
403 | void nvme_failover_req(struct request *req); | 407 | void nvme_failover_req(struct request *req); |
404 | bool nvme_req_needs_failover(struct request *req); | 408 | bool nvme_req_needs_failover(struct request *req, blk_status_t error); |
405 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); | 409 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); |
406 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); | 410 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); |
407 | void nvme_mpath_add_disk(struct nvme_ns_head *head); | 411 | void nvme_mpath_add_disk(struct nvme_ns_head *head); |
@@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) | |||
430 | static inline void nvme_failover_req(struct request *req) | 434 | static inline void nvme_failover_req(struct request *req) |
431 | { | 435 | { |
432 | } | 436 | } |
433 | static inline bool nvme_req_needs_failover(struct request *req) | 437 | static inline bool nvme_req_needs_failover(struct request *req, |
438 | blk_status_t error) | ||
434 | { | 439 | { |
435 | return false; | 440 | return false; |
436 | } | 441 | } |
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4276ebfff22b..6fe7af00a1f4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c | |||
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); | |||
75 | * Represents an NVM Express device. Each nvme_dev is a PCI function. | 75 | * Represents an NVM Express device. Each nvme_dev is a PCI function. |
76 | */ | 76 | */ |
77 | struct nvme_dev { | 77 | struct nvme_dev { |
78 | struct nvme_queue **queues; | 78 | struct nvme_queue *queues; |
79 | struct blk_mq_tag_set tagset; | 79 | struct blk_mq_tag_set tagset; |
80 | struct blk_mq_tag_set admin_tagset; | 80 | struct blk_mq_tag_set admin_tagset; |
81 | u32 __iomem *dbs; | 81 | u32 __iomem *dbs; |
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, | |||
365 | unsigned int hctx_idx) | 365 | unsigned int hctx_idx) |
366 | { | 366 | { |
367 | struct nvme_dev *dev = data; | 367 | struct nvme_dev *dev = data; |
368 | struct nvme_queue *nvmeq = dev->queues[0]; | 368 | struct nvme_queue *nvmeq = &dev->queues[0]; |
369 | 369 | ||
370 | WARN_ON(hctx_idx != 0); | 370 | WARN_ON(hctx_idx != 0); |
371 | WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); | 371 | WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); |
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, | |||
387 | unsigned int hctx_idx) | 387 | unsigned int hctx_idx) |
388 | { | 388 | { |
389 | struct nvme_dev *dev = data; | 389 | struct nvme_dev *dev = data; |
390 | struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; | 390 | struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; |
391 | 391 | ||
392 | if (!nvmeq->tags) | 392 | if (!nvmeq->tags) |
393 | nvmeq->tags = &dev->tagset.tags[hctx_idx]; | 393 | nvmeq->tags = &dev->tagset.tags[hctx_idx]; |
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, | |||
403 | struct nvme_dev *dev = set->driver_data; | 403 | struct nvme_dev *dev = set->driver_data; |
404 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); | 404 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
405 | int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; | 405 | int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; |
406 | struct nvme_queue *nvmeq = dev->queues[queue_idx]; | 406 | struct nvme_queue *nvmeq = &dev->queues[queue_idx]; |
407 | 407 | ||
408 | BUG_ON(!nvmeq); | 408 | BUG_ON(!nvmeq); |
409 | iod->nvmeq = nvmeq; | 409 | iod->nvmeq = nvmeq; |
@@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) | |||
1044 | static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) | 1044 | static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) |
1045 | { | 1045 | { |
1046 | struct nvme_dev *dev = to_nvme_dev(ctrl); | 1046 | struct nvme_dev *dev = to_nvme_dev(ctrl); |
1047 | struct nvme_queue *nvmeq = dev->queues[0]; | 1047 | struct nvme_queue *nvmeq = &dev->queues[0]; |
1048 | struct nvme_command c; | 1048 | struct nvme_command c; |
1049 | 1049 | ||
1050 | memset(&c, 0, sizeof(c)); | 1050 | memset(&c, 0, sizeof(c)); |
@@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) | |||
1138 | */ | 1138 | */ |
1139 | bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); | 1139 | bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); |
1140 | 1140 | ||
1141 | /* If there is a reset ongoing, we shouldn't reset again. */ | 1141 | /* If there is a reset/reinit ongoing, we shouldn't reset again. */ |
1142 | if (dev->ctrl.state == NVME_CTRL_RESETTING) | 1142 | switch (dev->ctrl.state) { |
1143 | case NVME_CTRL_RESETTING: | ||
1144 | case NVME_CTRL_RECONNECTING: | ||
1143 | return false; | 1145 | return false; |
1146 | default: | ||
1147 | break; | ||
1148 | } | ||
1144 | 1149 | ||
1145 | /* We shouldn't reset unless the controller is on fatal error state | 1150 | /* We shouldn't reset unless the controller is on fatal error state |
1146 | * _or_ if we lost the communication with it. | 1151 | * _or_ if we lost the communication with it. |
@@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) | |||
1280 | if (nvmeq->sq_cmds) | 1285 | if (nvmeq->sq_cmds) |
1281 | dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), | 1286 | dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), |
1282 | nvmeq->sq_cmds, nvmeq->sq_dma_addr); | 1287 | nvmeq->sq_cmds, nvmeq->sq_dma_addr); |
1283 | kfree(nvmeq); | ||
1284 | } | 1288 | } |
1285 | 1289 | ||
1286 | static void nvme_free_queues(struct nvme_dev *dev, int lowest) | 1290 | static void nvme_free_queues(struct nvme_dev *dev, int lowest) |
@@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) | |||
1288 | int i; | 1292 | int i; |
1289 | 1293 | ||
1290 | for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { | 1294 | for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { |
1291 | struct nvme_queue *nvmeq = dev->queues[i]; | ||
1292 | dev->ctrl.queue_count--; | 1295 | dev->ctrl.queue_count--; |
1293 | dev->queues[i] = NULL; | 1296 | nvme_free_queue(&dev->queues[i]); |
1294 | nvme_free_queue(nvmeq); | ||
1295 | } | 1297 | } |
1296 | } | 1298 | } |
1297 | 1299 | ||
@@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) | |||
1323 | 1325 | ||
1324 | static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) | 1326 | static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) |
1325 | { | 1327 | { |
1326 | struct nvme_queue *nvmeq = dev->queues[0]; | 1328 | struct nvme_queue *nvmeq = &dev->queues[0]; |
1327 | |||
1328 | if (!nvmeq) | ||
1329 | return; | ||
1330 | if (nvme_suspend_queue(nvmeq)) | ||
1331 | return; | ||
1332 | 1329 | ||
1333 | if (shutdown) | 1330 | if (shutdown) |
1334 | nvme_shutdown_ctrl(&dev->ctrl); | 1331 | nvme_shutdown_ctrl(&dev->ctrl); |
@@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, | |||
1367 | static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, | 1364 | static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, |
1368 | int qid, int depth) | 1365 | int qid, int depth) |
1369 | { | 1366 | { |
1370 | if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { | 1367 | if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
1371 | unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), | 1368 | unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), |
1372 | dev->ctrl.page_size); | 1369 | dev->ctrl.page_size); |
1373 | nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; | 1370 | nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; |
@@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, | |||
1382 | return 0; | 1379 | return 0; |
1383 | } | 1380 | } |
1384 | 1381 | ||
1385 | static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, | 1382 | static int nvme_alloc_queue(struct nvme_dev *dev, int qid, |
1386 | int depth, int node) | 1383 | int depth, int node) |
1387 | { | 1384 | { |
1388 | struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, | 1385 | struct nvme_queue *nvmeq = &dev->queues[qid]; |
1389 | node); | 1386 | |
1390 | if (!nvmeq) | 1387 | if (dev->ctrl.queue_count > qid) |
1391 | return NULL; | 1388 | return 0; |
1392 | 1389 | ||
1393 | nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), | 1390 | nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), |
1394 | &nvmeq->cq_dma_addr, GFP_KERNEL); | 1391 | &nvmeq->cq_dma_addr, GFP_KERNEL); |
@@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, | |||
1407 | nvmeq->q_depth = depth; | 1404 | nvmeq->q_depth = depth; |
1408 | nvmeq->qid = qid; | 1405 | nvmeq->qid = qid; |
1409 | nvmeq->cq_vector = -1; | 1406 | nvmeq->cq_vector = -1; |
1410 | dev->queues[qid] = nvmeq; | ||
1411 | dev->ctrl.queue_count++; | 1407 | dev->ctrl.queue_count++; |
1412 | 1408 | ||
1413 | return nvmeq; | 1409 | return 0; |
1414 | 1410 | ||
1415 | free_cqdma: | 1411 | free_cqdma: |
1416 | dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, | 1412 | dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, |
1417 | nvmeq->cq_dma_addr); | 1413 | nvmeq->cq_dma_addr); |
1418 | free_nvmeq: | 1414 | free_nvmeq: |
1419 | kfree(nvmeq); | 1415 | return -ENOMEM; |
1420 | return NULL; | ||
1421 | } | 1416 | } |
1422 | 1417 | ||
1423 | static int queue_request_irq(struct nvme_queue *nvmeq) | 1418 | static int queue_request_irq(struct nvme_queue *nvmeq) |
@@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) | |||
1590 | if (result < 0) | 1585 | if (result < 0) |
1591 | return result; | 1586 | return result; |
1592 | 1587 | ||
1593 | nvmeq = dev->queues[0]; | 1588 | result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, |
1594 | if (!nvmeq) { | 1589 | dev_to_node(dev->dev)); |
1595 | nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, | 1590 | if (result) |
1596 | dev_to_node(dev->dev)); | 1591 | return result; |
1597 | if (!nvmeq) | ||
1598 | return -ENOMEM; | ||
1599 | } | ||
1600 | 1592 | ||
1593 | nvmeq = &dev->queues[0]; | ||
1601 | aqa = nvmeq->q_depth - 1; | 1594 | aqa = nvmeq->q_depth - 1; |
1602 | aqa |= aqa << 16; | 1595 | aqa |= aqa << 16; |
1603 | 1596 | ||
@@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) | |||
1627 | 1620 | ||
1628 | for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { | 1621 | for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { |
1629 | /* vector == qid - 1, match nvme_create_queue */ | 1622 | /* vector == qid - 1, match nvme_create_queue */ |
1630 | if (!nvme_alloc_queue(dev, i, dev->q_depth, | 1623 | if (nvme_alloc_queue(dev, i, dev->q_depth, |
1631 | pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { | 1624 | pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { |
1632 | ret = -ENOMEM; | 1625 | ret = -ENOMEM; |
1633 | break; | 1626 | break; |
@@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev) | |||
1636 | 1629 | ||
1637 | max = min(dev->max_qid, dev->ctrl.queue_count - 1); | 1630 | max = min(dev->max_qid, dev->ctrl.queue_count - 1); |
1638 | for (i = dev->online_queues; i <= max; i++) { | 1631 | for (i = dev->online_queues; i <= max; i++) { |
1639 | ret = nvme_create_queue(dev->queues[i], i); | 1632 | ret = nvme_create_queue(&dev->queues[i], i); |
1640 | if (ret) | 1633 | if (ret) |
1641 | break; | 1634 | break; |
1642 | } | 1635 | } |
1643 | 1636 | ||
1644 | /* | 1637 | /* |
1645 | * Ignore failing Create SQ/CQ commands, we can continue with less | 1638 | * Ignore failing Create SQ/CQ commands, we can continue with less |
1646 | * than the desired aount of queues, and even a controller without | 1639 | * than the desired amount of queues, and even a controller without |
1647 | * I/O queues an still be used to issue admin commands. This might | 1640 | * I/O queues can still be used to issue admin commands. This might |
1648 | * be useful to upgrade a buggy firmware for example. | 1641 | * be useful to upgrade a buggy firmware for example. |
1649 | */ | 1642 | */ |
1650 | return ret >= 0 ? 0 : ret; | 1643 | return ret >= 0 ? 0 : ret; |
@@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev, | |||
1661 | } | 1654 | } |
1662 | static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); | 1655 | static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); |
1663 | 1656 | ||
1664 | static void __iomem *nvme_map_cmb(struct nvme_dev *dev) | 1657 | static u64 nvme_cmb_size_unit(struct nvme_dev *dev) |
1665 | { | 1658 | { |
1666 | u64 szu, size, offset; | 1659 | u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; |
1660 | |||
1661 | return 1ULL << (12 + 4 * szu); | ||
1662 | } | ||
1663 | |||
1664 | static u32 nvme_cmb_size(struct nvme_dev *dev) | ||
1665 | { | ||
1666 | return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; | ||
1667 | } | ||
1668 | |||
1669 | static void nvme_map_cmb(struct nvme_dev *dev) | ||
1670 | { | ||
1671 | u64 size, offset; | ||
1667 | resource_size_t bar_size; | 1672 | resource_size_t bar_size; |
1668 | struct pci_dev *pdev = to_pci_dev(dev->dev); | 1673 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
1669 | void __iomem *cmb; | ||
1670 | int bar; | 1674 | int bar; |
1671 | 1675 | ||
1672 | dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); | 1676 | dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); |
1673 | if (!(NVME_CMB_SZ(dev->cmbsz))) | 1677 | if (!dev->cmbsz) |
1674 | return NULL; | 1678 | return; |
1675 | dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); | 1679 | dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); |
1676 | 1680 | ||
1677 | if (!use_cmb_sqes) | 1681 | if (!use_cmb_sqes) |
1678 | return NULL; | 1682 | return; |
1679 | 1683 | ||
1680 | szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); | 1684 | size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); |
1681 | size = szu * NVME_CMB_SZ(dev->cmbsz); | 1685 | offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); |
1682 | offset = szu * NVME_CMB_OFST(dev->cmbloc); | ||
1683 | bar = NVME_CMB_BIR(dev->cmbloc); | 1686 | bar = NVME_CMB_BIR(dev->cmbloc); |
1684 | bar_size = pci_resource_len(pdev, bar); | 1687 | bar_size = pci_resource_len(pdev, bar); |
1685 | 1688 | ||
1686 | if (offset > bar_size) | 1689 | if (offset > bar_size) |
1687 | return NULL; | 1690 | return; |
1688 | 1691 | ||
1689 | /* | 1692 | /* |
1690 | * Controllers may support a CMB size larger than their BAR, | 1693 | * Controllers may support a CMB size larger than their BAR, |
@@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) | |||
1694 | if (size > bar_size - offset) | 1697 | if (size > bar_size - offset) |
1695 | size = bar_size - offset; | 1698 | size = bar_size - offset; |
1696 | 1699 | ||
1697 | cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); | 1700 | dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); |
1698 | if (!cmb) | 1701 | if (!dev->cmb) |
1699 | return NULL; | 1702 | return; |
1700 | |||
1701 | dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; | 1703 | dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; |
1702 | dev->cmb_size = size; | 1704 | dev->cmb_size = size; |
1703 | return cmb; | 1705 | |
1706 | if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, | ||
1707 | &dev_attr_cmb.attr, NULL)) | ||
1708 | dev_warn(dev->ctrl.device, | ||
1709 | "failed to add sysfs attribute for CMB\n"); | ||
1704 | } | 1710 | } |
1705 | 1711 | ||
1706 | static inline void nvme_release_cmb(struct nvme_dev *dev) | 1712 | static inline void nvme_release_cmb(struct nvme_dev *dev) |
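The nvme_cmb_size_unit()/nvme_cmb_size() helpers introduced in the hunk above replace the old NVME_CMB_SZ/NVME_CMB_SZU macros: the controller memory buffer size is SZ units of 4 KiB * 16^SZU. A minimal standalone sketch of that arithmetic, assuming the NVMe 1.2 CMBSZ field layout (SZU in bits 11:8, SZ in bits 31:12); the CMBSZ_* macros below are local stand-ins, not the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

#define CMBSZ_SZU_SHIFT 8
#define CMBSZ_SZU_MASK  0xf
#define CMBSZ_SZ_SHIFT  12
#define CMBSZ_SZ_MASK   0xfffff

static uint64_t cmb_bytes(uint32_t cmbsz)
{
        uint8_t  szu = (cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK;
        uint32_t sz  = (cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK;

        /* size unit is 4 KiB * 16^SZU, i.e. 1 << (12 + 4 * SZU) */
        return (1ULL << (12 + 4 * szu)) * sz;
}

int main(void)
{
        /* e.g. SZU=1 (64 KiB units), SZ=256 -> 16 MiB */
        printf("%llu\n", (unsigned long long)cmb_bytes((256u << 12) | (1u << 8)));
        return 0;
}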
@@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, | |||
1768 | dma_addr_t descs_dma; | 1774 | dma_addr_t descs_dma; |
1769 | int i = 0; | 1775 | int i = 0; |
1770 | void **bufs; | 1776 | void **bufs; |
1771 | u64 size = 0, tmp; | 1777 | u64 size, tmp; |
1772 | 1778 | ||
1773 | tmp = (preferred + chunk_size - 1); | 1779 | tmp = (preferred + chunk_size - 1); |
1774 | do_div(tmp, chunk_size); | 1780 | do_div(tmp, chunk_size); |
@@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) | |||
1851 | u64 preferred = (u64)dev->ctrl.hmpre * 4096; | 1857 | u64 preferred = (u64)dev->ctrl.hmpre * 4096; |
1852 | u64 min = (u64)dev->ctrl.hmmin * 4096; | 1858 | u64 min = (u64)dev->ctrl.hmmin * 4096; |
1853 | u32 enable_bits = NVME_HOST_MEM_ENABLE; | 1859 | u32 enable_bits = NVME_HOST_MEM_ENABLE; |
1854 | int ret = 0; | 1860 | int ret; |
1855 | 1861 | ||
1856 | preferred = min(preferred, max); | 1862 | preferred = min(preferred, max); |
1857 | if (min > max) { | 1863 | if (min > max) { |
@@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) | |||
1892 | 1898 | ||
1893 | static int nvme_setup_io_queues(struct nvme_dev *dev) | 1899 | static int nvme_setup_io_queues(struct nvme_dev *dev) |
1894 | { | 1900 | { |
1895 | struct nvme_queue *adminq = dev->queues[0]; | 1901 | struct nvme_queue *adminq = &dev->queues[0]; |
1896 | struct pci_dev *pdev = to_pci_dev(dev->dev); | 1902 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
1897 | int result, nr_io_queues; | 1903 | int result, nr_io_queues; |
1898 | unsigned long size; | 1904 | unsigned long size; |
@@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) | |||
1905 | if (nr_io_queues == 0) | 1911 | if (nr_io_queues == 0) |
1906 | return 0; | 1912 | return 0; |
1907 | 1913 | ||
1908 | if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { | 1914 | if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
1909 | result = nvme_cmb_qdepth(dev, nr_io_queues, | 1915 | result = nvme_cmb_qdepth(dev, nr_io_queues, |
1910 | sizeof(struct nvme_command)); | 1916 | sizeof(struct nvme_command)); |
1911 | if (result > 0) | 1917 | if (result > 0) |
@@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) | |||
2005 | return 0; | 2011 | return 0; |
2006 | } | 2012 | } |
2007 | 2013 | ||
2008 | static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) | 2014 | static void nvme_disable_io_queues(struct nvme_dev *dev) |
2009 | { | 2015 | { |
2010 | int pass; | 2016 | int pass, queues = dev->online_queues - 1; |
2011 | unsigned long timeout; | 2017 | unsigned long timeout; |
2012 | u8 opcode = nvme_admin_delete_sq; | 2018 | u8 opcode = nvme_admin_delete_sq; |
2013 | 2019 | ||
@@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) | |||
2018 | retry: | 2024 | retry: |
2019 | timeout = ADMIN_TIMEOUT; | 2025 | timeout = ADMIN_TIMEOUT; |
2020 | for (; i > 0; i--, sent++) | 2026 | for (; i > 0; i--, sent++) |
2021 | if (nvme_delete_queue(dev->queues[i], opcode)) | 2027 | if (nvme_delete_queue(&dev->queues[i], opcode)) |
2022 | break; | 2028 | break; |
2023 | 2029 | ||
2024 | while (sent--) { | 2030 | while (sent--) { |
@@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) | |||
2033 | } | 2039 | } |
2034 | 2040 | ||
2035 | /* | 2041 | /* |
2036 | * Return: error value if an error occurred setting up the queues or calling | 2042 | * return error value only when tagset allocation failed |
2037 | * Identify Device. 0 if these succeeded, even if adding some of the | ||
2038 | * namespaces failed. At the moment, these failures are silent. TBD which | ||
2039 | * failures should be reported. | ||
2040 | */ | 2043 | */ |
2041 | static int nvme_dev_add(struct nvme_dev *dev) | 2044 | static int nvme_dev_add(struct nvme_dev *dev) |
2042 | { | 2045 | { |
2046 | int ret; | ||
2047 | |||
2043 | if (!dev->ctrl.tagset) { | 2048 | if (!dev->ctrl.tagset) { |
2044 | dev->tagset.ops = &nvme_mq_ops; | 2049 | dev->tagset.ops = &nvme_mq_ops; |
2045 | dev->tagset.nr_hw_queues = dev->online_queues - 1; | 2050 | dev->tagset.nr_hw_queues = dev->online_queues - 1; |
@@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev) | |||
2055 | dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; | 2060 | dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; |
2056 | dev->tagset.driver_data = dev; | 2061 | dev->tagset.driver_data = dev; |
2057 | 2062 | ||
2058 | if (blk_mq_alloc_tag_set(&dev->tagset)) | 2063 | ret = blk_mq_alloc_tag_set(&dev->tagset); |
2059 | return 0; | 2064 | if (ret) { |
2065 | dev_warn(dev->ctrl.device, | ||
2066 | "IO queues tagset allocation failed %d\n", ret); | ||
2067 | return ret; | ||
2068 | } | ||
2060 | dev->ctrl.tagset = &dev->tagset; | 2069 | dev->ctrl.tagset = &dev->tagset; |
2061 | 2070 | ||
2062 | nvme_dbbuf_set(dev); | 2071 | nvme_dbbuf_set(dev); |
@@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) | |||
2122 | "set queue depth=%u\n", dev->q_depth); | 2131 | "set queue depth=%u\n", dev->q_depth); |
2123 | } | 2132 | } |
2124 | 2133 | ||
2125 | /* | 2134 | nvme_map_cmb(dev); |
2126 | * CMBs can currently only exist on >=1.2 PCIe devices. We only | ||
2127 | * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group | ||
2128 | * has no name we can pass NULL as final argument to | ||
2129 | * sysfs_add_file_to_group. | ||
2130 | */ | ||
2131 | |||
2132 | if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { | ||
2133 | dev->cmb = nvme_map_cmb(dev); | ||
2134 | if (dev->cmb) { | ||
2135 | if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, | ||
2136 | &dev_attr_cmb.attr, NULL)) | ||
2137 | dev_warn(dev->ctrl.device, | ||
2138 | "failed to add sysfs attribute for CMB\n"); | ||
2139 | } | ||
2140 | } | ||
2141 | 2135 | ||
2142 | pci_enable_pcie_error_reporting(pdev); | 2136 | pci_enable_pcie_error_reporting(pdev); |
2143 | pci_save_state(pdev); | 2137 | pci_save_state(pdev); |
@@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) | |||
2170 | 2164 | ||
2171 | static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) | 2165 | static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) |
2172 | { | 2166 | { |
2173 | int i, queues; | 2167 | int i; |
2174 | bool dead = true; | 2168 | bool dead = true; |
2175 | struct pci_dev *pdev = to_pci_dev(dev->dev); | 2169 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
2176 | 2170 | ||
@@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) | |||
2205 | } | 2199 | } |
2206 | nvme_stop_queues(&dev->ctrl); | 2200 | nvme_stop_queues(&dev->ctrl); |
2207 | 2201 | ||
2208 | queues = dev->online_queues - 1; | 2202 | if (!dead) { |
2209 | for (i = dev->ctrl.queue_count - 1; i > 0; i--) | 2203 | nvme_disable_io_queues(dev); |
2210 | nvme_suspend_queue(dev->queues[i]); | ||
2211 | |||
2212 | if (dead) { | ||
2213 | /* A device might become IO incapable very soon during | ||
2214 | * probe, before the admin queue is configured. Thus, | ||
2215 | * queue_count can be 0 here. | ||
2216 | */ | ||
2217 | if (dev->ctrl.queue_count) | ||
2218 | nvme_suspend_queue(dev->queues[0]); | ||
2219 | } else { | ||
2220 | nvme_disable_io_queues(dev, queues); | ||
2221 | nvme_disable_admin_queue(dev, shutdown); | 2204 | nvme_disable_admin_queue(dev, shutdown); |
2222 | } | 2205 | } |
2206 | for (i = dev->ctrl.queue_count - 1; i >= 0; i--) | ||
2207 | nvme_suspend_queue(&dev->queues[i]); | ||
2208 | |||
2223 | nvme_pci_disable(dev); | 2209 | nvme_pci_disable(dev); |
2224 | 2210 | ||
2225 | blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); | 2211 | blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); |
@@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work) | |||
2289 | container_of(work, struct nvme_dev, ctrl.reset_work); | 2275 | container_of(work, struct nvme_dev, ctrl.reset_work); |
2290 | bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); | 2276 | bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); |
2291 | int result = -ENODEV; | 2277 | int result = -ENODEV; |
2278 | enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; | ||
2292 | 2279 | ||
2293 | if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) | 2280 | if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) |
2294 | goto out; | 2281 | goto out; |
@@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work) | |||
2300 | if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) | 2287 | if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) |
2301 | nvme_dev_disable(dev, false); | 2288 | nvme_dev_disable(dev, false); |
2302 | 2289 | ||
2290 | /* | ||
2291 | * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the | ||
2292 | * initializing procedure here. | ||
2293 | */ | ||
2294 | if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { | ||
2295 | dev_warn(dev->ctrl.device, | ||
2296 | "failed to mark controller RECONNECTING\n"); | ||
2297 | goto out; | ||
2298 | } | ||
2299 | |||
2303 | result = nvme_pci_enable(dev); | 2300 | result = nvme_pci_enable(dev); |
2304 | if (result) | 2301 | if (result) |
2305 | goto out; | 2302 | goto out; |
@@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work) | |||
2352 | dev_warn(dev->ctrl.device, "IO queues not created\n"); | 2349 | dev_warn(dev->ctrl.device, "IO queues not created\n"); |
2353 | nvme_kill_queues(&dev->ctrl); | 2350 | nvme_kill_queues(&dev->ctrl); |
2354 | nvme_remove_namespaces(&dev->ctrl); | 2351 | nvme_remove_namespaces(&dev->ctrl); |
2352 | new_state = NVME_CTRL_ADMIN_ONLY; | ||
2355 | } else { | 2353 | } else { |
2356 | nvme_start_queues(&dev->ctrl); | 2354 | nvme_start_queues(&dev->ctrl); |
2357 | nvme_wait_freeze(&dev->ctrl); | 2355 | nvme_wait_freeze(&dev->ctrl); |
2358 | nvme_dev_add(dev); | 2356 | /* hit this only when allocate tagset fails */ |
2357 | if (nvme_dev_add(dev)) | ||
2358 | new_state = NVME_CTRL_ADMIN_ONLY; | ||
2359 | nvme_unfreeze(&dev->ctrl); | 2359 | nvme_unfreeze(&dev->ctrl); |
2360 | } | 2360 | } |
2361 | 2361 | ||
2362 | if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { | 2362 | /* |
2363 | dev_warn(dev->ctrl.device, "failed to mark controller live\n"); | 2363 | * If only admin queue live, keep it to do further investigation or |
2364 | * recovery. | ||
2365 | */ | ||
2366 | if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { | ||
2367 | dev_warn(dev->ctrl.device, | ||
2368 | "failed to mark controller state %d\n", new_state); | ||
2364 | goto out; | 2369 | goto out; |
2365 | } | 2370 | } |
2366 | 2371 | ||
@@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) | |||
2468 | dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); | 2473 | dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); |
2469 | if (!dev) | 2474 | if (!dev) |
2470 | return -ENOMEM; | 2475 | return -ENOMEM; |
2471 | dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), | 2476 | |
2472 | GFP_KERNEL, node); | 2477 | dev->queues = kcalloc_node(num_possible_cpus() + 1, |
2478 | sizeof(struct nvme_queue), GFP_KERNEL, node); | ||
2473 | if (!dev->queues) | 2479 | if (!dev->queues) |
2474 | goto free; | 2480 | goto free; |
2475 | 2481 | ||
@@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) | |||
2496 | if (result) | 2502 | if (result) |
2497 | goto release_pools; | 2503 | goto release_pools; |
2498 | 2504 | ||
2499 | nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); | ||
2500 | dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); | 2505 | dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); |
2501 | 2506 | ||
2502 | queue_work(nvme_wq, &dev->ctrl.reset_work); | 2507 | nvme_reset_ctrl(&dev->ctrl); |
2508 | |||
2503 | return 0; | 2509 | return 0; |
2504 | 2510 | ||
2505 | release_pools: | 2511 | release_pools: |
@@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev) | |||
2523 | static void nvme_reset_done(struct pci_dev *pdev) | 2529 | static void nvme_reset_done(struct pci_dev *pdev) |
2524 | { | 2530 | { |
2525 | struct nvme_dev *dev = pci_get_drvdata(pdev); | 2531 | struct nvme_dev *dev = pci_get_drvdata(pdev); |
2526 | nvme_reset_ctrl(&dev->ctrl); | 2532 | nvme_reset_ctrl_sync(&dev->ctrl); |
2527 | } | 2533 | } |
2528 | 2534 | ||
2529 | static void nvme_shutdown(struct pci_dev *pdev) | 2535 | static void nvme_shutdown(struct pci_dev *pdev) |
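Taken together, the pci.c hunks above also change dev->queues from an array of individually allocated queue pointers to one flat array of embedded nvme_queue structs (allocated once with kcalloc_node() in nvme_probe()), which is why nvme_alloc_queue() now returns an int and never frees a queue on its error path. A minimal sketch of the layout change, using local stand-in types rather than the kernel structures:

#include <stdlib.h>

struct queue { int qid; int depth; };

struct dev_old { struct queue **queues; };      /* old: dev->queues[qid] may be NULL */
struct dev_new { struct queue *queues;  };      /* new: &dev->queues[qid] always valid */

static int alloc_queue_new(struct dev_new *dev, int qid, int depth)
{
        struct queue *q = &dev->queues[qid];    /* no allocation, no NULL check */

        q->qid = qid;
        q->depth = depth;
        return 0;
}

int main(void)
{
        struct dev_new dev;

        dev.queues = calloc(4, sizeof(*dev.queues));    /* kcalloc_node() analogue */
        if (!dev.queues)
                return 1;
        alloc_queue_new(&dev, 0, 32);
        free(dev.queues);
        return 0;
}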
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2a0bba7f50cf..2bc059f7d73c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c | |||
@@ -66,7 +66,6 @@ struct nvme_rdma_request { | |||
66 | struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; | 66 | struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; |
67 | u32 num_sge; | 67 | u32 num_sge; |
68 | int nents; | 68 | int nents; |
69 | bool inline_data; | ||
70 | struct ib_reg_wr reg_wr; | 69 | struct ib_reg_wr reg_wr; |
71 | struct ib_cqe reg_cqe; | 70 | struct ib_cqe reg_cqe; |
72 | struct nvme_rdma_queue *queue; | 71 | struct nvme_rdma_queue *queue; |
@@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, | |||
1092 | sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); | 1091 | sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); |
1093 | sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; | 1092 | sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; |
1094 | 1093 | ||
1095 | req->inline_data = true; | ||
1096 | req->num_sge++; | 1094 | req->num_sge++; |
1097 | return 0; | 1095 | return 0; |
1098 | } | 1096 | } |
@@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, | |||
1164 | int count, ret; | 1162 | int count, ret; |
1165 | 1163 | ||
1166 | req->num_sge = 1; | 1164 | req->num_sge = 1; |
1167 | req->inline_data = false; | ||
1168 | refcount_set(&req->ref, 2); /* send and recv completions */ | 1165 | refcount_set(&req->ref, 2); /* send and recv completions */ |
1169 | 1166 | ||
1170 | c->common.flags |= NVME_CMD_SGL_METABUF; | 1167 | c->common.flags |= NVME_CMD_SGL_METABUF; |
@@ -2018,6 +2015,7 @@ out_free_ctrl: | |||
2018 | 2015 | ||
2019 | static struct nvmf_transport_ops nvme_rdma_transport = { | 2016 | static struct nvmf_transport_ops nvme_rdma_transport = { |
2020 | .name = "rdma", | 2017 | .name = "rdma", |
2018 | .module = THIS_MODULE, | ||
2021 | .required_opts = NVMF_OPT_TRADDR, | 2019 | .required_opts = NVMF_OPT_TRADDR, |
2022 | .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | | 2020 | .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | |
2023 | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, | 2021 | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, |
@@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) | |||
2040 | } | 2038 | } |
2041 | mutex_unlock(&nvme_rdma_ctrl_mutex); | 2039 | mutex_unlock(&nvme_rdma_ctrl_mutex); |
2042 | 2040 | ||
2043 | flush_workqueue(nvme_wq); | 2041 | flush_workqueue(nvme_delete_wq); |
2044 | } | 2042 | } |
2045 | 2043 | ||
2046 | static struct ib_client nvme_rdma_ib_client = { | 2044 | static struct ib_client nvme_rdma_ib_client = { |
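The new .module = THIS_MODULE field in the transport ops above presumably lets the fabrics core pin the rdma transport module while controllers created through it are alive. A hedged, illustrative fragment of that general pattern (not the actual nvme-fabrics code; the struct and function names are placeholders):

#include <linux/errno.h>
#include <linux/module.h>

struct xport_ops {
        const char      *name;
        struct module   *module;
};

static int create_ctrl_pinned(const struct xport_ops *ops)
{
        if (!try_module_get(ops->module))
                return -EBUSY;          /* transport is on its way out */

        /* ... create the controller through ops here ... */

        module_put(ops->module);        /* or hold it until teardown */
        return 0;
}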
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c new file mode 100644 index 000000000000..41944bbef835 --- /dev/null +++ b/drivers/nvme/host/trace.c | |||
@@ -0,0 +1,130 @@ | |||
1 | /* | ||
2 | * NVM Express device driver tracepoints | ||
3 | * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <asm/unaligned.h> | ||
16 | #include "trace.h" | ||
17 | |||
18 | static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) | ||
19 | { | ||
20 | const char *ret = trace_seq_buffer_ptr(p); | ||
21 | u16 sqid = get_unaligned_le16(cdw10); | ||
22 | u16 qsize = get_unaligned_le16(cdw10 + 2); | ||
23 | u16 sq_flags = get_unaligned_le16(cdw10 + 4); | ||
24 | u16 cqid = get_unaligned_le16(cdw10 + 6); | ||
25 | |||
26 | |||
27 | trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", | ||
28 | sqid, qsize, sq_flags, cqid); | ||
29 | trace_seq_putc(p, 0); | ||
30 | |||
31 | return ret; | ||
32 | } | ||
33 | |||
34 | static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) | ||
35 | { | ||
36 | const char *ret = trace_seq_buffer_ptr(p); | ||
37 | u16 cqid = get_unaligned_le16(cdw10); | ||
38 | u16 qsize = get_unaligned_le16(cdw10 + 2); | ||
39 | u16 cq_flags = get_unaligned_le16(cdw10 + 4); | ||
40 | u16 irq_vector = get_unaligned_le16(cdw10 + 6); | ||
41 | |||
42 | trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u", | ||
43 | cqid, qsize, cq_flags, irq_vector); | ||
44 | trace_seq_putc(p, 0); | ||
45 | |||
46 | return ret; | ||
47 | } | ||
48 | |||
49 | static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) | ||
50 | { | ||
51 | const char *ret = trace_seq_buffer_ptr(p); | ||
52 | u8 cns = cdw10[0]; | ||
53 | u16 ctrlid = get_unaligned_le16(cdw10 + 2); | ||
54 | |||
55 | trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); | ||
56 | trace_seq_putc(p, 0); | ||
57 | |||
58 | return ret; | ||
59 | } | ||
60 | |||
61 | |||
62 | |||
63 | static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) | ||
64 | { | ||
65 | const char *ret = trace_seq_buffer_ptr(p); | ||
66 | u64 slba = get_unaligned_le64(cdw10); | ||
67 | u16 length = get_unaligned_le16(cdw10 + 8); | ||
68 | u16 control = get_unaligned_le16(cdw10 + 10); | ||
69 | u32 dsmgmt = get_unaligned_le32(cdw10 + 12); | ||
70 | u32 reftag = get_unaligned_le32(cdw10 + 16); | ||
71 | |||
72 | trace_seq_printf(p, | ||
73 | "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", | ||
74 | slba, length, control, dsmgmt, reftag); | ||
75 | trace_seq_putc(p, 0); | ||
76 | |||
77 | return ret; | ||
78 | } | ||
79 | |||
80 | static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) | ||
81 | { | ||
82 | const char *ret = trace_seq_buffer_ptr(p); | ||
83 | |||
84 | trace_seq_printf(p, "nr=%u, attributes=%u", | ||
85 | get_unaligned_le32(cdw10), | ||
86 | get_unaligned_le32(cdw10 + 4)); | ||
87 | trace_seq_putc(p, 0); | ||
88 | |||
89 | return ret; | ||
90 | } | ||
91 | |||
92 | static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) | ||
93 | { | ||
94 | const char *ret = trace_seq_buffer_ptr(p); | ||
95 | |||
96 | trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); | ||
97 | trace_seq_putc(p, 0); | ||
98 | |||
99 | return ret; | ||
100 | } | ||
101 | |||
102 | const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, | ||
103 | u8 opcode, u8 *cdw10) | ||
104 | { | ||
105 | switch (opcode) { | ||
106 | case nvme_admin_create_sq: | ||
107 | return nvme_trace_create_sq(p, cdw10); | ||
108 | case nvme_admin_create_cq: | ||
109 | return nvme_trace_create_cq(p, cdw10); | ||
110 | case nvme_admin_identify: | ||
111 | return nvme_trace_admin_identify(p, cdw10); | ||
112 | default: | ||
113 | return nvme_trace_common(p, cdw10); | ||
114 | } | ||
115 | } | ||
116 | |||
117 | const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, | ||
118 | u8 opcode, u8 *cdw10) | ||
119 | { | ||
120 | switch (opcode) { | ||
121 | case nvme_cmd_read: | ||
122 | case nvme_cmd_write: | ||
123 | case nvme_cmd_write_zeroes: | ||
124 | return nvme_trace_read_write(p, cdw10); | ||
125 | case nvme_cmd_dsm: | ||
126 | return nvme_trace_dsm(p, cdw10); | ||
127 | default: | ||
128 | return nvme_trace_common(p, cdw10); | ||
129 | } | ||
130 | } | ||
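The decode helpers above read fixed little-endian offsets out of the cdw10 area of the submission queue entry: because cmd->common.cdw10 spans CDW10 through CDW15, byte offsets 0 and 2 land in CDW10 (SQID, QSIZE) and offsets 4 and 6 in CDW11 (flags, CQID) for Create I/O SQ. A small standalone sketch of the same decoding, modeling get_unaligned_le16() with memcpy and assuming a little-endian host:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t le16_at(const uint8_t *p)
{
        uint16_t v;

        memcpy(&v, p, sizeof(v));
        return v;               /* little-endian host assumed for brevity */
}

int main(void)
{
        uint8_t cdw10[24] = { 0x01, 0x00, 0xff, 0x03, 0x01, 0x00, 0x01, 0x00 };

        printf("sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u\n",
               le16_at(cdw10), le16_at(cdw10 + 2),
               le16_at(cdw10 + 4), le16_at(cdw10 + 6));
        return 0;
}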
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h new file mode 100644 index 000000000000..ea91fccd1bc0 --- /dev/null +++ b/drivers/nvme/host/trace.h | |||
@@ -0,0 +1,165 @@ | |||
1 | /* | ||
2 | * NVM Express device driver tracepoints | ||
3 | * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #undef TRACE_SYSTEM | ||
16 | #define TRACE_SYSTEM nvme | ||
17 | |||
18 | #if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) | ||
19 | #define _TRACE_NVME_H | ||
20 | |||
21 | #include <linux/nvme.h> | ||
22 | #include <linux/tracepoint.h> | ||
23 | #include <linux/trace_seq.h> | ||
24 | |||
25 | #include "nvme.h" | ||
26 | |||
27 | #define nvme_admin_opcode_name(opcode) { opcode, #opcode } | ||
28 | #define show_admin_opcode_name(val) \ | ||
29 | __print_symbolic(val, \ | ||
30 | nvme_admin_opcode_name(nvme_admin_delete_sq), \ | ||
31 | nvme_admin_opcode_name(nvme_admin_create_sq), \ | ||
32 | nvme_admin_opcode_name(nvme_admin_get_log_page), \ | ||
33 | nvme_admin_opcode_name(nvme_admin_delete_cq), \ | ||
34 | nvme_admin_opcode_name(nvme_admin_create_cq), \ | ||
35 | nvme_admin_opcode_name(nvme_admin_identify), \ | ||
36 | nvme_admin_opcode_name(nvme_admin_abort_cmd), \ | ||
37 | nvme_admin_opcode_name(nvme_admin_set_features), \ | ||
38 | nvme_admin_opcode_name(nvme_admin_get_features), \ | ||
39 | nvme_admin_opcode_name(nvme_admin_async_event), \ | ||
40 | nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ | ||
41 | nvme_admin_opcode_name(nvme_admin_activate_fw), \ | ||
42 | nvme_admin_opcode_name(nvme_admin_download_fw), \ | ||
43 | nvme_admin_opcode_name(nvme_admin_ns_attach), \ | ||
44 | nvme_admin_opcode_name(nvme_admin_keep_alive), \ | ||
45 | nvme_admin_opcode_name(nvme_admin_directive_send), \ | ||
46 | nvme_admin_opcode_name(nvme_admin_directive_recv), \ | ||
47 | nvme_admin_opcode_name(nvme_admin_dbbuf), \ | ||
48 | nvme_admin_opcode_name(nvme_admin_format_nvm), \ | ||
49 | nvme_admin_opcode_name(nvme_admin_security_send), \ | ||
50 | nvme_admin_opcode_name(nvme_admin_security_recv), \ | ||
51 | nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) | ||
52 | |||
53 | const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, | ||
54 | u8 *cdw10); | ||
55 | #define __parse_nvme_admin_cmd(opcode, cdw10) \ | ||
56 | nvme_trace_parse_admin_cmd(p, opcode, cdw10) | ||
57 | |||
58 | #define nvme_opcode_name(opcode) { opcode, #opcode } | ||
59 | #define show_opcode_name(val) \ | ||
60 | __print_symbolic(val, \ | ||
61 | nvme_opcode_name(nvme_cmd_flush), \ | ||
62 | nvme_opcode_name(nvme_cmd_write), \ | ||
63 | nvme_opcode_name(nvme_cmd_read), \ | ||
64 | nvme_opcode_name(nvme_cmd_write_uncor), \ | ||
65 | nvme_opcode_name(nvme_cmd_compare), \ | ||
66 | nvme_opcode_name(nvme_cmd_write_zeroes), \ | ||
67 | nvme_opcode_name(nvme_cmd_dsm), \ | ||
68 | nvme_opcode_name(nvme_cmd_resv_register), \ | ||
69 | nvme_opcode_name(nvme_cmd_resv_report), \ | ||
70 | nvme_opcode_name(nvme_cmd_resv_acquire), \ | ||
71 | nvme_opcode_name(nvme_cmd_resv_release)) | ||
72 | |||
73 | const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, | ||
74 | u8 *cdw10); | ||
75 | #define __parse_nvme_cmd(opcode, cdw10) \ | ||
76 | nvme_trace_parse_nvm_cmd(p, opcode, cdw10) | ||
77 | |||
78 | TRACE_EVENT(nvme_setup_admin_cmd, | ||
79 | TP_PROTO(struct nvme_command *cmd), | ||
80 | TP_ARGS(cmd), | ||
81 | TP_STRUCT__entry( | ||
82 | __field(u8, opcode) | ||
83 | __field(u8, flags) | ||
84 | __field(u16, cid) | ||
85 | __field(u64, metadata) | ||
86 | __array(u8, cdw10, 24) | ||
87 | ), | ||
88 | TP_fast_assign( | ||
89 | __entry->opcode = cmd->common.opcode; | ||
90 | __entry->flags = cmd->common.flags; | ||
91 | __entry->cid = cmd->common.command_id; | ||
92 | __entry->metadata = le64_to_cpu(cmd->common.metadata); | ||
93 | memcpy(__entry->cdw10, cmd->common.cdw10, | ||
94 | sizeof(__entry->cdw10)); | ||
95 | ), | ||
96 | TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", | ||
97 | __entry->cid, __entry->flags, __entry->metadata, | ||
98 | show_admin_opcode_name(__entry->opcode), | ||
99 | __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) | ||
100 | ); | ||
101 | |||
102 | |||
103 | TRACE_EVENT(nvme_setup_nvm_cmd, | ||
104 | TP_PROTO(int qid, struct nvme_command *cmd), | ||
105 | TP_ARGS(qid, cmd), | ||
106 | TP_STRUCT__entry( | ||
107 | __field(int, qid) | ||
108 | __field(u8, opcode) | ||
109 | __field(u8, flags) | ||
110 | __field(u16, cid) | ||
111 | __field(u32, nsid) | ||
112 | __field(u64, metadata) | ||
113 | __array(u8, cdw10, 24) | ||
114 | ), | ||
115 | TP_fast_assign( | ||
116 | __entry->qid = qid; | ||
117 | __entry->opcode = cmd->common.opcode; | ||
118 | __entry->flags = cmd->common.flags; | ||
119 | __entry->cid = cmd->common.command_id; | ||
120 | __entry->nsid = le32_to_cpu(cmd->common.nsid); | ||
121 | __entry->metadata = le64_to_cpu(cmd->common.metadata); | ||
122 | memcpy(__entry->cdw10, cmd->common.cdw10, | ||
123 | sizeof(__entry->cdw10)); | ||
124 | ), | ||
125 | TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", | ||
126 | __entry->qid, __entry->nsid, __entry->cid, | ||
127 | __entry->flags, __entry->metadata, | ||
128 | show_opcode_name(__entry->opcode), | ||
129 | __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) | ||
130 | ); | ||
131 | |||
132 | TRACE_EVENT(nvme_complete_rq, | ||
133 | TP_PROTO(struct request *req), | ||
134 | TP_ARGS(req), | ||
135 | TP_STRUCT__entry( | ||
136 | __field(int, qid) | ||
137 | __field(int, cid) | ||
138 | __field(u64, result) | ||
139 | __field(u8, retries) | ||
140 | __field(u8, flags) | ||
141 | __field(u16, status) | ||
142 | ), | ||
143 | TP_fast_assign( | ||
144 | __entry->qid = req->q->id; | ||
145 | __entry->cid = req->tag; | ||
146 | __entry->result = le64_to_cpu(nvme_req(req)->result.u64); | ||
147 | __entry->retries = nvme_req(req)->retries; | ||
148 | __entry->flags = nvme_req(req)->flags; | ||
149 | __entry->status = nvme_req(req)->status; | ||
150 | ), | ||
151 | TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u", | ||
152 | __entry->cid, __entry->qid, __entry->result, | ||
153 | __entry->retries, __entry->flags, __entry->status) | ||
154 | |||
155 | ); | ||
156 | |||
157 | #endif /* _TRACE_NVME_H */ | ||
158 | |||
159 | #undef TRACE_INCLUDE_PATH | ||
160 | #define TRACE_INCLUDE_PATH . | ||
161 | #undef TRACE_INCLUDE_FILE | ||
162 | #define TRACE_INCLUDE_FILE trace | ||
163 | |||
164 | /* This part must be outside protection */ | ||
165 | #include <trace/define_trace.h> | ||
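As with any TRACE_EVENT() header, this file is written to be included more than once; by the usual convention exactly one translation unit (not shown in the hunks quoted here) defines CREATE_TRACE_POINTS before including it so the event bodies are emitted, and with TRACE_INCLUDE_PATH set to "." the Makefile typically adds ccflags-y += -I$(src) so the tracing core can re-find the header. A minimal sketch of that wiring; the call-site variables are placeholders, not quoted code:

/* in exactly one .c file of the driver (illustrative placement) */
#define CREATE_TRACE_POINTS
#include "trace.h"

/* any other .c file only includes "trace.h" and calls the generated hooks */
        trace_nvme_setup_admin_cmd(cmd);
        trace_nvme_setup_nvm_cmd(qid, cmd);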
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 03e4ab65fe77..5f4f8b16685f 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig | |||
@@ -29,6 +29,7 @@ config NVME_TARGET_RDMA | |||
29 | tristate "NVMe over Fabrics RDMA target support" | 29 | tristate "NVMe over Fabrics RDMA target support" |
30 | depends on INFINIBAND | 30 | depends on INFINIBAND |
31 | depends on NVME_TARGET | 31 | depends on NVME_TARGET |
32 | select SGL_ALLOC | ||
32 | help | 33 | help |
33 | This enables the NVMe RDMA target support, which allows exporting NVMe | 34 | This enables the NVMe RDMA target support, which allows exporting NVMe |
34 | devices over RDMA. | 35 | devices over RDMA. |
@@ -39,6 +40,7 @@ config NVME_TARGET_FC | |||
39 | tristate "NVMe over Fabrics FC target driver" | 40 | tristate "NVMe over Fabrics FC target driver" |
40 | depends on NVME_TARGET | 41 | depends on NVME_TARGET |
41 | depends on HAS_DMA | 42 | depends on HAS_DMA |
43 | select SGL_ALLOC | ||
42 | help | 44 | help |
43 | This enables the NVMe FC target support, which allows exporting NVMe | 45 | This enables the NVMe FC target support, which allows exporting NVMe |
44 | devices over FC. | 46 | devices over FC. |
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b54748ad5f48..0bd737117a80 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c | |||
@@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, | |||
512 | req->sg_cnt = 0; | 512 | req->sg_cnt = 0; |
513 | req->transfer_len = 0; | 513 | req->transfer_len = 0; |
514 | req->rsp->status = 0; | 514 | req->rsp->status = 0; |
515 | req->ns = NULL; | ||
515 | 516 | ||
516 | /* no support for fused commands yet */ | 517 | /* no support for fused commands yet */ |
517 | if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { | 518 | if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { |
@@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init); | |||
557 | void nvmet_req_uninit(struct nvmet_req *req) | 558 | void nvmet_req_uninit(struct nvmet_req *req) |
558 | { | 559 | { |
559 | percpu_ref_put(&req->sq->ref); | 560 | percpu_ref_put(&req->sq->ref); |
561 | if (req->ns) | ||
562 | nvmet_put_namespace(req->ns); | ||
560 | } | 563 | } |
561 | EXPORT_SYMBOL_GPL(nvmet_req_uninit); | 564 | EXPORT_SYMBOL_GPL(nvmet_req_uninit); |
562 | 565 | ||
@@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | |||
830 | /* Don't accept keep-alive timeout for discovery controllers */ | 833 | /* Don't accept keep-alive timeout for discovery controllers */ |
831 | if (kato) { | 834 | if (kato) { |
832 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; | 835 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; |
833 | goto out_free_sqs; | 836 | goto out_remove_ida; |
834 | } | 837 | } |
835 | 838 | ||
836 | /* | 839 | /* |
@@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | |||
860 | *ctrlp = ctrl; | 863 | *ctrlp = ctrl; |
861 | return 0; | 864 | return 0; |
862 | 865 | ||
866 | out_remove_ida: | ||
867 | ida_simple_remove(&cntlid_ida, ctrl->cntlid); | ||
863 | out_free_sqs: | 868 | out_free_sqs: |
864 | kfree(ctrl->sqs); | 869 | kfree(ctrl->sqs); |
865 | out_free_cqs: | 870 | out_free_cqs: |
@@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref) | |||
877 | struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); | 882 | struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); |
878 | struct nvmet_subsys *subsys = ctrl->subsys; | 883 | struct nvmet_subsys *subsys = ctrl->subsys; |
879 | 884 | ||
880 | nvmet_stop_keep_alive_timer(ctrl); | ||
881 | |||
882 | mutex_lock(&subsys->lock); | 885 | mutex_lock(&subsys->lock); |
883 | list_del(&ctrl->subsys_entry); | 886 | list_del(&ctrl->subsys_entry); |
884 | mutex_unlock(&subsys->lock); | 887 | mutex_unlock(&subsys->lock); |
885 | 888 | ||
889 | nvmet_stop_keep_alive_timer(ctrl); | ||
890 | |||
886 | flush_work(&ctrl->async_event_work); | 891 | flush_work(&ctrl->async_event_work); |
887 | cancel_work_sync(&ctrl->fatal_err_work); | 892 | cancel_work_sync(&ctrl->fatal_err_work); |
888 | 893 | ||
889 | ida_simple_remove(&cntlid_ida, ctrl->cntlid); | 894 | ida_simple_remove(&cntlid_ida, ctrl->cntlid); |
890 | nvmet_subsys_put(subsys); | ||
891 | 895 | ||
892 | kfree(ctrl->sqs); | 896 | kfree(ctrl->sqs); |
893 | kfree(ctrl->cqs); | 897 | kfree(ctrl->cqs); |
894 | kfree(ctrl); | 898 | kfree(ctrl); |
899 | |||
900 | nvmet_subsys_put(subsys); | ||
895 | } | 901 | } |
896 | 902 | ||
897 | void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) | 903 | void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) |
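One detail worth calling out in the core.c hunks above: nvmet_req_init() now clears req->ns up front so that nvmet_req_uninit() can drop the namespace reference unconditionally, whichever command handler did or did not take one. A minimal stand-in sketch of that pairing (the types and ns_put() are placeholders modeling nvmet_put_namespace(), not the nvmet API):

#include <stddef.h>

struct ns  { int refs; };
struct req { struct ns *ns; };

static void ns_put(struct ns *ns)      { ns->refs--; }
static void req_init(struct req *req)  { req->ns = NULL; }

static void req_uninit(struct req *req)
{
        if (req->ns)            /* safe whether or not a handler set req->ns */
                ns_put(req->ns);
}

int main(void)
{
        struct ns n = { .refs = 1 };
        struct req r;

        req_init(&r);
        r.ns = &n;              /* a command handler took a reference */
        req_uninit(&r);
        return n.refs;          /* 0: the reference came back exactly once */
}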
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index db3bf6b8bf9e..19e9e42ae943 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c | |||
@@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) | |||
225 | goto out_ctrl_put; | 225 | goto out_ctrl_put; |
226 | } | 226 | } |
227 | 227 | ||
228 | pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); | 228 | pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); |
229 | 229 | ||
230 | out: | 230 | out: |
231 | kfree(d); | 231 | kfree(d); |
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 5fd86039e353..9b39a6cb1935 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c | |||
@@ -1697,31 +1697,12 @@ static int | |||
1697 | nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) | 1697 | nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) |
1698 | { | 1698 | { |
1699 | struct scatterlist *sg; | 1699 | struct scatterlist *sg; |
1700 | struct page *page; | ||
1701 | unsigned int nent; | 1700 | unsigned int nent; |
1702 | u32 page_len, length; | ||
1703 | int i = 0; | ||
1704 | 1701 | ||
1705 | length = fod->req.transfer_len; | 1702 | sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent); |
1706 | nent = DIV_ROUND_UP(length, PAGE_SIZE); | ||
1707 | sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); | ||
1708 | if (!sg) | 1703 | if (!sg) |
1709 | goto out; | 1704 | goto out; |
1710 | 1705 | ||
1711 | sg_init_table(sg, nent); | ||
1712 | |||
1713 | while (length) { | ||
1714 | page_len = min_t(u32, length, PAGE_SIZE); | ||
1715 | |||
1716 | page = alloc_page(GFP_KERNEL); | ||
1717 | if (!page) | ||
1718 | goto out_free_pages; | ||
1719 | |||
1720 | sg_set_page(&sg[i], page, page_len, 0); | ||
1721 | length -= page_len; | ||
1722 | i++; | ||
1723 | } | ||
1724 | |||
1725 | fod->data_sg = sg; | 1706 | fod->data_sg = sg; |
1726 | fod->data_sg_cnt = nent; | 1707 | fod->data_sg_cnt = nent; |
1727 | fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, | 1708 | fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, |
@@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) | |||
1731 | 1712 | ||
1732 | return 0; | 1713 | return 0; |
1733 | 1714 | ||
1734 | out_free_pages: | ||
1735 | while (i > 0) { | ||
1736 | i--; | ||
1737 | __free_page(sg_page(&sg[i])); | ||
1738 | } | ||
1739 | kfree(sg); | ||
1740 | fod->data_sg = NULL; | ||
1741 | fod->data_sg_cnt = 0; | ||
1742 | out: | 1715 | out: |
1743 | return NVME_SC_INTERNAL; | 1716 | return NVME_SC_INTERNAL; |
1744 | } | 1717 | } |
@@ -1746,18 +1719,13 @@ out: | |||
1746 | static void | 1719 | static void |
1747 | nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) | 1720 | nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) |
1748 | { | 1721 | { |
1749 | struct scatterlist *sg; | ||
1750 | int count; | ||
1751 | |||
1752 | if (!fod->data_sg || !fod->data_sg_cnt) | 1722 | if (!fod->data_sg || !fod->data_sg_cnt) |
1753 | return; | 1723 | return; |
1754 | 1724 | ||
1755 | fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, | 1725 | fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, |
1756 | ((fod->io_dir == NVMET_FCP_WRITE) ? | 1726 | ((fod->io_dir == NVMET_FCP_WRITE) ? |
1757 | DMA_FROM_DEVICE : DMA_TO_DEVICE)); | 1727 | DMA_FROM_DEVICE : DMA_TO_DEVICE)); |
1758 | for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) | 1728 | sgl_free(fod->data_sg); |
1759 | __free_page(sg_page(sg)); | ||
1760 | kfree(fod->data_sg); | ||
1761 | fod->data_sg = NULL; | 1729 | fod->data_sg = NULL; |
1762 | fod->data_sg_cnt = 0; | 1730 | fod->data_sg_cnt = 0; |
1763 | } | 1731 | } |
@@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port) | |||
2522 | list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { | 2490 | list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { |
2523 | if ((tgtport->fc_target_port.node_name == traddr.nn) && | 2491 | if ((tgtport->fc_target_port.node_name == traddr.nn) && |
2524 | (tgtport->fc_target_port.port_name == traddr.pn)) { | 2492 | (tgtport->fc_target_port.port_name == traddr.pn)) { |
2525 | /* a FC port can only be 1 nvmet port id */ | 2493 | tgtport->port = port; |
2526 | if (!tgtport->port) { | 2494 | ret = 0; |
2527 | tgtport->port = port; | ||
2528 | port->priv = tgtport; | ||
2529 | nvmet_fc_tgtport_get(tgtport); | ||
2530 | ret = 0; | ||
2531 | } else | ||
2532 | ret = -EALREADY; | ||
2533 | break; | 2495 | break; |
2534 | } | 2496 | } |
2535 | } | 2497 | } |
@@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port) | |||
2540 | static void | 2502 | static void |
2541 | nvmet_fc_remove_port(struct nvmet_port *port) | 2503 | nvmet_fc_remove_port(struct nvmet_port *port) |
2542 | { | 2504 | { |
2543 | struct nvmet_fc_tgtport *tgtport = port->priv; | 2505 | /* nothing to do */ |
2544 | unsigned long flags; | ||
2545 | bool matched = false; | ||
2546 | |||
2547 | spin_lock_irqsave(&nvmet_fc_tgtlock, flags); | ||
2548 | if (tgtport->port == port) { | ||
2549 | matched = true; | ||
2550 | tgtport->port = NULL; | ||
2551 | } | ||
2552 | spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); | ||
2553 | |||
2554 | if (matched) | ||
2555 | nvmet_fc_tgtport_put(tgtport); | ||
2556 | } | 2506 | } |
2557 | 2507 | ||
2558 | static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { | 2508 | static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { |
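The fc target hunks above swap the open-coded page-by-page scatterlist build and teardown for the shared lib/scatterlist helpers, which is also why the Kconfig hunk earlier selects SGL_ALLOC. A short usage sketch following the call pattern visible in the diff; the wrapper names are local stand-ins, only sgl_alloc() and sgl_free() are real:

#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

static struct scatterlist *alloc_payload(u32 length, unsigned int *nent)
{
        /* page-backed scatterlist covering 'length' bytes, NULL on failure */
        return sgl_alloc(length, GFP_KERNEL, nent);
}

static void free_payload(struct scatterlist *sg)
{
        /* releases the pages and the scatterlist table in one call */
        sgl_free(sg);
}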
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 6a018a0bd6ce..34712def81b1 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c | |||
@@ -204,6 +204,10 @@ struct fcloop_lport { | |||
204 | struct completion unreg_done; | 204 | struct completion unreg_done; |
205 | }; | 205 | }; |
206 | 206 | ||
207 | struct fcloop_lport_priv { | ||
208 | struct fcloop_lport *lport; | ||
209 | }; | ||
210 | |||
207 | struct fcloop_rport { | 211 | struct fcloop_rport { |
208 | struct nvme_fc_remote_port *remoteport; | 212 | struct nvme_fc_remote_port *remoteport; |
209 | struct nvmet_fc_target_port *targetport; | 213 | struct nvmet_fc_target_port *targetport; |
@@ -238,21 +242,32 @@ struct fcloop_lsreq { | |||
238 | int status; | 242 | int status; |
239 | }; | 243 | }; |
240 | 244 | ||
245 | enum { | ||
246 | INI_IO_START = 0, | ||
247 | INI_IO_ACTIVE = 1, | ||
248 | INI_IO_ABORTED = 2, | ||
249 | INI_IO_COMPLETED = 3, | ||
250 | }; | ||
251 | |||
241 | struct fcloop_fcpreq { | 252 | struct fcloop_fcpreq { |
242 | struct fcloop_tport *tport; | 253 | struct fcloop_tport *tport; |
243 | struct nvmefc_fcp_req *fcpreq; | 254 | struct nvmefc_fcp_req *fcpreq; |
244 | spinlock_t reqlock; | 255 | spinlock_t reqlock; |
245 | u16 status; | 256 | u16 status; |
257 | u32 inistate; | ||
246 | bool active; | 258 | bool active; |
247 | bool aborted; | 259 | bool aborted; |
248 | struct work_struct work; | 260 | struct kref ref; |
261 | struct work_struct fcp_rcv_work; | ||
262 | struct work_struct abort_rcv_work; | ||
263 | struct work_struct tio_done_work; | ||
249 | struct nvmefc_tgt_fcp_req tgt_fcp_req; | 264 | struct nvmefc_tgt_fcp_req tgt_fcp_req; |
250 | }; | 265 | }; |
251 | 266 | ||
252 | struct fcloop_ini_fcpreq { | 267 | struct fcloop_ini_fcpreq { |
253 | struct nvmefc_fcp_req *fcpreq; | 268 | struct nvmefc_fcp_req *fcpreq; |
254 | struct fcloop_fcpreq *tfcp_req; | 269 | struct fcloop_fcpreq *tfcp_req; |
255 | struct work_struct iniwork; | 270 | spinlock_t inilock; |
256 | }; | 271 | }; |
257 | 272 | ||
258 | static inline struct fcloop_lsreq * | 273 | static inline struct fcloop_lsreq * |
@@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, | |||
343 | return 0; | 358 | return 0; |
344 | } | 359 | } |
345 | 360 | ||
346 | /* | ||
347 | * FCP IO operation done by initiator abort. | ||
348 | * call back up initiator "done" flows. | ||
349 | */ | ||
350 | static void | 361 | static void |
351 | fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) | 362 | fcloop_tfcp_req_free(struct kref *ref) |
352 | { | 363 | { |
353 | struct fcloop_ini_fcpreq *inireq = | 364 | struct fcloop_fcpreq *tfcp_req = |
354 | container_of(work, struct fcloop_ini_fcpreq, iniwork); | 365 | container_of(ref, struct fcloop_fcpreq, ref); |
366 | |||
367 | kfree(tfcp_req); | ||
368 | } | ||
369 | |||
370 | static void | ||
371 | fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) | ||
372 | { | ||
373 | kref_put(&tfcp_req->ref, fcloop_tfcp_req_free); | ||
374 | } | ||
375 | |||
376 | static int | ||
377 | fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req) | ||
378 | { | ||
379 | return kref_get_unless_zero(&tfcp_req->ref); | ||
380 | } | ||
381 | |||
382 | static void | ||
383 | fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, | ||
384 | struct fcloop_fcpreq *tfcp_req, int status) | ||
385 | { | ||
386 | struct fcloop_ini_fcpreq *inireq = NULL; | ||
387 | |||
388 | if (fcpreq) { | ||
389 | inireq = fcpreq->private; | ||
390 | spin_lock(&inireq->inilock); | ||
391 | inireq->tfcp_req = NULL; | ||
392 | spin_unlock(&inireq->inilock); | ||
393 | |||
394 | fcpreq->status = status; | ||
395 | fcpreq->done(fcpreq); | ||
396 | } | ||
397 | |||
398 | /* release original io reference on tgt struct */ | ||
399 | fcloop_tfcp_req_put(tfcp_req); | ||
400 | } | ||
401 | |||
402 | static void | ||
403 | fcloop_fcp_recv_work(struct work_struct *work) | ||
404 | { | ||
405 | struct fcloop_fcpreq *tfcp_req = | ||
406 | container_of(work, struct fcloop_fcpreq, fcp_rcv_work); | ||
407 | struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; | ||
408 | int ret = 0; | ||
409 | bool aborted = false; | ||
410 | |||
411 | spin_lock(&tfcp_req->reqlock); | ||
412 | switch (tfcp_req->inistate) { | ||
413 | case INI_IO_START: | ||
414 | tfcp_req->inistate = INI_IO_ACTIVE; | ||
415 | break; | ||
416 | case INI_IO_ABORTED: | ||
417 | aborted = true; | ||
418 | break; | ||
419 | default: | ||
420 | spin_unlock(&tfcp_req->reqlock); | ||
421 | WARN_ON(1); | ||
422 | return; | ||
423 | } | ||
424 | spin_unlock(&tfcp_req->reqlock); | ||
425 | |||
426 | if (unlikely(aborted)) | ||
427 | ret = -ECANCELED; | ||
428 | else | ||
429 | ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, | ||
430 | &tfcp_req->tgt_fcp_req, | ||
431 | fcpreq->cmdaddr, fcpreq->cmdlen); | ||
432 | if (ret) | ||
433 | fcloop_call_host_done(fcpreq, tfcp_req, ret); | ||
434 | |||
435 | return; | ||
436 | } | ||
437 | |||
438 | static void | ||
439 | fcloop_fcp_abort_recv_work(struct work_struct *work) | ||
440 | { | ||
441 | struct fcloop_fcpreq *tfcp_req = | ||
442 | container_of(work, struct fcloop_fcpreq, abort_rcv_work); | ||
443 | struct nvmefc_fcp_req *fcpreq; | ||
444 | bool completed = false; | ||
445 | |||
446 | spin_lock(&tfcp_req->reqlock); | ||
447 | fcpreq = tfcp_req->fcpreq; | ||
448 | switch (tfcp_req->inistate) { | ||
449 | case INI_IO_ABORTED: | ||
450 | break; | ||
451 | case INI_IO_COMPLETED: | ||
452 | completed = true; | ||
453 | break; | ||
454 | default: | ||
455 | spin_unlock(&tfcp_req->reqlock); | ||
456 | WARN_ON(1); | ||
457 | return; | ||
458 | } | ||
459 | spin_unlock(&tfcp_req->reqlock); | ||
460 | |||
461 | if (unlikely(completed)) { | ||
462 | /* remove reference taken in original abort downcall */ | ||
463 | fcloop_tfcp_req_put(tfcp_req); | ||
464 | return; | ||
465 | } | ||
355 | 466 | ||
356 | inireq->fcpreq->done(inireq->fcpreq); | 467 | if (tfcp_req->tport->targetport) |
468 | nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, | ||
469 | &tfcp_req->tgt_fcp_req); | ||
470 | |||
471 | spin_lock(&tfcp_req->reqlock); | ||
472 | tfcp_req->fcpreq = NULL; | ||
473 | spin_unlock(&tfcp_req->reqlock); | ||
474 | |||
475 | fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); | ||
476 | /* call_host_done releases reference for abort downcall */ | ||
357 | } | 477 | } |
358 | 478 | ||
359 | /* | 479 | /* |
@@ -364,20 +484,15 @@ static void | |||
364 | fcloop_tgt_fcprqst_done_work(struct work_struct *work) | 484 | fcloop_tgt_fcprqst_done_work(struct work_struct *work) |
365 | { | 485 | { |
366 | struct fcloop_fcpreq *tfcp_req = | 486 | struct fcloop_fcpreq *tfcp_req = |
367 | container_of(work, struct fcloop_fcpreq, work); | 487 | container_of(work, struct fcloop_fcpreq, tio_done_work); |
368 | struct fcloop_tport *tport = tfcp_req->tport; | ||
369 | struct nvmefc_fcp_req *fcpreq; | 488 | struct nvmefc_fcp_req *fcpreq; |
370 | 489 | ||
371 | spin_lock(&tfcp_req->reqlock); | 490 | spin_lock(&tfcp_req->reqlock); |
372 | fcpreq = tfcp_req->fcpreq; | 491 | fcpreq = tfcp_req->fcpreq; |
492 | tfcp_req->inistate = INI_IO_COMPLETED; | ||
373 | spin_unlock(&tfcp_req->reqlock); | 493 | spin_unlock(&tfcp_req->reqlock); |
374 | 494 | ||
375 | if (tport->remoteport && fcpreq) { | 495 | fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); |
376 | fcpreq->status = tfcp_req->status; | ||
377 | fcpreq->done(fcpreq); | ||
378 | } | ||
379 | |||
380 | kfree(tfcp_req); | ||
381 | } | 496 | } |
382 | 497 | ||
383 | 498 | ||
@@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, | |||
390 | struct fcloop_rport *rport = remoteport->private; | 505 | struct fcloop_rport *rport = remoteport->private; |
391 | struct fcloop_ini_fcpreq *inireq = fcpreq->private; | 506 | struct fcloop_ini_fcpreq *inireq = fcpreq->private; |
392 | struct fcloop_fcpreq *tfcp_req; | 507 | struct fcloop_fcpreq *tfcp_req; |
393 | int ret = 0; | ||
394 | 508 | ||
395 | if (!rport->targetport) | 509 | if (!rport->targetport) |
396 | return -ECONNREFUSED; | 510 | return -ECONNREFUSED; |
@@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, | |||
401 | 515 | ||
402 | inireq->fcpreq = fcpreq; | 516 | inireq->fcpreq = fcpreq; |
403 | inireq->tfcp_req = tfcp_req; | 517 | inireq->tfcp_req = tfcp_req; |
404 | INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); | 518 | spin_lock_init(&inireq->inilock); |
519 | |||
405 | tfcp_req->fcpreq = fcpreq; | 520 | tfcp_req->fcpreq = fcpreq; |
406 | tfcp_req->tport = rport->targetport->private; | 521 | tfcp_req->tport = rport->targetport->private; |
522 | tfcp_req->inistate = INI_IO_START; | ||
407 | spin_lock_init(&tfcp_req->reqlock); | 523 | spin_lock_init(&tfcp_req->reqlock); |
408 | INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); | 524 | INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work); |
525 | INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work); | ||
526 | INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work); | ||
527 | kref_init(&tfcp_req->ref); | ||
409 | 528 | ||
410 | ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, | 529 | schedule_work(&tfcp_req->fcp_rcv_work); |
411 | fcpreq->cmdaddr, fcpreq->cmdlen); | ||
412 | 530 | ||
413 | return ret; | 531 | return 0; |
414 | } | 532 | } |
415 | 533 | ||
416 | static void | 534 | static void |
@@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, | |||
589 | { | 707 | { |
590 | struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); | 708 | struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); |
591 | 709 | ||
592 | schedule_work(&tfcp_req->work); | 710 | schedule_work(&tfcp_req->tio_done_work); |
593 | } | 711 | } |
594 | 712 | ||
595 | static void | 713 | static void |
@@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, | |||
605 | void *hw_queue_handle, | 723 | void *hw_queue_handle, |
606 | struct nvmefc_fcp_req *fcpreq) | 724 | struct nvmefc_fcp_req *fcpreq) |
607 | { | 725 | { |
608 | struct fcloop_rport *rport = remoteport->private; | ||
609 | struct fcloop_ini_fcpreq *inireq = fcpreq->private; | 726 | struct fcloop_ini_fcpreq *inireq = fcpreq->private; |
610 | struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; | 727 | struct fcloop_fcpreq *tfcp_req; |
728 | bool abortio = true; | ||
729 | |||
730 | spin_lock(&inireq->inilock); | ||
731 | tfcp_req = inireq->tfcp_req; | ||
732 | if (tfcp_req) | ||
733 | fcloop_tfcp_req_get(tfcp_req); | ||
734 | spin_unlock(&inireq->inilock); | ||
611 | 735 | ||
612 | if (!tfcp_req) | 736 | if (!tfcp_req) |
613 | /* abort has already been called */ | 737 | /* abort has already been called */ |
614 | return; | 738 | return; |
615 | 739 | ||
616 | if (rport->targetport) | ||
617 | nvmet_fc_rcv_fcp_abort(rport->targetport, | ||
618 | &tfcp_req->tgt_fcp_req); | ||
619 | |||
620 | /* break initiator/target relationship for io */ | 740 | /* break initiator/target relationship for io */ |
621 | spin_lock(&tfcp_req->reqlock); | 741 | spin_lock(&tfcp_req->reqlock); |
622 | inireq->tfcp_req = NULL; | 742 | switch (tfcp_req->inistate) { |
623 | tfcp_req->fcpreq = NULL; | 743 | case INI_IO_START: |
744 | case INI_IO_ACTIVE: | ||
745 | tfcp_req->inistate = INI_IO_ABORTED; | ||
746 | break; | ||
747 | case INI_IO_COMPLETED: | ||
748 | abortio = false; | ||
749 | break; | ||
750 | default: | ||
751 | spin_unlock(&tfcp_req->reqlock); | ||
752 | WARN_ON(1); | ||
753 | return; | ||
754 | } | ||
624 | spin_unlock(&tfcp_req->reqlock); | 755 | spin_unlock(&tfcp_req->reqlock); |
625 | 756 | ||
626 | /* post the aborted io completion */ | 757 | if (abortio) |
627 | fcpreq->status = -ECANCELED; | 758 | /* leave the reference while the work item is scheduled */ |
628 | schedule_work(&inireq->iniwork); | 759 | WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work)); |
760 | else { | ||
761 | /* | ||
762 | * as the io has already had the done callback made, | ||
763 | * nothing more to do. So release the reference taken above | ||
764 | */ | ||
765 | fcloop_tfcp_req_put(tfcp_req); | ||
766 | } | ||
629 | } | 767 | } |
630 | 768 | ||
631 | static void | 769 | static void |
@@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport) | |||
657 | static void | 795 | static void |
658 | fcloop_localport_delete(struct nvme_fc_local_port *localport) | 796 | fcloop_localport_delete(struct nvme_fc_local_port *localport) |
659 | { | 797 | { |
660 | struct fcloop_lport *lport = localport->private; | 798 | struct fcloop_lport_priv *lport_priv = localport->private; |
799 | struct fcloop_lport *lport = lport_priv->lport; | ||
661 | 800 | ||
662 | /* release any threads waiting for the unreg to complete */ | 801 | /* release any threads waiting for the unreg to complete */ |
663 | complete(&lport->unreg_done); | 802 | complete(&lport->unreg_done); |
@@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = { | |||
697 | .max_dif_sgl_segments = FCLOOP_SGL_SEGS, | 836 | .max_dif_sgl_segments = FCLOOP_SGL_SEGS, |
698 | .dma_boundary = FCLOOP_DMABOUND_4G, | 837 | .dma_boundary = FCLOOP_DMABOUND_4G, |
699 | /* sizes of additional private data for data structures */ | 838 | /* sizes of additional private data for data structures */ |
700 | .local_priv_sz = sizeof(struct fcloop_lport), | 839 | .local_priv_sz = sizeof(struct fcloop_lport_priv), |
701 | .remote_priv_sz = sizeof(struct fcloop_rport), | 840 | .remote_priv_sz = sizeof(struct fcloop_rport), |
702 | .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), | 841 | .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), |
703 | .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), | 842 | .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), |
@@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = { | |||
714 | .max_dif_sgl_segments = FCLOOP_SGL_SEGS, | 853 | .max_dif_sgl_segments = FCLOOP_SGL_SEGS, |
715 | .dma_boundary = FCLOOP_DMABOUND_4G, | 854 | .dma_boundary = FCLOOP_DMABOUND_4G, |
716 | /* optional features */ | 855 | /* optional features */ |
717 | .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | | 856 | .target_features = 0, |
718 | NVMET_FCTGTFEAT_OPDONE_IN_ISR, | ||
719 | /* sizes of additional private data for data structures */ | 857 | /* sizes of additional private data for data structures */ |
720 | .target_priv_sz = sizeof(struct fcloop_tport), | 858 | .target_priv_sz = sizeof(struct fcloop_tport), |
721 | }; | 859 | }; |
@@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, | |||
728 | struct fcloop_ctrl_options *opts; | 866 | struct fcloop_ctrl_options *opts; |
729 | struct nvme_fc_local_port *localport; | 867 | struct nvme_fc_local_port *localport; |
730 | struct fcloop_lport *lport; | 868 | struct fcloop_lport *lport; |
731 | int ret; | 869 | struct fcloop_lport_priv *lport_priv; |
870 | unsigned long flags; | ||
871 | int ret = -ENOMEM; | ||
872 | |||
873 | lport = kzalloc(sizeof(*lport), GFP_KERNEL); | ||
874 | if (!lport) | ||
875 | return -ENOMEM; | ||
732 | 876 | ||
733 | opts = kzalloc(sizeof(*opts), GFP_KERNEL); | 877 | opts = kzalloc(sizeof(*opts), GFP_KERNEL); |
734 | if (!opts) | 878 | if (!opts) |
735 | return -ENOMEM; | 879 | goto out_free_lport; |
736 | 880 | ||
737 | ret = fcloop_parse_options(opts, buf); | 881 | ret = fcloop_parse_options(opts, buf); |
738 | if (ret) | 882 | if (ret) |
@@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, | |||
752 | 896 | ||
753 | ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); | 897 | ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); |
754 | if (!ret) { | 898 | if (!ret) { |
755 | unsigned long flags; | ||
756 | |||
757 | /* success */ | 899 | /* success */ |
758 | lport = localport->private; | 900 | lport_priv = localport->private; |
901 | lport_priv->lport = lport; | ||
902 | |||
759 | lport->localport = localport; | 903 | lport->localport = localport; |
760 | INIT_LIST_HEAD(&lport->lport_list); | 904 | INIT_LIST_HEAD(&lport->lport_list); |
761 | 905 | ||
762 | spin_lock_irqsave(&fcloop_lock, flags); | 906 | spin_lock_irqsave(&fcloop_lock, flags); |
763 | list_add_tail(&lport->lport_list, &fcloop_lports); | 907 | list_add_tail(&lport->lport_list, &fcloop_lports); |
764 | spin_unlock_irqrestore(&fcloop_lock, flags); | 908 | spin_unlock_irqrestore(&fcloop_lock, flags); |
765 | |||
766 | /* mark all of the input buffer consumed */ | ||
767 | ret = count; | ||
768 | } | 909 | } |
769 | 910 | ||
770 | out_free_opts: | 911 | out_free_opts: |
771 | kfree(opts); | 912 | kfree(opts); |
913 | out_free_lport: | ||
914 | /* free only if we're going to fail */ | ||
915 | if (ret) | ||
916 | kfree(lport); | ||
917 | |||
772 | return ret ? ret : count; | 918 | return ret ? ret : count; |
773 | } | 919 | } |
774 | 920 | ||
@@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport) | |||
790 | 936 | ||
791 | wait_for_completion(&lport->unreg_done); | 937 | wait_for_completion(&lport->unreg_done); |
792 | 938 | ||
939 | kfree(lport); | ||
940 | |||
793 | return ret; | 941 | return ret; |
794 | } | 942 | } |
795 | 943 | ||
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 1e21b286f299..7991ec3a17db 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c | |||
@@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = { | |||
686 | 686 | ||
687 | static struct nvmf_transport_ops nvme_loop_transport = { | 687 | static struct nvmf_transport_ops nvme_loop_transport = { |
688 | .name = "loop", | 688 | .name = "loop", |
689 | .module = THIS_MODULE, | ||
689 | .create_ctrl = nvme_loop_create_ctrl, | 690 | .create_ctrl = nvme_loop_create_ctrl, |
690 | }; | 691 | }; |
691 | 692 | ||
@@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void) | |||
716 | nvme_delete_ctrl(&ctrl->ctrl); | 717 | nvme_delete_ctrl(&ctrl->ctrl); |
717 | mutex_unlock(&nvme_loop_ctrl_mutex); | 718 | mutex_unlock(&nvme_loop_ctrl_mutex); |
718 | 719 | ||
719 | flush_workqueue(nvme_wq); | 720 | flush_workqueue(nvme_delete_wq); |
720 | } | 721 | } |
721 | 722 | ||
722 | module_init(nvme_loop_init_module); | 723 | module_init(nvme_loop_init_module); |
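Editor's note: the new .module field lets the fabrics core pin the transport for the duration of controller creation instead of racing with module unload. A sketch of that usage; the lookup helper name and control flow are assumptions, not taken from this diff:

	ops = nvmf_lookup_transport(opts);	/* assumed fabrics-side helper */
	if (!ops || !try_module_get(ops->module))
		goto out_unlock;

	ctrl = ops->create_ctrl(dev, opts);
	module_put(ops->module);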
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 49912909c298..978e169c11bf 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c | |||
@@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) | |||
185 | spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); | 185 | spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); |
186 | } | 186 | } |
187 | 187 | ||
188 | static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) | ||
189 | { | ||
190 | struct scatterlist *sg; | ||
191 | int count; | ||
192 | |||
193 | if (!sgl || !nents) | ||
194 | return; | ||
195 | |||
196 | for_each_sg(sgl, sg, nents, count) | ||
197 | __free_page(sg_page(sg)); | ||
198 | kfree(sgl); | ||
199 | } | ||
200 | |||
201 | static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, | ||
202 | u32 length) | ||
203 | { | ||
204 | struct scatterlist *sg; | ||
205 | struct page *page; | ||
206 | unsigned int nent; | ||
207 | int i = 0; | ||
208 | |||
209 | nent = DIV_ROUND_UP(length, PAGE_SIZE); | ||
210 | sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); | ||
211 | if (!sg) | ||
212 | goto out; | ||
213 | |||
214 | sg_init_table(sg, nent); | ||
215 | |||
216 | while (length) { | ||
217 | u32 page_len = min_t(u32, length, PAGE_SIZE); | ||
218 | |||
219 | page = alloc_page(GFP_KERNEL); | ||
220 | if (!page) | ||
221 | goto out_free_pages; | ||
222 | |||
223 | sg_set_page(&sg[i], page, page_len, 0); | ||
224 | length -= page_len; | ||
225 | i++; | ||
226 | } | ||
227 | *sgl = sg; | ||
228 | *nents = nent; | ||
229 | return 0; | ||
230 | |||
231 | out_free_pages: | ||
232 | while (i > 0) { | ||
233 | i--; | ||
234 | __free_page(sg_page(&sg[i])); | ||
235 | } | ||
236 | kfree(sg); | ||
237 | out: | ||
238 | return NVME_SC_INTERNAL; | ||
239 | } | ||
240 | |||
241 | static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, | 188 | static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, |
242 | struct nvmet_rdma_cmd *c, bool admin) | 189 | struct nvmet_rdma_cmd *c, bool admin) |
243 | { | 190 | { |
@@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) | |||
484 | } | 431 | } |
485 | 432 | ||
486 | if (rsp->req.sg != &rsp->cmd->inline_sg) | 433 | if (rsp->req.sg != &rsp->cmd->inline_sg) |
487 | nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); | 434 | sgl_free(rsp->req.sg); |
488 | 435 | ||
489 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) | 436 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) |
490 | nvmet_rdma_process_wr_wait_list(queue); | 437 | nvmet_rdma_process_wr_wait_list(queue); |
@@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, | |||
621 | u32 len = get_unaligned_le24(sgl->length); | 568 | u32 len = get_unaligned_le24(sgl->length); |
622 | u32 key = get_unaligned_le32(sgl->key); | 569 | u32 key = get_unaligned_le32(sgl->key); |
623 | int ret; | 570 | int ret; |
624 | u16 status; | ||
625 | 571 | ||
626 | /* no data command? */ | 572 | /* no data command? */ |
627 | if (!len) | 573 | if (!len) |
628 | return 0; | 574 | return 0; |
629 | 575 | ||
630 | status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, | 576 | rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt); |
631 | len); | 577 | if (!rsp->req.sg) |
632 | if (status) | 578 | return NVME_SC_INTERNAL; |
633 | return status; | ||
634 | 579 | ||
635 | ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, | 580 | ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, |
636 | rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, | 581 | rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, |
@@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) | |||
976 | 921 | ||
977 | static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) | 922 | static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) |
978 | { | 923 | { |
979 | pr_info("freeing queue %d\n", queue->idx); | 924 | pr_debug("freeing queue %d\n", queue->idx); |
980 | 925 | ||
981 | nvmet_sq_destroy(&queue->nvme_sq); | 926 | nvmet_sq_destroy(&queue->nvme_sq); |
982 | 927 | ||
@@ -1558,25 +1503,9 @@ err_ib_client: | |||
1558 | 1503 | ||
1559 | static void __exit nvmet_rdma_exit(void) | 1504 | static void __exit nvmet_rdma_exit(void) |
1560 | { | 1505 | { |
1561 | struct nvmet_rdma_queue *queue; | ||
1562 | |||
1563 | nvmet_unregister_transport(&nvmet_rdma_ops); | 1506 | nvmet_unregister_transport(&nvmet_rdma_ops); |
1564 | |||
1565 | flush_scheduled_work(); | ||
1566 | |||
1567 | mutex_lock(&nvmet_rdma_queue_mutex); | ||
1568 | while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, | ||
1569 | struct nvmet_rdma_queue, queue_list))) { | ||
1570 | list_del_init(&queue->queue_list); | ||
1571 | |||
1572 | mutex_unlock(&nvmet_rdma_queue_mutex); | ||
1573 | __nvmet_rdma_queue_disconnect(queue); | ||
1574 | mutex_lock(&nvmet_rdma_queue_mutex); | ||
1575 | } | ||
1576 | mutex_unlock(&nvmet_rdma_queue_mutex); | ||
1577 | |||
1578 | flush_scheduled_work(); | ||
1579 | ib_unregister_client(&nvmet_rdma_ib_client); | 1507 | ib_unregister_client(&nvmet_rdma_ib_client); |
1508 | WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); | ||
1580 | ida_destroy(&nvmet_rdma_queue_ida); | 1509 | ida_destroy(&nvmet_rdma_queue_ida); |
1581 | } | 1510 | } |
1582 | 1511 | ||
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index e2bc99980f75..4c44d7bed01a 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig | |||
@@ -5,6 +5,7 @@ menuconfig TARGET_CORE | |||
5 | select CONFIGFS_FS | 5 | select CONFIGFS_FS |
6 | select CRC_T10DIF | 6 | select CRC_T10DIF |
7 | select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. | 7 | select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. |
8 | select SGL_ALLOC | ||
8 | default n | 9 | default n |
9 | help | 10 | help |
10 | Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled | 11 | Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled |
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 58caacd54a3b..c03a78ee26cd 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c | |||
@@ -2300,13 +2300,7 @@ queue_full: | |||
2300 | 2300 | ||
2301 | void target_free_sgl(struct scatterlist *sgl, int nents) | 2301 | void target_free_sgl(struct scatterlist *sgl, int nents) |
2302 | { | 2302 | { |
2303 | struct scatterlist *sg; | 2303 | sgl_free_n_order(sgl, nents, 0); |
2304 | int count; | ||
2305 | |||
2306 | for_each_sg(sgl, sg, nents, count) | ||
2307 | __free_page(sg_page(sg)); | ||
2308 | |||
2309 | kfree(sgl); | ||
2310 | } | 2304 | } |
2311 | EXPORT_SYMBOL(target_free_sgl); | 2305 | EXPORT_SYMBOL(target_free_sgl); |
2312 | 2306 | ||
@@ -2414,42 +2408,10 @@ int | |||
2414 | target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, | 2408 | target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, |
2415 | bool zero_page, bool chainable) | 2409 | bool zero_page, bool chainable) |
2416 | { | 2410 | { |
2417 | struct scatterlist *sg; | 2411 | gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0); |
2418 | struct page *page; | ||
2419 | gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0; | ||
2420 | unsigned int nalloc, nent; | ||
2421 | int i = 0; | ||
2422 | |||
2423 | nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE); | ||
2424 | if (chainable) | ||
2425 | nalloc++; | ||
2426 | sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL); | ||
2427 | if (!sg) | ||
2428 | return -ENOMEM; | ||
2429 | 2412 | ||
2430 | sg_init_table(sg, nalloc); | 2413 | *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents); |
2431 | 2414 | return *sgl ? 0 : -ENOMEM; | |
2432 | while (length) { | ||
2433 | u32 page_len = min_t(u32, length, PAGE_SIZE); | ||
2434 | page = alloc_page(GFP_KERNEL | zero_flag); | ||
2435 | if (!page) | ||
2436 | goto out; | ||
2437 | |||
2438 | sg_set_page(&sg[i], page, page_len, 0); | ||
2439 | length -= page_len; | ||
2440 | i++; | ||
2441 | } | ||
2442 | *sgl = sg; | ||
2443 | *nents = nent; | ||
2444 | return 0; | ||
2445 | |||
2446 | out: | ||
2447 | while (i > 0) { | ||
2448 | i--; | ||
2449 | __free_page(sg_page(&sg[i])); | ||
2450 | } | ||
2451 | kfree(sg); | ||
2452 | return -ENOMEM; | ||
2453 | } | 2415 | } |
2454 | EXPORT_SYMBOL(target_alloc_sgl); | 2416 | EXPORT_SYMBOL(target_alloc_sgl); |
2455 | 2417 | ||
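Editor's note: both the nvmet-rdma and target-core conversions above replace open-coded page loops with the new lib/scatterlist helpers declared further down in this pull (include/linux/scatterlist.h). A minimal usage sketch, with the buffer length, device pointer, and DMA direction purely illustrative:

	unsigned int nents;
	struct scatterlist *sgl;

	/* order-0 pages, not chainable; nents receives the entry count */
	sgl = sgl_alloc(length, GFP_KERNEL, &nents);
	if (!sgl)
		return -ENOMEM;

	if (!dma_map_sg(dev, sgl, nents, DMA_FROM_DEVICE)) {
		sgl_free(sgl);
		return -ENOMEM;
	}
	/* ... issue the transfer ... */
	dma_unmap_sg(dev, sgl, nents, DMA_FROM_DEVICE);
	sgl_free(sgl);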
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5982c8a71f02..75610d23d197 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c | |||
@@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, | |||
411 | 411 | ||
412 | static u64 bio_end_offset(struct bio *bio) | 412 | static u64 bio_end_offset(struct bio *bio) |
413 | { | 413 | { |
414 | struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; | 414 | struct bio_vec *last = bio_last_bvec_all(bio); |
415 | 415 | ||
416 | return page_offset(last->bv_page) + last->bv_len + last->bv_offset; | 416 | return page_offset(last->bv_page) + last->bv_len + last->bv_offset; |
417 | } | 417 | } |
@@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
563 | /* we need the actual starting offset of this extent in the file */ | 563 | /* we need the actual starting offset of this extent in the file */ |
564 | read_lock(&em_tree->lock); | 564 | read_lock(&em_tree->lock); |
565 | em = lookup_extent_mapping(em_tree, | 565 | em = lookup_extent_mapping(em_tree, |
566 | page_offset(bio->bi_io_vec->bv_page), | 566 | page_offset(bio_first_page_all(bio)), |
567 | PAGE_SIZE); | 567 | PAGE_SIZE); |
568 | read_unlock(&em_tree->lock); | 568 | read_unlock(&em_tree->lock); |
569 | if (!em) | 569 | if (!em) |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 012d63870b99..d43360b33ef6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -2257,7 +2257,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, | |||
2257 | return 0; | 2257 | return 0; |
2258 | } | 2258 | } |
2259 | 2259 | ||
2260 | bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, | 2260 | bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, |
2261 | struct io_failure_record *failrec, int failed_mirror) | 2261 | struct io_failure_record *failrec, int failed_mirror) |
2262 | { | 2262 | { |
2263 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | 2263 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
@@ -2281,7 +2281,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, | |||
2281 | * a) deliver good data to the caller | 2281 | * a) deliver good data to the caller |
2282 | * b) correct the bad sectors on disk | 2282 | * b) correct the bad sectors on disk |
2283 | */ | 2283 | */ |
2284 | if (failed_bio->bi_vcnt > 1) { | 2284 | if (failed_bio_pages > 1) { |
2285 | /* | 2285 | /* |
2286 | * to fulfill b), we need to know the exact failing sectors, as | 2286 | * to fulfill b), we need to know the exact failing sectors, as |
2287 | * we don't want to rewrite any more than the failed ones. thus, | 2287 | * we don't want to rewrite any more than the failed ones. thus, |
@@ -2374,6 +2374,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, | |||
2374 | int read_mode = 0; | 2374 | int read_mode = 0; |
2375 | blk_status_t status; | 2375 | blk_status_t status; |
2376 | int ret; | 2376 | int ret; |
2377 | unsigned failed_bio_pages = bio_pages_all(failed_bio); | ||
2377 | 2378 | ||
2378 | BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); | 2379 | BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); |
2379 | 2380 | ||
@@ -2381,13 +2382,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, | |||
2381 | if (ret) | 2382 | if (ret) |
2382 | return ret; | 2383 | return ret; |
2383 | 2384 | ||
2384 | if (!btrfs_check_repairable(inode, failed_bio, failrec, | 2385 | if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, |
2385 | failed_mirror)) { | 2386 | failed_mirror)) { |
2386 | free_io_failure(failure_tree, tree, failrec); | 2387 | free_io_failure(failure_tree, tree, failrec); |
2387 | return -EIO; | 2388 | return -EIO; |
2388 | } | 2389 | } |
2389 | 2390 | ||
2390 | if (failed_bio->bi_vcnt > 1) | 2391 | if (failed_bio_pages > 1) |
2391 | read_mode |= REQ_FAILFAST_DEV; | 2392 | read_mode |= REQ_FAILFAST_DEV; |
2392 | 2393 | ||
2393 | phy_offset >>= inode->i_sb->s_blocksize_bits; | 2394 | phy_offset >>= inode->i_sb->s_blocksize_bits; |
@@ -2724,7 +2725,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, | |||
2724 | unsigned long bio_flags) | 2725 | unsigned long bio_flags) |
2725 | { | 2726 | { |
2726 | blk_status_t ret = 0; | 2727 | blk_status_t ret = 0; |
2727 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 2728 | struct bio_vec *bvec = bio_last_bvec_all(bio); |
2728 | struct page *page = bvec->bv_page; | 2729 | struct page *page = bvec->bv_page; |
2729 | struct extent_io_tree *tree = bio->bi_private; | 2730 | struct extent_io_tree *tree = bio->bi_private; |
2730 | u64 start; | 2731 | u64 start; |
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 93dcae0c3183..20854d63c75b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -540,7 +540,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, | |||
540 | u64 end); | 540 | u64 end); |
541 | int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, | 541 | int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, |
542 | struct io_failure_record **failrec_ret); | 542 | struct io_failure_record **failrec_ret); |
543 | bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, | 543 | bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, |
544 | struct io_failure_record *failrec, int fail_mirror); | 544 | struct io_failure_record *failrec, int fail_mirror); |
545 | struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, | 545 | struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, |
546 | struct io_failure_record *failrec, | 546 | struct io_failure_record *failrec, |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1a7f3cb5be9..cb1e2d201434 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -8015,6 +8015,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, | |||
8015 | int segs; | 8015 | int segs; |
8016 | int ret; | 8016 | int ret; |
8017 | blk_status_t status; | 8017 | blk_status_t status; |
8018 | struct bio_vec bvec; | ||
8018 | 8019 | ||
8019 | BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); | 8020 | BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); |
8020 | 8021 | ||
@@ -8030,8 +8031,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, | |||
8030 | } | 8031 | } |
8031 | 8032 | ||
8032 | segs = bio_segments(failed_bio); | 8033 | segs = bio_segments(failed_bio); |
8034 | bio_get_first_bvec(failed_bio, &bvec); | ||
8033 | if (segs > 1 || | 8035 | if (segs > 1 || |
8034 | (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) | 8036 | (bvec.bv_len > btrfs_inode_sectorsize(inode))) |
8035 | read_mode |= REQ_FAILFAST_DEV; | 8037 | read_mode |= REQ_FAILFAST_DEV; |
8036 | 8038 | ||
8037 | isector = start - btrfs_io_bio(failed_bio)->logical; | 8039 | isector = start - btrfs_io_bio(failed_bio)->logical; |
@@ -8074,7 +8076,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) | |||
8074 | ASSERT(bio->bi_vcnt == 1); | 8076 | ASSERT(bio->bi_vcnt == 1); |
8075 | io_tree = &BTRFS_I(inode)->io_tree; | 8077 | io_tree = &BTRFS_I(inode)->io_tree; |
8076 | failure_tree = &BTRFS_I(inode)->io_failure_tree; | 8078 | failure_tree = &BTRFS_I(inode)->io_failure_tree; |
8077 | ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); | 8079 | ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); |
8078 | 8080 | ||
8079 | done->uptodate = 1; | 8081 | done->uptodate = 1; |
8080 | ASSERT(!bio_flagged(bio, BIO_CLONED)); | 8082 | ASSERT(!bio_flagged(bio, BIO_CLONED)); |
@@ -8164,7 +8166,7 @@ static void btrfs_retry_endio(struct bio *bio) | |||
8164 | uptodate = 1; | 8166 | uptodate = 1; |
8165 | 8167 | ||
8166 | ASSERT(bio->bi_vcnt == 1); | 8168 | ASSERT(bio->bi_vcnt == 1); |
8167 | ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); | 8169 | ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); |
8168 | 8170 | ||
8169 | io_tree = &BTRFS_I(inode)->io_tree; | 8171 | io_tree = &BTRFS_I(inode)->io_tree; |
8170 | failure_tree = &BTRFS_I(inode)->io_failure_tree; | 8172 | failure_tree = &BTRFS_I(inode)->io_failure_tree; |
diff --git a/fs/buffer.c b/fs/buffer.c index 0736a6a2e2f0..8b26295a56fe 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -3014,7 +3014,7 @@ static void end_bio_bh_io_sync(struct bio *bio) | |||
3014 | void guard_bio_eod(int op, struct bio *bio) | 3014 | void guard_bio_eod(int op, struct bio *bio) |
3015 | { | 3015 | { |
3016 | sector_t maxsector; | 3016 | sector_t maxsector; |
3017 | struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; | 3017 | struct bio_vec *bvec = bio_last_bvec_all(bio); |
3018 | unsigned truncated_bytes; | 3018 | unsigned truncated_bytes; |
3019 | struct hd_struct *part; | 3019 | struct hd_struct *part; |
3020 | 3020 | ||
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 516fa0d3ff9c..455f086cce3d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
@@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio) | |||
56 | int i; | 56 | int i; |
57 | 57 | ||
58 | #ifdef CONFIG_F2FS_FAULT_INJECTION | 58 | #ifdef CONFIG_F2FS_FAULT_INJECTION |
59 | if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { | 59 | if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { |
60 | f2fs_show_injection_info(FAULT_IO); | 60 | f2fs_show_injection_info(FAULT_IO); |
61 | bio->bi_status = BLK_STS_IOERR; | 61 | bio->bi_status = BLK_STS_IOERR; |
62 | } | 62 | } |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cea4836385b7..d4d04fee568a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) | |||
126 | * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list | 126 | * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list |
127 | * @inode: inode to be moved | 127 | * @inode: inode to be moved |
128 | * @wb: target bdi_writeback | 128 | * @wb: target bdi_writeback |
129 | * @head: one of @wb->b_{dirty|io|more_io} | 129 | * @head: one of @wb->b_{dirty|io|more_io|dirty_time} |
130 | * | 130 | * |
131 | * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. | 131 | * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. |
132 | * Returns %true if @inode is the first occupant of the !dirty_time IO | 132 | * Returns %true if @inode is the first occupant of the !dirty_time IO |
diff --git a/include/linux/bio.h b/include/linux/bio.h index 23d29b39f71e..d0eb659fa733 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
@@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) | |||
300 | bv->bv_len = iter.bi_bvec_done; | 300 | bv->bv_len = iter.bi_bvec_done; |
301 | } | 301 | } |
302 | 302 | ||
303 | static inline unsigned bio_pages_all(struct bio *bio) | ||
304 | { | ||
305 | WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); | ||
306 | return bio->bi_vcnt; | ||
307 | } | ||
308 | |||
309 | static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) | ||
310 | { | ||
311 | WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); | ||
312 | return bio->bi_io_vec; | ||
313 | } | ||
314 | |||
315 | static inline struct page *bio_first_page_all(struct bio *bio) | ||
316 | { | ||
317 | return bio_first_bvec_all(bio)->bv_page; | ||
318 | } | ||
319 | |||
320 | static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) | ||
321 | { | ||
322 | WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); | ||
323 | return &bio->bi_io_vec[bio->bi_vcnt - 1]; | ||
324 | } | ||
325 | |||
303 | enum bip_flags { | 326 | enum bip_flags { |
304 | BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ | 327 | BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ |
305 | BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ | 328 | BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ |
@@ -477,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) | |||
477 | #endif | 500 | #endif |
478 | 501 | ||
479 | extern void bio_copy_data(struct bio *dst, struct bio *src); | 502 | extern void bio_copy_data(struct bio *dst, struct bio *src); |
480 | extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
481 | extern void bio_free_pages(struct bio *bio); | 503 | extern void bio_free_pages(struct bio *bio); |
482 | 504 | ||
483 | extern struct bio *bio_copy_user_iov(struct request_queue *, | 505 | extern struct bio *bio_copy_user_iov(struct request_queue *, |
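Editor's note: the btrfs, buffer.c, and f2fs hunks above are mechanical conversions to these accessors; the point is that direct bi_io_vec/bi_vcnt indexing, which is only valid on non-cloned bios, now goes through helpers that warn when misused. Before/after for the last-bvec case:

	struct bio_vec *last;
	struct page *first_page;

	/* old: open-coded indexing, silently wrong on a cloned bio */
	last = &bio->bi_io_vec[bio->bi_vcnt - 1];

	/* new: same result, plus a one-time warning if BIO_CLONED is set */
	last = bio_last_bvec_all(bio);
	first_page = bio_first_page_all(bio);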
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e9825ff57b15..69bea82ebeb1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h | |||
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) | |||
660 | static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, | 660 | static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, |
661 | struct blkg_rwstat *from) | 661 | struct blkg_rwstat *from) |
662 | { | 662 | { |
663 | struct blkg_rwstat v = blkg_rwstat_read(from); | 663 | u64 sum[BLKG_RWSTAT_NR]; |
664 | int i; | 664 | int i; |
665 | 665 | ||
666 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 666 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
667 | atomic64_add(atomic64_read(&v.aux_cnt[i]) + | 667 | sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); |
668 | atomic64_read(&from->aux_cnt[i]), | 668 | |
669 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
670 | atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), | ||
669 | &to->aux_cnt[i]); | 671 | &to->aux_cnt[i]); |
670 | } | 672 | } |
671 | 673 | ||
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c862e2..8efcf49796a3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { | |||
51 | unsigned int queue_num; | 51 | unsigned int queue_num; |
52 | 52 | ||
53 | atomic_t nr_active; | 53 | atomic_t nr_active; |
54 | unsigned int nr_expired; | ||
54 | 55 | ||
55 | struct hlist_node cpuhp_dead; | 56 | struct hlist_node cpuhp_dead; |
56 | struct kobject kobj; | 57 | struct kobject kobj; |
@@ -65,7 +66,7 @@ struct blk_mq_hw_ctx { | |||
65 | #endif | 66 | #endif |
66 | 67 | ||
67 | /* Must be the last member - see also blk_mq_hw_ctx_size(). */ | 68 | /* Must be the last member - see also blk_mq_hw_ctx_size(). */ |
68 | struct srcu_struct queue_rq_srcu[0]; | 69 | struct srcu_struct srcu[0]; |
69 | }; | 70 | }; |
70 | 71 | ||
71 | struct blk_mq_tag_set { | 72 | struct blk_mq_tag_set { |
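Editor's note: the rename of queue_rq_srcu to srcu reflects that the SRCU domain now protects more than ->queue_rq, and nr_expired is bookkeeping for the reworked timeout handling. A rough sketch of the locking helper this member backs, modelled on blk-mq's hctx_lock() rather than copied from it:

	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		rcu_read_lock();
	} else {
		/* blocking ->queue_rq implementations use the per-hctx SRCU domain */
		*srcu_idx = srcu_read_lock(hctx->srcu);
	}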
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9e7d8bd776d2..c5d3db0d83f8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t; | |||
39 | 39 | ||
40 | #define BLK_STS_AGAIN ((__force blk_status_t)12) | 40 | #define BLK_STS_AGAIN ((__force blk_status_t)12) |
41 | 41 | ||
42 | /** | ||
43 | * blk_path_error - returns true if error may be path related | ||
44 | * @error: status the request was completed with | ||
45 | * | ||
46 | * Description: | ||
47 | * This classifies block error status into non-retryable errors and ones | ||
48 | * that may be successful if retried on a failover path. | ||
49 | * | ||
50 | * Return: | ||
51 | * %false - retrying failover path will not help | ||
52 | * %true - may succeed if retried | ||
53 | */ | ||
54 | static inline bool blk_path_error(blk_status_t error) | ||
55 | { | ||
56 | switch (error) { | ||
57 | case BLK_STS_NOTSUPP: | ||
58 | case BLK_STS_NOSPC: | ||
59 | case BLK_STS_TARGET: | ||
60 | case BLK_STS_NEXUS: | ||
61 | case BLK_STS_MEDIUM: | ||
62 | case BLK_STS_PROTECTION: | ||
63 | return false; | ||
64 | } | ||
65 | |||
66 | /* Anything else could be a path failure, so should be retried */ | ||
67 | return true; | ||
68 | } | ||
69 | |||
42 | struct blk_issue_stat { | 70 | struct blk_issue_stat { |
43 | u64 stat; | 71 | u64 stat; |
44 | }; | 72 | }; |
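Editor's note: a short sketch of the intended consumer of blk_path_error(): a multipath completion handler deciding whether a failed request is worth retrying on another path. The requeue helper is hypothetical; only blk_path_error() and blk_mq_end_request() are real interfaces here:

	static void mpath_end_request(struct request *rq, blk_status_t error)
	{
		if (blk_path_error(error)) {
			/* may succeed elsewhere: hand back for failover */
			mpath_requeue_on_other_path(rq);	/* hypothetical */
			return;
		}
		/* not path related, retrying will not help */
		blk_mq_end_request(rq, error);
	}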
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ce8a372d506..4f3df807cf8f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/percpu-refcount.h> | 27 | #include <linux/percpu-refcount.h> |
28 | #include <linux/scatterlist.h> | 28 | #include <linux/scatterlist.h> |
29 | #include <linux/blkzoned.h> | 29 | #include <linux/blkzoned.h> |
30 | #include <linux/seqlock.h> | ||
31 | #include <linux/u64_stats_sync.h> | ||
30 | 32 | ||
31 | struct module; | 33 | struct module; |
32 | struct scsi_ioctl_command; | 34 | struct scsi_ioctl_command; |
@@ -121,6 +123,12 @@ typedef __u32 __bitwise req_flags_t; | |||
121 | /* Look at ->special_vec for the actual data payload instead of the | 123 | /* Look at ->special_vec for the actual data payload instead of the |
122 | bio chain. */ | 124 | bio chain. */ |
123 | #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) | 125 | #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) |
126 | /* The per-zone write lock is held for this request */ | ||
127 | #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) | ||
128 | /* timeout is expired */ | ||
129 | #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) | ||
130 | /* already slept for hybrid poll */ | ||
131 | #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) | ||
124 | 132 | ||
125 | /* flags that prevent us from merging requests: */ | 133 | /* flags that prevent us from merging requests: */ |
126 | #define RQF_NOMERGE_FLAGS \ | 134 | #define RQF_NOMERGE_FLAGS \ |
@@ -133,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; | |||
133 | * especially blk_mq_rq_ctx_init() to take care of the added fields. | 141 | * especially blk_mq_rq_ctx_init() to take care of the added fields. |
134 | */ | 142 | */ |
135 | struct request { | 143 | struct request { |
136 | struct list_head queuelist; | ||
137 | union { | ||
138 | struct __call_single_data csd; | ||
139 | u64 fifo_time; | ||
140 | }; | ||
141 | |||
142 | struct request_queue *q; | 144 | struct request_queue *q; |
143 | struct blk_mq_ctx *mq_ctx; | 145 | struct blk_mq_ctx *mq_ctx; |
144 | 146 | ||
@@ -148,8 +150,6 @@ struct request { | |||
148 | 150 | ||
149 | int internal_tag; | 151 | int internal_tag; |
150 | 152 | ||
151 | unsigned long atomic_flags; | ||
152 | |||
153 | /* the following two fields are internal, NEVER access directly */ | 153 | /* the following two fields are internal, NEVER access directly */ |
154 | unsigned int __data_len; /* total data len */ | 154 | unsigned int __data_len; /* total data len */ |
155 | int tag; | 155 | int tag; |
@@ -158,6 +158,8 @@ struct request { | |||
158 | struct bio *bio; | 158 | struct bio *bio; |
159 | struct bio *biotail; | 159 | struct bio *biotail; |
160 | 160 | ||
161 | struct list_head queuelist; | ||
162 | |||
161 | /* | 163 | /* |
162 | * The hash is used inside the scheduler, and killed once the | 164 | * The hash is used inside the scheduler, and killed once the |
163 | * request reaches the dispatch list. The ipi_list is only used | 165 | * request reaches the dispatch list. The ipi_list is only used |
@@ -205,19 +207,16 @@ struct request { | |||
205 | struct hd_struct *part; | 207 | struct hd_struct *part; |
206 | unsigned long start_time; | 208 | unsigned long start_time; |
207 | struct blk_issue_stat issue_stat; | 209 | struct blk_issue_stat issue_stat; |
208 | #ifdef CONFIG_BLK_CGROUP | ||
209 | struct request_list *rl; /* rl this rq is alloced from */ | ||
210 | unsigned long long start_time_ns; | ||
211 | unsigned long long io_start_time_ns; /* when passed to hardware */ | ||
212 | #endif | ||
213 | /* Number of scatter-gather DMA addr+len pairs after | 210 | /* Number of scatter-gather DMA addr+len pairs after |
214 | * physical address coalescing is performed. | 211 | * physical address coalescing is performed. |
215 | */ | 212 | */ |
216 | unsigned short nr_phys_segments; | 213 | unsigned short nr_phys_segments; |
214 | |||
217 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 215 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
218 | unsigned short nr_integrity_segments; | 216 | unsigned short nr_integrity_segments; |
219 | #endif | 217 | #endif |
220 | 218 | ||
219 | unsigned short write_hint; | ||
221 | unsigned short ioprio; | 220 | unsigned short ioprio; |
222 | 221 | ||
223 | unsigned int timeout; | 222 | unsigned int timeout; |
@@ -226,11 +225,37 @@ struct request { | |||
226 | 225 | ||
227 | unsigned int extra_len; /* length of alignment and padding */ | 226 | unsigned int extra_len; /* length of alignment and padding */ |
228 | 227 | ||
229 | unsigned short write_hint; | 228 | /* |
229 | * On blk-mq, the lower bits of ->gstate (generation number and | ||
230 | * state) carry the MQ_RQ_* state value and the upper bits the | ||
231 | * generation number which is monotonically incremented and used to | ||
232 | * distinguish the reuse instances. | ||
233 | * | ||
234 | * ->gstate_seq allows updates to ->gstate and other fields | ||
235 | * (currently ->deadline) during request start to be read | ||
236 | * atomically from the timeout path, so that it can operate on a | ||
237 | * coherent set of information. | ||
238 | */ | ||
239 | seqcount_t gstate_seq; | ||
240 | u64 gstate; | ||
241 | |||
242 | /* | ||
243 | * ->aborted_gstate is used by the timeout to claim a specific | ||
244 | * recycle instance of this request. See blk_mq_timeout_work(). | ||
245 | */ | ||
246 | struct u64_stats_sync aborted_gstate_sync; | ||
247 | u64 aborted_gstate; | ||
248 | |||
249 | /* access through blk_rq_set_deadline, blk_rq_deadline */ | ||
250 | unsigned long __deadline; | ||
230 | 251 | ||
231 | unsigned long deadline; | ||
232 | struct list_head timeout_list; | 252 | struct list_head timeout_list; |
233 | 253 | ||
254 | union { | ||
255 | struct __call_single_data csd; | ||
256 | u64 fifo_time; | ||
257 | }; | ||
258 | |||
234 | /* | 259 | /* |
235 | * completion callback. | 260 | * completion callback. |
236 | */ | 261 | */ |
@@ -239,6 +264,12 @@ struct request { | |||
239 | 264 | ||
240 | /* for bidi */ | 265 | /* for bidi */ |
241 | struct request *next_rq; | 266 | struct request *next_rq; |
267 | |||
268 | #ifdef CONFIG_BLK_CGROUP | ||
269 | struct request_list *rl; /* rl this rq is alloced from */ | ||
270 | unsigned long long start_time_ns; | ||
271 | unsigned long long io_start_time_ns; /* when passed to hardware */ | ||
272 | #endif | ||
242 | }; | 273 | }; |
243 | 274 | ||
244 | static inline bool blk_op_is_scsi(unsigned int op) | 275 | static inline bool blk_op_is_scsi(unsigned int op) |
@@ -564,6 +595,22 @@ struct request_queue { | |||
564 | struct queue_limits limits; | 595 | struct queue_limits limits; |
565 | 596 | ||
566 | /* | 597 | /* |
598 | * Zoned block device information for request dispatch control. | ||
599 | * nr_zones is the total number of zones of the device. This is always | ||
600 | * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones | ||
601 | * bits which indicates if a zone is conventional (bit clear) or | ||
602 | * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones | ||
603 | * bits which indicates if a zone is write locked, that is, if a write | ||
604 | * request targeting the zone was dispatched. All three fields are | ||
605 | * initialized by the low level device driver (e.g. scsi/sd.c). | ||
606 | * Stacking drivers (device mappers) may or may not initialize | ||
607 | * these fields. | ||
608 | */ | ||
609 | unsigned int nr_zones; | ||
610 | unsigned long *seq_zones_bitmap; | ||
611 | unsigned long *seq_zones_wlock; | ||
612 | |||
613 | /* | ||
567 | * sg stuff | 614 | * sg stuff |
568 | */ | 615 | */ |
569 | unsigned int sg_timeout; | 616 | unsigned int sg_timeout; |
@@ -807,6 +854,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) | |||
807 | return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; | 854 | return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; |
808 | } | 855 | } |
809 | 856 | ||
857 | static inline unsigned int blk_queue_nr_zones(struct request_queue *q) | ||
858 | { | ||
859 | return q->nr_zones; | ||
860 | } | ||
861 | |||
862 | static inline unsigned int blk_queue_zone_no(struct request_queue *q, | ||
863 | sector_t sector) | ||
864 | { | ||
865 | if (!blk_queue_is_zoned(q)) | ||
866 | return 0; | ||
867 | return sector >> ilog2(q->limits.chunk_sectors); | ||
868 | } | ||
869 | |||
870 | static inline bool blk_queue_zone_is_seq(struct request_queue *q, | ||
871 | sector_t sector) | ||
872 | { | ||
873 | if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) | ||
874 | return false; | ||
875 | return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); | ||
876 | } | ||
877 | |||
810 | static inline bool rq_is_sync(struct request *rq) | 878 | static inline bool rq_is_sync(struct request *rq) |
811 | { | 879 | { |
812 | return op_is_sync(rq->cmd_flags); | 880 | return op_is_sync(rq->cmd_flags); |
@@ -1046,6 +1114,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) | |||
1046 | return blk_rq_cur_bytes(rq) >> 9; | 1114 | return blk_rq_cur_bytes(rq) >> 9; |
1047 | } | 1115 | } |
1048 | 1116 | ||
1117 | static inline unsigned int blk_rq_zone_no(struct request *rq) | ||
1118 | { | ||
1119 | return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); | ||
1120 | } | ||
1121 | |||
1122 | static inline unsigned int blk_rq_zone_is_seq(struct request *rq) | ||
1123 | { | ||
1124 | return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); | ||
1125 | } | ||
1126 | |||
1049 | /* | 1127 | /* |
1050 | * Some commands like WRITE SAME have a payload or data transfer size which | 1128 | * Some commands like WRITE SAME have a payload or data transfer size which |
1051 | * is different from the size of the request. Any driver that supports such | 1129 | * is different from the size of the request. Any driver that supports such |
@@ -1595,7 +1673,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) | |||
1595 | 1673 | ||
1596 | if (q) | 1674 | if (q) |
1597 | return blk_queue_zone_sectors(q); | 1675 | return blk_queue_zone_sectors(q); |
1676 | return 0; | ||
1677 | } | ||
1678 | |||
1679 | static inline unsigned int bdev_nr_zones(struct block_device *bdev) | ||
1680 | { | ||
1681 | struct request_queue *q = bdev_get_queue(bdev); | ||
1598 | 1682 | ||
1683 | if (q) | ||
1684 | return blk_queue_nr_zones(q); | ||
1599 | return 0; | 1685 | return 0; |
1600 | } | 1686 | } |
1601 | 1687 | ||
@@ -1731,8 +1817,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) | |||
1731 | 1817 | ||
1732 | int kblockd_schedule_work(struct work_struct *work); | 1818 | int kblockd_schedule_work(struct work_struct *work); |
1733 | int kblockd_schedule_work_on(int cpu, struct work_struct *work); | 1819 | int kblockd_schedule_work_on(int cpu, struct work_struct *work); |
1734 | int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); | ||
1735 | int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); | ||
1736 | int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); | 1820 | int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); |
1737 | 1821 | ||
1738 | #ifdef CONFIG_BLK_CGROUP | 1822 | #ifdef CONFIG_BLK_CGROUP |
@@ -1971,6 +2055,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, | |||
1971 | extern int bdev_read_page(struct block_device *, sector_t, struct page *); | 2055 | extern int bdev_read_page(struct block_device *, sector_t, struct page *); |
1972 | extern int bdev_write_page(struct block_device *, sector_t, struct page *, | 2056 | extern int bdev_write_page(struct block_device *, sector_t, struct page *, |
1973 | struct writeback_control *); | 2057 | struct writeback_control *); |
2058 | |||
2059 | #ifdef CONFIG_BLK_DEV_ZONED | ||
2060 | bool blk_req_needs_zone_write_lock(struct request *rq); | ||
2061 | void __blk_req_zone_write_lock(struct request *rq); | ||
2062 | void __blk_req_zone_write_unlock(struct request *rq); | ||
2063 | |||
2064 | static inline void blk_req_zone_write_lock(struct request *rq) | ||
2065 | { | ||
2066 | if (blk_req_needs_zone_write_lock(rq)) | ||
2067 | __blk_req_zone_write_lock(rq); | ||
2068 | } | ||
2069 | |||
2070 | static inline void blk_req_zone_write_unlock(struct request *rq) | ||
2071 | { | ||
2072 | if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) | ||
2073 | __blk_req_zone_write_unlock(rq); | ||
2074 | } | ||
2075 | |||
2076 | static inline bool blk_req_zone_is_write_locked(struct request *rq) | ||
2077 | { | ||
2078 | return rq->q->seq_zones_wlock && | ||
2079 | test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); | ||
2080 | } | ||
2081 | |||
2082 | static inline bool blk_req_can_dispatch_to_zone(struct request *rq) | ||
2083 | { | ||
2084 | if (!blk_req_needs_zone_write_lock(rq)) | ||
2085 | return true; | ||
2086 | return !blk_req_zone_is_write_locked(rq); | ||
2087 | } | ||
2088 | #else | ||
2089 | static inline bool blk_req_needs_zone_write_lock(struct request *rq) | ||
2090 | { | ||
2091 | return false; | ||
2092 | } | ||
2093 | |||
2094 | static inline void blk_req_zone_write_lock(struct request *rq) | ||
2095 | { | ||
2096 | } | ||
2097 | |||
2098 | static inline void blk_req_zone_write_unlock(struct request *rq) | ||
2099 | { | ||
2100 | } | ||
2101 | static inline bool blk_req_zone_is_write_locked(struct request *rq) | ||
2102 | { | ||
2103 | return false; | ||
2104 | } | ||
2105 | |||
2106 | static inline bool blk_req_can_dispatch_to_zone(struct request *rq) | ||
2107 | { | ||
2108 | return true; | ||
2109 | } | ||
2110 | #endif /* CONFIG_BLK_DEV_ZONED */ | ||
2111 | |||
1974 | #else /* CONFIG_BLOCK */ | 2112 | #else /* CONFIG_BLOCK */ |
1975 | 2113 | ||
1976 | struct block_device; | 2114 | struct block_device; |
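Editor's note: a condensed sketch of how a dispatch path is expected to use the zone write-lock helpers, loosely modelled on the deadline/mq-deadline changes elsewhere in this pull rather than copied from them:

	static struct request *deadline_try_zoned_dispatch(struct request *rq)
	{
		if (!blk_req_can_dispatch_to_zone(rq))
			return NULL;	/* target zone already write-locked */

		/* only write requests to sequential zones actually take the lock */
		blk_req_zone_write_lock(rq);
		return rq;
	}

	/* blk_req_zone_write_unlock(rq) is then issued on completion or requeue */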
diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ec8a4d7af6bd..fe7a22dd133b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h | |||
@@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv, | |||
125 | ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ | 125 | ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ |
126 | bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) | 126 | bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) |
127 | 127 | ||
128 | /* for iterating one bio from start to end */ | ||
129 | #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ | ||
130 | { \ | ||
131 | .bi_sector = 0, \ | ||
132 | .bi_size = UINT_MAX, \ | ||
133 | .bi_idx = 0, \ | ||
134 | .bi_bvec_done = 0, \ | ||
135 | } | ||
136 | |||
128 | #endif /* __LINUX_BVEC_ITER_H */ | 137 | #endif /* __LINUX_BVEC_ITER_H */ |
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3dc532..6d9e230dffd2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); | |||
198 | extern void elv_requeue_request(struct request_queue *, struct request *); | 198 | extern void elv_requeue_request(struct request_queue *, struct request *); |
199 | extern struct request *elv_former_request(struct request_queue *, struct request *); | 199 | extern struct request *elv_former_request(struct request_queue *, struct request *); |
200 | extern struct request *elv_latter_request(struct request_queue *, struct request *); | 200 | extern struct request *elv_latter_request(struct request_queue *, struct request *); |
201 | extern int elv_register_queue(struct request_queue *q); | ||
202 | extern void elv_unregister_queue(struct request_queue *q); | ||
203 | extern int elv_may_queue(struct request_queue *, unsigned int); | 201 | extern int elv_may_queue(struct request_queue *, unsigned int); |
204 | extern void elv_completed_request(struct request_queue *, struct request *); | 202 | extern void elv_completed_request(struct request_queue *, struct request *); |
205 | extern int elv_set_request(struct request_queue *q, struct request *rq, | 203 | extern int elv_set_request(struct request_queue *q, struct request *rq, |
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe046c9..5e3531027b51 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) | |||
395 | { | 395 | { |
396 | device_add_disk(NULL, disk); | 396 | device_add_disk(NULL, disk); |
397 | } | 397 | } |
398 | extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); | ||
399 | static inline void add_disk_no_queue_reg(struct gendisk *disk) | ||
400 | { | ||
401 | device_add_disk_no_queue_reg(NULL, disk); | ||
402 | } | ||
398 | 403 | ||
399 | extern void del_gendisk(struct gendisk *gp); | 404 | extern void del_gendisk(struct gendisk *gp); |
400 | extern struct gendisk *get_gendisk(dev_t dev, int *partno); | 405 | extern struct gendisk *get_gendisk(dev_t dev, int *partno); |
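Editor's note: this is the hook the dm rework in this pull relies on: make the disk visible first, and register the request_queue with sysfs only once its type (bio-based vs request-based) is settled. A two-step sketch, assuming blk_register_queue() is reachable from the driver as in that series:

	add_disk_no_queue_reg(disk);
	/* ... finish queue setup: elevator, stats, sysfs attributes ... */
	blk_register_queue(disk);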
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2d1d9de06728..7f4b60abdf27 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h | |||
@@ -50,10 +50,7 @@ struct nvm_id; | |||
50 | struct nvm_dev; | 50 | struct nvm_dev; |
51 | struct nvm_tgt_dev; | 51 | struct nvm_tgt_dev; |
52 | 52 | ||
53 | typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); | ||
54 | typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); | 53 | typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); |
55 | typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, | ||
56 | nvm_l2p_update_fn *, void *); | ||
57 | typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); | 54 | typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); |
58 | typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); | 55 | typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); |
59 | typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); | 56 | typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); |
@@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); | |||
66 | 63 | ||
67 | struct nvm_dev_ops { | 64 | struct nvm_dev_ops { |
68 | nvm_id_fn *identity; | 65 | nvm_id_fn *identity; |
69 | nvm_get_l2p_tbl_fn *get_l2p_tbl; | ||
70 | nvm_op_bb_tbl_fn *get_bb_tbl; | 66 | nvm_op_bb_tbl_fn *get_bb_tbl; |
71 | nvm_op_set_bb_fn *set_bb_tbl; | 67 | nvm_op_set_bb_fn *set_bb_tbl; |
72 | 68 | ||
@@ -112,8 +108,6 @@ enum { | |||
112 | NVM_RSP_WARN_HIGHECC = 0x4700, | 108 | NVM_RSP_WARN_HIGHECC = 0x4700, |
113 | 109 | ||
114 | /* Device opcodes */ | 110 | /* Device opcodes */ |
115 | NVM_OP_HBREAD = 0x02, | ||
116 | NVM_OP_HBWRITE = 0x81, | ||
117 | NVM_OP_PWRITE = 0x91, | 111 | NVM_OP_PWRITE = 0x91, |
118 | NVM_OP_PREAD = 0x92, | 112 | NVM_OP_PREAD = 0x92, |
119 | NVM_OP_ERASE = 0x90, | 113 | NVM_OP_ERASE = 0x90, |
@@ -165,12 +159,16 @@ struct nvm_id_group { | |||
165 | u8 fmtype; | 159 | u8 fmtype; |
166 | u8 num_ch; | 160 | u8 num_ch; |
167 | u8 num_lun; | 161 | u8 num_lun; |
168 | u8 num_pln; | 162 | u16 num_chk; |
169 | u16 num_blk; | 163 | u16 clba; |
170 | u16 num_pg; | ||
171 | u16 fpg_sz; | ||
172 | u16 csecs; | 164 | u16 csecs; |
173 | u16 sos; | 165 | u16 sos; |
166 | |||
167 | u16 ws_min; | ||
168 | u16 ws_opt; | ||
169 | u16 ws_seq; | ||
170 | u16 ws_per_chk; | ||
171 | |||
174 | u32 trdt; | 172 | u32 trdt; |
175 | u32 trdm; | 173 | u32 trdm; |
176 | u32 tprt; | 174 | u32 tprt; |
@@ -181,7 +179,10 @@ struct nvm_id_group { | |||
181 | u32 mccap; | 179 | u32 mccap; |
182 | u16 cpar; | 180 | u16 cpar; |
183 | 181 | ||
184 | struct nvm_id_lp_tbl lptbl; | 182 | /* 1.2 compatibility */ |
183 | u8 num_pln; | ||
184 | u16 num_pg; | ||
185 | u16 fpg_sz; | ||
185 | }; | 186 | }; |
186 | 187 | ||
187 | struct nvm_addr_format { | 188 | struct nvm_addr_format { |
@@ -217,6 +218,10 @@ struct nvm_target { | |||
217 | 218 | ||
218 | #define ADDR_EMPTY (~0ULL) | 219 | #define ADDR_EMPTY (~0ULL) |
219 | 220 | ||
221 | #define NVM_TARGET_DEFAULT_OP (101) | ||
222 | #define NVM_TARGET_MIN_OP (3) | ||
223 | #define NVM_TARGET_MAX_OP (80) | ||
224 | |||
220 | #define NVM_VERSION_MAJOR 1 | 225 | #define NVM_VERSION_MAJOR 1 |
221 | #define NVM_VERSION_MINOR 0 | 226 | #define NVM_VERSION_MINOR 0 |
222 | #define NVM_VERSION_PATCH 0 | 227 | #define NVM_VERSION_PATCH 0 |
@@ -239,7 +244,6 @@ struct nvm_rq { | |||
239 | void *meta_list; | 244 | void *meta_list; |
240 | dma_addr_t dma_meta_list; | 245 | dma_addr_t dma_meta_list; |
241 | 246 | ||
242 | struct completion *wait; | ||
243 | nvm_end_io_fn *end_io; | 247 | nvm_end_io_fn *end_io; |
244 | 248 | ||
245 | uint8_t opcode; | 249 | uint8_t opcode; |
@@ -268,31 +272,38 @@ enum { | |||
268 | NVM_BLK_ST_BAD = 0x8, /* Bad block */ | 272 | NVM_BLK_ST_BAD = 0x8, /* Bad block */ |
269 | }; | 273 | }; |
270 | 274 | ||
275 | |||
271 | /* Device generic information */ | 276 | /* Device generic information */ |
272 | struct nvm_geo { | 277 | struct nvm_geo { |
278 | /* generic geometry */ | ||
273 | int nr_chnls; | 279 | int nr_chnls; |
274 | int nr_luns; | 280 | int all_luns; /* across channels */ |
275 | int luns_per_chnl; /* -1 if channels are not symmetric */ | 281 | int nr_luns; /* per channel */ |
276 | int nr_planes; | 282 | int nr_chks; /* per lun */ |
277 | int sec_per_pg; /* only sectors for a single page */ | 283 | |
278 | int pgs_per_blk; | ||
279 | int blks_per_lun; | ||
280 | int fpg_size; | ||
281 | int pfpg_size; /* size of buffer if all pages are to be read */ | ||
282 | int sec_size; | 284 | int sec_size; |
283 | int oob_size; | 285 | int oob_size; |
284 | int mccap; | 286 | int mccap; |
285 | struct nvm_addr_format ppaf; | ||
286 | 287 | ||
287 | /* Calculated/Cached values. These do not reflect the actual usable | 288 | int sec_per_chk; |
288 | * blocks at run-time. | 289 | int sec_per_lun; |
289 | */ | 290 | |
291 | int ws_min; | ||
292 | int ws_opt; | ||
293 | int ws_seq; | ||
294 | int ws_per_chk; | ||
295 | |||
290 | int max_rq_size; | 296 | int max_rq_size; |
291 | int plane_mode; /* drive device in single, double or quad mode */ | ||
292 | 297 | ||
298 | int op; | ||
299 | |||
300 | struct nvm_addr_format ppaf; | ||
301 | |||
302 | /* Legacy 1.2 specific geometry */ | ||
303 | int plane_mode; /* drive device in single, double or quad mode */ | ||
304 | int nr_planes; | ||
305 | int sec_per_pg; /* only sectors for a single page */ | ||
293 | int sec_per_pl; /* all sectors across planes */ | 306 | int sec_per_pl; /* all sectors across planes */ |
294 | int sec_per_blk; | ||
295 | int sec_per_lun; | ||
296 | }; | 307 | }; |
297 | 308 | ||
298 | /* sub-device structure */ | 309 | /* sub-device structure */ |
@@ -320,10 +331,6 @@ struct nvm_dev { | |||
320 | /* Device information */ | 331 | /* Device information */ |
321 | struct nvm_geo geo; | 332 | struct nvm_geo geo; |
322 | 333 | ||
323 | /* lower page table */ | ||
324 | int lps_per_blk; | ||
325 | int *lptbl; | ||
326 | |||
327 | unsigned long total_secs; | 334 | unsigned long total_secs; |
328 | 335 | ||
329 | unsigned long *lun_map; | 336 | unsigned long *lun_map; |
@@ -346,36 +353,6 @@ struct nvm_dev { | |||
346 | struct list_head targets; | 353 | struct list_head targets; |
347 | }; | 354 | }; |
348 | 355 | ||
349 | static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, | ||
350 | u64 pba) | ||
351 | { | ||
352 | struct ppa_addr l; | ||
353 | int secs, pgs, blks, luns; | ||
354 | sector_t ppa = pba; | ||
355 | |||
356 | l.ppa = 0; | ||
357 | |||
358 | div_u64_rem(ppa, geo->sec_per_pg, &secs); | ||
359 | l.g.sec = secs; | ||
360 | |||
361 | sector_div(ppa, geo->sec_per_pg); | ||
362 | div_u64_rem(ppa, geo->pgs_per_blk, &pgs); | ||
363 | l.g.pg = pgs; | ||
364 | |||
365 | sector_div(ppa, geo->pgs_per_blk); | ||
366 | div_u64_rem(ppa, geo->blks_per_lun, &blks); | ||
367 | l.g.blk = blks; | ||
368 | |||
369 | sector_div(ppa, geo->blks_per_lun); | ||
370 | div_u64_rem(ppa, geo->luns_per_chnl, &luns); | ||
371 | l.g.lun = luns; | ||
372 | |||
373 | sector_div(ppa, geo->luns_per_chnl); | ||
374 | l.g.ch = ppa; | ||
375 | |||
376 | return l; | ||
377 | } | ||
378 | |||
379 | static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, | 356 | static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, |
380 | struct ppa_addr r) | 357 | struct ppa_addr r) |
381 | { | 358 | { |
@@ -418,25 +395,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, | |||
418 | return l; | 395 | return l; |
419 | } | 396 | } |
420 | 397 | ||
421 | static inline int ppa_empty(struct ppa_addr ppa_addr) | ||
422 | { | ||
423 | return (ppa_addr.ppa == ADDR_EMPTY); | ||
424 | } | ||
425 | |||
426 | static inline void ppa_set_empty(struct ppa_addr *ppa_addr) | ||
427 | { | ||
428 | ppa_addr->ppa = ADDR_EMPTY; | ||
429 | } | ||
430 | |||
431 | static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) | ||
432 | { | ||
433 | if (ppa_empty(ppa1) || ppa_empty(ppa2)) | ||
434 | return 0; | ||
435 | |||
436 | return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && | ||
437 | (ppa1.g.blk == ppa2.g.blk)); | ||
438 | } | ||
439 | |||
440 | typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); | 398 | typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); |
441 | typedef sector_t (nvm_tgt_capacity_fn)(void *); | 399 | typedef sector_t (nvm_tgt_capacity_fn)(void *); |
442 | typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, | 400 | typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, |
@@ -481,17 +439,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, | |||
481 | extern int nvm_max_phys_sects(struct nvm_tgt_dev *); | 439 | extern int nvm_max_phys_sects(struct nvm_tgt_dev *); |
482 | extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); | 440 | extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); |
483 | extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); | 441 | extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); |
484 | extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); | ||
485 | extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, | ||
486 | void *); | ||
487 | extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); | ||
488 | extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); | ||
489 | extern void nvm_end_io(struct nvm_rq *); | 442 | extern void nvm_end_io(struct nvm_rq *); |
490 | extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); | 443 | extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); |
491 | extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); | 444 | extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); |
492 | 445 | ||
493 | extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); | ||
494 | |||
495 | #else /* CONFIG_NVM */ | 446 | #else /* CONFIG_NVM */ |
496 | struct nvm_dev_ops; | 447 | struct nvm_dev_ops; |
497 | 448 | ||
diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aea87f0d917b..4112e2bd747f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h | |||
@@ -124,14 +124,20 @@ enum { | |||
124 | 124 | ||
125 | #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) | 125 | #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) |
126 | #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) | 126 | #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) |
127 | #define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) | 127 | |
128 | #define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) | 128 | enum { |
129 | 129 | NVME_CMBSZ_SQS = 1 << 0, | |
130 | #define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) | 130 | NVME_CMBSZ_CQS = 1 << 1, |
131 | #define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) | 131 | NVME_CMBSZ_LISTS = 1 << 2, |
132 | #define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) | 132 | NVME_CMBSZ_RDS = 1 << 3, |
133 | #define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) | 133 | NVME_CMBSZ_WDS = 1 << 4, |
134 | #define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) | 134 | |
135 | NVME_CMBSZ_SZ_SHIFT = 12, | ||
136 | NVME_CMBSZ_SZ_MASK = 0xfffff, | ||
137 | |||
138 | NVME_CMBSZ_SZU_SHIFT = 8, | ||
139 | NVME_CMBSZ_SZU_MASK = 0xf, | ||
140 | }; | ||
135 | 141 | ||
136 | /* | 142 | /* |
137 | * Submission and Completion Queue Entry Sizes for the NVM command set. | 143 | * Submission and Completion Queue Entry Sizes for the NVM command set. |
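With the CMBSZ bit macros replaced by the NVME_CMBSZ_* constants above, a caller decodes the controller memory buffer size by combining the SZ field with the SZU granularity (4 KiB << (4 * SZU), per the NVMe spec). A minimal sketch; cmb_size_bytes() is a hypothetical helper, not part of this patch:

        /* Hypothetical helper: decode the total CMB size in bytes from a raw
         * CMBSZ register value using the constants introduced above. */
        static u64 cmb_size_bytes(u32 cmbsz)
        {
                u8  szu = (cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
                u64 sz  = (cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;

                /* SZU selects the granularity: 0 = 4 KiB, 1 = 64 KiB, ... */
                return sz << (12 + 4 * szu);
        }

The capability bits become plain flags, e.g. testing cmbsz & NVME_CMBSZ_SQS instead of NVME_CMB_SQS(cmbsz).
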
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b7c83254c566..22b2131bcdcd 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h | |||
@@ -276,6 +276,17 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, | |||
276 | unsigned int n_pages, unsigned int offset, | 276 | unsigned int n_pages, unsigned int offset, |
277 | unsigned long size, gfp_t gfp_mask); | 277 | unsigned long size, gfp_t gfp_mask); |
278 | 278 | ||
279 | #ifdef CONFIG_SGL_ALLOC | ||
280 | struct scatterlist *sgl_alloc_order(unsigned long long length, | ||
281 | unsigned int order, bool chainable, | ||
282 | gfp_t gfp, unsigned int *nent_p); | ||
283 | struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, | ||
284 | unsigned int *nent_p); | ||
285 | void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); | ||
286 | void sgl_free_order(struct scatterlist *sgl, int order); | ||
287 | void sgl_free(struct scatterlist *sgl); | ||
288 | #endif /* CONFIG_SGL_ALLOC */ | ||
289 | |||
279 | size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, | 290 | size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, |
280 | size_t buflen, off_t skip, bool to_buffer); | 291 | size_t buflen, off_t skip, bool to_buffer); |
281 | 292 | ||
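These helpers are only built when the new SGL_ALLOC Kconfig symbol (added further down in lib/Kconfig) is selected by a user. A minimal usage sketch for the simple variant; the surrounding function and buffer size are illustrative, not from the patch:

        /* Hypothetical caller: back a 1 MiB buffer with pages described by a
         * scatterlist, use it, then release everything. */
        static int fill_buffer(struct device *dev)
        {
                unsigned int nents;
                struct scatterlist *sgl;

                sgl = sgl_alloc(SZ_1M, GFP_KERNEL, &nents);
                if (!sgl)
                        return -ENOMEM;

                /* ... e.g. dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE) and do the I/O ... */

                sgl_free(sgl);  /* releases both the pages and the scatterlist itself */
                return 0;
        }
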
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 42d1a434af29..f9a1be7fc696 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h | |||
@@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple { | |||
75 | __u32 lun_end; | 75 | __u32 lun_end; |
76 | }; | 76 | }; |
77 | 77 | ||
78 | struct nvm_ioctl_create_extended { | ||
79 | __u16 lun_begin; | ||
80 | __u16 lun_end; | ||
81 | __u16 op; | ||
82 | __u16 rsv; | ||
83 | }; | ||
84 | |||
78 | enum { | 85 | enum { |
79 | NVM_CONFIG_TYPE_SIMPLE = 0, | 86 | NVM_CONFIG_TYPE_SIMPLE = 0, |
87 | NVM_CONFIG_TYPE_EXTENDED = 1, | ||
80 | }; | 88 | }; |
81 | 89 | ||
82 | struct nvm_ioctl_create_conf { | 90 | struct nvm_ioctl_create_conf { |
83 | __u32 type; | 91 | __u32 type; |
84 | union { | 92 | union { |
85 | struct nvm_ioctl_create_simple s; | 93 | struct nvm_ioctl_create_simple s; |
94 | struct nvm_ioctl_create_extended e; | ||
86 | }; | 95 | }; |
87 | }; | 96 | }; |
88 | 97 | ||
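The extended create config lets userspace pass the LUN range together with an op value (assumed here to be the over-provisioning percentage used by pblk). A hedged sketch of how the new union member might be filled; the concrete numbers are illustrative:

        static struct nvm_ioctl_create_conf conf = {
                .type = NVM_CONFIG_TYPE_EXTENDED,
                .e = {
                        .lun_begin = 0,
                        .lun_end   = 63,
                        .op        = 10,        /* assumed: over-provisioning, in percent */
                },
        };
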
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12d35108225..a37a3b4b6342 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, | |||
39 | } | 39 | } |
40 | } | 40 | } |
41 | 41 | ||
42 | static cpumask_var_t *alloc_node_to_present_cpumask(void) | 42 | static cpumask_var_t *alloc_node_to_possible_cpumask(void) |
43 | { | 43 | { |
44 | cpumask_var_t *masks; | 44 | cpumask_var_t *masks; |
45 | int node; | 45 | int node; |
@@ -62,7 +62,7 @@ out_unwind: | |||
62 | return NULL; | 62 | return NULL; |
63 | } | 63 | } |
64 | 64 | ||
65 | static void free_node_to_present_cpumask(cpumask_var_t *masks) | 65 | static void free_node_to_possible_cpumask(cpumask_var_t *masks) |
66 | { | 66 | { |
67 | int node; | 67 | int node; |
68 | 68 | ||
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks) | |||
71 | kfree(masks); | 71 | kfree(masks); |
72 | } | 72 | } |
73 | 73 | ||
74 | static void build_node_to_present_cpumask(cpumask_var_t *masks) | 74 | static void build_node_to_possible_cpumask(cpumask_var_t *masks) |
75 | { | 75 | { |
76 | int cpu; | 76 | int cpu; |
77 | 77 | ||
78 | for_each_present_cpu(cpu) | 78 | for_each_possible_cpu(cpu) |
79 | cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); | 79 | cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); |
80 | } | 80 | } |
81 | 81 | ||
82 | static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, | 82 | static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, |
83 | const struct cpumask *mask, nodemask_t *nodemsk) | 83 | const struct cpumask *mask, nodemask_t *nodemsk) |
84 | { | 84 | { |
85 | int n, nodes = 0; | 85 | int n, nodes = 0; |
86 | 86 | ||
87 | /* Calculate the number of nodes in the supplied affinity mask */ | 87 | /* Calculate the number of nodes in the supplied affinity mask */ |
88 | for_each_node(n) { | 88 | for_each_node(n) { |
89 | if (cpumask_intersects(mask, node_to_present_cpumask[n])) { | 89 | if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { |
90 | node_set(n, *nodemsk); | 90 | node_set(n, *nodemsk); |
91 | nodes++; | 91 | nodes++; |
92 | } | 92 | } |
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
109 | int last_affv = affv + affd->pre_vectors; | 109 | int last_affv = affv + affd->pre_vectors; |
110 | nodemask_t nodemsk = NODE_MASK_NONE; | 110 | nodemask_t nodemsk = NODE_MASK_NONE; |
111 | struct cpumask *masks; | 111 | struct cpumask *masks; |
112 | cpumask_var_t nmsk, *node_to_present_cpumask; | 112 | cpumask_var_t nmsk, *node_to_possible_cpumask; |
113 | 113 | ||
114 | /* | 114 | /* |
115 | * If there aren't any vectors left after applying the pre/post | 115 | * If there aren't any vectors left after applying the pre/post |
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
125 | if (!masks) | 125 | if (!masks) |
126 | goto out; | 126 | goto out; |
127 | 127 | ||
128 | node_to_present_cpumask = alloc_node_to_present_cpumask(); | 128 | node_to_possible_cpumask = alloc_node_to_possible_cpumask(); |
129 | if (!node_to_present_cpumask) | 129 | if (!node_to_possible_cpumask) |
130 | goto out; | 130 | goto out; |
131 | 131 | ||
132 | /* Fill out vectors at the beginning that don't need affinity */ | 132 | /* Fill out vectors at the beginning that don't need affinity */ |
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
135 | 135 | ||
136 | /* Stabilize the cpumasks */ | 136 | /* Stabilize the cpumasks */ |
137 | get_online_cpus(); | 137 | get_online_cpus(); |
138 | build_node_to_present_cpumask(node_to_present_cpumask); | 138 | build_node_to_possible_cpumask(node_to_possible_cpumask); |
139 | nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, | 139 | nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, |
140 | &nodemsk); | 140 | &nodemsk); |
141 | 141 | ||
142 | /* | 142 | /* |
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
146 | if (affv <= nodes) { | 146 | if (affv <= nodes) { |
147 | for_each_node_mask(n, nodemsk) { | 147 | for_each_node_mask(n, nodemsk) { |
148 | cpumask_copy(masks + curvec, | 148 | cpumask_copy(masks + curvec, |
149 | node_to_present_cpumask[n]); | 149 | node_to_possible_cpumask[n]); |
150 | if (++curvec == last_affv) | 150 | if (++curvec == last_affv) |
151 | break; | 151 | break; |
152 | } | 152 | } |
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
160 | vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; | 160 | vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; |
161 | 161 | ||
162 | /* Get the cpus on this node which are in the mask */ | 162 | /* Get the cpus on this node which are in the mask */ |
163 | cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); | 163 | cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); |
164 | 164 | ||
165 | /* Calculate the number of cpus per vector */ | 165 | /* Calculate the number of cpus per vector */ |
166 | ncpus = cpumask_weight(nmsk); | 166 | ncpus = cpumask_weight(nmsk); |
@@ -192,7 +192,7 @@ done: | |||
192 | /* Fill out vectors at the end that don't need affinity */ | 192 | /* Fill out vectors at the end that don't need affinity */ |
193 | for (; curvec < nvecs; curvec++) | 193 | for (; curvec < nvecs; curvec++) |
194 | cpumask_copy(masks + curvec, irq_default_affinity); | 194 | cpumask_copy(masks + curvec, irq_default_affinity); |
195 | free_node_to_present_cpumask(node_to_present_cpumask); | 195 | free_node_to_possible_cpumask(node_to_possible_cpumask); |
196 | out: | 196 | out: |
197 | free_cpumask_var(nmsk); | 197 | free_cpumask_var(nmsk); |
198 | return masks; | 198 | return masks; |
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity | |||
214 | return 0; | 214 | return 0; |
215 | 215 | ||
216 | get_online_cpus(); | 216 | get_online_cpus(); |
217 | ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; | 217 | ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; |
218 | put_online_cpus(); | 218 | put_online_cpus(); |
219 | return ret; | 219 | return ret; |
220 | } | 220 | } |
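Spreading over the possible rather than the present cpumask means the vector layout no longer shifts when CPUs are hot-added after a driver probes. A hedged sketch of a caller sizing its vectors with the updated helper; the function name and vector counts are illustrative:

        static int pick_nr_vectors(void)
        {
                struct irq_affinity affd = {
                        .pre_vectors = 1,       /* one non-spread vector, e.g. an admin queue */
                };

                /* Now bounded by cpu_possible_mask, so the result stays stable
                 * across CPU hotplug events. */
                return irq_calc_affinity_vectors(2, 32, &affd);
        }
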
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a46be1261c09..11b4282c2d20 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb) | |||
240 | static void hib_end_io(struct bio *bio) | 240 | static void hib_end_io(struct bio *bio) |
241 | { | 241 | { |
242 | struct hib_bio_batch *hb = bio->bi_private; | 242 | struct hib_bio_batch *hb = bio->bi_private; |
243 | struct page *page = bio->bi_io_vec[0].bv_page; | 243 | struct page *page = bio_first_page_all(bio); |
244 | 244 | ||
245 | if (bio->bi_status) { | 245 | if (bio->bi_status) { |
246 | pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", | 246 | pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", |
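Both this hunk and the mm/page_io.c change further down move callers from bio->bi_io_vec[0].bv_page to the new accessor, as part of the multipage-bvec preparation. A minimal sketch of the resulting pattern in a hypothetical completion handler:

        static void my_end_io(struct bio *bio)
        {
                /* Use the accessor instead of poking bi_io_vec[0] directly, so
                 * the code stays correct once multipage bvecs are supported. */
                struct page *page = bio_first_page_all(bio);

                if (bio->bi_status)
                        SetPageError(page);

                bio_put(bio);
        }
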
diff --git a/lib/Kconfig b/lib/Kconfig index c5e84fbcb30b..4dd5c11366f9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig | |||
@@ -409,6 +409,10 @@ config HAS_DMA | |||
409 | depends on !NO_DMA | 409 | depends on !NO_DMA |
410 | default y | 410 | default y |
411 | 411 | ||
412 | config SGL_ALLOC | ||
413 | bool | ||
414 | default n | ||
415 | |||
412 | config DMA_NOOP_OPS | 416 | config DMA_NOOP_OPS |
413 | bool | 417 | bool |
414 | depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) | 418 | depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) |
diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 80aa8d5463fa..42b5ca0acf93 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c | |||
@@ -462,7 +462,7 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) | |||
462 | */ | 462 | */ |
463 | atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); | 463 | atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); |
464 | sbq_index_atomic_inc(&sbq->wake_index); | 464 | sbq_index_atomic_inc(&sbq->wake_index); |
465 | wake_up(&ws->wait); | 465 | wake_up_nr(&ws->wait, wake_batch); |
466 | } | 466 | } |
467 | } | 467 | } |
468 | 468 | ||
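The sbitmap queue only wakes waiters after wake_batch tags have been freed, so waking a single exclusive waiter per batch could leave the rest of the freed tags idle; wake_up_nr() hands the whole batch out at once. A generic illustration of the two wait-queue calls (from <linux/wait.h>), not sbitmap-specific:

        static DECLARE_WAIT_QUEUE_HEAD(wq);

        static void wake_one_vs_batch(void)
        {
                wake_up(&wq);           /* at most one exclusive waiter resumes */
                wake_up_nr(&wq, 8);     /* up to eight exclusive waiters resume */
        }
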
diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7c1c55f7daaa..53728d391d3a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c | |||
@@ -474,6 +474,133 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, | |||
474 | } | 474 | } |
475 | EXPORT_SYMBOL(sg_alloc_table_from_pages); | 475 | EXPORT_SYMBOL(sg_alloc_table_from_pages); |
476 | 476 | ||
477 | #ifdef CONFIG_SGL_ALLOC | ||
478 | |||
479 | /** | ||
480 | * sgl_alloc_order - allocate a scatterlist and its pages | ||
481 | * @length: Length in bytes of the scatterlist. Must be at least one | ||
482 | * @order: Second argument for alloc_pages() | ||
483 | * @chainable: Whether or not to allocate an extra element in the scatterlist | ||
484 | * for scatterlist chaining purposes | ||
485 | * @gfp: Memory allocation flags | ||
486 | * @nent_p: [out] Number of entries in the scatterlist that have pages | ||
487 | * | ||
488 | * Returns: A pointer to an initialized scatterlist or %NULL upon failure. | ||
489 | */ | ||
490 | struct scatterlist *sgl_alloc_order(unsigned long long length, | ||
491 | unsigned int order, bool chainable, | ||
492 | gfp_t gfp, unsigned int *nent_p) | ||
493 | { | ||
494 | struct scatterlist *sgl, *sg; | ||
495 | struct page *page; | ||
496 | unsigned int nent, nalloc; | ||
497 | u32 elem_len; | ||
498 | |||
499 | nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); | ||
500 | /* Check for integer overflow */ | ||
501 | if (length > (nent << (PAGE_SHIFT + order))) | ||
502 | return NULL; | ||
503 | nalloc = nent; | ||
504 | if (chainable) { | ||
505 | /* Check for integer overflow */ | ||
506 | if (nalloc + 1 < nalloc) | ||
507 | return NULL; | ||
508 | nalloc++; | ||
509 | } | ||
510 | sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), | ||
511 | (gfp & ~GFP_DMA) | __GFP_ZERO); | ||
512 | if (!sgl) | ||
513 | return NULL; | ||
514 | |||
515 | sg_init_table(sgl, nalloc); | ||
516 | sg = sgl; | ||
517 | while (length) { | ||
518 | elem_len = min_t(u64, length, PAGE_SIZE << order); | ||
519 | page = alloc_pages(gfp, order); | ||
520 | if (!page) { | ||
521 | sgl_free(sgl); | ||
522 | return NULL; | ||
523 | } | ||
524 | |||
525 | sg_set_page(sg, page, elem_len, 0); | ||
526 | length -= elem_len; | ||
527 | sg = sg_next(sg); | ||
528 | } | ||
529 | WARN_ONCE(length, "length = %lld\n", length); | ||
530 | if (nent_p) | ||
531 | *nent_p = nent; | ||
532 | return sgl; | ||
533 | } | ||
534 | EXPORT_SYMBOL(sgl_alloc_order); | ||
535 | |||
536 | /** | ||
537 | * sgl_alloc - allocate a scatterlist and its pages | ||
538 | * @length: Length in bytes of the scatterlist | ||
539 | * @gfp: Memory allocation flags | ||
540 | * @nent_p: [out] Number of entries in the scatterlist | ||
541 | * | ||
542 | * Returns: A pointer to an initialized scatterlist or %NULL upon failure. | ||
543 | */ | ||
544 | struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, | ||
545 | unsigned int *nent_p) | ||
546 | { | ||
547 | return sgl_alloc_order(length, 0, false, gfp, nent_p); | ||
548 | } | ||
549 | EXPORT_SYMBOL(sgl_alloc); | ||
550 | |||
551 | /** | ||
552 | * sgl_free_n_order - free a scatterlist and its pages | ||
553 | * @sgl: Scatterlist with one or more elements | ||
554 | * @nents: Maximum number of elements to free | ||
555 | * @order: Second argument for __free_pages() | ||
556 | * | ||
557 | * Notes: | ||
558 | * - If several scatterlists have been chained and each chain element is | ||
559 | * freed separately then it's essential to set nents correctly to avoid | ||
560 | * freeing a page twice. | ||
561 | * - All pages in a chained scatterlist can be freed at once by setting @nents | ||
562 | * to a high number. | ||
563 | */ | ||
564 | void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) | ||
565 | { | ||
566 | struct scatterlist *sg; | ||
567 | struct page *page; | ||
568 | int i; | ||
569 | |||
570 | for_each_sg(sgl, sg, nents, i) { | ||
571 | if (!sg) | ||
572 | break; | ||
573 | page = sg_page(sg); | ||
574 | if (page) | ||
575 | __free_pages(page, order); | ||
576 | } | ||
577 | kfree(sgl); | ||
578 | } | ||
579 | EXPORT_SYMBOL(sgl_free_n_order); | ||
580 | |||
581 | /** | ||
582 | * sgl_free_order - free a scatterlist and its pages | ||
583 | * @sgl: Scatterlist with one or more elements | ||
584 | * @order: Second argument for __free_pages() | ||
585 | */ | ||
586 | void sgl_free_order(struct scatterlist *sgl, int order) | ||
587 | { | ||
588 | sgl_free_n_order(sgl, INT_MAX, order); | ||
589 | } | ||
590 | EXPORT_SYMBOL(sgl_free_order); | ||
591 | |||
592 | /** | ||
593 | * sgl_free - free a scatterlist and its pages | ||
594 | * @sgl: Scatterlist with one or more elements | ||
595 | */ | ||
596 | void sgl_free(struct scatterlist *sgl) | ||
597 | { | ||
598 | sgl_free_order(sgl, 0); | ||
599 | } | ||
600 | EXPORT_SYMBOL(sgl_free); | ||
601 | |||
602 | #endif /* CONFIG_SGL_ALLOC */ | ||
603 | |||
477 | void __sg_page_iter_start(struct sg_page_iter *piter, | 604 | void __sg_page_iter_start(struct sg_page_iter *piter, |
478 | struct scatterlist *sglist, unsigned int nents, | 605 | struct scatterlist *sglist, unsigned int nents, |
479 | unsigned long pgoffset) | 606 | unsigned long pgoffset) |
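The notes on sgl_free_n_order() above are why the element count is exposed: when lists are chained and freed piecewise, passing each list's own nents keeps the chained-to pages from being freed twice. A hedged sketch of the higher-order, chainable case; the caller and sizes are illustrative:

        static int build_chainable(void)
        {
                unsigned int nents;
                struct scatterlist *sgl;

                /* 4 MiB backed by order-1 pages, plus one spare element for chaining. */
                sgl = sgl_alloc_order(SZ_4M, 1, true, GFP_KERNEL, &nents);
                if (!sgl)
                        return -ENOMEM;

                /* ... optionally sg_chain() another list onto the spare element ... */

                sgl_free_n_order(sgl, nents, 1);        /* order must match the allocation */
                return 0;
        }
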
diff --git a/mm/page_io.c b/mm/page_io.c index e93f1a4cacd7..b41cf9644585 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -50,7 +50,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, | |||
50 | 50 | ||
51 | void end_swap_bio_write(struct bio *bio) | 51 | void end_swap_bio_write(struct bio *bio) |
52 | { | 52 | { |
53 | struct page *page = bio->bi_io_vec[0].bv_page; | 53 | struct page *page = bio_first_page_all(bio); |
54 | 54 | ||
55 | if (bio->bi_status) { | 55 | if (bio->bi_status) { |
56 | SetPageError(page); | 56 | SetPageError(page); |
@@ -122,7 +122,7 @@ static void swap_slot_free_notify(struct page *page) | |||
122 | 122 | ||
123 | static void end_swap_bio_read(struct bio *bio) | 123 | static void end_swap_bio_read(struct bio *bio) |
124 | { | 124 | { |
125 | struct page *page = bio->bi_io_vec[0].bv_page; | 125 | struct page *page = bio_first_page_all(bio); |
126 | struct task_struct *waiter = bio->bi_private; | 126 | struct task_struct *waiter = bio->bi_private; |
127 | 127 | ||
128 | if (bio->bi_status) { | 128 | if (bio->bi_status) { |