author      Linus Torvalds <torvalds@linux-foundation.org>   2019-07-09 13:45:06 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>   2019-07-09 13:45:06 -0400
commit      3b99107f0e0298e6fe0787f75b8f3d8306dfb230 (patch)
tree        30536dbc9ca176470a2ae2938f952381e33f5deb /block
parent      0415052db4f92b7e272fc15802ad8b8be672deea (diff)
parent      c9b3007feca018d3f7061f5d5a14cb00766ffe9b (diff)
Merge tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main block updates for 5.3. Nothing earth shattering or
major in here, just fixes, additions, and improvements all over the
map. This contains:
- Series of documentation fixes (Bart)
- Optimization of the blk-mq ctx get/put (Bart)
- null_blk removal race condition fix (Bob)
- req/bio_op() cleanups (Chaitanya)
- Series cleaning up the segment accounting, and request/bio mapping
(Christoph)
- Series cleaning up the page getting/putting for bios (Christoph)
- block cgroup cleanups and moving it to where it is used (Christoph)
- block cgroup fixes (Tejun)
- Series of fixes and improvements to bcache, most notably a write
deadlock fix (Coly)
- blk-iolatency STS_AGAIN and accounting fixes (Dennis)
- Series of improvements and fixes to BFQ (Douglas, Paolo)
- debugfs_create() return value check removal for drbd (Greg)
- Use struct_size(), where appropriate (Gustavo)
- Two lightnvm fixes (Heiner, Geert)
- MD fixes, including a read balance and corruption fix (Guoqing,
Marcos, Xiao, Yufen)
- block opal shadow mbr additions (Jonas, Revanth)
- sbitmap compare-and-exchange improvements (Pavel)
- Fix for potential bio->bi_size overflow (Ming)
- NVMe pull requests:
- improved PCIe suspend support (Keith Busch)
- error injection support for the admin queue (Akinobu Mita)
- Fibre Channel discovery improvements (James Smart)
- tracing improvements including nvmet tracing support (Minwoo Im)
- misc fixes and cleanups (Anton Eidelman, Minwoo Im, Chaitanya
Kulkarni)"
- Various little fixes and improvements to drivers and core"
* tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block: (153 commits)
blk-iolatency: fix STS_AGAIN handling
block: nr_phys_segments needs to be zero for REQ_OP_WRITE_ZEROES
blk-mq: simplify blk_mq_make_request()
blk-mq: remove blk_mq_put_ctx()
sbitmap: Replace cmpxchg with xchg
block: fix .bi_size overflow
block: sed-opal: check size of shadow mbr
block: sed-opal: ioctl for writing to shadow mbr
block: sed-opal: add ioctl for done-mark of shadow mbr
block: never take page references for ITER_BVEC
direct-io: use bio_release_pages in dio_bio_complete
block_dev: use bio_release_pages in bio_unmap_user
block_dev: use bio_release_pages in blkdev_bio_end_io
iomap: use bio_release_pages in iomap_dio_bio_end_io
block: use bio_release_pages in bio_map_user_iov
block: use bio_release_pages in bio_unmap_user
block: optionally mark pages dirty in bio_release_pages
block: move the BIO_NO_PAGE_REF check into bio_release_pages
block: skd_main.c: Remove call to memset after dma_alloc_coherent
block: mtip32xx: Remove call to memset after dma_alloc_coherent
...
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig.iosched     7
-rw-r--r--  block/bfq-cgroup.c      212
-rw-r--r--  block/bfq-iosched.c     967
-rw-r--r--  block/bfq-iosched.h      48
-rw-r--r--  block/bio.c              96
-rw-r--r--  block/blk-cgroup.c      139
-rw-r--r--  block/blk-core.c        111
-rw-r--r--  block/blk-iolatency.c    51
-rw-r--r--  block/blk-map.c          10
-rw-r--r--  block/blk-merge.c       112
-rw-r--r--  block/blk-mq-debugfs.c   42
-rw-r--r--  block/blk-mq-sched.c     31
-rw-r--r--  block/blk-mq-sched.h     10
-rw-r--r--  block/blk-mq-tag.c        8
-rw-r--r--  block/blk-mq.c           44
-rw-r--r--  block/blk-mq.h            7
-rw-r--r--  block/blk.h              36
-rw-r--r--  block/genhd.c             5
-rw-r--r--  block/kyber-iosched.c     6
-rw-r--r--  block/mq-deadline.c       5
-rw-r--r--  block/opal_proto.h       16
-rw-r--r--  block/sed-opal.c        197
22 files changed, 1342 insertions, 818 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 4626b88b2d5a..7a6b2f29a582 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched | |||
@@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED | |||
36 | Enable hierarchical scheduling in BFQ, using the blkio | 36 | Enable hierarchical scheduling in BFQ, using the blkio |
37 | (cgroups-v1) or io (cgroups-v2) controller. | 37 | (cgroups-v1) or io (cgroups-v2) controller. |
38 | 38 | ||
39 | config BFQ_CGROUP_DEBUG | ||
40 | bool "BFQ IO controller debugging" | ||
41 | depends on BFQ_GROUP_IOSCHED | ||
42 | ---help--- | ||
43 | Enable some debugging help. Currently it exports additional stat | ||
44 | files in a cgroup which can be useful for debugging. | ||
45 | |||
39 | endmenu | 46 | endmenu |
40 | 47 | ||
41 | endif | 48 | endif |
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b3796a40a61a..0f6cd688924f 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c | |||
@@ -15,7 +15,83 @@ | |||
15 | 15 | ||
16 | #include "bfq-iosched.h" | 16 | #include "bfq-iosched.h" |
17 | 17 | ||
18 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 18 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
19 | static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) | ||
20 | { | ||
21 | int ret; | ||
22 | |||
23 | ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); | ||
24 | if (ret) | ||
25 | return ret; | ||
26 | |||
27 | atomic64_set(&stat->aux_cnt, 0); | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | static void bfq_stat_exit(struct bfq_stat *stat) | ||
32 | { | ||
33 | percpu_counter_destroy(&stat->cpu_cnt); | ||
34 | } | ||
35 | |||
36 | /** | ||
37 | * bfq_stat_add - add a value to a bfq_stat | ||
38 | * @stat: target bfq_stat | ||
39 | * @val: value to add | ||
40 | * | ||
41 | * Add @val to @stat. The caller must ensure that IRQ on the same CPU | ||
42 | * don't re-enter this function for the same counter. | ||
43 | */ | ||
44 | static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) | ||
45 | { | ||
46 | percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * bfq_stat_read - read the current value of a bfq_stat | ||
51 | * @stat: bfq_stat to read | ||
52 | */ | ||
53 | static inline uint64_t bfq_stat_read(struct bfq_stat *stat) | ||
54 | { | ||
55 | return percpu_counter_sum_positive(&stat->cpu_cnt); | ||
56 | } | ||
57 | |||
58 | /** | ||
59 | * bfq_stat_reset - reset a bfq_stat | ||
60 | * @stat: bfq_stat to reset | ||
61 | */ | ||
62 | static inline void bfq_stat_reset(struct bfq_stat *stat) | ||
63 | { | ||
64 | percpu_counter_set(&stat->cpu_cnt, 0); | ||
65 | atomic64_set(&stat->aux_cnt, 0); | ||
66 | } | ||
67 | |||
68 | /** | ||
69 | * bfq_stat_add_aux - add a bfq_stat into another's aux count | ||
70 | * @to: the destination bfq_stat | ||
71 | * @from: the source | ||
72 | * | ||
73 | * Add @from's count including the aux one to @to's aux count. | ||
74 | */ | ||
75 | static inline void bfq_stat_add_aux(struct bfq_stat *to, | ||
76 | struct bfq_stat *from) | ||
77 | { | ||
78 | atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), | ||
79 | &to->aux_cnt); | ||
80 | } | ||
81 | |||
82 | /** | ||
83 | * blkg_prfill_stat - prfill callback for bfq_stat | ||
84 | * @sf: seq_file to print to | ||
85 | * @pd: policy private data of interest | ||
86 | * @off: offset to the bfq_stat in @pd | ||
87 | * | ||
88 | * prfill callback for printing a bfq_stat. | ||
89 | */ | ||
90 | static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
91 | int off) | ||
92 | { | ||
93 | return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); | ||
94 | } | ||
19 | 95 | ||
20 | /* bfqg stats flags */ | 96 | /* bfqg stats flags */ |
21 | enum bfqg_stats_flags { | 97 | enum bfqg_stats_flags { |
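The bfq_stat helpers introduced in this hunk pair a percpu counter for hot-path additions with an aux atomic64 that absorbs the totals of stats being torn down, so a parent group can keep reporting the history of children that have gone away. As a rough userspace analogue (hypothetical names, a single C11 atomic standing in for the kernel's percpu_counter), the two-tier pattern looks like this:

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace analogue of bfq_stat: a hot-path counter plus an aux
 * accumulator that survives when the hot-path counter goes away. */
struct two_tier_stat {
	atomic_uint_fast64_t cnt;      /* stands in for the percpu counter */
	atomic_uint_fast64_t aux_cnt;  /* absorbed totals of torn-down stats */
};

static void stat_add(struct two_tier_stat *s, uint64_t val)
{
	atomic_fetch_add_explicit(&s->cnt, val, memory_order_relaxed);
}

static uint64_t stat_read(struct two_tier_stat *s)
{
	return atomic_load_explicit(&s->cnt, memory_order_relaxed);
}

/* Fold @from (live + aux) into @to's aux count, as bfq_stat_add_aux()
 * does when a child group's stats are merged into its parent. */
static void stat_add_aux(struct two_tier_stat *to, struct two_tier_stat *from)
{
	uint64_t total = stat_read(from) +
		atomic_load_explicit(&from->aux_cnt, memory_order_relaxed);
	atomic_fetch_add_explicit(&to->aux_cnt, total, memory_order_relaxed);
}

int main(void)
{
	struct two_tier_stat parent = {0}, child = {0};

	stat_add(&child, 5);
	stat_add(&child, 7);
	stat_add_aux(&parent, &child);   /* child goes away, history is kept */

	printf("parent aux total: %llu\n",
	       (unsigned long long)atomic_load(&parent.aux_cnt));
	return 0;
}
```

The kernel variant differs mainly in that the hot path uses a per-CPU counter (cheap under concurrency) and reads go through percpu_counter_sum_positive().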
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) | |||
53 | 129 | ||
54 | now = ktime_get_ns(); | 130 | now = ktime_get_ns(); |
55 | if (now > stats->start_group_wait_time) | 131 | if (now > stats->start_group_wait_time) |
56 | blkg_stat_add(&stats->group_wait_time, | 132 | bfq_stat_add(&stats->group_wait_time, |
57 | now - stats->start_group_wait_time); | 133 | now - stats->start_group_wait_time); |
58 | bfqg_stats_clear_waiting(stats); | 134 | bfqg_stats_clear_waiting(stats); |
59 | } | 135 | } |
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) | |||
82 | 158 | ||
83 | now = ktime_get_ns(); | 159 | now = ktime_get_ns(); |
84 | if (now > stats->start_empty_time) | 160 | if (now > stats->start_empty_time) |
85 | blkg_stat_add(&stats->empty_time, | 161 | bfq_stat_add(&stats->empty_time, |
86 | now - stats->start_empty_time); | 162 | now - stats->start_empty_time); |
87 | bfqg_stats_clear_empty(stats); | 163 | bfqg_stats_clear_empty(stats); |
88 | } | 164 | } |
89 | 165 | ||
90 | void bfqg_stats_update_dequeue(struct bfq_group *bfqg) | 166 | void bfqg_stats_update_dequeue(struct bfq_group *bfqg) |
91 | { | 167 | { |
92 | blkg_stat_add(&bfqg->stats.dequeue, 1); | 168 | bfq_stat_add(&bfqg->stats.dequeue, 1); |
93 | } | 169 | } |
94 | 170 | ||
95 | void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) | 171 | void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) |
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) | |||
119 | u64 now = ktime_get_ns(); | 195 | u64 now = ktime_get_ns(); |
120 | 196 | ||
121 | if (now > stats->start_idle_time) | 197 | if (now > stats->start_idle_time) |
122 | blkg_stat_add(&stats->idle_time, | 198 | bfq_stat_add(&stats->idle_time, |
123 | now - stats->start_idle_time); | 199 | now - stats->start_idle_time); |
124 | bfqg_stats_clear_idling(stats); | 200 | bfqg_stats_clear_idling(stats); |
125 | } | 201 | } |
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) | |||
137 | { | 213 | { |
138 | struct bfqg_stats *stats = &bfqg->stats; | 214 | struct bfqg_stats *stats = &bfqg->stats; |
139 | 215 | ||
140 | blkg_stat_add(&stats->avg_queue_size_sum, | 216 | bfq_stat_add(&stats->avg_queue_size_sum, |
141 | blkg_rwstat_total(&stats->queued)); | 217 | blkg_rwstat_total(&stats->queued)); |
142 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | 218 | bfq_stat_add(&stats->avg_queue_size_samples, 1); |
143 | bfqg_stats_update_group_wait_time(stats); | 219 | bfqg_stats_update_group_wait_time(stats); |
144 | } | 220 | } |
145 | 221 | ||
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, | |||
176 | io_start_time_ns - start_time_ns); | 252 | io_start_time_ns - start_time_ns); |
177 | } | 253 | } |
178 | 254 | ||
179 | #else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | 255 | #else /* CONFIG_BFQ_CGROUP_DEBUG */ |
180 | 256 | ||
181 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, | 257 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, |
182 | unsigned int op) { } | 258 | unsigned int op) { } |
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } | |||
190 | void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } | 266 | void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } |
191 | void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } | 267 | void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } |
192 | 268 | ||
193 | #endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | 269 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
194 | 270 | ||
195 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 271 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
196 | 272 | ||
@@ -274,18 +350,18 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) | |||
274 | /* @stats = 0 */ | 350 | /* @stats = 0 */ |
275 | static void bfqg_stats_reset(struct bfqg_stats *stats) | 351 | static void bfqg_stats_reset(struct bfqg_stats *stats) |
276 | { | 352 | { |
277 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 353 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
278 | /* queued stats shouldn't be cleared */ | 354 | /* queued stats shouldn't be cleared */ |
279 | blkg_rwstat_reset(&stats->merged); | 355 | blkg_rwstat_reset(&stats->merged); |
280 | blkg_rwstat_reset(&stats->service_time); | 356 | blkg_rwstat_reset(&stats->service_time); |
281 | blkg_rwstat_reset(&stats->wait_time); | 357 | blkg_rwstat_reset(&stats->wait_time); |
282 | blkg_stat_reset(&stats->time); | 358 | bfq_stat_reset(&stats->time); |
283 | blkg_stat_reset(&stats->avg_queue_size_sum); | 359 | bfq_stat_reset(&stats->avg_queue_size_sum); |
284 | blkg_stat_reset(&stats->avg_queue_size_samples); | 360 | bfq_stat_reset(&stats->avg_queue_size_samples); |
285 | blkg_stat_reset(&stats->dequeue); | 361 | bfq_stat_reset(&stats->dequeue); |
286 | blkg_stat_reset(&stats->group_wait_time); | 362 | bfq_stat_reset(&stats->group_wait_time); |
287 | blkg_stat_reset(&stats->idle_time); | 363 | bfq_stat_reset(&stats->idle_time); |
288 | blkg_stat_reset(&stats->empty_time); | 364 | bfq_stat_reset(&stats->empty_time); |
289 | #endif | 365 | #endif |
290 | } | 366 | } |
291 | 367 | ||
@@ -295,19 +371,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) | |||
295 | if (!to || !from) | 371 | if (!to || !from) |
296 | return; | 372 | return; |
297 | 373 | ||
298 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 374 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
299 | /* queued stats shouldn't be cleared */ | 375 | /* queued stats shouldn't be cleared */ |
300 | blkg_rwstat_add_aux(&to->merged, &from->merged); | 376 | blkg_rwstat_add_aux(&to->merged, &from->merged); |
301 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); | 377 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); |
302 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); | 378 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); |
303 | blkg_stat_add_aux(&from->time, &from->time); | 379 | bfq_stat_add_aux(&from->time, &from->time); |
304 | blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | 380 | bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); |
305 | blkg_stat_add_aux(&to->avg_queue_size_samples, | 381 | bfq_stat_add_aux(&to->avg_queue_size_samples, |
306 | &from->avg_queue_size_samples); | 382 | &from->avg_queue_size_samples); |
307 | blkg_stat_add_aux(&to->dequeue, &from->dequeue); | 383 | bfq_stat_add_aux(&to->dequeue, &from->dequeue); |
308 | blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); | 384 | bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); |
309 | blkg_stat_add_aux(&to->idle_time, &from->idle_time); | 385 | bfq_stat_add_aux(&to->idle_time, &from->idle_time); |
310 | blkg_stat_add_aux(&to->empty_time, &from->empty_time); | 386 | bfq_stat_add_aux(&to->empty_time, &from->empty_time); |
311 | #endif | 387 | #endif |
312 | } | 388 | } |
313 | 389 | ||
@@ -355,35 +431,35 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) | |||
355 | 431 | ||
356 | static void bfqg_stats_exit(struct bfqg_stats *stats) | 432 | static void bfqg_stats_exit(struct bfqg_stats *stats) |
357 | { | 433 | { |
358 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 434 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
359 | blkg_rwstat_exit(&stats->merged); | 435 | blkg_rwstat_exit(&stats->merged); |
360 | blkg_rwstat_exit(&stats->service_time); | 436 | blkg_rwstat_exit(&stats->service_time); |
361 | blkg_rwstat_exit(&stats->wait_time); | 437 | blkg_rwstat_exit(&stats->wait_time); |
362 | blkg_rwstat_exit(&stats->queued); | 438 | blkg_rwstat_exit(&stats->queued); |
363 | blkg_stat_exit(&stats->time); | 439 | bfq_stat_exit(&stats->time); |
364 | blkg_stat_exit(&stats->avg_queue_size_sum); | 440 | bfq_stat_exit(&stats->avg_queue_size_sum); |
365 | blkg_stat_exit(&stats->avg_queue_size_samples); | 441 | bfq_stat_exit(&stats->avg_queue_size_samples); |
366 | blkg_stat_exit(&stats->dequeue); | 442 | bfq_stat_exit(&stats->dequeue); |
367 | blkg_stat_exit(&stats->group_wait_time); | 443 | bfq_stat_exit(&stats->group_wait_time); |
368 | blkg_stat_exit(&stats->idle_time); | 444 | bfq_stat_exit(&stats->idle_time); |
369 | blkg_stat_exit(&stats->empty_time); | 445 | bfq_stat_exit(&stats->empty_time); |
370 | #endif | 446 | #endif |
371 | } | 447 | } |
372 | 448 | ||
373 | static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) | 449 | static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) |
374 | { | 450 | { |
375 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 451 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
376 | if (blkg_rwstat_init(&stats->merged, gfp) || | 452 | if (blkg_rwstat_init(&stats->merged, gfp) || |
377 | blkg_rwstat_init(&stats->service_time, gfp) || | 453 | blkg_rwstat_init(&stats->service_time, gfp) || |
378 | blkg_rwstat_init(&stats->wait_time, gfp) || | 454 | blkg_rwstat_init(&stats->wait_time, gfp) || |
379 | blkg_rwstat_init(&stats->queued, gfp) || | 455 | blkg_rwstat_init(&stats->queued, gfp) || |
380 | blkg_stat_init(&stats->time, gfp) || | 456 | bfq_stat_init(&stats->time, gfp) || |
381 | blkg_stat_init(&stats->avg_queue_size_sum, gfp) || | 457 | bfq_stat_init(&stats->avg_queue_size_sum, gfp) || |
382 | blkg_stat_init(&stats->avg_queue_size_samples, gfp) || | 458 | bfq_stat_init(&stats->avg_queue_size_samples, gfp) || |
383 | blkg_stat_init(&stats->dequeue, gfp) || | 459 | bfq_stat_init(&stats->dequeue, gfp) || |
384 | blkg_stat_init(&stats->group_wait_time, gfp) || | 460 | bfq_stat_init(&stats->group_wait_time, gfp) || |
385 | blkg_stat_init(&stats->idle_time, gfp) || | 461 | bfq_stat_init(&stats->idle_time, gfp) || |
386 | blkg_stat_init(&stats->empty_time, gfp)) { | 462 | bfq_stat_init(&stats->empty_time, gfp)) { |
387 | bfqg_stats_exit(stats); | 463 | bfqg_stats_exit(stats); |
388 | return -ENOMEM; | 464 | return -ENOMEM; |
389 | } | 465 | } |
@@ -909,7 +985,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, | |||
909 | return ret ?: nbytes; | 985 | return ret ?: nbytes; |
910 | } | 986 | } |
911 | 987 | ||
912 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 988 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
913 | static int bfqg_print_stat(struct seq_file *sf, void *v) | 989 | static int bfqg_print_stat(struct seq_file *sf, void *v) |
914 | { | 990 | { |
915 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, | 991 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, |
@@ -927,17 +1003,34 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) | |||
927 | static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, | 1003 | static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, |
928 | struct blkg_policy_data *pd, int off) | 1004 | struct blkg_policy_data *pd, int off) |
929 | { | 1005 | { |
930 | u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), | 1006 | struct blkcg_gq *blkg = pd_to_blkg(pd); |
931 | &blkcg_policy_bfq, off); | 1007 | struct blkcg_gq *pos_blkg; |
1008 | struct cgroup_subsys_state *pos_css; | ||
1009 | u64 sum = 0; | ||
1010 | |||
1011 | lockdep_assert_held(&blkg->q->queue_lock); | ||
1012 | |||
1013 | rcu_read_lock(); | ||
1014 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { | ||
1015 | struct bfq_stat *stat; | ||
1016 | |||
1017 | if (!pos_blkg->online) | ||
1018 | continue; | ||
1019 | |||
1020 | stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; | ||
1021 | sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); | ||
1022 | } | ||
1023 | rcu_read_unlock(); | ||
1024 | |||
932 | return __blkg_prfill_u64(sf, pd, sum); | 1025 | return __blkg_prfill_u64(sf, pd, sum); |
933 | } | 1026 | } |
934 | 1027 | ||
935 | static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, | 1028 | static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, |
936 | struct blkg_policy_data *pd, int off) | 1029 | struct blkg_policy_data *pd, int off) |
937 | { | 1030 | { |
938 | struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), | 1031 | struct blkg_rwstat_sample sum; |
939 | &blkcg_policy_bfq, | 1032 | |
940 | off); | 1033 | blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); |
941 | return __blkg_prfill_rwstat(sf, pd, &sum); | 1034 | return __blkg_prfill_rwstat(sf, pd, &sum); |
942 | } | 1035 | } |
943 | 1036 | ||
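The rewritten recursive printer above no longer relies on blkg_stat_recursive_sum(); it walks every online descendant blkg itself and adds both the live bfq_stat value and its aux count, which is how totals from already-removed groups stay visible at the parent. A plain-tree sketch of that accumulation (hypothetical struct, none of the blkcg machinery) is:

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical group node: a live counter, an aux counter, children. */
struct group {
	uint64_t cnt;      /* live value (percpu sum in the kernel) */
	uint64_t aux_cnt;  /* totals inherited from removed children */
	struct group *child;
	struct group *sibling;
};

/* Pre-order walk summing cnt + aux_cnt of @g and all of its descendants,
 * mirroring what bfqg_prfill_stat_recursive() computes per blkg. */
static uint64_t recursive_sum(const struct group *g)
{
	uint64_t sum = 0;

	for (; g; g = g->sibling) {
		sum += g->cnt + g->aux_cnt;
		sum += recursive_sum(g->child);
	}
	return sum;
}

int main(void)
{
	struct group leaf = { .cnt = 10 };
	struct group mid  = { .cnt = 5, .aux_cnt = 3, .child = &leaf };
	struct group root = { .cnt = 1, .child = &mid };

	printf("recursive total: %llu\n",
	       (unsigned long long)recursive_sum(&root));  /* 1 + 5 + 3 + 10 = 19 */
	return 0;
}
```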
@@ -975,12 +1068,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) | |||
975 | static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, | 1068 | static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, |
976 | struct blkg_policy_data *pd, int off) | 1069 | struct blkg_policy_data *pd, int off) |
977 | { | 1070 | { |
978 | struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, | 1071 | struct blkg_rwstat_sample tmp; |
979 | offsetof(struct blkcg_gq, stat_bytes)); | ||
980 | u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + | ||
981 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
982 | 1072 | ||
983 | return __blkg_prfill_u64(sf, pd, sum >> 9); | 1073 | blkg_rwstat_recursive_sum(pd->blkg, NULL, |
1074 | offsetof(struct blkcg_gq, stat_bytes), &tmp); | ||
1075 | |||
1076 | return __blkg_prfill_u64(sf, pd, | ||
1077 | (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); | ||
984 | } | 1078 | } |
985 | 1079 | ||
986 | static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) | 1080 | static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) |
@@ -995,11 +1089,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, | |||
995 | struct blkg_policy_data *pd, int off) | 1089 | struct blkg_policy_data *pd, int off) |
996 | { | 1090 | { |
997 | struct bfq_group *bfqg = pd_to_bfqg(pd); | 1091 | struct bfq_group *bfqg = pd_to_bfqg(pd); |
998 | u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); | 1092 | u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); |
999 | u64 v = 0; | 1093 | u64 v = 0; |
1000 | 1094 | ||
1001 | if (samples) { | 1095 | if (samples) { |
1002 | v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); | 1096 | v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); |
1003 | v = div64_u64(v, samples); | 1097 | v = div64_u64(v, samples); |
1004 | } | 1098 | } |
1005 | __blkg_prfill_u64(sf, pd, v); | 1099 | __blkg_prfill_u64(sf, pd, v); |
@@ -1014,7 +1108,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) | |||
1014 | 0, false); | 1108 | 0, false); |
1015 | return 0; | 1109 | return 0; |
1016 | } | 1110 | } |
1017 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1111 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
1018 | 1112 | ||
1019 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) | 1113 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
1020 | { | 1114 | { |
@@ -1062,7 +1156,7 @@ struct cftype bfq_blkcg_legacy_files[] = { | |||
1062 | .private = (unsigned long)&blkcg_policy_bfq, | 1156 | .private = (unsigned long)&blkcg_policy_bfq, |
1063 | .seq_show = blkg_print_stat_ios, | 1157 | .seq_show = blkg_print_stat_ios, |
1064 | }, | 1158 | }, |
1065 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1159 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
1066 | { | 1160 | { |
1067 | .name = "bfq.time", | 1161 | .name = "bfq.time", |
1068 | .private = offsetof(struct bfq_group, stats.time), | 1162 | .private = offsetof(struct bfq_group, stats.time), |
@@ -1092,7 +1186,7 @@ struct cftype bfq_blkcg_legacy_files[] = { | |||
1092 | .private = offsetof(struct bfq_group, stats.queued), | 1186 | .private = offsetof(struct bfq_group, stats.queued), |
1093 | .seq_show = bfqg_print_rwstat, | 1187 | .seq_show = bfqg_print_rwstat, |
1094 | }, | 1188 | }, |
1095 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1189 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
1096 | 1190 | ||
1097 | /* the same statistics which cover the bfqg and its descendants */ | 1191 | /* the same statistics which cover the bfqg and its descendants */ |
1098 | { | 1192 | { |
@@ -1105,7 +1199,7 @@ struct cftype bfq_blkcg_legacy_files[] = { | |||
1105 | .private = (unsigned long)&blkcg_policy_bfq, | 1199 | .private = (unsigned long)&blkcg_policy_bfq, |
1106 | .seq_show = blkg_print_stat_ios_recursive, | 1200 | .seq_show = blkg_print_stat_ios_recursive, |
1107 | }, | 1201 | }, |
1108 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1202 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
1109 | { | 1203 | { |
1110 | .name = "bfq.time_recursive", | 1204 | .name = "bfq.time_recursive", |
1111 | .private = offsetof(struct bfq_group, stats.time), | 1205 | .private = offsetof(struct bfq_group, stats.time), |
@@ -1159,7 +1253,7 @@ struct cftype bfq_blkcg_legacy_files[] = { | |||
1159 | .private = offsetof(struct bfq_group, stats.dequeue), | 1253 | .private = offsetof(struct bfq_group, stats.dequeue), |
1160 | .seq_show = bfqg_print_stat, | 1254 | .seq_show = bfqg_print_stat, |
1161 | }, | 1255 | }, |
1162 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1256 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
1163 | { } /* terminate */ | 1257 | { } /* terminate */ |
1164 | }; | 1258 | }; |
1165 | 1259 | ||
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f9269ae6da9c..50c9d2598500 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c | |||
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst); | |||
157 | BFQ_BFQQ_FNS(coop); | 157 | BFQ_BFQQ_FNS(coop); |
158 | BFQ_BFQQ_FNS(split_coop); | 158 | BFQ_BFQQ_FNS(split_coop); |
159 | BFQ_BFQQ_FNS(softrt_update); | 159 | BFQ_BFQQ_FNS(softrt_update); |
160 | BFQ_BFQQ_FNS(has_waker); | ||
160 | #undef BFQ_BFQQ_FNS \ | 161 | #undef BFQ_BFQQ_FNS \ |
161 | 162 | ||
162 | /* Expiration time of sync (0) and async (1) requests, in ns. */ | 163 | /* Expiration time of sync (0) and async (1) requests, in ns. */ |
@@ -1427,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd) | |||
1427 | * mechanism may be re-designed in such a way to make it possible to | 1428 | * mechanism may be re-designed in such a way to make it possible to |
1428 | * know whether preemption is needed without needing to update service | 1429 | * know whether preemption is needed without needing to update service |
1429 | * trees). In addition, queue preemptions almost always cause random | 1430 | * trees). In addition, queue preemptions almost always cause random |
1430 | * I/O, and thus loss of throughput. Because of these facts, the next | 1431 | * I/O, which may in turn cause loss of throughput. Finally, there may |
1431 | * function adopts the following simple scheme to avoid both costly | 1432 | * even be no in-service queue when the next function is invoked (so, |
1432 | * operations and too frequent preemptions: it requests the expiration | 1433 | * no queue to compare timestamps with). Because of these facts, the |
1433 | * of the in-service queue (unconditionally) only for queues that need | 1434 | * next function adopts the following simple scheme to avoid costly |
1434 | * to recover a hole, or that either are weight-raised or deserve to | 1435 | * operations, too frequent preemptions and too many dependencies on |
1435 | * be weight-raised. | 1436 | * the state of the scheduler: it requests the expiration of the |
1437 | * in-service queue (unconditionally) only for queues that need to | ||
1438 | * recover a hole. Then it delegates to other parts of the code the | ||
1439 | * responsibility of handling the above case 2. | ||
1436 | */ | 1440 | */ |
1437 | static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, | 1441 | static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, |
1438 | struct bfq_queue *bfqq, | 1442 | struct bfq_queue *bfqq, |
1439 | bool arrived_in_time, | 1443 | bool arrived_in_time) |
1440 | bool wr_or_deserves_wr) | ||
1441 | { | 1444 | { |
1442 | struct bfq_entity *entity = &bfqq->entity; | 1445 | struct bfq_entity *entity = &bfqq->entity; |
1443 | 1446 | ||
@@ -1492,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, | |||
1492 | entity->budget = max_t(unsigned long, bfqq->max_budget, | 1495 | entity->budget = max_t(unsigned long, bfqq->max_budget, |
1493 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); | 1496 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); |
1494 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); | 1497 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
1495 | return wr_or_deserves_wr; | 1498 | return false; |
1496 | } | 1499 | } |
1497 | 1500 | ||
1498 | /* | 1501 | /* |
@@ -1610,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, | |||
1610 | bfqd->bfq_wr_min_idle_time); | 1613 | bfqd->bfq_wr_min_idle_time); |
1611 | } | 1614 | } |
1612 | 1615 | ||
1616 | |||
1617 | /* | ||
1618 | * Return true if bfqq is in a higher priority class, or has a higher | ||
1619 | * weight than the in-service queue. | ||
1620 | */ | ||
1621 | static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, | ||
1622 | struct bfq_queue *in_serv_bfqq) | ||
1623 | { | ||
1624 | int bfqq_weight, in_serv_weight; | ||
1625 | |||
1626 | if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class) | ||
1627 | return true; | ||
1628 | |||
1629 | if (in_serv_bfqq->entity.parent == bfqq->entity.parent) { | ||
1630 | bfqq_weight = bfqq->entity.weight; | ||
1631 | in_serv_weight = in_serv_bfqq->entity.weight; | ||
1632 | } else { | ||
1633 | if (bfqq->entity.parent) | ||
1634 | bfqq_weight = bfqq->entity.parent->weight; | ||
1635 | else | ||
1636 | bfqq_weight = bfqq->entity.weight; | ||
1637 | if (in_serv_bfqq->entity.parent) | ||
1638 | in_serv_weight = in_serv_bfqq->entity.parent->weight; | ||
1639 | else | ||
1640 | in_serv_weight = in_serv_bfqq->entity.weight; | ||
1641 | } | ||
1642 | |||
1643 | return bfqq_weight > in_serv_weight; | ||
1644 | } | ||
1645 | |||
1613 | static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, | 1646 | static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, |
1614 | struct bfq_queue *bfqq, | 1647 | struct bfq_queue *bfqq, |
1615 | int old_wr_coeff, | 1648 | int old_wr_coeff, |
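bfq_bfqq_higher_class_or_weight(), added above, ranks bfqq against the in-service queue: a numerically lower ioprio class wins outright; otherwise entity weights are compared when the two queues share a parent, and the parent group weights (or the entity weight for a parentless queue) are compared when they do not. A standalone sketch with simplified, hypothetical types shows why the group weight, not the per-queue weight, decides across groups:

```c
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for bfq_entity/bfq_queue; not the kernel types. */
struct entity { int weight; struct entity *parent; };
struct queue  { int ioprio_class; struct entity entity; };

/* Mirrors the decision made by bfq_bfqq_higher_class_or_weight(). */
static bool higher_class_or_weight(const struct queue *q,
				   const struct queue *in_serv)
{
	int qw, sw;

	if (q->ioprio_class < in_serv->ioprio_class)  /* lower value = higher class */
		return true;

	if (q->entity.parent == in_serv->entity.parent) {
		qw = q->entity.weight;
		sw = in_serv->entity.weight;
	} else {
		qw = q->entity.parent ? q->entity.parent->weight : q->entity.weight;
		sw = in_serv->entity.parent ? in_serv->entity.parent->weight
					    : in_serv->entity.weight;
	}
	return qw > sw;
}

int main(void)
{
	struct entity grp_a = { .weight = 500 }, grp_b = { .weight = 100 };
	struct queue q1 = { .ioprio_class = 2, .entity = { 100, &grp_a } };
	struct queue q2 = { .ioprio_class = 2, .entity = { 300, &grp_b } };

	/* Different groups: the group weights (500 vs 100) decide, so q1 wins
	 * even though its own entity weight (100) is lower than q2's (300). */
	printf("q1 outranks q2: %s\n",
	       higher_class_or_weight(&q1, &q2) ? "yes" : "no");
	return 0;
}
```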
@@ -1654,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, | |||
1654 | */ | 1687 | */ |
1655 | bfqq_wants_to_preempt = | 1688 | bfqq_wants_to_preempt = |
1656 | bfq_bfqq_update_budg_for_activation(bfqd, bfqq, | 1689 | bfq_bfqq_update_budg_for_activation(bfqd, bfqq, |
1657 | arrived_in_time, | 1690 | arrived_in_time); |
1658 | wr_or_deserves_wr); | ||
1659 | 1691 | ||
1660 | /* | 1692 | /* |
1661 | * If bfqq happened to be activated in a burst, but has been | 1693 | * If bfqq happened to be activated in a burst, but has been |
@@ -1720,21 +1752,111 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, | |||
1720 | 1752 | ||
1721 | /* | 1753 | /* |
1722 | * Expire in-service queue only if preemption may be needed | 1754 | * Expire in-service queue only if preemption may be needed |
1723 | * for guarantees. In this respect, the function | 1755 | * for guarantees. In particular, we care only about two |
1724 | * next_queue_may_preempt just checks a simple, necessary | 1756 | * cases. The first is that bfqq has to recover a service |
1725 | * condition, and not a sufficient condition based on | 1757 | * hole, as explained in the comments on |
1726 | * timestamps. In fact, for the latter condition to be | 1758 | * bfq_bfqq_update_budg_for_activation(), i.e., that |
1727 | * evaluated, timestamps would need first to be updated, and | 1759 | * bfqq_wants_to_preempt is true. However, if bfqq does not |
1728 | * this operation is quite costly (see the comments on the | 1760 | * carry time-critical I/O, then bfqq's bandwidth is less |
1729 | * function bfq_bfqq_update_budg_for_activation). | 1761 | * important than that of queues that carry time-critical I/O. |
1762 | * So, as a further constraint, we consider this case only if | ||
1763 | * bfqq is at least as weight-raised, i.e., at least as time | ||
1764 | * critical, as the in-service queue. | ||
1765 | * | ||
1766 | * The second case is that bfqq is in a higher priority class, | ||
1767 | * or has a higher weight than the in-service queue. If this | ||
1768 | * condition does not hold, we don't care because, even if | ||
1769 | * bfqq does not start to be served immediately, the resulting | ||
1770 | * delay for bfqq's I/O is however lower or much lower than | ||
1771 | * the ideal completion time to be guaranteed to bfqq's I/O. | ||
1772 | * | ||
1773 | * In both cases, preemption is needed only if, according to | ||
1774 | * the timestamps of both bfqq and of the in-service queue, | ||
1775 | * bfqq actually is the next queue to serve. So, to reduce | ||
1776 | * useless preemptions, the return value of | ||
1777 | * next_queue_may_preempt() is considered in the next compound | ||
1778 | * condition too. Yet next_queue_may_preempt() just checks a | ||
1779 | * simple, necessary condition for bfqq to be the next queue | ||
1780 | * to serve. In fact, to evaluate a sufficient condition, the | ||
1781 | * timestamps of the in-service queue would need to be | ||
1782 | * updated, and this operation is quite costly (see the | ||
1783 | * comments on bfq_bfqq_update_budg_for_activation()). | ||
1730 | */ | 1784 | */ |
1731 | if (bfqd->in_service_queue && bfqq_wants_to_preempt && | 1785 | if (bfqd->in_service_queue && |
1732 | bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && | 1786 | ((bfqq_wants_to_preempt && |
1787 | bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || | ||
1788 | bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && | ||
1733 | next_queue_may_preempt(bfqd)) | 1789 | next_queue_may_preempt(bfqd)) |
1734 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, | 1790 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, |
1735 | false, BFQQE_PREEMPTED); | 1791 | false, BFQQE_PREEMPTED); |
1736 | } | 1792 | } |
1737 | 1793 | ||
1794 | static void bfq_reset_inject_limit(struct bfq_data *bfqd, | ||
1795 | struct bfq_queue *bfqq) | ||
1796 | { | ||
1797 | /* invalidate baseline total service time */ | ||
1798 | bfqq->last_serv_time_ns = 0; | ||
1799 | |||
1800 | /* | ||
1801 | * Reset pointer in case we are waiting for | ||
1802 | * some request completion. | ||
1803 | */ | ||
1804 | bfqd->waited_rq = NULL; | ||
1805 | |||
1806 | /* | ||
1807 | * If bfqq has a short think time, then start by setting the | ||
1808 | * inject limit to 0 prudentially, because the service time of | ||
1809 | * an injected I/O request may be higher than the think time | ||
1810 | * of bfqq, and therefore, if one request was injected when | ||
1811 | * bfqq remains empty, this injected request might delay the | ||
1812 | * service of the next I/O request for bfqq significantly. In | ||
1813 | * case bfqq can actually tolerate some injection, then the | ||
1814 | * adaptive update will however raise the limit soon. This | ||
1815 | * lucky circumstance holds exactly because bfqq has a short | ||
1816 | * think time, and thus, after remaining empty, is likely to | ||
1817 | * get new I/O enqueued---and then completed---before being | ||
1818 | * expired. This is the very pattern that gives the | ||
1819 | * limit-update algorithm the chance to measure the effect of | ||
1820 | * injection on request service times, and then to update the | ||
1821 | * limit accordingly. | ||
1822 | * | ||
1823 | * However, in the following special case, the inject limit is | ||
1824 | * left to 1 even if the think time is short: bfqq's I/O is | ||
1825 | * synchronized with that of some other queue, i.e., bfqq may | ||
1826 | * receive new I/O only after the I/O of the other queue is | ||
1827 | * completed. Keeping the inject limit to 1 allows the | ||
1828 | * blocking I/O to be served while bfqq is in service. And | ||
1829 | * this is very convenient both for bfqq and for overall | ||
1830 | * throughput, as explained in detail in the comments in | ||
1831 | * bfq_update_has_short_ttime(). | ||
1832 | * | ||
1833 | * On the opposite end, if bfqq has a long think time, then | ||
1834 | * start directly by 1, because: | ||
1835 | * a) on the bright side, keeping at most one request in | ||
1836 | * service in the drive is unlikely to cause any harm to the | ||
1837 | * latency of bfqq's requests, as the service time of a single | ||
1838 | * request is likely to be lower than the think time of bfqq; | ||
1839 | * b) on the downside, after becoming empty, bfqq is likely to | ||
1840 | * expire before getting its next request. With this request | ||
1841 | * arrival pattern, it is very hard to sample total service | ||
1842 | * times and update the inject limit accordingly (see comments | ||
1843 | * on bfq_update_inject_limit()). So the limit is likely to be | ||
1844 | * never, or at least seldom, updated. As a consequence, by | ||
1845 | * setting the limit to 1, we avoid that no injection ever | ||
1846 | * occurs with bfqq. On the downside, this proactive step | ||
1847 | * further reduces chances to actually compute the baseline | ||
1848 | * total service time. Thus it reduces chances to execute the | ||
1849 | * limit-update algorithm and possibly raise the limit to more | ||
1850 | * than 1. | ||
1851 | */ | ||
1852 | if (bfq_bfqq_has_short_ttime(bfqq)) | ||
1853 | bfqq->inject_limit = 0; | ||
1854 | else | ||
1855 | bfqq->inject_limit = 1; | ||
1856 | |||
1857 | bfqq->decrease_time_jif = jiffies; | ||
1858 | } | ||
1859 | |||
1738 | static void bfq_add_request(struct request *rq) | 1860 | static void bfq_add_request(struct request *rq) |
1739 | { | 1861 | { |
1740 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | 1862 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
@@ -1749,77 +1871,119 @@ static void bfq_add_request(struct request *rq) | |||
1749 | 1871 | ||
1750 | if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { | 1872 | if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { |
1751 | /* | 1873 | /* |
1874 | * Detect whether bfqq's I/O seems synchronized with | ||
1875 | * that of some other queue, i.e., whether bfqq, after | ||
1876 | * remaining empty, happens to receive new I/O only | ||
1877 | * right after some I/O request of the other queue has | ||
1878 | * been completed. We call waker queue the other | ||
1879 | * queue, and we assume, for simplicity, that bfqq may | ||
1880 | * have at most one waker queue. | ||
1881 | * | ||
1882 | * A remarkable throughput boost can be reached by | ||
1883 | * unconditionally injecting the I/O of the waker | ||
1884 | * queue, every time a new bfq_dispatch_request | ||
1885 | * happens to be invoked while I/O is being plugged | ||
1886 | * for bfqq. In addition to boosting throughput, this | ||
1887 | * unblocks bfqq's I/O, thereby improving bandwidth | ||
1888 | * and latency for bfqq. Note that these same results | ||
1889 | * may be achieved with the general injection | ||
1890 | * mechanism, but less effectively. For details on | ||
1891 | * this aspect, see the comments on the choice of the | ||
1892 | * queue for injection in bfq_select_queue(). | ||
1893 | * | ||
1894 | * Turning back to the detection of a waker queue, a | ||
1895 | * queue Q is deemed as a waker queue for bfqq if, for | ||
1896 | * two consecutive times, bfqq happens to become non | ||
1897 | * empty right after a request of Q has been | ||
1898 | * completed. In particular, on the first time, Q is | ||
1899 | * tentatively set as a candidate waker queue, while | ||
1900 | * on the second time, the flag | ||
1901 | * bfq_bfqq_has_waker(bfqq) is set to confirm that Q | ||
1902 | * is a waker queue for bfqq. These detection steps | ||
1903 | * are performed only if bfqq has a long think time, | ||
1904 | * so as to make it more likely that bfqq's I/O is | ||
1905 | * actually being blocked by a synchronization. This | ||
1906 | * last filter, plus the above two-times requirement, | ||
1907 | * make false positives less likely. | ||
1908 | * | ||
1909 | * NOTE | ||
1910 | * | ||
1911 | * The sooner a waker queue is detected, the sooner | ||
1912 | * throughput can be boosted by injecting I/O from the | ||
1913 | * waker queue. Fortunately, detection is likely to be | ||
1914 | * actually fast, for the following reasons. While | ||
1915 | * blocked by synchronization, bfqq has a long think | ||
1916 | * time. This implies that bfqq's inject limit is at | ||
1917 | * least equal to 1 (see the comments in | ||
1918 | * bfq_update_inject_limit()). So, thanks to | ||
1919 | * injection, the waker queue is likely to be served | ||
1920 | * during the very first I/O-plugging time interval | ||
1921 | * for bfqq. This triggers the first step of the | ||
1922 | * detection mechanism. Thanks again to injection, the | ||
1923 | * candidate waker queue is then likely to be | ||
1924 | * confirmed no later than during the next | ||
1925 | * I/O-plugging interval for bfqq. | ||
1926 | */ | ||
1927 | if (!bfq_bfqq_has_short_ttime(bfqq) && | ||
1928 | ktime_get_ns() - bfqd->last_completion < | ||
1929 | 200 * NSEC_PER_USEC) { | ||
1930 | if (bfqd->last_completed_rq_bfqq != bfqq && | ||
1931 | bfqd->last_completed_rq_bfqq != | ||
1932 | bfqq->waker_bfqq) { | ||
1933 | /* | ||
1934 | * First synchronization detected with | ||
1935 | * a candidate waker queue, or with a | ||
1936 | * different candidate waker queue | ||
1937 | * from the current one. | ||
1938 | */ | ||
1939 | bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; | ||
1940 | |||
1941 | /* | ||
1942 | * If the waker queue disappears, then | ||
1943 | * bfqq->waker_bfqq must be reset. To | ||
1944 | * this goal, we maintain in each | ||
1945 | * waker queue a list, woken_list, of | ||
1946 | * all the queues that reference the | ||
1947 | * waker queue through their | ||
1948 | * waker_bfqq pointer. When the waker | ||
1949 | * queue exits, the waker_bfqq pointer | ||
1950 | * of all the queues in the woken_list | ||
1951 | * is reset. | ||
1952 | * | ||
1953 | * In addition, if bfqq is already in | ||
1954 | * the woken_list of a waker queue, | ||
1955 | * then, before being inserted into | ||
1956 | * the woken_list of a new waker | ||
1957 | * queue, bfqq must be removed from | ||
1958 | * the woken_list of the old waker | ||
1959 | * queue. | ||
1960 | */ | ||
1961 | if (!hlist_unhashed(&bfqq->woken_list_node)) | ||
1962 | hlist_del_init(&bfqq->woken_list_node); | ||
1963 | hlist_add_head(&bfqq->woken_list_node, | ||
1964 | &bfqd->last_completed_rq_bfqq->woken_list); | ||
1965 | |||
1966 | bfq_clear_bfqq_has_waker(bfqq); | ||
1967 | } else if (bfqd->last_completed_rq_bfqq == | ||
1968 | bfqq->waker_bfqq && | ||
1969 | !bfq_bfqq_has_waker(bfqq)) { | ||
1970 | /* | ||
1971 | * synchronization with waker_bfqq | ||
1972 | * seen for the second time | ||
1973 | */ | ||
1974 | bfq_mark_bfqq_has_waker(bfqq); | ||
1975 | } | ||
1976 | } | ||
1977 | |||
1978 | /* | ||
1752 | * Periodically reset inject limit, to make sure that | 1979 | * Periodically reset inject limit, to make sure that |
1753 | * the latter eventually drops in case workload | 1980 | * the latter eventually drops in case workload |
1754 | * changes, see step (3) in the comments on | 1981 | * changes, see step (3) in the comments on |
1755 | * bfq_update_inject_limit(). | 1982 | * bfq_update_inject_limit(). |
1756 | */ | 1983 | */ |
1757 | if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + | 1984 | if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + |
1758 | msecs_to_jiffies(1000))) { | 1985 | msecs_to_jiffies(1000))) |
1759 | /* invalidate baseline total service time */ | 1986 | bfq_reset_inject_limit(bfqd, bfqq); |
1760 | bfqq->last_serv_time_ns = 0; | ||
1761 | |||
1762 | /* | ||
1763 | * Reset pointer in case we are waiting for | ||
1764 | * some request completion. | ||
1765 | */ | ||
1766 | bfqd->waited_rq = NULL; | ||
1767 | |||
1768 | /* | ||
1769 | * If bfqq has a short think time, then start | ||
1770 | * by setting the inject limit to 0 | ||
1771 | * prudentially, because the service time of | ||
1772 | * an injected I/O request may be higher than | ||
1773 | * the think time of bfqq, and therefore, if | ||
1774 | * one request was injected when bfqq remains | ||
1775 | * empty, this injected request might delay | ||
1776 | * the service of the next I/O request for | ||
1777 | * bfqq significantly. In case bfqq can | ||
1778 | * actually tolerate some injection, then the | ||
1779 | * adaptive update will however raise the | ||
1780 | * limit soon. This lucky circumstance holds | ||
1781 | * exactly because bfqq has a short think | ||
1782 | * time, and thus, after remaining empty, is | ||
1783 | * likely to get new I/O enqueued---and then | ||
1784 | * completed---before being expired. This is | ||
1785 | * the very pattern that gives the | ||
1786 | * limit-update algorithm the chance to | ||
1787 | * measure the effect of injection on request | ||
1788 | * service times, and then to update the limit | ||
1789 | * accordingly. | ||
1790 | * | ||
1791 | * On the opposite end, if bfqq has a long | ||
1792 | * think time, then start directly by 1, | ||
1793 | * because: | ||
1794 | * a) on the bright side, keeping at most one | ||
1795 | * request in service in the drive is unlikely | ||
1796 | * to cause any harm to the latency of bfqq's | ||
1797 | * requests, as the service time of a single | ||
1798 | * request is likely to be lower than the | ||
1799 | * think time of bfqq; | ||
1800 | * b) on the downside, after becoming empty, | ||
1801 | * bfqq is likely to expire before getting its | ||
1802 | * next request. With this request arrival | ||
1803 | * pattern, it is very hard to sample total | ||
1804 | * service times and update the inject limit | ||
1805 | * accordingly (see comments on | ||
1806 | * bfq_update_inject_limit()). So the limit is | ||
1807 | * likely to be never, or at least seldom, | ||
1808 | * updated. As a consequence, by setting the | ||
1809 | * limit to 1, we avoid that no injection ever | ||
1810 | * occurs with bfqq. On the downside, this | ||
1811 | * proactive step further reduces chances to | ||
1812 | * actually compute the baseline total service | ||
1813 | * time. Thus it reduces chances to execute the | ||
1814 | * limit-update algorithm and possibly raise the | ||
1815 | * limit to more than 1. | ||
1816 | */ | ||
1817 | if (bfq_bfqq_has_short_ttime(bfqq)) | ||
1818 | bfqq->inject_limit = 0; | ||
1819 | else | ||
1820 | bfqq->inject_limit = 1; | ||
1821 | bfqq->decrease_time_jif = jiffies; | ||
1822 | } | ||
1823 | 1987 | ||
1824 | /* | 1988 | /* |
1825 | * The following conditions must hold to setup a new | 1989 | * The following conditions must hold to setup a new |
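Waker detection in the hunk above is a two-step confirmation, gated on bfqq having a long think time: if bfqq turns non-empty within 200 us of some other queue's request completing, that queue first becomes the candidate waker_bfqq, and only a second such observation for the same candidate sets the has_waker flag (the woken_list bookkeeping for waker teardown is omitted here). A compact userspace sketch of just that state machine, with hypothetical types and caller-supplied nanosecond timestamps, could read:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WAKER_WINDOW_NS (200 * 1000ULL)   /* 200 us, as in the patch */

/* Minimal stand-in for the per-queue waker-tracking state. */
struct wqueue {
	int id;
	const struct wqueue *waker;  /* candidate waker queue */
	bool has_waker;              /* candidate confirmed */
};

/*
 * Called when @q becomes non-empty at @now_ns; @last_completed is the queue
 * whose request completed most recently, at @last_completion_ns.
 */
static void detect_waker(struct wqueue *q,
			 const struct wqueue *last_completed,
			 uint64_t now_ns, uint64_t last_completion_ns)
{
	if (!last_completed || now_ns - last_completion_ns >= WAKER_WINDOW_NS)
		return;

	if (last_completed != q && last_completed != q->waker) {
		/* first sighting: remember a (new) candidate, drop confirmation */
		q->waker = last_completed;
		q->has_waker = false;
	} else if (last_completed == q->waker && !q->has_waker) {
		/* second consecutive sighting of the same queue: confirm */
		q->has_waker = true;
	}
}

int main(void)
{
	struct wqueue bfqq = { .id = 1 }, other = { .id = 2 };

	detect_waker(&bfqq, &other, 1000150, 1000000); /* candidate set */
	detect_waker(&bfqq, &other, 2000100, 2000000); /* candidate confirmed */
	printf("has_waker: %s\n", bfqq.has_waker ? "yes" : "no");
	return 0;
}
```

Once confirmed, the kernel uses the flag to inject I/O from the waker queue while bfqq's own I/O is plugged, as the comments in the hunk explain.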
@@ -2027,7 +2191,8 @@ static void bfq_remove_request(struct request_queue *q, | |||
2027 | 2191 | ||
2028 | } | 2192 | } |
2029 | 2193 | ||
2030 | static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | 2194 | static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, |
2195 | unsigned int nr_segs) | ||
2031 | { | 2196 | { |
2032 | struct request_queue *q = hctx->queue; | 2197 | struct request_queue *q = hctx->queue; |
2033 | struct bfq_data *bfqd = q->elevator->elevator_data; | 2198 | struct bfq_data *bfqd = q->elevator->elevator_data; |
@@ -2050,7 +2215,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | |||
2050 | bfqd->bio_bfqq = NULL; | 2215 | bfqd->bio_bfqq = NULL; |
2051 | bfqd->bio_bic = bic; | 2216 | bfqd->bio_bic = bic; |
2052 | 2217 | ||
2053 | ret = blk_mq_sched_try_merge(q, bio, &free); | 2218 | ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); |
2054 | 2219 | ||
2055 | if (free) | 2220 | if (free) |
2056 | blk_mq_free_request(free); | 2221 | blk_mq_free_request(free); |
@@ -2513,6 +2678,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) | |||
2513 | * to enjoy weight raising if split soon. | 2678 | * to enjoy weight raising if split soon. |
2514 | */ | 2679 | */ |
2515 | bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; | 2680 | bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; |
2681 | bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); | ||
2516 | bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); | 2682 | bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); |
2517 | bic->saved_last_wr_start_finish = jiffies; | 2683 | bic->saved_last_wr_start_finish = jiffies; |
2518 | } else { | 2684 | } else { |
@@ -3045,7 +3211,186 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) | |||
3045 | bfq_remove_request(q, rq); | 3211 | bfq_remove_request(q, rq); |
3046 | } | 3212 | } |
3047 | 3213 | ||
3048 | static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) | 3214 | /* |
3215 | * There is a case where idling does not have to be performed for | ||
3216 | * throughput concerns, but to preserve the throughput share of | ||
3217 | * the process associated with bfqq. | ||
3218 | * | ||
3219 | * To introduce this case, we can note that allowing the drive | ||
3220 | * to enqueue more than one request at a time, and hence | ||
3221 | * delegating de facto final scheduling decisions to the | ||
3222 | * drive's internal scheduler, entails loss of control on the | ||
3223 | * actual request service order. In particular, the critical | ||
3224 | * situation is when requests from different processes happen | ||
3225 | * to be present, at the same time, in the internal queue(s) | ||
3226 | * of the drive. In such a situation, the drive, by deciding | ||
3227 | * the service order of the internally-queued requests, does | ||
3228 | * determine also the actual throughput distribution among | ||
3229 | * these processes. But the drive typically has no notion or | ||
3230 | * concern about per-process throughput distribution, and | ||
3231 | * makes its decisions only on a per-request basis. Therefore, | ||
3232 | * the service distribution enforced by the drive's internal | ||
3233 | * scheduler is likely to coincide with the desired throughput | ||
3234 | * distribution only in a completely symmetric, or favorably | ||
3235 | * skewed scenario where: | ||
3236 | * (i-a) each of these processes must get the same throughput as | ||
3237 | * the others, | ||
3238 | * (i-b) in case (i-a) does not hold, it holds that the process | ||
3239 | * associated with bfqq must receive a lower or equal | ||
3240 | * throughput than any of the other processes; | ||
3241 | * (ii) the I/O of each process has the same properties, in | ||
3242 | * terms of locality (sequential or random), direction | ||
3243 | * (reads or writes), request sizes, greediness | ||
3244 | * (from I/O-bound to sporadic), and so on; | ||
3245 | |||
3246 | * In fact, in such a scenario, the drive tends to treat the requests | ||
3247 | * of each process in about the same way as the requests of the | ||
3248 | * others, and thus to provide each of these processes with about the | ||
3249 | * same throughput. This is exactly the desired throughput | ||
3250 | * distribution if (i-a) holds, or, if (i-b) holds instead, this is an | ||
3251 | * even more convenient distribution for (the process associated with) | ||
3252 | * bfqq. | ||
3253 | * | ||
3254 | * In contrast, in any asymmetric or unfavorable scenario, device | ||
3255 | * idling (I/O-dispatch plugging) is certainly needed to guarantee | ||
3256 | * that bfqq receives its assigned fraction of the device throughput | ||
3257 | * (see [1] for details). | ||
3258 | * | ||
3259 | * The problem is that idling may significantly reduce throughput with | ||
3260 | * certain combinations of types of I/O and devices. An important | ||
3261 | * example is sync random I/O on flash storage with command | ||
3262 | * queueing. So, unless bfqq falls in cases where idling also boosts | ||
3263 | * throughput, it is important to check conditions (i-a), i(-b) and | ||
3264 | * (ii) accurately, so as to avoid idling when not strictly needed for | ||
3265 | * service guarantees. | ||
3266 | * | ||
3267 | * Unfortunately, it is extremely difficult to thoroughly check | ||
3268 | * condition (ii). And, in case there are active groups, it becomes | ||
3269 | * very difficult to check conditions (i-a) and (i-b) too. In fact, | ||
3270 | * if there are active groups, then, for conditions (i-a) or (i-b) to | ||
3271 | * become false 'indirectly', it is enough that an active group | ||
3272 | * contains more active processes or sub-groups than some other active | ||
3273 | * group. More precisely, for conditions (i-a) or (i-b) to become | ||
3274 | * false because of such a group, it is not even necessary that the | ||
3275 | * group is (still) active: it is sufficient that, even if the group | ||
3276 | * has become inactive, some of its descendant processes still have | ||
3277 | * some request already dispatched but still waiting for | ||
3278 | * completion. In fact, requests have still to be guaranteed their | ||
3279 | * share of the throughput even after being dispatched. In this | ||
3280 | * respect, it is easy to show that, if a group frequently becomes | ||
3281 | * inactive while still having in-flight requests, and if, when this | ||
3282 | * happens, the group is not considered in the calculation of whether | ||
3283 | * the scenario is asymmetric, then the group may fail to be | ||
3284 | * guaranteed its fair share of the throughput (basically because | ||
3285 | * idling may not be performed for the descendant processes of the | ||
3286 | * group, but it had to be). We address this issue with the following | ||
3287 | * bi-modal behavior, implemented in the function | ||
3288 | * bfq_asymmetric_scenario(). | ||
3289 | * | ||
3290 | * If there are groups with requests waiting for completion | ||
3291 | * (as commented above, some of these groups may even be | ||
3292 | * already inactive), then the scenario is tagged as | ||
3293 | * asymmetric, conservatively, without checking any of the | ||
3294 | * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. | ||
3295 | * This behavior matches also the fact that groups are created | ||
3296 | * exactly if controlling I/O is a primary concern (to | ||
3297 | * preserve bandwidth and latency guarantees). | ||
3298 | * | ||
3299 | * On the opposite end, if there are no groups with requests waiting | ||
3300 | * for completion, then only conditions (i-a) and (i-b) are actually | ||
3301 | * controlled, i.e., provided that conditions (i-a) or (i-b) holds, | ||
3302 | * idling is not performed, regardless of whether condition (ii) | ||
3303 | * holds. In other words, only if conditions (i-a) and (i-b) do not | ||
3304 | * hold, then idling is allowed, and the device tends to be prevented | ||
3305 | * from queueing many requests, possibly of several processes. Since | ||
3306 | * there are no groups with requests waiting for completion, then, to | ||
3307 | * control conditions (i-a) and (i-b) it is enough to check just | ||
3308 | * whether all the queues with requests waiting for completion also | ||
3309 | * have the same weight. | ||
3310 | * | ||
3311 | * Not checking condition (ii) evidently exposes bfqq to the | ||
3312 | * risk of getting less throughput than its fair share. | ||
3313 | * However, for queues with the same weight, a further | ||
3314 | * mechanism, preemption, mitigates or even eliminates this | ||
3315 | * problem. And it does so without consequences on overall | ||
3316 | * throughput. This mechanism and its benefits are explained | ||
3317 | * in the next three paragraphs. | ||
3318 | * | ||
3319 | * Even if a queue, say Q, is expired when it remains idle, Q | ||
3320 | * can still preempt the new in-service queue if the next | ||
3321 | * request of Q arrives soon (see the comments on | ||
3322 | * bfq_bfqq_update_budg_for_activation). If all queues and | ||
3323 | * groups have the same weight, this form of preemption, | ||
3324 | * combined with the hole-recovery heuristic described in the | ||
3325 | * comments on function bfq_bfqq_update_budg_for_activation, | ||
3326 | * are enough to preserve a correct bandwidth distribution in | ||
3327 | * the mid term, even without idling. In fact, even if not | ||
3328 | * idling allows the internal queues of the device to contain | ||
3329 | * many requests, and thus to reorder requests, we can rather | ||
3330 | * safely assume that the internal scheduler still preserves a | ||
3331 | * minimum of mid-term fairness. | ||
3332 | * | ||
3333 | * More precisely, this preemption-based, idleless approach | ||
3334 | * provides fairness in terms of IOPS, and not sectors per | ||
3335 | * second. This can be seen with a simple example. Suppose | ||
3336 | * that there are two queues with the same weight, but that | ||
3337 | * the first queue receives requests of 8 sectors, while the | ||
3338 | * second queue receives requests of 1024 sectors. In | ||
3339 | * addition, suppose that each of the two queues contains at | ||
3340 | * most one request at a time, which implies that each queue | ||
3341 | * always remains idle after it is served. Finally, after | ||
3342 | * remaining idle, each queue receives very quickly a new | ||
3343 | * request. It follows that the two queues are served | ||
3344 | * alternately, preempting each other if needed. This | ||
3345 | * implies that, although both queues have the same weight, | ||
3346 | * the queue with large requests receives a service that is | ||
3347 | * 1024/8 times as high as the service received by the other | ||
3348 | * queue. | ||
3349 | * | ||
3350 | * The motivation for using preemption instead of idling (for | ||
3351 | * queues with the same weight) is that, by not idling, | ||
3352 | * service guarantees are preserved (completely or at least in | ||
3353 | * part) without sacrificing throughput even minimally. And, if | ||
3354 | * there is no active group, then the primary expectation for | ||
3355 | * this device is probably a high throughput. | ||
3356 | * | ||
3357 | * We are now left only with explaining the additional | ||
3358 | * compound condition that is checked below for deciding | ||
3359 | * whether the scenario is asymmetric. To explain this | ||
3360 | * compound condition, we need to add that the function | ||
3361 | * bfq_asymmetric_scenario checks the weights of only | ||
3362 | * non-weight-raised queues, for efficiency reasons (see | ||
3363 | * comments on bfq_weights_tree_add()). Then the fact that | ||
3364 | * bfqq is weight-raised is checked explicitly here. More | ||
3365 | * precisely, the compound condition below takes into account | ||
3366 | * also the fact that, even if bfqq is being weight-raised, | ||
3367 | * the scenario is still symmetric if all queues with requests | ||
3368 | * waiting for completion happen to be | ||
3369 | * weight-raised. Actually, we should be even more precise | ||
3370 | * here, and differentiate between interactive weight raising | ||
3371 | * and soft real-time weight raising. | ||
3372 | * | ||
3373 | * As a side note, it is worth considering that the above | ||
3374 | * device-idling countermeasures may however fail in the | ||
3375 | * following unlucky scenario: if idling is (correctly) | ||
3376 | * disabled in a time period during which all symmetry | ||
3377 | * sub-conditions hold, and hence the device is allowed to | ||
3378 | * enqueue many requests, but at some later point in time some | ||
3379 | * sub-condition ceases to hold, then it may become impossible | ||
3380 | * to let requests be served in the desired order until all | ||
3381 | * the requests already queued in the device have been served. | ||
3382 | */ | ||
3383 | static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, | ||
3384 | struct bfq_queue *bfqq) | ||
3385 | { | ||
3386 | return (bfqq->wr_coeff > 1 && | ||
3387 | bfqd->wr_busy_queues < | ||
3388 | bfq_tot_busy_queues(bfqd)) || | ||
3389 | bfq_asymmetric_scenario(bfqd, bfqq); | ||
3390 | } | ||
3391 | |||
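As a rough illustration of the compound condition above, here is a minimal userspace C sketch of idling_needed_for_service_guarantees(); the struct fields and the asymmetric flag are simplified stand-ins for bfq_data/bfq_queue state and for bfq_asymmetric_scenario(), not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for bfq_queue/bfq_data state (not kernel types). */
struct model_queue {
	unsigned int wr_coeff;          /* > 1 means the queue is weight-raised */
};

struct model_data {
	unsigned int wr_busy_queues;    /* busy weight-raised queues */
	unsigned int tot_busy_queues;   /* all busy queues */
	bool asymmetric;                /* outcome of the weight/group checks */
};

/* Idle if bfqq is weight-raised while some busy queue is not, or if the
 * scenario is asymmetric for any other reason. */
static bool needs_idling(const struct model_data *d, const struct model_queue *q)
{
	return (q->wr_coeff > 1 && d->wr_busy_queues < d->tot_busy_queues) ||
		d->asymmetric;
}

int main(void)
{
	struct model_data d = { .wr_busy_queues = 1, .tot_busy_queues = 3 };
	struct model_queue q = { .wr_coeff = 30 };

	printf("idle for bfqq: %d\n", needs_idling(&d, &q)); /* 1: bfqq must be protected */
	return 0;
}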
3392 | static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
3393 | enum bfqq_expiration reason) | ||
3049 | { | 3394 | { |
3050 | /* | 3395 | /* |
3051 | * If this bfqq is shared between multiple processes, check | 3396 | * If this bfqq is shared between multiple processes, check |
@@ -3056,7 +3401,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) | |||
3056 | if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) | 3401 | if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
3057 | bfq_mark_bfqq_split_coop(bfqq); | 3402 | bfq_mark_bfqq_split_coop(bfqq); |
3058 | 3403 | ||
3059 | if (RB_EMPTY_ROOT(&bfqq->sort_list)) { | 3404 | /* |
3405 | * Consider queues with a higher finish virtual time than | ||
3406 | * bfqq. If idling_needed_for_service_guarantees(bfqq) returns | ||
3407 | * true, then bfqq's bandwidth would be violated if an | ||
3408 | * uncontrolled amount of I/O from these queues were | ||
3409 | * dispatched while bfqq is waiting for its new I/O to | ||
3410 | * arrive. This is exactly what may happen if this is a forced | ||
3411 | * expiration caused by a preemption attempt, and if bfqq is | ||
3412 | * not re-scheduled. To prevent this from happening, re-queue | ||
3413 | * bfqq if it needs I/O-dispatch plugging, even if it is | ||
3414 | * empty. By doing so, bfqq is guaranteed to be served before the | ||
3415 | * above queues (provided that bfqq is of course eligible). | ||
3416 | */ | ||
3417 | if (RB_EMPTY_ROOT(&bfqq->sort_list) && | ||
3418 | !(reason == BFQQE_PREEMPTED && | ||
3419 | idling_needed_for_service_guarantees(bfqd, bfqq))) { | ||
3060 | if (bfqq->dispatched == 0) | 3420 | if (bfqq->dispatched == 0) |
3061 | /* | 3421 | /* |
3062 | * Overloading budget_timeout field to store | 3422 | * Overloading budget_timeout field to store |
@@ -3073,7 +3433,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) | |||
3073 | * Resort priority tree of potential close cooperators. | 3433 | * Resort priority tree of potential close cooperators. |
3074 | * See comments on bfq_pos_tree_add_move() for the unlikely(). | 3434 | * See comments on bfq_pos_tree_add_move() for the unlikely(). |
3075 | */ | 3435 | */ |
3076 | if (unlikely(!bfqd->nonrot_with_queueing)) | 3436 | if (unlikely(!bfqd->nonrot_with_queueing && |
3437 | !RB_EMPTY_ROOT(&bfqq->sort_list))) | ||
3077 | bfq_pos_tree_add_move(bfqd, bfqq); | 3438 | bfq_pos_tree_add_move(bfqd, bfqq); |
3078 | } | 3439 | } |
3079 | 3440 | ||
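A minimal userspace sketch of the new re-queue decision, with placeholder names for the expiration reason and for the two predicates involved (not the kernel types).

#include <stdbool.h>
#include <stdio.h>

enum expiration_reason { EXP_BUDGET_TIMEOUT, EXP_PREEMPTED }; /* placeholder subset */

/* Mirror of the condition above: an empty queue is removed from the
 * service tree only if it was NOT preempted while needing plugging. */
static bool remove_from_service_tree(bool queue_empty, bool needs_plugging,
				     enum expiration_reason reason)
{
	return queue_empty &&
	       !(reason == EXP_PREEMPTED && needs_plugging);
}

int main(void)
{
	/* preempted while plugging is needed: keep it queued (prints 0) */
	printf("%d\n", remove_from_service_tree(true, true, EXP_PREEMPTED));
	/* ordinary budget timeout on an empty queue: remove it (prints 1) */
	printf("%d\n", remove_from_service_tree(true, false, EXP_BUDGET_TIMEOUT));
	return 0;
}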
@@ -3574,7 +3935,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, | |||
3574 | * reason. | 3935 | * reason. |
3575 | */ | 3936 | */ |
3576 | __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); | 3937 | __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
3577 | if (__bfq_bfqq_expire(bfqd, bfqq)) | 3938 | if (__bfq_bfqq_expire(bfqd, bfqq, reason)) |
3578 | /* bfqq is gone, no more actions on it */ | 3939 | /* bfqq is gone, no more actions on it */ |
3579 | return; | 3940 | return; |
3580 | 3941 | ||
@@ -3721,184 +4082,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, | |||
3721 | } | 4082 | } |
3722 | 4083 | ||
3723 | /* | 4084 | /* |
3724 | * There is a case where idling does not have to be performed for | ||
3725 | * throughput concerns, but to preserve the throughput share of | ||
3726 | * the process associated with bfqq. | ||
3727 | * | ||
3728 | * To introduce this case, we can note that allowing the drive | ||
3729 | * to enqueue more than one request at a time, and hence | ||
3730 | * delegating de facto final scheduling decisions to the | ||
3731 | * drive's internal scheduler, entails loss of control on the | ||
3732 | * actual request service order. In particular, the critical | ||
3733 | * situation is when requests from different processes happen | ||
3734 | * to be present, at the same time, in the internal queue(s) | ||
3735 | * of the drive. In such a situation, the drive, by deciding | ||
3736 | * the service order of the internally-queued requests, does | ||
3737 | * determine also the actual throughput distribution among | ||
3738 | * these processes. But the drive typically has no notion or | ||
3739 | * concern about per-process throughput distribution, and | ||
3740 | * makes its decisions only on a per-request basis. Therefore, | ||
3741 | * the service distribution enforced by the drive's internal | ||
3742 | * scheduler is likely to coincide with the desired throughput | ||
3743 | * distribution only in a completely symmetric, or favorably | ||
3744 | * skewed scenario where: | ||
3745 | * (i-a) each of these processes must get the same throughput as | ||
3746 | * the others, | ||
3747 | * (i-b) in case (i-a) does not hold, it holds that the process | ||
3748 | * associated with bfqq must receive a lower or equal | ||
3749 | * throughput than any of the other processes; | ||
3750 | * (ii) the I/O of each process has the same properties, in | ||
3751 | * terms of locality (sequential or random), direction | ||
3752 | * (reads or writes), request sizes, greediness | ||
3753 | * (from I/O-bound to sporadic), and so on; | ||
3754 | |||
3755 | * In fact, in such a scenario, the drive tends to treat the requests | ||
3756 | * of each process in about the same way as the requests of the | ||
3757 | * others, and thus to provide each of these processes with about the | ||
3758 | * same throughput. This is exactly the desired throughput | ||
3759 | * distribution if (i-a) holds, or, if (i-b) holds instead, this is an | ||
3760 | * even more convenient distribution for (the process associated with) | ||
3761 | * bfqq. | ||
3762 | * | ||
3763 | * In contrast, in any asymmetric or unfavorable scenario, device | ||
3764 | * idling (I/O-dispatch plugging) is certainly needed to guarantee | ||
3765 | * that bfqq receives its assigned fraction of the device throughput | ||
3766 | * (see [1] for details). | ||
3767 | * | ||
3768 | * The problem is that idling may significantly reduce throughput with | ||
3769 | * certain combinations of types of I/O and devices. An important | ||
3770 | * example is sync random I/O on flash storage with command | ||
3771 | * queueing. So, unless bfqq falls in cases where idling also boosts | ||
3772 | * throughput, it is important to check conditions (i-a), (i-b) and | ||
3773 | * (ii) accurately, so as to avoid idling when not strictly needed for | ||
3774 | * service guarantees. | ||
3775 | * | ||
3776 | * Unfortunately, it is extremely difficult to thoroughly check | ||
3777 | * condition (ii). And, in case there are active groups, it becomes | ||
3778 | * very difficult to check conditions (i-a) and (i-b) too. In fact, | ||
3779 | * if there are active groups, then, for conditions (i-a) or (i-b) to | ||
3780 | * become false 'indirectly', it is enough that an active group | ||
3781 | * contains more active processes or sub-groups than some other active | ||
3782 | * group. More precisely, for conditions (i-a) or (i-b) to become | ||
3783 | * false because of such a group, it is not even necessary that the | ||
3784 | * group is (still) active: it is sufficient that, even if the group | ||
3785 | * has become inactive, some of its descendant processes still have | ||
3786 | * some request already dispatched but still waiting for | ||
3787 | * completion. In fact, requests have still to be guaranteed their | ||
3788 | * share of the throughput even after being dispatched. In this | ||
3789 | * respect, it is easy to show that, if a group frequently becomes | ||
3790 | * inactive while still having in-flight requests, and if, when this | ||
3791 | * happens, the group is not considered in the calculation of whether | ||
3792 | * the scenario is asymmetric, then the group may fail to be | ||
3793 | * guaranteed its fair share of the throughput (basically because | ||
3794 | * idling may not be performed for the descendant processes of the | ||
3795 | * group, but it had to be). We address this issue with the following | ||
3796 | * bi-modal behavior, implemented in the function | ||
3797 | * bfq_asymmetric_scenario(). | ||
3798 | * | ||
3799 | * If there are groups with requests waiting for completion | ||
3800 | * (as commented above, some of these groups may even be | ||
3801 | * already inactive), then the scenario is tagged as | ||
3802 | * asymmetric, conservatively, without checking any of the | ||
3803 | * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. | ||
3804 | * This behavior matches also the fact that groups are created | ||
3805 | * exactly if controlling I/O is a primary concern (to | ||
3806 | * preserve bandwidth and latency guarantees). | ||
3807 | * | ||
3808 | * On the opposite end, if there are no groups with requests waiting | ||
3809 | * for completion, then only conditions (i-a) and (i-b) are actually | ||
3810 | * controlled, i.e., provided that conditions (i-a) or (i-b) holds, | ||
3811 | * idling is not performed, regardless of whether condition (ii) | ||
3812 | * holds. In other words, only if conditions (i-a) and (i-b) do not | ||
3813 | * hold, then idling is allowed, and the device tends to be prevented | ||
3814 | * from queueing many requests, possibly of several processes. Since | ||
3815 | * there are no groups with requests waiting for completion, then, to | ||
3816 | * control conditions (i-a) and (i-b) it is enough to check just | ||
3817 | * whether all the queues with requests waiting for completion also | ||
3818 | * have the same weight. | ||
3819 | * | ||
3820 | * Not checking condition (ii) evidently exposes bfqq to the | ||
3821 | * risk of getting less throughput than its fair share. | ||
3822 | * However, for queues with the same weight, a further | ||
3823 | * mechanism, preemption, mitigates or even eliminates this | ||
3824 | * problem. And it does so without consequences on overall | ||
3825 | * throughput. This mechanism and its benefits are explained | ||
3826 | * in the next three paragraphs. | ||
3827 | * | ||
3828 | * Even if a queue, say Q, is expired when it remains idle, Q | ||
3829 | * can still preempt the new in-service queue if the next | ||
3830 | * request of Q arrives soon (see the comments on | ||
3831 | * bfq_bfqq_update_budg_for_activation). If all queues and | ||
3832 | * groups have the same weight, this form of preemption, | ||
3833 | * combined with the hole-recovery heuristic described in the | ||
3834 | * comments on function bfq_bfqq_update_budg_for_activation, | ||
3835 | * is enough to preserve a correct bandwidth distribution in | ||
3836 | * the mid term, even without idling. In fact, even if not | ||
3837 | * idling allows the internal queues of the device to contain | ||
3838 | * many requests, and thus to reorder requests, we can rather | ||
3839 | * safely assume that the internal scheduler still preserves a | ||
3840 | * minimum of mid-term fairness. | ||
3841 | * | ||
3842 | * More precisely, this preemption-based, idleless approach | ||
3843 | * provides fairness in terms of IOPS, and not sectors per | ||
3844 | * second. This can be seen with a simple example. Suppose | ||
3845 | * that there are two queues with the same weight, but that | ||
3846 | * the first queue receives requests of 8 sectors, while the | ||
3847 | * second queue receives requests of 1024 sectors. In | ||
3848 | * addition, suppose that each of the two queues contains at | ||
3849 | * most one request at a time, which implies that each queue | ||
3850 | * always remains idle after it is served. Finally, after | ||
3851 | * remaining idle, each queue receives very quickly a new | ||
3852 | * request. It follows that the two queues are served | ||
3853 | * alternately, preempting each other if needed. This | ||
3854 | * implies that, although both queues have the same weight, | ||
3855 | * the queue with large requests receives a service that is | ||
3856 | * 1024/8 times as high as the service received by the other | ||
3857 | * queue. | ||
3858 | * | ||
3859 | * The motivation for using preemption instead of idling (for | ||
3860 | * queues with the same weight) is that, by not idling, | ||
3861 | * service guarantees are preserved (completely or at least in | ||
3862 | * part) without sacrificing throughput even minimally. And, if | ||
3863 | * there is no active group, then the primary expectation for | ||
3864 | * this device is probably a high throughput. | ||
3865 | * | ||
3866 | * We are now left only with explaining the additional | ||
3867 | * compound condition that is checked below for deciding | ||
3868 | * whether the scenario is asymmetric. To explain this | ||
3869 | * compound condition, we need to add that the function | ||
3870 | * bfq_asymmetric_scenario checks the weights of only | ||
3871 | * non-weight-raised queues, for efficiency reasons (see | ||
3872 | * comments on bfq_weights_tree_add()). Then the fact that | ||
3873 | * bfqq is weight-raised is checked explicitly here. More | ||
3874 | * precisely, the compound condition below takes into account | ||
3875 | * also the fact that, even if bfqq is being weight-raised, | ||
3876 | * the scenario is still symmetric if all queues with requests | ||
3877 | * waiting for completion happen to be | ||
3878 | * weight-raised. Actually, we should be even more precise | ||
3879 | * here, and differentiate between interactive weight raising | ||
3880 | * and soft real-time weight raising. | ||
3881 | * | ||
3882 | * As a side note, it is worth considering that the above | ||
3883 | * device-idling countermeasures may however fail in the | ||
3884 | * following unlucky scenario: if idling is (correctly) | ||
3885 | * disabled in a time period during which all symmetry | ||
3886 | * sub-conditions hold, and hence the device is allowed to | ||
3887 | * enqueue many requests, but at some later point in time some | ||
3888 | * sub-condition ceases to hold, then it may become impossible | ||
3889 | * to let requests be served in the desired order until all | ||
3890 | * the requests already queued in the device have been served. | ||
3891 | */ | ||
3892 | static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, | ||
3893 | struct bfq_queue *bfqq) | ||
3894 | { | ||
3895 | return (bfqq->wr_coeff > 1 && | ||
3896 | bfqd->wr_busy_queues < | ||
3897 | bfq_tot_busy_queues(bfqd)) || | ||
3898 | bfq_asymmetric_scenario(bfqd, bfqq); | ||
3899 | } | ||
3900 | |||
3901 | /* | ||
3902 | * For a queue that becomes empty, device idling is allowed only if | 4085 | * For a queue that becomes empty, device idling is allowed only if |
3903 | * this function returns true for that queue. As a consequence, since | 4086 | * this function returns true for that queue. As a consequence, since |
3904 | * device idling plays a critical role for both throughput boosting | 4087 | * device idling plays a critical role for both throughput boosting |
@@ -4156,22 +4339,95 @@ check_queue: | |||
4156 | (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { | 4339 | (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { |
4157 | struct bfq_queue *async_bfqq = | 4340 | struct bfq_queue *async_bfqq = |
4158 | bfqq->bic && bfqq->bic->bfqq[0] && | 4341 | bfqq->bic && bfqq->bic->bfqq[0] && |
4159 | bfq_bfqq_busy(bfqq->bic->bfqq[0]) ? | 4342 | bfq_bfqq_busy(bfqq->bic->bfqq[0]) && |
4343 | bfqq->bic->bfqq[0]->next_rq ? | ||
4160 | bfqq->bic->bfqq[0] : NULL; | 4344 | bfqq->bic->bfqq[0] : NULL; |
4161 | 4345 | ||
4162 | /* | 4346 | /* |
4163 | * If the process associated with bfqq has also async | 4347 | * The next three mutually-exclusive ifs decide |
4164 | * I/O pending, then inject it | 4348 | * whether to try injection, and choose the queue to |
4165 | * unconditionally. Injecting I/O from the same | 4349 | * pick an I/O request from. |
4166 | * process can cause no harm to the process. On the | 4350 | * |
4167 | * contrary, it can only increase bandwidth and reduce | 4351 | * The first if checks whether the process associated |
4168 | * latency for the process. | 4352 | * with bfqq has also async I/O pending. If so, it |
4353 | * injects such I/O unconditionally. Injecting async | ||
4354 | * I/O from the same process can cause no harm to the | ||
4355 | * process. On the contrary, it can only increase | ||
4356 | * bandwidth and reduce latency for the process. | ||
4357 | * | ||
4358 | * The second if checks whether there happens to be a | ||
4359 | * non-empty waker queue for bfqq, i.e., a queue whose | ||
4360 | * I/O needs to be completed for bfqq to receive new | ||
4361 | * I/O. This happens, e.g., if bfqq is associated with | ||
4362 | * a process that does some sync. A sync generates | ||
4363 | * extra blocking I/O, which must be completed before | ||
4364 | * the process associated with bfqq can go on with its | ||
4365 | * I/O. If the I/O of the waker queue is not served, | ||
4366 | * then bfqq remains empty, and no I/O is dispatched, | ||
4367 | * until the idle timeout fires for bfqq. This is | ||
4368 | * likely to result in lower bandwidth and higher | ||
4369 | * latencies for bfqq, and in a severe loss of total | ||
4370 | * throughput. The best action to take is therefore to | ||
4371 | * serve the waker queue as soon as possible. So do it | ||
4372 | * (without relying on the third alternative below for | ||
4373 | * eventually serving waker_bfqq's I/O; see the last | ||
4374 | * paragraph for further details). This systematic | ||
4375 | * injection of I/O from the waker queue does not | ||
4376 | * cause any delay to bfqq's I/O. On the contrary, | ||
4377 | * bfqq's next I/O is brought forward dramatically, | ||
4378 | * for it is not blocked for milliseconds. | ||
4379 | * | ||
4380 | * The third if checks whether bfqq is a queue for | ||
4381 | * which it is better to avoid injection. It is so if | ||
4382 | * bfqq delivers more throughput when served without | ||
4383 | * any further I/O from other queues in the middle, or | ||
4384 | * if the service times of bfqq's I/O requests both | ||
4385 | * count more than overall throughput, and may be | ||
4386 | * easily increased by injection (this happens if bfqq | ||
4387 | * has a short think time). If none of these | ||
4388 | * conditions holds, then a candidate queue for | ||
4389 | * injection is looked for through | ||
4390 | * bfq_choose_bfqq_for_injection(). Note that the | ||
4391 | * latter may return NULL (for example if the inject | ||
4392 | * limit for bfqq is currently 0). | ||
4393 | * | ||
4394 | * NOTE: motivation for the second alternative | ||
4395 | * | ||
4396 | * Thanks to the way the inject limit is updated in | ||
4397 | * bfq_update_has_short_ttime(), it is rather likely | ||
4398 | * that, if I/O is being plugged for bfqq and the | ||
4399 | * waker queue has pending I/O requests that are | ||
4400 | * blocking bfqq's I/O, then the third alternative | ||
4401 | * above lets the waker queue get served before the | ||
4402 | * I/O-plugging timeout fires. So one may deem the | ||
4403 | * second alternative superfluous. It is not, because | ||
4404 | * the third alternative may be way less effective in | ||
4405 | * case of a synchronization. For two main | ||
4406 | * reasons. First, throughput may be low because the | ||
4407 | * inject limit may be too low to guarantee the same | ||
4408 | * amount of injected I/O, from the waker queue or | ||
4409 | * other queues, that the second alternative | ||
4410 | * guarantees (the second alternative unconditionally | ||
4411 | * injects a pending I/O request of the waker queue | ||
4412 | * for each bfq_dispatch_request()). Second, with the | ||
4413 | * third alternative, the duration of the plugging, | ||
4414 | * i.e., the time before bfqq finally receives new I/O, | ||
4415 | * may not be minimized, because the waker queue may | ||
4416 | * happen to be served only after other queues. | ||
4169 | */ | 4417 | */ |
4170 | if (async_bfqq && | 4418 | if (async_bfqq && |
4171 | icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && | 4419 | icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && |
4172 | bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= | 4420 | bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= |
4173 | bfq_bfqq_budget_left(async_bfqq)) | 4421 | bfq_bfqq_budget_left(async_bfqq)) |
4174 | bfqq = bfqq->bic->bfqq[0]; | 4422 | bfqq = bfqq->bic->bfqq[0]; |
4423 | else if (bfq_bfqq_has_waker(bfqq) && | ||
4424 | bfq_bfqq_busy(bfqq->waker_bfqq) && | ||
4425 | bfqq->next_rq && | ||
4426 | bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, | ||
4427 | bfqq->waker_bfqq) <= | ||
4428 | bfq_bfqq_budget_left(bfqq->waker_bfqq) | ||
4429 | ) | ||
4430 | bfqq = bfqq->waker_bfqq; | ||
4175 | else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && | 4431 | else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && |
4176 | (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || | 4432 | (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || |
4177 | !bfq_bfqq_has_short_ttime(bfqq))) | 4433 | !bfq_bfqq_has_short_ttime(bfqq))) |
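A rough userspace sketch of the three-way injection choice described in the comment above; the types and the fall-back helper are placeholders, and the real code additionally gates the fall-back on the idling heuristics.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cand {
	const char *name;
	bool has_next_rq;   /* a request is ready to dispatch */
	bool fits_budget;   /* its charge fits in the remaining budget */
};

static const struct cand *pick_injection(const struct cand *async_q,
					 const struct cand *waker_q,
					 const struct cand *fallback)
{
	/* 1) async I/O of the same process: injecting it can only help bfqq */
	if (async_q && async_q->has_next_rq && async_q->fits_budget)
		return async_q;
	/* 2) the waker queue: completing its I/O unblocks bfqq's new I/O */
	if (waker_q && waker_q->has_next_rq && waker_q->fits_budget)
		return waker_q;
	/* 3) otherwise a generic candidate (may be NULL, e.g. inject limit 0) */
	return fallback;
}

int main(void)
{
	struct cand waker = { "waker", true, true };
	const struct cand *c = pick_injection(NULL, &waker, NULL);

	printf("inject from: %s\n", c ? c->name : "nobody");
	return 0;
}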
@@ -4403,7 +4659,7 @@ exit: | |||
4403 | return rq; | 4659 | return rq; |
4404 | } | 4660 | } |
4405 | 4661 | ||
4406 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 4662 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
4407 | static void bfq_update_dispatch_stats(struct request_queue *q, | 4663 | static void bfq_update_dispatch_stats(struct request_queue *q, |
4408 | struct request *rq, | 4664 | struct request *rq, |
4409 | struct bfq_queue *in_serv_queue, | 4665 | struct bfq_queue *in_serv_queue, |
@@ -4453,7 +4709,7 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q, | |||
4453 | struct request *rq, | 4709 | struct request *rq, |
4454 | struct bfq_queue *in_serv_queue, | 4710 | struct bfq_queue *in_serv_queue, |
4455 | bool idle_timer_disabled) {} | 4711 | bool idle_timer_disabled) {} |
4456 | #endif | 4712 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
4457 | 4713 | ||
4458 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | 4714 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) |
4459 | { | 4715 | { |
@@ -4560,8 +4816,11 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq) | |||
4560 | 4816 | ||
4561 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) | 4817 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
4562 | { | 4818 | { |
4819 | struct bfq_queue *item; | ||
4820 | struct hlist_node *n; | ||
4821 | |||
4563 | if (bfqq == bfqd->in_service_queue) { | 4822 | if (bfqq == bfqd->in_service_queue) { |
4564 | __bfq_bfqq_expire(bfqd, bfqq); | 4823 | __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT); |
4565 | bfq_schedule_dispatch(bfqd); | 4824 | bfq_schedule_dispatch(bfqd); |
4566 | } | 4825 | } |
4567 | 4826 | ||
@@ -4569,6 +4828,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) | |||
4569 | 4828 | ||
4570 | bfq_put_cooperator(bfqq); | 4829 | bfq_put_cooperator(bfqq); |
4571 | 4830 | ||
4831 | /* remove bfqq from woken list */ | ||
4832 | if (!hlist_unhashed(&bfqq->woken_list_node)) | ||
4833 | hlist_del_init(&bfqq->woken_list_node); | ||
4834 | |||
4835 | /* reset waker for all queues in woken list */ | ||
4836 | hlist_for_each_entry_safe(item, n, &bfqq->woken_list, | ||
4837 | woken_list_node) { | ||
4838 | item->waker_bfqq = NULL; | ||
4839 | bfq_clear_bfqq_has_waker(item); | ||
4840 | hlist_del_init(&item->woken_list_node); | ||
4841 | } | ||
4842 | |||
4572 | bfq_put_queue(bfqq); /* release process reference */ | 4843 | bfq_put_queue(bfqq); /* release process reference */ |
4573 | } | 4844 | } |
4574 | 4845 | ||
@@ -4584,6 +4855,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) | |||
4584 | unsigned long flags; | 4855 | unsigned long flags; |
4585 | 4856 | ||
4586 | spin_lock_irqsave(&bfqd->lock, flags); | 4857 | spin_lock_irqsave(&bfqd->lock, flags); |
4858 | bfqq->bic = NULL; | ||
4587 | bfq_exit_bfqq(bfqd, bfqq); | 4859 | bfq_exit_bfqq(bfqd, bfqq); |
4588 | bic_set_bfqq(bic, NULL, is_sync); | 4860 | bic_set_bfqq(bic, NULL, is_sync); |
4589 | spin_unlock_irqrestore(&bfqd->lock, flags); | 4861 | spin_unlock_irqrestore(&bfqd->lock, flags); |
@@ -4687,6 +4959,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
4687 | RB_CLEAR_NODE(&bfqq->entity.rb_node); | 4959 | RB_CLEAR_NODE(&bfqq->entity.rb_node); |
4688 | INIT_LIST_HEAD(&bfqq->fifo); | 4960 | INIT_LIST_HEAD(&bfqq->fifo); |
4689 | INIT_HLIST_NODE(&bfqq->burst_list_node); | 4961 | INIT_HLIST_NODE(&bfqq->burst_list_node); |
4962 | INIT_HLIST_NODE(&bfqq->woken_list_node); | ||
4963 | INIT_HLIST_HEAD(&bfqq->woken_list); | ||
4690 | 4964 | ||
4691 | bfqq->ref = 0; | 4965 | bfqq->ref = 0; |
4692 | bfqq->bfqd = bfqd; | 4966 | bfqq->bfqd = bfqd; |
@@ -4854,7 +5128,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, | |||
4854 | struct bfq_queue *bfqq, | 5128 | struct bfq_queue *bfqq, |
4855 | struct bfq_io_cq *bic) | 5129 | struct bfq_io_cq *bic) |
4856 | { | 5130 | { |
4857 | bool has_short_ttime = true; | 5131 | bool has_short_ttime = true, state_changed; |
4858 | 5132 | ||
4859 | /* | 5133 | /* |
4860 | * No need to update has_short_ttime if bfqq is async or in | 5134 | * No need to update has_short_ttime if bfqq is async or in |
@@ -4879,13 +5153,102 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, | |||
4879 | bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) | 5153 | bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) |
4880 | has_short_ttime = false; | 5154 | has_short_ttime = false; |
4881 | 5155 | ||
4882 | bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", | 5156 | state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); |
4883 | has_short_ttime); | ||
4884 | 5157 | ||
4885 | if (has_short_ttime) | 5158 | if (has_short_ttime) |
4886 | bfq_mark_bfqq_has_short_ttime(bfqq); | 5159 | bfq_mark_bfqq_has_short_ttime(bfqq); |
4887 | else | 5160 | else |
4888 | bfq_clear_bfqq_has_short_ttime(bfqq); | 5161 | bfq_clear_bfqq_has_short_ttime(bfqq); |
5162 | |||
5163 | /* | ||
5164 | * Until the base value for the total service time gets | ||
5165 | * finally computed for bfqq, the inject limit does depend on | ||
5166 | * the think-time state (short|long). In particular, the limit | ||
5167 | * is 0 or 1 if the think time is deemed, respectively, as | ||
5168 | * short or long (details in the comments in | ||
5169 | * bfq_update_inject_limit()). Accordingly, the next | ||
5170 | * instructions reset the inject limit if the think-time state | ||
5171 | * has changed and the above base value is still to be | ||
5172 | * computed. | ||
5173 | * | ||
5174 | * However, the reset is performed only if more than 100 ms | ||
5175 | * have elapsed since the last update of the inject limit, or | ||
5176 | * (inclusive) if the change is from short to long think | ||
5177 | * time. The reason for this waiting is as follows. | ||
5178 | * | ||
5179 | * bfqq may have a long think time because of a | ||
5180 | * synchronization with some other queue, i.e., because the | ||
5181 | * I/O of some other queue may need to be completed for bfqq | ||
5182 | * to receive new I/O. Details in the comments on the choice | ||
5183 | * of the queue for injection in bfq_select_queue(). | ||
5184 | * | ||
5185 | * As stressed in those comments, if such a synchronization is | ||
5186 | * actually in place, then, without injection on bfqq, the | ||
5187 | * blocking I/O cannot happen to be served while bfqq is in | ||
5188 | * service. As a consequence, if bfqq is granted | ||
5189 | * I/O-dispatch-plugging, then bfqq remains empty, and no I/O | ||
5190 | * is dispatched, until the idle timeout fires. This is likely | ||
5191 | * to result in lower bandwidth and higher latencies for bfqq, | ||
5192 | * and in a severe loss of total throughput. | ||
5193 | * | ||
5194 | * On the opposite end, a non-zero inject limit may allow the | ||
5195 | * I/O that blocks bfqq to be executed soon, and therefore | ||
5196 | * bfqq to receive new I/O soon. | ||
5197 | * | ||
5198 | * But, if the blocking gets actually eliminated, then the | ||
5199 | * next think-time sample for bfqq may be very low. This in | ||
5200 | * turn may cause bfqq's think time to be deemed | ||
5201 | * short. Without the 100 ms barrier, this new state change | ||
5202 | * would cause the body of the next if to be executed | ||
5203 | * immediately. But this would set to 0 the inject | ||
5204 | * limit. Without injection, the blocking I/O would cause the | ||
5205 | * think time of bfqq to become long again, and therefore the | ||
5206 | * inject limit to be raised again, and so on. The only effect | ||
5207 | * of such a steady oscillation between the two think-time | ||
5208 | * states would be to prevent effective injection on bfqq. | ||
5209 | * | ||
5210 | * In contrast, if the inject limit is not reset during such a | ||
5211 | * long time interval as 100 ms, then the number of short | ||
5212 | * think time samples can grow significantly before the reset | ||
5213 | * is performed. As a consequence, the think time state can | ||
5214 | * become stable before the reset. Therefore there will be no | ||
5215 | * state change when the 100 ms elapse, and no reset of the | ||
5216 | * inject limit. The inject limit remains steadily equal to 1 | ||
5217 | * both during and after the 100 ms. So injection can be | ||
5218 | * performed at all times, and throughput gets boosted. | ||
5219 | * | ||
5220 | * An inject limit equal to 1 is however in conflict, in | ||
5221 | * general, with the fact that the think time of bfqq is | ||
5222 | * short, because injection may be likely to delay bfqq's I/O | ||
5223 | * (as explained in the comments in | ||
5224 | * bfq_update_inject_limit()). But this does not happen in | ||
5225 | * this special case, because bfqq's low think time is due to | ||
5226 | * an effective handling of a synchronization, through | ||
5227 | * injection. In this special case, bfqq's I/O does not get | ||
5228 | * delayed by injection; on the contrary, bfqq's I/O is | ||
5229 | * brought forward, because it is not blocked for | ||
5230 | * milliseconds. | ||
5231 | * | ||
5232 | * In addition, serving the blocking I/O much sooner, and much | ||
5233 | * more frequently than once per I/O-plugging timeout, makes | ||
5234 | * it much quicker to detect a waker queue (the concept of | ||
5235 | * waker queue is defined in the comments in | ||
5236 | * bfq_add_request()). This makes it possible to start sooner | ||
5237 | * to boost throughput more effectively, by injecting the I/O | ||
5238 | * of the waker queue unconditionally on every | ||
5239 | * bfq_dispatch_request(). | ||
5240 | * | ||
5241 | * One last, important benefit of not resetting the inject | ||
5242 | * limit before 100 ms is that, during this time interval, the | ||
5243 | * base value for the total service time is likely to get | ||
5244 | * finally computed for bfqq, freeing the inject limit from | ||
5245 | * its relation with the think time. | ||
5246 | */ | ||
5247 | if (state_changed && bfqq->last_serv_time_ns == 0 && | ||
5248 | (time_is_before_eq_jiffies(bfqq->decrease_time_jif + | ||
5249 | msecs_to_jiffies(100)) || | ||
5250 | !has_short_ttime)) | ||
5251 | bfq_reset_inject_limit(bfqd, bfqq); | ||
4889 | } | 5252 | } |
4890 | 5253 | ||
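A userspace sketch of the reset condition implemented at the end of the function above, with jiffies replaced by plain milliseconds and stand-in field names.

#include <stdbool.h>
#include <stdio.h>

struct model_bfqq {
	bool has_short_ttime;                  /* new think-time state */
	unsigned long long last_serv_time_ns;  /* 0: base service time unknown */
	unsigned long decrease_time_ms;        /* last inject-limit update */
};

/* Reset only while the base service time is unknown, and only if the state
 * changed AND (100 ms have elapsed since the last update OR the change is
 * from short to long think time). */
static bool reset_inject_limit(const struct model_bfqq *q, bool state_changed,
			       unsigned long now_ms)
{
	return state_changed && q->last_serv_time_ns == 0 &&
	       (now_ms - q->decrease_time_ms >= 100 || !q->has_short_ttime);
}

int main(void)
{
	struct model_bfqq q = { .has_short_ttime = true, .decrease_time_ms = 1000 };

	/* long -> short only 20 ms after the last update: no reset (prints 0) */
	printf("%d\n", reset_inject_limit(&q, true, 1020));
	/* same transition 150 ms later: reset (prints 1) */
	printf("%d\n", reset_inject_limit(&q, true, 1150));
	return 0;
}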
4891 | /* | 5254 | /* |
@@ -4895,19 +5258,9 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, | |||
4895 | static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, | 5258 | static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
4896 | struct request *rq) | 5259 | struct request *rq) |
4897 | { | 5260 | { |
4898 | struct bfq_io_cq *bic = RQ_BIC(rq); | ||
4899 | |||
4900 | if (rq->cmd_flags & REQ_META) | 5261 | if (rq->cmd_flags & REQ_META) |
4901 | bfqq->meta_pending++; | 5262 | bfqq->meta_pending++; |
4902 | 5263 | ||
4903 | bfq_update_io_thinktime(bfqd, bfqq); | ||
4904 | bfq_update_has_short_ttime(bfqd, bfqq, bic); | ||
4905 | bfq_update_io_seektime(bfqd, bfqq, rq); | ||
4906 | |||
4907 | bfq_log_bfqq(bfqd, bfqq, | ||
4908 | "rq_enqueued: has_short_ttime=%d (seeky %d)", | ||
4909 | bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); | ||
4910 | |||
4911 | bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); | 5264 | bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
4912 | 5265 | ||
4913 | if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { | 5266 | if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
@@ -4995,6 +5348,10 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) | |||
4995 | bfqq = new_bfqq; | 5348 | bfqq = new_bfqq; |
4996 | } | 5349 | } |
4997 | 5350 | ||
5351 | bfq_update_io_thinktime(bfqd, bfqq); | ||
5352 | bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq)); | ||
5353 | bfq_update_io_seektime(bfqd, bfqq, rq); | ||
5354 | |||
4998 | waiting = bfqq && bfq_bfqq_wait_request(bfqq); | 5355 | waiting = bfqq && bfq_bfqq_wait_request(bfqq); |
4999 | bfq_add_request(rq); | 5356 | bfq_add_request(rq); |
5000 | idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); | 5357 | idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); |
@@ -5007,7 +5364,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) | |||
5007 | return idle_timer_disabled; | 5364 | return idle_timer_disabled; |
5008 | } | 5365 | } |
5009 | 5366 | ||
5010 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 5367 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
5011 | static void bfq_update_insert_stats(struct request_queue *q, | 5368 | static void bfq_update_insert_stats(struct request_queue *q, |
5012 | struct bfq_queue *bfqq, | 5369 | struct bfq_queue *bfqq, |
5013 | bool idle_timer_disabled, | 5370 | bool idle_timer_disabled, |
@@ -5037,7 +5394,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q, | |||
5037 | struct bfq_queue *bfqq, | 5394 | struct bfq_queue *bfqq, |
5038 | bool idle_timer_disabled, | 5395 | bool idle_timer_disabled, |
5039 | unsigned int cmd_flags) {} | 5396 | unsigned int cmd_flags) {} |
5040 | #endif | 5397 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
5041 | 5398 | ||
5042 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | 5399 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
5043 | bool at_head) | 5400 | bool at_head) |
@@ -5200,6 +5557,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) | |||
5200 | 1UL<<(BFQ_RATE_SHIFT - 10)) | 5557 | 1UL<<(BFQ_RATE_SHIFT - 10)) |
5201 | bfq_update_rate_reset(bfqd, NULL); | 5558 | bfq_update_rate_reset(bfqd, NULL); |
5202 | bfqd->last_completion = now_ns; | 5559 | bfqd->last_completion = now_ns; |
5560 | bfqd->last_completed_rq_bfqq = bfqq; | ||
5203 | 5561 | ||
5204 | /* | 5562 | /* |
5205 | * If we are waiting to discover whether the request pattern | 5563 | * If we are waiting to discover whether the request pattern |
@@ -5397,8 +5755,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, | |||
5397 | * total service time, and there seem to be the right | 5755 | * total service time, and there seem to be the right |
5398 | * conditions to do it, or we can lower the last base value | 5756 | * conditions to do it, or we can lower the last base value |
5399 | * computed. | 5757 | * computed. |
5758 | * | ||
5759 | * NOTE: (bfqd->rq_in_driver == 1) means that no other I/O | ||
5760 | * request is in flight, because this function is in the code | ||
5761 | * path that handles the completion of a request of bfqq, and, | ||
5762 | * in particular, this function is executed before | ||
5763 | * bfqd->rq_in_driver is decremented in such a code path. | ||
5400 | */ | 5764 | */ |
5401 | if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) || | 5765 | if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || |
5402 | tot_time_ns < bfqq->last_serv_time_ns) { | 5766 | tot_time_ns < bfqq->last_serv_time_ns) { |
5403 | bfqq->last_serv_time_ns = tot_time_ns; | 5767 | bfqq->last_serv_time_ns = tot_time_ns; |
5404 | /* | 5768 | /* |
@@ -5406,7 +5770,18 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, | |||
5406 | * start trying injection. | 5770 | * start trying injection. |
5407 | */ | 5771 | */ |
5408 | bfqq->inject_limit = max_t(unsigned int, 1, old_limit); | 5772 | bfqq->inject_limit = max_t(unsigned int, 1, old_limit); |
5409 | } | 5773 | } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) |
5774 | /* | ||
5775 | * No I/O injected and no request still in service in | ||
5776 | * the drive: these are the exact conditions for | ||
5777 | * computing the base value of the total service time | ||
5778 | * for bfqq. So let's update this value, because it is | ||
5779 | * rather variable. For example, it varies if the size | ||
5780 | * or the spatial locality of the I/O requests in bfqq | ||
5781 | * change. | ||
5782 | */ | ||
5783 | bfqq->last_serv_time_ns = tot_time_ns; | ||
5784 | |||
5410 | 5785 | ||
5411 | /* update complete, not waiting for any request completion any longer */ | 5786 | /* update complete, not waiting for any request completion any longer */ |
5412 | bfqd->waited_rq = NULL; | 5787 | bfqd->waited_rq = NULL; |
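A userspace sketch of the updated base-service-time bookkeeping shown in this hunk; the struct is a stand-in that only reuses the kernel field names for readability.

#include <stdbool.h>
#include <stdio.h>

struct model {
	unsigned long long last_serv_time_ns;  /* base value, 0 = not computed yet */
	unsigned int rq_in_driver;             /* still counts the completing rq */
	bool rqs_injected;
};

static void update_base_service_time(struct model *m, unsigned long long tot_time_ns)
{
	if ((m->last_serv_time_ns == 0 && m->rq_in_driver == 1) ||
	    tot_time_ns < m->last_serv_time_ns)
		/* first sample, or a lower (better) sample: take it */
		m->last_serv_time_ns = tot_time_ns;
	else if (!m->rqs_injected && m->rq_in_driver == 1)
		/* no injected I/O and nothing else in the drive: refresh the
		 * base value so it tracks changes in request size/locality */
		m->last_serv_time_ns = tot_time_ns;
}

int main(void)
{
	struct model m = { .rq_in_driver = 1 };

	update_base_service_time(&m, 500000);  /* first sample */
	update_base_service_time(&m, 800000);  /* refreshed: bfqq alone in the drive */
	printf("base = %llu ns\n", m.last_serv_time_ns);
	return 0;
}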
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index c2faa77824f8..e80adf822bbe 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h | |||
@@ -357,6 +357,24 @@ struct bfq_queue { | |||
357 | 357 | ||
358 | /* max service rate measured so far */ | 358 | /* max service rate measured so far */ |
359 | u32 max_service_rate; | 359 | u32 max_service_rate; |
360 | |||
361 | /* | ||
362 | * Pointer to the waker queue for this queue, i.e., to the | ||
363 | * queue Q such that this queue happens to get new I/O right | ||
364 | * after some I/O request of Q is completed. For details, see | ||
365 | * the comments on the choice of the queue for injection in | ||
366 | * bfq_select_queue(). | ||
367 | */ | ||
368 | struct bfq_queue *waker_bfqq; | ||
369 | /* node for woken_list, see below */ | ||
370 | struct hlist_node woken_list_node; | ||
371 | /* | ||
372 | * Head of the list of the woken queues for this queue, i.e., | ||
373 | * of the list of the queues for which this queue is a waker | ||
374 | * queue. This list is used to reset the waker_bfqq pointer in | ||
375 | * the woken queues when this queue exits. | ||
376 | */ | ||
377 | struct hlist_head woken_list; | ||
360 | }; | 378 | }; |
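A minimal userspace model of the waker/woken relationship declared above, using a plain singly linked list instead of the kernel hlist helpers; names are made up.

#include <stddef.h>
#include <stdio.h>

struct mq {
	const char *name;
	struct mq *waker;        /* counterpart of waker_bfqq */
	struct mq *next_woken;   /* link in the waker's woken list */
	struct mq *woken_list;   /* queues for which this queue is the waker */
};

static void set_waker(struct mq *woken, struct mq *waker)
{
	woken->waker = waker;
	woken->next_woken = waker->woken_list;
	waker->woken_list = woken;
}

/* When a waker exits, every queue it was waking must drop its pointer. */
static void exit_queue(struct mq *q)
{
	struct mq *item = q->woken_list;

	while (item) {
		struct mq *next = item->next_woken;

		item->waker = NULL;
		item->next_woken = NULL;
		item = next;
	}
	q->woken_list = NULL;
}

int main(void)
{
	struct mq waker = { .name = "dd" };
	struct mq woken = { .name = "journal" };

	set_waker(&woken, &waker);
	exit_queue(&waker);
	printf("woken's waker after exit: %s\n",
	       woken.waker ? woken.waker->name : "none");
	return 0;
}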
361 | 379 | ||
362 | /** | 380 | /** |
@@ -533,6 +551,9 @@ struct bfq_data { | |||
533 | /* time of last request completion (ns) */ | 551 | /* time of last request completion (ns) */ |
534 | u64 last_completion; | 552 | u64 last_completion; |
535 | 553 | ||
554 | /* bfqq owning the last completed rq */ | ||
555 | struct bfq_queue *last_completed_rq_bfqq; | ||
556 | |||
536 | /* time of last transition from empty to non-empty (ns) */ | 557 | /* time of last transition from empty to non-empty (ns) */ |
537 | u64 last_empty_occupied_ns; | 558 | u64 last_empty_occupied_ns; |
538 | 559 | ||
@@ -743,7 +764,8 @@ enum bfqq_state_flags { | |||
743 | * update | 764 | * update |
744 | */ | 765 | */ |
745 | BFQQF_coop, /* bfqq is shared */ | 766 | BFQQF_coop, /* bfqq is shared */ |
746 | BFQQF_split_coop /* shared bfqq will be split */ | 767 | BFQQF_split_coop, /* shared bfqq will be split */ |
768 | BFQQF_has_waker /* bfqq has a waker queue */ | ||
747 | }; | 769 | }; |
748 | 770 | ||
749 | #define BFQ_BFQQ_FNS(name) \ | 771 | #define BFQ_BFQQ_FNS(name) \ |
@@ -763,6 +785,7 @@ BFQ_BFQQ_FNS(in_large_burst); | |||
763 | BFQ_BFQQ_FNS(coop); | 785 | BFQ_BFQQ_FNS(coop); |
764 | BFQ_BFQQ_FNS(split_coop); | 786 | BFQ_BFQQ_FNS(split_coop); |
765 | BFQ_BFQQ_FNS(softrt_update); | 787 | BFQ_BFQQ_FNS(softrt_update); |
788 | BFQ_BFQQ_FNS(has_waker); | ||
766 | #undef BFQ_BFQQ_FNS | 789 | #undef BFQ_BFQQ_FNS |
767 | 790 | ||
768 | /* Expiration reasons. */ | 791 | /* Expiration reasons. */ |
@@ -777,8 +800,13 @@ enum bfqq_expiration { | |||
777 | BFQQE_PREEMPTED /* preemption in progress */ | 800 | BFQQE_PREEMPTED /* preemption in progress */ |
778 | }; | 801 | }; |
779 | 802 | ||
803 | struct bfq_stat { | ||
804 | struct percpu_counter cpu_cnt; | ||
805 | atomic64_t aux_cnt; | ||
806 | }; | ||
807 | |||
780 | struct bfqg_stats { | 808 | struct bfqg_stats { |
781 | #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | 809 | #ifdef CONFIG_BFQ_CGROUP_DEBUG |
782 | /* number of ios merged */ | 810 | /* number of ios merged */ |
783 | struct blkg_rwstat merged; | 811 | struct blkg_rwstat merged; |
784 | /* total time spent on device in ns, may not be accurate w/ queueing */ | 812 | /* total time spent on device in ns, may not be accurate w/ queueing */ |
@@ -788,25 +816,25 @@ struct bfqg_stats { | |||
788 | /* number of IOs queued up */ | 816 | /* number of IOs queued up */ |
789 | struct blkg_rwstat queued; | 817 | struct blkg_rwstat queued; |
790 | /* total disk time and nr sectors dispatched by this group */ | 818 | /* total disk time and nr sectors dispatched by this group */ |
791 | struct blkg_stat time; | 819 | struct bfq_stat time; |
792 | /* sum of number of ios queued across all samples */ | 820 | /* sum of number of ios queued across all samples */ |
793 | struct blkg_stat avg_queue_size_sum; | 821 | struct bfq_stat avg_queue_size_sum; |
794 | /* count of samples taken for average */ | 822 | /* count of samples taken for average */ |
795 | struct blkg_stat avg_queue_size_samples; | 823 | struct bfq_stat avg_queue_size_samples; |
796 | /* how many times this group has been removed from service tree */ | 824 | /* how many times this group has been removed from service tree */ |
797 | struct blkg_stat dequeue; | 825 | struct bfq_stat dequeue; |
798 | /* total time spent waiting for it to be assigned a timeslice. */ | 826 | /* total time spent waiting for it to be assigned a timeslice. */ |
799 | struct blkg_stat group_wait_time; | 827 | struct bfq_stat group_wait_time; |
800 | /* time spent idling for this blkcg_gq */ | 828 | /* time spent idling for this blkcg_gq */ |
801 | struct blkg_stat idle_time; | 829 | struct bfq_stat idle_time; |
802 | /* total time with empty current active q with other requests queued */ | 830 | /* total time with empty current active q with other requests queued */ |
803 | struct blkg_stat empty_time; | 831 | struct bfq_stat empty_time; |
804 | /* fields after this shouldn't be cleared on stat reset */ | 832 | /* fields after this shouldn't be cleared on stat reset */ |
805 | u64 start_group_wait_time; | 833 | u64 start_group_wait_time; |
806 | u64 start_idle_time; | 834 | u64 start_idle_time; |
807 | u64 start_empty_time; | 835 | u64 start_empty_time; |
808 | uint16_t flags; | 836 | uint16_t flags; |
809 | #endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | 837 | #endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
810 | }; | 838 | }; |
811 | 839 | ||
812 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 840 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
diff --git a/block/bio.c b/block/bio.c index ce797d73bb43..29cd6cf4da51 100644 --- a/block/bio.c +++ b/block/bio.c | |||
@@ -558,14 +558,6 @@ void bio_put(struct bio *bio) | |||
558 | } | 558 | } |
559 | EXPORT_SYMBOL(bio_put); | 559 | EXPORT_SYMBOL(bio_put); |
560 | 560 | ||
561 | int bio_phys_segments(struct request_queue *q, struct bio *bio) | ||
562 | { | ||
563 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | ||
564 | blk_recount_segments(q, bio); | ||
565 | |||
566 | return bio->bi_phys_segments; | ||
567 | } | ||
568 | |||
569 | /** | 561 | /** |
570 | * __bio_clone_fast - clone a bio that shares the original bio's biovec | 562 | * __bio_clone_fast - clone a bio that shares the original bio's biovec |
571 | * @bio: destination bio | 563 | * @bio: destination bio |
@@ -731,10 +723,10 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, | |||
731 | } | 723 | } |
732 | } | 724 | } |
733 | 725 | ||
734 | if (bio_full(bio)) | 726 | if (bio_full(bio, len)) |
735 | return 0; | 727 | return 0; |
736 | 728 | ||
737 | if (bio->bi_phys_segments >= queue_max_segments(q)) | 729 | if (bio->bi_vcnt >= queue_max_segments(q)) |
738 | return 0; | 730 | return 0; |
739 | 731 | ||
740 | bvec = &bio->bi_io_vec[bio->bi_vcnt]; | 732 | bvec = &bio->bi_io_vec[bio->bi_vcnt]; |
@@ -744,8 +736,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, | |||
744 | bio->bi_vcnt++; | 736 | bio->bi_vcnt++; |
745 | done: | 737 | done: |
746 | bio->bi_iter.bi_size += len; | 738 | bio->bi_iter.bi_size += len; |
747 | bio->bi_phys_segments = bio->bi_vcnt; | ||
748 | bio_set_flag(bio, BIO_SEG_VALID); | ||
749 | return len; | 739 | return len; |
750 | } | 740 | } |
751 | 741 | ||
@@ -807,7 +797,7 @@ void __bio_add_page(struct bio *bio, struct page *page, | |||
807 | struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; | 797 | struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; |
808 | 798 | ||
809 | WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); | 799 | WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); |
810 | WARN_ON_ONCE(bio_full(bio)); | 800 | WARN_ON_ONCE(bio_full(bio, len)); |
811 | 801 | ||
812 | bv->bv_page = page; | 802 | bv->bv_page = page; |
813 | bv->bv_offset = off; | 803 | bv->bv_offset = off; |
@@ -834,7 +824,7 @@ int bio_add_page(struct bio *bio, struct page *page, | |||
834 | bool same_page = false; | 824 | bool same_page = false; |
835 | 825 | ||
836 | if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { | 826 | if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { |
837 | if (bio_full(bio)) | 827 | if (bio_full(bio, len)) |
838 | return 0; | 828 | return 0; |
839 | __bio_add_page(bio, page, len, offset); | 829 | __bio_add_page(bio, page, len, offset); |
840 | } | 830 | } |
@@ -842,22 +832,19 @@ int bio_add_page(struct bio *bio, struct page *page, | |||
842 | } | 832 | } |
843 | EXPORT_SYMBOL(bio_add_page); | 833 | EXPORT_SYMBOL(bio_add_page); |
844 | 834 | ||
845 | static void bio_get_pages(struct bio *bio) | 835 | void bio_release_pages(struct bio *bio, bool mark_dirty) |
846 | { | 836 | { |
847 | struct bvec_iter_all iter_all; | 837 | struct bvec_iter_all iter_all; |
848 | struct bio_vec *bvec; | 838 | struct bio_vec *bvec; |
849 | 839 | ||
850 | bio_for_each_segment_all(bvec, bio, iter_all) | 840 | if (bio_flagged(bio, BIO_NO_PAGE_REF)) |
851 | get_page(bvec->bv_page); | 841 | return; |
852 | } | ||
853 | |||
854 | static void bio_release_pages(struct bio *bio) | ||
855 | { | ||
856 | struct bvec_iter_all iter_all; | ||
857 | struct bio_vec *bvec; | ||
858 | 842 | ||
859 | bio_for_each_segment_all(bvec, bio, iter_all) | 843 | bio_for_each_segment_all(bvec, bio, iter_all) { |
844 | if (mark_dirty && !PageCompound(bvec->bv_page)) | ||
845 | set_page_dirty_lock(bvec->bv_page); | ||
860 | put_page(bvec->bv_page); | 846 | put_page(bvec->bv_page); |
847 | } | ||
861 | } | 848 | } |
862 | 849 | ||
863 | static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) | 850 | static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) |
@@ -922,7 +909,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) | |||
922 | if (same_page) | 909 | if (same_page) |
923 | put_page(page); | 910 | put_page(page); |
924 | } else { | 911 | } else { |
925 | if (WARN_ON_ONCE(bio_full(bio))) | 912 | if (WARN_ON_ONCE(bio_full(bio, len))) |
926 | return -EINVAL; | 913 | return -EINVAL; |
927 | __bio_add_page(bio, page, len, offset); | 914 | __bio_add_page(bio, page, len, offset); |
928 | } | 915 | } |
@@ -966,13 +953,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) | |||
966 | ret = __bio_iov_bvec_add_pages(bio, iter); | 953 | ret = __bio_iov_bvec_add_pages(bio, iter); |
967 | else | 954 | else |
968 | ret = __bio_iov_iter_get_pages(bio, iter); | 955 | ret = __bio_iov_iter_get_pages(bio, iter); |
969 | } while (!ret && iov_iter_count(iter) && !bio_full(bio)); | 956 | } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); |
970 | 957 | ||
971 | if (iov_iter_bvec_no_ref(iter)) | 958 | if (is_bvec) |
972 | bio_set_flag(bio, BIO_NO_PAGE_REF); | 959 | bio_set_flag(bio, BIO_NO_PAGE_REF); |
973 | else if (is_bvec) | ||
974 | bio_get_pages(bio); | ||
975 | |||
976 | return bio->bi_vcnt ? 0 : ret; | 960 | return bio->bi_vcnt ? 0 : ret; |
977 | } | 961 | } |
978 | 962 | ||
@@ -1124,8 +1108,7 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, | |||
1124 | if (data->nr_segs > UIO_MAXIOV) | 1108 | if (data->nr_segs > UIO_MAXIOV) |
1125 | return NULL; | 1109 | return NULL; |
1126 | 1110 | ||
1127 | bmd = kmalloc(sizeof(struct bio_map_data) + | 1111 | bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); |
1128 | sizeof(struct iovec) * data->nr_segs, gfp_mask); | ||
1129 | if (!bmd) | 1112 | if (!bmd) |
1130 | return NULL; | 1113 | return NULL; |
1131 | memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); | 1114 | memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); |
@@ -1371,8 +1354,6 @@ struct bio *bio_map_user_iov(struct request_queue *q, | |||
1371 | int j; | 1354 | int j; |
1372 | struct bio *bio; | 1355 | struct bio *bio; |
1373 | int ret; | 1356 | int ret; |
1374 | struct bio_vec *bvec; | ||
1375 | struct bvec_iter_all iter_all; | ||
1376 | 1357 | ||
1377 | if (!iov_iter_count(iter)) | 1358 | if (!iov_iter_count(iter)) |
1378 | return ERR_PTR(-EINVAL); | 1359 | return ERR_PTR(-EINVAL); |
@@ -1439,31 +1420,11 @@ struct bio *bio_map_user_iov(struct request_queue *q, | |||
1439 | return bio; | 1420 | return bio; |
1440 | 1421 | ||
1441 | out_unmap: | 1422 | out_unmap: |
1442 | bio_for_each_segment_all(bvec, bio, iter_all) { | 1423 | bio_release_pages(bio, false); |
1443 | put_page(bvec->bv_page); | ||
1444 | } | ||
1445 | bio_put(bio); | 1424 | bio_put(bio); |
1446 | return ERR_PTR(ret); | 1425 | return ERR_PTR(ret); |
1447 | } | 1426 | } |
1448 | 1427 | ||
1449 | static void __bio_unmap_user(struct bio *bio) | ||
1450 | { | ||
1451 | struct bio_vec *bvec; | ||
1452 | struct bvec_iter_all iter_all; | ||
1453 | |||
1454 | /* | ||
1455 | * make sure we dirty pages we wrote to | ||
1456 | */ | ||
1457 | bio_for_each_segment_all(bvec, bio, iter_all) { | ||
1458 | if (bio_data_dir(bio) == READ) | ||
1459 | set_page_dirty_lock(bvec->bv_page); | ||
1460 | |||
1461 | put_page(bvec->bv_page); | ||
1462 | } | ||
1463 | |||
1464 | bio_put(bio); | ||
1465 | } | ||
1466 | |||
1467 | /** | 1428 | /** |
1468 | * bio_unmap_user - unmap a bio | 1429 | * bio_unmap_user - unmap a bio |
1469 | * @bio: the bio being unmapped | 1430 | * @bio: the bio being unmapped |
@@ -1475,7 +1436,8 @@ static void __bio_unmap_user(struct bio *bio) | |||
1475 | */ | 1436 | */ |
1476 | void bio_unmap_user(struct bio *bio) | 1437 | void bio_unmap_user(struct bio *bio) |
1477 | { | 1438 | { |
1478 | __bio_unmap_user(bio); | 1439 | bio_release_pages(bio, bio_data_dir(bio) == READ); |
1440 | bio_put(bio); | ||
1479 | bio_put(bio); | 1441 | bio_put(bio); |
1480 | } | 1442 | } |
1481 | 1443 | ||
@@ -1695,9 +1657,7 @@ static void bio_dirty_fn(struct work_struct *work) | |||
1695 | while ((bio = next) != NULL) { | 1657 | while ((bio = next) != NULL) { |
1696 | next = bio->bi_private; | 1658 | next = bio->bi_private; |
1697 | 1659 | ||
1698 | bio_set_pages_dirty(bio); | 1660 | bio_release_pages(bio, true); |
1699 | if (!bio_flagged(bio, BIO_NO_PAGE_REF)) | ||
1700 | bio_release_pages(bio); | ||
1701 | bio_put(bio); | 1661 | bio_put(bio); |
1702 | } | 1662 | } |
1703 | } | 1663 | } |
@@ -1713,8 +1673,7 @@ void bio_check_pages_dirty(struct bio *bio) | |||
1713 | goto defer; | 1673 | goto defer; |
1714 | } | 1674 | } |
1715 | 1675 | ||
1716 | if (!bio_flagged(bio, BIO_NO_PAGE_REF)) | 1676 | bio_release_pages(bio, false); |
1717 | bio_release_pages(bio); | ||
1718 | bio_put(bio); | 1677 | bio_put(bio); |
1719 | return; | 1678 | return; |
1720 | defer: | 1679 | defer: |
@@ -1775,18 +1734,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op, | |||
1775 | } | 1734 | } |
1776 | EXPORT_SYMBOL(generic_end_io_acct); | 1735 | EXPORT_SYMBOL(generic_end_io_acct); |
1777 | 1736 | ||
1778 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE | ||
1779 | void bio_flush_dcache_pages(struct bio *bi) | ||
1780 | { | ||
1781 | struct bio_vec bvec; | ||
1782 | struct bvec_iter iter; | ||
1783 | |||
1784 | bio_for_each_segment(bvec, bi, iter) | ||
1785 | flush_dcache_page(bvec.bv_page); | ||
1786 | } | ||
1787 | EXPORT_SYMBOL(bio_flush_dcache_pages); | ||
1788 | #endif | ||
1789 | |||
1790 | static inline bool bio_remaining_done(struct bio *bio) | 1737 | static inline bool bio_remaining_done(struct bio *bio) |
1791 | { | 1738 | { |
1792 | /* | 1739 | /* |
@@ -1914,10 +1861,7 @@ void bio_trim(struct bio *bio, int offset, int size) | |||
1914 | if (offset == 0 && size == bio->bi_iter.bi_size) | 1861 | if (offset == 0 && size == bio->bi_iter.bi_size) |
1915 | return; | 1862 | return; |
1916 | 1863 | ||
1917 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
1918 | |||
1919 | bio_advance(bio, offset << 9); | 1864 | bio_advance(bio, offset << 9); |
1920 | |||
1921 | bio->bi_iter.bi_size = size; | 1865 | bio->bi_iter.bi_size = size; |
1922 | 1866 | ||
1923 | if (bio_integrity(bio)) | 1867 | if (bio_integrity(bio)) |
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1f7127b03490..53b7bd4c7000 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
@@ -79,6 +79,7 @@ static void blkg_free(struct blkcg_gq *blkg) | |||
79 | 79 | ||
80 | blkg_rwstat_exit(&blkg->stat_ios); | 80 | blkg_rwstat_exit(&blkg->stat_ios); |
81 | blkg_rwstat_exit(&blkg->stat_bytes); | 81 | blkg_rwstat_exit(&blkg->stat_bytes); |
82 | percpu_ref_exit(&blkg->refcnt); | ||
82 | kfree(blkg); | 83 | kfree(blkg); |
83 | } | 84 | } |
84 | 85 | ||
@@ -86,8 +87,6 @@ static void __blkg_release(struct rcu_head *rcu) | |||
86 | { | 87 | { |
87 | struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); | 88 | struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); |
88 | 89 | ||
89 | percpu_ref_exit(&blkg->refcnt); | ||
90 | |||
91 | /* release the blkcg and parent blkg refs this blkg has been holding */ | 90 | /* release the blkcg and parent blkg refs this blkg has been holding */ |
92 | css_put(&blkg->blkcg->css); | 91 | css_put(&blkg->blkcg->css); |
93 | if (blkg->parent) | 92 | if (blkg->parent) |
@@ -132,6 +131,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
132 | if (!blkg) | 131 | if (!blkg) |
133 | return NULL; | 132 | return NULL; |
134 | 133 | ||
134 | if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) | ||
135 | goto err_free; | ||
136 | |||
135 | if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || | 137 | if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || |
136 | blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) | 138 | blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) |
137 | goto err_free; | 139 | goto err_free; |
@@ -244,11 +246,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
244 | blkg_get(blkg->parent); | 246 | blkg_get(blkg->parent); |
245 | } | 247 | } |
246 | 248 | ||
247 | ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, | ||
248 | GFP_NOWAIT | __GFP_NOWARN); | ||
249 | if (ret) | ||
250 | goto err_cancel_ref; | ||
251 | |||
252 | /* invoke per-policy init */ | 249 | /* invoke per-policy init */ |
253 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 250 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
254 | struct blkcg_policy *pol = blkcg_policy[i]; | 251 | struct blkcg_policy *pol = blkcg_policy[i]; |
@@ -281,8 +278,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
281 | blkg_put(blkg); | 278 | blkg_put(blkg); |
282 | return ERR_PTR(ret); | 279 | return ERR_PTR(ret); |
283 | 280 | ||
284 | err_cancel_ref: | ||
285 | percpu_ref_exit(&blkg->refcnt); | ||
286 | err_put_congested: | 281 | err_put_congested: |
287 | wb_congested_put(wb_congested); | 282 | wb_congested_put(wb_congested); |
288 | err_put_css: | 283 | err_put_css: |
@@ -549,7 +544,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64); | |||
549 | * Print @rwstat to @sf for the device assocaited with @pd. | 544 | * Print @rwstat to @sf for the device assocaited with @pd. |
550 | */ | 545 | */ |
551 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | 546 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, |
552 | const struct blkg_rwstat *rwstat) | 547 | const struct blkg_rwstat_sample *rwstat) |
553 | { | 548 | { |
554 | static const char *rwstr[] = { | 549 | static const char *rwstr[] = { |
555 | [BLKG_RWSTAT_READ] = "Read", | 550 | [BLKG_RWSTAT_READ] = "Read", |
@@ -567,31 +562,17 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
567 | 562 | ||
568 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 563 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
569 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], | 564 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], |
570 | (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); | 565 | rwstat->cnt[i]); |
571 | 566 | ||
572 | v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + | 567 | v = rwstat->cnt[BLKG_RWSTAT_READ] + |
573 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + | 568 | rwstat->cnt[BLKG_RWSTAT_WRITE] + |
574 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); | 569 | rwstat->cnt[BLKG_RWSTAT_DISCARD]; |
575 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | 570 | seq_printf(sf, "%s Total %llu\n", dname, v); |
576 | return v; | 571 | return v; |
577 | } | 572 | } |
578 | EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); | 573 | EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); |
579 | 574 | ||
580 | /** | 575 | /** |
581 | * blkg_prfill_stat - prfill callback for blkg_stat | ||
582 | * @sf: seq_file to print to | ||
583 | * @pd: policy private data of interest | ||
584 | * @off: offset to the blkg_stat in @pd | ||
585 | * | ||
586 | * prfill callback for printing a blkg_stat. | ||
587 | */ | ||
588 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) | ||
589 | { | ||
590 | return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); | ||
591 | } | ||
592 | EXPORT_SYMBOL_GPL(blkg_prfill_stat); | ||
593 | |||
594 | /** | ||
595 | * blkg_prfill_rwstat - prfill callback for blkg_rwstat | 576 | * blkg_prfill_rwstat - prfill callback for blkg_rwstat |
596 | * @sf: seq_file to print to | 577 | * @sf: seq_file to print to |
597 | * @pd: policy private data of interest | 578 | * @pd: policy private data of interest |
@@ -602,8 +583,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat); | |||
602 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | 583 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, |
603 | int off) | 584 | int off) |
604 | { | 585 | { |
605 | struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); | 586 | struct blkg_rwstat_sample rwstat = { }; |
606 | 587 | ||
588 | blkg_rwstat_read((void *)pd + off, &rwstat); | ||
607 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | 589 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
608 | } | 590 | } |
609 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | 591 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); |
@@ -611,8 +593,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | |||
611 | static u64 blkg_prfill_rwstat_field(struct seq_file *sf, | 593 | static u64 blkg_prfill_rwstat_field(struct seq_file *sf, |
612 | struct blkg_policy_data *pd, int off) | 594 | struct blkg_policy_data *pd, int off) |
613 | { | 595 | { |
614 | struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); | 596 | struct blkg_rwstat_sample rwstat = { }; |
615 | 597 | ||
598 | blkg_rwstat_read((void *)pd->blkg + off, &rwstat); | ||
616 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | 599 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
617 | } | 600 | } |
618 | 601 | ||
@@ -654,8 +637,9 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, | |||
654 | struct blkg_policy_data *pd, | 637 | struct blkg_policy_data *pd, |
655 | int off) | 638 | int off) |
656 | { | 639 | { |
657 | struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, | 640 | struct blkg_rwstat_sample rwstat; |
658 | NULL, off); | 641 | |
642 | blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); | ||
659 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | 643 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
660 | } | 644 | } |
661 | 645 | ||
@@ -690,52 +674,11 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) | |||
690 | EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); | 674 | EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); |
691 | 675 | ||
692 | /** | 676 | /** |
693 | * blkg_stat_recursive_sum - collect hierarchical blkg_stat | ||
694 | * @blkg: blkg of interest | ||
695 | * @pol: blkcg_policy which contains the blkg_stat | ||
696 | * @off: offset to the blkg_stat in blkg_policy_data or @blkg | ||
697 | * | ||
698 | * Collect the blkg_stat specified by @blkg, @pol and @off and all its | ||
699 | * online descendants and their aux counts. The caller must be holding the | ||
700 | * queue lock for online tests. | ||
701 | * | ||
702 | * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is | ||
703 | * at @off bytes into @blkg's blkg_policy_data of the policy. | ||
704 | */ | ||
705 | u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, | ||
706 | struct blkcg_policy *pol, int off) | ||
707 | { | ||
708 | struct blkcg_gq *pos_blkg; | ||
709 | struct cgroup_subsys_state *pos_css; | ||
710 | u64 sum = 0; | ||
711 | |||
712 | lockdep_assert_held(&blkg->q->queue_lock); | ||
713 | |||
714 | rcu_read_lock(); | ||
715 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { | ||
716 | struct blkg_stat *stat; | ||
717 | |||
718 | if (!pos_blkg->online) | ||
719 | continue; | ||
720 | |||
721 | if (pol) | ||
722 | stat = (void *)blkg_to_pd(pos_blkg, pol) + off; | ||
723 | else | ||
724 | stat = (void *)blkg + off; | ||
725 | |||
726 | sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); | ||
727 | } | ||
728 | rcu_read_unlock(); | ||
729 | |||
730 | return sum; | ||
731 | } | ||
732 | EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); | ||
733 | |||
734 | /** | ||
735 | * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat | 677 | * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat |
736 | * @blkg: blkg of interest | 678 | * @blkg: blkg of interest |
737 | * @pol: blkcg_policy which contains the blkg_rwstat | 679 | * @pol: blkcg_policy which contains the blkg_rwstat |
738 | * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg | 680 | * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg |
681 | * @sum: blkg_rwstat_sample structure containing the results | ||
739 | * | 682 | * |
740 | * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its | 683 | * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its |
741 | * online descendants and their aux counts. The caller must be holding the | 684 | * online descendants and their aux counts. The caller must be holding the |
@@ -744,13 +687,12 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); | |||
744 | * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it | 687 | * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it |
745 | * is at @off bytes into @blkg's blkg_policy_data of the policy. | 688 | * is at @off bytes into @blkg's blkg_policy_data of the policy. |
746 | */ | 689 | */ |
747 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, | 690 | void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, |
748 | struct blkcg_policy *pol, int off) | 691 | int off, struct blkg_rwstat_sample *sum) |
749 | { | 692 | { |
750 | struct blkcg_gq *pos_blkg; | 693 | struct blkcg_gq *pos_blkg; |
751 | struct cgroup_subsys_state *pos_css; | 694 | struct cgroup_subsys_state *pos_css; |
752 | struct blkg_rwstat sum = { }; | 695 | unsigned int i; |
753 | int i; | ||
754 | 696 | ||
755 | lockdep_assert_held(&blkg->q->queue_lock); | 697 | lockdep_assert_held(&blkg->q->queue_lock); |
756 | 698 | ||
@@ -767,13 +709,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, | |||
767 | rwstat = (void *)pos_blkg + off; | 709 | rwstat = (void *)pos_blkg + off; |
768 | 710 | ||
769 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 711 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
770 | atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + | 712 | sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); |
771 | percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), | ||
772 | &sum.aux_cnt[i]); | ||
773 | } | 713 | } |
774 | rcu_read_unlock(); | 714 | rcu_read_unlock(); |
775 | |||
776 | return sum; | ||
777 | } | 715 | } |
778 | EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); | 716 | EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); |
779 | 717 | ||
@@ -939,7 +877,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
939 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { | 877 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { |
940 | const char *dname; | 878 | const char *dname; |
941 | char *buf; | 879 | char *buf; |
942 | struct blkg_rwstat rwstat; | 880 | struct blkg_rwstat_sample rwstat; |
943 | u64 rbytes, wbytes, rios, wios, dbytes, dios; | 881 | u64 rbytes, wbytes, rios, wios, dbytes, dios; |
944 | size_t size = seq_get_buf(sf, &buf), off = 0; | 882 | size_t size = seq_get_buf(sf, &buf), off = 0; |
945 | int i; | 883 | int i; |
@@ -959,17 +897,17 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
959 | 897 | ||
960 | spin_lock_irq(&blkg->q->queue_lock); | 898 | spin_lock_irq(&blkg->q->queue_lock); |
961 | 899 | ||
962 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | 900 | blkg_rwstat_recursive_sum(blkg, NULL, |
963 | offsetof(struct blkcg_gq, stat_bytes)); | 901 | offsetof(struct blkcg_gq, stat_bytes), &rwstat); |
964 | rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | 902 | rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; |
965 | wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | 903 | wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; |
966 | dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); | 904 | dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; |
967 | 905 | ||
968 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | 906 | blkg_rwstat_recursive_sum(blkg, NULL, |
969 | offsetof(struct blkcg_gq, stat_ios)); | 907 | offsetof(struct blkcg_gq, stat_ios), &rwstat); |
970 | rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | 908 | rios = rwstat.cnt[BLKG_RWSTAT_READ]; |
971 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | 909 | wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; |
972 | dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); | 910 | dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; |
973 | 911 | ||
974 | spin_unlock_irq(&blkg->q->queue_lock); | 912 | spin_unlock_irq(&blkg->q->queue_lock); |
975 | 913 | ||
@@ -1006,8 +944,12 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
1006 | } | 944 | } |
1007 | next: | 945 | next: |
1008 | if (has_stats) { | 946 | if (has_stats) { |
1009 | off += scnprintf(buf+off, size-off, "\n"); | 947 | if (off < size - 1) { |
1010 | seq_commit(sf, off); | 948 | off += scnprintf(buf+off, size-off, "\n"); |
949 | seq_commit(sf, off); | ||
950 | } else { | ||
951 | seq_commit(sf, -1); | ||
952 | } | ||
1011 | } | 953 | } |
1012 | } | 954 | } |
1013 | 955 | ||
@@ -1391,7 +1333,8 @@ pd_prealloc: | |||
1391 | 1333 | ||
1392 | spin_lock_irq(&q->queue_lock); | 1334 | spin_lock_irq(&q->queue_lock); |
1393 | 1335 | ||
1394 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | 1336 | /* blkg_list is pushed at the head, reverse walk to init parents first */ |
1337 | list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { | ||
1395 | struct blkg_policy_data *pd; | 1338 | struct blkg_policy_data *pd; |
1396 | 1339 | ||
1397 | if (blkg->pd[pol->plid]) | 1340 | if (blkg->pd[pol->plid]) |
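Two things change together in the blk-cgroup.c diff above: the blkg percpu refcount is now set up in blkg_alloc() and torn down in blkg_free(), which removes the error-unwind special case from blkg_create(), and the recursive-sum helper now fills a caller-provided struct blkg_rwstat_sample of plain u64 counters instead of returning a structure of atomics. A usage sketch mirroring the blkcg_print_stat() hunk above:

        struct blkg_rwstat_sample sample = { };
        u64 rbytes, wbytes, dbytes;

        spin_lock_irq(&blkg->q->queue_lock);
        blkg_rwstat_recursive_sum(blkg, NULL,
                                  offsetof(struct blkcg_gq, stat_bytes), &sample);
        spin_unlock_irq(&blkg->q->queue_lock);

        rbytes = sample.cnt[BLKG_RWSTAT_READ];
        wbytes = sample.cnt[BLKG_RWSTAT_WRITE];
        dbytes = sample.cnt[BLKG_RWSTAT_DISCARD];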
diff --git a/block/blk-core.c b/block/blk-core.c index 8340f69670d8..5d1fc8e17dd1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -120,6 +120,42 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
120 | } | 120 | } |
121 | EXPORT_SYMBOL(blk_rq_init); | 121 | EXPORT_SYMBOL(blk_rq_init); |
122 | 122 | ||
123 | #define REQ_OP_NAME(name) [REQ_OP_##name] = #name | ||
124 | static const char *const blk_op_name[] = { | ||
125 | REQ_OP_NAME(READ), | ||
126 | REQ_OP_NAME(WRITE), | ||
127 | REQ_OP_NAME(FLUSH), | ||
128 | REQ_OP_NAME(DISCARD), | ||
129 | REQ_OP_NAME(SECURE_ERASE), | ||
130 | REQ_OP_NAME(ZONE_RESET), | ||
131 | REQ_OP_NAME(WRITE_SAME), | ||
132 | REQ_OP_NAME(WRITE_ZEROES), | ||
133 | REQ_OP_NAME(SCSI_IN), | ||
134 | REQ_OP_NAME(SCSI_OUT), | ||
135 | REQ_OP_NAME(DRV_IN), | ||
136 | REQ_OP_NAME(DRV_OUT), | ||
137 | }; | ||
138 | #undef REQ_OP_NAME | ||
139 | |||
140 | /** | ||
141 | * blk_op_str - Return string XXX in the REQ_OP_XXX. | ||
142 | * @op: REQ_OP_XXX. | ||
143 | * | ||
144 | * Description: Centralize block layer function to convert REQ_OP_XXX into | ||
145 | * string format. Useful in the debugging and tracing bio or request. For | ||
146 | * invalid REQ_OP_XXX it returns string "UNKNOWN". | ||
147 | */ | ||
148 | inline const char *blk_op_str(unsigned int op) | ||
149 | { | ||
150 | const char *op_str = "UNKNOWN"; | ||
151 | |||
152 | if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) | ||
153 | op_str = blk_op_name[op]; | ||
154 | |||
155 | return op_str; | ||
156 | } | ||
157 | EXPORT_SYMBOL_GPL(blk_op_str); | ||
158 | |||
123 | static const struct { | 159 | static const struct { |
124 | int errno; | 160 | int errno; |
125 | const char *name; | 161 | const char *name; |
@@ -167,18 +203,23 @@ int blk_status_to_errno(blk_status_t status) | |||
167 | } | 203 | } |
168 | EXPORT_SYMBOL_GPL(blk_status_to_errno); | 204 | EXPORT_SYMBOL_GPL(blk_status_to_errno); |
169 | 205 | ||
170 | static void print_req_error(struct request *req, blk_status_t status) | 206 | static void print_req_error(struct request *req, blk_status_t status, |
207 | const char *caller) | ||
171 | { | 208 | { |
172 | int idx = (__force int)status; | 209 | int idx = (__force int)status; |
173 | 210 | ||
174 | if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) | 211 | if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) |
175 | return; | 212 | return; |
176 | 213 | ||
177 | printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", | 214 | printk_ratelimited(KERN_ERR |
178 | __func__, blk_errors[idx].name, | 215 | "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " |
179 | req->rq_disk ? req->rq_disk->disk_name : "?", | 216 | "phys_seg %u prio class %u\n", |
180 | (unsigned long long)blk_rq_pos(req), | 217 | caller, blk_errors[idx].name, |
181 | req->cmd_flags); | 218 | req->rq_disk ? req->rq_disk->disk_name : "?", |
219 | blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), | ||
220 | req->cmd_flags & ~REQ_OP_MASK, | ||
221 | req->nr_phys_segments, | ||
222 | IOPRIO_PRIO_CLASS(req->ioprio)); | ||
182 | } | 223 | } |
183 | 224 | ||
184 | static void req_bio_endio(struct request *rq, struct bio *bio, | 225 | static void req_bio_endio(struct request *rq, struct bio *bio, |
@@ -550,15 +591,15 @@ void blk_put_request(struct request *req) | |||
550 | } | 591 | } |
551 | EXPORT_SYMBOL(blk_put_request); | 592 | EXPORT_SYMBOL(blk_put_request); |
552 | 593 | ||
553 | bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | 594 | bool bio_attempt_back_merge(struct request *req, struct bio *bio, |
554 | struct bio *bio) | 595 | unsigned int nr_segs) |
555 | { | 596 | { |
556 | const int ff = bio->bi_opf & REQ_FAILFAST_MASK; | 597 | const int ff = bio->bi_opf & REQ_FAILFAST_MASK; |
557 | 598 | ||
558 | if (!ll_back_merge_fn(q, req, bio)) | 599 | if (!ll_back_merge_fn(req, bio, nr_segs)) |
559 | return false; | 600 | return false; |
560 | 601 | ||
561 | trace_block_bio_backmerge(q, req, bio); | 602 | trace_block_bio_backmerge(req->q, req, bio); |
562 | 603 | ||
563 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | 604 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) |
564 | blk_rq_set_mixed_merge(req); | 605 | blk_rq_set_mixed_merge(req); |
@@ -571,15 +612,15 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
571 | return true; | 612 | return true; |
572 | } | 613 | } |
573 | 614 | ||
574 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | 615 | bool bio_attempt_front_merge(struct request *req, struct bio *bio, |
575 | struct bio *bio) | 616 | unsigned int nr_segs) |
576 | { | 617 | { |
577 | const int ff = bio->bi_opf & REQ_FAILFAST_MASK; | 618 | const int ff = bio->bi_opf & REQ_FAILFAST_MASK; |
578 | 619 | ||
579 | if (!ll_front_merge_fn(q, req, bio)) | 620 | if (!ll_front_merge_fn(req, bio, nr_segs)) |
580 | return false; | 621 | return false; |
581 | 622 | ||
582 | trace_block_bio_frontmerge(q, req, bio); | 623 | trace_block_bio_frontmerge(req->q, req, bio); |
583 | 624 | ||
584 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | 625 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) |
585 | blk_rq_set_mixed_merge(req); | 626 | blk_rq_set_mixed_merge(req); |
@@ -621,6 +662,7 @@ no_merge: | |||
621 | * blk_attempt_plug_merge - try to merge with %current's plugged list | 662 | * blk_attempt_plug_merge - try to merge with %current's plugged list |
622 | * @q: request_queue new bio is being queued at | 663 | * @q: request_queue new bio is being queued at |
623 | * @bio: new bio being queued | 664 | * @bio: new bio being queued |
665 | * @nr_segs: number of segments in @bio | ||
624 | * @same_queue_rq: pointer to &struct request that gets filled in when | 666 | * @same_queue_rq: pointer to &struct request that gets filled in when |
625 | * another request associated with @q is found on the plug list | 667 | * another request associated with @q is found on the plug list |
626 | * (optional, may be %NULL) | 668 | * (optional, may be %NULL) |
@@ -639,7 +681,7 @@ no_merge: | |||
639 | * Caller must ensure !blk_queue_nomerges(q) beforehand. | 681 | * Caller must ensure !blk_queue_nomerges(q) beforehand. |
640 | */ | 682 | */ |
641 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | 683 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
642 | struct request **same_queue_rq) | 684 | unsigned int nr_segs, struct request **same_queue_rq) |
643 | { | 685 | { |
644 | struct blk_plug *plug; | 686 | struct blk_plug *plug; |
645 | struct request *rq; | 687 | struct request *rq; |
@@ -668,10 +710,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
668 | 710 | ||
669 | switch (blk_try_merge(rq, bio)) { | 711 | switch (blk_try_merge(rq, bio)) { |
670 | case ELEVATOR_BACK_MERGE: | 712 | case ELEVATOR_BACK_MERGE: |
671 | merged = bio_attempt_back_merge(q, rq, bio); | 713 | merged = bio_attempt_back_merge(rq, bio, nr_segs); |
672 | break; | 714 | break; |
673 | case ELEVATOR_FRONT_MERGE: | 715 | case ELEVATOR_FRONT_MERGE: |
674 | merged = bio_attempt_front_merge(q, rq, bio); | 716 | merged = bio_attempt_front_merge(rq, bio, nr_segs); |
675 | break; | 717 | break; |
676 | case ELEVATOR_DISCARD_MERGE: | 718 | case ELEVATOR_DISCARD_MERGE: |
677 | merged = bio_attempt_discard_merge(q, rq, bio); | 719 | merged = bio_attempt_discard_merge(q, rq, bio); |
@@ -687,18 +729,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
687 | return false; | 729 | return false; |
688 | } | 730 | } |
689 | 731 | ||
690 | void blk_init_request_from_bio(struct request *req, struct bio *bio) | ||
691 | { | ||
692 | if (bio->bi_opf & REQ_RAHEAD) | ||
693 | req->cmd_flags |= REQ_FAILFAST_MASK; | ||
694 | |||
695 | req->__sector = bio->bi_iter.bi_sector; | ||
696 | req->ioprio = bio_prio(bio); | ||
697 | req->write_hint = bio->bi_write_hint; | ||
698 | blk_rq_bio_prep(req->q, req, bio); | ||
699 | } | ||
700 | EXPORT_SYMBOL_GPL(blk_init_request_from_bio); | ||
701 | |||
702 | static void handle_bad_sector(struct bio *bio, sector_t maxsector) | 732 | static void handle_bad_sector(struct bio *bio, sector_t maxsector) |
703 | { | 733 | { |
704 | char b[BDEVNAME_SIZE]; | 734 | char b[BDEVNAME_SIZE]; |
@@ -1163,7 +1193,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, | |||
1163 | * Recalculate it to check the request correctly on this queue's | 1193 | * Recalculate it to check the request correctly on this queue's |
1164 | * limitation. | 1194 | * limitation. |
1165 | */ | 1195 | */ |
1166 | blk_recalc_rq_segments(rq); | 1196 | rq->nr_phys_segments = blk_recalc_rq_segments(rq); |
1167 | if (rq->nr_phys_segments > queue_max_segments(q)) { | 1197 | if (rq->nr_phys_segments > queue_max_segments(q)) { |
1168 | printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", | 1198 | printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", |
1169 | __func__, rq->nr_phys_segments, queue_max_segments(q)); | 1199 | __func__, rq->nr_phys_segments, queue_max_segments(q)); |
@@ -1348,7 +1378,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); | |||
1348 | * | 1378 | * |
1349 | * This special helper function is only for request stacking drivers | 1379 | * This special helper function is only for request stacking drivers |
1350 | * (e.g. request-based dm) so that they can handle partial completion. | 1380 | * (e.g. request-based dm) so that they can handle partial completion. |
1351 | * Actual device drivers should use blk_end_request instead. | 1381 | * Actual device drivers should use blk_mq_end_request instead. |
1352 | * | 1382 | * |
1353 | * Passing the result of blk_rq_bytes() as @nr_bytes guarantees | 1383 | * Passing the result of blk_rq_bytes() as @nr_bytes guarantees |
1354 | * %false return from this function. | 1384 | * %false return from this function. |
@@ -1373,7 +1403,7 @@ bool blk_update_request(struct request *req, blk_status_t error, | |||
1373 | 1403 | ||
1374 | if (unlikely(error && !blk_rq_is_passthrough(req) && | 1404 | if (unlikely(error && !blk_rq_is_passthrough(req) && |
1375 | !(req->rq_flags & RQF_QUIET))) | 1405 | !(req->rq_flags & RQF_QUIET))) |
1376 | print_req_error(req, error); | 1406 | print_req_error(req, error, __func__); |
1377 | 1407 | ||
1378 | blk_account_io_completion(req, nr_bytes); | 1408 | blk_account_io_completion(req, nr_bytes); |
1379 | 1409 | ||
@@ -1432,28 +1462,13 @@ bool blk_update_request(struct request *req, blk_status_t error, | |||
1432 | } | 1462 | } |
1433 | 1463 | ||
1434 | /* recalculate the number of segments */ | 1464 | /* recalculate the number of segments */ |
1435 | blk_recalc_rq_segments(req); | 1465 | req->nr_phys_segments = blk_recalc_rq_segments(req); |
1436 | } | 1466 | } |
1437 | 1467 | ||
1438 | return true; | 1468 | return true; |
1439 | } | 1469 | } |
1440 | EXPORT_SYMBOL_GPL(blk_update_request); | 1470 | EXPORT_SYMBOL_GPL(blk_update_request); |
1441 | 1471 | ||
1442 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | ||
1443 | struct bio *bio) | ||
1444 | { | ||
1445 | if (bio_has_data(bio)) | ||
1446 | rq->nr_phys_segments = bio_phys_segments(q, bio); | ||
1447 | else if (bio_op(bio) == REQ_OP_DISCARD) | ||
1448 | rq->nr_phys_segments = 1; | ||
1449 | |||
1450 | rq->__data_len = bio->bi_iter.bi_size; | ||
1451 | rq->bio = rq->biotail = bio; | ||
1452 | |||
1453 | if (bio->bi_disk) | ||
1454 | rq->rq_disk = bio->bi_disk; | ||
1455 | } | ||
1456 | |||
1457 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE | 1472 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE |
1458 | /** | 1473 | /** |
1459 | * rq_flush_dcache_pages - Helper function to flush all pages in a request | 1474 | * rq_flush_dcache_pages - Helper function to flush all pages in a request |
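blk_op_str() above centralizes the REQ_OP_* name table that blk-mq-debugfs.c used to keep privately (its copy is removed further down), so print_req_error() can report both the numeric op and its name. The pattern is an ordinary designated-initializer string table with an "UNKNOWN" fallback for holes and out-of-range values; a standalone userspace C illustration of the same idea (every DEMO_* name is made up for this example):

#include <stdio.h>

enum demo_op { DEMO_READ, DEMO_WRITE, DEMO_FLUSH, DEMO_DISCARD };

#define DEMO_OP_NAME(name) [DEMO_##name] = #name
static const char *const demo_op_name[] = {
        DEMO_OP_NAME(READ),
        DEMO_OP_NAME(WRITE),
        DEMO_OP_NAME(FLUSH),
        DEMO_OP_NAME(DISCARD),
};
#undef DEMO_OP_NAME

static const char *demo_op_str(unsigned int op)
{
        const char *op_str = "UNKNOWN";

        if (op < sizeof(demo_op_name) / sizeof(demo_op_name[0]) &&
            demo_op_name[op])
                op_str = demo_op_name[op];
        return op_str;
}

int main(void)
{
        printf("%s %s\n", demo_op_str(DEMO_WRITE), demo_op_str(42));
        return 0;                       /* prints "WRITE UNKNOWN" */
}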
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d22e61bced86..d973c38ee4fd 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c | |||
@@ -618,44 +618,26 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) | |||
618 | 618 | ||
619 | inflight = atomic_dec_return(&rqw->inflight); | 619 | inflight = atomic_dec_return(&rqw->inflight); |
620 | WARN_ON_ONCE(inflight < 0); | 620 | WARN_ON_ONCE(inflight < 0); |
621 | if (iolat->min_lat_nsec == 0) | 621 | /* |
622 | goto next; | 622 | * If bi_status is BLK_STS_AGAIN, the bio wasn't actually |
623 | iolatency_record_time(iolat, &bio->bi_issue, now, | 623 | * submitted, so do not account for it. |
624 | issue_as_root); | 624 | */ |
625 | window_start = atomic64_read(&iolat->window_start); | 625 | if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { |
626 | if (now > window_start && | 626 | iolatency_record_time(iolat, &bio->bi_issue, now, |
627 | (now - window_start) >= iolat->cur_win_nsec) { | 627 | issue_as_root); |
628 | if (atomic64_cmpxchg(&iolat->window_start, | 628 | window_start = atomic64_read(&iolat->window_start); |
629 | window_start, now) == window_start) | 629 | if (now > window_start && |
630 | iolatency_check_latencies(iolat, now); | 630 | (now - window_start) >= iolat->cur_win_nsec) { |
631 | if (atomic64_cmpxchg(&iolat->window_start, | ||
632 | window_start, now) == window_start) | ||
633 | iolatency_check_latencies(iolat, now); | ||
634 | } | ||
631 | } | 635 | } |
632 | next: | ||
633 | wake_up(&rqw->wait); | 636 | wake_up(&rqw->wait); |
634 | blkg = blkg->parent; | 637 | blkg = blkg->parent; |
635 | } | 638 | } |
636 | } | 639 | } |
637 | 640 | ||
638 | static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio) | ||
639 | { | ||
640 | struct blkcg_gq *blkg; | ||
641 | |||
642 | blkg = bio->bi_blkg; | ||
643 | while (blkg && blkg->parent) { | ||
644 | struct rq_wait *rqw; | ||
645 | struct iolatency_grp *iolat; | ||
646 | |||
647 | iolat = blkg_to_lat(blkg); | ||
648 | if (!iolat) | ||
649 | goto next; | ||
650 | |||
651 | rqw = &iolat->rq_wait; | ||
652 | atomic_dec(&rqw->inflight); | ||
653 | wake_up(&rqw->wait); | ||
654 | next: | ||
655 | blkg = blkg->parent; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | static void blkcg_iolatency_exit(struct rq_qos *rqos) | 641 | static void blkcg_iolatency_exit(struct rq_qos *rqos) |
660 | { | 642 | { |
661 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); | 643 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
@@ -667,7 +649,6 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) | |||
667 | 649 | ||
668 | static struct rq_qos_ops blkcg_iolatency_ops = { | 650 | static struct rq_qos_ops blkcg_iolatency_ops = { |
669 | .throttle = blkcg_iolatency_throttle, | 651 | .throttle = blkcg_iolatency_throttle, |
670 | .cleanup = blkcg_iolatency_cleanup, | ||
671 | .done_bio = blkcg_iolatency_done_bio, | 652 | .done_bio = blkcg_iolatency_done_bio, |
672 | .exit = blkcg_iolatency_exit, | 653 | .exit = blkcg_iolatency_exit, |
673 | }; | 654 | }; |
@@ -778,8 +759,10 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) | |||
778 | 759 | ||
779 | if (!oldval && val) | 760 | if (!oldval && val) |
780 | return 1; | 761 | return 1; |
781 | if (oldval && !val) | 762 | if (oldval && !val) { |
763 | blkcg_clear_delay(blkg); | ||
782 | return -1; | 764 | return -1; |
765 | } | ||
783 | return 0; | 766 | return 0; |
784 | } | 767 | } |
785 | 768 | ||
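The blk-iolatency.c diff above makes completion accounting skip bios that finish with BLK_STS_AGAIN, since those were never actually submitted, and drops the separate .cleanup hook so a single done_bio path decrements the inflight count and wakes waiters. The retained window-rollover logic relies on a compare-and-swap on window_start so that only one completer per window runs iolatency_check_latencies(); a standalone C11 sketch of that claim-the-window pattern (all names below are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t window_start;
static const uint64_t window_len_ns = 100 * 1000 * 1000;        /* 100 ms */

static bool maybe_roll_window(uint64_t now)
{
        uint64_t start = atomic_load(&window_start);

        if (now <= start || now - start < window_len_ns)
                return false;           /* current window still open */

        /* Only one concurrent caller wins the swap and does the rollover. */
        return atomic_compare_exchange_strong(&window_start, &start, now);
}

int main(void)
{
        return maybe_roll_window(250 * 1000 * 1000) ? 0 : 1;
}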
diff --git a/block/blk-map.c b/block/blk-map.c index db9373bd31ac..3a62e471d81b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c | |||
@@ -18,13 +18,19 @@ | |||
18 | int blk_rq_append_bio(struct request *rq, struct bio **bio) | 18 | int blk_rq_append_bio(struct request *rq, struct bio **bio) |
19 | { | 19 | { |
20 | struct bio *orig_bio = *bio; | 20 | struct bio *orig_bio = *bio; |
21 | struct bvec_iter iter; | ||
22 | struct bio_vec bv; | ||
23 | unsigned int nr_segs = 0; | ||
21 | 24 | ||
22 | blk_queue_bounce(rq->q, bio); | 25 | blk_queue_bounce(rq->q, bio); |
23 | 26 | ||
27 | bio_for_each_bvec(bv, *bio, iter) | ||
28 | nr_segs++; | ||
29 | |||
24 | if (!rq->bio) { | 30 | if (!rq->bio) { |
25 | blk_rq_bio_prep(rq->q, rq, *bio); | 31 | blk_rq_bio_prep(rq, *bio, nr_segs); |
26 | } else { | 32 | } else { |
27 | if (!ll_back_merge_fn(rq->q, rq, *bio)) { | 33 | if (!ll_back_merge_fn(rq, *bio, nr_segs)) { |
28 | if (orig_bio != *bio) { | 34 | if (orig_bio != *bio) { |
29 | bio_put(*bio); | 35 | bio_put(*bio); |
30 | *bio = orig_bio; | 36 | *bio = orig_bio; |
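With bio_phys_segments() and the BIO_SEG_VALID caching gone (see the blk-merge.c diff below), a caller that feeds a bio to a request outside the normal submission path has to count the segments itself and pass the number along, which is exactly what blk_rq_append_bio() now does above. The same few lines apply to any caller of blk_rq_bio_prep() or ll_back_merge_fn():

        struct bvec_iter iter;
        struct bio_vec bv;
        unsigned int nr_segs = 0;

        bio_for_each_bvec(bv, bio, iter)
                nr_segs++;              /* one count per multi-page bvec */

        blk_rq_bio_prep(rq, bio, nr_segs);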
diff --git a/block/blk-merge.c b/block/blk-merge.c index 17713d7d98d5..57f7990b342d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -105,7 +105,7 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, | |||
105 | static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, | 105 | static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, |
106 | struct bio *bio, struct bio_set *bs, unsigned *nsegs) | 106 | struct bio *bio, struct bio_set *bs, unsigned *nsegs) |
107 | { | 107 | { |
108 | *nsegs = 1; | 108 | *nsegs = 0; |
109 | 109 | ||
110 | if (!q->limits.max_write_zeroes_sectors) | 110 | if (!q->limits.max_write_zeroes_sectors) |
111 | return NULL; | 111 | return NULL; |
@@ -202,8 +202,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, | |||
202 | struct bio_vec bv, bvprv, *bvprvp = NULL; | 202 | struct bio_vec bv, bvprv, *bvprvp = NULL; |
203 | struct bvec_iter iter; | 203 | struct bvec_iter iter; |
204 | unsigned nsegs = 0, sectors = 0; | 204 | unsigned nsegs = 0, sectors = 0; |
205 | bool do_split = true; | ||
206 | struct bio *new = NULL; | ||
207 | const unsigned max_sectors = get_max_io_size(q, bio); | 205 | const unsigned max_sectors = get_max_io_size(q, bio); |
208 | const unsigned max_segs = queue_max_segments(q); | 206 | const unsigned max_segs = queue_max_segments(q); |
209 | 207 | ||
@@ -245,45 +243,36 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, | |||
245 | } | 243 | } |
246 | } | 244 | } |
247 | 245 | ||
248 | do_split = false; | 246 | *segs = nsegs; |
247 | return NULL; | ||
249 | split: | 248 | split: |
250 | *segs = nsegs; | 249 | *segs = nsegs; |
251 | 250 | return bio_split(bio, sectors, GFP_NOIO, bs); | |
252 | if (do_split) { | ||
253 | new = bio_split(bio, sectors, GFP_NOIO, bs); | ||
254 | if (new) | ||
255 | bio = new; | ||
256 | } | ||
257 | |||
258 | return do_split ? new : NULL; | ||
259 | } | 251 | } |
260 | 252 | ||
261 | void blk_queue_split(struct request_queue *q, struct bio **bio) | 253 | void __blk_queue_split(struct request_queue *q, struct bio **bio, |
254 | unsigned int *nr_segs) | ||
262 | { | 255 | { |
263 | struct bio *split, *res; | 256 | struct bio *split; |
264 | unsigned nsegs; | ||
265 | 257 | ||
266 | switch (bio_op(*bio)) { | 258 | switch (bio_op(*bio)) { |
267 | case REQ_OP_DISCARD: | 259 | case REQ_OP_DISCARD: |
268 | case REQ_OP_SECURE_ERASE: | 260 | case REQ_OP_SECURE_ERASE: |
269 | split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs); | 261 | split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs); |
270 | break; | 262 | break; |
271 | case REQ_OP_WRITE_ZEROES: | 263 | case REQ_OP_WRITE_ZEROES: |
272 | split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs); | 264 | split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, |
265 | nr_segs); | ||
273 | break; | 266 | break; |
274 | case REQ_OP_WRITE_SAME: | 267 | case REQ_OP_WRITE_SAME: |
275 | split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs); | 268 | split = blk_bio_write_same_split(q, *bio, &q->bio_split, |
269 | nr_segs); | ||
276 | break; | 270 | break; |
277 | default: | 271 | default: |
278 | split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs); | 272 | split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); |
279 | break; | 273 | break; |
280 | } | 274 | } |
281 | 275 | ||
282 | /* physical segments can be figured out during splitting */ | ||
283 | res = split ? split : *bio; | ||
284 | res->bi_phys_segments = nsegs; | ||
285 | bio_set_flag(res, BIO_SEG_VALID); | ||
286 | |||
287 | if (split) { | 276 | if (split) { |
288 | /* there isn't chance to merge the splitted bio */ | 277 | /* there isn't chance to merge the splitted bio */ |
289 | split->bi_opf |= REQ_NOMERGE; | 278 | split->bi_opf |= REQ_NOMERGE; |
@@ -304,19 +293,25 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) | |||
304 | *bio = split; | 293 | *bio = split; |
305 | } | 294 | } |
306 | } | 295 | } |
296 | |||
297 | void blk_queue_split(struct request_queue *q, struct bio **bio) | ||
298 | { | ||
299 | unsigned int nr_segs; | ||
300 | |||
301 | __blk_queue_split(q, bio, &nr_segs); | ||
302 | } | ||
307 | EXPORT_SYMBOL(blk_queue_split); | 303 | EXPORT_SYMBOL(blk_queue_split); |
308 | 304 | ||
309 | static unsigned int __blk_recalc_rq_segments(struct request_queue *q, | 305 | unsigned int blk_recalc_rq_segments(struct request *rq) |
310 | struct bio *bio) | ||
311 | { | 306 | { |
312 | unsigned int nr_phys_segs = 0; | 307 | unsigned int nr_phys_segs = 0; |
313 | struct bvec_iter iter; | 308 | struct req_iterator iter; |
314 | struct bio_vec bv; | 309 | struct bio_vec bv; |
315 | 310 | ||
316 | if (!bio) | 311 | if (!rq->bio) |
317 | return 0; | 312 | return 0; |
318 | 313 | ||
319 | switch (bio_op(bio)) { | 314 | switch (bio_op(rq->bio)) { |
320 | case REQ_OP_DISCARD: | 315 | case REQ_OP_DISCARD: |
321 | case REQ_OP_SECURE_ERASE: | 316 | case REQ_OP_SECURE_ERASE: |
322 | case REQ_OP_WRITE_ZEROES: | 317 | case REQ_OP_WRITE_ZEROES: |
@@ -325,30 +320,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, | |||
325 | return 1; | 320 | return 1; |
326 | } | 321 | } |
327 | 322 | ||
328 | for_each_bio(bio) { | 323 | rq_for_each_bvec(bv, rq, iter) |
329 | bio_for_each_bvec(bv, bio, iter) | 324 | bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX); |
330 | bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX); | ||
331 | } | ||
332 | |||
333 | return nr_phys_segs; | 325 | return nr_phys_segs; |
334 | } | 326 | } |
335 | 327 | ||
336 | void blk_recalc_rq_segments(struct request *rq) | ||
337 | { | ||
338 | rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); | ||
339 | } | ||
340 | |||
341 | void blk_recount_segments(struct request_queue *q, struct bio *bio) | ||
342 | { | ||
343 | struct bio *nxt = bio->bi_next; | ||
344 | |||
345 | bio->bi_next = NULL; | ||
346 | bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); | ||
347 | bio->bi_next = nxt; | ||
348 | |||
349 | bio_set_flag(bio, BIO_SEG_VALID); | ||
350 | } | ||
351 | |||
352 | static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, | 328 | static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, |
353 | struct scatterlist *sglist) | 329 | struct scatterlist *sglist) |
354 | { | 330 | { |
@@ -519,16 +495,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
519 | } | 495 | } |
520 | EXPORT_SYMBOL(blk_rq_map_sg); | 496 | EXPORT_SYMBOL(blk_rq_map_sg); |
521 | 497 | ||
522 | static inline int ll_new_hw_segment(struct request_queue *q, | 498 | static inline int ll_new_hw_segment(struct request *req, struct bio *bio, |
523 | struct request *req, | 499 | unsigned int nr_phys_segs) |
524 | struct bio *bio) | ||
525 | { | 500 | { |
526 | int nr_phys_segs = bio_phys_segments(q, bio); | 501 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) |
527 | |||
528 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) | ||
529 | goto no_merge; | 502 | goto no_merge; |
530 | 503 | ||
531 | if (blk_integrity_merge_bio(q, req, bio) == false) | 504 | if (blk_integrity_merge_bio(req->q, req, bio) == false) |
532 | goto no_merge; | 505 | goto no_merge; |
533 | 506 | ||
534 | /* | 507 | /* |
@@ -539,12 +512,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, | |||
539 | return 1; | 512 | return 1; |
540 | 513 | ||
541 | no_merge: | 514 | no_merge: |
542 | req_set_nomerge(q, req); | 515 | req_set_nomerge(req->q, req); |
543 | return 0; | 516 | return 0; |
544 | } | 517 | } |
545 | 518 | ||
546 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 519 | int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) |
547 | struct bio *bio) | ||
548 | { | 520 | { |
549 | if (req_gap_back_merge(req, bio)) | 521 | if (req_gap_back_merge(req, bio)) |
550 | return 0; | 522 | return 0; |
@@ -553,21 +525,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, | |||
553 | return 0; | 525 | return 0; |
554 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 526 | if (blk_rq_sectors(req) + bio_sectors(bio) > |
555 | blk_rq_get_max_sectors(req, blk_rq_pos(req))) { | 527 | blk_rq_get_max_sectors(req, blk_rq_pos(req))) { |
556 | req_set_nomerge(q, req); | 528 | req_set_nomerge(req->q, req); |
557 | return 0; | 529 | return 0; |
558 | } | 530 | } |
559 | if (!bio_flagged(req->biotail, BIO_SEG_VALID)) | ||
560 | blk_recount_segments(q, req->biotail); | ||
561 | if (!bio_flagged(bio, BIO_SEG_VALID)) | ||
562 | blk_recount_segments(q, bio); | ||
563 | 531 | ||
564 | return ll_new_hw_segment(q, req, bio); | 532 | return ll_new_hw_segment(req, bio, nr_segs); |
565 | } | 533 | } |
566 | 534 | ||
567 | int ll_front_merge_fn(struct request_queue *q, struct request *req, | 535 | int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) |
568 | struct bio *bio) | ||
569 | { | 536 | { |
570 | |||
571 | if (req_gap_front_merge(req, bio)) | 537 | if (req_gap_front_merge(req, bio)) |
572 | return 0; | 538 | return 0; |
573 | if (blk_integrity_rq(req) && | 539 | if (blk_integrity_rq(req) && |
@@ -575,15 +541,11 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
575 | return 0; | 541 | return 0; |
576 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 542 | if (blk_rq_sectors(req) + bio_sectors(bio) > |
577 | blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { | 543 | blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { |
578 | req_set_nomerge(q, req); | 544 | req_set_nomerge(req->q, req); |
579 | return 0; | 545 | return 0; |
580 | } | 546 | } |
581 | if (!bio_flagged(bio, BIO_SEG_VALID)) | ||
582 | blk_recount_segments(q, bio); | ||
583 | if (!bio_flagged(req->bio, BIO_SEG_VALID)) | ||
584 | blk_recount_segments(q, req->bio); | ||
585 | 547 | ||
586 | return ll_new_hw_segment(q, req, bio); | 548 | return ll_new_hw_segment(req, bio, nr_segs); |
587 | } | 549 | } |
588 | 550 | ||
589 | static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, | 551 | static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, |
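The blk-merge.c changes turn the segment count into an output of the split step instead of a cached bio flag: __blk_queue_split() reports nr_segs to its caller, blk_recalc_rq_segments() returns the count rather than storing it, and blk_recount_segments()/BIO_SEG_VALID disappear entirely. Note also that blk_bio_write_zeroes_split() now reports zero segments, since write-zeroes bios carry no data payload. A hedged sketch of the resulting caller pattern:

        unsigned int nr_segs;

        /* split once, remember how many physical segments resulted */
        __blk_queue_split(q, &bio, &nr_segs);

        /* ... nr_segs is handed to the prep/merge helpers from here on ... */

        /* if the request is modified later, recompute instead of re-flagging */
        rq->nr_phys_segments = blk_recalc_rq_segments(rq);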
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3afe327f816f..b3f2ba483992 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -17,7 +17,7 @@ | |||
17 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) | 17 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) |
18 | { | 18 | { |
19 | if (stat->nr_samples) { | 19 | if (stat->nr_samples) { |
20 | seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", | 20 | seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu", |
21 | stat->nr_samples, stat->mean, stat->min, stat->max); | 21 | stat->nr_samples, stat->mean, stat->min, stat->max); |
22 | } else { | 22 | } else { |
23 | seq_puts(m, "samples=0"); | 23 | seq_puts(m, "samples=0"); |
@@ -29,13 +29,13 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) | |||
29 | struct request_queue *q = data; | 29 | struct request_queue *q = data; |
30 | int bucket; | 30 | int bucket; |
31 | 31 | ||
32 | for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { | 32 | for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { |
33 | seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); | 33 | seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); |
34 | print_stat(m, &q->poll_stat[2*bucket]); | 34 | print_stat(m, &q->poll_stat[2 * bucket]); |
35 | seq_puts(m, "\n"); | 35 | seq_puts(m, "\n"); |
36 | 36 | ||
37 | seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); | 37 | seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); |
38 | print_stat(m, &q->poll_stat[2*bucket+1]); | 38 | print_stat(m, &q->poll_stat[2 * bucket + 1]); |
39 | seq_puts(m, "\n"); | 39 | seq_puts(m, "\n"); |
40 | } | 40 | } |
41 | return 0; | 41 | return 0; |
@@ -261,23 +261,6 @@ static int hctx_flags_show(void *data, struct seq_file *m) | |||
261 | return 0; | 261 | return 0; |
262 | } | 262 | } |
263 | 263 | ||
264 | #define REQ_OP_NAME(name) [REQ_OP_##name] = #name | ||
265 | static const char *const op_name[] = { | ||
266 | REQ_OP_NAME(READ), | ||
267 | REQ_OP_NAME(WRITE), | ||
268 | REQ_OP_NAME(FLUSH), | ||
269 | REQ_OP_NAME(DISCARD), | ||
270 | REQ_OP_NAME(SECURE_ERASE), | ||
271 | REQ_OP_NAME(ZONE_RESET), | ||
272 | REQ_OP_NAME(WRITE_SAME), | ||
273 | REQ_OP_NAME(WRITE_ZEROES), | ||
274 | REQ_OP_NAME(SCSI_IN), | ||
275 | REQ_OP_NAME(SCSI_OUT), | ||
276 | REQ_OP_NAME(DRV_IN), | ||
277 | REQ_OP_NAME(DRV_OUT), | ||
278 | }; | ||
279 | #undef REQ_OP_NAME | ||
280 | |||
281 | #define CMD_FLAG_NAME(name) [__REQ_##name] = #name | 264 | #define CMD_FLAG_NAME(name) [__REQ_##name] = #name |
282 | static const char *const cmd_flag_name[] = { | 265 | static const char *const cmd_flag_name[] = { |
283 | CMD_FLAG_NAME(FAILFAST_DEV), | 266 | CMD_FLAG_NAME(FAILFAST_DEV), |
@@ -341,13 +324,14 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) | |||
341 | int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) | 324 | int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) |
342 | { | 325 | { |
343 | const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; | 326 | const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; |
344 | const unsigned int op = rq->cmd_flags & REQ_OP_MASK; | 327 | const unsigned int op = req_op(rq); |
328 | const char *op_str = blk_op_str(op); | ||
345 | 329 | ||
346 | seq_printf(m, "%p {.op=", rq); | 330 | seq_printf(m, "%p {.op=", rq); |
347 | if (op < ARRAY_SIZE(op_name) && op_name[op]) | 331 | if (strcmp(op_str, "UNKNOWN") == 0) |
348 | seq_printf(m, "%s", op_name[op]); | 332 | seq_printf(m, "%u", op); |
349 | else | 333 | else |
350 | seq_printf(m, "%d", op); | 334 | seq_printf(m, "%s", op_str); |
351 | seq_puts(m, ", .cmd_flags="); | 335 | seq_puts(m, ", .cmd_flags="); |
352 | blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, | 336 | blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, |
353 | ARRAY_SIZE(cmd_flag_name)); | 337 | ARRAY_SIZE(cmd_flag_name)); |
@@ -779,8 +763,8 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file) | |||
779 | 763 | ||
780 | if (attr->show) | 764 | if (attr->show) |
781 | return single_release(inode, file); | 765 | return single_release(inode, file); |
782 | else | 766 | |
783 | return seq_release(inode, file); | 767 | return seq_release(inode, file); |
784 | } | 768 | } |
785 | 769 | ||
786 | static const struct file_operations blk_mq_debugfs_fops = { | 770 | static const struct file_operations blk_mq_debugfs_fops = { |
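blk-mq-debugfs.c drops its private op_name[] table in favour of the shared blk_op_str(); because that helper returns the literal string "UNKNOWN" rather than NULL for unnamed ops, the debugfs code compares against that sentinel before deciding whether to print the raw number. Usage sketch of the hunk above:

        const unsigned int op = req_op(rq);
        const char *op_str = blk_op_str(op);

        if (strcmp(op_str, "UNKNOWN") == 0)
                seq_printf(m, "%u", op);        /* no name registered for this op */
        else
                seq_printf(m, "%s", op_str);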
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2766066a15db..c9d183d6c499 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
@@ -224,7 +224,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
224 | } | 224 | } |
225 | 225 | ||
226 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | 226 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, |
227 | struct request **merged_request) | 227 | unsigned int nr_segs, struct request **merged_request) |
228 | { | 228 | { |
229 | struct request *rq; | 229 | struct request *rq; |
230 | 230 | ||
@@ -232,7 +232,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | |||
232 | case ELEVATOR_BACK_MERGE: | 232 | case ELEVATOR_BACK_MERGE: |
233 | if (!blk_mq_sched_allow_merge(q, rq, bio)) | 233 | if (!blk_mq_sched_allow_merge(q, rq, bio)) |
234 | return false; | 234 | return false; |
235 | if (!bio_attempt_back_merge(q, rq, bio)) | 235 | if (!bio_attempt_back_merge(rq, bio, nr_segs)) |
236 | return false; | 236 | return false; |
237 | *merged_request = attempt_back_merge(q, rq); | 237 | *merged_request = attempt_back_merge(q, rq); |
238 | if (!*merged_request) | 238 | if (!*merged_request) |
@@ -241,7 +241,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | |||
241 | case ELEVATOR_FRONT_MERGE: | 241 | case ELEVATOR_FRONT_MERGE: |
242 | if (!blk_mq_sched_allow_merge(q, rq, bio)) | 242 | if (!blk_mq_sched_allow_merge(q, rq, bio)) |
243 | return false; | 243 | return false; |
244 | if (!bio_attempt_front_merge(q, rq, bio)) | 244 | if (!bio_attempt_front_merge(rq, bio, nr_segs)) |
245 | return false; | 245 | return false; |
246 | *merged_request = attempt_front_merge(q, rq); | 246 | *merged_request = attempt_front_merge(q, rq); |
247 | if (!*merged_request) | 247 | if (!*merged_request) |
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); | |||
260 | * of them. | 260 | * of them. |
261 | */ | 261 | */ |
262 | bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, | 262 | bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, |
263 | struct bio *bio) | 263 | struct bio *bio, unsigned int nr_segs) |
264 | { | 264 | { |
265 | struct request *rq; | 265 | struct request *rq; |
266 | int checked = 8; | 266 | int checked = 8; |
@@ -277,11 +277,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, | |||
277 | switch (blk_try_merge(rq, bio)) { | 277 | switch (blk_try_merge(rq, bio)) { |
278 | case ELEVATOR_BACK_MERGE: | 278 | case ELEVATOR_BACK_MERGE: |
279 | if (blk_mq_sched_allow_merge(q, rq, bio)) | 279 | if (blk_mq_sched_allow_merge(q, rq, bio)) |
280 | merged = bio_attempt_back_merge(q, rq, bio); | 280 | merged = bio_attempt_back_merge(rq, bio, |
281 | nr_segs); | ||
281 | break; | 282 | break; |
282 | case ELEVATOR_FRONT_MERGE: | 283 | case ELEVATOR_FRONT_MERGE: |
283 | if (blk_mq_sched_allow_merge(q, rq, bio)) | 284 | if (blk_mq_sched_allow_merge(q, rq, bio)) |
284 | merged = bio_attempt_front_merge(q, rq, bio); | 285 | merged = bio_attempt_front_merge(rq, bio, |
286 | nr_segs); | ||
285 | break; | 287 | break; |
286 | case ELEVATOR_DISCARD_MERGE: | 288 | case ELEVATOR_DISCARD_MERGE: |
287 | merged = bio_attempt_discard_merge(q, rq, bio); | 289 | merged = bio_attempt_discard_merge(q, rq, bio); |
@@ -304,13 +306,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); | |||
304 | */ | 306 | */ |
305 | static bool blk_mq_attempt_merge(struct request_queue *q, | 307 | static bool blk_mq_attempt_merge(struct request_queue *q, |
306 | struct blk_mq_hw_ctx *hctx, | 308 | struct blk_mq_hw_ctx *hctx, |
307 | struct blk_mq_ctx *ctx, struct bio *bio) | 309 | struct blk_mq_ctx *ctx, struct bio *bio, |
310 | unsigned int nr_segs) | ||
308 | { | 311 | { |
309 | enum hctx_type type = hctx->type; | 312 | enum hctx_type type = hctx->type; |
310 | 313 | ||
311 | lockdep_assert_held(&ctx->lock); | 314 | lockdep_assert_held(&ctx->lock); |
312 | 315 | ||
313 | if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { | 316 | if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { |
314 | ctx->rq_merged++; | 317 | ctx->rq_merged++; |
315 | return true; | 318 | return true; |
316 | } | 319 | } |
@@ -318,7 +321,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q, | |||
318 | return false; | 321 | return false; |
319 | } | 322 | } |
320 | 323 | ||
321 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | 324 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, |
325 | unsigned int nr_segs) | ||
322 | { | 326 | { |
323 | struct elevator_queue *e = q->elevator; | 327 | struct elevator_queue *e = q->elevator; |
324 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); | 328 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); |
@@ -326,21 +330,18 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | |||
326 | bool ret = false; | 330 | bool ret = false; |
327 | enum hctx_type type; | 331 | enum hctx_type type; |
328 | 332 | ||
329 | if (e && e->type->ops.bio_merge) { | 333 | if (e && e->type->ops.bio_merge) |
330 | blk_mq_put_ctx(ctx); | 334 | return e->type->ops.bio_merge(hctx, bio, nr_segs); |
331 | return e->type->ops.bio_merge(hctx, bio); | ||
332 | } | ||
333 | 335 | ||
334 | type = hctx->type; | 336 | type = hctx->type; |
335 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | 337 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && |
336 | !list_empty_careful(&ctx->rq_lists[type])) { | 338 | !list_empty_careful(&ctx->rq_lists[type])) { |
337 | /* default per sw-queue merge */ | 339 | /* default per sw-queue merge */ |
338 | spin_lock(&ctx->lock); | 340 | spin_lock(&ctx->lock); |
339 | ret = blk_mq_attempt_merge(q, hctx, ctx, bio); | 341 | ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); |
340 | spin_unlock(&ctx->lock); | 342 | spin_unlock(&ctx->lock); |
341 | } | 343 | } |
342 | 344 | ||
343 | blk_mq_put_ctx(ctx); | ||
344 | return ret; | 345 | return ret; |
345 | } | 346 | } |
346 | 347 | ||
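Two themes run through the blk-mq-sched.c hunks: the segment count computed at split time becomes a parameter of every merge hook (blk_mq_sched_try_merge(), blk_mq_bio_list_merge(), __blk_mq_sched_bio_merge()), and the blk_mq_put_ctx() calls vanish because taking a software-queue context no longer requires a balancing put. A sketch of the submission-side flow these signatures serve, mirroring the blk-mq.c diff further down:

        unsigned int nr_segs;

        __blk_queue_split(q, &bio, &nr_segs);

        if (!is_flush_fua && !blk_queue_nomerges(q) &&
            blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
                return BLK_QC_T_NONE;

        if (blk_mq_sched_bio_merge(q, bio, nr_segs))
                return BLK_QC_T_NONE;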
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 3cf92cbbd8ac..cf22ab00fefb 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h | |||
@@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq); | |||
12 | 12 | ||
13 | void blk_mq_sched_request_inserted(struct request *rq); | 13 | void blk_mq_sched_request_inserted(struct request *rq); |
14 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | 14 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, |
15 | struct request **merged_request); | 15 | unsigned int nr_segs, struct request **merged_request); |
16 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); | 16 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, |
17 | unsigned int nr_segs); | ||
17 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); | 18 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); |
18 | void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); | 19 | void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); |
19 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); | 20 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); |
@@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); | |||
31 | void blk_mq_sched_free_requests(struct request_queue *q); | 32 | void blk_mq_sched_free_requests(struct request_queue *q); |
32 | 33 | ||
33 | static inline bool | 34 | static inline bool |
34 | blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | 35 | blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, |
36 | unsigned int nr_segs) | ||
35 | { | 37 | { |
36 | if (blk_queue_nomerges(q) || !bio_mergeable(bio)) | 38 | if (blk_queue_nomerges(q) || !bio_mergeable(bio)) |
37 | return false; | 39 | return false; |
38 | 40 | ||
39 | return __blk_mq_sched_bio_merge(q, bio); | 41 | return __blk_mq_sched_bio_merge(q, bio, nr_segs); |
40 | } | 42 | } |
41 | 43 | ||
42 | static inline bool | 44 | static inline bool |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 7513c8eaabee..da19f0bc8876 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
@@ -113,7 +113,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
113 | struct sbq_wait_state *ws; | 113 | struct sbq_wait_state *ws; |
114 | DEFINE_SBQ_WAIT(wait); | 114 | DEFINE_SBQ_WAIT(wait); |
115 | unsigned int tag_offset; | 115 | unsigned int tag_offset; |
116 | bool drop_ctx; | ||
117 | int tag; | 116 | int tag; |
118 | 117 | ||
119 | if (data->flags & BLK_MQ_REQ_RESERVED) { | 118 | if (data->flags & BLK_MQ_REQ_RESERVED) { |
@@ -136,7 +135,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
136 | return BLK_MQ_TAG_FAIL; | 135 | return BLK_MQ_TAG_FAIL; |
137 | 136 | ||
138 | ws = bt_wait_ptr(bt, data->hctx); | 137 | ws = bt_wait_ptr(bt, data->hctx); |
139 | drop_ctx = data->ctx == NULL; | ||
140 | do { | 138 | do { |
141 | struct sbitmap_queue *bt_prev; | 139 | struct sbitmap_queue *bt_prev; |
142 | 140 | ||
@@ -161,9 +159,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
161 | if (tag != -1) | 159 | if (tag != -1) |
162 | break; | 160 | break; |
163 | 161 | ||
164 | if (data->ctx) | ||
165 | blk_mq_put_ctx(data->ctx); | ||
166 | |||
167 | bt_prev = bt; | 162 | bt_prev = bt; |
168 | io_schedule(); | 163 | io_schedule(); |
169 | 164 | ||
@@ -189,9 +184,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
189 | ws = bt_wait_ptr(bt, data->hctx); | 184 | ws = bt_wait_ptr(bt, data->hctx); |
190 | } while (1); | 185 | } while (1); |
191 | 186 | ||
192 | if (drop_ctx && data->ctx) | ||
193 | blk_mq_put_ctx(data->ctx); | ||
194 | |||
195 | sbitmap_finish_wait(bt, ws, &wait); | 187 | sbitmap_finish_wait(bt, ws, &wait); |
196 | 188 | ||
197 | found_tag: | 189 | found_tag: |
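With no put to balance, blk_mq_get_tag() no longer releases data->ctx before sleeping and the drop_ctx bookkeeping goes away; the wait loop reduces to prepare/schedule/finish against the sbitmap queue. A minimal shape of the remaining loop, using only the identifiers visible above and eliding the tag-grabbing details:

        ws = bt_wait_ptr(bt, data->hctx);
        do {
                /* ... attempt to take a tag from bt; break when tag != -1 ... */
                if (tag != -1)
                        break;

                bt_prev = bt;
                io_schedule();          /* no ctx put/re-get around the sleep */

                /* ... re-resolve hctx and bt after waking, as in the code above ... */
                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        sbitmap_finish_wait(bt, ws, &wait);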
diff --git a/block/blk-mq.c b/block/blk-mq.c index ce0f5f4ede70..e5ef40c603ca 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -355,13 +355,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
355 | struct elevator_queue *e = q->elevator; | 355 | struct elevator_queue *e = q->elevator; |
356 | struct request *rq; | 356 | struct request *rq; |
357 | unsigned int tag; | 357 | unsigned int tag; |
358 | bool put_ctx_on_error = false; | 358 | bool clear_ctx_on_error = false; |
359 | 359 | ||
360 | blk_queue_enter_live(q); | 360 | blk_queue_enter_live(q); |
361 | data->q = q; | 361 | data->q = q; |
362 | if (likely(!data->ctx)) { | 362 | if (likely(!data->ctx)) { |
363 | data->ctx = blk_mq_get_ctx(q); | 363 | data->ctx = blk_mq_get_ctx(q); |
364 | put_ctx_on_error = true; | 364 | clear_ctx_on_error = true; |
365 | } | 365 | } |
366 | if (likely(!data->hctx)) | 366 | if (likely(!data->hctx)) |
367 | data->hctx = blk_mq_map_queue(q, data->cmd_flags, | 367 | data->hctx = blk_mq_map_queue(q, data->cmd_flags, |
@@ -387,10 +387,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
387 | 387 | ||
388 | tag = blk_mq_get_tag(data); | 388 | tag = blk_mq_get_tag(data); |
389 | if (tag == BLK_MQ_TAG_FAIL) { | 389 | if (tag == BLK_MQ_TAG_FAIL) { |
390 | if (put_ctx_on_error) { | 390 | if (clear_ctx_on_error) |
391 | blk_mq_put_ctx(data->ctx); | ||
392 | data->ctx = NULL; | 391 | data->ctx = NULL; |
393 | } | ||
394 | blk_queue_exit(q); | 392 | blk_queue_exit(q); |
395 | return NULL; | 393 | return NULL; |
396 | } | 394 | } |
@@ -427,8 +425,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | |||
427 | if (!rq) | 425 | if (!rq) |
428 | return ERR_PTR(-EWOULDBLOCK); | 426 | return ERR_PTR(-EWOULDBLOCK); |
429 | 427 | ||
430 | blk_mq_put_ctx(alloc_data.ctx); | ||
431 | |||
432 | rq->__data_len = 0; | 428 | rq->__data_len = 0; |
433 | rq->__sector = (sector_t) -1; | 429 | rq->__sector = (sector_t) -1; |
434 | rq->bio = rq->biotail = NULL; | 430 | rq->bio = rq->biotail = NULL; |
@@ -1764,9 +1760,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
1764 | } | 1760 | } |
1765 | } | 1761 | } |
1766 | 1762 | ||
1767 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | 1763 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, |
1764 | unsigned int nr_segs) | ||
1768 | { | 1765 | { |
1769 | blk_init_request_from_bio(rq, bio); | 1766 | if (bio->bi_opf & REQ_RAHEAD) |
1767 | rq->cmd_flags |= REQ_FAILFAST_MASK; | ||
1768 | |||
1769 | rq->__sector = bio->bi_iter.bi_sector; | ||
1770 | rq->write_hint = bio->bi_write_hint; | ||
1771 | blk_rq_bio_prep(rq, bio, nr_segs); | ||
1770 | 1772 | ||
1771 | blk_account_io_start(rq, true); | 1773 | blk_account_io_start(rq, true); |
1772 | } | 1774 | } |
@@ -1936,20 +1938,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1936 | struct request *rq; | 1938 | struct request *rq; |
1937 | struct blk_plug *plug; | 1939 | struct blk_plug *plug; |
1938 | struct request *same_queue_rq = NULL; | 1940 | struct request *same_queue_rq = NULL; |
1941 | unsigned int nr_segs; | ||
1939 | blk_qc_t cookie; | 1942 | blk_qc_t cookie; |
1940 | 1943 | ||
1941 | blk_queue_bounce(q, &bio); | 1944 | blk_queue_bounce(q, &bio); |
1942 | 1945 | __blk_queue_split(q, &bio, &nr_segs); | |
1943 | blk_queue_split(q, &bio); | ||
1944 | 1946 | ||
1945 | if (!bio_integrity_prep(bio)) | 1947 | if (!bio_integrity_prep(bio)) |
1946 | return BLK_QC_T_NONE; | 1948 | return BLK_QC_T_NONE; |
1947 | 1949 | ||
1948 | if (!is_flush_fua && !blk_queue_nomerges(q) && | 1950 | if (!is_flush_fua && !blk_queue_nomerges(q) && |
1949 | blk_attempt_plug_merge(q, bio, &same_queue_rq)) | 1951 | blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) |
1950 | return BLK_QC_T_NONE; | 1952 | return BLK_QC_T_NONE; |
1951 | 1953 | ||
1952 | if (blk_mq_sched_bio_merge(q, bio)) | 1954 | if (blk_mq_sched_bio_merge(q, bio, nr_segs)) |
1953 | return BLK_QC_T_NONE; | 1955 | return BLK_QC_T_NONE; |
1954 | 1956 | ||
1955 | rq_qos_throttle(q, bio); | 1957 | rq_qos_throttle(q, bio); |
@@ -1969,11 +1971,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1969 | 1971 | ||
1970 | cookie = request_to_qc_t(data.hctx, rq); | 1972 | cookie = request_to_qc_t(data.hctx, rq); |
1971 | 1973 | ||
1974 | blk_mq_bio_to_request(rq, bio, nr_segs); | ||
1975 | |||
1972 | plug = current->plug; | 1976 | plug = current->plug; |
1973 | if (unlikely(is_flush_fua)) { | 1977 | if (unlikely(is_flush_fua)) { |
1974 | blk_mq_put_ctx(data.ctx); | ||
1975 | blk_mq_bio_to_request(rq, bio); | ||
1976 | |||
1977 | /* bypass scheduler for flush rq */ | 1978 | /* bypass scheduler for flush rq */ |
1978 | blk_insert_flush(rq); | 1979 | blk_insert_flush(rq); |
1979 | blk_mq_run_hw_queue(data.hctx, true); | 1980 | blk_mq_run_hw_queue(data.hctx, true); |
@@ -1985,9 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1985 | unsigned int request_count = plug->rq_count; | 1986 | unsigned int request_count = plug->rq_count; |
1986 | struct request *last = NULL; | 1987 | struct request *last = NULL; |
1987 | 1988 | ||
1988 | blk_mq_put_ctx(data.ctx); | ||
1989 | blk_mq_bio_to_request(rq, bio); | ||
1990 | |||
1991 | if (!request_count) | 1989 | if (!request_count) |
1992 | trace_block_plug(q); | 1990 | trace_block_plug(q); |
1993 | else | 1991 | else |
@@ -2001,8 +1999,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
2001 | 1999 | ||
2002 | blk_add_rq_to_plug(plug, rq); | 2000 | blk_add_rq_to_plug(plug, rq); |
2003 | } else if (plug && !blk_queue_nomerges(q)) { | 2001 | } else if (plug && !blk_queue_nomerges(q)) { |
2004 | blk_mq_bio_to_request(rq, bio); | ||
2005 | |||
2006 | /* | 2002 | /* |
2007 | * We do limited plugging. If the bio can be merged, do that. | 2003 | * We do limited plugging. If the bio can be merged, do that. |
2008 | * Otherwise the existing request in the plug list will be | 2004 | * Otherwise the existing request in the plug list will be |
@@ -2019,8 +2015,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
2019 | blk_add_rq_to_plug(plug, rq); | 2015 | blk_add_rq_to_plug(plug, rq); |
2020 | trace_block_plug(q); | 2016 | trace_block_plug(q); |
2021 | 2017 | ||
2022 | blk_mq_put_ctx(data.ctx); | ||
2023 | |||
2024 | if (same_queue_rq) { | 2018 | if (same_queue_rq) { |
2025 | data.hctx = same_queue_rq->mq_hctx; | 2019 | data.hctx = same_queue_rq->mq_hctx; |
2026 | trace_block_unplug(q, 1, true); | 2020 | trace_block_unplug(q, 1, true); |
@@ -2029,12 +2023,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
2029 | } | 2023 | } |
2030 | } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && | 2024 | } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && |
2031 | !data.hctx->dispatch_busy)) { | 2025 | !data.hctx->dispatch_busy)) { |
2032 | blk_mq_put_ctx(data.ctx); | ||
2033 | blk_mq_bio_to_request(rq, bio); | ||
2034 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); | 2026 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); |
2035 | } else { | 2027 | } else { |
2036 | blk_mq_put_ctx(data.ctx); | ||
2037 | blk_mq_bio_to_request(rq, bio); | ||
2038 | blk_mq_sched_insert_request(rq, false, true, true); | 2028 | blk_mq_sched_insert_request(rq, false, true, true); |
2039 | } | 2029 | } |
2040 | 2030 | ||
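The blk-mq.c changes above hoist the bio-to-request conversion out of the individual dispatch branches: __blk_queue_split() now reports the segment count, which is threaded through the merge paths and into blk_mq_bio_to_request(), so the conversion happens exactly once before the flush/plug/direct/scheduler decision. Below is a schematic userspace sketch of that control-flow shape only; split(), prepare() and the policy enum are illustrative stand-ins, not kernel interfaces.

#include <stdio.h>

enum policy { FLUSH, PLUG, DIRECT, SCHED };

static int split(int bytes)               /* stand-in for __blk_queue_split() */
{
        return (bytes + 4095) / 4096;     /* pretend one segment per page */
}

static void prepare(int nr_segs)          /* stand-in for blk_mq_bio_to_request() */
{
        printf("request prepared with %d segment(s)\n", nr_segs);
}

static void submit(int bytes, enum policy p)
{
        int nr_segs = split(bytes);       /* computed exactly once */

        prepare(nr_segs);                 /* hoisted out of the branches below */

        switch (p) {
        case FLUSH:  printf("bypass scheduler for flush\n"); break;
        case PLUG:   printf("add to plug list\n"); break;
        case DIRECT: printf("issue directly\n"); break;
        case SCHED:  printf("insert via scheduler\n"); break;
        }
}

int main(void)
{
        submit(16384, DIRECT);
        return 0;
}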
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..f4bf5161333e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -151,12 +151,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, | |||
151 | */ | 151 | */ |
152 | static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) | 152 | static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) |
153 | { | 153 | { |
154 | return __blk_mq_get_ctx(q, get_cpu()); | 154 | return __blk_mq_get_ctx(q, raw_smp_processor_id()); |
155 | } | ||
156 | |||
157 | static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) | ||
158 | { | ||
159 | put_cpu(); | ||
160 | } | 155 | } |
161 | 156 | ||
162 | struct blk_mq_alloc_data { | 157 | struct blk_mq_alloc_data { |
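With blk_mq_get_ctx() switched from get_cpu() to raw_smp_processor_id(), the returned ctx is no longer tied to a preemption-disabled section, which is why every blk_mq_put_ctx() call site in the hunks above disappears. The CPU number is only used to pick a per-CPU software queue, so a value that is stale by the time it is used is still acceptable. A minimal userspace analogue, using sched_getcpu() as a stand-in for raw_smp_processor_id() (Linux/glibc only):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        /* Unpinned read: the task may migrate right after this call, but the
         * result is only a placement hint, so no get_cpu()/put_cpu() pair is
         * needed around it. */
        int cpu = sched_getcpu();

        printf("submitting from (approximately) CPU %d\n", cpu);
        return 0;
}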
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..de6b2e146d6e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,8 +51,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, | |||
51 | int node, int cmd_size, gfp_t flags); | 51 | int node, int cmd_size, gfp_t flags); |
52 | void blk_free_flush_queue(struct blk_flush_queue *q); | 52 | void blk_free_flush_queue(struct blk_flush_queue *q); |
53 | 53 | ||
54 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | ||
55 | struct bio *bio); | ||
56 | void blk_freeze_queue(struct request_queue *q); | 54 | void blk_freeze_queue(struct request_queue *q); |
57 | 55 | ||
58 | static inline void blk_queue_enter_live(struct request_queue *q) | 56 | static inline void blk_queue_enter_live(struct request_queue *q) |
@@ -101,6 +99,18 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, | |||
101 | return __bvec_gap_to_prev(q, bprv, offset); | 99 | return __bvec_gap_to_prev(q, bprv, offset); |
102 | } | 100 | } |
103 | 101 | ||
102 | static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, | ||
103 | unsigned int nr_segs) | ||
104 | { | ||
105 | rq->nr_phys_segments = nr_segs; | ||
106 | rq->__data_len = bio->bi_iter.bi_size; | ||
107 | rq->bio = rq->biotail = bio; | ||
108 | rq->ioprio = bio_prio(bio); | ||
109 | |||
110 | if (bio->bi_disk) | ||
111 | rq->rq_disk = bio->bi_disk; | ||
112 | } | ||
113 | |||
104 | #ifdef CONFIG_BLK_DEV_INTEGRITY | 114 | #ifdef CONFIG_BLK_DEV_INTEGRITY |
105 | void blk_flush_integrity(void); | 115 | void blk_flush_integrity(void); |
106 | bool __bio_integrity_endio(struct bio *); | 116 | bool __bio_integrity_endio(struct bio *); |
@@ -154,14 +164,14 @@ static inline bool bio_integrity_endio(struct bio *bio) | |||
154 | unsigned long blk_rq_timeout(unsigned long timeout); | 164 | unsigned long blk_rq_timeout(unsigned long timeout); |
155 | void blk_add_timer(struct request *req); | 165 | void blk_add_timer(struct request *req); |
156 | 166 | ||
157 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | 167 | bool bio_attempt_front_merge(struct request *req, struct bio *bio, |
158 | struct bio *bio); | 168 | unsigned int nr_segs); |
159 | bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | 169 | bool bio_attempt_back_merge(struct request *req, struct bio *bio, |
160 | struct bio *bio); | 170 | unsigned int nr_segs); |
161 | bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, | 171 | bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, |
162 | struct bio *bio); | 172 | struct bio *bio); |
163 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | 173 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
164 | struct request **same_queue_rq); | 174 | unsigned int nr_segs, struct request **same_queue_rq); |
165 | 175 | ||
166 | void blk_account_io_start(struct request *req, bool new_io); | 176 | void blk_account_io_start(struct request *req, bool new_io); |
167 | void blk_account_io_completion(struct request *req, unsigned int bytes); | 177 | void blk_account_io_completion(struct request *req, unsigned int bytes); |
@@ -202,15 +212,17 @@ static inline int blk_should_fake_timeout(struct request_queue *q) | |||
202 | } | 212 | } |
203 | #endif | 213 | #endif |
204 | 214 | ||
205 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 215 | void __blk_queue_split(struct request_queue *q, struct bio **bio, |
206 | struct bio *bio); | 216 | unsigned int *nr_segs); |
207 | int ll_front_merge_fn(struct request_queue *q, struct request *req, | 217 | int ll_back_merge_fn(struct request *req, struct bio *bio, |
208 | struct bio *bio); | 218 | unsigned int nr_segs); |
219 | int ll_front_merge_fn(struct request *req, struct bio *bio, | ||
220 | unsigned int nr_segs); | ||
209 | struct request *attempt_back_merge(struct request_queue *q, struct request *rq); | 221 | struct request *attempt_back_merge(struct request_queue *q, struct request *rq); |
210 | struct request *attempt_front_merge(struct request_queue *q, struct request *rq); | 222 | struct request *attempt_front_merge(struct request_queue *q, struct request *rq); |
211 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | 223 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, |
212 | struct request *next); | 224 | struct request *next); |
213 | void blk_recalc_rq_segments(struct request *rq); | 225 | unsigned int blk_recalc_rq_segments(struct request *rq); |
214 | void blk_rq_set_mixed_merge(struct request *rq); | 226 | void blk_rq_set_mixed_merge(struct request *rq); |
215 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio); | 227 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio); |
216 | enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); | 228 | enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); |
diff --git a/block/genhd.c b/block/genhd.c
index 24654e1d83e6..97887e59f3b2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1281,7 +1281,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) | |||
1281 | struct disk_part_tbl *new_ptbl; | 1281 | struct disk_part_tbl *new_ptbl; |
1282 | int len = old_ptbl ? old_ptbl->len : 0; | 1282 | int len = old_ptbl ? old_ptbl->len : 0; |
1283 | int i, target; | 1283 | int i, target; |
1284 | size_t size; | ||
1285 | 1284 | ||
1286 | /* | 1285 | /* |
1287 | * check for int overflow, since we can get here from blkpg_ioctl() | 1286 | * check for int overflow, since we can get here from blkpg_ioctl() |
@@ -1298,8 +1297,8 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) | |||
1298 | if (target <= len) | 1297 | if (target <= len) |
1299 | return 0; | 1298 | return 0; |
1300 | 1299 | ||
1301 | size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); | 1300 | new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, |
1302 | new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); | 1301 | disk->node_id); |
1303 | if (!new_ptbl) | 1302 | if (!new_ptbl) |
1304 | return -ENOMEM; | 1303 | return -ENOMEM; |
1305 | 1304 | ||
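The genhd.c hunk replaces the open-coded sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]) with struct_size(), which computes the same header-plus-flexible-array size but saturates instead of wrapping on multiplication overflow. A standalone sketch of the idea; struct_size_demo() is a simplified userspace stand-in for the kernel helper and struct part_tbl a hypothetical stand-in for disk_part_tbl:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Header plus flexible array member, as allocated with struct_size(). */
struct part_tbl {
        int len;
        void *part[];
};

/* Header size plus n array elements, saturating on overflow like the
 * kernel's struct_size() does. */
static size_t struct_size_demo(size_t n)
{
        size_t elem = sizeof(void *);

        if (n > (SIZE_MAX - sizeof(struct part_tbl)) / elem)
                return SIZE_MAX;
        return sizeof(struct part_tbl) + n * elem;
}

int main(void)
{
        size_t target = 128;
        struct part_tbl *tbl = calloc(1, struct_size_demo(target));

        if (!tbl)
                return 1;
        tbl->len = (int)target;
        printf("allocated %zu bytes for %d slots\n",
               struct_size_demo(target), tbl->len);
        free(tbl);
        return 0;
}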
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c3b05119cebd..34dcea0ef637 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -562,7 +562,8 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) | |||
562 | } | 562 | } |
563 | } | 563 | } |
564 | 564 | ||
565 | static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | 565 | static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, |
566 | unsigned int nr_segs) | ||
566 | { | 567 | { |
567 | struct kyber_hctx_data *khd = hctx->sched_data; | 568 | struct kyber_hctx_data *khd = hctx->sched_data; |
568 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); | 569 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); |
@@ -572,9 +573,8 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | |||
572 | bool merged; | 573 | bool merged; |
573 | 574 | ||
574 | spin_lock(&kcq->lock); | 575 | spin_lock(&kcq->lock); |
575 | merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); | 576 | merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); |
576 | spin_unlock(&kcq->lock); | 577 | spin_unlock(&kcq->lock); |
577 | blk_mq_put_ctx(ctx); | ||
578 | 578 | ||
579 | return merged; | 579 | return merged; |
580 | } | 580 | } |
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1876f5712bfd..b8a682b5a1bb 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -469,7 +469,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, | |||
469 | return ELEVATOR_NO_MERGE; | 469 | return ELEVATOR_NO_MERGE; |
470 | } | 470 | } |
471 | 471 | ||
472 | static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | 472 | static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, |
473 | unsigned int nr_segs) | ||
473 | { | 474 | { |
474 | struct request_queue *q = hctx->queue; | 475 | struct request_queue *q = hctx->queue; |
475 | struct deadline_data *dd = q->elevator->elevator_data; | 476 | struct deadline_data *dd = q->elevator->elevator_data; |
@@ -477,7 +478,7 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | |||
477 | bool ret; | 478 | bool ret; |
478 | 479 | ||
479 | spin_lock(&dd->lock); | 480 | spin_lock(&dd->lock); |
480 | ret = blk_mq_sched_try_merge(q, bio, &free); | 481 | ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); |
481 | spin_unlock(&dd->lock); | 482 | spin_unlock(&dd->lock); |
482 | 483 | ||
483 | if (free) | 484 | if (free) |
diff --git a/block/opal_proto.h b/block/opal_proto.h
index d9a05ad02eb5..466ec7be16ef 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -98,6 +98,7 @@ enum opal_uid { | |||
98 | OPAL_ENTERPRISE_BANDMASTER0_UID, | 98 | OPAL_ENTERPRISE_BANDMASTER0_UID, |
99 | OPAL_ENTERPRISE_ERASEMASTER_UID, | 99 | OPAL_ENTERPRISE_ERASEMASTER_UID, |
100 | /* tables */ | 100 | /* tables */ |
101 | OPAL_TABLE_TABLE, | ||
101 | OPAL_LOCKINGRANGE_GLOBAL, | 102 | OPAL_LOCKINGRANGE_GLOBAL, |
102 | OPAL_LOCKINGRANGE_ACE_RDLOCKED, | 103 | OPAL_LOCKINGRANGE_ACE_RDLOCKED, |
103 | OPAL_LOCKINGRANGE_ACE_WRLOCKED, | 104 | OPAL_LOCKINGRANGE_ACE_WRLOCKED, |
@@ -152,6 +153,21 @@ enum opal_token { | |||
152 | OPAL_STARTCOLUMN = 0x03, | 153 | OPAL_STARTCOLUMN = 0x03, |
153 | OPAL_ENDCOLUMN = 0x04, | 154 | OPAL_ENDCOLUMN = 0x04, |
154 | OPAL_VALUES = 0x01, | 155 | OPAL_VALUES = 0x01, |
156 | /* table table */ | ||
157 | OPAL_TABLE_UID = 0x00, | ||
158 | OPAL_TABLE_NAME = 0x01, | ||
159 | OPAL_TABLE_COMMON = 0x02, | ||
160 | OPAL_TABLE_TEMPLATE = 0x03, | ||
161 | OPAL_TABLE_KIND = 0x04, | ||
162 | OPAL_TABLE_COLUMN = 0x05, | ||
163 | OPAL_TABLE_COLUMNS = 0x06, | ||
164 | OPAL_TABLE_ROWS = 0x07, | ||
165 | OPAL_TABLE_ROWS_FREE = 0x08, | ||
166 | OPAL_TABLE_ROW_BYTES = 0x09, | ||
167 | OPAL_TABLE_LASTID = 0x0A, | ||
168 | OPAL_TABLE_MIN = 0x0B, | ||
169 | OPAL_TABLE_MAX = 0x0C, | ||
170 | |||
155 | /* authority table */ | 171 | /* authority table */ |
156 | OPAL_PIN = 0x03, | 172 | OPAL_PIN = 0x03, |
157 | /* locking tokens */ | 173 | /* locking tokens */ |
diff --git a/block/sed-opal.c b/block/sed-opal.c
index a46e8d13e16d..7e1a444a25b2 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -26,6 +26,9 @@ | |||
26 | #define IO_BUFFER_LENGTH 2048 | 26 | #define IO_BUFFER_LENGTH 2048 |
27 | #define MAX_TOKS 64 | 27 | #define MAX_TOKS 64 |
28 | 28 | ||
29 | /* Number of bytes needed by cmd_finalize. */ | ||
30 | #define CMD_FINALIZE_BYTES_NEEDED 7 | ||
31 | |||
29 | struct opal_step { | 32 | struct opal_step { |
30 | int (*fn)(struct opal_dev *dev, void *data); | 33 | int (*fn)(struct opal_dev *dev, void *data); |
31 | void *data; | 34 | void *data; |
@@ -127,6 +130,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { | |||
127 | 130 | ||
128 | /* tables */ | 131 | /* tables */ |
129 | 132 | ||
133 | [OPAL_TABLE_TABLE] | ||
134 | { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, | ||
130 | [OPAL_LOCKINGRANGE_GLOBAL] = | 135 | [OPAL_LOCKINGRANGE_GLOBAL] = |
131 | { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, | 136 | { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, |
132 | [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = | 137 | [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = |
@@ -523,12 +528,17 @@ static int opal_discovery0_step(struct opal_dev *dev) | |||
523 | return execute_step(dev, &discovery0_step, 0); | 528 | return execute_step(dev, &discovery0_step, 0); |
524 | } | 529 | } |
525 | 530 | ||
531 | static size_t remaining_size(struct opal_dev *cmd) | ||
532 | { | ||
533 | return IO_BUFFER_LENGTH - cmd->pos; | ||
534 | } | ||
535 | |||
526 | static bool can_add(int *err, struct opal_dev *cmd, size_t len) | 536 | static bool can_add(int *err, struct opal_dev *cmd, size_t len) |
527 | { | 537 | { |
528 | if (*err) | 538 | if (*err) |
529 | return false; | 539 | return false; |
530 | 540 | ||
531 | if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) { | 541 | if (remaining_size(cmd) < len) { |
532 | pr_debug("Error adding %zu bytes: end of buffer.\n", len); | 542 | pr_debug("Error adding %zu bytes: end of buffer.\n", len); |
533 | *err = -ERANGE; | 543 | *err = -ERANGE; |
534 | return false; | 544 | return false; |
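The new remaining_size() helper lets can_add() read as "is there room for len more bytes", and because cmd->pos never exceeds IO_BUFFER_LENGTH the subtraction cannot wrap, so the simpler form is equivalent to the old overflow-guarded test. A small self-contained check of that equivalence, with pos and len standing in for cmd->pos and the requested length:

#include <stdio.h>
#include <stddef.h>

#define IO_BUFFER_LENGTH 2048

/* pos is always <= IO_BUFFER_LENGTH, so this cannot underflow. */
static size_t remaining_size(size_t pos)
{
        return IO_BUFFER_LENGTH - pos;
}

static int can_add_old(size_t pos, size_t len)
{
        return !(len > IO_BUFFER_LENGTH || pos > IO_BUFFER_LENGTH - len);
}

static int can_add_new(size_t pos, size_t len)
{
        return !(remaining_size(pos) < len);
}

int main(void)
{
        /* spot-check that both forms agree for all in-range positions */
        for (size_t pos = 0; pos <= IO_BUFFER_LENGTH; pos++)
                for (size_t len = 0; len <= 2 * IO_BUFFER_LENGTH; len += 7)
                        if (can_add_old(pos, len) != can_add_new(pos, len)) {
                                printf("mismatch at pos=%zu len=%zu\n", pos, len);
                                return 1;
                        }
        printf("old and new bounds checks agree\n");
        return 0;
}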
@@ -674,7 +684,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) | |||
674 | struct opal_header *hdr; | 684 | struct opal_header *hdr; |
675 | int err = 0; | 685 | int err = 0; |
676 | 686 | ||
677 | /* close the parameter list opened from cmd_start */ | 687 | /* |
688 | * Close the parameter list opened from cmd_start. | ||
689 | * The number of bytes added must be equal to | ||
690 | * CMD_FINALIZE_BYTES_NEEDED. | ||
691 | */ | ||
678 | add_token_u8(&err, cmd, OPAL_ENDLIST); | 692 | add_token_u8(&err, cmd, OPAL_ENDLIST); |
679 | 693 | ||
680 | add_token_u8(&err, cmd, OPAL_ENDOFDATA); | 694 | add_token_u8(&err, cmd, OPAL_ENDOFDATA); |
@@ -1119,6 +1133,29 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, | |||
1119 | return finalize_and_send(dev, parse_and_check_status); | 1133 | return finalize_and_send(dev, parse_and_check_status); |
1120 | } | 1134 | } |
1121 | 1135 | ||
1136 | /* | ||
1137 | * see TCG SAS 5.3.2.3 for a description of the available columns | ||
1138 | * | ||
1139 | * the result is provided in dev->resp->tok[4] | ||
1140 | */ | ||
1141 | static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, | ||
1142 | u64 column) | ||
1143 | { | ||
1144 | u8 uid[OPAL_UID_LENGTH]; | ||
1145 | const unsigned int half = OPAL_UID_LENGTH/2; | ||
1146 | |||
1147 | /* sed-opal UIDs can be split in two halves: | ||
1148 | * first: actual table index | ||
1149 | * second: relative index in the table | ||
1150 | * so we have to get the first half of the OPAL_TABLE_TABLE and use the | ||
1151 | * first part of the target table as relative index into that table | ||
1152 | */ | ||
1153 | memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); | ||
1154 | memcpy(uid+half, opaluid[table], half); | ||
1155 | |||
1156 | return generic_get_column(dev, uid, column); | ||
1157 | } | ||
1158 | |||
1122 | static int gen_key(struct opal_dev *dev, void *data) | 1159 | static int gen_key(struct opal_dev *dev, void *data) |
1123 | { | 1160 | { |
1124 | u8 uid[OPAL_UID_LENGTH]; | 1161 | u8 uid[OPAL_UID_LENGTH]; |
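generic_get_table_info() builds the UID it queries by splicing two halves: the first four bytes of OPAL_TABLE_TABLE select the Table table itself, and the first four bytes of the target table's UID serve as the row index within it. The MBR table UID is not shown in this hunk, so the sketch below uses OPAL_LOCKINGRANGE_GLOBAL (whose UID does appear above) to illustrate the splice; it prints 00 00 00 01 00 00 08 02.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define OPAL_UID_LENGTH 8

/* UIDs copied from the patch above. */
static const uint8_t table_table_uid[OPAL_UID_LENGTH] =
        { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 };
static const uint8_t lockingrange_global_uid[OPAL_UID_LENGTH] =
        { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 };

int main(void)
{
        uint8_t uid[OPAL_UID_LENGTH];
        const size_t half = OPAL_UID_LENGTH / 2;

        /* first half: the Table table; second half: the target table's own
         * table index, used as the row within the Table table */
        memcpy(uid, table_table_uid, half);
        memcpy(uid + half, lockingrange_global_uid, half);

        for (size_t i = 0; i < OPAL_UID_LENGTH; i++)
                printf("%02x%s", (unsigned)uid[i],
                       i + 1 < OPAL_UID_LENGTH ? " " : "\n");
        return 0;
}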
@@ -1307,6 +1344,7 @@ static int start_generic_opal_session(struct opal_dev *dev, | |||
1307 | break; | 1344 | break; |
1308 | case OPAL_ADMIN1_UID: | 1345 | case OPAL_ADMIN1_UID: |
1309 | case OPAL_SID_UID: | 1346 | case OPAL_SID_UID: |
1347 | case OPAL_PSID_UID: | ||
1310 | add_token_u8(&err, dev, OPAL_STARTNAME); | 1348 | add_token_u8(&err, dev, OPAL_STARTNAME); |
1311 | add_token_u8(&err, dev, 0); /* HostChallenge */ | 1349 | add_token_u8(&err, dev, 0); /* HostChallenge */ |
1312 | add_token_bytestring(&err, dev, key, key_len); | 1350 | add_token_bytestring(&err, dev, key, key_len); |
@@ -1367,6 +1405,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data) | |||
1367 | key->key, key->key_len); | 1405 | key->key, key->key_len); |
1368 | } | 1406 | } |
1369 | 1407 | ||
1408 | static int start_PSID_opal_session(struct opal_dev *dev, void *data) | ||
1409 | { | ||
1410 | const struct opal_key *okey = data; | ||
1411 | |||
1412 | return start_generic_opal_session(dev, OPAL_PSID_UID, | ||
1413 | OPAL_ADMINSP_UID, | ||
1414 | okey->key, | ||
1415 | okey->key_len); | ||
1416 | } | ||
1417 | |||
1370 | static int start_auth_opal_session(struct opal_dev *dev, void *data) | 1418 | static int start_auth_opal_session(struct opal_dev *dev, void *data) |
1371 | { | 1419 | { |
1372 | struct opal_session_info *session = data; | 1420 | struct opal_session_info *session = data; |
@@ -1525,6 +1573,72 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) | |||
1525 | return finalize_and_send(dev, parse_and_check_status); | 1573 | return finalize_and_send(dev, parse_and_check_status); |
1526 | } | 1574 | } |
1527 | 1575 | ||
1576 | static int write_shadow_mbr(struct opal_dev *dev, void *data) | ||
1577 | { | ||
1578 | struct opal_shadow_mbr *shadow = data; | ||
1579 | const u8 __user *src; | ||
1580 | u8 *dst; | ||
1581 | size_t off = 0; | ||
1582 | u64 len; | ||
1583 | int err = 0; | ||
1584 | |||
1585 | /* do we fit in the available shadow mbr space? */ | ||
1586 | err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); | ||
1587 | if (err) { | ||
1588 | pr_debug("MBR: could not get shadow size\n"); | ||
1589 | return err; | ||
1590 | } | ||
1591 | |||
1592 | len = response_get_u64(&dev->parsed, 4); | ||
1593 | if (shadow->size > len || shadow->offset > len - shadow->size) { | ||
1594 | pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", | ||
1595 | shadow->offset + shadow->size, len); | ||
1596 | return -ENOSPC; | ||
1597 | } | ||
1598 | |||
1599 | /* do the actual transmission(s) */ | ||
1600 | src = (u8 __user *)(uintptr_t)shadow->data; | ||
1601 | while (off < shadow->size) { | ||
1602 | err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); | ||
1603 | add_token_u8(&err, dev, OPAL_STARTNAME); | ||
1604 | add_token_u8(&err, dev, OPAL_WHERE); | ||
1605 | add_token_u64(&err, dev, shadow->offset + off); | ||
1606 | add_token_u8(&err, dev, OPAL_ENDNAME); | ||
1607 | |||
1608 | add_token_u8(&err, dev, OPAL_STARTNAME); | ||
1609 | add_token_u8(&err, dev, OPAL_VALUES); | ||
1610 | |||
1611 | /* | ||
1612 | * The bytestring header is either 1 or 2 bytes, so assume 2. | ||
1613 | * There also needs to be enough space to accommodate the | ||
1614 | * trailing OPAL_ENDNAME (1 byte) and tokens added by | ||
1615 | * cmd_finalize. | ||
1616 | */ | ||
1617 | len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), | ||
1618 | (size_t)(shadow->size - off)); | ||
1619 | pr_debug("MBR: write bytes %zu+%llu/%llu\n", | ||
1620 | off, len, shadow->size); | ||
1621 | |||
1622 | dst = add_bytestring_header(&err, dev, len); | ||
1623 | if (!dst) | ||
1624 | break; | ||
1625 | if (copy_from_user(dst, src + off, len)) | ||
1626 | err = -EFAULT; | ||
1627 | dev->pos += len; | ||
1628 | |||
1629 | add_token_u8(&err, dev, OPAL_ENDNAME); | ||
1630 | if (err) | ||
1631 | break; | ||
1632 | |||
1633 | err = finalize_and_send(dev, parse_and_check_status); | ||
1634 | if (err) | ||
1635 | break; | ||
1636 | |||
1637 | off += len; | ||
1638 | } | ||
1639 | return err; | ||
1640 | } | ||
1641 | |||
1528 | static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, | 1642 | static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, |
1529 | struct opal_dev *dev) | 1643 | struct opal_dev *dev) |
1530 | { | 1644 | { |
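write_shadow_mbr() streams the user-supplied image in chunks sized to whatever is left in the 2048-byte command buffer after reserving 2 bytes for the bytestring header, 1 byte for the trailing OPAL_ENDNAME and CMD_FINALIZE_BYTES_NEEDED (7) for cmd_finalize(). A standalone sketch of just that chunking arithmetic; CMD_START_OVERHEAD is a hypothetical fixed cost standing in for the tokens cmd_start() and the WHERE name pair actually emit:

#include <stdio.h>

#define IO_BUFFER_LENGTH 2048
#define CMD_FINALIZE_BYTES_NEEDED 7
#define CMD_START_OVERHEAD 64    /* hypothetical header-token cost per packet */

int main(void)
{
        unsigned long long size = 5000;  /* hypothetical shadow MBR image size */
        unsigned long long off = 0;
        size_t pos, payload;

        while (off < size) {
                pos = CMD_START_OVERHEAD;                        /* buffer position after the header tokens */
                payload = IO_BUFFER_LENGTH - pos                 /* remaining_size(dev) */
                          - (2 + 1 + CMD_FINALIZE_BYTES_NEEDED); /* bytestring hdr + ENDNAME + finalize */
                if (payload > size - off)
                        payload = size - off;                    /* the min() in the kernel loop */
                printf("write bytes %llu+%zu/%llu\n", off, payload, size);
                off += payload;
        }
        return 0;
}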
@@ -1978,6 +2092,50 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, | |||
1978 | return ret; | 2092 | return ret; |
1979 | } | 2093 | } |
1980 | 2094 | ||
2095 | static int opal_set_mbr_done(struct opal_dev *dev, | ||
2096 | struct opal_mbr_done *mbr_done) | ||
2097 | { | ||
2098 | u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ? | ||
2099 | OPAL_TRUE : OPAL_FALSE; | ||
2100 | |||
2101 | const struct opal_step mbr_steps[] = { | ||
2102 | { start_admin1LSP_opal_session, &mbr_done->key }, | ||
2103 | { set_mbr_done, &mbr_done_tf }, | ||
2104 | { end_opal_session, } | ||
2105 | }; | ||
2106 | int ret; | ||
2107 | |||
2108 | if (mbr_done->done_flag != OPAL_MBR_DONE && | ||
2109 | mbr_done->done_flag != OPAL_MBR_NOT_DONE) | ||
2110 | return -EINVAL; | ||
2111 | |||
2112 | mutex_lock(&dev->dev_lock); | ||
2113 | setup_opal_dev(dev); | ||
2114 | ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); | ||
2115 | mutex_unlock(&dev->dev_lock); | ||
2116 | return ret; | ||
2117 | } | ||
2118 | |||
2119 | static int opal_write_shadow_mbr(struct opal_dev *dev, | ||
2120 | struct opal_shadow_mbr *info) | ||
2121 | { | ||
2122 | const struct opal_step mbr_steps[] = { | ||
2123 | { start_admin1LSP_opal_session, &info->key }, | ||
2124 | { write_shadow_mbr, info }, | ||
2125 | { end_opal_session, } | ||
2126 | }; | ||
2127 | int ret; | ||
2128 | |||
2129 | if (info->size == 0) | ||
2130 | return 0; | ||
2131 | |||
2132 | mutex_lock(&dev->dev_lock); | ||
2133 | setup_opal_dev(dev); | ||
2134 | ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); | ||
2135 | mutex_unlock(&dev->dev_lock); | ||
2136 | return ret; | ||
2137 | } | ||
2138 | |||
1981 | static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) | 2139 | static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) |
1982 | { | 2140 | { |
1983 | struct opal_suspend_data *suspend; | 2141 | struct opal_suspend_data *suspend; |
@@ -2030,17 +2188,28 @@ static int opal_add_user_to_lr(struct opal_dev *dev, | |||
2030 | return ret; | 2188 | return ret; |
2031 | } | 2189 | } |
2032 | 2190 | ||
2033 | static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) | 2191 | static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid) |
2034 | { | 2192 | { |
2193 | /* controller will terminate session */ | ||
2035 | const struct opal_step revert_steps[] = { | 2194 | const struct opal_step revert_steps[] = { |
2036 | { start_SIDASP_opal_session, opal }, | 2195 | { start_SIDASP_opal_session, opal }, |
2037 | { revert_tper, } /* controller will terminate session */ | 2196 | { revert_tper, } |
2197 | }; | ||
2198 | const struct opal_step psid_revert_steps[] = { | ||
2199 | { start_PSID_opal_session, opal }, | ||
2200 | { revert_tper, } | ||
2038 | }; | 2201 | }; |
2202 | |||
2039 | int ret; | 2203 | int ret; |
2040 | 2204 | ||
2041 | mutex_lock(&dev->dev_lock); | 2205 | mutex_lock(&dev->dev_lock); |
2042 | setup_opal_dev(dev); | 2206 | setup_opal_dev(dev); |
2043 | ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps)); | 2207 | if (psid) |
2208 | ret = execute_steps(dev, psid_revert_steps, | ||
2209 | ARRAY_SIZE(psid_revert_steps)); | ||
2210 | else | ||
2211 | ret = execute_steps(dev, revert_steps, | ||
2212 | ARRAY_SIZE(revert_steps)); | ||
2044 | mutex_unlock(&dev->dev_lock); | 2213 | mutex_unlock(&dev->dev_lock); |
2045 | 2214 | ||
2046 | /* | 2215 | /* |
@@ -2092,8 +2261,7 @@ static int opal_lock_unlock(struct opal_dev *dev, | |||
2092 | { | 2261 | { |
2093 | int ret; | 2262 | int ret; |
2094 | 2263 | ||
2095 | if (lk_unlk->session.who < OPAL_ADMIN1 || | 2264 | if (lk_unlk->session.who > OPAL_USER9) |
2096 | lk_unlk->session.who > OPAL_USER9) | ||
2097 | return -EINVAL; | 2265 | return -EINVAL; |
2098 | 2266 | ||
2099 | mutex_lock(&dev->dev_lock); | 2267 | mutex_lock(&dev->dev_lock); |
@@ -2171,9 +2339,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) | |||
2171 | }; | 2339 | }; |
2172 | int ret; | 2340 | int ret; |
2173 | 2341 | ||
2174 | if (opal_pw->session.who < OPAL_ADMIN1 || | 2342 | if (opal_pw->session.who > OPAL_USER9 || |
2175 | opal_pw->session.who > OPAL_USER9 || | ||
2176 | opal_pw->new_user_pw.who < OPAL_ADMIN1 || | ||
2177 | opal_pw->new_user_pw.who > OPAL_USER9) | 2343 | opal_pw->new_user_pw.who > OPAL_USER9) |
2178 | return -EINVAL; | 2344 | return -EINVAL; |
2179 | 2345 | ||
@@ -2280,7 +2446,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) | |||
2280 | ret = opal_activate_user(dev, p); | 2446 | ret = opal_activate_user(dev, p); |
2281 | break; | 2447 | break; |
2282 | case IOC_OPAL_REVERT_TPR: | 2448 | case IOC_OPAL_REVERT_TPR: |
2283 | ret = opal_reverttper(dev, p); | 2449 | ret = opal_reverttper(dev, p, false); |
2284 | break; | 2450 | break; |
2285 | case IOC_OPAL_LR_SETUP: | 2451 | case IOC_OPAL_LR_SETUP: |
2286 | ret = opal_setup_locking_range(dev, p); | 2452 | ret = opal_setup_locking_range(dev, p); |
@@ -2291,12 +2457,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) | |||
2291 | case IOC_OPAL_ENABLE_DISABLE_MBR: | 2457 | case IOC_OPAL_ENABLE_DISABLE_MBR: |
2292 | ret = opal_enable_disable_shadow_mbr(dev, p); | 2458 | ret = opal_enable_disable_shadow_mbr(dev, p); |
2293 | break; | 2459 | break; |
2460 | case IOC_OPAL_MBR_DONE: | ||
2461 | ret = opal_set_mbr_done(dev, p); | ||
2462 | break; | ||
2463 | case IOC_OPAL_WRITE_SHADOW_MBR: | ||
2464 | ret = opal_write_shadow_mbr(dev, p); | ||
2465 | break; | ||
2294 | case IOC_OPAL_ERASE_LR: | 2466 | case IOC_OPAL_ERASE_LR: |
2295 | ret = opal_erase_locking_range(dev, p); | 2467 | ret = opal_erase_locking_range(dev, p); |
2296 | break; | 2468 | break; |
2297 | case IOC_OPAL_SECURE_ERASE_LR: | 2469 | case IOC_OPAL_SECURE_ERASE_LR: |
2298 | ret = opal_secure_erase_locking_range(dev, p); | 2470 | ret = opal_secure_erase_locking_range(dev, p); |
2299 | break; | 2471 | break; |
2472 | case IOC_OPAL_PSID_REVERT_TPR: | ||
2473 | ret = opal_reverttper(dev, p, true); | ||
2474 | break; | ||
2300 | default: | 2475 | default: |
2301 | break; | 2476 | break; |
2302 | } | 2477 | } |
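The three new ioctl cases wire the shadow-MBR work through to userspace. A minimal usage sketch for IOC_OPAL_MBR_DONE follows, assuming the uapi additions made elsewhere in this series: struct opal_mbr_done, OPAL_MBR_DONE and the ioctl number live in include/uapi/linux/sed-opal.h, field names should be checked against that header, and both /dev/nvme0n1 and the password are placeholders.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

int main(void)
{
        const char *pw = "example-password";   /* placeholder Admin1 credential */
        struct opal_mbr_done mbr;
        int fd, ret;

        fd = open("/dev/nvme0n1", O_RDWR);     /* placeholder OPAL-capable device */
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&mbr, 0, sizeof(mbr));
        mbr.done_flag = OPAL_MBR_DONE;         /* mark the shadow MBR as done */
        mbr.key.key_len = strlen(pw);
        memcpy(mbr.key.key, pw, mbr.key.key_len);

        ret = ioctl(fd, IOC_OPAL_MBR_DONE, &mbr);
        if (ret)
                perror("IOC_OPAL_MBR_DONE");

        close(fd);
        return ret ? 1 : 0;
}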