path: root/block
author     Linus Torvalds <torvalds@linux-foundation.org>  2019-07-09 13:45:06 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-07-09 13:45:06 -0400
commit     3b99107f0e0298e6fe0787f75b8f3d8306dfb230 (patch)
tree       30536dbc9ca176470a2ae2938f952381e33f5deb /block
parent     0415052db4f92b7e272fc15802ad8b8be672deea (diff)
parent     c9b3007feca018d3f7061f5d5a14cb00766ffe9b (diff)
Merge tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main block updates for 5.3. Nothing earth shattering or
  major in here, just fixes, additions, and improvements all over the
  map. This contains:

   - Series of documentation fixes (Bart)
   - Optimization of the blk-mq ctx get/put (Bart)
   - null_blk removal race condition fix (Bob)
   - req/bio_op() cleanups (Chaitanya)
   - Series cleaning up the segment accounting, and request/bio mapping (Christoph)
   - Series cleaning up the page getting/putting for bios (Christoph)
   - block cgroup cleanups and moving it to where it is used (Christoph)
   - block cgroup fixes (Tejun)
   - Series of fixes and improvements to bcache, most notably a write deadlock fix (Coly)
   - blk-iolatency STS_AGAIN and accounting fixes (Dennis)
   - Series of improvements and fixes to BFQ (Douglas, Paolo)
   - debugfs_create() return value check removal for drbd (Greg)
   - Use struct_size(), where appropriate (Gustavo)
   - Two lightnvm fixes (Heiner, Geert)
   - MD fixes, including a read balance and corruption fix (Guoqing, Marcos, Xiao, Yufen)
   - block opal shadow mbr additions (Jonas, Revanth)
   - sbitmap compare-and-exchange improvements (Pavel)
   - Fix for potential bio->bi_size overflow (Ming)
   - NVMe pull requests:
       - improved PCIe suspend support (Keith Busch)
       - error injection support for the admin queue (Akinobu Mita)
       - Fibre Channel discovery improvements (James Smart)
       - tracing improvements including nvmet tracing support (Minwoo Im)
       - misc fixes and cleanups (Anton Eidelman, Minwoo Im, Chaitanya Kulkarni)
   - Various little fixes and improvements to drivers and core"

* tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block: (153 commits)
  blk-iolatency: fix STS_AGAIN handling
  block: nr_phys_segments needs to be zero for REQ_OP_WRITE_ZEROES
  blk-mq: simplify blk_mq_make_request()
  blk-mq: remove blk_mq_put_ctx()
  sbitmap: Replace cmpxchg with xchg
  block: fix .bi_size overflow
  block: sed-opal: check size of shadow mbr
  block: sed-opal: ioctl for writing to shadow mbr
  block: sed-opal: add ioctl for done-mark of shadow mbr
  block: never take page references for ITER_BVEC
  direct-io: use bio_release_pages in dio_bio_complete
  block_dev: use bio_release_pages in bio_unmap_user
  block_dev: use bio_release_pages in blkdev_bio_end_io
  iomap: use bio_release_pages in iomap_dio_bio_end_io
  block: use bio_release_pages in bio_map_user_iov
  block: use bio_release_pages in bio_unmap_user
  block: optionally mark pages dirty in bio_release_pages
  block: move the BIO_NO_PAGE_REF check into bio_release_pages
  block: skd_main.c: Remove call to memset after dma_alloc_coherent
  block: mtip32xx: Remove call to memset after dma_alloc_coherent
  ...
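One of the items above, "Use struct_size(), where appropriate (Gustavo)", refers to the <linux/overflow.h> helper that replaces open-coded trailing-array allocation arithmetic with an overflow-checked form. A minimal sketch of the pattern, using a hypothetical structure rather than any of the ones actually touched by this merge:

    #include <linux/overflow.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    /* Hypothetical structure with a flexible (trailing) array member. */
    struct example_cmd {
            unsigned int nr_entries;
            u64 entries[];
    };

    static struct example_cmd *example_cmd_alloc(unsigned int nr, gfp_t gfp)
    {
            /* Replaces: kzalloc(sizeof(*cmd) + nr * sizeof(u64), gfp) */
            struct example_cmd *cmd = kzalloc(struct_size(cmd, entries, nr), gfp);

            if (cmd)
                    cmd->nr_entries = nr;
            return cmd;
    }

struct_size(cmd, entries, nr) evaluates to sizeof(*cmd) plus nr * sizeof(cmd->entries[0]) and saturates at SIZE_MAX on overflow, so an overflowing request fails cleanly instead of producing an undersized allocation.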
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig.iosched       7
-rw-r--r--  block/bfq-cgroup.c        212
-rw-r--r--  block/bfq-iosched.c       967
-rw-r--r--  block/bfq-iosched.h        48
-rw-r--r--  block/bio.c                96
-rw-r--r--  block/blk-cgroup.c        139
-rw-r--r--  block/blk-core.c          111
-rw-r--r--  block/blk-iolatency.c      51
-rw-r--r--  block/blk-map.c            10
-rw-r--r--  block/blk-merge.c         112
-rw-r--r--  block/blk-mq-debugfs.c     42
-rw-r--r--  block/blk-mq-sched.c       31
-rw-r--r--  block/blk-mq-sched.h       10
-rw-r--r--  block/blk-mq-tag.c          8
-rw-r--r--  block/blk-mq.c             44
-rw-r--r--  block/blk-mq.h              7
-rw-r--r--  block/blk.h                36
-rw-r--r--  block/genhd.c               5
-rw-r--r--  block/kyber-iosched.c       6
-rw-r--r--  block/mq-deadline.c         5
-rw-r--r--  block/opal_proto.h         16
-rw-r--r--  block/sed-opal.c          197
22 files changed, 1342 insertions, 818 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 4626b88b2d5a..7a6b2f29a582 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED
36 Enable hierarchical scheduling in BFQ, using the blkio 36 Enable hierarchical scheduling in BFQ, using the blkio
37 (cgroups-v1) or io (cgroups-v2) controller. 37 (cgroups-v1) or io (cgroups-v2) controller.
38 38
39config BFQ_CGROUP_DEBUG
40 bool "BFQ IO controller debugging"
41 depends on BFQ_GROUP_IOSCHED
42 ---help---
43 Enable some debugging help. Currently it exports additional stat
44 files in a cgroup which can be useful for debugging.
45
39endmenu 46endmenu
40 47
41endif 48endif
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..0f6cd688924f 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -15,7 +15,83 @@
15 15
16#include "bfq-iosched.h" 16#include "bfq-iosched.h"
17 17
18#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 18#ifdef CONFIG_BFQ_CGROUP_DEBUG
19static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
20{
21 int ret;
22
23 ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
24 if (ret)
25 return ret;
26
27 atomic64_set(&stat->aux_cnt, 0);
28 return 0;
29}
30
31static void bfq_stat_exit(struct bfq_stat *stat)
32{
33 percpu_counter_destroy(&stat->cpu_cnt);
34}
35
36/**
37 * bfq_stat_add - add a value to a bfq_stat
38 * @stat: target bfq_stat
39 * @val: value to add
40 *
41 * Add @val to @stat. The caller must ensure that IRQ on the same CPU
42 * don't re-enter this function for the same counter.
43 */
44static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
45{
46 percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
47}
48
49/**
50 * bfq_stat_read - read the current value of a bfq_stat
51 * @stat: bfq_stat to read
52 */
53static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
54{
55 return percpu_counter_sum_positive(&stat->cpu_cnt);
56}
57
58/**
59 * bfq_stat_reset - reset a bfq_stat
60 * @stat: bfq_stat to reset
61 */
62static inline void bfq_stat_reset(struct bfq_stat *stat)
63{
64 percpu_counter_set(&stat->cpu_cnt, 0);
65 atomic64_set(&stat->aux_cnt, 0);
66}
67
68/**
69 * bfq_stat_add_aux - add a bfq_stat into another's aux count
70 * @to: the destination bfq_stat
71 * @from: the source
72 *
73 * Add @from's count including the aux one to @to's aux count.
74 */
75static inline void bfq_stat_add_aux(struct bfq_stat *to,
76 struct bfq_stat *from)
77{
78 atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
79 &to->aux_cnt);
80}
81
82/**
83 * blkg_prfill_stat - prfill callback for bfq_stat
84 * @sf: seq_file to print to
85 * @pd: policy private data of interest
86 * @off: offset to the bfq_stat in @pd
87 *
88 * prfill callback for printing a bfq_stat.
89 */
90static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
91 int off)
92{
93 return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
94}
19 95
20/* bfqg stats flags */ 96/* bfqg stats flags */
21enum bfqg_stats_flags { 97enum bfqg_stats_flags {
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
53 129
54 now = ktime_get_ns(); 130 now = ktime_get_ns();
55 if (now > stats->start_group_wait_time) 131 if (now > stats->start_group_wait_time)
56 blkg_stat_add(&stats->group_wait_time, 132 bfq_stat_add(&stats->group_wait_time,
57 now - stats->start_group_wait_time); 133 now - stats->start_group_wait_time);
58 bfqg_stats_clear_waiting(stats); 134 bfqg_stats_clear_waiting(stats);
59} 135}
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
82 158
83 now = ktime_get_ns(); 159 now = ktime_get_ns();
84 if (now > stats->start_empty_time) 160 if (now > stats->start_empty_time)
85 blkg_stat_add(&stats->empty_time, 161 bfq_stat_add(&stats->empty_time,
86 now - stats->start_empty_time); 162 now - stats->start_empty_time);
87 bfqg_stats_clear_empty(stats); 163 bfqg_stats_clear_empty(stats);
88} 164}
89 165
90void bfqg_stats_update_dequeue(struct bfq_group *bfqg) 166void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
91{ 167{
92 blkg_stat_add(&bfqg->stats.dequeue, 1); 168 bfq_stat_add(&bfqg->stats.dequeue, 1);
93} 169}
94 170
95void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) 171void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
119 u64 now = ktime_get_ns(); 195 u64 now = ktime_get_ns();
120 196
121 if (now > stats->start_idle_time) 197 if (now > stats->start_idle_time)
122 blkg_stat_add(&stats->idle_time, 198 bfq_stat_add(&stats->idle_time,
123 now - stats->start_idle_time); 199 now - stats->start_idle_time);
124 bfqg_stats_clear_idling(stats); 200 bfqg_stats_clear_idling(stats);
125 } 201 }
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
137{ 213{
138 struct bfqg_stats *stats = &bfqg->stats; 214 struct bfqg_stats *stats = &bfqg->stats;
139 215
140 blkg_stat_add(&stats->avg_queue_size_sum, 216 bfq_stat_add(&stats->avg_queue_size_sum,
141 blkg_rwstat_total(&stats->queued)); 217 blkg_rwstat_total(&stats->queued));
142 blkg_stat_add(&stats->avg_queue_size_samples, 1); 218 bfq_stat_add(&stats->avg_queue_size_samples, 1);
143 bfqg_stats_update_group_wait_time(stats); 219 bfqg_stats_update_group_wait_time(stats);
144} 220}
145 221
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
176 io_start_time_ns - start_time_ns); 252 io_start_time_ns - start_time_ns);
177} 253}
178 254
179#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 255#else /* CONFIG_BFQ_CGROUP_DEBUG */
180 256
181void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 257void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
182 unsigned int op) { } 258 unsigned int op) { }
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
190void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } 266void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
191void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } 267void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
192 268
193#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 269#endif /* CONFIG_BFQ_CGROUP_DEBUG */
194 270
195#ifdef CONFIG_BFQ_GROUP_IOSCHED 271#ifdef CONFIG_BFQ_GROUP_IOSCHED
196 272
@@ -274,18 +350,18 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
274/* @stats = 0 */ 350/* @stats = 0 */
275static void bfqg_stats_reset(struct bfqg_stats *stats) 351static void bfqg_stats_reset(struct bfqg_stats *stats)
276{ 352{
277#ifdef CONFIG_DEBUG_BLK_CGROUP 353#ifdef CONFIG_BFQ_CGROUP_DEBUG
278 /* queued stats shouldn't be cleared */ 354 /* queued stats shouldn't be cleared */
279 blkg_rwstat_reset(&stats->merged); 355 blkg_rwstat_reset(&stats->merged);
280 blkg_rwstat_reset(&stats->service_time); 356 blkg_rwstat_reset(&stats->service_time);
281 blkg_rwstat_reset(&stats->wait_time); 357 blkg_rwstat_reset(&stats->wait_time);
282 blkg_stat_reset(&stats->time); 358 bfq_stat_reset(&stats->time);
283 blkg_stat_reset(&stats->avg_queue_size_sum); 359 bfq_stat_reset(&stats->avg_queue_size_sum);
284 blkg_stat_reset(&stats->avg_queue_size_samples); 360 bfq_stat_reset(&stats->avg_queue_size_samples);
285 blkg_stat_reset(&stats->dequeue); 361 bfq_stat_reset(&stats->dequeue);
286 blkg_stat_reset(&stats->group_wait_time); 362 bfq_stat_reset(&stats->group_wait_time);
287 blkg_stat_reset(&stats->idle_time); 363 bfq_stat_reset(&stats->idle_time);
288 blkg_stat_reset(&stats->empty_time); 364 bfq_stat_reset(&stats->empty_time);
289#endif 365#endif
290} 366}
291 367
@@ -295,19 +371,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
295 if (!to || !from) 371 if (!to || !from)
296 return; 372 return;
297 373
298#ifdef CONFIG_DEBUG_BLK_CGROUP 374#ifdef CONFIG_BFQ_CGROUP_DEBUG
299 /* queued stats shouldn't be cleared */ 375 /* queued stats shouldn't be cleared */
300 blkg_rwstat_add_aux(&to->merged, &from->merged); 376 blkg_rwstat_add_aux(&to->merged, &from->merged);
301 blkg_rwstat_add_aux(&to->service_time, &from->service_time); 377 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
302 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); 378 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
303 blkg_stat_add_aux(&from->time, &from->time); 379 bfq_stat_add_aux(&from->time, &from->time);
304 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); 380 bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
305 blkg_stat_add_aux(&to->avg_queue_size_samples, 381 bfq_stat_add_aux(&to->avg_queue_size_samples,
306 &from->avg_queue_size_samples); 382 &from->avg_queue_size_samples);
307 blkg_stat_add_aux(&to->dequeue, &from->dequeue); 383 bfq_stat_add_aux(&to->dequeue, &from->dequeue);
308 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); 384 bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
309 blkg_stat_add_aux(&to->idle_time, &from->idle_time); 385 bfq_stat_add_aux(&to->idle_time, &from->idle_time);
310 blkg_stat_add_aux(&to->empty_time, &from->empty_time); 386 bfq_stat_add_aux(&to->empty_time, &from->empty_time);
311#endif 387#endif
312} 388}
313 389
@@ -355,35 +431,35 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
355 431
356static void bfqg_stats_exit(struct bfqg_stats *stats) 432static void bfqg_stats_exit(struct bfqg_stats *stats)
357{ 433{
358#ifdef CONFIG_DEBUG_BLK_CGROUP 434#ifdef CONFIG_BFQ_CGROUP_DEBUG
359 blkg_rwstat_exit(&stats->merged); 435 blkg_rwstat_exit(&stats->merged);
360 blkg_rwstat_exit(&stats->service_time); 436 blkg_rwstat_exit(&stats->service_time);
361 blkg_rwstat_exit(&stats->wait_time); 437 blkg_rwstat_exit(&stats->wait_time);
362 blkg_rwstat_exit(&stats->queued); 438 blkg_rwstat_exit(&stats->queued);
363 blkg_stat_exit(&stats->time); 439 bfq_stat_exit(&stats->time);
364 blkg_stat_exit(&stats->avg_queue_size_sum); 440 bfq_stat_exit(&stats->avg_queue_size_sum);
365 blkg_stat_exit(&stats->avg_queue_size_samples); 441 bfq_stat_exit(&stats->avg_queue_size_samples);
366 blkg_stat_exit(&stats->dequeue); 442 bfq_stat_exit(&stats->dequeue);
367 blkg_stat_exit(&stats->group_wait_time); 443 bfq_stat_exit(&stats->group_wait_time);
368 blkg_stat_exit(&stats->idle_time); 444 bfq_stat_exit(&stats->idle_time);
369 blkg_stat_exit(&stats->empty_time); 445 bfq_stat_exit(&stats->empty_time);
370#endif 446#endif
371} 447}
372 448
373static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) 449static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
374{ 450{
375#ifdef CONFIG_DEBUG_BLK_CGROUP 451#ifdef CONFIG_BFQ_CGROUP_DEBUG
376 if (blkg_rwstat_init(&stats->merged, gfp) || 452 if (blkg_rwstat_init(&stats->merged, gfp) ||
377 blkg_rwstat_init(&stats->service_time, gfp) || 453 blkg_rwstat_init(&stats->service_time, gfp) ||
378 blkg_rwstat_init(&stats->wait_time, gfp) || 454 blkg_rwstat_init(&stats->wait_time, gfp) ||
379 blkg_rwstat_init(&stats->queued, gfp) || 455 blkg_rwstat_init(&stats->queued, gfp) ||
380 blkg_stat_init(&stats->time, gfp) || 456 bfq_stat_init(&stats->time, gfp) ||
381 blkg_stat_init(&stats->avg_queue_size_sum, gfp) || 457 bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
382 blkg_stat_init(&stats->avg_queue_size_samples, gfp) || 458 bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
383 blkg_stat_init(&stats->dequeue, gfp) || 459 bfq_stat_init(&stats->dequeue, gfp) ||
384 blkg_stat_init(&stats->group_wait_time, gfp) || 460 bfq_stat_init(&stats->group_wait_time, gfp) ||
385 blkg_stat_init(&stats->idle_time, gfp) || 461 bfq_stat_init(&stats->idle_time, gfp) ||
386 blkg_stat_init(&stats->empty_time, gfp)) { 462 bfq_stat_init(&stats->empty_time, gfp)) {
387 bfqg_stats_exit(stats); 463 bfqg_stats_exit(stats);
388 return -ENOMEM; 464 return -ENOMEM;
389 } 465 }
@@ -909,7 +985,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
909 return ret ?: nbytes; 985 return ret ?: nbytes;
910} 986}
911 987
912#ifdef CONFIG_DEBUG_BLK_CGROUP 988#ifdef CONFIG_BFQ_CGROUP_DEBUG
913static int bfqg_print_stat(struct seq_file *sf, void *v) 989static int bfqg_print_stat(struct seq_file *sf, void *v)
914{ 990{
915 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, 991 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
@@ -927,17 +1003,34 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
927static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, 1003static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
928 struct blkg_policy_data *pd, int off) 1004 struct blkg_policy_data *pd, int off)
929{ 1005{
930 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), 1006 struct blkcg_gq *blkg = pd_to_blkg(pd);
931 &blkcg_policy_bfq, off); 1007 struct blkcg_gq *pos_blkg;
1008 struct cgroup_subsys_state *pos_css;
1009 u64 sum = 0;
1010
1011 lockdep_assert_held(&blkg->q->queue_lock);
1012
1013 rcu_read_lock();
1014 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
1015 struct bfq_stat *stat;
1016
1017 if (!pos_blkg->online)
1018 continue;
1019
1020 stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off;
1021 sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
1022 }
1023 rcu_read_unlock();
1024
932 return __blkg_prfill_u64(sf, pd, sum); 1025 return __blkg_prfill_u64(sf, pd, sum);
933} 1026}
934 1027
935static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, 1028static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
936 struct blkg_policy_data *pd, int off) 1029 struct blkg_policy_data *pd, int off)
937{ 1030{
938 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), 1031 struct blkg_rwstat_sample sum;
939 &blkcg_policy_bfq, 1032
940 off); 1033 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
941 return __blkg_prfill_rwstat(sf, pd, &sum); 1034 return __blkg_prfill_rwstat(sf, pd, &sum);
942} 1035}
943 1036
@@ -975,12 +1068,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
975static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, 1068static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
976 struct blkg_policy_data *pd, int off) 1069 struct blkg_policy_data *pd, int off)
977{ 1070{
978 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, 1071 struct blkg_rwstat_sample tmp;
979 offsetof(struct blkcg_gq, stat_bytes));
980 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
981 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
982 1072
983 return __blkg_prfill_u64(sf, pd, sum >> 9); 1073 blkg_rwstat_recursive_sum(pd->blkg, NULL,
1074 offsetof(struct blkcg_gq, stat_bytes), &tmp);
1075
1076 return __blkg_prfill_u64(sf, pd,
1077 (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
984} 1078}
985 1079
986static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) 1080static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
@@ -995,11 +1089,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
995 struct blkg_policy_data *pd, int off) 1089 struct blkg_policy_data *pd, int off)
996{ 1090{
997 struct bfq_group *bfqg = pd_to_bfqg(pd); 1091 struct bfq_group *bfqg = pd_to_bfqg(pd);
998 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); 1092 u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
999 u64 v = 0; 1093 u64 v = 0;
1000 1094
1001 if (samples) { 1095 if (samples) {
1002 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); 1096 v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
1003 v = div64_u64(v, samples); 1097 v = div64_u64(v, samples);
1004 } 1098 }
1005 __blkg_prfill_u64(sf, pd, v); 1099 __blkg_prfill_u64(sf, pd, v);
@@ -1014,7 +1108,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1014 0, false); 1108 0, false);
1015 return 0; 1109 return 0;
1016} 1110}
1017#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1111#endif /* CONFIG_BFQ_CGROUP_DEBUG */
1018 1112
1019struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) 1113struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
1020{ 1114{
@@ -1062,7 +1156,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
1062 .private = (unsigned long)&blkcg_policy_bfq, 1156 .private = (unsigned long)&blkcg_policy_bfq,
1063 .seq_show = blkg_print_stat_ios, 1157 .seq_show = blkg_print_stat_ios,
1064 }, 1158 },
1065#ifdef CONFIG_DEBUG_BLK_CGROUP 1159#ifdef CONFIG_BFQ_CGROUP_DEBUG
1066 { 1160 {
1067 .name = "bfq.time", 1161 .name = "bfq.time",
1068 .private = offsetof(struct bfq_group, stats.time), 1162 .private = offsetof(struct bfq_group, stats.time),
@@ -1092,7 +1186,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
1092 .private = offsetof(struct bfq_group, stats.queued), 1186 .private = offsetof(struct bfq_group, stats.queued),
1093 .seq_show = bfqg_print_rwstat, 1187 .seq_show = bfqg_print_rwstat,
1094 }, 1188 },
1095#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1189#endif /* CONFIG_BFQ_CGROUP_DEBUG */
1096 1190
1097 /* the same statistics which cover the bfqg and its descendants */ 1191 /* the same statistics which cover the bfqg and its descendants */
1098 { 1192 {
@@ -1105,7 +1199,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
1105 .private = (unsigned long)&blkcg_policy_bfq, 1199 .private = (unsigned long)&blkcg_policy_bfq,
1106 .seq_show = blkg_print_stat_ios_recursive, 1200 .seq_show = blkg_print_stat_ios_recursive,
1107 }, 1201 },
1108#ifdef CONFIG_DEBUG_BLK_CGROUP 1202#ifdef CONFIG_BFQ_CGROUP_DEBUG
1109 { 1203 {
1110 .name = "bfq.time_recursive", 1204 .name = "bfq.time_recursive",
1111 .private = offsetof(struct bfq_group, stats.time), 1205 .private = offsetof(struct bfq_group, stats.time),
@@ -1159,7 +1253,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
1159 .private = offsetof(struct bfq_group, stats.dequeue), 1253 .private = offsetof(struct bfq_group, stats.dequeue),
1160 .seq_show = bfqg_print_stat, 1254 .seq_show = bfqg_print_stat,
1161 }, 1255 },
1162#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1256#endif /* CONFIG_BFQ_CGROUP_DEBUG */
1163 { } /* terminate */ 1257 { } /* terminate */
1164}; 1258};
1165 1259
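The bfq_stat helpers added in the bfq-cgroup.c hunks above replace the old shared blkg_stat code with a private copy built on the generic percpu_counter API: hot-path additions are batched per CPU, while an atomic64_t auxiliary counter absorbs the totals of children that are torn down. A stand-alone sketch of that pattern, with illustrative names that are not part of this merge:

    #include <linux/percpu_counter.h>
    #include <linux/atomic.h>

    /* Illustrative stand-in for struct bfq_stat. */
    struct demo_stat {
            struct percpu_counter   cpu_cnt;  /* hot path: per-CPU, batched      */
            atomic64_t              aux_cnt;  /* totals inherited from children  */
    };

    static int demo_stat_init(struct demo_stat *stat, gfp_t gfp)
    {
            atomic64_set(&stat->aux_cnt, 0);
            return percpu_counter_init(&stat->cpu_cnt, 0, gfp);
    }

    static void demo_stat_add(struct demo_stat *stat, u64 val)
    {
            /* Fold into the shared counter only every 32 local updates. */
            percpu_counter_add_batch(&stat->cpu_cnt, val, 32);
    }

    static u64 demo_stat_read(struct demo_stat *stat)
    {
            /* Live per-CPU sum plus whatever was absorbed via aux_cnt. */
            return percpu_counter_sum_positive(&stat->cpu_cnt) +
                   atomic64_read(&stat->aux_cnt);
    }

Reads sum all per-CPU deltas and are therefore comparatively expensive, which is acceptable here because they happen only when the cgroup stat files are consulted.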
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f9269ae6da9c..50c9d2598500 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
157BFQ_BFQQ_FNS(coop); 157BFQ_BFQQ_FNS(coop);
158BFQ_BFQQ_FNS(split_coop); 158BFQ_BFQQ_FNS(split_coop);
159BFQ_BFQQ_FNS(softrt_update); 159BFQ_BFQQ_FNS(softrt_update);
160BFQ_BFQQ_FNS(has_waker);
160#undef BFQ_BFQQ_FNS \ 161#undef BFQ_BFQQ_FNS \
161 162
162/* Expiration time of sync (0) and async (1) requests, in ns. */ 163/* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1427,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
1427 * mechanism may be re-designed in such a way to make it possible to 1428 * mechanism may be re-designed in such a way to make it possible to
1428 * know whether preemption is needed without needing to update service 1429 * know whether preemption is needed without needing to update service
1429 * trees). In addition, queue preemptions almost always cause random 1430 * trees). In addition, queue preemptions almost always cause random
1430 * I/O, and thus loss of throughput. Because of these facts, the next 1431 * I/O, which may in turn cause loss of throughput. Finally, there may
1431 * function adopts the following simple scheme to avoid both costly 1432 * even be no in-service queue when the next function is invoked (so,
1432 * operations and too frequent preemptions: it requests the expiration 1433 * no queue to compare timestamps with). Because of these facts, the
1433 * of the in-service queue (unconditionally) only for queues that need 1434 * next function adopts the following simple scheme to avoid costly
1434 * to recover a hole, or that either are weight-raised or deserve to 1435 * operations, too frequent preemptions and too many dependencies on
1435 * be weight-raised. 1436 * the state of the scheduler: it requests the expiration of the
1437 * in-service queue (unconditionally) only for queues that need to
1438 * recover a hole. Then it delegates to other parts of the code the
1439 * responsibility of handling the above case 2.
1436 */ 1440 */
1437static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, 1441static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1438 struct bfq_queue *bfqq, 1442 struct bfq_queue *bfqq,
1439 bool arrived_in_time, 1443 bool arrived_in_time)
1440 bool wr_or_deserves_wr)
1441{ 1444{
1442 struct bfq_entity *entity = &bfqq->entity; 1445 struct bfq_entity *entity = &bfqq->entity;
1443 1446
@@ -1492,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1492 entity->budget = max_t(unsigned long, bfqq->max_budget, 1495 entity->budget = max_t(unsigned long, bfqq->max_budget,
1493 bfq_serv_to_charge(bfqq->next_rq, bfqq)); 1496 bfq_serv_to_charge(bfqq->next_rq, bfqq));
1494 bfq_clear_bfqq_non_blocking_wait_rq(bfqq); 1497 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1495 return wr_or_deserves_wr; 1498 return false;
1496} 1499}
1497 1500
1498/* 1501/*
@@ -1610,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
1610 bfqd->bfq_wr_min_idle_time); 1613 bfqd->bfq_wr_min_idle_time);
1611} 1614}
1612 1615
1616
1617/*
1618 * Return true if bfqq is in a higher priority class, or has a higher
1619 * weight than the in-service queue.
1620 */
1621static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1622 struct bfq_queue *in_serv_bfqq)
1623{
1624 int bfqq_weight, in_serv_weight;
1625
1626 if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1627 return true;
1628
1629 if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1630 bfqq_weight = bfqq->entity.weight;
1631 in_serv_weight = in_serv_bfqq->entity.weight;
1632 } else {
1633 if (bfqq->entity.parent)
1634 bfqq_weight = bfqq->entity.parent->weight;
1635 else
1636 bfqq_weight = bfqq->entity.weight;
1637 if (in_serv_bfqq->entity.parent)
1638 in_serv_weight = in_serv_bfqq->entity.parent->weight;
1639 else
1640 in_serv_weight = in_serv_bfqq->entity.weight;
1641 }
1642
1643 return bfqq_weight > in_serv_weight;
1644}
1645
1613static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, 1646static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1614 struct bfq_queue *bfqq, 1647 struct bfq_queue *bfqq,
1615 int old_wr_coeff, 1648 int old_wr_coeff,
@@ -1654,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1654 */ 1687 */
1655 bfqq_wants_to_preempt = 1688 bfqq_wants_to_preempt =
1656 bfq_bfqq_update_budg_for_activation(bfqd, bfqq, 1689 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1657 arrived_in_time, 1690 arrived_in_time);
1658 wr_or_deserves_wr);
1659 1691
1660 /* 1692 /*
1661 * If bfqq happened to be activated in a burst, but has been 1693 * If bfqq happened to be activated in a burst, but has been
@@ -1720,21 +1752,111 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1720 1752
1721 /* 1753 /*
1722 * Expire in-service queue only if preemption may be needed 1754 * Expire in-service queue only if preemption may be needed
1723 * for guarantees. In this respect, the function 1755 * for guarantees. In particular, we care only about two
1724 * next_queue_may_preempt just checks a simple, necessary 1756 * cases. The first is that bfqq has to recover a service
1725 * condition, and not a sufficient condition based on 1757 * hole, as explained in the comments on
1726 * timestamps. In fact, for the latter condition to be 1758 * bfq_bfqq_update_budg_for_activation(), i.e., that
1727 * evaluated, timestamps would need first to be updated, and 1759 * bfqq_wants_to_preempt is true. However, if bfqq does not
1728 * this operation is quite costly (see the comments on the 1760 * carry time-critical I/O, then bfqq's bandwidth is less
1729 * function bfq_bfqq_update_budg_for_activation). 1761 * important than that of queues that carry time-critical I/O.
1762 * So, as a further constraint, we consider this case only if
1763 * bfqq is at least as weight-raised, i.e., at least as time
1764 * critical, as the in-service queue.
1765 *
1766 * The second case is that bfqq is in a higher priority class,
1767 * or has a higher weight than the in-service queue. If this
1768 * condition does not hold, we don't care because, even if
1769 * bfqq does not start to be served immediately, the resulting
1770 * delay for bfqq's I/O is however lower or much lower than
1771 * the ideal completion time to be guaranteed to bfqq's I/O.
1772 *
1773 * In both cases, preemption is needed only if, according to
1774 * the timestamps of both bfqq and of the in-service queue,
1775 * bfqq actually is the next queue to serve. So, to reduce
1776 * useless preemptions, the return value of
1777 * next_queue_may_preempt() is considered in the next compound
1778 * condition too. Yet next_queue_may_preempt() just checks a
1779 * simple, necessary condition for bfqq to be the next queue
1780 * to serve. In fact, to evaluate a sufficient condition, the
1781 * timestamps of the in-service queue would need to be
1782 * updated, and this operation is quite costly (see the
1783 * comments on bfq_bfqq_update_budg_for_activation()).
1730 */ 1784 */
1731 if (bfqd->in_service_queue && bfqq_wants_to_preempt && 1785 if (bfqd->in_service_queue &&
1732 bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && 1786 ((bfqq_wants_to_preempt &&
1787 bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1788 bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
1733 next_queue_may_preempt(bfqd)) 1789 next_queue_may_preempt(bfqd))
1734 bfq_bfqq_expire(bfqd, bfqd->in_service_queue, 1790 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
1735 false, BFQQE_PREEMPTED); 1791 false, BFQQE_PREEMPTED);
1736} 1792}
1737 1793
1794static void bfq_reset_inject_limit(struct bfq_data *bfqd,
1795 struct bfq_queue *bfqq)
1796{
1797 /* invalidate baseline total service time */
1798 bfqq->last_serv_time_ns = 0;
1799
1800 /*
1801 * Reset pointer in case we are waiting for
1802 * some request completion.
1803 */
1804 bfqd->waited_rq = NULL;
1805
1806 /*
1807 * If bfqq has a short think time, then start by setting the
1808 * inject limit to 0 prudentially, because the service time of
1809 * an injected I/O request may be higher than the think time
1810 * of bfqq, and therefore, if one request was injected when
1811 * bfqq remains empty, this injected request might delay the
1812 * service of the next I/O request for bfqq significantly. In
1813 * case bfqq can actually tolerate some injection, then the
1814 * adaptive update will however raise the limit soon. This
1815 * lucky circumstance holds exactly because bfqq has a short
1816 * think time, and thus, after remaining empty, is likely to
1817 * get new I/O enqueued---and then completed---before being
1818 * expired. This is the very pattern that gives the
1819 * limit-update algorithm the chance to measure the effect of
1820 * injection on request service times, and then to update the
1821 * limit accordingly.
1822 *
1823 * However, in the following special case, the inject limit is
1824 * left to 1 even if the think time is short: bfqq's I/O is
1825 * synchronized with that of some other queue, i.e., bfqq may
1826 * receive new I/O only after the I/O of the other queue is
1827 * completed. Keeping the inject limit to 1 allows the
1828 * blocking I/O to be served while bfqq is in service. And
1829 * this is very convenient both for bfqq and for overall
1830 * throughput, as explained in detail in the comments in
1831 * bfq_update_has_short_ttime().
1832 *
1833 * On the opposite end, if bfqq has a long think time, then
1834 * start directly by 1, because:
1835 * a) on the bright side, keeping at most one request in
1836 * service in the drive is unlikely to cause any harm to the
1837 * latency of bfqq's requests, as the service time of a single
1838 * request is likely to be lower than the think time of bfqq;
1839 * b) on the downside, after becoming empty, bfqq is likely to
1840 * expire before getting its next request. With this request
1841 * arrival pattern, it is very hard to sample total service
1842 * times and update the inject limit accordingly (see comments
1843 * on bfq_update_inject_limit()). So the limit is likely to be
1844 * never, or at least seldom, updated. As a consequence, by
1845 * setting the limit to 1, we avoid that no injection ever
1846 * occurs with bfqq. On the downside, this proactive step
1847 * further reduces chances to actually compute the baseline
1848 * total service time. Thus it reduces chances to execute the
1849 * limit-update algorithm and possibly raise the limit to more
1850 * than 1.
1851 */
1852 if (bfq_bfqq_has_short_ttime(bfqq))
1853 bfqq->inject_limit = 0;
1854 else
1855 bfqq->inject_limit = 1;
1856
1857 bfqq->decrease_time_jif = jiffies;
1858}
1859
1738static void bfq_add_request(struct request *rq) 1860static void bfq_add_request(struct request *rq)
1739{ 1861{
1740 struct bfq_queue *bfqq = RQ_BFQQ(rq); 1862 struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1749,77 +1871,119 @@ static void bfq_add_request(struct request *rq)
1749 1871
1750 if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { 1872 if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1751 /* 1873 /*
1874 * Detect whether bfqq's I/O seems synchronized with
1875 * that of some other queue, i.e., whether bfqq, after
1876 * remaining empty, happens to receive new I/O only
1877 * right after some I/O request of the other queue has
1878 * been completed. We call waker queue the other
1879 * queue, and we assume, for simplicity, that bfqq may
1880 * have at most one waker queue.
1881 *
1882 * A remarkable throughput boost can be reached by
1883 * unconditionally injecting the I/O of the waker
1884 * queue, every time a new bfq_dispatch_request
1885 * happens to be invoked while I/O is being plugged
1886 * for bfqq. In addition to boosting throughput, this
1887 * unblocks bfqq's I/O, thereby improving bandwidth
1888 * and latency for bfqq. Note that these same results
1889 * may be achieved with the general injection
1890 * mechanism, but less effectively. For details on
1891 * this aspect, see the comments on the choice of the
1892 * queue for injection in bfq_select_queue().
1893 *
1894 * Turning back to the detection of a waker queue, a
1895 * queue Q is deemed as a waker queue for bfqq if, for
1896 * two consecutive times, bfqq happens to become non
1897 * empty right after a request of Q has been
1898 * completed. In particular, on the first time, Q is
1899 * tentatively set as a candidate waker queue, while
1900 * on the second time, the flag
1901 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
1902 * is a waker queue for bfqq. These detection steps
1903 * are performed only if bfqq has a long think time,
1904 * so as to make it more likely that bfqq's I/O is
1905 * actually being blocked by a synchronization. This
1906 * last filter, plus the above two-times requirement,
1907 * make false positives less likely.
1908 *
1909 * NOTE
1910 *
1911 * The sooner a waker queue is detected, the sooner
1912 * throughput can be boosted by injecting I/O from the
1913 * waker queue. Fortunately, detection is likely to be
1914 * actually fast, for the following reasons. While
1915 * blocked by synchronization, bfqq has a long think
1916 * time. This implies that bfqq's inject limit is at
1917 * least equal to 1 (see the comments in
1918 * bfq_update_inject_limit()). So, thanks to
1919 * injection, the waker queue is likely to be served
1920 * during the very first I/O-plugging time interval
1921 * for bfqq. This triggers the first step of the
1922 * detection mechanism. Thanks again to injection, the
1923 * candidate waker queue is then likely to be
1924 * confirmed no later than during the next
1925 * I/O-plugging interval for bfqq.
1926 */
1927 if (!bfq_bfqq_has_short_ttime(bfqq) &&
1928 ktime_get_ns() - bfqd->last_completion <
1929 200 * NSEC_PER_USEC) {
1930 if (bfqd->last_completed_rq_bfqq != bfqq &&
1931 bfqd->last_completed_rq_bfqq !=
1932 bfqq->waker_bfqq) {
1933 /*
1934 * First synchronization detected with
1935 * a candidate waker queue, or with a
1936 * different candidate waker queue
1937 * from the current one.
1938 */
1939 bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
1940
1941 /*
1942 * If the waker queue disappears, then
1943 * bfqq->waker_bfqq must be reset. To
1944 * this goal, we maintain in each
1945 * waker queue a list, woken_list, of
1946 * all the queues that reference the
1947 * waker queue through their
1948 * waker_bfqq pointer. When the waker
1949 * queue exits, the waker_bfqq pointer
1950 * of all the queues in the woken_list
1951 * is reset.
1952 *
1953 * In addition, if bfqq is already in
1954 * the woken_list of a waker queue,
1955 * then, before being inserted into
1956 * the woken_list of a new waker
1957 * queue, bfqq must be removed from
1958 * the woken_list of the old waker
1959 * queue.
1960 */
1961 if (!hlist_unhashed(&bfqq->woken_list_node))
1962 hlist_del_init(&bfqq->woken_list_node);
1963 hlist_add_head(&bfqq->woken_list_node,
1964 &bfqd->last_completed_rq_bfqq->woken_list);
1965
1966 bfq_clear_bfqq_has_waker(bfqq);
1967 } else if (bfqd->last_completed_rq_bfqq ==
1968 bfqq->waker_bfqq &&
1969 !bfq_bfqq_has_waker(bfqq)) {
1970 /*
1971 * synchronization with waker_bfqq
1972 * seen for the second time
1973 */
1974 bfq_mark_bfqq_has_waker(bfqq);
1975 }
1976 }
1977
1978 /*
1752 * Periodically reset inject limit, to make sure that 1979 * Periodically reset inject limit, to make sure that
1753 * the latter eventually drops in case workload 1980 * the latter eventually drops in case workload
1754 * changes, see step (3) in the comments on 1981 * changes, see step (3) in the comments on
1755 * bfq_update_inject_limit(). 1982 * bfq_update_inject_limit().
1756 */ 1983 */
1757 if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + 1984 if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1758 msecs_to_jiffies(1000))) { 1985 msecs_to_jiffies(1000)))
1759 /* invalidate baseline total service time */ 1986 bfq_reset_inject_limit(bfqd, bfqq);
1760 bfqq->last_serv_time_ns = 0;
1761
1762 /*
1763 * Reset pointer in case we are waiting for
1764 * some request completion.
1765 */
1766 bfqd->waited_rq = NULL;
1767
1768 /*
1769 * If bfqq has a short think time, then start
1770 * by setting the inject limit to 0
1771 * prudentially, because the service time of
1772 * an injected I/O request may be higher than
1773 * the think time of bfqq, and therefore, if
1774 * one request was injected when bfqq remains
1775 * empty, this injected request might delay
1776 * the service of the next I/O request for
1777 * bfqq significantly. In case bfqq can
1778 * actually tolerate some injection, then the
1779 * adaptive update will however raise the
1780 * limit soon. This lucky circumstance holds
1781 * exactly because bfqq has a short think
1782 * time, and thus, after remaining empty, is
1783 * likely to get new I/O enqueued---and then
1784 * completed---before being expired. This is
1785 * the very pattern that gives the
1786 * limit-update algorithm the chance to
1787 * measure the effect of injection on request
1788 * service times, and then to update the limit
1789 * accordingly.
1790 *
1791 * On the opposite end, if bfqq has a long
1792 * think time, then start directly by 1,
1793 * because:
1794 * a) on the bright side, keeping at most one
1795 * request in service in the drive is unlikely
1796 * to cause any harm to the latency of bfqq's
1797 * requests, as the service time of a single
1798 * request is likely to be lower than the
1799 * think time of bfqq;
1800 * b) on the downside, after becoming empty,
1801 * bfqq is likely to expire before getting its
1802 * next request. With this request arrival
1803 * pattern, it is very hard to sample total
1804 * service times and update the inject limit
1805 * accordingly (see comments on
1806 * bfq_update_inject_limit()). So the limit is
1807 * likely to be never, or at least seldom,
1808 * updated. As a consequence, by setting the
1809 * limit to 1, we avoid that no injection ever
1810 * occurs with bfqq. On the downside, this
1811 * proactive step further reduces chances to
1812 * actually compute the baseline total service
1813 * time. Thus it reduces chances to execute the
1814 * limit-update algorithm and possibly raise the
1815 * limit to more than 1.
1816 */
1817 if (bfq_bfqq_has_short_ttime(bfqq))
1818 bfqq->inject_limit = 0;
1819 else
1820 bfqq->inject_limit = 1;
1821 bfqq->decrease_time_jif = jiffies;
1822 }
1823 1987
1824 /* 1988 /*
1825 * The following conditions must hold to setup a new 1989 * The following conditions must hold to setup a new
@@ -2027,7 +2191,8 @@ static void bfq_remove_request(struct request_queue *q,
2027 2191
2028} 2192}
2029 2193
2030static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) 2194static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
2195 unsigned int nr_segs)
2031{ 2196{
2032 struct request_queue *q = hctx->queue; 2197 struct request_queue *q = hctx->queue;
2033 struct bfq_data *bfqd = q->elevator->elevator_data; 2198 struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -2050,7 +2215,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2050 bfqd->bio_bfqq = NULL; 2215 bfqd->bio_bfqq = NULL;
2051 bfqd->bio_bic = bic; 2216 bfqd->bio_bic = bic;
2052 2217
2053 ret = blk_mq_sched_try_merge(q, bio, &free); 2218 ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
2054 2219
2055 if (free) 2220 if (free)
2056 blk_mq_free_request(free); 2221 blk_mq_free_request(free);
@@ -2513,6 +2678,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
2513 * to enjoy weight raising if split soon. 2678 * to enjoy weight raising if split soon.
2514 */ 2679 */
2515 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; 2680 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2681 bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
2516 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); 2682 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
2517 bic->saved_last_wr_start_finish = jiffies; 2683 bic->saved_last_wr_start_finish = jiffies;
2518 } else { 2684 } else {
@@ -3045,7 +3211,186 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
3045 bfq_remove_request(q, rq); 3211 bfq_remove_request(q, rq);
3046} 3212}
3047 3213
3048static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) 3214/*
3215 * There is a case where idling does not have to be performed for
3216 * throughput concerns, but to preserve the throughput share of
3217 * the process associated with bfqq.
3218 *
3219 * To introduce this case, we can note that allowing the drive
3220 * to enqueue more than one request at a time, and hence
3221 * delegating de facto final scheduling decisions to the
3222 * drive's internal scheduler, entails loss of control on the
3223 * actual request service order. In particular, the critical
3224 * situation is when requests from different processes happen
3225 * to be present, at the same time, in the internal queue(s)
3226 * of the drive. In such a situation, the drive, by deciding
3227 * the service order of the internally-queued requests, does
3228 * determine also the actual throughput distribution among
3229 * these processes. But the drive typically has no notion or
3230 * concern about per-process throughput distribution, and
3231 * makes its decisions only on a per-request basis. Therefore,
3232 * the service distribution enforced by the drive's internal
3233 * scheduler is likely to coincide with the desired throughput
3234 * distribution only in a completely symmetric, or favorably
3235 * skewed scenario where:
3236 * (i-a) each of these processes must get the same throughput as
3237 * the others,
3238 * (i-b) in case (i-a) does not hold, it holds that the process
3239 * associated with bfqq must receive a lower or equal
3240 * throughput than any of the other processes;
3241 * (ii) the I/O of each process has the same properties, in
3242 * terms of locality (sequential or random), direction
3243 * (reads or writes), request sizes, greediness
3244 * (from I/O-bound to sporadic), and so on;
3245
3246 * In fact, in such a scenario, the drive tends to treat the requests
3247 * of each process in about the same way as the requests of the
3248 * others, and thus to provide each of these processes with about the
3249 * same throughput. This is exactly the desired throughput
3250 * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3251 * even more convenient distribution for (the process associated with)
3252 * bfqq.
3253 *
3254 * In contrast, in any asymmetric or unfavorable scenario, device
3255 * idling (I/O-dispatch plugging) is certainly needed to guarantee
3256 * that bfqq receives its assigned fraction of the device throughput
3257 * (see [1] for details).
3258 *
3259 * The problem is that idling may significantly reduce throughput with
3260 * certain combinations of types of I/O and devices. An important
3261 * example is sync random I/O on flash storage with command
3262 * queueing. So, unless bfqq falls in cases where idling also boosts
3263 * throughput, it is important to check conditions (i-a), i(-b) and
3264 * (ii) accurately, so as to avoid idling when not strictly needed for
3265 * service guarantees.
3266 *
3267 * Unfortunately, it is extremely difficult to thoroughly check
3268 * condition (ii). And, in case there are active groups, it becomes
3269 * very difficult to check conditions (i-a) and (i-b) too. In fact,
3270 * if there are active groups, then, for conditions (i-a) or (i-b) to
3271 * become false 'indirectly', it is enough that an active group
3272 * contains more active processes or sub-groups than some other active
3273 * group. More precisely, for conditions (i-a) or (i-b) to become
3274 * false because of such a group, it is not even necessary that the
3275 * group is (still) active: it is sufficient that, even if the group
3276 * has become inactive, some of its descendant processes still have
3277 * some request already dispatched but still waiting for
3278 * completion. In fact, requests have still to be guaranteed their
3279 * share of the throughput even after being dispatched. In this
3280 * respect, it is easy to show that, if a group frequently becomes
3281 * inactive while still having in-flight requests, and if, when this
3282 * happens, the group is not considered in the calculation of whether
3283 * the scenario is asymmetric, then the group may fail to be
3284 * guaranteed its fair share of the throughput (basically because
3285 * idling may not be performed for the descendant processes of the
3286 * group, but it had to be). We address this issue with the following
3287 * bi-modal behavior, implemented in the function
3288 * bfq_asymmetric_scenario().
3289 *
3290 * If there are groups with requests waiting for completion
3291 * (as commented above, some of these groups may even be
3292 * already inactive), then the scenario is tagged as
3293 * asymmetric, conservatively, without checking any of the
3294 * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3295 * This behavior matches also the fact that groups are created
3296 * exactly if controlling I/O is a primary concern (to
3297 * preserve bandwidth and latency guarantees).
3298 *
3299 * On the opposite end, if there are no groups with requests waiting
3300 * for completion, then only conditions (i-a) and (i-b) are actually
3301 * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3302 * idling is not performed, regardless of whether condition (ii)
3303 * holds. In other words, only if conditions (i-a) and (i-b) do not
3304 * hold, then idling is allowed, and the device tends to be prevented
3305 * from queueing many requests, possibly of several processes. Since
3306 * there are no groups with requests waiting for completion, then, to
3307 * control conditions (i-a) and (i-b) it is enough to check just
3308 * whether all the queues with requests waiting for completion also
3309 * have the same weight.
3310 *
3311 * Not checking condition (ii) evidently exposes bfqq to the
3312 * risk of getting less throughput than its fair share.
3313 * However, for queues with the same weight, a further
3314 * mechanism, preemption, mitigates or even eliminates this
3315 * problem. And it does so without consequences on overall
3316 * throughput. This mechanism and its benefits are explained
3317 * in the next three paragraphs.
3318 *
3319 * Even if a queue, say Q, is expired when it remains idle, Q
3320 * can still preempt the new in-service queue if the next
3321 * request of Q arrives soon (see the comments on
3322 * bfq_bfqq_update_budg_for_activation). If all queues and
3323 * groups have the same weight, this form of preemption,
3324 * combined with the hole-recovery heuristic described in the
3325 * comments on function bfq_bfqq_update_budg_for_activation,
3326 * are enough to preserve a correct bandwidth distribution in
3327 * the mid term, even without idling. In fact, even if not
3328 * idling allows the internal queues of the device to contain
3329 * many requests, and thus to reorder requests, we can rather
3330 * safely assume that the internal scheduler still preserves a
3331 * minimum of mid-term fairness.
3332 *
3333 * More precisely, this preemption-based, idleless approach
3334 * provides fairness in terms of IOPS, and not sectors per
3335 * second. This can be seen with a simple example. Suppose
3336 * that there are two queues with the same weight, but that
3337 * the first queue receives requests of 8 sectors, while the
3338 * second queue receives requests of 1024 sectors. In
3339 * addition, suppose that each of the two queues contains at
3340 * most one request at a time, which implies that each queue
3341 * always remains idle after it is served. Finally, after
3342 * remaining idle, each queue receives very quickly a new
3343 * request. It follows that the two queues are served
3344 * alternatively, preempting each other if needed. This
3345 * implies that, although both queues have the same weight,
3346 * the queue with large requests receives a service that is
3347 * 1024/8 times as high as the service received by the other
3348 * queue.
3349 *
3350 * The motivation for using preemption instead of idling (for
3351 * queues with the same weight) is that, by not idling,
3352 * service guarantees are preserved (completely or at least in
3353 * part) without minimally sacrificing throughput. And, if
3354 * there is no active group, then the primary expectation for
3355 * this device is probably a high throughput.
3356 *
3357 * We are now left only with explaining the additional
3358 * compound condition that is checked below for deciding
3359 * whether the scenario is asymmetric. To explain this
3360 * compound condition, we need to add that the function
3361 * bfq_asymmetric_scenario checks the weights of only
3362 * non-weight-raised queues, for efficiency reasons (see
3363 * comments on bfq_weights_tree_add()). Then the fact that
3364 * bfqq is weight-raised is checked explicitly here. More
3365 * precisely, the compound condition below takes into account
3366 * also the fact that, even if bfqq is being weight-raised,
3367 * the scenario is still symmetric if all queues with requests
3368 * waiting for completion happen to be
3369 * weight-raised. Actually, we should be even more precise
3370 * here, and differentiate between interactive weight raising
3371 * and soft real-time weight raising.
3372 *
3373 * As a side note, it is worth considering that the above
3374 * device-idling countermeasures may however fail in the
3375 * following unlucky scenario: if idling is (correctly)
3376 * disabled in a time period during which all symmetry
3377 * sub-conditions hold, and hence the device is allowed to
3378 * enqueue many requests, but at some later point in time some
3379 * sub-condition stops to hold, then it may become impossible
3380 * to let requests be served in the desired order until all
3381 * the requests already queued in the device have been served.
3382 */
3383static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3384 struct bfq_queue *bfqq)
3385{
3386 return (bfqq->wr_coeff > 1 &&
3387 bfqd->wr_busy_queues <
3388 bfq_tot_busy_queues(bfqd)) ||
3389 bfq_asymmetric_scenario(bfqd, bfqq);
3390}
3391
3392static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3393 enum bfqq_expiration reason)
3049{ 3394{
3050 /* 3395 /*
3051 * If this bfqq is shared between multiple processes, check 3396 * If this bfqq is shared between multiple processes, check
@@ -3056,7 +3401,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3056 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) 3401 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
3057 bfq_mark_bfqq_split_coop(bfqq); 3402 bfq_mark_bfqq_split_coop(bfqq);
3058 3403
3059 if (RB_EMPTY_ROOT(&bfqq->sort_list)) { 3404 /*
3405 * Consider queues with a higher finish virtual time than
3406 * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
3407 * true, then bfqq's bandwidth would be violated if an
3408 * uncontrolled amount of I/O from these queues were
3409 * dispatched while bfqq is waiting for its new I/O to
3410 * arrive. This is exactly what may happen if this is a forced
3411 * expiration caused by a preemption attempt, and if bfqq is
3412 * not re-scheduled. To prevent this from happening, re-queue
3413 * bfqq if it needs I/O-dispatch plugging, even if it is
3414 * empty. By doing so, bfqq is granted to be served before the
3415 * above queues (provided that bfqq is of course eligible).
3416 */
3417 if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3418 !(reason == BFQQE_PREEMPTED &&
3419 idling_needed_for_service_guarantees(bfqd, bfqq))) {
3060 if (bfqq->dispatched == 0) 3420 if (bfqq->dispatched == 0)
3061 /* 3421 /*
3062 * Overloading budget_timeout field to store 3422 * Overloading budget_timeout field to store
@@ -3073,7 +3433,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3073 * Resort priority tree of potential close cooperators. 3433 * Resort priority tree of potential close cooperators.
3074 * See comments on bfq_pos_tree_add_move() for the unlikely(). 3434 * See comments on bfq_pos_tree_add_move() for the unlikely().
3075 */ 3435 */
3076 if (unlikely(!bfqd->nonrot_with_queueing)) 3436 if (unlikely(!bfqd->nonrot_with_queueing &&
3437 !RB_EMPTY_ROOT(&bfqq->sort_list)))
3077 bfq_pos_tree_add_move(bfqd, bfqq); 3438 bfq_pos_tree_add_move(bfqd, bfqq);
3078 } 3439 }
3079 3440
@@ -3574,7 +3935,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
3574 * reason. 3935 * reason.
3575 */ 3936 */
3576 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); 3937 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3577 if (__bfq_bfqq_expire(bfqd, bfqq)) 3938 if (__bfq_bfqq_expire(bfqd, bfqq, reason))
3578 /* bfqq is gone, no more actions on it */ 3939 /* bfqq is gone, no more actions on it */
3579 return; 3940 return;
3580 3941
@@ -3721,184 +4082,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
3721} 4082}
3722 4083
3723/* 4084/*
3724 * There is a case where idling does not have to be performed for
3725 * throughput concerns, but to preserve the throughput share of
3726 * the process associated with bfqq.
3727 *
3728 * To introduce this case, we can note that allowing the drive
3729 * to enqueue more than one request at a time, and hence
3730 * delegating de facto final scheduling decisions to the
3731 * drive's internal scheduler, entails loss of control on the
3732 * actual request service order. In particular, the critical
3733 * situation is when requests from different processes happen
3734 * to be present, at the same time, in the internal queue(s)
3735 * of the drive. In such a situation, the drive, by deciding
3736 * the service order of the internally-queued requests, does
3737 * determine also the actual throughput distribution among
3738 * these processes. But the drive typically has no notion or
3739 * concern about per-process throughput distribution, and
3740 * makes its decisions only on a per-request basis. Therefore,
3741 * the service distribution enforced by the drive's internal
3742 * scheduler is likely to coincide with the desired throughput
3743 * distribution only in a completely symmetric, or favorably
3744 * skewed scenario where:
3745 * (i-a) each of these processes must get the same throughput as
3746 * the others,
3747 * (i-b) in case (i-a) does not hold, it holds that the process
3748 * associated with bfqq must receive a lower or equal
3749 * throughput than any of the other processes;
3750 * (ii) the I/O of each process has the same properties, in
3751 * terms of locality (sequential or random), direction
3752 * (reads or writes), request sizes, greediness
3753 * (from I/O-bound to sporadic), and so on;
3754
3755 * In fact, in such a scenario, the drive tends to treat the requests
3756 * of each process in about the same way as the requests of the
3757 * others, and thus to provide each of these processes with about the
3758 * same throughput. This is exactly the desired throughput
3759 * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3760 * even more convenient distribution for (the process associated with)
3761 * bfqq.
3762 *
3763 * In contrast, in any asymmetric or unfavorable scenario, device
3764 * idling (I/O-dispatch plugging) is certainly needed to guarantee
3765 * that bfqq receives its assigned fraction of the device throughput
3766 * (see [1] for details).
3767 *
3768 * The problem is that idling may significantly reduce throughput with
3769 * certain combinations of types of I/O and devices. An important
3770 * example is sync random I/O on flash storage with command
3771 * queueing. So, unless bfqq falls in cases where idling also boosts
3772 * throughput, it is important to check conditions (i-a), (i-b) and
3773 * (ii) accurately, so as to avoid idling when not strictly needed for
3774 * service guarantees.
3775 *
3776 * Unfortunately, it is extremely difficult to thoroughly check
3777 * condition (ii). And, in case there are active groups, it becomes
3778 * very difficult to check conditions (i-a) and (i-b) too. In fact,
3779 * if there are active groups, then, for conditions (i-a) or (i-b) to
3780 * become false 'indirectly', it is enough that an active group
3781 * contains more active processes or sub-groups than some other active
3782 * group. More precisely, for conditions (i-a) or (i-b) to become
3783 * false because of such a group, it is not even necessary that the
3784 * group is (still) active: it is sufficient that, even if the group
3785 * has become inactive, some of its descendant processes still have
3786 * some request already dispatched but still waiting for
3787 * completion. In fact, requests have still to be guaranteed their
3788 * share of the throughput even after being dispatched. In this
3789 * respect, it is easy to show that, if a group frequently becomes
3790 * inactive while still having in-flight requests, and if, when this
3791 * happens, the group is not considered in the calculation of whether
3792 * the scenario is asymmetric, then the group may fail to be
3793 * guaranteed its fair share of the throughput (basically because
3794 * idling may not be performed for the descendant processes of the
3795 * group, but it had to be). We address this issue with the following
3796 * bi-modal behavior, implemented in the function
3797 * bfq_asymmetric_scenario().
3798 *
3799 * If there are groups with requests waiting for completion
3800 * (as commented above, some of these groups may even be
3801 * already inactive), then the scenario is tagged as
3802 * asymmetric, conservatively, without checking any of the
3803 * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3804 * This behavior matches also the fact that groups are created
3805 * exactly if controlling I/O is a primary concern (to
3806 * preserve bandwidth and latency guarantees).
3807 *
3808 * On the opposite end, if there are no groups with requests waiting
3809 * for completion, then only conditions (i-a) and (i-b) are actually
3810 * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3811 * idling is not performed, regardless of whether condition (ii)
3812 * holds. In other words, only if conditions (i-a) and (i-b) do not
3813 * hold, then idling is allowed, and the device tends to be prevented
3814 * from queueing many requests, possibly of several processes. Since
3815 * there are no groups with requests waiting for completion, then, to
3816 * control conditions (i-a) and (i-b) it is enough to check just
3817 * whether all the queues with requests waiting for completion also
3818 * have the same weight.
3819 *
3820 * Not checking condition (ii) evidently exposes bfqq to the
3821 * risk of getting less throughput than its fair share.
3822 * However, for queues with the same weight, a further
3823 * mechanism, preemption, mitigates or even eliminates this
3824 * problem. And it does so without consequences on overall
3825 * throughput. This mechanism and its benefits are explained
3826 * in the next three paragraphs.
3827 *
3828 * Even if a queue, say Q, is expired when it remains idle, Q
3829 * can still preempt the new in-service queue if the next
3830 * request of Q arrives soon (see the comments on
3831 * bfq_bfqq_update_budg_for_activation). If all queues and
3832 * groups have the same weight, this form of preemption,
3833 * combined with the hole-recovery heuristic described in the
3834 * comments on function bfq_bfqq_update_budg_for_activation,
3835 * are enough to preserve a correct bandwidth distribution in
3836 * the mid term, even without idling. In fact, even if not
3837 * idling allows the internal queues of the device to contain
3838 * many requests, and thus to reorder requests, we can rather
3839 * safely assume that the internal scheduler still preserves a
3840 * minimum of mid-term fairness.
3841 *
3842 * More precisely, this preemption-based, idleless approach
3843 * provides fairness in terms of IOPS, and not sectors per
3844 * second. This can be seen with a simple example. Suppose
3845 * that there are two queues with the same weight, but that
3846 * the first queue receives requests of 8 sectors, while the
3847 * second queue receives requests of 1024 sectors. In
3848 * addition, suppose that each of the two queues contains at
3849 * most one request at a time, which implies that each queue
3850 * always remains idle after it is served. Finally, after
3851 * remaining idle, each queue receives very quickly a new
3852 * request. It follows that the two queues are served
3853 * alternatively, preempting each other if needed. This
3854 * implies that, although both queues have the same weight,
3855 * the queue with large requests receives a service that is
3856 * 1024/8 times as high as the service received by the other
3857 * queue.
3858 *
3859 * The motivation for using preemption instead of idling (for
3860 * queues with the same weight) is that, by not idling,
3861 * service guarantees are preserved (completely or at least in
3862 * part) while sacrificing throughput only minimally, if at all. And, if
3863 * there is no active group, then the primary expectation for
3864 * this device is probably a high throughput.
3865 *
3866 * We are now left only with explaining the additional
3867 * compound condition that is checked below for deciding
3868 * whether the scenario is asymmetric. To explain this
3869 * compound condition, we need to add that the function
3870 * bfq_asymmetric_scenario checks the weights of only
3871 * non-weight-raised queues, for efficiency reasons (see
3872 * comments on bfq_weights_tree_add()). Then the fact that
3873 * bfqq is weight-raised is checked explicitly here. More
3874 * precisely, the compound condition below takes into account
3875 * also the fact that, even if bfqq is being weight-raised,
3876 * the scenario is still symmetric if all queues with requests
3877 * waiting for completion happen to be
3878 * weight-raised. Actually, we should be even more precise
3879 * here, and differentiate between interactive weight raising
3880 * and soft real-time weight raising.
3881 *
3882 * As a side note, it is worth considering that the above
3883 * device-idling countermeasures may however fail in the
3884 * following unlucky scenario: if idling is (correctly)
3885 * disabled in a time period during which all symmetry
3886 * sub-conditions hold, and hence the device is allowed to
3887 * enqueue many requests, but at some later point in time some
3888 * sub-condition stops holding, then it may become impossible
3889 * to let requests be served in the desired order until all
3890 * the requests already queued in the device have been served.
3891 */
3892static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3893 struct bfq_queue *bfqq)
3894{
3895 return (bfqq->wr_coeff > 1 &&
3896 bfqd->wr_busy_queues <
3897 bfq_tot_busy_queues(bfqd)) ||
3898 bfq_asymmetric_scenario(bfqd, bfqq);
3899}
3900
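
The compound condition in the function above reduces to two checks. A rough userspace sketch of it, assuming simplified stand-in fields for the weight-raising counters (wr_coeff, wr_busy_queues, total busy queues) and for the result of bfq_asymmetric_scenario():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical flattened view of the scheduler state used by the check. */
struct sched_state {
        int wr_coeff;          /* > 1 means bfqq is weight-raised */
        int wr_busy_queues;    /* busy weight-raised queues */
        int tot_busy_queues;   /* all busy queues */
        bool asymmetric;       /* result of bfq_asymmetric_scenario() */
};

/* Idle (plug dispatch) for bfqq only when symmetry cannot be assumed. */
static bool idling_needed_for_guarantees(const struct sched_state *s)
{
        bool some_busy_queue_not_raised =
                s->wr_coeff > 1 && s->wr_busy_queues < s->tot_busy_queues;

        return some_busy_queue_not_raised || s->asymmetric;
}

int main(void)
{
        /* bfqq is weight-raised, but so is every other busy queue:
         * the scenario is still symmetric, so no idling is needed. */
        struct sched_state all_raised = { 30, 4, 4, false };
        /* bfqq is weight-raised while some busy queues are not: idle. */
        struct sched_state mixed      = { 30, 2, 4, false };

        printf("all raised -> idle: %d\n", idling_needed_for_guarantees(&all_raised));
        printf("mixed      -> idle: %d\n", idling_needed_for_guarantees(&mixed));
        return 0;
}
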
3901/*
3902 * For a queue that becomes empty, device idling is allowed only if 4085 * For a queue that becomes empty, device idling is allowed only if
3903 * this function returns true for that queue. As a consequence, since 4086 * this function returns true for that queue. As a consequence, since
3904 * device idling plays a critical role for both throughput boosting 4087 * device idling plays a critical role for both throughput boosting
@@ -4156,22 +4339,95 @@ check_queue:
4156 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { 4339 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
4157 struct bfq_queue *async_bfqq = 4340 struct bfq_queue *async_bfqq =
4158 bfqq->bic && bfqq->bic->bfqq[0] && 4341 bfqq->bic && bfqq->bic->bfqq[0] &&
4159 bfq_bfqq_busy(bfqq->bic->bfqq[0]) ? 4342 bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
4343 bfqq->bic->bfqq[0]->next_rq ?
4160 bfqq->bic->bfqq[0] : NULL; 4344 bfqq->bic->bfqq[0] : NULL;
4161 4345
4162 /* 4346 /*
4163 * If the process associated with bfqq has also async 4347 * The next three mutually-exclusive ifs decide
4164 * I/O pending, then inject it 4348 * whether to try injection, and choose the queue to
4165 * unconditionally. Injecting I/O from the same 4349 * pick an I/O request from.
4166 * process can cause no harm to the process. On the 4350 *
4167 * contrary, it can only increase bandwidth and reduce 4351 * The first if checks whether the process associated
4168 * latency for the process. 4352 * with bfqq has also async I/O pending. If so, it
4353 * injects such I/O unconditionally. Injecting async
4354 * I/O from the same process can cause no harm to the
4355 * process. On the contrary, it can only increase
4356 * bandwidth and reduce latency for the process.
4357 *
4358 * The second if checks whether there happens to be a
4359 * non-empty waker queue for bfqq, i.e., a queue whose
4360 * I/O needs to be completed for bfqq to receive new
4361 * I/O. This happens, e.g., if bfqq is associated with
4362 * a process that does some sync. A sync generates
4363 * extra blocking I/O, which must be completed before
4364 * the process associated with bfqq can go on with its
4365 * I/O. If the I/O of the waker queue is not served,
4366 * then bfqq remains empty, and no I/O is dispatched,
4367 * until the idle timeout fires for bfqq. This is
4368 * likely to result in lower bandwidth and higher
4369 * latencies for bfqq, and in a severe loss of total
4370 * throughput. The best action to take is therefore to
4371 * serve the waker queue as soon as possible. So do it
4372 * (without relying on the third alternative below for
4373 * eventually serving waker_bfqq's I/O; see the last
4374 * paragraph for further details). This systematic
4375 * injection of I/O from the waker queue does not
4376 * cause any delay to bfqq's I/O. On the contrary,
 4377 * bfqq's next I/O is brought forward dramatically,
4378 * for it is not blocked for milliseconds.
4379 *
4380 * The third if checks whether bfqq is a queue for
4381 * which it is better to avoid injection. It is so if
4382 * bfqq delivers more throughput when served without
4383 * any further I/O from other queues in the middle, or
4384 * if the service times of bfqq's I/O requests both
4385 * count more than overall throughput, and may be
4386 * easily increased by injection (this happens if bfqq
4387 * has a short think time). If none of these
4388 * conditions holds, then a candidate queue for
4389 * injection is looked for through
4390 * bfq_choose_bfqq_for_injection(). Note that the
4391 * latter may return NULL (for example if the inject
4392 * limit for bfqq is currently 0).
4393 *
4394 * NOTE: motivation for the second alternative
4395 *
4396 * Thanks to the way the inject limit is updated in
4397 * bfq_update_has_short_ttime(), it is rather likely
4398 * that, if I/O is being plugged for bfqq and the
4399 * waker queue has pending I/O requests that are
4400 * blocking bfqq's I/O, then the third alternative
4401 * above lets the waker queue get served before the
4402 * I/O-plugging timeout fires. So one may deem the
4403 * second alternative superfluous. It is not, because
4404 * the third alternative may be way less effective in
4405 * case of a synchronization. For two main
4406 * reasons. First, throughput may be low because the
4407 * inject limit may be too low to guarantee the same
4408 * amount of injected I/O, from the waker queue or
4409 * other queues, that the second alternative
4410 * guarantees (the second alternative unconditionally
4411 * injects a pending I/O request of the waker queue
4412 * for each bfq_dispatch_request()). Second, with the
4413 * third alternative, the duration of the plugging,
4414 * i.e., the time before bfqq finally receives new I/O,
4415 * may not be minimized, because the waker queue may
4416 * happen to be served only after other queues.
4169 */ 4417 */
4170 if (async_bfqq && 4418 if (async_bfqq &&
4171 icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && 4419 icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4172 bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= 4420 bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4173 bfq_bfqq_budget_left(async_bfqq)) 4421 bfq_bfqq_budget_left(async_bfqq))
4174 bfqq = bfqq->bic->bfqq[0]; 4422 bfqq = bfqq->bic->bfqq[0];
4423 else if (bfq_bfqq_has_waker(bfqq) &&
4424 bfq_bfqq_busy(bfqq->waker_bfqq) &&
4425 bfqq->next_rq &&
4426 bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
4427 bfqq->waker_bfqq) <=
4428 bfq_bfqq_budget_left(bfqq->waker_bfqq)
4429 )
4430 bfqq = bfqq->waker_bfqq;
4175 else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && 4431 else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4176 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || 4432 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4177 !bfq_bfqq_has_short_ttime(bfqq))) 4433 !bfq_bfqq_has_short_ttime(bfqq)))
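
The three mutually-exclusive alternatives commented above form a small priority chain. A hedged userspace sketch, with a hypothetical struct queue flattening the bfqq->bic->bfqq[0], waker_bfqq and budget checks into booleans; bfq_choose_bfqq_for_injection() is represented here by a NULL return:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical, heavily simplified stand-in for struct bfq_queue. */
struct queue {
        const char *name;
        struct queue *async_sibling;   /* async queue of the same process */
        struct queue *waker;           /* queue whose completions unblock bfqq */
        bool has_pending_io;
        bool fits_in_budget;           /* serv_to_charge() <= budget_left() */
};

/*
 * Mirror the order of the three alternatives: same-process async I/O
 * first, then the waker queue, then a generic injection candidate
 * (picked by bfq_choose_bfqq_for_injection() in the real code, returned
 * as NULL here).
 */
static struct queue *pick_injection_source(struct queue *bfqq,
                                           bool avoid_injection)
{
        struct queue *async = bfqq->async_sibling;
        struct queue *waker = bfqq->waker;

        if (async && async->has_pending_io && async->fits_in_budget)
                return async;
        if (waker && waker->has_pending_io && waker->fits_in_budget)
                return waker;
        if (!avoid_injection)
                return NULL;   /* stand-in for bfq_choose_bfqq_for_injection() */
        return bfqq;           /* keep (or keep waiting on) bfqq itself */
}

int main(void)
{
        struct queue waker = { "waker", NULL, NULL, true, true };
        struct queue bfqq  = { "bfqq", NULL, &waker, false, false };
        struct queue *src  = pick_injection_source(&bfqq, false);

        printf("inject from: %s\n", src ? src->name : "generic candidate");
        return 0;
}
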
@@ -4403,7 +4659,7 @@ exit:
4403 return rq; 4659 return rq;
4404} 4660}
4405 4661
4406#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 4662#ifdef CONFIG_BFQ_CGROUP_DEBUG
4407static void bfq_update_dispatch_stats(struct request_queue *q, 4663static void bfq_update_dispatch_stats(struct request_queue *q,
4408 struct request *rq, 4664 struct request *rq,
4409 struct bfq_queue *in_serv_queue, 4665 struct bfq_queue *in_serv_queue,
@@ -4453,7 +4709,7 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
4453 struct request *rq, 4709 struct request *rq,
4454 struct bfq_queue *in_serv_queue, 4710 struct bfq_queue *in_serv_queue,
4455 bool idle_timer_disabled) {} 4711 bool idle_timer_disabled) {}
4456#endif 4712#endif /* CONFIG_BFQ_CGROUP_DEBUG */
4457 4713
4458static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) 4714static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
4459{ 4715{
@@ -4560,8 +4816,11 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
4560 4816
4561static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) 4817static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4562{ 4818{
4819 struct bfq_queue *item;
4820 struct hlist_node *n;
4821
4563 if (bfqq == bfqd->in_service_queue) { 4822 if (bfqq == bfqd->in_service_queue) {
4564 __bfq_bfqq_expire(bfqd, bfqq); 4823 __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
4565 bfq_schedule_dispatch(bfqd); 4824 bfq_schedule_dispatch(bfqd);
4566 } 4825 }
4567 4826
@@ -4569,6 +4828,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4569 4828
4570 bfq_put_cooperator(bfqq); 4829 bfq_put_cooperator(bfqq);
4571 4830
4831 /* remove bfqq from woken list */
4832 if (!hlist_unhashed(&bfqq->woken_list_node))
4833 hlist_del_init(&bfqq->woken_list_node);
4834
4835 /* reset waker for all queues in woken list */
4836 hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
4837 woken_list_node) {
4838 item->waker_bfqq = NULL;
4839 bfq_clear_bfqq_has_waker(item);
4840 hlist_del_init(&item->woken_list_node);
4841 }
4842
4572 bfq_put_queue(bfqq); /* release process reference */ 4843 bfq_put_queue(bfqq); /* release process reference */
4573} 4844}
4574 4845
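
The two cleanup steps added above keep the waker relation consistent when a queue goes away. A rough sketch of that bookkeeping, with a hypothetical singly linked woken list in place of the kernel hlist:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical model: each queue points at its waker, and every waker
 * keeps a list of the queues it has woken. */
struct q {
        struct q *waker;
        struct q *woken_head;   /* queues for which this queue is the waker */
        struct q *woken_next;   /* link inside the waker's woken list */
        bool has_waker;
};

/* Mirror the cleanup added to bfq_exit_bfqq(): drop @dead from its
 * waker's list, then orphan every queue that @dead was waking. */
static void exit_queue(struct q *dead)
{
        struct q *item, *next, **pp;

        if (dead->waker) {
                for (pp = &dead->waker->woken_head; *pp; pp = &(*pp)->woken_next) {
                        if (*pp == dead) {
                                *pp = dead->woken_next;
                                break;
                        }
                }
                dead->waker = NULL;
                dead->has_waker = false;
                dead->woken_next = NULL;
        }

        for (item = dead->woken_head; item; item = next) {
                next = item->woken_next;
                item->waker = NULL;
                item->has_waker = false;
                item->woken_next = NULL;
        }
        dead->woken_head = NULL;
}

int main(void)
{
        struct q waker = { NULL, NULL, NULL, false };
        struct q woken = { &waker, NULL, NULL, true };

        waker.woken_head = &woken;
        exit_queue(&waker);                   /* the waker exits... */
        return woken.waker == NULL ? 0 : 1;   /* ...so woken is orphaned */
}
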
@@ -4584,6 +4855,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
4584 unsigned long flags; 4855 unsigned long flags;
4585 4856
4586 spin_lock_irqsave(&bfqd->lock, flags); 4857 spin_lock_irqsave(&bfqd->lock, flags);
4858 bfqq->bic = NULL;
4587 bfq_exit_bfqq(bfqd, bfqq); 4859 bfq_exit_bfqq(bfqd, bfqq);
4588 bic_set_bfqq(bic, NULL, is_sync); 4860 bic_set_bfqq(bic, NULL, is_sync);
4589 spin_unlock_irqrestore(&bfqd->lock, flags); 4861 spin_unlock_irqrestore(&bfqd->lock, flags);
@@ -4687,6 +4959,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4687 RB_CLEAR_NODE(&bfqq->entity.rb_node); 4959 RB_CLEAR_NODE(&bfqq->entity.rb_node);
4688 INIT_LIST_HEAD(&bfqq->fifo); 4960 INIT_LIST_HEAD(&bfqq->fifo);
4689 INIT_HLIST_NODE(&bfqq->burst_list_node); 4961 INIT_HLIST_NODE(&bfqq->burst_list_node);
4962 INIT_HLIST_NODE(&bfqq->woken_list_node);
4963 INIT_HLIST_HEAD(&bfqq->woken_list);
4690 4964
4691 bfqq->ref = 0; 4965 bfqq->ref = 0;
4692 bfqq->bfqd = bfqd; 4966 bfqq->bfqd = bfqd;
@@ -4854,7 +5128,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
4854 struct bfq_queue *bfqq, 5128 struct bfq_queue *bfqq,
4855 struct bfq_io_cq *bic) 5129 struct bfq_io_cq *bic)
4856{ 5130{
4857 bool has_short_ttime = true; 5131 bool has_short_ttime = true, state_changed;
4858 5132
4859 /* 5133 /*
4860 * No need to update has_short_ttime if bfqq is async or in 5134 * No need to update has_short_ttime if bfqq is async or in
@@ -4879,13 +5153,102 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
4879 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) 5153 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
4880 has_short_ttime = false; 5154 has_short_ttime = false;
4881 5155
4882 bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", 5156 state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
4883 has_short_ttime);
4884 5157
4885 if (has_short_ttime) 5158 if (has_short_ttime)
4886 bfq_mark_bfqq_has_short_ttime(bfqq); 5159 bfq_mark_bfqq_has_short_ttime(bfqq);
4887 else 5160 else
4888 bfq_clear_bfqq_has_short_ttime(bfqq); 5161 bfq_clear_bfqq_has_short_ttime(bfqq);
5162
5163 /*
5164 * Until the base value for the total service time gets
5165 * finally computed for bfqq, the inject limit does depend on
5166 * the think-time state (short|long). In particular, the limit
5167 * is 0 or 1 if the think time is deemed, respectively, as
5168 * short or long (details in the comments in
5169 * bfq_update_inject_limit()). Accordingly, the next
5170 * instructions reset the inject limit if the think-time state
5171 * has changed and the above base value is still to be
5172 * computed.
5173 *
5174 * However, the reset is performed only if more than 100 ms
5175 * have elapsed since the last update of the inject limit, or
5176 * (inclusive) if the change is from short to long think
5177 * time. The reason for this waiting is as follows.
5178 *
5179 * bfqq may have a long think time because of a
5180 * synchronization with some other queue, i.e., because the
5181 * I/O of some other queue may need to be completed for bfqq
5182 * to receive new I/O. Details in the comments on the choice
5183 * of the queue for injection in bfq_select_queue().
5184 *
5185 * As stressed in those comments, if such a synchronization is
5186 * actually in place, then, without injection on bfqq, the
 5187 * blocking I/O cannot happen to be served while bfqq is in
5188 * service. As a consequence, if bfqq is granted
5189 * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
5190 * is dispatched, until the idle timeout fires. This is likely
5191 * to result in lower bandwidth and higher latencies for bfqq,
5192 * and in a severe loss of total throughput.
5193 *
5194 * On the opposite end, a non-zero inject limit may allow the
5195 * I/O that blocks bfqq to be executed soon, and therefore
5196 * bfqq to receive new I/O soon.
5197 *
5198 * But, if the blocking gets actually eliminated, then the
5199 * next think-time sample for bfqq may be very low. This in
5200 * turn may cause bfqq's think time to be deemed
5201 * short. Without the 100 ms barrier, this new state change
5202 * would cause the body of the next if to be executed
5203 * immediately. But this would set to 0 the inject
5204 * limit. Without injection, the blocking I/O would cause the
5205 * think time of bfqq to become long again, and therefore the
5206 * inject limit to be raised again, and so on. The only effect
5207 * of such a steady oscillation between the two think-time
5208 * states would be to prevent effective injection on bfqq.
5209 *
5210 * In contrast, if the inject limit is not reset during such a
5211 * long time interval as 100 ms, then the number of short
5212 * think time samples can grow significantly before the reset
5213 * is performed. As a consequence, the think time state can
5214 * become stable before the reset. Therefore there will be no
5215 * state change when the 100 ms elapse, and no reset of the
5216 * inject limit. The inject limit remains steadily equal to 1
5217 * both during and after the 100 ms. So injection can be
5218 * performed at all times, and throughput gets boosted.
5219 *
5220 * An inject limit equal to 1 is however in conflict, in
5221 * general, with the fact that the think time of bfqq is
5222 * short, because injection may be likely to delay bfqq's I/O
5223 * (as explained in the comments in
5224 * bfq_update_inject_limit()). But this does not happen in
5225 * this special case, because bfqq's low think time is due to
5226 * an effective handling of a synchronization, through
5227 * injection. In this special case, bfqq's I/O does not get
5228 * delayed by injection; on the contrary, bfqq's I/O is
5229 * brought forward, because it is not blocked for
5230 * milliseconds.
5231 *
5232 * In addition, serving the blocking I/O much sooner, and much
5233 * more frequently than once per I/O-plugging timeout, makes
5234 * it much quicker to detect a waker queue (the concept of
5235 * waker queue is defined in the comments in
5236 * bfq_add_request()). This makes it possible to start sooner
5237 * to boost throughput more effectively, by injecting the I/O
5238 * of the waker queue unconditionally on every
5239 * bfq_dispatch_request().
5240 *
5241 * One last, important benefit of not resetting the inject
5242 * limit before 100 ms is that, during this time interval, the
5243 * base value for the total service time is likely to get
5244 * finally computed for bfqq, freeing the inject limit from
5245 * its relation with the think time.
5246 */
5247 if (state_changed && bfqq->last_serv_time_ns == 0 &&
5248 (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
5249 msecs_to_jiffies(100)) ||
5250 !has_short_ttime))
5251 bfq_reset_inject_limit(bfqd, bfqq);
4889} 5252}
4890 5253
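
The reset condition added above combines a state change, an uncomputed base service time, and either the 100 ms barrier or a short-to-long transition. A minimal userspace sketch of it, with a plain millisecond argument standing in for the jiffies arithmetic on decrease_time_jif:

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether to reset the inject limit after a think-time update.
 * ms_since_limit_update stands in for the time_is_before_eq_jiffies()
 * test on bfqq->decrease_time_jif in the real code.
 */
static bool should_reset_inject_limit(bool state_changed,
                                      bool base_service_time_known,
                                      unsigned int ms_since_limit_update,
                                      bool has_short_ttime)
{
        if (!state_changed || base_service_time_known)
                return false;
        /* Reset after the 100 ms barrier, or immediately on short -> long. */
        return ms_since_limit_update >= 100 || !has_short_ttime;
}

int main(void)
{
        /* Think time flipped to short only 20 ms after the last update:
         * keep the limit, so injection keeps boosting throughput. */
        printf("early short flip -> reset: %d\n",
               should_reset_inject_limit(true, false, 20, true));
        /* Think time flipped to long: reset right away. */
        printf("long flip        -> reset: %d\n",
               should_reset_inject_limit(true, false, 20, false));
        return 0;
}
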
4891/* 5254/*
@@ -4895,19 +5258,9 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
4895static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, 5258static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4896 struct request *rq) 5259 struct request *rq)
4897{ 5260{
4898 struct bfq_io_cq *bic = RQ_BIC(rq);
4899
4900 if (rq->cmd_flags & REQ_META) 5261 if (rq->cmd_flags & REQ_META)
4901 bfqq->meta_pending++; 5262 bfqq->meta_pending++;
4902 5263
4903 bfq_update_io_thinktime(bfqd, bfqq);
4904 bfq_update_has_short_ttime(bfqd, bfqq, bic);
4905 bfq_update_io_seektime(bfqd, bfqq, rq);
4906
4907 bfq_log_bfqq(bfqd, bfqq,
4908 "rq_enqueued: has_short_ttime=%d (seeky %d)",
4909 bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
4910
4911 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 5264 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4912 5265
4913 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { 5266 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
@@ -4995,6 +5348,10 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4995 bfqq = new_bfqq; 5348 bfqq = new_bfqq;
4996 } 5349 }
4997 5350
5351 bfq_update_io_thinktime(bfqd, bfqq);
5352 bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
5353 bfq_update_io_seektime(bfqd, bfqq, rq);
5354
4998 waiting = bfqq && bfq_bfqq_wait_request(bfqq); 5355 waiting = bfqq && bfq_bfqq_wait_request(bfqq);
4999 bfq_add_request(rq); 5356 bfq_add_request(rq);
5000 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); 5357 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
@@ -5007,7 +5364,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
5007 return idle_timer_disabled; 5364 return idle_timer_disabled;
5008} 5365}
5009 5366
5010#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 5367#ifdef CONFIG_BFQ_CGROUP_DEBUG
5011static void bfq_update_insert_stats(struct request_queue *q, 5368static void bfq_update_insert_stats(struct request_queue *q,
5012 struct bfq_queue *bfqq, 5369 struct bfq_queue *bfqq,
5013 bool idle_timer_disabled, 5370 bool idle_timer_disabled,
@@ -5037,7 +5394,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
5037 struct bfq_queue *bfqq, 5394 struct bfq_queue *bfqq,
5038 bool idle_timer_disabled, 5395 bool idle_timer_disabled,
5039 unsigned int cmd_flags) {} 5396 unsigned int cmd_flags) {}
5040#endif 5397#endif /* CONFIG_BFQ_CGROUP_DEBUG */
5041 5398
5042static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 5399static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
5043 bool at_head) 5400 bool at_head)
@@ -5200,6 +5557,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
5200 1UL<<(BFQ_RATE_SHIFT - 10)) 5557 1UL<<(BFQ_RATE_SHIFT - 10))
5201 bfq_update_rate_reset(bfqd, NULL); 5558 bfq_update_rate_reset(bfqd, NULL);
5202 bfqd->last_completion = now_ns; 5559 bfqd->last_completion = now_ns;
5560 bfqd->last_completed_rq_bfqq = bfqq;
5203 5561
5204 /* 5562 /*
5205 * If we are waiting to discover whether the request pattern 5563 * If we are waiting to discover whether the request pattern
@@ -5397,8 +5755,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
5397 * total service time, and there seem to be the right 5755 * total service time, and there seem to be the right
5398 * conditions to do it, or we can lower the last base value 5756 * conditions to do it, or we can lower the last base value
5399 * computed. 5757 * computed.
5758 *
 5759 * NOTE: (bfqd->rq_in_driver == 1) means that there is no other I/O
5760 * request in flight, because this function is in the code
5761 * path that handles the completion of a request of bfqq, and,
5762 * in particular, this function is executed before
5763 * bfqd->rq_in_driver is decremented in such a code path.
5400 */ 5764 */
5401 if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) || 5765 if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
5402 tot_time_ns < bfqq->last_serv_time_ns) { 5766 tot_time_ns < bfqq->last_serv_time_ns) {
5403 bfqq->last_serv_time_ns = tot_time_ns; 5767 bfqq->last_serv_time_ns = tot_time_ns;
5404 /* 5768 /*
@@ -5406,7 +5770,18 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
5406 * start trying injection. 5770 * start trying injection.
5407 */ 5771 */
5408 bfqq->inject_limit = max_t(unsigned int, 1, old_limit); 5772 bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5409 } 5773 } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
5774 /*
5775 * No I/O injected and no request still in service in
5776 * the drive: these are the exact conditions for
5777 * computing the base value of the total service time
5778 * for bfqq. So let's update this value, because it is
5779 * rather variable. For example, it varies if the size
5780 * or the spatial locality of the I/O requests in bfqq
5781 * change.
5782 */
5783 bfqq->last_serv_time_ns = tot_time_ns;
5784
5410 5785
5411 /* update complete, not waiting for any request completion any longer */ 5786 /* update complete, not waiting for any request completion any longer */
5412 bfqd->waited_rq = NULL; 5787 bfqd->waited_rq = NULL;
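
The two update branches above (seed or lower the base value, or refresh it when the sample is again "clean") can be modeled in a few lines. A small userspace sketch, assuming a flattened inject_state in place of the bfqd/bfqq fields:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical flat state for the base-service-time bookkeeping. */
struct inject_state {
        uint64_t last_serv_time_ns;   /* 0 means "base not computed yet" */
        unsigned int inject_limit;
        bool rqs_injected;            /* some I/O was injected meanwhile */
        int rq_in_driver;             /* sampled before the decrement */
};

static unsigned int max_uint(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

/* Update the base total service time after a completion took tot_time_ns. */
static void update_base_service_time(struct inject_state *s, uint64_t tot_time_ns)
{
        if ((s->last_serv_time_ns == 0 && s->rq_in_driver == 1) ||
            tot_time_ns < s->last_serv_time_ns) {
                s->last_serv_time_ns = tot_time_ns;
                /* First (or lower) base value: start trying injection. */
                s->inject_limit = max_uint(1, s->inject_limit);
        } else if (!s->rqs_injected && s->rq_in_driver == 1) {
                /* No injection and no other request in the drive: the
                 * sample is again a clean base value, so track its drift. */
                s->last_serv_time_ns = tot_time_ns;
        }
}

int main(void)
{
        struct inject_state s = { 0, 0, false, 1 };

        update_base_service_time(&s, 120000);   /* first clean sample */
        update_base_service_time(&s, 150000);   /* still clean: base drifts up */
        printf("base=%llu ns, inject_limit=%u\n",
               (unsigned long long)s.last_serv_time_ns, s.inject_limit);
        return 0;
}
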
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index c2faa77824f8..e80adf822bbe 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -357,6 +357,24 @@ struct bfq_queue {
357 357
358 /* max service rate measured so far */ 358 /* max service rate measured so far */
359 u32 max_service_rate; 359 u32 max_service_rate;
360
361 /*
362 * Pointer to the waker queue for this queue, i.e., to the
363 * queue Q such that this queue happens to get new I/O right
364 * after some I/O request of Q is completed. For details, see
365 * the comments on the choice of the queue for injection in
366 * bfq_select_queue().
367 */
368 struct bfq_queue *waker_bfqq;
369 /* node for woken_list, see below */
370 struct hlist_node woken_list_node;
371 /*
372 * Head of the list of the woken queues for this queue, i.e.,
373 * of the list of the queues for which this queue is a waker
374 * queue. This list is used to reset the waker_bfqq pointer in
375 * the woken queues when this queue exits.
376 */
377 struct hlist_head woken_list;
360}; 378};
361 379
362/** 380/**
@@ -533,6 +551,9 @@ struct bfq_data {
533 /* time of last request completion (ns) */ 551 /* time of last request completion (ns) */
534 u64 last_completion; 552 u64 last_completion;
535 553
554 /* bfqq owning the last completed rq */
555 struct bfq_queue *last_completed_rq_bfqq;
556
536 /* time of last transition from empty to non-empty (ns) */ 557 /* time of last transition from empty to non-empty (ns) */
537 u64 last_empty_occupied_ns; 558 u64 last_empty_occupied_ns;
538 559
@@ -743,7 +764,8 @@ enum bfqq_state_flags {
743 * update 764 * update
744 */ 765 */
745 BFQQF_coop, /* bfqq is shared */ 766 BFQQF_coop, /* bfqq is shared */
746 BFQQF_split_coop /* shared bfqq will be split */ 767 BFQQF_split_coop, /* shared bfqq will be split */
768 BFQQF_has_waker /* bfqq has a waker queue */
747}; 769};
748 770
749#define BFQ_BFQQ_FNS(name) \ 771#define BFQ_BFQQ_FNS(name) \
@@ -763,6 +785,7 @@ BFQ_BFQQ_FNS(in_large_burst);
763BFQ_BFQQ_FNS(coop); 785BFQ_BFQQ_FNS(coop);
764BFQ_BFQQ_FNS(split_coop); 786BFQ_BFQQ_FNS(split_coop);
765BFQ_BFQQ_FNS(softrt_update); 787BFQ_BFQQ_FNS(softrt_update);
788BFQ_BFQQ_FNS(has_waker);
766#undef BFQ_BFQQ_FNS 789#undef BFQ_BFQQ_FNS
767 790
768/* Expiration reasons. */ 791/* Expiration reasons. */
@@ -777,8 +800,13 @@ enum bfqq_expiration {
777 BFQQE_PREEMPTED /* preemption in progress */ 800 BFQQE_PREEMPTED /* preemption in progress */
778}; 801};
779 802
803struct bfq_stat {
804 struct percpu_counter cpu_cnt;
805 atomic64_t aux_cnt;
806};
807
780struct bfqg_stats { 808struct bfqg_stats {
781#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 809#ifdef CONFIG_BFQ_CGROUP_DEBUG
782 /* number of ios merged */ 810 /* number of ios merged */
783 struct blkg_rwstat merged; 811 struct blkg_rwstat merged;
784 /* total time spent on device in ns, may not be accurate w/ queueing */ 812 /* total time spent on device in ns, may not be accurate w/ queueing */
@@ -788,25 +816,25 @@ struct bfqg_stats {
788 /* number of IOs queued up */ 816 /* number of IOs queued up */
789 struct blkg_rwstat queued; 817 struct blkg_rwstat queued;
790 /* total disk time and nr sectors dispatched by this group */ 818 /* total disk time and nr sectors dispatched by this group */
791 struct blkg_stat time; 819 struct bfq_stat time;
792 /* sum of number of ios queued across all samples */ 820 /* sum of number of ios queued across all samples */
793 struct blkg_stat avg_queue_size_sum; 821 struct bfq_stat avg_queue_size_sum;
794 /* count of samples taken for average */ 822 /* count of samples taken for average */
795 struct blkg_stat avg_queue_size_samples; 823 struct bfq_stat avg_queue_size_samples;
796 /* how many times this group has been removed from service tree */ 824 /* how many times this group has been removed from service tree */
797 struct blkg_stat dequeue; 825 struct bfq_stat dequeue;
798 /* total time spent waiting for it to be assigned a timeslice. */ 826 /* total time spent waiting for it to be assigned a timeslice. */
799 struct blkg_stat group_wait_time; 827 struct bfq_stat group_wait_time;
800 /* time spent idling for this blkcg_gq */ 828 /* time spent idling for this blkcg_gq */
801 struct blkg_stat idle_time; 829 struct bfq_stat idle_time;
802 /* total time with empty current active q with other requests queued */ 830 /* total time with empty current active q with other requests queued */
803 struct blkg_stat empty_time; 831 struct bfq_stat empty_time;
804 /* fields after this shouldn't be cleared on stat reset */ 832 /* fields after this shouldn't be cleared on stat reset */
805 u64 start_group_wait_time; 833 u64 start_group_wait_time;
806 u64 start_idle_time; 834 u64 start_idle_time;
807 u64 start_empty_time; 835 u64 start_empty_time;
808 uint16_t flags; 836 uint16_t flags;
809#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 837#endif /* CONFIG_BFQ_CGROUP_DEBUG */
810}; 838};
811 839
812#ifdef CONFIG_BFQ_GROUP_IOSCHED 840#ifdef CONFIG_BFQ_GROUP_IOSCHED
diff --git a/block/bio.c b/block/bio.c
index ce797d73bb43..29cd6cf4da51 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -558,14 +558,6 @@ void bio_put(struct bio *bio)
558} 558}
559EXPORT_SYMBOL(bio_put); 559EXPORT_SYMBOL(bio_put);
560 560
561int bio_phys_segments(struct request_queue *q, struct bio *bio)
562{
563 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
564 blk_recount_segments(q, bio);
565
566 return bio->bi_phys_segments;
567}
568
569/** 561/**
570 * __bio_clone_fast - clone a bio that shares the original bio's biovec 562 * __bio_clone_fast - clone a bio that shares the original bio's biovec
571 * @bio: destination bio 563 * @bio: destination bio
@@ -731,10 +723,10 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
731 } 723 }
732 } 724 }
733 725
734 if (bio_full(bio)) 726 if (bio_full(bio, len))
735 return 0; 727 return 0;
736 728
737 if (bio->bi_phys_segments >= queue_max_segments(q)) 729 if (bio->bi_vcnt >= queue_max_segments(q))
738 return 0; 730 return 0;
739 731
740 bvec = &bio->bi_io_vec[bio->bi_vcnt]; 732 bvec = &bio->bi_io_vec[bio->bi_vcnt];
@@ -744,8 +736,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
744 bio->bi_vcnt++; 736 bio->bi_vcnt++;
745 done: 737 done:
746 bio->bi_iter.bi_size += len; 738 bio->bi_iter.bi_size += len;
747 bio->bi_phys_segments = bio->bi_vcnt;
748 bio_set_flag(bio, BIO_SEG_VALID);
749 return len; 739 return len;
750} 740}
751 741
@@ -807,7 +797,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
807 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; 797 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
808 798
809 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); 799 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
810 WARN_ON_ONCE(bio_full(bio)); 800 WARN_ON_ONCE(bio_full(bio, len));
811 801
812 bv->bv_page = page; 802 bv->bv_page = page;
813 bv->bv_offset = off; 803 bv->bv_offset = off;
@@ -834,7 +824,7 @@ int bio_add_page(struct bio *bio, struct page *page,
834 bool same_page = false; 824 bool same_page = false;
835 825
836 if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { 826 if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
837 if (bio_full(bio)) 827 if (bio_full(bio, len))
838 return 0; 828 return 0;
839 __bio_add_page(bio, page, len, offset); 829 __bio_add_page(bio, page, len, offset);
840 } 830 }
@@ -842,22 +832,19 @@ int bio_add_page(struct bio *bio, struct page *page,
842} 832}
843EXPORT_SYMBOL(bio_add_page); 833EXPORT_SYMBOL(bio_add_page);
844 834
845static void bio_get_pages(struct bio *bio) 835void bio_release_pages(struct bio *bio, bool mark_dirty)
846{ 836{
847 struct bvec_iter_all iter_all; 837 struct bvec_iter_all iter_all;
848 struct bio_vec *bvec; 838 struct bio_vec *bvec;
849 839
850 bio_for_each_segment_all(bvec, bio, iter_all) 840 if (bio_flagged(bio, BIO_NO_PAGE_REF))
851 get_page(bvec->bv_page); 841 return;
852}
853
854static void bio_release_pages(struct bio *bio)
855{
856 struct bvec_iter_all iter_all;
857 struct bio_vec *bvec;
858 842
859 bio_for_each_segment_all(bvec, bio, iter_all) 843 bio_for_each_segment_all(bvec, bio, iter_all) {
844 if (mark_dirty && !PageCompound(bvec->bv_page))
845 set_page_dirty_lock(bvec->bv_page);
860 put_page(bvec->bv_page); 846 put_page(bvec->bv_page);
847 }
861} 848}
862 849
863static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) 850static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
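
After this hunk a single helper covers every caller: it skips bios whose pages were never referenced, optionally dirties each non-compound page, and drops the reference. A rough userspace model of that behavior, with hypothetical page_ref and fake_bio types standing in for struct page, the bio vector and the BIO_NO_PAGE_REF flag:

#include <stdbool.h>
#include <stddef.h>

struct page_ref { int refcount; bool dirty; bool compound; };

struct fake_bio {
        struct page_ref **pages;
        size_t nr_pages;
        bool no_page_ref;    /* models the BIO_NO_PAGE_REF flag */
};

static void release_pages(struct fake_bio *bio, bool mark_dirty)
{
        size_t i;

        if (bio->no_page_ref)
                return;

        for (i = 0; i < bio->nr_pages; i++) {
                struct page_ref *p = bio->pages[i];

                if (mark_dirty && !p->compound)
                        p->dirty = true;
                p->refcount--;
        }
}

int main(void)
{
        struct page_ref p0 = { 1, false, false };
        struct page_ref *vec[] = { &p0 };
        struct fake_bio bio = { vec, 1, false };

        release_pages(&bio, true);   /* dirty and drop the single page */
        return (p0.refcount == 0 && p0.dirty) ? 0 : 1;
}
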
@@ -922,7 +909,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
922 if (same_page) 909 if (same_page)
923 put_page(page); 910 put_page(page);
924 } else { 911 } else {
925 if (WARN_ON_ONCE(bio_full(bio))) 912 if (WARN_ON_ONCE(bio_full(bio, len)))
926 return -EINVAL; 913 return -EINVAL;
927 __bio_add_page(bio, page, len, offset); 914 __bio_add_page(bio, page, len, offset);
928 } 915 }
@@ -966,13 +953,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
966 ret = __bio_iov_bvec_add_pages(bio, iter); 953 ret = __bio_iov_bvec_add_pages(bio, iter);
967 else 954 else
968 ret = __bio_iov_iter_get_pages(bio, iter); 955 ret = __bio_iov_iter_get_pages(bio, iter);
969 } while (!ret && iov_iter_count(iter) && !bio_full(bio)); 956 } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
970 957
971 if (iov_iter_bvec_no_ref(iter)) 958 if (is_bvec)
972 bio_set_flag(bio, BIO_NO_PAGE_REF); 959 bio_set_flag(bio, BIO_NO_PAGE_REF);
973 else if (is_bvec)
974 bio_get_pages(bio);
975
976 return bio->bi_vcnt ? 0 : ret; 960 return bio->bi_vcnt ? 0 : ret;
977} 961}
978 962
@@ -1124,8 +1108,7 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
1124 if (data->nr_segs > UIO_MAXIOV) 1108 if (data->nr_segs > UIO_MAXIOV)
1125 return NULL; 1109 return NULL;
1126 1110
1127 bmd = kmalloc(sizeof(struct bio_map_data) + 1111 bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
1128 sizeof(struct iovec) * data->nr_segs, gfp_mask);
1129 if (!bmd) 1112 if (!bmd)
1130 return NULL; 1113 return NULL;
1131 memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); 1114 memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
@@ -1371,8 +1354,6 @@ struct bio *bio_map_user_iov(struct request_queue *q,
1371 int j; 1354 int j;
1372 struct bio *bio; 1355 struct bio *bio;
1373 int ret; 1356 int ret;
1374 struct bio_vec *bvec;
1375 struct bvec_iter_all iter_all;
1376 1357
1377 if (!iov_iter_count(iter)) 1358 if (!iov_iter_count(iter))
1378 return ERR_PTR(-EINVAL); 1359 return ERR_PTR(-EINVAL);
@@ -1439,31 +1420,11 @@ struct bio *bio_map_user_iov(struct request_queue *q,
1439 return bio; 1420 return bio;
1440 1421
1441 out_unmap: 1422 out_unmap:
1442 bio_for_each_segment_all(bvec, bio, iter_all) { 1423 bio_release_pages(bio, false);
1443 put_page(bvec->bv_page);
1444 }
1445 bio_put(bio); 1424 bio_put(bio);
1446 return ERR_PTR(ret); 1425 return ERR_PTR(ret);
1447} 1426}
1448 1427
1449static void __bio_unmap_user(struct bio *bio)
1450{
1451 struct bio_vec *bvec;
1452 struct bvec_iter_all iter_all;
1453
1454 /*
1455 * make sure we dirty pages we wrote to
1456 */
1457 bio_for_each_segment_all(bvec, bio, iter_all) {
1458 if (bio_data_dir(bio) == READ)
1459 set_page_dirty_lock(bvec->bv_page);
1460
1461 put_page(bvec->bv_page);
1462 }
1463
1464 bio_put(bio);
1465}
1466
1467/** 1428/**
1468 * bio_unmap_user - unmap a bio 1429 * bio_unmap_user - unmap a bio
1469 * @bio: the bio being unmapped 1430 * @bio: the bio being unmapped
@@ -1475,7 +1436,8 @@ static void __bio_unmap_user(struct bio *bio)
1475 */ 1436 */
1476void bio_unmap_user(struct bio *bio) 1437void bio_unmap_user(struct bio *bio)
1477{ 1438{
1478 __bio_unmap_user(bio); 1439 bio_release_pages(bio, bio_data_dir(bio) == READ);
1440 bio_put(bio);
1479 bio_put(bio); 1441 bio_put(bio);
1480} 1442}
1481 1443
@@ -1695,9 +1657,7 @@ static void bio_dirty_fn(struct work_struct *work)
1695 while ((bio = next) != NULL) { 1657 while ((bio = next) != NULL) {
1696 next = bio->bi_private; 1658 next = bio->bi_private;
1697 1659
1698 bio_set_pages_dirty(bio); 1660 bio_release_pages(bio, true);
1699 if (!bio_flagged(bio, BIO_NO_PAGE_REF))
1700 bio_release_pages(bio);
1701 bio_put(bio); 1661 bio_put(bio);
1702 } 1662 }
1703} 1663}
@@ -1713,8 +1673,7 @@ void bio_check_pages_dirty(struct bio *bio)
1713 goto defer; 1673 goto defer;
1714 } 1674 }
1715 1675
1716 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) 1676 bio_release_pages(bio, false);
1717 bio_release_pages(bio);
1718 bio_put(bio); 1677 bio_put(bio);
1719 return; 1678 return;
1720defer: 1679defer:
@@ -1775,18 +1734,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
1775} 1734}
1776EXPORT_SYMBOL(generic_end_io_acct); 1735EXPORT_SYMBOL(generic_end_io_acct);
1777 1736
1778#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1779void bio_flush_dcache_pages(struct bio *bi)
1780{
1781 struct bio_vec bvec;
1782 struct bvec_iter iter;
1783
1784 bio_for_each_segment(bvec, bi, iter)
1785 flush_dcache_page(bvec.bv_page);
1786}
1787EXPORT_SYMBOL(bio_flush_dcache_pages);
1788#endif
1789
1790static inline bool bio_remaining_done(struct bio *bio) 1737static inline bool bio_remaining_done(struct bio *bio)
1791{ 1738{
1792 /* 1739 /*
@@ -1914,10 +1861,7 @@ void bio_trim(struct bio *bio, int offset, int size)
1914 if (offset == 0 && size == bio->bi_iter.bi_size) 1861 if (offset == 0 && size == bio->bi_iter.bi_size)
1915 return; 1862 return;
1916 1863
1917 bio_clear_flag(bio, BIO_SEG_VALID);
1918
1919 bio_advance(bio, offset << 9); 1864 bio_advance(bio, offset << 9);
1920
1921 bio->bi_iter.bi_size = size; 1865 bio->bi_iter.bi_size = size;
1922 1866
1923 if (bio_integrity(bio)) 1867 if (bio_integrity(bio))
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1f7127b03490..53b7bd4c7000 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -79,6 +79,7 @@ static void blkg_free(struct blkcg_gq *blkg)
79 79
80 blkg_rwstat_exit(&blkg->stat_ios); 80 blkg_rwstat_exit(&blkg->stat_ios);
81 blkg_rwstat_exit(&blkg->stat_bytes); 81 blkg_rwstat_exit(&blkg->stat_bytes);
82 percpu_ref_exit(&blkg->refcnt);
82 kfree(blkg); 83 kfree(blkg);
83} 84}
84 85
@@ -86,8 +87,6 @@ static void __blkg_release(struct rcu_head *rcu)
86{ 87{
87 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); 88 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
88 89
89 percpu_ref_exit(&blkg->refcnt);
90
91 /* release the blkcg and parent blkg refs this blkg has been holding */ 90 /* release the blkcg and parent blkg refs this blkg has been holding */
92 css_put(&blkg->blkcg->css); 91 css_put(&blkg->blkcg->css);
93 if (blkg->parent) 92 if (blkg->parent)
@@ -132,6 +131,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
132 if (!blkg) 131 if (!blkg)
133 return NULL; 132 return NULL;
134 133
134 if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
135 goto err_free;
136
135 if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || 137 if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
136 blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) 138 blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
137 goto err_free; 139 goto err_free;
@@ -244,11 +246,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
244 blkg_get(blkg->parent); 246 blkg_get(blkg->parent);
245 } 247 }
246 248
247 ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
248 GFP_NOWAIT | __GFP_NOWARN);
249 if (ret)
250 goto err_cancel_ref;
251
252 /* invoke per-policy init */ 249 /* invoke per-policy init */
253 for (i = 0; i < BLKCG_MAX_POLS; i++) { 250 for (i = 0; i < BLKCG_MAX_POLS; i++) {
254 struct blkcg_policy *pol = blkcg_policy[i]; 251 struct blkcg_policy *pol = blkcg_policy[i];
@@ -281,8 +278,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
281 blkg_put(blkg); 278 blkg_put(blkg);
282 return ERR_PTR(ret); 279 return ERR_PTR(ret);
283 280
284err_cancel_ref:
285 percpu_ref_exit(&blkg->refcnt);
286err_put_congested: 281err_put_congested:
287 wb_congested_put(wb_congested); 282 wb_congested_put(wb_congested);
288err_put_css: 283err_put_css:
@@ -549,7 +544,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
549 * Print @rwstat to @sf for the device associated with @pd. 544 * Print @rwstat to @sf for the device associated with @pd.
550 */ 545 */
551u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 546u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
552 const struct blkg_rwstat *rwstat) 547 const struct blkg_rwstat_sample *rwstat)
553{ 548{
554 static const char *rwstr[] = { 549 static const char *rwstr[] = {
555 [BLKG_RWSTAT_READ] = "Read", 550 [BLKG_RWSTAT_READ] = "Read",
@@ -567,31 +562,17 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
567 562
568 for (i = 0; i < BLKG_RWSTAT_NR; i++) 563 for (i = 0; i < BLKG_RWSTAT_NR; i++)
569 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], 564 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
570 (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); 565 rwstat->cnt[i]);
571 566
572 v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + 567 v = rwstat->cnt[BLKG_RWSTAT_READ] +
573 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + 568 rwstat->cnt[BLKG_RWSTAT_WRITE] +
574 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); 569 rwstat->cnt[BLKG_RWSTAT_DISCARD];
575 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); 570 seq_printf(sf, "%s Total %llu\n", dname, v);
576 return v; 571 return v;
577} 572}
578EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); 573EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
579 574
580/** 575/**
581 * blkg_prfill_stat - prfill callback for blkg_stat
582 * @sf: seq_file to print to
583 * @pd: policy private data of interest
584 * @off: offset to the blkg_stat in @pd
585 *
586 * prfill callback for printing a blkg_stat.
587 */
588u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
589{
590 return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
591}
592EXPORT_SYMBOL_GPL(blkg_prfill_stat);
593
594/**
595 * blkg_prfill_rwstat - prfill callback for blkg_rwstat 576 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
596 * @sf: seq_file to print to 577 * @sf: seq_file to print to
597 * @pd: policy private data of interest 578 * @pd: policy private data of interest
@@ -602,8 +583,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat);
602u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 583u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
603 int off) 584 int off)
604{ 585{
605 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); 586 struct blkg_rwstat_sample rwstat = { };
606 587
588 blkg_rwstat_read((void *)pd + off, &rwstat);
607 return __blkg_prfill_rwstat(sf, pd, &rwstat); 589 return __blkg_prfill_rwstat(sf, pd, &rwstat);
608} 590}
609EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 591EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
@@ -611,8 +593,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
611static u64 blkg_prfill_rwstat_field(struct seq_file *sf, 593static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
612 struct blkg_policy_data *pd, int off) 594 struct blkg_policy_data *pd, int off)
613{ 595{
614 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); 596 struct blkg_rwstat_sample rwstat = { };
615 597
598 blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
616 return __blkg_prfill_rwstat(sf, pd, &rwstat); 599 return __blkg_prfill_rwstat(sf, pd, &rwstat);
617} 600}
618 601
@@ -654,8 +637,9 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
654 struct blkg_policy_data *pd, 637 struct blkg_policy_data *pd,
655 int off) 638 int off)
656{ 639{
657 struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, 640 struct blkg_rwstat_sample rwstat;
658 NULL, off); 641
642 blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
659 return __blkg_prfill_rwstat(sf, pd, &rwstat); 643 return __blkg_prfill_rwstat(sf, pd, &rwstat);
660} 644}
661 645
@@ -690,52 +674,11 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
690EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); 674EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
691 675
692/** 676/**
693 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
694 * @blkg: blkg of interest
695 * @pol: blkcg_policy which contains the blkg_stat
696 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
697 *
698 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
699 * online descendants and their aux counts. The caller must be holding the
700 * queue lock for online tests.
701 *
702 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
703 * at @off bytes into @blkg's blkg_policy_data of the policy.
704 */
705u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
706 struct blkcg_policy *pol, int off)
707{
708 struct blkcg_gq *pos_blkg;
709 struct cgroup_subsys_state *pos_css;
710 u64 sum = 0;
711
712 lockdep_assert_held(&blkg->q->queue_lock);
713
714 rcu_read_lock();
715 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
716 struct blkg_stat *stat;
717
718 if (!pos_blkg->online)
719 continue;
720
721 if (pol)
722 stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
723 else
724 stat = (void *)blkg + off;
725
726 sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
727 }
728 rcu_read_unlock();
729
730 return sum;
731}
732EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
733
734/**
735 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat 677 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
736 * @blkg: blkg of interest 678 * @blkg: blkg of interest
737 * @pol: blkcg_policy which contains the blkg_rwstat 679 * @pol: blkcg_policy which contains the blkg_rwstat
738 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg 680 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
681 * @sum: blkg_rwstat_sample structure containing the results
739 * 682 *
740 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its 683 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
741 * online descendants and their aux counts. The caller must be holding the 684 * online descendants and their aux counts. The caller must be holding the
@@ -744,13 +687,12 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
744 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it 687 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
745 * is at @off bytes into @blkg's blkg_policy_data of the policy. 688 * is at @off bytes into @blkg's blkg_policy_data of the policy.
746 */ 689 */
747struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, 690void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
748 struct blkcg_policy *pol, int off) 691 int off, struct blkg_rwstat_sample *sum)
749{ 692{
750 struct blkcg_gq *pos_blkg; 693 struct blkcg_gq *pos_blkg;
751 struct cgroup_subsys_state *pos_css; 694 struct cgroup_subsys_state *pos_css;
752 struct blkg_rwstat sum = { }; 695 unsigned int i;
753 int i;
754 696
755 lockdep_assert_held(&blkg->q->queue_lock); 697 lockdep_assert_held(&blkg->q->queue_lock);
756 698
@@ -767,13 +709,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
767 rwstat = (void *)pos_blkg + off; 709 rwstat = (void *)pos_blkg + off;
768 710
769 for (i = 0; i < BLKG_RWSTAT_NR; i++) 711 for (i = 0; i < BLKG_RWSTAT_NR; i++)
770 atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + 712 sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
771 percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
772 &sum.aux_cnt[i]);
773 } 713 }
774 rcu_read_unlock(); 714 rcu_read_unlock();
775
776 return sum;
777} 715}
778EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); 716EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
779 717
@@ -939,7 +877,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
939 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 877 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
940 const char *dname; 878 const char *dname;
941 char *buf; 879 char *buf;
942 struct blkg_rwstat rwstat; 880 struct blkg_rwstat_sample rwstat;
943 u64 rbytes, wbytes, rios, wios, dbytes, dios; 881 u64 rbytes, wbytes, rios, wios, dbytes, dios;
944 size_t size = seq_get_buf(sf, &buf), off = 0; 882 size_t size = seq_get_buf(sf, &buf), off = 0;
945 int i; 883 int i;
@@ -959,17 +897,17 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
959 897
960 spin_lock_irq(&blkg->q->queue_lock); 898 spin_lock_irq(&blkg->q->queue_lock);
961 899
962 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 900 blkg_rwstat_recursive_sum(blkg, NULL,
963 offsetof(struct blkcg_gq, stat_bytes)); 901 offsetof(struct blkcg_gq, stat_bytes), &rwstat);
964 rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); 902 rbytes = rwstat.cnt[BLKG_RWSTAT_READ];
965 wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 903 wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE];
966 dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); 904 dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD];
967 905
968 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 906 blkg_rwstat_recursive_sum(blkg, NULL,
969 offsetof(struct blkcg_gq, stat_ios)); 907 offsetof(struct blkcg_gq, stat_ios), &rwstat);
970 rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); 908 rios = rwstat.cnt[BLKG_RWSTAT_READ];
971 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 909 wios = rwstat.cnt[BLKG_RWSTAT_WRITE];
972 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); 910 dios = rwstat.cnt[BLKG_RWSTAT_DISCARD];
973 911
974 spin_unlock_irq(&blkg->q->queue_lock); 912 spin_unlock_irq(&blkg->q->queue_lock);
975 913
@@ -1006,8 +944,12 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
1006 } 944 }
1007next: 945next:
1008 if (has_stats) { 946 if (has_stats) {
1009 off += scnprintf(buf+off, size-off, "\n"); 947 if (off < size - 1) {
1010 seq_commit(sf, off); 948 off += scnprintf(buf+off, size-off, "\n");
949 seq_commit(sf, off);
950 } else {
951 seq_commit(sf, -1);
952 }
1011 } 953 }
1012 } 954 }
1013 955
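
The fix above stops blkcg_print_stat() from writing the trailing newline past the seq buffer; committing -1 instead tells the seq_file core to retry with a larger buffer. A minimal userspace sketch of that check, with snprintf standing in for scnprintf and an int return standing in for the seq_commit() argument:

#include <stdio.h>

static int emit_trailing_newline(char *buf, size_t size, size_t off)
{
        if (off < size - 1) {
                off += snprintf(buf + off, size - off, "\n");
                return (int)off;        /* commit exactly this many bytes */
        }
        return -1;                      /* buffer already full: retry */
}

int main(void)
{
        char buf[8] = "stats";

        printf("roomy buffer -> commit %d\n", emit_trailing_newline(buf, sizeof(buf), 5));
        printf("full buffer  -> commit %d\n", emit_trailing_newline(buf, sizeof(buf), 7));
        return 0;
}
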
@@ -1391,7 +1333,8 @@ pd_prealloc:
1391 1333
1392 spin_lock_irq(&q->queue_lock); 1334 spin_lock_irq(&q->queue_lock);
1393 1335
1394 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1336 /* blkg_list is pushed at the head, reverse walk to init parents first */
1337 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
1395 struct blkg_policy_data *pd; 1338 struct blkg_policy_data *pd;
1396 1339
1397 if (blkg->pd[pol->plid]) 1340 if (blkg->pd[pol->plid])
diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..5d1fc8e17dd1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -120,6 +120,42 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
120} 120}
121EXPORT_SYMBOL(blk_rq_init); 121EXPORT_SYMBOL(blk_rq_init);
122 122
123#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
124static const char *const blk_op_name[] = {
125 REQ_OP_NAME(READ),
126 REQ_OP_NAME(WRITE),
127 REQ_OP_NAME(FLUSH),
128 REQ_OP_NAME(DISCARD),
129 REQ_OP_NAME(SECURE_ERASE),
130 REQ_OP_NAME(ZONE_RESET),
131 REQ_OP_NAME(WRITE_SAME),
132 REQ_OP_NAME(WRITE_ZEROES),
133 REQ_OP_NAME(SCSI_IN),
134 REQ_OP_NAME(SCSI_OUT),
135 REQ_OP_NAME(DRV_IN),
136 REQ_OP_NAME(DRV_OUT),
137};
138#undef REQ_OP_NAME
139
140/**
141 * blk_op_str - Return the string XXX for the given REQ_OP_XXX.
142 * @op: REQ_OP_XXX.
143 *
144 * Description: Centralized block layer helper to convert REQ_OP_XXX into
145 * string format. Useful for debugging and tracing a bio or request. For
146 * an invalid REQ_OP_XXX it returns the string "UNKNOWN".
147 */
148inline const char *blk_op_str(unsigned int op)
149{
150 const char *op_str = "UNKNOWN";
151
152 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
153 op_str = blk_op_name[op];
154
155 return op_str;
156}
157EXPORT_SYMBOL_GPL(blk_op_str);
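
The table-plus-fallback pattern used by blk_op_str() is easy to reproduce outside the kernel. A minimal userspace sketch with a made-up enum (not the real REQ_OP_* values); the gap at value 2 shows how holes in the table fall back to "UNKNOWN":

#include <stdio.h>

enum my_op { OP_READ, OP_WRITE, OP_FLUSH = 3, OP_MAX };

#define OP_NAME(name) [OP_##name] = #name
static const char *const op_name[OP_MAX] = {
        OP_NAME(READ),
        OP_NAME(WRITE),
        OP_NAME(FLUSH),
};
#undef OP_NAME

static const char *op_str(unsigned int op)
{
        if (op < OP_MAX && op_name[op])
                return op_name[op];
        return "UNKNOWN";     /* holes and out-of-range ops fall back here */
}

int main(void)
{
        printf("%s %s %s\n", op_str(OP_WRITE), op_str(2), op_str(42));
        return 0;
}
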
158
123static const struct { 159static const struct {
124 int errno; 160 int errno;
125 const char *name; 161 const char *name;
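A hedged usage sketch for the newly exported blk_op_str(), along the lines its kernel-doc suggests (the helper and the message are illustrative only, not part of the patch):

#include <linux/blkdev.h>

static void demo_trace_rq(struct request *rq)
{
	/* prints e.g. "sda: op 0x1 (WRITE)"; invalid ops fall back to "UNKNOWN" */
	pr_debug("%s: op 0x%x (%s)\n",
		 rq->rq_disk ? rq->rq_disk->disk_name : "?",
		 req_op(rq), blk_op_str(req_op(rq)));
}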
@@ -167,18 +203,23 @@ int blk_status_to_errno(blk_status_t status)
167} 203}
168EXPORT_SYMBOL_GPL(blk_status_to_errno); 204EXPORT_SYMBOL_GPL(blk_status_to_errno);
169 205
170static void print_req_error(struct request *req, blk_status_t status) 206static void print_req_error(struct request *req, blk_status_t status,
207 const char *caller)
171{ 208{
172 int idx = (__force int)status; 209 int idx = (__force int)status;
173 210
174 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) 211 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
175 return; 212 return;
176 213
177 printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", 214 printk_ratelimited(KERN_ERR
178 __func__, blk_errors[idx].name, 215 "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
179 req->rq_disk ? req->rq_disk->disk_name : "?", 216 "phys_seg %u prio class %u\n",
180 (unsigned long long)blk_rq_pos(req), 217 caller, blk_errors[idx].name,
181 req->cmd_flags); 218 req->rq_disk ? req->rq_disk->disk_name : "?",
219 blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
220 req->cmd_flags & ~REQ_OP_MASK,
221 req->nr_phys_segments,
222 IOPRIO_PRIO_CLASS(req->ioprio));
182} 223}
183 224
184static void req_bio_endio(struct request *rq, struct bio *bio, 225static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -550,15 +591,15 @@ void blk_put_request(struct request *req)
550} 591}
551EXPORT_SYMBOL(blk_put_request); 592EXPORT_SYMBOL(blk_put_request);
552 593
553bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 594bool bio_attempt_back_merge(struct request *req, struct bio *bio,
554 struct bio *bio) 595 unsigned int nr_segs)
555{ 596{
556 const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 597 const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
557 598
558 if (!ll_back_merge_fn(q, req, bio)) 599 if (!ll_back_merge_fn(req, bio, nr_segs))
559 return false; 600 return false;
560 601
561 trace_block_bio_backmerge(q, req, bio); 602 trace_block_bio_backmerge(req->q, req, bio);
562 603
563 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 604 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
564 blk_rq_set_mixed_merge(req); 605 blk_rq_set_mixed_merge(req);
@@ -571,15 +612,15 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
571 return true; 612 return true;
572} 613}
573 614
574bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 615bool bio_attempt_front_merge(struct request *req, struct bio *bio,
575 struct bio *bio) 616 unsigned int nr_segs)
576{ 617{
577 const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 618 const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
578 619
579 if (!ll_front_merge_fn(q, req, bio)) 620 if (!ll_front_merge_fn(req, bio, nr_segs))
580 return false; 621 return false;
581 622
582 trace_block_bio_frontmerge(q, req, bio); 623 trace_block_bio_frontmerge(req->q, req, bio);
583 624
584 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 625 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
585 blk_rq_set_mixed_merge(req); 626 blk_rq_set_mixed_merge(req);
@@ -621,6 +662,7 @@ no_merge:
621 * blk_attempt_plug_merge - try to merge with %current's plugged list 662 * blk_attempt_plug_merge - try to merge with %current's plugged list
622 * @q: request_queue new bio is being queued at 663 * @q: request_queue new bio is being queued at
623 * @bio: new bio being queued 664 * @bio: new bio being queued
665 * @nr_segs: number of segments in @bio
624 * @same_queue_rq: pointer to &struct request that gets filled in when 666 * @same_queue_rq: pointer to &struct request that gets filled in when
625 * another request associated with @q is found on the plug list 667 * another request associated with @q is found on the plug list
626 * (optional, may be %NULL) 668 * (optional, may be %NULL)
@@ -639,7 +681,7 @@ no_merge:
639 * Caller must ensure !blk_queue_nomerges(q) beforehand. 681 * Caller must ensure !blk_queue_nomerges(q) beforehand.
640 */ 682 */
641bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 683bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
642 struct request **same_queue_rq) 684 unsigned int nr_segs, struct request **same_queue_rq)
643{ 685{
644 struct blk_plug *plug; 686 struct blk_plug *plug;
645 struct request *rq; 687 struct request *rq;
@@ -668,10 +710,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
668 710
669 switch (blk_try_merge(rq, bio)) { 711 switch (blk_try_merge(rq, bio)) {
670 case ELEVATOR_BACK_MERGE: 712 case ELEVATOR_BACK_MERGE:
671 merged = bio_attempt_back_merge(q, rq, bio); 713 merged = bio_attempt_back_merge(rq, bio, nr_segs);
672 break; 714 break;
673 case ELEVATOR_FRONT_MERGE: 715 case ELEVATOR_FRONT_MERGE:
674 merged = bio_attempt_front_merge(q, rq, bio); 716 merged = bio_attempt_front_merge(rq, bio, nr_segs);
675 break; 717 break;
676 case ELEVATOR_DISCARD_MERGE: 718 case ELEVATOR_DISCARD_MERGE:
677 merged = bio_attempt_discard_merge(q, rq, bio); 719 merged = bio_attempt_discard_merge(q, rq, bio);
@@ -687,18 +729,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
687 return false; 729 return false;
688} 730}
689 731
690void blk_init_request_from_bio(struct request *req, struct bio *bio)
691{
692 if (bio->bi_opf & REQ_RAHEAD)
693 req->cmd_flags |= REQ_FAILFAST_MASK;
694
695 req->__sector = bio->bi_iter.bi_sector;
696 req->ioprio = bio_prio(bio);
697 req->write_hint = bio->bi_write_hint;
698 blk_rq_bio_prep(req->q, req, bio);
699}
700EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
701
702static void handle_bad_sector(struct bio *bio, sector_t maxsector) 732static void handle_bad_sector(struct bio *bio, sector_t maxsector)
703{ 733{
704 char b[BDEVNAME_SIZE]; 734 char b[BDEVNAME_SIZE];
@@ -1163,7 +1193,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
1163 * Recalculate it to check the request correctly on this queue's 1193 * Recalculate it to check the request correctly on this queue's
1164 * limitation. 1194 * limitation.
1165 */ 1195 */
1166 blk_recalc_rq_segments(rq); 1196 rq->nr_phys_segments = blk_recalc_rq_segments(rq);
1167 if (rq->nr_phys_segments > queue_max_segments(q)) { 1197 if (rq->nr_phys_segments > queue_max_segments(q)) {
1168 printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", 1198 printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
1169 __func__, rq->nr_phys_segments, queue_max_segments(q)); 1199 __func__, rq->nr_phys_segments, queue_max_segments(q));
@@ -1348,7 +1378,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
1348 * 1378 *
1349 * This special helper function is only for request stacking drivers 1379 * This special helper function is only for request stacking drivers
1350 * (e.g. request-based dm) so that they can handle partial completion. 1380 * (e.g. request-based dm) so that they can handle partial completion.
1351 * Actual device drivers should use blk_end_request instead. 1381 * Actual device drivers should use blk_mq_end_request instead.
1352 * 1382 *
1353 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 1383 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
1354 * %false return from this function. 1384 * %false return from this function.
@@ -1373,7 +1403,7 @@ bool blk_update_request(struct request *req, blk_status_t error,
1373 1403
1374 if (unlikely(error && !blk_rq_is_passthrough(req) && 1404 if (unlikely(error && !blk_rq_is_passthrough(req) &&
1375 !(req->rq_flags & RQF_QUIET))) 1405 !(req->rq_flags & RQF_QUIET)))
1376 print_req_error(req, error); 1406 print_req_error(req, error, __func__);
1377 1407
1378 blk_account_io_completion(req, nr_bytes); 1408 blk_account_io_completion(req, nr_bytes);
1379 1409
@@ -1432,28 +1462,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
1432 } 1462 }
1433 1463
1434 /* recalculate the number of segments */ 1464 /* recalculate the number of segments */
1435 blk_recalc_rq_segments(req); 1465 req->nr_phys_segments = blk_recalc_rq_segments(req);
1436 } 1466 }
1437 1467
1438 return true; 1468 return true;
1439} 1469}
1440EXPORT_SYMBOL_GPL(blk_update_request); 1470EXPORT_SYMBOL_GPL(blk_update_request);
1441 1471
1442void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
1443 struct bio *bio)
1444{
1445 if (bio_has_data(bio))
1446 rq->nr_phys_segments = bio_phys_segments(q, bio);
1447 else if (bio_op(bio) == REQ_OP_DISCARD)
1448 rq->nr_phys_segments = 1;
1449
1450 rq->__data_len = bio->bi_iter.bi_size;
1451 rq->bio = rq->biotail = bio;
1452
1453 if (bio->bi_disk)
1454 rq->rq_disk = bio->bi_disk;
1455}
1456
1457#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1472#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1458/** 1473/**
1459 * rq_flush_dcache_pages - Helper function to flush all pages in a request 1474 * rq_flush_dcache_pages - Helper function to flush all pages in a request
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index d22e61bced86..d973c38ee4fd 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -618,44 +618,26 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
618 618
619 inflight = atomic_dec_return(&rqw->inflight); 619 inflight = atomic_dec_return(&rqw->inflight);
620 WARN_ON_ONCE(inflight < 0); 620 WARN_ON_ONCE(inflight < 0);
621 if (iolat->min_lat_nsec == 0) 621 /*
622 goto next; 622 * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
623 iolatency_record_time(iolat, &bio->bi_issue, now, 623 * submitted, so do not account for it.
624 issue_as_root); 624 */
625 window_start = atomic64_read(&iolat->window_start); 625 if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
626 if (now > window_start && 626 iolatency_record_time(iolat, &bio->bi_issue, now,
627 (now - window_start) >= iolat->cur_win_nsec) { 627 issue_as_root);
628 if (atomic64_cmpxchg(&iolat->window_start, 628 window_start = atomic64_read(&iolat->window_start);
629 window_start, now) == window_start) 629 if (now > window_start &&
630 iolatency_check_latencies(iolat, now); 630 (now - window_start) >= iolat->cur_win_nsec) {
631 if (atomic64_cmpxchg(&iolat->window_start,
632 window_start, now) == window_start)
633 iolatency_check_latencies(iolat, now);
634 }
631 } 635 }
632next:
633 wake_up(&rqw->wait); 636 wake_up(&rqw->wait);
634 blkg = blkg->parent; 637 blkg = blkg->parent;
635 } 638 }
636} 639}
637 640
638static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
639{
640 struct blkcg_gq *blkg;
641
642 blkg = bio->bi_blkg;
643 while (blkg && blkg->parent) {
644 struct rq_wait *rqw;
645 struct iolatency_grp *iolat;
646
647 iolat = blkg_to_lat(blkg);
648 if (!iolat)
649 goto next;
650
651 rqw = &iolat->rq_wait;
652 atomic_dec(&rqw->inflight);
653 wake_up(&rqw->wait);
654next:
655 blkg = blkg->parent;
656 }
657}
658
659static void blkcg_iolatency_exit(struct rq_qos *rqos) 641static void blkcg_iolatency_exit(struct rq_qos *rqos)
660{ 642{
661 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); 643 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
@@ -667,7 +649,6 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos)
667 649
668static struct rq_qos_ops blkcg_iolatency_ops = { 650static struct rq_qos_ops blkcg_iolatency_ops = {
669 .throttle = blkcg_iolatency_throttle, 651 .throttle = blkcg_iolatency_throttle,
670 .cleanup = blkcg_iolatency_cleanup,
671 .done_bio = blkcg_iolatency_done_bio, 652 .done_bio = blkcg_iolatency_done_bio,
672 .exit = blkcg_iolatency_exit, 653 .exit = blkcg_iolatency_exit,
673}; 654};
@@ -778,8 +759,10 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
778 759
779 if (!oldval && val) 760 if (!oldval && val)
780 return 1; 761 return 1;
781 if (oldval && !val) 762 if (oldval && !val) {
763 blkcg_clear_delay(blkg);
782 return -1; 764 return -1;
765 }
783 return 0; 766 return 0;
784} 767}
785 768
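For context on the condition added above (background knowledge, not something the hunk itself states): BLK_STS_AGAIN is what a REQ_NOWAIT bio is completed with when the submission path would have had to block, so such a bio never reached the device and feeding it into the latency windows would only skew them. A hedged sketch of the submitter side:

#include <linux/bio.h>

/* Hypothetical bi_end_io for a REQ_NOWAIT submitter. */
static void demo_nowait_end_io(struct bio *bio)
{
	if (bio->bi_status == BLK_STS_AGAIN) {
		/* never submitted; retry later from a context that may block */
		return;
	}
	/* normal completion handling ... */
}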
diff --git a/block/blk-map.c b/block/blk-map.c
index db9373bd31ac..3a62e471d81b 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -18,13 +18,19 @@
18int blk_rq_append_bio(struct request *rq, struct bio **bio) 18int blk_rq_append_bio(struct request *rq, struct bio **bio)
19{ 19{
20 struct bio *orig_bio = *bio; 20 struct bio *orig_bio = *bio;
21 struct bvec_iter iter;
22 struct bio_vec bv;
23 unsigned int nr_segs = 0;
21 24
22 blk_queue_bounce(rq->q, bio); 25 blk_queue_bounce(rq->q, bio);
23 26
27 bio_for_each_bvec(bv, *bio, iter)
28 nr_segs++;
29
24 if (!rq->bio) { 30 if (!rq->bio) {
25 blk_rq_bio_prep(rq->q, rq, *bio); 31 blk_rq_bio_prep(rq, *bio, nr_segs);
26 } else { 32 } else {
27 if (!ll_back_merge_fn(rq->q, rq, *bio)) { 33 if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
28 if (orig_bio != *bio) { 34 if (orig_bio != *bio) {
29 bio_put(*bio); 35 bio_put(*bio);
30 *bio = orig_bio; 36 *bio = orig_bio;
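The bio_for_each_bvec() loop is the crux of this hunk: blk_rq_bio_prep() no longer derives a segment count itself, so blk_rq_append_bio() counts the (multi-page) bvecs up front and hands the result to blk_rq_bio_prep()/ll_back_merge_fn() as nr_segs. The same counting as a standalone sketch, with a hypothetical name:

#include <linux/bio.h>

static unsigned int demo_count_bvecs(struct bio *bio)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	unsigned int nr_segs = 0;

	bio_for_each_bvec(bv, bio, iter)	/* one step per multi-page bvec */
		nr_segs++;

	return nr_segs;
}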
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 17713d7d98d5..57f7990b342d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,7 +105,7 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
105static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, 105static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
106 struct bio *bio, struct bio_set *bs, unsigned *nsegs) 106 struct bio *bio, struct bio_set *bs, unsigned *nsegs)
107{ 107{
108 *nsegs = 1; 108 *nsegs = 0;
109 109
110 if (!q->limits.max_write_zeroes_sectors) 110 if (!q->limits.max_write_zeroes_sectors)
111 return NULL; 111 return NULL;
@@ -202,8 +202,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
202 struct bio_vec bv, bvprv, *bvprvp = NULL; 202 struct bio_vec bv, bvprv, *bvprvp = NULL;
203 struct bvec_iter iter; 203 struct bvec_iter iter;
204 unsigned nsegs = 0, sectors = 0; 204 unsigned nsegs = 0, sectors = 0;
205 bool do_split = true;
206 struct bio *new = NULL;
207 const unsigned max_sectors = get_max_io_size(q, bio); 205 const unsigned max_sectors = get_max_io_size(q, bio);
208 const unsigned max_segs = queue_max_segments(q); 206 const unsigned max_segs = queue_max_segments(q);
209 207
@@ -245,45 +243,36 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
245 } 243 }
246 } 244 }
247 245
248 do_split = false; 246 *segs = nsegs;
247 return NULL;
249split: 248split:
250 *segs = nsegs; 249 *segs = nsegs;
251 250 return bio_split(bio, sectors, GFP_NOIO, bs);
252 if (do_split) {
253 new = bio_split(bio, sectors, GFP_NOIO, bs);
254 if (new)
255 bio = new;
256 }
257
258 return do_split ? new : NULL;
259} 251}
260 252
261void blk_queue_split(struct request_queue *q, struct bio **bio) 253void __blk_queue_split(struct request_queue *q, struct bio **bio,
254 unsigned int *nr_segs)
262{ 255{
263 struct bio *split, *res; 256 struct bio *split;
264 unsigned nsegs;
265 257
266 switch (bio_op(*bio)) { 258 switch (bio_op(*bio)) {
267 case REQ_OP_DISCARD: 259 case REQ_OP_DISCARD:
268 case REQ_OP_SECURE_ERASE: 260 case REQ_OP_SECURE_ERASE:
269 split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs); 261 split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
270 break; 262 break;
271 case REQ_OP_WRITE_ZEROES: 263 case REQ_OP_WRITE_ZEROES:
272 split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs); 264 split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
265 nr_segs);
273 break; 266 break;
274 case REQ_OP_WRITE_SAME: 267 case REQ_OP_WRITE_SAME:
275 split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs); 268 split = blk_bio_write_same_split(q, *bio, &q->bio_split,
269 nr_segs);
276 break; 270 break;
277 default: 271 default:
278 split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs); 272 split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
279 break; 273 break;
280 } 274 }
281 275
282 /* physical segments can be figured out during splitting */
283 res = split ? split : *bio;
284 res->bi_phys_segments = nsegs;
285 bio_set_flag(res, BIO_SEG_VALID);
286
287 if (split) { 276 if (split) {
288 /* there isn't chance to merge the splitted bio */ 277 /* there isn't chance to merge the splitted bio */
289 split->bi_opf |= REQ_NOMERGE; 278 split->bi_opf |= REQ_NOMERGE;
@@ -304,19 +293,25 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
304 *bio = split; 293 *bio = split;
305 } 294 }
306} 295}
296
297void blk_queue_split(struct request_queue *q, struct bio **bio)
298{
299 unsigned int nr_segs;
300
301 __blk_queue_split(q, bio, &nr_segs);
302}
307EXPORT_SYMBOL(blk_queue_split); 303EXPORT_SYMBOL(blk_queue_split);
308 304
309static unsigned int __blk_recalc_rq_segments(struct request_queue *q, 305unsigned int blk_recalc_rq_segments(struct request *rq)
310 struct bio *bio)
311{ 306{
312 unsigned int nr_phys_segs = 0; 307 unsigned int nr_phys_segs = 0;
313 struct bvec_iter iter; 308 struct req_iterator iter;
314 struct bio_vec bv; 309 struct bio_vec bv;
315 310
316 if (!bio) 311 if (!rq->bio)
317 return 0; 312 return 0;
318 313
319 switch (bio_op(bio)) { 314 switch (bio_op(rq->bio)) {
320 case REQ_OP_DISCARD: 315 case REQ_OP_DISCARD:
321 case REQ_OP_SECURE_ERASE: 316 case REQ_OP_SECURE_ERASE:
322 case REQ_OP_WRITE_ZEROES: 317 case REQ_OP_WRITE_ZEROES:
@@ -325,30 +320,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
325 return 1; 320 return 1;
326 } 321 }
327 322
328 for_each_bio(bio) { 323 rq_for_each_bvec(bv, rq, iter)
329 bio_for_each_bvec(bv, bio, iter) 324 bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX);
330 bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX);
331 }
332
333 return nr_phys_segs; 325 return nr_phys_segs;
334} 326}
335 327
336void blk_recalc_rq_segments(struct request *rq)
337{
338 rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
339}
340
341void blk_recount_segments(struct request_queue *q, struct bio *bio)
342{
343 struct bio *nxt = bio->bi_next;
344
345 bio->bi_next = NULL;
346 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
347 bio->bi_next = nxt;
348
349 bio_set_flag(bio, BIO_SEG_VALID);
350}
351
352static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, 328static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
353 struct scatterlist *sglist) 329 struct scatterlist *sglist)
354{ 330{
@@ -519,16 +495,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
519} 495}
520EXPORT_SYMBOL(blk_rq_map_sg); 496EXPORT_SYMBOL(blk_rq_map_sg);
521 497
522static inline int ll_new_hw_segment(struct request_queue *q, 498static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
523 struct request *req, 499 unsigned int nr_phys_segs)
524 struct bio *bio)
525{ 500{
526 int nr_phys_segs = bio_phys_segments(q, bio); 501 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q))
527
528 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
529 goto no_merge; 502 goto no_merge;
530 503
531 if (blk_integrity_merge_bio(q, req, bio) == false) 504 if (blk_integrity_merge_bio(req->q, req, bio) == false)
532 goto no_merge; 505 goto no_merge;
533 506
534 /* 507 /*
@@ -539,12 +512,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
539 return 1; 512 return 1;
540 513
541no_merge: 514no_merge:
542 req_set_nomerge(q, req); 515 req_set_nomerge(req->q, req);
543 return 0; 516 return 0;
544} 517}
545 518
546int ll_back_merge_fn(struct request_queue *q, struct request *req, 519int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
547 struct bio *bio)
548{ 520{
549 if (req_gap_back_merge(req, bio)) 521 if (req_gap_back_merge(req, bio))
550 return 0; 522 return 0;
@@ -553,21 +525,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
553 return 0; 525 return 0;
554 if (blk_rq_sectors(req) + bio_sectors(bio) > 526 if (blk_rq_sectors(req) + bio_sectors(bio) >
555 blk_rq_get_max_sectors(req, blk_rq_pos(req))) { 527 blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
556 req_set_nomerge(q, req); 528 req_set_nomerge(req->q, req);
557 return 0; 529 return 0;
558 } 530 }
559 if (!bio_flagged(req->biotail, BIO_SEG_VALID))
560 blk_recount_segments(q, req->biotail);
561 if (!bio_flagged(bio, BIO_SEG_VALID))
562 blk_recount_segments(q, bio);
563 531
564 return ll_new_hw_segment(q, req, bio); 532 return ll_new_hw_segment(req, bio, nr_segs);
565} 533}
566 534
567int ll_front_merge_fn(struct request_queue *q, struct request *req, 535int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
568 struct bio *bio)
569{ 536{
570
571 if (req_gap_front_merge(req, bio)) 537 if (req_gap_front_merge(req, bio))
572 return 0; 538 return 0;
573 if (blk_integrity_rq(req) && 539 if (blk_integrity_rq(req) &&
@@ -575,15 +541,11 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
575 return 0; 541 return 0;
576 if (blk_rq_sectors(req) + bio_sectors(bio) > 542 if (blk_rq_sectors(req) + bio_sectors(bio) >
577 blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { 543 blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
578 req_set_nomerge(q, req); 544 req_set_nomerge(req->q, req);
579 return 0; 545 return 0;
580 } 546 }
581 if (!bio_flagged(bio, BIO_SEG_VALID))
582 blk_recount_segments(q, bio);
583 if (!bio_flagged(req->bio, BIO_SEG_VALID))
584 blk_recount_segments(q, req->bio);
585 547
586 return ll_new_hw_segment(q, req, bio); 548 return ll_new_hw_segment(req, bio, nr_segs);
587} 549}
588 550
589static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, 551static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
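Taken together, the blk-merge.c changes move segment accounting to the submitter: the bio is split once by __blk_queue_split(), which also reports nr_segs, and that count is passed straight into ll_back_merge_fn()/ll_front_merge_fn(), so the old BIO_SEG_VALID/blk_recount_segments() bookkeeping can go away. A minimal sketch of a caller following the new contract (illustrative only; the in-tree user of this pattern is blk_mq_make_request()):

#include "blk.h"	/* block-internal: __blk_queue_split(), ll_back_merge_fn() */

static bool demo_split_then_back_merge(struct request_queue *q,
				       struct request *rq, struct bio *bio)
{
	unsigned int nr_segs;

	__blk_queue_split(q, &bio, &nr_segs);	/* may replace bio, fills nr_segs */
	return ll_back_merge_fn(rq, bio, nr_segs) != 0;
}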
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3afe327f816f..b3f2ba483992 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -17,7 +17,7 @@
17static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) 17static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
18{ 18{
19 if (stat->nr_samples) { 19 if (stat->nr_samples) {
20 seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", 20 seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
21 stat->nr_samples, stat->mean, stat->min, stat->max); 21 stat->nr_samples, stat->mean, stat->min, stat->max);
22 } else { 22 } else {
23 seq_puts(m, "samples=0"); 23 seq_puts(m, "samples=0");
@@ -29,13 +29,13 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
29 struct request_queue *q = data; 29 struct request_queue *q = data;
30 int bucket; 30 int bucket;
31 31
32 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { 32 for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
33 seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); 33 seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
34 print_stat(m, &q->poll_stat[2*bucket]); 34 print_stat(m, &q->poll_stat[2 * bucket]);
35 seq_puts(m, "\n"); 35 seq_puts(m, "\n");
36 36
37 seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); 37 seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
38 print_stat(m, &q->poll_stat[2*bucket+1]); 38 print_stat(m, &q->poll_stat[2 * bucket + 1]);
39 seq_puts(m, "\n"); 39 seq_puts(m, "\n");
40 } 40 }
41 return 0; 41 return 0;
@@ -261,23 +261,6 @@ static int hctx_flags_show(void *data, struct seq_file *m)
261 return 0; 261 return 0;
262} 262}
263 263
264#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
265static const char *const op_name[] = {
266 REQ_OP_NAME(READ),
267 REQ_OP_NAME(WRITE),
268 REQ_OP_NAME(FLUSH),
269 REQ_OP_NAME(DISCARD),
270 REQ_OP_NAME(SECURE_ERASE),
271 REQ_OP_NAME(ZONE_RESET),
272 REQ_OP_NAME(WRITE_SAME),
273 REQ_OP_NAME(WRITE_ZEROES),
274 REQ_OP_NAME(SCSI_IN),
275 REQ_OP_NAME(SCSI_OUT),
276 REQ_OP_NAME(DRV_IN),
277 REQ_OP_NAME(DRV_OUT),
278};
279#undef REQ_OP_NAME
280
281#define CMD_FLAG_NAME(name) [__REQ_##name] = #name 264#define CMD_FLAG_NAME(name) [__REQ_##name] = #name
282static const char *const cmd_flag_name[] = { 265static const char *const cmd_flag_name[] = {
283 CMD_FLAG_NAME(FAILFAST_DEV), 266 CMD_FLAG_NAME(FAILFAST_DEV),
@@ -341,13 +324,14 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
341int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) 324int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
342{ 325{
343 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; 326 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
344 const unsigned int op = rq->cmd_flags & REQ_OP_MASK; 327 const unsigned int op = req_op(rq);
328 const char *op_str = blk_op_str(op);
345 329
346 seq_printf(m, "%p {.op=", rq); 330 seq_printf(m, "%p {.op=", rq);
347 if (op < ARRAY_SIZE(op_name) && op_name[op]) 331 if (strcmp(op_str, "UNKNOWN") == 0)
348 seq_printf(m, "%s", op_name[op]); 332 seq_printf(m, "%u", op);
349 else 333 else
350 seq_printf(m, "%d", op); 334 seq_printf(m, "%s", op_str);
351 seq_puts(m, ", .cmd_flags="); 335 seq_puts(m, ", .cmd_flags=");
352 blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, 336 blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
353 ARRAY_SIZE(cmd_flag_name)); 337 ARRAY_SIZE(cmd_flag_name));
@@ -779,8 +763,8 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file)
779 763
780 if (attr->show) 764 if (attr->show)
781 return single_release(inode, file); 765 return single_release(inode, file);
782 else 766
783 return seq_release(inode, file); 767 return seq_release(inode, file);
784} 768}
785 769
786static const struct file_operations blk_mq_debugfs_fops = { 770static const struct file_operations blk_mq_debugfs_fops = {
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..c9d183d6c499 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -224,7 +224,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
224} 224}
225 225
226bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, 226bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
227 struct request **merged_request) 227 unsigned int nr_segs, struct request **merged_request)
228{ 228{
229 struct request *rq; 229 struct request *rq;
230 230
@@ -232,7 +232,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
232 case ELEVATOR_BACK_MERGE: 232 case ELEVATOR_BACK_MERGE:
233 if (!blk_mq_sched_allow_merge(q, rq, bio)) 233 if (!blk_mq_sched_allow_merge(q, rq, bio))
234 return false; 234 return false;
235 if (!bio_attempt_back_merge(q, rq, bio)) 235 if (!bio_attempt_back_merge(rq, bio, nr_segs))
236 return false; 236 return false;
237 *merged_request = attempt_back_merge(q, rq); 237 *merged_request = attempt_back_merge(q, rq);
238 if (!*merged_request) 238 if (!*merged_request)
@@ -241,7 +241,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
241 case ELEVATOR_FRONT_MERGE: 241 case ELEVATOR_FRONT_MERGE:
242 if (!blk_mq_sched_allow_merge(q, rq, bio)) 242 if (!blk_mq_sched_allow_merge(q, rq, bio))
243 return false; 243 return false;
244 if (!bio_attempt_front_merge(q, rq, bio)) 244 if (!bio_attempt_front_merge(rq, bio, nr_segs))
245 return false; 245 return false;
246 *merged_request = attempt_front_merge(q, rq); 246 *merged_request = attempt_front_merge(q, rq);
247 if (!*merged_request) 247 if (!*merged_request)
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
260 * of them. 260 * of them.
261 */ 261 */
262bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, 262bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
263 struct bio *bio) 263 struct bio *bio, unsigned int nr_segs)
264{ 264{
265 struct request *rq; 265 struct request *rq;
266 int checked = 8; 266 int checked = 8;
@@ -277,11 +277,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
277 switch (blk_try_merge(rq, bio)) { 277 switch (blk_try_merge(rq, bio)) {
278 case ELEVATOR_BACK_MERGE: 278 case ELEVATOR_BACK_MERGE:
279 if (blk_mq_sched_allow_merge(q, rq, bio)) 279 if (blk_mq_sched_allow_merge(q, rq, bio))
280 merged = bio_attempt_back_merge(q, rq, bio); 280 merged = bio_attempt_back_merge(rq, bio,
281 nr_segs);
281 break; 282 break;
282 case ELEVATOR_FRONT_MERGE: 283 case ELEVATOR_FRONT_MERGE:
283 if (blk_mq_sched_allow_merge(q, rq, bio)) 284 if (blk_mq_sched_allow_merge(q, rq, bio))
284 merged = bio_attempt_front_merge(q, rq, bio); 285 merged = bio_attempt_front_merge(rq, bio,
286 nr_segs);
285 break; 287 break;
286 case ELEVATOR_DISCARD_MERGE: 288 case ELEVATOR_DISCARD_MERGE:
287 merged = bio_attempt_discard_merge(q, rq, bio); 289 merged = bio_attempt_discard_merge(q, rq, bio);
@@ -304,13 +306,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
304 */ 306 */
305static bool blk_mq_attempt_merge(struct request_queue *q, 307static bool blk_mq_attempt_merge(struct request_queue *q,
306 struct blk_mq_hw_ctx *hctx, 308 struct blk_mq_hw_ctx *hctx,
307 struct blk_mq_ctx *ctx, struct bio *bio) 309 struct blk_mq_ctx *ctx, struct bio *bio,
310 unsigned int nr_segs)
308{ 311{
309 enum hctx_type type = hctx->type; 312 enum hctx_type type = hctx->type;
310 313
311 lockdep_assert_held(&ctx->lock); 314 lockdep_assert_held(&ctx->lock);
312 315
313 if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { 316 if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
314 ctx->rq_merged++; 317 ctx->rq_merged++;
315 return true; 318 return true;
316 } 319 }
@@ -318,7 +321,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
318 return false; 321 return false;
319} 322}
320 323
321bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) 324bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
325 unsigned int nr_segs)
322{ 326{
323 struct elevator_queue *e = q->elevator; 327 struct elevator_queue *e = q->elevator;
324 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 328 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
@@ -326,21 +330,18 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
326 bool ret = false; 330 bool ret = false;
327 enum hctx_type type; 331 enum hctx_type type;
328 332
329 if (e && e->type->ops.bio_merge) { 333 if (e && e->type->ops.bio_merge)
330 blk_mq_put_ctx(ctx); 334 return e->type->ops.bio_merge(hctx, bio, nr_segs);
331 return e->type->ops.bio_merge(hctx, bio);
332 }
333 335
334 type = hctx->type; 336 type = hctx->type;
335 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 337 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
336 !list_empty_careful(&ctx->rq_lists[type])) { 338 !list_empty_careful(&ctx->rq_lists[type])) {
337 /* default per sw-queue merge */ 339 /* default per sw-queue merge */
338 spin_lock(&ctx->lock); 340 spin_lock(&ctx->lock);
339 ret = blk_mq_attempt_merge(q, hctx, ctx, bio); 341 ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
340 spin_unlock(&ctx->lock); 342 spin_unlock(&ctx->lock);
341 } 343 }
342 344
343 blk_mq_put_ctx(ctx);
344 return ret; 345 return ret;
345} 346}
346 347
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3cf92cbbd8ac..cf22ab00fefb 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq);
12 12
13void blk_mq_sched_request_inserted(struct request *rq); 13void blk_mq_sched_request_inserted(struct request *rq);
14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, 14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
15 struct request **merged_request); 15 unsigned int nr_segs, struct request **merged_request);
16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); 16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
17 unsigned int nr_segs);
17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); 18bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
18void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); 19void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
19void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 20void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
@@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
31void blk_mq_sched_free_requests(struct request_queue *q); 32void blk_mq_sched_free_requests(struct request_queue *q);
32 33
33static inline bool 34static inline bool
34blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) 35blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
36 unsigned int nr_segs)
35{ 37{
36 if (blk_queue_nomerges(q) || !bio_mergeable(bio)) 38 if (blk_queue_nomerges(q) || !bio_mergeable(bio))
37 return false; 39 return false;
38 40
39 return __blk_mq_sched_bio_merge(q, bio); 41 return __blk_mq_sched_bio_merge(q, bio, nr_segs);
40} 42}
41 43
42static inline bool 44static inline bool
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..da19f0bc8876 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -113,7 +113,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
113 struct sbq_wait_state *ws; 113 struct sbq_wait_state *ws;
114 DEFINE_SBQ_WAIT(wait); 114 DEFINE_SBQ_WAIT(wait);
115 unsigned int tag_offset; 115 unsigned int tag_offset;
116 bool drop_ctx;
117 int tag; 116 int tag;
118 117
119 if (data->flags & BLK_MQ_REQ_RESERVED) { 118 if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -136,7 +135,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
136 return BLK_MQ_TAG_FAIL; 135 return BLK_MQ_TAG_FAIL;
137 136
138 ws = bt_wait_ptr(bt, data->hctx); 137 ws = bt_wait_ptr(bt, data->hctx);
139 drop_ctx = data->ctx == NULL;
140 do { 138 do {
141 struct sbitmap_queue *bt_prev; 139 struct sbitmap_queue *bt_prev;
142 140
@@ -161,9 +159,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
161 if (tag != -1) 159 if (tag != -1)
162 break; 160 break;
163 161
164 if (data->ctx)
165 blk_mq_put_ctx(data->ctx);
166
167 bt_prev = bt; 162 bt_prev = bt;
168 io_schedule(); 163 io_schedule();
169 164
@@ -189,9 +184,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
189 ws = bt_wait_ptr(bt, data->hctx); 184 ws = bt_wait_ptr(bt, data->hctx);
190 } while (1); 185 } while (1);
191 186
192 if (drop_ctx && data->ctx)
193 blk_mq_put_ctx(data->ctx);
194
195 sbitmap_finish_wait(bt, ws, &wait); 187 sbitmap_finish_wait(bt, ws, &wait);
196 188
197found_tag: 189found_tag:
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..e5ef40c603ca 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -355,13 +355,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
355 struct elevator_queue *e = q->elevator; 355 struct elevator_queue *e = q->elevator;
356 struct request *rq; 356 struct request *rq;
357 unsigned int tag; 357 unsigned int tag;
358 bool put_ctx_on_error = false; 358 bool clear_ctx_on_error = false;
359 359
360 blk_queue_enter_live(q); 360 blk_queue_enter_live(q);
361 data->q = q; 361 data->q = q;
362 if (likely(!data->ctx)) { 362 if (likely(!data->ctx)) {
363 data->ctx = blk_mq_get_ctx(q); 363 data->ctx = blk_mq_get_ctx(q);
364 put_ctx_on_error = true; 364 clear_ctx_on_error = true;
365 } 365 }
366 if (likely(!data->hctx)) 366 if (likely(!data->hctx))
367 data->hctx = blk_mq_map_queue(q, data->cmd_flags, 367 data->hctx = blk_mq_map_queue(q, data->cmd_flags,
@@ -387,10 +387,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
387 387
388 tag = blk_mq_get_tag(data); 388 tag = blk_mq_get_tag(data);
389 if (tag == BLK_MQ_TAG_FAIL) { 389 if (tag == BLK_MQ_TAG_FAIL) {
390 if (put_ctx_on_error) { 390 if (clear_ctx_on_error)
391 blk_mq_put_ctx(data->ctx);
392 data->ctx = NULL; 391 data->ctx = NULL;
393 }
394 blk_queue_exit(q); 392 blk_queue_exit(q);
395 return NULL; 393 return NULL;
396 } 394 }
@@ -427,8 +425,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
427 if (!rq) 425 if (!rq)
428 return ERR_PTR(-EWOULDBLOCK); 426 return ERR_PTR(-EWOULDBLOCK);
429 427
430 blk_mq_put_ctx(alloc_data.ctx);
431
432 rq->__data_len = 0; 428 rq->__data_len = 0;
433 rq->__sector = (sector_t) -1; 429 rq->__sector = (sector_t) -1;
434 rq->bio = rq->biotail = NULL; 430 rq->bio = rq->biotail = NULL;
@@ -1764,9 +1760,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1764 } 1760 }
1765} 1761}
1766 1762
1767static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1763static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1764 unsigned int nr_segs)
1768{ 1765{
1769 blk_init_request_from_bio(rq, bio); 1766 if (bio->bi_opf & REQ_RAHEAD)
1767 rq->cmd_flags |= REQ_FAILFAST_MASK;
1768
1769 rq->__sector = bio->bi_iter.bi_sector;
1770 rq->write_hint = bio->bi_write_hint;
1771 blk_rq_bio_prep(rq, bio, nr_segs);
1770 1772
1771 blk_account_io_start(rq, true); 1773 blk_account_io_start(rq, true);
1772} 1774}
@@ -1936,20 +1938,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1936 struct request *rq; 1938 struct request *rq;
1937 struct blk_plug *plug; 1939 struct blk_plug *plug;
1938 struct request *same_queue_rq = NULL; 1940 struct request *same_queue_rq = NULL;
1941 unsigned int nr_segs;
1939 blk_qc_t cookie; 1942 blk_qc_t cookie;
1940 1943
1941 blk_queue_bounce(q, &bio); 1944 blk_queue_bounce(q, &bio);
1942 1945 __blk_queue_split(q, &bio, &nr_segs);
1943 blk_queue_split(q, &bio);
1944 1946
1945 if (!bio_integrity_prep(bio)) 1947 if (!bio_integrity_prep(bio))
1946 return BLK_QC_T_NONE; 1948 return BLK_QC_T_NONE;
1947 1949
1948 if (!is_flush_fua && !blk_queue_nomerges(q) && 1950 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1949 blk_attempt_plug_merge(q, bio, &same_queue_rq)) 1951 blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
1950 return BLK_QC_T_NONE; 1952 return BLK_QC_T_NONE;
1951 1953
1952 if (blk_mq_sched_bio_merge(q, bio)) 1954 if (blk_mq_sched_bio_merge(q, bio, nr_segs))
1953 return BLK_QC_T_NONE; 1955 return BLK_QC_T_NONE;
1954 1956
1955 rq_qos_throttle(q, bio); 1957 rq_qos_throttle(q, bio);
@@ -1969,11 +1971,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1969 1971
1970 cookie = request_to_qc_t(data.hctx, rq); 1972 cookie = request_to_qc_t(data.hctx, rq);
1971 1973
1974 blk_mq_bio_to_request(rq, bio, nr_segs);
1975
1972 plug = current->plug; 1976 plug = current->plug;
1973 if (unlikely(is_flush_fua)) { 1977 if (unlikely(is_flush_fua)) {
1974 blk_mq_put_ctx(data.ctx);
1975 blk_mq_bio_to_request(rq, bio);
1976
1977 /* bypass scheduler for flush rq */ 1978 /* bypass scheduler for flush rq */
1978 blk_insert_flush(rq); 1979 blk_insert_flush(rq);
1979 blk_mq_run_hw_queue(data.hctx, true); 1980 blk_mq_run_hw_queue(data.hctx, true);
@@ -1985,9 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1985 unsigned int request_count = plug->rq_count; 1986 unsigned int request_count = plug->rq_count;
1986 struct request *last = NULL; 1987 struct request *last = NULL;
1987 1988
1988 blk_mq_put_ctx(data.ctx);
1989 blk_mq_bio_to_request(rq, bio);
1990
1991 if (!request_count) 1989 if (!request_count)
1992 trace_block_plug(q); 1990 trace_block_plug(q);
1993 else 1991 else
@@ -2001,8 +1999,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2001 1999
2002 blk_add_rq_to_plug(plug, rq); 2000 blk_add_rq_to_plug(plug, rq);
2003 } else if (plug && !blk_queue_nomerges(q)) { 2001 } else if (plug && !blk_queue_nomerges(q)) {
2004 blk_mq_bio_to_request(rq, bio);
2005
2006 /* 2002 /*
2007 * We do limited plugging. If the bio can be merged, do that. 2003 * We do limited plugging. If the bio can be merged, do that.
2008 * Otherwise the existing request in the plug list will be 2004 * Otherwise the existing request in the plug list will be
@@ -2019,8 +2015,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2019 blk_add_rq_to_plug(plug, rq); 2015 blk_add_rq_to_plug(plug, rq);
2020 trace_block_plug(q); 2016 trace_block_plug(q);
2021 2017
2022 blk_mq_put_ctx(data.ctx);
2023
2024 if (same_queue_rq) { 2018 if (same_queue_rq) {
2025 data.hctx = same_queue_rq->mq_hctx; 2019 data.hctx = same_queue_rq->mq_hctx;
2026 trace_block_unplug(q, 1, true); 2020 trace_block_unplug(q, 1, true);
@@ -2029,12 +2023,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2029 } 2023 }
2030 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && 2024 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
2031 !data.hctx->dispatch_busy)) { 2025 !data.hctx->dispatch_busy)) {
2032 blk_mq_put_ctx(data.ctx);
2033 blk_mq_bio_to_request(rq, bio);
2034 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 2026 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
2035 } else { 2027 } else {
2036 blk_mq_put_ctx(data.ctx);
2037 blk_mq_bio_to_request(rq, bio);
2038 blk_mq_sched_insert_request(rq, false, true, true); 2028 blk_mq_sched_insert_request(rq, false, true, true);
2039 } 2029 }
2040 2030
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..f4bf5161333e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -151,12 +151,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
151 */ 151 */
152static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) 152static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
153{ 153{
154 return __blk_mq_get_ctx(q, get_cpu()); 154 return __blk_mq_get_ctx(q, raw_smp_processor_id());
155}
156
157static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
158{
159 put_cpu();
160} 155}
161 156
162struct blk_mq_alloc_data { 157struct blk_mq_alloc_data {
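With blk_mq_put_ctx() gone, blk_mq_get_ctx() is nothing more than a raw_smp_processor_id() lookup, presumably because the returned ctx only serves as a per-cpu software-queue hint and a later task migration is harmless, so the get_cpu()/put_cpu() pinning and the asymmetric "put" calls removed throughout blk-mq.c above are no longer needed. A hedged sketch of the simplified caller pattern, with a hypothetical helper:

#include "blk-mq.h"	/* block-internal: blk_mq_get_ctx() */

static void demo_peek_sw_queue(struct request_queue *q)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);	/* no preemption pinning */

	/* use ctx only as a hint for which software queue to inspect;
	 * there is no matching blk_mq_put_ctx() to call any more */
	(void)ctx;
}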
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..de6b2e146d6e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,8 +51,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
51 int node, int cmd_size, gfp_t flags); 51 int node, int cmd_size, gfp_t flags);
52void blk_free_flush_queue(struct blk_flush_queue *q); 52void blk_free_flush_queue(struct blk_flush_queue *q);
53 53
54void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
55 struct bio *bio);
56void blk_freeze_queue(struct request_queue *q); 54void blk_freeze_queue(struct request_queue *q);
57 55
58static inline void blk_queue_enter_live(struct request_queue *q) 56static inline void blk_queue_enter_live(struct request_queue *q)
@@ -101,6 +99,18 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
101 return __bvec_gap_to_prev(q, bprv, offset); 99 return __bvec_gap_to_prev(q, bprv, offset);
102} 100}
103 101
102static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
103 unsigned int nr_segs)
104{
105 rq->nr_phys_segments = nr_segs;
106 rq->__data_len = bio->bi_iter.bi_size;
107 rq->bio = rq->biotail = bio;
108 rq->ioprio = bio_prio(bio);
109
110 if (bio->bi_disk)
111 rq->rq_disk = bio->bi_disk;
112}
113
104#ifdef CONFIG_BLK_DEV_INTEGRITY 114#ifdef CONFIG_BLK_DEV_INTEGRITY
105void blk_flush_integrity(void); 115void blk_flush_integrity(void);
106bool __bio_integrity_endio(struct bio *); 116bool __bio_integrity_endio(struct bio *);
@@ -154,14 +164,14 @@ static inline bool bio_integrity_endio(struct bio *bio)
154unsigned long blk_rq_timeout(unsigned long timeout); 164unsigned long blk_rq_timeout(unsigned long timeout);
155void blk_add_timer(struct request *req); 165void blk_add_timer(struct request *req);
156 166
157bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 167bool bio_attempt_front_merge(struct request *req, struct bio *bio,
158 struct bio *bio); 168 unsigned int nr_segs);
159bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 169bool bio_attempt_back_merge(struct request *req, struct bio *bio,
160 struct bio *bio); 170 unsigned int nr_segs);
161bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, 171bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
162 struct bio *bio); 172 struct bio *bio);
163bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 173bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
164 struct request **same_queue_rq); 174 unsigned int nr_segs, struct request **same_queue_rq);
165 175
166void blk_account_io_start(struct request *req, bool new_io); 176void blk_account_io_start(struct request *req, bool new_io);
167void blk_account_io_completion(struct request *req, unsigned int bytes); 177void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -202,15 +212,17 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
202} 212}
203#endif 213#endif
204 214
205int ll_back_merge_fn(struct request_queue *q, struct request *req, 215void __blk_queue_split(struct request_queue *q, struct bio **bio,
206 struct bio *bio); 216 unsigned int *nr_segs);
207int ll_front_merge_fn(struct request_queue *q, struct request *req, 217int ll_back_merge_fn(struct request *req, struct bio *bio,
208 struct bio *bio); 218 unsigned int nr_segs);
219int ll_front_merge_fn(struct request *req, struct bio *bio,
220 unsigned int nr_segs);
209struct request *attempt_back_merge(struct request_queue *q, struct request *rq); 221struct request *attempt_back_merge(struct request_queue *q, struct request *rq);
210struct request *attempt_front_merge(struct request_queue *q, struct request *rq); 222struct request *attempt_front_merge(struct request_queue *q, struct request *rq);
211int blk_attempt_req_merge(struct request_queue *q, struct request *rq, 223int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
212 struct request *next); 224 struct request *next);
213void blk_recalc_rq_segments(struct request *rq); 225unsigned int blk_recalc_rq_segments(struct request *rq);
214void blk_rq_set_mixed_merge(struct request *rq); 226void blk_rq_set_mixed_merge(struct request *rq);
215bool blk_rq_merge_ok(struct request *rq, struct bio *bio); 227bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
216enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); 228enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
diff --git a/block/genhd.c b/block/genhd.c
index 24654e1d83e6..97887e59f3b2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1281,7 +1281,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
1281 struct disk_part_tbl *new_ptbl; 1281 struct disk_part_tbl *new_ptbl;
1282 int len = old_ptbl ? old_ptbl->len : 0; 1282 int len = old_ptbl ? old_ptbl->len : 0;
1283 int i, target; 1283 int i, target;
1284 size_t size;
1285 1284
1286 /* 1285 /*
1287 * check for int overflow, since we can get here from blkpg_ioctl() 1286 * check for int overflow, since we can get here from blkpg_ioctl()
@@ -1298,8 +1297,8 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
1298 if (target <= len) 1297 if (target <= len)
1299 return 0; 1298 return 0;
1300 1299
1301 size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); 1300 new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
1302 new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); 1301 disk->node_id);
1303 if (!new_ptbl) 1302 if (!new_ptbl)
1304 return -ENOMEM; 1303 return -ENOMEM;
1305 1304
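struct_size() (from <linux/overflow.h>) is the overflow-checked equivalent of the removed sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]): on overflow it saturates to SIZE_MAX so the allocation fails cleanly instead of returning a too-small buffer. A tiny self-contained sketch with made-up structure names:

#include <linux/overflow.h>
#include <linux/slab.h>

struct demo_tbl {
	int	len;
	void	*entries[];	/* flexible array member */
};

static struct demo_tbl *demo_alloc_tbl(int nr, int node)
{
	struct demo_tbl *t;

	/* struct_size(t, entries, nr) == sizeof(*t) + nr * sizeof(t->entries[0]),
	 * but saturating instead of wrapping on overflow */
	t = kzalloc_node(struct_size(t, entries, nr), GFP_KERNEL, node);
	return t;
}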
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c3b05119cebd..34dcea0ef637 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -562,7 +562,8 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
562 } 562 }
563} 563}
564 564
565static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) 565static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
566 unsigned int nr_segs)
566{ 567{
567 struct kyber_hctx_data *khd = hctx->sched_data; 568 struct kyber_hctx_data *khd = hctx->sched_data;
568 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); 569 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
@@ -572,9 +573,8 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
572 bool merged; 573 bool merged;
573 574
574 spin_lock(&kcq->lock); 575 spin_lock(&kcq->lock);
575 merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); 576 merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
576 spin_unlock(&kcq->lock); 577 spin_unlock(&kcq->lock);
577 blk_mq_put_ctx(ctx);
578 578
579 return merged; 579 return merged;
580} 580}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1876f5712bfd..b8a682b5a1bb 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -469,7 +469,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
469 return ELEVATOR_NO_MERGE; 469 return ELEVATOR_NO_MERGE;
470} 470}
471 471
472static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) 472static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
473 unsigned int nr_segs)
473{ 474{
474 struct request_queue *q = hctx->queue; 475 struct request_queue *q = hctx->queue;
475 struct deadline_data *dd = q->elevator->elevator_data; 476 struct deadline_data *dd = q->elevator->elevator_data;
@@ -477,7 +478,7 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
477 bool ret; 478 bool ret;
478 479
479 spin_lock(&dd->lock); 480 spin_lock(&dd->lock);
480 ret = blk_mq_sched_try_merge(q, bio, &free); 481 ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
481 spin_unlock(&dd->lock); 482 spin_unlock(&dd->lock);
482 483
483 if (free) 484 if (free)
diff --git a/block/opal_proto.h b/block/opal_proto.h
index d9a05ad02eb5..466ec7be16ef 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -98,6 +98,7 @@ enum opal_uid {
98 OPAL_ENTERPRISE_BANDMASTER0_UID, 98 OPAL_ENTERPRISE_BANDMASTER0_UID,
99 OPAL_ENTERPRISE_ERASEMASTER_UID, 99 OPAL_ENTERPRISE_ERASEMASTER_UID,
100 /* tables */ 100 /* tables */
101 OPAL_TABLE_TABLE,
101 OPAL_LOCKINGRANGE_GLOBAL, 102 OPAL_LOCKINGRANGE_GLOBAL,
102 OPAL_LOCKINGRANGE_ACE_RDLOCKED, 103 OPAL_LOCKINGRANGE_ACE_RDLOCKED,
103 OPAL_LOCKINGRANGE_ACE_WRLOCKED, 104 OPAL_LOCKINGRANGE_ACE_WRLOCKED,
@@ -152,6 +153,21 @@ enum opal_token {
152 OPAL_STARTCOLUMN = 0x03, 153 OPAL_STARTCOLUMN = 0x03,
153 OPAL_ENDCOLUMN = 0x04, 154 OPAL_ENDCOLUMN = 0x04,
154 OPAL_VALUES = 0x01, 155 OPAL_VALUES = 0x01,
156 /* table table */
157 OPAL_TABLE_UID = 0x00,
158 OPAL_TABLE_NAME = 0x01,
159 OPAL_TABLE_COMMON = 0x02,
160 OPAL_TABLE_TEMPLATE = 0x03,
161 OPAL_TABLE_KIND = 0x04,
162 OPAL_TABLE_COLUMN = 0x05,
163 OPAL_TABLE_COLUMNS = 0x06,
164 OPAL_TABLE_ROWS = 0x07,
165 OPAL_TABLE_ROWS_FREE = 0x08,
166 OPAL_TABLE_ROW_BYTES = 0x09,
167 OPAL_TABLE_LASTID = 0x0A,
168 OPAL_TABLE_MIN = 0x0B,
169 OPAL_TABLE_MAX = 0x0C,
170
155 /* authority table */ 171 /* authority table */
156 OPAL_PIN = 0x03, 172 OPAL_PIN = 0x03,
157 /* locking tokens */ 173 /* locking tokens */
diff --git a/block/sed-opal.c b/block/sed-opal.c
index a46e8d13e16d..7e1a444a25b2 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -26,6 +26,9 @@
26#define IO_BUFFER_LENGTH 2048 26#define IO_BUFFER_LENGTH 2048
27#define MAX_TOKS 64 27#define MAX_TOKS 64
28 28
29/* Number of bytes needed by cmd_finalize. */
30#define CMD_FINALIZE_BYTES_NEEDED 7
31
29struct opal_step { 32struct opal_step {
30 int (*fn)(struct opal_dev *dev, void *data); 33 int (*fn)(struct opal_dev *dev, void *data);
31 void *data; 34 void *data;
@@ -127,6 +130,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
127 130
128 /* tables */ 131 /* tables */
129 132
133 [OPAL_TABLE_TABLE]
134 { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
130 [OPAL_LOCKINGRANGE_GLOBAL] = 135 [OPAL_LOCKINGRANGE_GLOBAL] =
131 { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, 136 { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
132 [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = 137 [OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
@@ -523,12 +528,17 @@ static int opal_discovery0_step(struct opal_dev *dev)
523 return execute_step(dev, &discovery0_step, 0); 528 return execute_step(dev, &discovery0_step, 0);
524} 529}
525 530
531static size_t remaining_size(struct opal_dev *cmd)
532{
533 return IO_BUFFER_LENGTH - cmd->pos;
534}
535
526static bool can_add(int *err, struct opal_dev *cmd, size_t len) 536static bool can_add(int *err, struct opal_dev *cmd, size_t len)
527{ 537{
528 if (*err) 538 if (*err)
529 return false; 539 return false;
530 540
531 if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) { 541 if (remaining_size(cmd) < len) {
532 pr_debug("Error adding %zu bytes: end of buffer.\n", len); 542 pr_debug("Error adding %zu bytes: end of buffer.\n", len);
533 *err = -ERANGE; 543 *err = -ERANGE;
534 return false; 544 return false;
@@ -674,7 +684,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
674 struct opal_header *hdr; 684 struct opal_header *hdr;
675 int err = 0; 685 int err = 0;
676 686
677 /* close the parameter list opened from cmd_start */ 687 /*
688 * Close the parameter list opened from cmd_start.
689 * The number of bytes added must be equal to
690 * CMD_FINALIZE_BYTES_NEEDED.
691 */
678 add_token_u8(&err, cmd, OPAL_ENDLIST); 692 add_token_u8(&err, cmd, OPAL_ENDLIST);
679 693
680 add_token_u8(&err, cmd, OPAL_ENDOFDATA); 694 add_token_u8(&err, cmd, OPAL_ENDOFDATA);
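The comment promises that cmd_finalize() appends exactly CMD_FINALIZE_BYTES_NEEDED (7) bytes after this point; the breakdown below is an inference from the surrounding sed-opal code (each listed token is a single byte), not something this hunk spells out:

/*
 * Inferred accounting for CMD_FINALIZE_BYTES_NEEDED == 7:
 *
 *   OPAL_ENDLIST                     1   closes the parameter list
 *   OPAL_ENDOFDATA                   1   ends the data sub-packet
 *   OPAL_STARTLIST, 0, 0, 0,
 *   OPAL_ENDLIST                     5   empty method status list
 *                                   ---
 *                                    7
 */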
@@ -1119,6 +1133,29 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table,
1119 return finalize_and_send(dev, parse_and_check_status); 1133 return finalize_and_send(dev, parse_and_check_status);
1120} 1134}
1121 1135
1136/*
1137 * see TCG SAS 5.3.2.3 for a description of the available columns
1138 *
1139 * the result is provided in dev->resp->tok[4]
1140 */
1141static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table,
1142 u64 column)
1143{
1144 u8 uid[OPAL_UID_LENGTH];
1145 const unsigned int half = OPAL_UID_LENGTH/2;
1146
1147 /* sed-opal UIDs can be split in two halves:
1148 * first: actual table index
1149 * second: relative index in the table
1150 * so we have to get the first half of the OPAL_TABLE_TABLE and use the
1151 * first part of the target table as relative index into that table
1152 */
1153 memcpy(uid, opaluid[OPAL_TABLE_TABLE], half);
1154 memcpy(uid+half, opaluid[table], half);
1155
1156 return generic_get_column(dev, uid, column);
1157}
1158
1122static int gen_key(struct opal_dev *dev, void *data) 1159static int gen_key(struct opal_dev *dev, void *data)
1123{ 1160{
1124 u8 uid[OPAL_UID_LENGTH]; 1161 u8 uid[OPAL_UID_LENGTH];
@@ -1307,6 +1344,7 @@ static int start_generic_opal_session(struct opal_dev *dev,
1307 break; 1344 break;
1308 case OPAL_ADMIN1_UID: 1345 case OPAL_ADMIN1_UID:
1309 case OPAL_SID_UID: 1346 case OPAL_SID_UID:
1347 case OPAL_PSID_UID:
1310 add_token_u8(&err, dev, OPAL_STARTNAME); 1348 add_token_u8(&err, dev, OPAL_STARTNAME);
1311 add_token_u8(&err, dev, 0); /* HostChallenge */ 1349 add_token_u8(&err, dev, 0); /* HostChallenge */
1312 add_token_bytestring(&err, dev, key, key_len); 1350 add_token_bytestring(&err, dev, key, key_len);
@@ -1367,6 +1405,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
1367 key->key, key->key_len); 1405 key->key, key->key_len);
1368} 1406}
1369 1407
1408static int start_PSID_opal_session(struct opal_dev *dev, void *data)
1409{
1410 const struct opal_key *okey = data;
1411
1412 return start_generic_opal_session(dev, OPAL_PSID_UID,
1413 OPAL_ADMINSP_UID,
1414 okey->key,
1415 okey->key_len);
1416}
1417
1370static int start_auth_opal_session(struct opal_dev *dev, void *data) 1418static int start_auth_opal_session(struct opal_dev *dev, void *data)
1371{ 1419{
1372 struct opal_session_info *session = data; 1420 struct opal_session_info *session = data;
@@ -1525,6 +1573,72 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
         return finalize_and_send(dev, parse_and_check_status);
 }
 
+static int write_shadow_mbr(struct opal_dev *dev, void *data)
+{
+        struct opal_shadow_mbr *shadow = data;
+        const u8 __user *src;
+        u8 *dst;
+        size_t off = 0;
+        u64 len;
+        int err = 0;
+
+        /* do we fit in the available shadow mbr space? */
+        err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS);
+        if (err) {
+                pr_debug("MBR: could not get shadow size\n");
+                return err;
+        }
+
+        len = response_get_u64(&dev->parsed, 4);
+        if (shadow->size > len || shadow->offset > len - shadow->size) {
+                pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n",
+                         shadow->offset + shadow->size, len);
+                return -ENOSPC;
+        }
+
+        /* do the actual transmission(s) */
+        src = (u8 __user *)(uintptr_t)shadow->data;
+        while (off < shadow->size) {
+                err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]);
+                add_token_u8(&err, dev, OPAL_STARTNAME);
+                add_token_u8(&err, dev, OPAL_WHERE);
+                add_token_u64(&err, dev, shadow->offset + off);
+                add_token_u8(&err, dev, OPAL_ENDNAME);
+
+                add_token_u8(&err, dev, OPAL_STARTNAME);
+                add_token_u8(&err, dev, OPAL_VALUES);
+
+                /*
+                 * The bytestring header is either 1 or 2 bytes, so assume 2.
+                 * There also needs to be enough space to accommodate the
+                 * trailing OPAL_ENDNAME (1 byte) and tokens added by
+                 * cmd_finalize.
+                 */
+                len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED),
+                          (size_t)(shadow->size - off));
+                pr_debug("MBR: write bytes %zu+%llu/%llu\n",
+                         off, len, shadow->size);
+
+                dst = add_bytestring_header(&err, dev, len);
+                if (!dst)
+                        break;
+                if (copy_from_user(dst, src + off, len))
+                        err = -EFAULT;
+                dev->pos += len;
+
+                add_token_u8(&err, dev, OPAL_ENDNAME);
+                if (err)
+                        break;
+
+                err = finalize_and_send(dev, parse_and_check_status);
+                if (err)
+                        break;
+
+                off += len;
+        }
+        return err;
+}
+
 static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
                           struct opal_dev *dev)
 {
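The sizing logic in write_shadow_mbr() above carves the image into as many OPAL_SET commands as needed: each pass reserves 2 bytes for the bytestring header, 1 byte for the trailing OPAL_ENDNAME and CMD_FINALIZE_BYTES_NEEDED for cmd_finalize(), then ships whatever payload still fits in the command buffer. A minimal sketch of that arithmetic follows; the helper name and both size parameters are illustrative stand-ins, not values taken from sed-opal.c:

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Illustrative only: estimate how many OPAL_SET commands a shadow-MBR
     * image of image_size bytes needs, given the free space one command
     * buffer reports and the per-chunk reserve described in the comment
     * above (2-byte bytestring header + 1 ENDNAME byte + finalize bytes).
     */
    static uint64_t shadow_mbr_set_count(uint64_t image_size, size_t remaining,
                                         size_t finalize_reserve)
    {
        size_t payload = remaining - (2 + 1 + finalize_reserve);

        return (image_size + payload - 1) / payload; /* ceil division */
    }
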
@@ -1978,6 +2092,50 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
         return ret;
 }
 
+static int opal_set_mbr_done(struct opal_dev *dev,
+                             struct opal_mbr_done *mbr_done)
+{
+        u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ?
+                OPAL_TRUE : OPAL_FALSE;
+
+        const struct opal_step mbr_steps[] = {
+                { start_admin1LSP_opal_session, &mbr_done->key },
+                { set_mbr_done, &mbr_done_tf },
+                { end_opal_session, }
+        };
+        int ret;
+
+        if (mbr_done->done_flag != OPAL_MBR_DONE &&
+            mbr_done->done_flag != OPAL_MBR_NOT_DONE)
+                return -EINVAL;
+
+        mutex_lock(&dev->dev_lock);
+        setup_opal_dev(dev);
+        ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+        mutex_unlock(&dev->dev_lock);
+        return ret;
+}
+
+static int opal_write_shadow_mbr(struct opal_dev *dev,
+                                 struct opal_shadow_mbr *info)
+{
+        const struct opal_step mbr_steps[] = {
+                { start_admin1LSP_opal_session, &info->key },
+                { write_shadow_mbr, info },
+                { end_opal_session, }
+        };
+        int ret;
+
+        if (info->size == 0)
+                return 0;
+
+        mutex_lock(&dev->dev_lock);
+        setup_opal_dev(dev);
+        ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+        mutex_unlock(&dev->dev_lock);
+        return ret;
+}
+
 static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 {
         struct opal_suspend_data *suspend;
@@ -2030,17 +2188,28 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
         return ret;
 }
 
-static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid)
 {
+        /* controller will terminate session */
         const struct opal_step revert_steps[] = {
                 { start_SIDASP_opal_session, opal },
-                { revert_tper, } /* controller will terminate session */
+                { revert_tper, }
+        };
+        const struct opal_step psid_revert_steps[] = {
+                { start_PSID_opal_session, opal },
+                { revert_tper, }
         };
+
         int ret;
 
         mutex_lock(&dev->dev_lock);
         setup_opal_dev(dev);
-        ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps));
+        if (psid)
+                ret = execute_steps(dev, psid_revert_steps,
+                                    ARRAY_SIZE(psid_revert_steps));
+        else
+                ret = execute_steps(dev, revert_steps,
+                                    ARRAY_SIZE(revert_steps));
         mutex_unlock(&dev->dev_lock);
 
         /*
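With the new psid flag, opal_reverttper() runs the same revert_tper step but opens the session as the PSID authority against the Admin SP (start_PSID_opal_session above), which is what the IOC_OPAL_PSID_REVERT_TPR case added further down in sed_ioctl() dispatches to. A hedged userspace sketch of driving that path, assuming the 5.3 <linux/sed-opal.h> definitions; the device path and PSID string are placeholders, and the opal_key fields (key, key_len) follow how this patch dereferences them:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/sed-opal.h>

    /*
     * Illustrative only: a PSID revert factory-resets the TPer and destroys
     * all data; the PSID value is printed on the drive label.
     */
    static int psid_revert(const char *dev_path, const char *psid)
    {
        struct opal_key key = { 0 };
        int fd, ret;

        key.key_len = strlen(psid);
        memcpy(key.key, psid, key.key_len);

        fd = open(dev_path, O_RDWR);
        if (fd < 0)
            return -1;
        ret = ioctl(fd, IOC_OPAL_PSID_REVERT_TPR, &key);
        close(fd);
        return ret;
    }
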
@@ -2092,8 +2261,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
 {
         int ret;
 
-        if (lk_unlk->session.who < OPAL_ADMIN1 ||
-            lk_unlk->session.who > OPAL_USER9)
+        if (lk_unlk->session.who > OPAL_USER9)
                 return -EINVAL;
 
         mutex_lock(&dev->dev_lock);
@@ -2171,9 +2339,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
         };
         int ret;
 
-        if (opal_pw->session.who < OPAL_ADMIN1 ||
-            opal_pw->session.who > OPAL_USER9 ||
-            opal_pw->new_user_pw.who < OPAL_ADMIN1 ||
+        if (opal_pw->session.who > OPAL_USER9 ||
             opal_pw->new_user_pw.who > OPAL_USER9)
                 return -EINVAL;
 
@@ -2280,7 +2446,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
                 ret = opal_activate_user(dev, p);
                 break;
         case IOC_OPAL_REVERT_TPR:
-                ret = opal_reverttper(dev, p);
+                ret = opal_reverttper(dev, p, false);
                 break;
         case IOC_OPAL_LR_SETUP:
                 ret = opal_setup_locking_range(dev, p);
@@ -2291,12 +2457,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
         case IOC_OPAL_ENABLE_DISABLE_MBR:
                 ret = opal_enable_disable_shadow_mbr(dev, p);
                 break;
+        case IOC_OPAL_MBR_DONE:
+                ret = opal_set_mbr_done(dev, p);
+                break;
+        case IOC_OPAL_WRITE_SHADOW_MBR:
+                ret = opal_write_shadow_mbr(dev, p);
+                break;
         case IOC_OPAL_ERASE_LR:
                 ret = opal_erase_locking_range(dev, p);
                 break;
         case IOC_OPAL_SECURE_ERASE_LR:
                 ret = opal_secure_erase_locking_range(dev, p);
                 break;
+        case IOC_OPAL_PSID_REVERT_TPR:
+                ret = opal_reverttper(dev, p, true);
+                break;
         default:
                 break;
         }
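Taken together, the two MBR cases added above give userspace a way to upload a pre-boot authentication image into the shadow MBR and then flip the done flag, all through the block device's sed_ioctl() path. A hedged sketch of that sequence, assuming the 5.3 <linux/sed-opal.h> definitions; the open file descriptor, image buffer and password handling are placeholders, and the struct fields mirror how the kernel dereferences struct opal_shadow_mbr and struct opal_mbr_done in this patch:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/sed-opal.h>

    /* Illustrative only: upload a shadow-MBR image, then mark the MBR done. */
    static int load_shadow_mbr(int fd, const void *img, uint64_t img_len,
                               const char *pw, uint8_t pw_len)
    {
        /*
         * .data carries the user buffer as a 64-bit value; the kernel's
         * write_shadow_mbr() copies it in command-buffer-sized chunks.
         */
        struct opal_shadow_mbr shadow = {
            .data = (uintptr_t)img,
            .offset = 0,
            .size = img_len,
        };
        struct opal_mbr_done done = { .done_flag = OPAL_MBR_DONE };

        /* Admin1 credential for the Locking SP (assumed opal_key layout). */
        memcpy(shadow.key.key, pw, pw_len);
        shadow.key.key_len = pw_len;
        memcpy(done.key.key, pw, pw_len);
        done.key.key_len = pw_len;

        if (ioctl(fd, IOC_OPAL_WRITE_SHADOW_MBR, &shadow) < 0)
            return -1;
        return ioctl(fd, IOC_OPAL_MBR_DONE, &done);
    }

A PSID revert would be driven the same way through IOC_OPAL_PSID_REVERT_TPR with a bare struct opal_key, as sketched earlier.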