diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 18:32:19 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 18:32:19 -0500 |
commit | e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch) | |
tree | b97a90170c45211bcc437761653aa8016c34afcd /drivers/md/bcache/writeback.c | |
parent | abc36be236358162202e86ad88616ff95a755101 (diff) | |
parent | a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff) |
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
"This is the main pull request for block storage for 4.15-rc1.
Nothing out of the ordinary in here, and no API changes or anything
like that. Just various new features for drivers, core changes, etc.
In particular, this pull request contains:
- A patch series from Bart, closing the hole on blk/scsi-mq queue
quiescing.
- A series from Christoph, building towards hidden gendisks (for
multipath) and ability to move bio chains around.
- NVMe
- Support for native multipath for NVMe (Christoph).
- Userspace notifications for AENs (Keith).
- Command side-effects support (Keith).
- SGL support (Chaitanya Kulkarni)
- FC fixes and improvements (James Smart)
- Lots of fixes and tweaks (Various)
- bcache
- New maintainer (Michael Lyle)
- Writeback control improvements (Michael)
- Various fixes (Coly, Elena, Eric, Liang, et al)
- lightnvm updates, mostly centered around the pblk interface
(Javier, Hans, and Rakesh).
- Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)
- Writeback series that fix the much discussed hundreds of millions
of sync-all units. This goes all the way, as discussed previously
(me).
- Fix for missing wakeup on writeback timer adjustments (Yafang
Shao).
- Fix laptop mode on blk-mq (me).
- {mq,name} tuple lookup for IO schedulers, allowing us to have
alias names. This means you can use 'deadline' on both !mq and on
mq (where it's called mq-deadline). (me).
- blktrace race fix, oopsing on sg load (me).
- blk-mq optimizations (me).
- Obscure waitqueue race fix for kyber (Omar).
- NBD fixes (Josef).
- Disable writeback throttling by default on bfq, like we do on cfq
(Luca Miccio).
- Series from Ming that enable us to treat flush requests on blk-mq
like any other request. This is a really nice cleanup.
- Series from Ming that improves merging on blk-mq with schedulers,
getting us closer to flipping the switch on scsi-mq again.
- BFQ updates (Paolo).
- blk-mq atomic flags memory ordering fixes (Peter Z).
- Loop cgroup support (Shaohua).
- Lots of minor fixes from lots of different folks, both for core and
driver code"
* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
nvme: fix visibility of "uuid" ns attribute
blk-mq: fixup some comment typos and lengths
ide: ide-atapi: fix compile error with defining macro DEBUG
blk-mq: improve tag waiting setup for non-shared tags
brd: remove unused brd_mutex
blk-mq: only run the hardware queue if IO is pending
block: avoid null pointer dereference on null disk
fs: guard_bio_eod() needs to consider partitions
xtensa/simdisk: fix compile error
nvme: expose subsys attribute to sysfs
nvme: create 'slaves' and 'holders' entries for hidden controllers
block: create 'slaves' and 'holders' entries for hidden gendisks
nvme: also expose the namespace identification sysfs files for mpath nodes
nvme: implement multipath access to nvme subsystems
nvme: track shared namespaces
nvme: introduce a nvme_ns_ids structure
nvme: track subsystems
block, nvme: Introduce blk_mq_req_flags_t
block, scsi: Make SCSI quiesce and resume work reliably
block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
...
Diffstat (limited to 'drivers/md/bcache/writeback.c')
-rw-r--r-- | drivers/md/bcache/writeback.c | 117 |
1 file changed, 71 insertions, 46 deletions
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 70454f2ad2fa..56a37884ca8b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
26 | bcache_flash_devs_sectors_dirty(c); | 26 | bcache_flash_devs_sectors_dirty(c); |
27 | uint64_t cache_dirty_target = | 27 | uint64_t cache_dirty_target = |
28 | div_u64(cache_sectors * dc->writeback_percent, 100); | 28 | div_u64(cache_sectors * dc->writeback_percent, 100); |
29 | |||
30 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | 29 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), |
31 | c->cached_dev_sectors); | 30 | c->cached_dev_sectors); |
32 | 31 | ||
33 | /* PD controller */ | 32 | /* |
34 | 33 | * PI controller: | |
34 | * Figures out the amount that should be written per second. | ||
35 | * | ||
36 | * First, the error (number of sectors that are dirty beyond our | ||
37 | * target) is calculated. The error is accumulated (numerically | ||
38 | * integrated). | ||
39 | * | ||
40 | * Then, the proportional value and integral value are scaled | ||
41 | * based on configured values. These are stored as inverses to | ||
42 | * avoid fixed point math and to make configuration easy-- e.g. | ||
43 | * the default value of 40 for writeback_rate_p_term_inverse | ||
44 | * attempts to write at a rate that would retire all the dirty | ||
45 | * blocks in 40 seconds. | ||
46 | * | ||
47 | * The writeback_rate_i_inverse value of 10000 means that 1/10000th | ||
48 | * of the error is accumulated in the integral term per second. | ||
49 | * This acts as a slow, long-term average that is not subject to | ||
50 | * variations in usage like the p term. | ||
51 | */ | ||
35 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); | 52 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
36 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 53 | int64_t error = dirty - target; |
37 | int64_t proportional = dirty - target; | 54 | int64_t proportional_scaled = |
38 | int64_t change; | 55 | div_s64(error, dc->writeback_rate_p_term_inverse); |
39 | 56 | int64_t integral_scaled; | |
40 | dc->disk.sectors_dirty_last = dirty; | 57 | uint32_t new_rate; |
41 | 58 | ||
42 | /* Scale to sectors per second */ | 59 | if ((error < 0 && dc->writeback_rate_integral > 0) || |
43 | 60 | (error > 0 && time_before64(local_clock(), | |
44 | proportional *= dc->writeback_rate_update_seconds; | 61 | dc->writeback_rate.next + NSEC_PER_MSEC))) { |
45 | proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); | 62 | /* |
46 | 63 | * Only decrease the integral term if it's more than | |
47 | derivative = div_s64(derivative, dc->writeback_rate_update_seconds); | 64 | * zero. Only increase the integral term if the device |
48 | 65 | * is keeping up. (Don't wind up the integral | |
49 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | 66 | * ineffectively in either case). |
50 | (dc->writeback_rate_d_term / | 67 | * |
51 | dc->writeback_rate_update_seconds) ?: 1, 0); | 68 | * It's necessary to scale this by |
52 | 69 | * writeback_rate_update_seconds to keep the integral | |
53 | derivative *= dc->writeback_rate_d_term; | 70 | * term dimensioned properly. |
54 | derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); | 71 | */ |
55 | 72 | dc->writeback_rate_integral += error * | |
56 | change = proportional + derivative; | 73 | dc->writeback_rate_update_seconds; |
74 | } | ||
57 | 75 | ||
58 | /* Don't increase writeback rate if the device isn't keeping up */ | 76 | integral_scaled = div_s64(dc->writeback_rate_integral, |
59 | if (change > 0 && | 77 | dc->writeback_rate_i_term_inverse); |
60 | time_after64(local_clock(), | ||
61 | dc->writeback_rate.next + NSEC_PER_MSEC)) | ||
62 | change = 0; | ||
63 | 78 | ||
64 | dc->writeback_rate.rate = | 79 | new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), |
65 | clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, | 80 | dc->writeback_rate_minimum, NSEC_PER_SEC); |
66 | 1, NSEC_PER_MSEC); | ||
67 | 81 | ||
68 | dc->writeback_rate_proportional = proportional; | 82 | dc->writeback_rate_proportional = proportional_scaled; |
69 | dc->writeback_rate_derivative = derivative; | 83 | dc->writeback_rate_integral_scaled = integral_scaled; |
70 | dc->writeback_rate_change = change; | 84 | dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; |
85 | dc->writeback_rate.rate = new_rate; | ||
71 | dc->writeback_rate_target = target; | 86 | dc->writeback_rate_target = target; |
72 | } | 87 | } |
73 | 88 | ||
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl) | |||
180 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 195 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
181 | struct keybuf_key *w = io->bio.bi_private; | 196 | struct keybuf_key *w = io->bio.bi_private; |
182 | 197 | ||
183 | dirty_init(w); | 198 | /* |
184 | bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); | 199 | * IO errors are signalled using the dirty bit on the key. |
185 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); | 200 | * If we failed to read, we should not attempt to write to the |
186 | bio_set_dev(&io->bio, io->dc->bdev); | 201 | * backing device. Instead, immediately go to write_dirty_finish |
187 | io->bio.bi_end_io = dirty_endio; | 202 | * to clean up. |
203 | */ | ||
204 | if (KEY_DIRTY(&w->key)) { | ||
205 | dirty_init(w); | ||
206 | bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); | ||
207 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); | ||
208 | bio_set_dev(&io->bio, io->dc->bdev); | ||
209 | io->bio.bi_end_io = dirty_endio; | ||
188 | 210 | ||
189 | closure_bio_submit(&io->bio, cl); | 211 | closure_bio_submit(&io->bio, cl); |
212 | } | ||
190 | 213 | ||
191 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); | 214 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
192 | } | 215 | } |
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg) | |||
418 | struct cached_dev *dc = arg; | 441 | struct cached_dev *dc = arg; |
419 | bool searched_full_index; | 442 | bool searched_full_index; |
420 | 443 | ||
444 | bch_ratelimit_reset(&dc->writeback_rate); | ||
445 | |||
421 | while (!kthread_should_stop()) { | 446 | while (!kthread_should_stop()) { |
422 | down_write(&dc->writeback_lock); | 447 | down_write(&dc->writeback_lock); |
423 | if (!atomic_read(&dc->has_dirty) || | 448 | if (!atomic_read(&dc->has_dirty) || |
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg) | |||
445 | 470 | ||
446 | up_write(&dc->writeback_lock); | 471 | up_write(&dc->writeback_lock); |
447 | 472 | ||
448 | bch_ratelimit_reset(&dc->writeback_rate); | ||
449 | read_dirty(dc); | 473 | read_dirty(dc); |
450 | 474 | ||
451 | if (searched_full_index) { | 475 | if (searched_full_index) { |
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg) | |||
455 | !kthread_should_stop() && | 479 | !kthread_should_stop() && |
456 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) | 480 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
457 | delay = schedule_timeout_interruptible(delay); | 481 | delay = schedule_timeout_interruptible(delay); |
482 | |||
483 | bch_ratelimit_reset(&dc->writeback_rate); | ||
458 | } | 484 | } |
459 | } | 485 | } |
460 | 486 | ||
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) | |||
492 | 518 | ||
493 | bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), | 519 | bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), |
494 | sectors_dirty_init_fn, 0); | 520 | sectors_dirty_init_fn, 0); |
495 | |||
496 | d->sectors_dirty_last = bcache_dev_sectors_dirty(d); | ||
497 | } | 521 | } |
498 | 522 | ||
499 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 523 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) | |||
507 | dc->writeback_percent = 10; | 531 | dc->writeback_percent = 10; |
508 | dc->writeback_delay = 30; | 532 | dc->writeback_delay = 30; |
509 | dc->writeback_rate.rate = 1024; | 533 | dc->writeback_rate.rate = 1024; |
534 | dc->writeback_rate_minimum = 8; | ||
510 | 535 | ||
511 | dc->writeback_rate_update_seconds = 5; | 536 | dc->writeback_rate_update_seconds = 5; |
512 | dc->writeback_rate_d_term = 30; | 537 | dc->writeback_rate_p_term_inverse = 40; |
513 | dc->writeback_rate_p_term_inverse = 6000; | 538 | dc->writeback_rate_i_term_inverse = 10000; |
514 | 539 | ||
515 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 540 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
516 | } | 541 | } |