author    Linus Torvalds <torvalds@linux-foundation.org>    2017-11-14 18:32:19 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2017-11-14 18:32:19 -0500
commit    e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
tree      b97a90170c45211bcc437761653aa8016c34afcd /drivers/md/bcache/writeback.c
parent    abc36be236358162202e86ad88616ff95a755101 (diff)
parent    a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the hole on blk/scsi-mq queue
     quiescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and ability to move bio chains around.

   - NVMe
       - Support for native multipath for NVMe (Christoph).
       - Userspace notifications for AENs (Keith).
       - Command side-effects support (Keith).
       - SGL support (Chaitanya Kulkarni)
       - FC fixes and improvements (James Smart)
       - Lots of fixes and tweaks (Various)

   - bcache
       - New maintainer (Michael Lyle)
       - Writeback control improvements (Michael)
       - Various fixes (Coly, Elena, Eric, Liang, et al)

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)

   - Writeback series that fix the much discussed hundreds of millions
     of sync-all units. This goes all the way, as discussed previously
     (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tuple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and on
     mq (where it's called mq-deadline). (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enable us to treat flush requests on blk-mq
     like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core
     and driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...
Diffstat (limited to 'drivers/md/bcache/writeback.c')
-rw-r--r--    drivers/md/bcache/writeback.c    117
1 file changed, 71 insertions(+), 46 deletions(-)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 70454f2ad2fa..56a37884ca8b 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc)
 		bcache_flash_devs_sectors_dirty(c);
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
-
 	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
 				   c->cached_dev_sectors);
 
-	/* PD controller */
-
+	/*
+	 * PI controller:
+	 * Figures out the amount that should be written per second.
+	 *
+	 * First, the error (number of sectors that are dirty beyond our
+	 * target) is calculated.  The error is accumulated (numerically
+	 * integrated).
+	 *
+	 * Then, the proportional value and integral value are scaled
+	 * based on configured values.  These are stored as inverses to
+	 * avoid fixed point math and to make configuration easy-- e.g.
+	 * the default value of 40 for writeback_rate_p_term_inverse
+	 * attempts to write at a rate that would retire all the dirty
+	 * blocks in 40 seconds.
+	 *
+	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
+	 * of the error is accumulated in the integral term per second.
+	 * This acts as a slow, long-term average that is not subject to
+	 * variations in usage like the p term.
+	 */
 	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
-	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
-	int64_t proportional = dirty - target;
-	int64_t change;
-
-	dc->disk.sectors_dirty_last = dirty;
-
-	/* Scale to sectors per second */
-
-	proportional *= dc->writeback_rate_update_seconds;
-	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-
-	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-
-	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-			      (dc->writeback_rate_d_term /
-			       dc->writeback_rate_update_seconds) ?: 1, 0);
-
-	derivative *= dc->writeback_rate_d_term;
-	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-
-	change = proportional + derivative;
-
-	/* Don't increase writeback rate if the device isn't keeping up */
-	if (change > 0 &&
-	    time_after64(local_clock(),
-			 dc->writeback_rate.next + NSEC_PER_MSEC))
-		change = 0;
-
-	dc->writeback_rate.rate =
-		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
-			1, NSEC_PER_MSEC);
+	int64_t error = dirty - target;
+	int64_t proportional_scaled =
+		div_s64(error, dc->writeback_rate_p_term_inverse);
+	int64_t integral_scaled;
+	uint32_t new_rate;
+
+	if ((error < 0 && dc->writeback_rate_integral > 0) ||
+	    (error > 0 && time_before64(local_clock(),
+			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
+		/*
+		 * Only decrease the integral term if it's more than
+		 * zero.  Only increase the integral term if the device
+		 * is keeping up.  (Don't wind up the integral
+		 * ineffectively in either case).
+		 *
+		 * It's necessary to scale this by
+		 * writeback_rate_update_seconds to keep the integral
+		 * term dimensioned properly.
+		 */
+		dc->writeback_rate_integral += error *
+			dc->writeback_rate_update_seconds;
+	}
+
+	integral_scaled = div_s64(dc->writeback_rate_integral,
+			dc->writeback_rate_i_term_inverse);
+
+	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
+			dc->writeback_rate_minimum, NSEC_PER_SEC);
 
-	dc->writeback_rate_proportional = proportional;
-	dc->writeback_rate_derivative = derivative;
-	dc->writeback_rate_change = change;
+	dc->writeback_rate_proportional = proportional_scaled;
+	dc->writeback_rate_integral_scaled = integral_scaled;
+	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+	dc->writeback_rate.rate = new_rate;
 	dc->writeback_rate_target = target;
 }
 
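[Editor's note: the new comment block above describes the PI controller in prose, and the arithmetic is easy to check outside the kernel. Below is a minimal user-space C sketch of the update step, assuming the defaults this patch sets; the names mirror the patch, but the simplified types, the keeping_up flag, and the test values are illustrative, not kernel code.]

/*
 * Standalone model of the PI update (a sketch, not the kernel code).
 * Constants are the defaults set by this patch; test values are made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define P_TERM_INVERSE  40      /* writeback_rate_p_term_inverse default */
#define I_TERM_INVERSE  10000   /* writeback_rate_i_term_inverse default */
#define UPDATE_SECONDS  5       /* writeback_rate_update_seconds default */
#define RATE_MINIMUM    8       /* writeback_rate_minimum default */

static int64_t integral;        /* models dc->writeback_rate_integral */

static int64_t update_rate(int64_t dirty, int64_t target, bool keeping_up)
{
        int64_t error = dirty - target;
        int64_t proportional_scaled = error / P_TERM_INVERSE;

        /*
         * Anti-windup, as in the patch: unwind a positive integral on
         * negative error, and accumulate positive error only while the
         * device is keeping up with the configured rate.
         */
        if ((error < 0 && integral > 0) || (error > 0 && keeping_up))
                integral += error * UPDATE_SECONDS;

        int64_t rate = proportional_scaled + integral / I_TERM_INVERSE;
        return rate < RATE_MINIMUM ? RATE_MINIMUM : rate;
}

int main(void)
{
        /*
         * 400000 sectors over target: the p term alone asks for
         * 400000 / 40 = 10000 sectors/s, i.e. retire the excess in
         * 40 seconds; one update adds 400000 * 5 / 10000 = 200 more
         * via the integral term, so this prints 10200 sectors/s.
         */
        printf("%lld sectors/s\n",
               (long long)update_rate(500000, 100000, true));
        return 0;
}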
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl)
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 
-	dirty_init(w);
-	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
-	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
-	bio_set_dev(&io->bio, io->dc->bdev);
-	io->bio.bi_end_io = dirty_endio;
+	/*
+	 * IO errors are signalled using the dirty bit on the key.
+	 * If we failed to read, we should not attempt to write to the
+	 * backing device.  Instead, immediately go to write_dirty_finish
+	 * to clean up.
+	 */
+	if (KEY_DIRTY(&w->key)) {
+		dirty_init(w);
+		bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+		bio_set_dev(&io->bio, io->dc->bdev);
+		io->bio.bi_end_io = dirty_endio;
 
-	closure_bio_submit(&io->bio, cl);
+		closure_bio_submit(&io->bio, cl);
+	}
 
 	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
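[Editor's note: the guard added here relies on a convention worth spelling out: the read path clears the key's dirty bit when the cache read fails, so write_dirty() can test one flag instead of carrying a separate error code. A user-space sketch of that convention, with made-up types and names, might look like:]

#include <stdbool.h>
#include <stdio.h>

struct model_key {
        bool dirty;             /* set while data still needs writeback */
};

static void model_read_endio(struct model_key *k, int error)
{
        if (error)
                k->dirty = false;  /* failed read: mark key "clean" so no write follows */
}

static void model_write_dirty(struct model_key *k)
{
        if (k->dirty)
                printf("submitting write to backing device\n");
        else
                printf("read failed earlier: skip write, go straight to cleanup\n");
}

int main(void)
{
        struct model_key k = { .dirty = true };
        model_read_endio(&k, -5);   /* simulate -EIO on the cache read */
        model_write_dirty(&k);      /* no write is issued */
        return 0;
}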
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg)
 	struct cached_dev *dc = arg;
 	bool searched_full_index;
 
+	bch_ratelimit_reset(&dc->writeback_rate);
+
 	while (!kthread_should_stop()) {
 		down_write(&dc->writeback_lock);
 		if (!atomic_read(&dc->has_dirty) ||
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg)
 
 		up_write(&dc->writeback_lock);
 
-		bch_ratelimit_reset(&dc->writeback_rate);
 		read_dirty(dc);
 
 		if (searched_full_index) {
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg)
 			       !kthread_should_stop() &&
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 				delay = schedule_timeout_interruptible(delay);
+
+			bch_ratelimit_reset(&dc->writeback_rate);
 		}
 	}
 
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
 
 	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
-
-	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_percent = 10;
 	dc->writeback_delay = 30;
 	dc->writeback_rate.rate = 1024;
+	dc->writeback_rate_minimum = 8;
 
 	dc->writeback_rate_update_seconds = 5;
-	dc->writeback_rate_d_term = 30;
-	dc->writeback_rate_p_term_inverse = 6000;
+	dc->writeback_rate_p_term_inverse = 40;
+	dc->writeback_rate_i_term_inverse = 10000;
 
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
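[Editor's note: a worked example of these new defaults, with illustrative numbers: with writeback_rate_p_term_inverse = 40, an excess of 40960 dirty sectors (20 MiB at 512-byte sectors) yields a proportional term of 40960 / 40 = 1024 sectors/s, i.e. the excess would be retired in 40 seconds. With writeback_rate_i_term_inverse = 10000 and writeback_rate_update_seconds = 5, the same sustained error adds 40960 * 5 / 10000 = 20 sectors/s to the integral contribution per update, and writeback_rate_minimum = 8 keeps a small trickle of writeback flowing even when the computed rate would otherwise fall to zero.]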