author		Coly Li <colyli@suse.de>	2018-08-09 03:48:49 -0400
committer	Jens Axboe <axboe@kernel.dk>	2018-08-09 10:21:15 -0400
commit		ea8c5356d39048bc94bae068228f51ddbecc6b89 (patch)
tree		26eb52bd46bd1b5099115799be67d483c8ec18e8
parent		b467a6ac0b4bf57ec8c2329212e8a8a0231a2ef2 (diff)
bcache: set max writeback rate when I/O request is idle
Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle") allows the writeback rate to be faster if there is no I/O request on a bcache device. It works well if there is only one bcache device attached to the cache set. If there are many bcache devices attached to a cache set, it may introduce a performance regression, because the multiple faster writeback threads of the idle bcache devices will compete for the btree-level locks with the bcache devices that do have I/O requests coming in.

This patch fixes the above issue by only permitting fast writeback when all bcache devices attached to the cache set are idle. If one of the bcache devices gets a new I/O request, all writeback throughput is immediately minimized, and the PI controller __update_writeback_rate() decides the upcoming writeback rate for each bcache device.

Also, when all bcache devices are idle, limiting the writeback rate to a small number is a waste of throughput, especially when the backing devices are slower non-rotational devices (e.g. SATA SSD). This patch sets a max writeback rate for each backing device if the whole cache set is idle. A faster writeback rate in idle time means new I/Os may have more available space for dirty data, and people may observe better write performance then.

Please note that bcache may change its cache mode at run time, and this patch still works if the cache mode is switched away from writeback mode while there is still dirty data on the cache.

Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable@vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli@suse.de>
Tested-by: Kai Krakow <kai@kaishome.de>
Tested-by: Stefan Priebe <s.priebe@profihost.ag>
Cc: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
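
The counting policy described above can be illustrated with a small standalone C model (any C99 compiler). This is a sketch of the idea only, not bcache code: the macros ATTACHED_DEV_NR and ROUNDS and the local variables are made up for this example; the real logic lives in set_at_max_writeback_rate() and cached_dev_make_request() in the diff below.

/*
 * Toy model of the idle-detection policy described in the commit message.
 * This is NOT bcache code; every name below is local to this example.
 */
#include <stdio.h>

#define ATTACHED_DEV_NR	2	/* pretend two bcache devices are attached */
#define ROUNDS		30	/* simulated update_writeback_rate() rounds */

int main(void)
{
	int idle_counter = 0;		/* models c->idle_counter */
	int at_max_writeback_rate = 0;	/* models c->at_max_writeback_rate */

	for (int round = 1; round <= ROUNDS; round++) {
		/* A front-end I/O request resets the counter (request path). */
		if (round == 5) {
			idle_counter = 0;
			at_max_writeback_rate = 0;
			printf("round %2d: new I/O -> back to PI-controlled rate\n",
			       round);
			continue;
		}

		/*
		 * Each rate-update round increments the shared counter; only
		 * after ATTACHED_DEV_NR * 6 consecutive idle rounds is the
		 * whole cache set treated as idle and writeback unthrottled.
		 */
		if (++idle_counter >= ATTACHED_DEV_NR * 6)
			at_max_writeback_rate = 1;

		printf("round %2d: idle_counter=%2d at_max_writeback_rate=%d\n",
		       round, idle_counter, at_max_writeback_rate);
	}

	return 0;
}

With these example numbers the unthrottled state is reached at round 17 (two devices times six rounds after the reset at round 5), and a single front-end request is enough to fall back to the PI-controlled rate.
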
-rw-r--r--	drivers/md/bcache/bcache.h	10
-rw-r--r--	drivers/md/bcache/request.c	59
-rw-r--r--	drivers/md/bcache/super.c	4
-rw-r--r--	drivers/md/bcache/sysfs.c	15
-rw-r--r--	drivers/md/bcache/util.c	2
-rw-r--r--	drivers/md/bcache/util.h	2
-rw-r--r--	drivers/md/bcache/writeback.c	91
7 files changed, 138 insertions, 45 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b393b3fd06b6..05f82ff6f016 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	/*
-	 * Set to zero by things that touch the backing volume-- except
-	 * writeback. Incremented by writeback. Used to determine when to
-	 * accelerate idle writeback.
-	 */
-	atomic_t		backing_idle;
-
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
@@ -515,6 +508,8 @@ struct cache_set {
 	struct cache_accounting accounting;
 
 	unsigned long		flags;
+	atomic_t		idle_counter;
+	atomic_t		at_max_writeback_rate;
 
 	struct cache_sb		sb;
 
@@ -524,6 +519,7 @@ struct cache_set {
 
 	struct bcache_device	**devices;
 	unsigned		devices_max_used;
+	atomic_t		attached_dev_nr;
 	struct list_head	cached_devs;
 	uint64_t		cached_dev_sectors;
 	atomic_long_t		flash_dev_dirty_sectors;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 914d501ad1e0..7dbe8b6316a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1103,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 	generic_make_request(bio);
 }
 
+static void quit_max_writeback_rate(struct cache_set *c,
+				    struct cached_dev *this_dc)
+{
+	int i;
+	struct bcache_device *d;
+	struct cached_dev *dc;
+
+	/*
+	 * mutex bch_register_lock may compete with other parallel requesters,
+	 * or attach/detach operations on other backing devices. Waiting for
+	 * the mutex lock may increase I/O request latency for seconds or more.
+	 * To avoid such a situation, if mutex_trylock() fails, only the
+	 * writeback rate of the current cached device is set to 1, and
+	 * __update_writeback_rate() will decide the writeback rate of other
+	 * cached devices (remember now c->idle_counter is 0 already).
+	 */
+	if (mutex_trylock(&bch_register_lock)) {
+		for (i = 0; i < c->devices_max_used; i++) {
+			if (!c->devices[i])
+				continue;
+
+			if (UUID_FLASH_ONLY(&c->uuids[i]))
+				continue;
+
+			d = c->devices[i];
+			dc = container_of(d, struct cached_dev, disk);
+			/*
+			 * Set writeback rate to the default minimum value,
+			 * then let update_writeback_rate() decide the
+			 * upcoming rate.
+			 */
+			atomic_long_set(&dc->writeback_rate.rate, 1);
+		}
+		mutex_unlock(&bch_register_lock);
+	} else
+		atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
 /* Cached devices - read & write stuff */
 
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1120,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	atomic_set(&dc->backing_idle, 0);
-	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
+	if (likely(d->c)) {
+		if (atomic_read(&d->c->idle_counter))
+			atomic_set(&d->c->idle_counter, 0);
+		/*
+		 * If at_max_writeback_rate of cache set is true and new I/O
+		 * comes, quit max writeback rate of all cached devices
+		 * attached to this cache set, and set at_max_writeback_rate
+		 * to false.
+		 */
+		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+			atomic_set(&d->c->at_max_writeback_rate, 0);
+			quit_max_writeback_rate(d->c, dc);
+		}
+	}
+
+	generic_start_io_acct(q,
+			      bio_op(bio),
+			      bio_sectors(bio),
+			      &d->disk->part0);
 
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_iter.bi_sector += dc->sb.data_offset;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1e85cbb4c159..55a37641aa95 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d)
 {
 	lockdep_assert_held(&bch_register_lock);
 
+	atomic_dec(&d->c->attached_dev_nr);
+
 	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
 		struct uuid_entry *u = d->c->uuids + d->id;
 
@@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
 	bch_cached_dev_run(dc);
 	bcache_device_link(&dc->disk, c, "bdev");
+	atomic_inc(&c->attached_dev_nr);
 
 	/* Allow the writeback thread to proceed */
 	up_write(&dc->writeback_lock);
@@ -1696,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->block_bits		= ilog2(sb->block_size);
 	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);
 	c->devices_max_used	= 0;
+	atomic_set(&c->attached_dev_nr, 0);
 	c->btree_pages		= bucket_pages(c);
 	if (c->btree_pages > BTREE_MAX_PAGES)
 		c->btree_pages = max_t(int, c->btree_pages / 4,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 3e9d3459a224..6e88142514fb 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -171,7 +171,8 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_hprint(writeback_rate, wb ? dc->writeback_rate.rate << 9 : 0);
+	sysfs_hprint(writeback_rate,
+		     wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
 	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
 	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
 	sysfs_printf(io_disable,	"%i", dc->io_disable);
@@ -193,7 +194,9 @@ SHOW(__bch_cached_dev)
 		 * Except for dirty and target, other values should
 		 * be 0 if writeback is not running.
 		 */
-		bch_hprint(rate, wb ? dc->writeback_rate.rate << 9 : 0);
+		bch_hprint(rate,
+			   wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+			      : 0);
 		bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target, dc->writeback_rate_target << 9);
 		bch_hprint(proportional,
@@ -261,8 +264,12 @@ STORE(__cached_dev)
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, INT_MAX);
+	if (attr == &sysfs_writeback_rate) {
+		int v;
+
+		sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+		atomic_long_set(&dc->writeback_rate.rate, v);
+	}
 
 	sysfs_strtoul_clamp(writeback_rate_update_seconds,
 			    dc->writeback_rate_update_seconds,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..b15256bcf0e7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
 
 	/* Bound the time. Don't let us fall further than 2 seconds behind
 	 * (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..f7b0133c9d2f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
 	 * Rate at which we want to do work, in units per second
 	 * The units here correspond to the units passed to bch_next_delay()
 	 */
-	uint32_t		rate;
+	atomic_long_t		rate;
 };
 
 static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 912e969fedba..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	dc->writeback_rate_proportional = proportional_scaled;
 	dc->writeback_rate_integral_scaled = integral_scaled;
-	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
-	dc->writeback_rate.rate = new_rate;
+	dc->writeback_rate_change = new_rate -
+			atomic_long_read(&dc->writeback_rate.rate);
+	atomic_long_set(&dc->writeback_rate.rate, new_rate);
 	dc->writeback_rate_target = target;
 }
 
+static bool set_at_max_writeback_rate(struct cache_set *c,
+				      struct cached_dev *dc)
+{
+	/*
+	 * idle_counter is increased every time update_writeback_rate() is
+	 * called. If all backing devices attached to the same cache set have
+	 * identical dc->writeback_rate_update_seconds values, it takes about
+	 * 6 rounds of update_writeback_rate() on each backing device before
+	 * c->at_max_writeback_rate is set to 1, and then the max writeback
+	 * rate is set to each dc->writeback_rate.rate.
+	 * In order to avoid extra locking cost for counting the exact number
+	 * of dirty cached devices, c->attached_dev_nr is used to calculate
+	 * the idle threshold. It might be bigger if not all cached devices
+	 * are in writeback mode, but it still works well with limited extra
+	 * rounds of update_writeback_rate().
+	 */
+	if (atomic_inc_return(&c->idle_counter) <
+	    atomic_read(&c->attached_dev_nr) * 6)
+		return false;
+
+	if (atomic_read(&c->at_max_writeback_rate) != 1)
+		atomic_set(&c->at_max_writeback_rate, 1);
+
+	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+	/* keep writeback_rate_target as existing value */
+	dc->writeback_rate_proportional = 0;
+	dc->writeback_rate_integral_scaled = 0;
+	dc->writeback_rate_change = 0;
+
+	/*
+	 * Check c->idle_counter and c->at_max_writeback_rate again in case
+	 * new I/O arrived before set_at_max_writeback_rate() returns.
+	 * Then the writeback rate is set to 1, and its new value should be
+	 * decided via __update_writeback_rate().
+	 */
+	if ((atomic_read(&c->idle_counter) <
+	     atomic_read(&c->attached_dev_nr) * 6) ||
+	    !atomic_read(&c->at_max_writeback_rate))
+		return false;
+
+	return true;
+}
+
 static void update_writeback_rate(struct work_struct *work)
 {
 	struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
 		return;
 	}
 
-	down_read(&dc->writeback_lock);
-
-	if (atomic_read(&dc->has_dirty) &&
-	    dc->writeback_percent)
-		__update_writeback_rate(dc);
+	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+		/*
+		 * If the whole cache set is idle, set_at_max_writeback_rate()
+		 * will set writeback rate to a max number. Then it is
+		 * unnecessary to update writeback rate for an idle cache set
+		 * in maximum writeback rate number(s).
+		 */
+		if (!set_at_max_writeback_rate(c, dc)) {
+			down_read(&dc->writeback_lock);
+			__update_writeback_rate(dc);
+			up_read(&dc->writeback_lock);
+		}
+	}
 
-	up_read(&dc->writeback_lock);
 
 	/*
 	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
 
 		delay = writeback_delay(dc, size);
 
-		/* If the control system would wait for at least half a
-		 * second, and there's been no reqs hitting the backing disk
-		 * for awhile: use an alternate mode where we have at most
-		 * one contiguous set of writebacks in flight at a time. If
-		 * someone wants to do IO it will be quick, as it will only
-		 * have to contend with one operation in flight, and we'll
-		 * be round-tripping data to the backing disk as quickly as
-		 * it can accept it.
-		 */
-		if (delay >= HZ / 2) {
-			/* 3 means at least 1.5 seconds, up to 7.5 if we
-			 * have slowed way down.
-			 */
-			if (atomic_inc_return(&dc->backing_idle) >= 3) {
-				/* Wait for current I/Os to finish */
-				closure_sync(&cl);
-				/* And immediately launch a new set. */
-				delay = 0;
-			}
-		}
-
 		while (!kthread_should_stop() &&
 		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
 		       delay) {
@@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_running	= true;
 	dc->writeback_percent	= 10;
 	dc->writeback_delay	= 30;
-	dc->writeback_rate.rate	= 1024;
+	atomic_long_set(&dc->writeback_rate.rate, 1024);
 	dc->writeback_rate_minimum	= 8;
 
 	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;