author     Kent Overstreet <kmo@daterainc.com>             2013-09-24 02:17:31 -0400
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2013-10-05 10:13:09 -0400
commit     48c4100e10b75152098675d534d8a6109b28f1bf (patch)
tree       d1ff65f68a6e48c6fd063229d54e74bb28cd0b57 /drivers/md
parent     808eea9d2912e4a3fb8cd45e3e3da94d114757fb (diff)
bcache: Fix a writeback performance regression
commit c2a4f3183a1248f615a695fbd8905da55ad11bba upstream.
Background writeback works by scanning the btree for dirty data and
adding those keys to a fixed size buffer (the keybuf), then writing each
dirty key in the keybuf out to the backing device.
When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IO to finish - those writes
still take up slots in the keybuf (so that foreground writes can check
for them to avoid races). Without that wait, we'll continually rescan
when we can add at most a key or two to the keybuf, and the rescanning
takes locks that starve foreground IO. Doh.
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
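[Editor's note] The core of the fix, visible in the hunks below, is swapping the open-coded atomic counter plus closure waitlist for a counting semaphore. A minimal sketch of that pattern follows (hypothetical helper names, not the patch itself): the writeback path calls down() before each IO, sleeping once the limit is reached, and the completion path calls up() to release a slot.

	#include <linux/semaphore.h>

	static struct semaphore in_flight;

	static void writeback_init(void)
	{
		/* Allow at most 64 writeback IOs in flight, as the patch does */
		sema_init(&in_flight, 64);
	}

	static void issue_one_write(void)
	{
		down(&in_flight);	/* sleeps while 64 IOs are outstanding */
		/* ... build and submit the write here ... */
	}

	static void write_done(void)
	{
		up(&in_flight);		/* completion frees a slot, waking the writer */
	}

Compared with the old atomic_t plus closure_wait_event() scheme, the semaphore throttles and waits in one primitive, so read_dirty() no longer has to rescan the btree just to discover that no keybuf slots have freed up.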
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/bcache.h    |  7
-rw-r--r--  drivers/md/bcache/util.c      | 11
-rw-r--r--  drivers/md/bcache/util.h      | 12
-rw-r--r--  drivers/md/bcache/writeback.c | 43
4 files changed, 43 insertions(+), 30 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c42b14b2304c..6bc016eabdc9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -499,7 +499,7 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;

-	struct ratelimit	writeback_rate;
+	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;

 	/*
@@ -508,10 +508,9 @@ struct cached_dev {
 	 */
 	sector_t		last_read;

-	/* Number of writeback bios in flight */
-	atomic_t		in_flight;
+	/* Limit number of writeback bios in flight */
+	struct semaphore	in_flight;
 	struct closure_with_timer writeback;
-	struct closure_waitlist	writeback_wait;

 	struct keybuf		writeback_keys;

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..38a43f8d36ec 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 	stats->last = now ?: 1;
 }

-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();

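[Editor's note] The hunk above shows only the new signature and kernel-doc; the function body is elided. As a rough userspace sketch of the deadline-based scheme the comment describes (my own illustration, not the kernel code): track the next deadline in nanoseconds, push it forward in proportion to the work done, and return how far in the future it sits.

	#include <stdint.h>
	#include <time.h>

	struct bch_ratelimit {
		uint64_t next;	/* next time to do some work, in nanoseconds */
		unsigned rate;	/* work units per nanosecond */
	};

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	/* Returns nanoseconds to wait; the kernel version returns jiffies. */
	static uint64_t next_delay(struct bch_ratelimit *d, uint64_t done)
	{
		uint64_t now = now_ns();

		d->next += done / d->rate;	/* time this work "costs" */

		return d->next > now ? d->next - now : 0;
	}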
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..43fd78affcea 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units)
 	(ewma) >> factor;						\
 })

-struct ratelimit {
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
 	uint64_t		next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to bch_next_delay()
+	 */
 	unsigned		rate;
 };

-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
 	d->next = local_clock();
 }

-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);

 #define __DIV_SAFE(n, d, zero)						\
 ({									\
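[Editor's note] For context, here is how a caller might pace itself against this interface, mirroring what writeback_delay() and read_dirty() do in the next file; have_work() and do_one_unit() are hypothetical stand-ins for the caller's work loop.

	static void paced_loop(struct bch_ratelimit *d)
	{
		uint64_t delay;

		bch_ratelimit_reset(d);			/* start the clock at "now" */

		while (have_work()) {
			do_one_unit();			/* hypothetical work item */
			delay = bch_next_delay(d, 1);	/* report 1 unit done */
			if (delay)
				schedule_timeout_uninterruptible(delay);
		}
	}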
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..12c0cd135ef8 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work)

 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+	uint64_t ret;
+
 	if (atomic_read(&dc->disk.detaching) ||
 	    !dc->writeback_percent)
 		return 0;

-	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+	return min_t(uint64_t, ret, HZ);
 }

 /* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl)

 	up_write(&dc->writeback_lock);

-	ratelimit_reset(&dc->writeback_rate);
+	bch_ratelimit_reset(&dc->writeback_rate);

 	/* Punt to workqueue only so we don't recurse and blow the stack */
 	continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl)
 	}

 	bch_keybuf_del(&dc->writeback_keys, w);
-	atomic_dec_bug(&dc->in_flight);
-
-	closure_wake_up(&dc->writeback_wait);
+	up(&dc->in_flight);

 	closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl)
 	trace_bcache_write_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);

-	continue_at(cl, write_dirty_finish, dirty_wq);
+	continue_at(cl, write_dirty_finish, system_wq);
 }

 static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl)
 	trace_bcache_read_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);

-	continue_at(cl, write_dirty, dirty_wq);
+	continue_at(cl, write_dirty, system_wq);
 }

 static void read_dirty(struct closure *cl)
@@ -324,12 +326,9 @@ static void read_dirty(struct closure *cl)

 		if (delay > 0 &&
 		    (KEY_START(&w->key) != dc->last_read ||
-		     jiffies_to_msecs(delay) > 50)) {
-			w->private = NULL;
-
-			closure_delay(&dc->writeback, delay);
-			continue_at(cl, read_dirty, dirty_wq);
-		}
+		     jiffies_to_msecs(delay) > 50))
+			while (delay)
+				delay = schedule_timeout(delay);

 		dc->last_read	= KEY_OFFSET(&w->key);

@@ -354,15 +353,10 @@ static void read_dirty(struct closure *cl)

 		pr_debug("%s", pkey(&w->key));

-		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+		down(&dc->in_flight);
+		closure_call(&io->cl, read_dirty_submit, NULL, cl);

 		delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
-		atomic_inc(&dc->in_flight);
-
-		if (!closure_wait_event(&dc->writeback_wait, cl,
-					atomic_read(&dc->in_flight) < 64))
-			continue_at(cl, read_dirty, dirty_wq);
 	}

 	if (0) {
@@ -372,11 +366,16 @@ err:
 		bch_keybuf_del(&dc->writeback_keys, w);
 	}

-	refill_dirty(cl);
+	/*
+	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+	 * freed) before refilling again
+	 */
+	continue_at(cl, refill_dirty, dirty_wq);
 }

 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+	sema_init(&dc->in_flight, 64);
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);

@@ -406,7 +405,7 @@ void bch_writeback_exit(void)

 int __init bch_writeback_init(void)
 {
-	dirty_wq = create_singlethread_workqueue("bcache_writeback");
+	dirty_wq = create_workqueue("bcache_writeback");
 	if (!dirty_wq)
 		return -ENOMEM;
