diff options
author | Kent Overstreet <kmo@daterainc.com> | 2013-09-24 02:17:31 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-24 17:41:43 -0400 |
commit | c2a4f3183a1248f615a695fbd8905da55ad11bba (patch) | |
tree | 51233866301869506c0728c812fe3668ae1e94ce | |
parent | 61cbd250f867f98bb4738000afc6002d6f2b14bd (diff) |
bcache: Fix a writeback performance regression
Background writeback works by scanning the btree for dirty data and
adding those keys to a fixed-size buffer (the keybuf), then writing each
dirty key in the keybuf to the backing device.
When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IOs to finish: they still
take up slots in the keybuf (so that foreground writes can check for
them to avoid races). Without that wait, we continually rescan when
we can add at most a key or two to the keybuf, and each rescan takes
locks that starve foreground IO. Doh.
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: linux-stable <stable@vger.kernel.org> # >= v3.10
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | drivers/md/bcache/bcache.h | 7 | ||||
-rw-r--r-- | drivers/md/bcache/util.c | 11 | ||||
-rw-r--r-- | drivers/md/bcache/util.h | 12 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 43 |
4 files changed, 43 insertions, 30 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b39f6f0b45f2..0f12382aa35d 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -498,7 +498,7 @@ struct cached_dev { | |||
498 | */ | 498 | */ |
499 | atomic_t has_dirty; | 499 | atomic_t has_dirty; |
500 | 500 | ||
501 | struct ratelimit writeback_rate; | 501 | struct bch_ratelimit writeback_rate; |
502 | struct delayed_work writeback_rate_update; | 502 | struct delayed_work writeback_rate_update; |
503 | 503 | ||
504 | /* | 504 | /* |
@@ -507,10 +507,9 @@ struct cached_dev { | |||
507 | */ | 507 | */ |
508 | sector_t last_read; | 508 | sector_t last_read; |
509 | 509 | ||
510 | /* Number of writeback bios in flight */ | 510 | /* Limit number of writeback bios in flight */ |
511 | atomic_t in_flight; | 511 | struct semaphore in_flight; |
512 | struct closure_with_timer writeback; | 512 | struct closure_with_timer writeback; |
513 | struct closure_waitlist writeback_wait; | ||
514 | 513 | ||
515 | struct keybuf writeback_keys; | 514 | struct keybuf writeback_keys; |
516 | 515 | ||
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 98eb81159a22..420dad545c7d 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) | |||
190 | stats->last = now ?: 1; | 190 | stats->last = now ?: 1; |
191 | } | 191 | } |
192 | 192 | ||
193 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done) | 193 | /** |
194 | * bch_next_delay() - increment @d by the amount of work done, and return how | ||
195 | * long to delay until the next time to do some work. | ||
196 | * | ||
197 | * @d - the struct bch_ratelimit to update | ||
198 | * @done - the amount of work done, in arbitrary units | ||
199 | * | ||
200 | * Returns the amount of time to delay by, in jiffies | ||
201 | */ | ||
202 | uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) | ||
194 | { | 203 | { |
195 | uint64_t now = local_clock(); | 204 | uint64_t now = local_clock(); |
196 | 205 | ||
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1ae2a73ad85f..ea345c6896f4 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units) | |||
450 | (ewma) >> factor; \ | 450 | (ewma) >> factor; \ |
451 | }) | 451 | }) |
452 | 452 | ||
453 | struct ratelimit { | 453 | struct bch_ratelimit { |
454 | /* Next time we want to do some work, in nanoseconds */ | ||
454 | uint64_t next; | 455 | uint64_t next; |
456 | |||
457 | /* | ||
458 | * Rate at which we want to do work, in units per nanosecond | ||
459 | * The units here correspond to the units passed to bch_next_delay() | ||
460 | */ | ||
455 | unsigned rate; | 461 | unsigned rate; |
456 | }; | 462 | }; |
457 | 463 | ||
458 | static inline void ratelimit_reset(struct ratelimit *d) | 464 | static inline void bch_ratelimit_reset(struct bch_ratelimit *d) |
459 | { | 465 | { |
460 | d->next = local_clock(); | 466 | d->next = local_clock(); |
461 | } | 467 | } |
462 | 468 | ||
463 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done); | 469 | uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); |
464 | 470 | ||
465 | #define __DIV_SAFE(n, d, zero) \ | 471 | #define __DIV_SAFE(n, d, zero) \ |
466 | ({ \ | 472 | ({ \ |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22cbff551628..27ac51934822 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work) | |||
94 | 94 | ||
95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | 95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) |
96 | { | 96 | { |
97 | uint64_t ret; | ||
98 | |||
97 | if (atomic_read(&dc->disk.detaching) || | 99 | if (atomic_read(&dc->disk.detaching) || |
98 | !dc->writeback_percent) | 100 | !dc->writeback_percent) |
99 | return 0; | 101 | return 0; |
100 | 102 | ||
101 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); | 103 | ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); |
104 | |||
105 | return min_t(uint64_t, ret, HZ); | ||
102 | } | 106 | } |
103 | 107 | ||
104 | /* Background writeback */ | 108 | /* Background writeback */ |
@@ -208,7 +212,7 @@ normal_refill: | |||
208 | 212 | ||
209 | up_write(&dc->writeback_lock); | 213 | up_write(&dc->writeback_lock); |
210 | 214 | ||
211 | ratelimit_reset(&dc->writeback_rate); | 215 | bch_ratelimit_reset(&dc->writeback_rate); |
212 | 216 | ||
213 | /* Punt to workqueue only so we don't recurse and blow the stack */ | 217 | /* Punt to workqueue only so we don't recurse and blow the stack */ |
214 | continue_at(cl, read_dirty, dirty_wq); | 218 | continue_at(cl, read_dirty, dirty_wq); |
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl) | |||
318 | } | 322 | } |
319 | 323 | ||
320 | bch_keybuf_del(&dc->writeback_keys, w); | 324 | bch_keybuf_del(&dc->writeback_keys, w); |
321 | atomic_dec_bug(&dc->in_flight); | 325 | up(&dc->in_flight); |
322 | |||
323 | closure_wake_up(&dc->writeback_wait); | ||
324 | 326 | ||
325 | closure_return_with_destructor(cl, dirty_io_destructor); | 327 | closure_return_with_destructor(cl, dirty_io_destructor); |
326 | } | 328 | } |
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl) | |||
349 | 351 | ||
350 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 352 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
351 | 353 | ||
352 | continue_at(cl, write_dirty_finish, dirty_wq); | 354 | continue_at(cl, write_dirty_finish, system_wq); |
353 | } | 355 | } |
354 | 356 | ||
355 | static void read_dirty_endio(struct bio *bio, int error) | 357 | static void read_dirty_endio(struct bio *bio, int error) |
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl) | |||
369 | 371 | ||
370 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 372 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
371 | 373 | ||
372 | continue_at(cl, write_dirty, dirty_wq); | 374 | continue_at(cl, write_dirty, system_wq); |
373 | } | 375 | } |
374 | 376 | ||
375 | static void read_dirty(struct closure *cl) | 377 | static void read_dirty(struct closure *cl) |
@@ -394,12 +396,9 @@ static void read_dirty(struct closure *cl) | |||
394 | 396 | ||
395 | if (delay > 0 && | 397 | if (delay > 0 && |
396 | (KEY_START(&w->key) != dc->last_read || | 398 | (KEY_START(&w->key) != dc->last_read || |
397 | jiffies_to_msecs(delay) > 50)) { | 399 | jiffies_to_msecs(delay) > 50)) |
398 | w->private = NULL; | 400 | while (delay) |
399 | 401 | delay = schedule_timeout(delay); | |
400 | closure_delay(&dc->writeback, delay); | ||
401 | continue_at(cl, read_dirty, dirty_wq); | ||
402 | } | ||
403 | 402 | ||
404 | dc->last_read = KEY_OFFSET(&w->key); | 403 | dc->last_read = KEY_OFFSET(&w->key); |
405 | 404 | ||
@@ -424,15 +423,10 @@ static void read_dirty(struct closure *cl) | |||
424 | 423 | ||
425 | trace_bcache_writeback(&w->key); | 424 | trace_bcache_writeback(&w->key); |
426 | 425 | ||
427 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | 426 | down(&dc->in_flight); |
427 | closure_call(&io->cl, read_dirty_submit, NULL, cl); | ||
428 | 428 | ||
429 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | 429 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); |
430 | |||
431 | atomic_inc(&dc->in_flight); | ||
432 | |||
433 | if (!closure_wait_event(&dc->writeback_wait, cl, | ||
434 | atomic_read(&dc->in_flight) < 64)) | ||
435 | continue_at(cl, read_dirty, dirty_wq); | ||
436 | } | 430 | } |
437 | 431 | ||
438 | if (0) { | 432 | if (0) { |
@@ -442,7 +436,11 @@ err: | |||
442 | bch_keybuf_del(&dc->writeback_keys, w); | 436 | bch_keybuf_del(&dc->writeback_keys, w); |
443 | } | 437 | } |
444 | 438 | ||
445 | refill_dirty(cl); | 439 | /* |
440 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be | ||
441 | * freed) before refilling again | ||
442 | */ | ||
443 | continue_at(cl, refill_dirty, dirty_wq); | ||
446 | } | 444 | } |
447 | 445 | ||
448 | /* Init */ | 446 | /* Init */ |
@@ -484,6 +482,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc) | |||
484 | 482 | ||
485 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 483 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
486 | { | 484 | { |
485 | sema_init(&dc->in_flight, 64); | ||
487 | closure_init_unlocked(&dc->writeback); | 486 | closure_init_unlocked(&dc->writeback); |
488 | init_rwsem(&dc->writeback_lock); | 487 | init_rwsem(&dc->writeback_lock); |
489 | 488 | ||
@@ -513,7 +512,7 @@ void bch_writeback_exit(void) | |||
513 | 512 | ||
514 | int __init bch_writeback_init(void) | 513 | int __init bch_writeback_init(void) |
515 | { | 514 | { |
516 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | 515 | dirty_wq = create_workqueue("bcache_writeback"); |
517 | if (!dirty_wq) | 516 | if (!dirty_wq) |
518 | return -ENOMEM; | 517 | return -ENOMEM; |
519 | 518 | ||