aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKent Overstreet <kmo@daterainc.com>2013-09-24 02:17:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-24 17:41:43 -0400
commitc2a4f3183a1248f615a695fbd8905da55ad11bba (patch)
tree51233866301869506c0728c812fe3668ae1e94ce
parent61cbd250f867f98bb4738000afc6002d6f2b14bd (diff)
bcache: Fix a writeback performance regression
Background writeback works by scanning the btree for dirty data and adding those keys into a fixed size buffer, then for each dirty key in the keybuf writing it to the backing device. When read_dirty() finishes and it's time to scan for more dirty data, we need to wait for the outstanding writeback IO to finish - they still take up slots in the keybuf (so that foreground writes can check for them to avoid races) - without that wait, we'll continually rescan when we'll be able to add at most a key or two to the keybuf, and that takes locks that starves foreground IO. Doh. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: linux-stable <stable@vger.kernel.org> # >= v3.10 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/md/bcache/bcache.h7
-rw-r--r--drivers/md/bcache/util.c11
-rw-r--r--drivers/md/bcache/util.h12
-rw-r--r--drivers/md/bcache/writeback.c43
4 files changed, 43 insertions, 30 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b39f6f0b45f2..0f12382aa35d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -498,7 +498,7 @@ struct cached_dev {
498 */ 498 */
499 atomic_t has_dirty; 499 atomic_t has_dirty;
500 500
501 struct ratelimit writeback_rate; 501 struct bch_ratelimit writeback_rate;
502 struct delayed_work writeback_rate_update; 502 struct delayed_work writeback_rate_update;
503 503
504 /* 504 /*
@@ -507,10 +507,9 @@ struct cached_dev {
507 */ 507 */
508 sector_t last_read; 508 sector_t last_read;
509 509
510 /* Number of writeback bios in flight */ 510 /* Limit number of writeback bios in flight */
511 atomic_t in_flight; 511 struct semaphore in_flight;
512 struct closure_with_timer writeback; 512 struct closure_with_timer writeback;
513 struct closure_waitlist writeback_wait;
514 513
515 struct keybuf writeback_keys; 514 struct keybuf writeback_keys;
516 515
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 98eb81159a22..420dad545c7d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
190 stats->last = now ?: 1; 190 stats->last = now ?: 1;
191} 191}
192 192
193unsigned bch_next_delay(struct ratelimit *d, uint64_t done) 193/**
194 * bch_next_delay() - increment @d by the amount of work done, and return how
195 * long to delay until the next time to do some work.
196 *
197 * @d - the struct bch_ratelimit to update
198 * @done - the amount of work done, in arbitrary units
199 *
200 * Returns the amount of time to delay by, in jiffies
201 */
202uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
194{ 203{
195 uint64_t now = local_clock(); 204 uint64_t now = local_clock();
196 205
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1ae2a73ad85f..ea345c6896f4 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
450 (ewma) >> factor; \ 450 (ewma) >> factor; \
451}) 451})
452 452
453struct ratelimit { 453struct bch_ratelimit {
454 /* Next time we want to do some work, in nanoseconds */
454 uint64_t next; 455 uint64_t next;
456
457 /*
458 * Rate at which we want to do work, in units per nanosecond
459 * The units here correspond to the units passed to bch_next_delay()
460 */
455 unsigned rate; 461 unsigned rate;
456}; 462};
457 463
458static inline void ratelimit_reset(struct ratelimit *d) 464static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
459{ 465{
460 d->next = local_clock(); 466 d->next = local_clock();
461} 467}
462 468
463unsigned bch_next_delay(struct ratelimit *d, uint64_t done); 469uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
464 470
465#define __DIV_SAFE(n, d, zero) \ 471#define __DIV_SAFE(n, d, zero) \
466({ \ 472({ \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 22cbff551628..27ac51934822 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)
94 94
95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96{ 96{
97 uint64_t ret;
98
97 if (atomic_read(&dc->disk.detaching) || 99 if (atomic_read(&dc->disk.detaching) ||
98 !dc->writeback_percent) 100 !dc->writeback_percent)
99 return 0; 101 return 0;
100 102
101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 103 ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
104
105 return min_t(uint64_t, ret, HZ);
102} 106}
103 107
104/* Background writeback */ 108/* Background writeback */
@@ -208,7 +212,7 @@ normal_refill:
208 212
209 up_write(&dc->writeback_lock); 213 up_write(&dc->writeback_lock);
210 214
211 ratelimit_reset(&dc->writeback_rate); 215 bch_ratelimit_reset(&dc->writeback_rate);
212 216
213 /* Punt to workqueue only so we don't recurse and blow the stack */ 217 /* Punt to workqueue only so we don't recurse and blow the stack */
214 continue_at(cl, read_dirty, dirty_wq); 218 continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
318 } 322 }
319 323
320 bch_keybuf_del(&dc->writeback_keys, w); 324 bch_keybuf_del(&dc->writeback_keys, w);
321 atomic_dec_bug(&dc->in_flight); 325 up(&dc->in_flight);
322
323 closure_wake_up(&dc->writeback_wait);
324 326
325 closure_return_with_destructor(cl, dirty_io_destructor); 327 closure_return_with_destructor(cl, dirty_io_destructor);
326} 328}
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)
349 351
350 closure_bio_submit(&io->bio, cl, &io->dc->disk); 352 closure_bio_submit(&io->bio, cl, &io->dc->disk);
351 353
352 continue_at(cl, write_dirty_finish, dirty_wq); 354 continue_at(cl, write_dirty_finish, system_wq);
353} 355}
354 356
355static void read_dirty_endio(struct bio *bio, int error) 357static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
369 371
370 closure_bio_submit(&io->bio, cl, &io->dc->disk); 372 closure_bio_submit(&io->bio, cl, &io->dc->disk);
371 373
372 continue_at(cl, write_dirty, dirty_wq); 374 continue_at(cl, write_dirty, system_wq);
373} 375}
374 376
375static void read_dirty(struct closure *cl) 377static void read_dirty(struct closure *cl)
@@ -394,12 +396,9 @@ static void read_dirty(struct closure *cl)
394 396
395 if (delay > 0 && 397 if (delay > 0 &&
396 (KEY_START(&w->key) != dc->last_read || 398 (KEY_START(&w->key) != dc->last_read ||
397 jiffies_to_msecs(delay) > 50)) { 399 jiffies_to_msecs(delay) > 50))
398 w->private = NULL; 400 while (delay)
399 401 delay = schedule_timeout(delay);
400 closure_delay(&dc->writeback, delay);
401 continue_at(cl, read_dirty, dirty_wq);
402 }
403 402
404 dc->last_read = KEY_OFFSET(&w->key); 403 dc->last_read = KEY_OFFSET(&w->key);
405 404
@@ -424,15 +423,10 @@ static void read_dirty(struct closure *cl)
424 423
425 trace_bcache_writeback(&w->key); 424 trace_bcache_writeback(&w->key);
426 425
427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 426 down(&dc->in_flight);
427 closure_call(&io->cl, read_dirty_submit, NULL, cl);
428 428
429 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 429 delay = writeback_delay(dc, KEY_SIZE(&w->key));
430
431 atomic_inc(&dc->in_flight);
432
433 if (!closure_wait_event(&dc->writeback_wait, cl,
434 atomic_read(&dc->in_flight) < 64))
435 continue_at(cl, read_dirty, dirty_wq);
436 } 430 }
437 431
438 if (0) { 432 if (0) {
@@ -442,7 +436,11 @@ err:
442 bch_keybuf_del(&dc->writeback_keys, w); 436 bch_keybuf_del(&dc->writeback_keys, w);
443 } 437 }
444 438
445 refill_dirty(cl); 439 /*
440 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
441 * freed) before refilling again
442 */
443 continue_at(cl, refill_dirty, dirty_wq);
446} 444}
447 445
448/* Init */ 446/* Init */
@@ -484,6 +482,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
484 482
485void bch_cached_dev_writeback_init(struct cached_dev *dc) 483void bch_cached_dev_writeback_init(struct cached_dev *dc)
486{ 484{
485 sema_init(&dc->in_flight, 64);
487 closure_init_unlocked(&dc->writeback); 486 closure_init_unlocked(&dc->writeback);
488 init_rwsem(&dc->writeback_lock); 487 init_rwsem(&dc->writeback_lock);
489 488
@@ -513,7 +512,7 @@ void bch_writeback_exit(void)
513 512
514int __init bch_writeback_init(void) 513int __init bch_writeback_init(void)
515{ 514{
516 dirty_wq = create_singlethread_workqueue("bcache_writeback"); 515 dirty_wq = create_workqueue("bcache_writeback");
517 if (!dirty_wq) 516 if (!dirty_wq)
518 return -ENOMEM; 517 return -ENOMEM;
519 518