author    Kent Overstreet <kmo@daterainc.com>  2013-09-24 02:17:31 -0400
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2013-10-05 10:13:09 -0400
commit    48c4100e10b75152098675d534d8a6109b28f1bf
tree      d1ff65f68a6e48c6fd063229d54e74bb28cd0b57 /drivers/md
parent    808eea9d2912e4a3fb8cd45e3e3da94d114757fb
bcache: Fix a writeback performance regression
commit c2a4f3183a1248f615a695fbd8905da55ad11bba upstream.

Background writeback works by scanning the btree for dirty data and
adding those keys to a fixed size buffer, then writing each dirty key
in the keybuf out to the backing device.

When read_dirty() finishes and it's time to scan for more dirty data,
we need to wait for the outstanding writeback IO to finish - those IOs
still take up slots in the keybuf (so that foreground writes can check
for them to avoid races). Without that wait, we continually rescan
when we'll be able to add at most a key or two to the keybuf, and that
takes locks that starve foreground IO. Doh.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
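The core of the fix is flow control: read_dirty() now does
down(&dc->in_flight) before dispatching each writeback bio and
write_dirty_finish() does up(&dc->in_flight), so submission blocks once
64 IOs are outstanding, and the keybuf is only refilled after they
drain. A minimal userspace sketch of this throttling pattern, assuming
POSIX semaphores in place of the kernel's struct semaphore (submit_io()
and io_done() are hypothetical names, not from the patch):

/*
 * Sketch only: POSIX semaphores stand in for the kernel's
 * struct semaphore; submit_io()/io_done() are made-up names.
 */
#include <semaphore.h>
#include <stdio.h>

#define MAX_IN_FLIGHT 64

static sem_t in_flight;

/* Completion path: free a slot, waking one blocked submitter */
static void io_done(void)
{
	sem_post(&in_flight);		/* mirrors up(&dc->in_flight) */
}

/* Submission path: blocks once MAX_IN_FLIGHT IOs are outstanding */
static void submit_io(int n)
{
	sem_wait(&in_flight);		/* mirrors down(&dc->in_flight) */
	printf("submitted io %d\n", n);
	io_done();	/* a real async IO would call this on completion */
}

int main(void)
{
	/* mirrors sema_init(&dc->in_flight, 64) in the patch below */
	sem_init(&in_flight, 0, MAX_IN_FLIGHT);

	for (int i = 0; i < 128; i++)
		submit_io(i);

	sem_destroy(&in_flight);
	return 0;
}

With the semaphore at its cap, the submitter simply sleeps until a
completion posts a slot, rather than repeatedly rescanning under lock
the way the old closure_wait_event() loop could.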
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/bcache/bcache.h	 7
-rw-r--r--	drivers/md/bcache/util.c	11
-rw-r--r--	drivers/md/bcache/util.h	12
-rw-r--r--	drivers/md/bcache/writeback.c	43
4 files changed, 43 insertions(+), 30 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c42b14b2304c..6bc016eabdc9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -499,7 +499,7 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	struct ratelimit	writeback_rate;
+	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
 	/*
@@ -508,10 +508,9 @@ struct cached_dev {
 	 */
 	sector_t		last_read;
 
-	/* Number of writeback bios in flight */
-	atomic_t		in_flight;
+	/* Limit number of writeback bios in flight */
+	struct semaphore	in_flight;
 	struct closure_with_timer writeback;
-	struct closure_waitlist	writeback_wait;
 
 	struct keybuf		writeback_keys;
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..38a43f8d36ec 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 	stats->last = now ?: 1;
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..43fd78affcea 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units)
 	(ewma) >> factor;						\
 })
 
-struct ratelimit {
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
 	uint64_t		next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to bch_next_delay()
+	 */
 	unsigned		rate;
 };
 
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
 	d->next = local_clock();
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
 
 #define __DIV_SAFE(n, d, zero)						\
 ({									\
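The comments added here fix the units of the reworked ratelimit: next
is the next time to do work, in nanoseconds, and rate is in work units
per nanosecond, matching whatever units callers pass to
bch_next_delay() (writeback_delay() in the next file passes
sectors * 10000000ULL and clamps the returned delay to HZ). A rough
userspace sketch of that accounting - not the kernel implementation -
with now_ns() standing in for local_clock() and the return value in
nanoseconds rather than jiffies:

#include <stdint.h>
#include <time.h>

/* Sketch of the bch_ratelimit bookkeeping; field names follow the diff */
struct ratelimit_sketch {
	uint64_t next;	/* next time we want to do work, in nanoseconds */
	unsigned rate;	/* rate we want to work at, in units per nanosecond */
};

static uint64_t now_ns(void)	/* stand-in for the kernel's local_clock() */
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/*
 * Credit @done units of work and return how many nanoseconds to sleep
 * before doing more work (the kernel version returns jiffies).
 */
static uint64_t next_delay_ns(struct ratelimit_sketch *d, uint64_t done)
{
	uint64_t now = now_ns();

	if (!d->rate)		/* guard for the sketch only */
		return 0;

	/* done units at rate units/ns pushes 'next' forward done/rate ns */
	d->next += done / d->rate;

	return d->next > now ? d->next - now : 0;
}

Each completed chunk of work pushes next forward by done/rate
nanoseconds; once the caller gets ahead of schedule, the positive delay
returned is what read_dirty() now sleeps off via schedule_timeout().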
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..12c0cd135ef8 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+	uint64_t ret;
+
 	if (atomic_read(&dc->disk.detaching) ||
 	    !dc->writeback_percent)
 		return 0;
 
-	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+	return min_t(uint64_t, ret, HZ);
 }
 
 /* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl)
 
 	up_write(&dc->writeback_lock);
 
-	ratelimit_reset(&dc->writeback_rate);
+	bch_ratelimit_reset(&dc->writeback_rate);
 
 	/* Punt to workqueue only so we don't recurse and blow the stack */
 	continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl)
 	}
 
 	bch_keybuf_del(&dc->writeback_keys, w);
-	atomic_dec_bug(&dc->in_flight);
-
-	closure_wake_up(&dc->writeback_wait);
+	up(&dc->in_flight);
 
 	closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl)
 	trace_bcache_write_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-	continue_at(cl, write_dirty_finish, dirty_wq);
+	continue_at(cl, write_dirty_finish, system_wq);
 }
 
 static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl)
 	trace_bcache_read_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-	continue_at(cl, write_dirty, dirty_wq);
+	continue_at(cl, write_dirty, system_wq);
 }
 
 static void read_dirty(struct closure *cl)
@@ -324,12 +326,9 @@ static void read_dirty(struct closure *cl)
 
 		if (delay > 0 &&
 		    (KEY_START(&w->key) != dc->last_read ||
-		     jiffies_to_msecs(delay) > 50)) {
-			w->private = NULL;
-
-			closure_delay(&dc->writeback, delay);
-			continue_at(cl, read_dirty, dirty_wq);
-		}
+		     jiffies_to_msecs(delay) > 50))
+			while (delay)
+				delay = schedule_timeout(delay);
 
 		dc->last_read = KEY_OFFSET(&w->key);
 
@@ -354,15 +353,10 @@
 
 		pr_debug("%s", pkey(&w->key));
 
-		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+		down(&dc->in_flight);
+		closure_call(&io->cl, read_dirty_submit, NULL, cl);
 
 		delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
-		atomic_inc(&dc->in_flight);
-
-		if (!closure_wait_event(&dc->writeback_wait, cl,
-					atomic_read(&dc->in_flight) < 64))
-			continue_at(cl, read_dirty, dirty_wq);
 	}
 
 	if (0) {
@@ -372,11 +366,16 @@ err:
 		bch_keybuf_del(&dc->writeback_keys, w);
 	}
 
-	refill_dirty(cl);
+	/*
+	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+	 * freed) before refilling again
+	 */
+	continue_at(cl, refill_dirty, dirty_wq);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+	sema_init(&dc->in_flight, 64);
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);
 
@@ -406,7 +405,7 @@ void bch_writeback_exit(void)
 
 int __init bch_writeback_init(void)
 {
-	dirty_wq = create_singlethread_workqueue("bcache_writeback");
+	dirty_wq = create_workqueue("bcache_writeback");
 	if (!dirty_wq)
 		return -ENOMEM;
 