From c37511b863f36c1cc6e18440717fd4cc0e881b8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 15:39:55 -0700 Subject: bcache: Fix/revamp tracepoints The tracepoints were reworked to be more sensible, and fixed a null pointer deref in one of the tracepoints. Converted some of the pr_debug()s to tracepoints - this is partly a performance optimization; it used to be that with DEBUG or CONFIG_DYNAMIC_DEBUG pr_debug() was an empty macro; but at some point it was changed to an empty inline function. Some of the pr_debug() statements had rather expensive function calls as part of the arguments, so this code was getting run unnecessarily even on non debug kernels - in some fast paths, too. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 65 ++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..695469958c1e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -530,10 +530,9 @@ static void bch_insert_data_loop(struct closure *cl) if (KEY_CSUM(k)) bio_csum(n, k); - pr_debug("%s", pkey(k)); + trace_bcache_cache_insert(k); bch_keylist_push(&op->keys); - trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); @@ -784,11 +783,8 @@ static void request_read_error(struct closure *cl) int i; if (s->recoverable) { - /* The cache read failed, but we can retry from the backing - * device. - */ - pr_debug("recovering at sector %llu", - (uint64_t) s->orig_bio->bi_sector); + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); s->error = 0; bv = s->bio.bio.bi_io_vec; @@ -806,7 +802,6 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - trace_bcache_read_retry(&s->bio.bio); closure_bio_submit(&s->bio.bio, &s->cl, s->d); } @@ -899,6 +894,7 @@ static void request_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -969,7 +965,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; bio_get(s->op.cache_bio); - trace_bcache_cache_miss(s->orig_bio); closure_bio_submit(s->op.cache_bio, &s->cl, s->d); return ret; @@ -1040,15 +1035,15 @@ static void request_write(struct cached_dev *dc, struct search *s) if (should_writeback(dc, s->orig_bio)) s->writeback = true; + trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + if (!s->writeback) { s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, dc->disk.bio_split); - trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - trace_bcache_writeback(s->orig_bio); bch_writeback_add(dc, bio_sectors(bio)); } out: @@ -1058,7 +1053,6 @@ skip: s->op.skip = true; s->op.cache_bio = s->orig_bio; bio_get(s->op.cache_bio); - trace_bcache_write_skip(s->orig_bio); if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1088,9 +1082,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -int bch_get_congested(struct cache_set *c) +unsigned bch_get_congested(struct cache_set *c) { int i; + long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -1106,7 +1101,13 @@ int bch_get_congested(struct cache_set *c) i += CONGESTED_MAX; - return i <= 0 ? 1 : fract_exp_two(i, 6); + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; } static void add_sequential(struct task_struct *t) @@ -1126,10 +1127,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) { struct cache_set *c = s->op.c; struct bio *bio = &s->bio.bio; - - long rand; - int cutoff = bch_get_congested(c); unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -1147,17 +1146,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) goto skip; } - if (!cutoff) { - cutoff = dc->sequential_cutoff >> 9; + if (!congested && !dc->sequential_cutoff) + goto rescale; - if (!cutoff) - goto rescale; - - if (mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - } + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; if (dc->sequential_merge) { struct io *i; @@ -1192,12 +1188,19 @@ found: add_sequential(s->task); } - rand = get_random_int(); - cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; - if (cutoff <= (int) (max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9)) + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } rescale: bch_rescale_priorities(c, bio_sectors(bio)); -- cgit v1.2.2 From 279afbad4e54acbd61bf88a54a73af3bbfdeb5dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:21:07 -0700 Subject: bcache: Track dirty data by stripe To make background writeback aware of raid5/6 stripes, we first need to track the amount of dirty data within each stripe - we do this by breaking up the existing sectors_dirty into per stripe atomic_ts Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 695469958c1e..017c95fced8e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -1044,7 +1045,7 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - bch_writeback_add(dc, bio_sectors(bio)); + bch_writeback_add(dc); } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); -- cgit v1.2.2 From 72c270612bd33192fa836ad0f2939af1ca218292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:24:39 -0700 Subject: bcache: Write out full stripes Now that we're tracking dirty data per stripe, we can add two optimizations for raid5/6: * If a stripe is already dirty, force writes to that stripe to writeback mode - to help build up full stripes of dirty data * When flushing dirty data, preferentially write out full stripes first if there are any. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 017c95fced8e..17bd59704eba 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -22,8 +22,6 @@ #define CUTOFF_CACHE_ADD 95 #define CUTOFF_CACHE_READA 90 -#define CUTOFF_WRITEBACK 50 -#define CUTOFF_WRITEBACK_SYNC 75 struct kmem_cache *bch_search_cache; @@ -998,17 +996,6 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static bool should_writeback(struct cached_dev *dc, struct bio *bio) -{ - unsigned threshold = (bio->bi_rw & REQ_SYNC) - ? CUTOFF_WRITEBACK_SYNC - : CUTOFF_WRITEBACK; - - return !atomic_read(&dc->disk.detaching) && - cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && - dc->disk.c->gc_stats.in_use < threshold; -} - static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1030,12 +1017,16 @@ static void request_write(struct cached_dev *dc, struct search *s) if (bio->bi_rw & REQ_DISCARD) goto skip; + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->op.skip)) { + s->op.skip = false; + s->writeback = true; + } + if (s->op.skip) goto skip; - if (should_writeback(dc, s->orig_bio)) - s->writeback = true; - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); if (!s->writeback) { -- cgit v1.2.2 From e49c7c374e7aacd1f04ecbc21d9dbbeeea4a77d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Jun 2013 17:25:38 -0700 Subject: bcache: FUA fixes Journal writes need to be marked FUA, not just REQ_FLUSH. And btree node writes have... weird ordering requirements. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 17bd59704eba..bcdf1f782c3e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1035,8 +1035,19 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } else { - s->op.cache_bio = bio; bch_writeback_add(dc); + + if (s->op.flush_journal) { + /* Also need to send a flush to the backing device */ + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + bio->bi_size = 0; + bio->bi_vcnt = 0; + closure_bio_submit(bio, cl, s->d); + } else { + s->op.cache_bio = bio; + } } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); -- cgit v1.2.2 From 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2013 18:15:57 -0700 Subject: bcache: Use standard utility code Some of bcache's utility code has made it into the rest of the kernel, so drop the bcache versions. Bcache used to have a workaround for allocating from a bio set under generic_make_request() (if you allocated more than once, the bios you already allocated would get stuck on current->bio_list when you submitted, and you'd risk deadlock) - bcache would mask out __GFP_WAIT when allocating bios under generic_make_request() so that allocation could fail and it could retry from workqueue. But bio_alloc_bioset() has a workaround now, so we can drop this hack and the associated error handling. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 87 ++++++++++----------------------------------- 1 file changed, 18 insertions(+), 69 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index bcdf1f782c3e..b6e74d3c8faf 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -509,10 +509,6 @@ static void bch_insert_data_loop(struct closure *cl) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - if (!n) { - __bkey_put(op->c, k); - continue_at(cl, bch_insert_data_loop, bcache_wq); - } n->bi_end_io = bch_insert_data_endio; n->bi_private = cl; @@ -821,53 +817,13 @@ static void request_read_done(struct closure *cl) */ if (s->op.cache_bio) { - struct bio_vec *src, *dst; - unsigned src_offset, dst_offset, bytes; - void *dst_ptr; - bio_reset(s->op.cache_bio); s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; bch_bio_map(s->op.cache_bio, NULL); - src = bio_iovec(s->op.cache_bio); - dst = bio_iovec(s->cache_miss); - src_offset = src->bv_offset; - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - - while (1) { - if (dst_offset == dst->bv_offset + dst->bv_len) { - kunmap(dst->bv_page); - dst++; - if (dst == bio_iovec_idx(s->cache_miss, - s->cache_miss->bi_vcnt)) - break; - - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - } - - if (src_offset == src->bv_offset + src->bv_len) { - src++; - if (src == bio_iovec_idx(s->op.cache_bio, - s->op.cache_bio->bi_vcnt)) - BUG(); - - src_offset = src->bv_offset; - } - - bytes = min(dst->bv_offset + dst->bv_len - dst_offset, - src->bv_offset + src->bv_len - src_offset); - - memcpy(dst_ptr + dst_offset, - page_address(src->bv_page) + src_offset, - bytes); - - src_offset += bytes; - dst_offset += bytes; - } + bio_copy_data(s->cache_miss, s->op.cache_bio); bio_put(s->cache_miss); s->cache_miss = NULL; @@ -912,9 +868,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *miss; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!miss) - return -EAGAIN; - if (miss == bio) s->op.lookup_done = true; @@ -933,8 +886,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, reada = min(dc->readahead >> 9, sectors - bio_sectors(miss)); - if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); + if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) + reada = bdev_sectors(miss->bi_bdev) - + bio_end_sector(miss); } s->cache_bio_sectors = bio_sectors(miss) + reada; @@ -958,7 +912,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_put; bch_bio_map(s->op.cache_bio, NULL); - if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; s->cache_miss = miss; @@ -1002,7 +956,7 @@ static void request_write(struct cached_dev *dc, struct search *s) struct bio *bio = &s->bio.bio; struct bkey start, end; start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end(bio), 0); + end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); @@ -1176,7 +1130,7 @@ found: if (i->sequential + bio->bi_size > i->sequential) i->sequential += bio->bi_size; - i->last = bio_end(bio); + i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); s->task->sequential_io = i->sequential; @@ -1294,30 +1248,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { + struct bio_vec *bv; + int i; + /* Zero fill bio */ - while (bio->bi_idx != bio->bi_vcnt) { - struct bio_vec *bv = bio_iovec(bio); + bio_for_each_segment(bv, bio, i) { unsigned j = min(bv->bv_len >> 9, sectors); void *p = kmap(bv->bv_page); memset(p + bv->bv_offset, 0, j << 9); kunmap(bv->bv_page); - bv->bv_len -= j << 9; - bv->bv_offset += j << 9; - - if (bv->bv_len) - return 0; - - bio->bi_sector += j; - bio->bi_size -= j << 9; - - bio->bi_idx++; - sectors -= j; + sectors -= j; } - s->op.lookup_done = true; + bio_advance(bio, min(sectors << 9, bio->bi_size)); + + if (!bio->bi_size) + s->op.lookup_done = true; return 0; } @@ -1344,8 +1293,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, btree_read_async, NULL, cl); } else if (bio_has_data(bio) || s->op.skip) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), - &KEY(d->id, bio_end(bio), 0)); + &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio_end_sector(bio), 0)); s->writeback = true; s->op.cache_bio = bio; -- cgit v1.2.2 From 54d12f2b4fd0f218590d1490b41a18d0e2328a9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 18:44:40 -0700 Subject: bcache: Advertise that flushes are supported Whoops - bcache's flush/FUA was mostly correct, but flushes get filtered out unless we say we support them... Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/request.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index b6e74d3c8faf..786a1a4f74d8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -488,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl) bch_queue_gc(op->c); } + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + do { unsigned i; struct bkey *k; @@ -710,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; -- cgit v1.2.2