aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-09-24 17:42:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-24 17:42:03 -0400
commite288e931c12e77982aa582e2380cd072becfe488 (patch)
tree8d1b08d7b87bd5529da561e9602cc40698b08009
parentdb6aaf4d55f95dcb6b162c3a59b56eb1e85ccdfe (diff)
parentc0f04d88e46d14de51f4baebb6efafb7d59e9f96 (diff)
Merge branch 'bcache' (bcache fixes from Kent Overstreet)
Merge bcache fixes from Kent Overstreet: "There's fixes for _three_ different data corruption bugs, all of which were found by users hitting them in the wild. The first one isn't bcache specific - in 3.11 bcache was switched to the bio_copy_data in fs/bio.c, and that's when the bug in that code was discovered, but it's also used by raid1 and pktcdvd. (That was my code too, so the bug's doubly embarassing given that it was or should've been just a cut and paste from bcache code. Dunno what happened there). Most of these (all the non data corruption bugs, actually) were ready before the merge window and have been sitting in Jens' tree, but I don't know what's been up with him lately..." * emailed patches from Kent Overstreet <kmo@daterainc.com>: bcache: Fix flushes in writeback mode bcache: Fix for handling overlapping extents when reading in a btree node bcache: Fix a shrinker deadlock bcache: Fix a dumb CPU spinning bug in writeback bcache: Fix a flush/fua performance bug bcache: Fix a writeback performance regression bcache: Correct printf()-style format length modifier bcache: Fix for when no journal entries are found bcache: Strip endline when writing the label through sysfs bcache: Fix a dumb journal discard bug block: Fix bio_copy_data()
-rw-r--r--drivers/md/bcache/bcache.h7
-rw-r--r--drivers/md/bcache/bset.c39
-rw-r--r--drivers/md/bcache/btree.c4
-rw-r--r--drivers/md/bcache/journal.c33
-rw-r--r--drivers/md/bcache/request.c15
-rw-r--r--drivers/md/bcache/sysfs.c9
-rw-r--r--drivers/md/bcache/util.c11
-rw-r--r--drivers/md/bcache/util.h12
-rw-r--r--drivers/md/bcache/writeback.c42
-rw-r--r--fs/bio.c4
10 files changed, 110 insertions, 66 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b39f6f0b45f2..0f12382aa35d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -498,7 +498,7 @@ struct cached_dev {
498 */ 498 */
499 atomic_t has_dirty; 499 atomic_t has_dirty;
500 500
501 struct ratelimit writeback_rate; 501 struct bch_ratelimit writeback_rate;
502 struct delayed_work writeback_rate_update; 502 struct delayed_work writeback_rate_update;
503 503
504 /* 504 /*
@@ -507,10 +507,9 @@ struct cached_dev {
507 */ 507 */
508 sector_t last_read; 508 sector_t last_read;
509 509
510 /* Number of writeback bios in flight */ 510 /* Limit number of writeback bios in flight */
511 atomic_t in_flight; 511 struct semaphore in_flight;
512 struct closure_with_timer writeback; 512 struct closure_with_timer writeback;
513 struct closure_waitlist writeback_wait;
514 513
515 struct keybuf writeback_keys; 514 struct keybuf writeback_keys;
516 515
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8010eed06a51..22d1ae72c282 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
926 926
927/* Mergesort */ 927/* Mergesort */
928 928
929static void sort_key_next(struct btree_iter *iter,
930 struct btree_iter_set *i)
931{
932 i->k = bkey_next(i->k);
933
934 if (i->k == i->end)
935 *i = iter->data[--iter->used];
936}
937
929static void btree_sort_fixup(struct btree_iter *iter) 938static void btree_sort_fixup(struct btree_iter *iter)
930{ 939{
931 while (iter->used > 1) { 940 while (iter->used > 1) {
932 struct btree_iter_set *top = iter->data, *i = top + 1; 941 struct btree_iter_set *top = iter->data, *i = top + 1;
933 struct bkey *k;
934 942
935 if (iter->used > 2 && 943 if (iter->used > 2 &&
936 btree_iter_cmp(i[0], i[1])) 944 btree_iter_cmp(i[0], i[1]))
937 i++; 945 i++;
938 946
939 for (k = i->k; 947 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
940 k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
941 k = bkey_next(k))
942 if (top->k > i->k)
943 __bch_cut_front(top->k, k);
944 else if (KEY_SIZE(k))
945 bch_cut_back(&START_KEY(k), top->k);
946
947 if (top->k < i->k || k == i->k)
948 break; 948 break;
949 949
950 heap_sift(iter, i - top, btree_iter_cmp); 950 if (!KEY_SIZE(i->k)) {
951 sort_key_next(iter, i);
952 heap_sift(iter, i - top, btree_iter_cmp);
953 continue;
954 }
955
956 if (top->k > i->k) {
957 if (bkey_cmp(top->k, i->k) >= 0)
958 sort_key_next(iter, i);
959 else
960 bch_cut_front(top->k, i->k);
961
962 heap_sift(iter, i - top, btree_iter_cmp);
963 } else {
964 /* can't happen because of comparison func */
965 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
966 bch_cut_back(&START_KEY(i->k), top->k);
967 }
951 } 968 }
952} 969}
953 970
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f9764e61978b..f42fc7ed9cd6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b)
255 255
256 return; 256 return;
257err: 257err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu", 258 bch_cache_set_error(b->c, "io error reading bucket %zu",
259 PTR_BUCKET_NR(b->c, &b->key, 0)); 259 PTR_BUCKET_NR(b->c, &b->key, 0));
260} 260}
261 261
@@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
612 return SHRINK_STOP; 612 return SHRINK_STOP;
613 613
614 /* Return -1 if we can't do anything right now */ 614 /* Return -1 if we can't do anything right now */
615 if (sc->gfp_mask & __GFP_WAIT) 615 if (sc->gfp_mask & __GFP_IO)
616 mutex_lock(&c->bucket_lock); 616 mutex_lock(&c->bucket_lock);
617 else if (!mutex_trylock(&c->bucket_lock)) 617 else if (!mutex_trylock(&c->bucket_lock))
618 return -1; 618 return -1;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ba95ab84b2be..8435f81e5d85 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
153 bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); 153 bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
154 pr_debug("%u journal buckets", ca->sb.njournal_buckets); 154 pr_debug("%u journal buckets", ca->sb.njournal_buckets);
155 155
156 /* Read journal buckets ordered by golden ratio hash to quickly 156 /*
157 * Read journal buckets ordered by golden ratio hash to quickly
157 * find a sequence of buckets with valid journal entries 158 * find a sequence of buckets with valid journal entries
158 */ 159 */
159 for (i = 0; i < ca->sb.njournal_buckets; i++) { 160 for (i = 0; i < ca->sb.njournal_buckets; i++) {
@@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
166 goto bsearch; 167 goto bsearch;
167 } 168 }
168 169
169 /* If that fails, check all the buckets we haven't checked 170 /*
171 * If that fails, check all the buckets we haven't checked
170 * already 172 * already
171 */ 173 */
172 pr_debug("falling back to linear search"); 174 pr_debug("falling back to linear search");
173 175
174 for (l = 0; l < ca->sb.njournal_buckets; l++) { 176 for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
175 if (test_bit(l, bitmap)) 177 l < ca->sb.njournal_buckets;
176 continue; 178 l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
177
178 if (read_bucket(l)) 179 if (read_bucket(l))
179 goto bsearch; 180 goto bsearch;
180 } 181
182 if (list_empty(list))
183 continue;
181bsearch: 184bsearch:
182 /* Binary search */ 185 /* Binary search */
183 m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); 186 m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
@@ -197,10 +200,12 @@ bsearch:
197 r = m; 200 r = m;
198 } 201 }
199 202
200 /* Read buckets in reverse order until we stop finding more 203 /*
204 * Read buckets in reverse order until we stop finding more
201 * journal entries 205 * journal entries
202 */ 206 */
203 pr_debug("finishing up"); 207 pr_debug("finishing up: m %u njournal_buckets %u",
208 m, ca->sb.njournal_buckets);
204 l = m; 209 l = m;
205 210
206 while (1) { 211 while (1) {
@@ -228,9 +233,10 @@ bsearch:
228 } 233 }
229 } 234 }
230 235
231 c->journal.seq = list_entry(list->prev, 236 if (!list_empty(list))
232 struct journal_replay, 237 c->journal.seq = list_entry(list->prev,
233 list)->j.seq; 238 struct journal_replay,
239 list)->j.seq;
234 240
235 return 0; 241 return 0;
236#undef read_bucket 242#undef read_bucket
@@ -428,7 +434,7 @@ static void do_journal_discard(struct cache *ca)
428 return; 434 return;
429 } 435 }
430 436
431 switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { 437 switch (atomic_read(&ja->discard_in_flight)) {
432 case DISCARD_IN_FLIGHT: 438 case DISCARD_IN_FLIGHT:
433 return; 439 return;
434 440
@@ -689,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
689 if (cl) 695 if (cl)
690 BUG_ON(!closure_wait(&w->wait, cl)); 696 BUG_ON(!closure_wait(&w->wait, cl));
691 697
698 closure_flush(&c->journal.io);
692 __journal_try_write(c, true); 699 __journal_try_write(c, true);
693 } 700 }
694} 701}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 786a1a4f74d8..71eb233b9ace 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s)
997 } else { 997 } else {
998 bch_writeback_add(dc); 998 bch_writeback_add(dc);
999 999
1000 if (s->op.flush_journal) { 1000 if (bio->bi_rw & REQ_FLUSH) {
1001 /* Also need to send a flush to the backing device */ 1001 /* Also need to send a flush to the backing device */
1002 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 1002 struct bio *flush = bio_alloc_bioset(0, GFP_NOIO,
1003 dc->disk.bio_split); 1003 dc->disk.bio_split);
1004 1004
1005 bio->bi_size = 0; 1005 flush->bi_rw = WRITE_FLUSH;
1006 bio->bi_vcnt = 0; 1006 flush->bi_bdev = bio->bi_bdev;
1007 closure_bio_submit(bio, cl, s->d); 1007 flush->bi_end_io = request_endio;
1008 flush->bi_private = cl;
1009
1010 closure_bio_submit(flush, cl, s->d);
1008 } else { 1011 } else {
1009 s->op.cache_bio = bio; 1012 s->op.cache_bio = bio;
1010 } 1013 }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4fe6ab2fbe2e..924dcfdae111 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -223,8 +223,13 @@ STORE(__cached_dev)
223 } 223 }
224 224
225 if (attr == &sysfs_label) { 225 if (attr == &sysfs_label) {
226 /* note: endlines are preserved */ 226 if (size > SB_LABEL_SIZE)
227 memcpy(dc->sb.label, buf, SB_LABEL_SIZE); 227 return -EINVAL;
228 memcpy(dc->sb.label, buf, size);
229 if (size < SB_LABEL_SIZE)
230 dc->sb.label[size] = '\0';
231 if (size && dc->sb.label[size - 1] == '\n')
232 dc->sb.label[size - 1] = '\0';
228 bch_write_bdev_super(dc, NULL); 233 bch_write_bdev_super(dc, NULL);
229 if (dc->disk.c) { 234 if (dc->disk.c) {
230 memcpy(dc->disk.c->uuids[dc->disk.id].label, 235 memcpy(dc->disk.c->uuids[dc->disk.id].label,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 98eb81159a22..420dad545c7d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
190 stats->last = now ?: 1; 190 stats->last = now ?: 1;
191} 191}
192 192
193unsigned bch_next_delay(struct ratelimit *d, uint64_t done) 193/**
194 * bch_next_delay() - increment @d by the amount of work done, and return how
195 * long to delay until the next time to do some work.
196 *
197 * @d - the struct bch_ratelimit to update
198 * @done - the amount of work done, in arbitrary units
199 *
200 * Returns the amount of time to delay by, in jiffies
201 */
202uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
194{ 203{
195 uint64_t now = local_clock(); 204 uint64_t now = local_clock();
196 205
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1ae2a73ad85f..ea345c6896f4 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
450 (ewma) >> factor; \ 450 (ewma) >> factor; \
451}) 451})
452 452
453struct ratelimit { 453struct bch_ratelimit {
454 /* Next time we want to do some work, in nanoseconds */
454 uint64_t next; 455 uint64_t next;
456
457 /*
458 * Rate at which we want to do work, in units per nanosecond
459 * The units here correspond to the units passed to bch_next_delay()
460 */
455 unsigned rate; 461 unsigned rate;
456}; 462};
457 463
458static inline void ratelimit_reset(struct ratelimit *d) 464static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
459{ 465{
460 d->next = local_clock(); 466 d->next = local_clock();
461} 467}
462 468
463unsigned bch_next_delay(struct ratelimit *d, uint64_t done); 469uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
464 470
465#define __DIV_SAFE(n, d, zero) \ 471#define __DIV_SAFE(n, d, zero) \
466({ \ 472({ \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 22cbff551628..ba3ee48320f2 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)
94 94
95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96{ 96{
97 uint64_t ret;
98
97 if (atomic_read(&dc->disk.detaching) || 99 if (atomic_read(&dc->disk.detaching) ||
98 !dc->writeback_percent) 100 !dc->writeback_percent)
99 return 0; 101 return 0;
100 102
101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 103 ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
104
105 return min_t(uint64_t, ret, HZ);
102} 106}
103 107
104/* Background writeback */ 108/* Background writeback */
@@ -208,7 +212,7 @@ normal_refill:
208 212
209 up_write(&dc->writeback_lock); 213 up_write(&dc->writeback_lock);
210 214
211 ratelimit_reset(&dc->writeback_rate); 215 bch_ratelimit_reset(&dc->writeback_rate);
212 216
213 /* Punt to workqueue only so we don't recurse and blow the stack */ 217 /* Punt to workqueue only so we don't recurse and blow the stack */
214 continue_at(cl, read_dirty, dirty_wq); 218 continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
318 } 322 }
319 323
320 bch_keybuf_del(&dc->writeback_keys, w); 324 bch_keybuf_del(&dc->writeback_keys, w);
321 atomic_dec_bug(&dc->in_flight); 325 up(&dc->in_flight);
322
323 closure_wake_up(&dc->writeback_wait);
324 326
325 closure_return_with_destructor(cl, dirty_io_destructor); 327 closure_return_with_destructor(cl, dirty_io_destructor);
326} 328}
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)
349 351
350 closure_bio_submit(&io->bio, cl, &io->dc->disk); 352 closure_bio_submit(&io->bio, cl, &io->dc->disk);
351 353
352 continue_at(cl, write_dirty_finish, dirty_wq); 354 continue_at(cl, write_dirty_finish, system_wq);
353} 355}
354 356
355static void read_dirty_endio(struct bio *bio, int error) 357static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
369 371
370 closure_bio_submit(&io->bio, cl, &io->dc->disk); 372 closure_bio_submit(&io->bio, cl, &io->dc->disk);
371 373
372 continue_at(cl, write_dirty, dirty_wq); 374 continue_at(cl, write_dirty, system_wq);
373} 375}
374 376
375static void read_dirty(struct closure *cl) 377static void read_dirty(struct closure *cl)
@@ -394,12 +396,8 @@ static void read_dirty(struct closure *cl)
394 396
395 if (delay > 0 && 397 if (delay > 0 &&
396 (KEY_START(&w->key) != dc->last_read || 398 (KEY_START(&w->key) != dc->last_read ||
397 jiffies_to_msecs(delay) > 50)) { 399 jiffies_to_msecs(delay) > 50))
398 w->private = NULL; 400 delay = schedule_timeout_uninterruptible(delay);
399
400 closure_delay(&dc->writeback, delay);
401 continue_at(cl, read_dirty, dirty_wq);
402 }
403 401
404 dc->last_read = KEY_OFFSET(&w->key); 402 dc->last_read = KEY_OFFSET(&w->key);
405 403
@@ -424,15 +422,10 @@ static void read_dirty(struct closure *cl)
424 422
425 trace_bcache_writeback(&w->key); 423 trace_bcache_writeback(&w->key);
426 424
427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 425 down(&dc->in_flight);
426 closure_call(&io->cl, read_dirty_submit, NULL, cl);
428 427
429 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 428 delay = writeback_delay(dc, KEY_SIZE(&w->key));
430
431 atomic_inc(&dc->in_flight);
432
433 if (!closure_wait_event(&dc->writeback_wait, cl,
434 atomic_read(&dc->in_flight) < 64))
435 continue_at(cl, read_dirty, dirty_wq);
436 } 429 }
437 430
438 if (0) { 431 if (0) {
@@ -442,7 +435,11 @@ err:
442 bch_keybuf_del(&dc->writeback_keys, w); 435 bch_keybuf_del(&dc->writeback_keys, w);
443 } 436 }
444 437
445 refill_dirty(cl); 438 /*
439 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
440 * freed) before refilling again
441 */
442 continue_at(cl, refill_dirty, dirty_wq);
446} 443}
447 444
448/* Init */ 445/* Init */
@@ -484,6 +481,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
484 481
485void bch_cached_dev_writeback_init(struct cached_dev *dc) 482void bch_cached_dev_writeback_init(struct cached_dev *dc)
486{ 483{
484 sema_init(&dc->in_flight, 64);
487 closure_init_unlocked(&dc->writeback); 485 closure_init_unlocked(&dc->writeback);
488 init_rwsem(&dc->writeback_lock); 486 init_rwsem(&dc->writeback_lock);
489 487
@@ -513,7 +511,7 @@ void bch_writeback_exit(void)
513 511
514int __init bch_writeback_init(void) 512int __init bch_writeback_init(void)
515{ 513{
516 dirty_wq = create_singlethread_workqueue("bcache_writeback"); 514 dirty_wq = create_workqueue("bcache_writeback");
517 if (!dirty_wq) 515 if (!dirty_wq)
518 return -ENOMEM; 516 return -ENOMEM;
519 517
diff --git a/fs/bio.c b/fs/bio.c
index b3b20ed9510e..ea5035da4d9a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
917 src_p = kmap_atomic(src_bv->bv_page); 917 src_p = kmap_atomic(src_bv->bv_page);
918 dst_p = kmap_atomic(dst_bv->bv_page); 918 dst_p = kmap_atomic(dst_bv->bv_page);
919 919
920 memcpy(dst_p + dst_bv->bv_offset, 920 memcpy(dst_p + dst_offset,
921 src_p + src_bv->bv_offset, 921 src_p + src_offset,
922 bytes); 922 bytes);
923 923
924 kunmap_atomic(dst_p); 924 kunmap_atomic(dst_p);