Diffstat (limited to 'drivers/md')

 drivers/md/bcache/alloc.c     |  46
 drivers/md/bcache/bcache.h    |  61
 drivers/md/bcache/bset.c      |  56
 drivers/md/bcache/bset.h      |   4
 drivers/md/bcache/btree.c     | 451
 drivers/md/bcache/btree.h     |  35
 drivers/md/bcache/closure.c   |   6
 drivers/md/bcache/debug.c     | 178
 drivers/md/bcache/debug.h     |  11
 drivers/md/bcache/io.c        |  68
 drivers/md/bcache/journal.c   |  25
 drivers/md/bcache/movinggc.c  |  24
 drivers/md/bcache/request.c   | 197
 drivers/md/bcache/request.h   |   2
 drivers/md/bcache/super.c     | 171
 drivers/md/bcache/sysfs.c     |  68
 drivers/md/bcache/trace.c     |  47
 drivers/md/bcache/util.c      |  17
 drivers/md/bcache/util.h      |   6
 drivers/md/bcache/writeback.c | 133
 drivers/md/bcache/writeback.h |  64
 drivers/md/md.c               |  14
 drivers/md/raid1.c            |  53
 drivers/md/raid10.c           |  19
 drivers/md/raid5.c            |  15
 drivers/md/raid5.h            |   1

 26 files changed, 926 insertions(+), 846 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..e45f5575fd4d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,10 @@
 #include "bcache.h"
 #include "btree.h"
 
+#include <linux/freezer.h>
+#include <linux/kthread.h>
 #include <linux/random.h>
+#include <trace/events/bcache.h>
 
 #define MAX_IN_FLIGHT_DISCARDS		8U
 
@@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w)
 	mutex_unlock(&ca->set->bucket_lock);
 
 	closure_wake_up(&ca->set->bucket_wait);
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);
 
 	closure_put(&ca->set->cl);
 }
@@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca)
 		break;
 	}
 
-	pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
-		 fifo_used(&ca->free), ca->free.size,
-		 fifo_used(&ca->free_inc), ca->free_inc.size,
-		 fifo_used(&ca->unused), ca->unused.size);
+	trace_bcache_alloc_invalidate(ca);
 }
 
 #define allocator_wait(ca, cond)					\
 do {									\
-	DEFINE_WAIT(__wait);						\
-									\
 	while (1) {							\
-		prepare_to_wait(&ca->set->alloc_wait,			\
-				&__wait, TASK_INTERRUPTIBLE);		\
+		set_current_state(TASK_INTERRUPTIBLE);			\
 		if (cond)						\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
-		if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) {	\
-			finish_wait(&ca->set->alloc_wait, &__wait);	\
-			closure_return(cl);				\
-		}							\
+		if (kthread_should_stop())				\
+			return 0;					\
 									\
+		try_to_freeze();					\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\
 	}								\
-									\
-	finish_wait(&ca->set->alloc_wait, &__wait);			\
+	__set_current_state(TASK_RUNNING);				\
 } while (0)
 
-void bch_allocator_thread(struct closure *cl)
+static int bch_allocator_thread(void *arg)
 {
-	struct cache *ca = container_of(cl, struct cache, alloc);
+	struct cache *ca = arg;
 
 	mutex_lock(&ca->set->bucket_lock);
 
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
 {
 	long r = -1;
 again:
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);
 
 	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
 	    fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
 		return r;
 	}
 
-	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
-		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
-		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+	trace_bcache_alloc_fail(ca);
 
 	if (cl) {
 		closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 
 /* Init */
 
+int bch_cache_allocator_start(struct cache *ca)
+{
+	struct task_struct *k = kthread_run(bch_allocator_thread,
+					    ca, "bcache_allocator");
+	if (IS_ERR(k))
+		return PTR_ERR(k);
+
+	ca->alloc_thread = k;
+	return 0;
+}
+
 void bch_cache_allocator_exit(struct cache *ca)
 {
 	struct discard *d;
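The allocator_wait() change above replaces the old closure/waitqueue scheme with the standard open-coded kthread sleep loop: set TASK_INTERRUPTIBLE first, then re-check the condition, honour kthread_should_stop() and the freezer, and only then schedule(). A minimal self-contained sketch of the same pattern outside bcache follows; needs_work() and do_work() are hypothetical placeholders, not bcache functions, and this is an illustration of the wait ordering rather than the actual allocator.

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

/* Sketch only: mirrors the sleep/wake ordering used by allocator_wait(). */
static int example_thread(void *arg)
{
	while (1) {
		/* State first, then checks, so a concurrent wake-up or
		 * kthread_stop() cannot be lost before schedule(). */
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		if (!needs_work(arg)) {		/* hypothetical condition */
			try_to_freeze();
			schedule();
			continue;
		}

		__set_current_state(TASK_RUNNING);
		do_work(arg);			/* hypothetical work item */
	}
}

Such a thread would be created with kthread_run(example_thread, ca, "bcache_allocator") and kicked from other contexts with wake_up_process(), which is exactly what bch_cache_allocator_start() and bch_bucket_alloc() now do.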
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..b39f6f0b45f2 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
 
 #include <linux/bio.h>
-#include <linux/blktrace_api.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
 typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
 
 struct keybuf {
-	keybuf_pred_fn		*key_predicate;
-
 	struct bkey		last_scanned;
 	spinlock_t		lock;
 
@@ -437,9 +434,12 @@ struct bcache_device {
 
 	/* If nonzero, we're detaching/unregistering from cache set */
 	atomic_t		detaching;
+	int			flush_done;
+
+	uint64_t		nr_stripes;
+	unsigned		stripe_size_bits;
+	atomic_t		*stripe_sectors_dirty;
 
-	atomic_long_t		sectors_dirty;
-	unsigned long		sectors_dirty_gc;
 	unsigned long		sectors_dirty_last;
 	long			sectors_dirty_derivative;
 
@@ -531,6 +531,7 @@ struct cached_dev {
 	unsigned		sequential_merge:1;
 	unsigned		verify:1;
 
+	unsigned		partial_stripes_expensive:1;
 	unsigned		writeback_metadata:1;
 	unsigned		writeback_running:1;
 	unsigned char		writeback_percent;
@@ -565,8 +566,7 @@ struct cache {
 
 	unsigned		watermark[WATERMARK_MAX];
 
-	struct closure		alloc;
-	struct workqueue_struct	*alloc_workqueue;
+	struct task_struct	*alloc_thread;
 
 	struct closure		prio;
 	struct prio_set		*disk_buckets;
@@ -664,13 +664,9 @@ struct gc_stat {
  * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
  * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
  * flushing dirty data).
- *
- * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
- * the allocation thread.
  */
 #define CACHE_SET_UNREGISTERING		0
 #define	CACHE_SET_STOPPING		1
-#define	CACHE_SET_STOPPING_2		2
 
 struct cache_set {
 	struct closure		cl;
@@ -703,9 +699,6 @@ struct cache_set {
 	/* For the btree cache */
 	struct shrinker		shrink;
 
-	/* For the allocator itself */
-	wait_queue_head_t	alloc_wait;
-
 	/* For the btree cache and anything allocation related */
 	struct mutex		bucket_lock;
 
@@ -823,10 +816,9 @@ struct cache_set {
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - this is a single element mempool for btree_read_work()
+	 * on the stack - have to dynamically allocate them
 	 */
-	struct mutex		fill_lock;
-	struct btree_iter	*fill_iter;
+	mempool_t		*fill_iter;
 
 	/*
 	 * btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +826,7 @@ struct cache_set {
 	 */
 	struct mutex		sort_lock;
 	struct bset		*sort;
+	unsigned		sort_crit_factor;
 
 	/* List of buckets we're currently writing data to */
 	struct list_head	data_buckets;
@@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void)
 	return local_clock() >> 10;
 }
 
-#define MAX_BSETS		4U
-
 #define BTREE_PRIO		USHRT_MAX
 #define INITIAL_PRIO		32768
 
@@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
 		atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 }
 
-/* Blktrace macros */
-
-#define blktrace_msg(c, fmt, ...)					\
-do {									\
-	struct request_queue *q = bdev_get_queue(c->bdev);		\
-	if (q)								\
-		blk_add_trace_msg(q, fmt, ##__VA_ARGS__);		\
-} while (0)
-
-#define blktrace_msg_all(s, fmt, ...)					\
-do {									\
-	struct cache *_c;						\
-	unsigned i;							\
-	for_each_cache(_c, (s), i)					\
-		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
-} while (0)
-
 static inline void cached_dev_put(struct cached_dev *dc)
 {
 	if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
 	static struct kobj_attribute ksysfs_##n =			\
 		__ATTR(n, S_IWUSR|S_IRUSR, show, store)
 
-/* Forward declarations */
+static inline void wake_up_allocators(struct cache_set *c)
+{
+	struct cache *ca;
+	unsigned i;
+
+	for_each_cache(ca, c, i)
+		wake_up_process(ca->alloc_thread);
+}
 
-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *, unsigned);
+/* Forward declarations */
 
 void bch_count_io_errors(struct cache *, int, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
 bool bch_bucket_add_unused(struct cache *, struct bucket *);
-void bch_allocator_thread(struct closure *);
 
 long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *);
 struct cache_set *bch_cache_set_alloc(struct cache_sb *);
 void bch_btree_cache_free(struct cache_set *);
 int bch_btree_cache_alloc(struct cache_set *);
-void bch_cached_dev_writeback_init(struct cached_dev *);
 void bch_moving_init_cache_set(struct cache_set *);
 
+int bch_cache_allocator_start(struct cache *ca);
 void bch_cache_allocator_exit(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);
 
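In the cache_set changes above, the fill_lock/fill_iter pair becomes a mempool_t, so a btree node read can grab an iterator that is too big for the stack without serializing every read behind one mutex. Roughly how such a pool is created and used with the standard mempool API is sketched below; the pool name, the helper names and the worst-case size are assumptions for illustration, not the actual super.c initialization.

#include <linux/mempool.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static mempool_t *iter_pool;	/* analogous to cache_set->fill_iter */

/* iter_size would cover struct btree_iter plus the worst-case number of
 * btree_iter_set entries per bucket (an assumption for this sketch). */
static int iter_pool_init(size_t iter_size)
{
	/* kmalloc-backed pool with one pre-allocated reserve element */
	iter_pool = mempool_create_kmalloc_pool(1, iter_size);
	return iter_pool ? 0 : -ENOMEM;
}

static void iter_pool_use(void)
{
	void *iter = mempool_alloc(iter_pool, GFP_NOWAIT);

	if (!iter)
		return;	/* GFP_NOWAIT may fail while the reserve is in use */

	/* ... fill and walk the iterator ... */

	mempool_free(iter, iter_pool);
}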
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 1d27d3af3251..8010eed06a51 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
 {
 	unsigned i;
+	char buf[80];
 
 	if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
 		goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
 
 	return false;
 bad:
-	cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }
 
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 #ifdef CONFIG_BCACHE_EDEBUG
 bug:
 	mutex_unlock(&b->c->bucket_lock);
-	btree_bug(b,
+
+	{
+		char buf[80];
+
+		bch_bkey_to_text(buf, sizeof(buf), k);
+		btree_bug(b,
 "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
-	  pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
 	  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+	}
 	return true;
 #endif
 }
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
 	new->sets->size = 0;
 }
 
+#define SORT_CRIT	(4096 / sizeof(uint64_t))
+
 void bch_btree_sort_lazy(struct btree *b)
 {
-	if (b->nsets) {
-		unsigned i, j, keys = 0, total;
+	unsigned crit = SORT_CRIT;
+	int i;
 
-		for (i = 0; i <= b->nsets; i++)
-			keys += b->sets[i].data->keys;
-
-		total = keys;
+	/* Don't sort if nothing to do */
+	if (!b->nsets)
+		goto out;
 
-		for (j = 0; j < b->nsets; j++) {
-			if (keys * 2 < total ||
-			    keys < 1000) {
-				bch_btree_sort_partial(b, j);
-				return;
-			}
+	/* If not a leaf node, always sort */
+	if (b->level) {
+		bch_btree_sort(b);
+		return;
+	}
 
-			keys -= b->sets[j].data->keys;
-		}
+	for (i = b->nsets - 1; i >= 0; --i) {
+		crit *= b->c->sort_crit_factor;
 
-		/* Must sort if b->nsets == 3 or we'll overflow */
-		if (b->nsets >= (MAX_BSETS - 1) - b->level) {
-			bch_btree_sort(b);
+		if (b->sets[i].data->keys < crit) {
+			bch_btree_sort_partial(b, i);
 			return;
 		}
 	}
 
+	/* Sort if we'd overflow */
+	if (b->nsets + 1 == MAX_BSETS) {
+		bch_btree_sort(b);
+		return;
+	}
+
+out:
 	bset_build_written_tree(b);
 }
 
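The rewritten bch_btree_sort_lazy() above walks the in-memory bsets from newest to oldest and merges from set i downward as soon as that set holds fewer keys than a threshold that grows geometrically (SORT_CRIT scaled by sort_crit_factor at each step), with a full sort for non-leaf nodes or when another bset would overflow MAX_BSETS. A small userspace illustration of just that selection rule follows; bcache's structures are replaced by a plain array and the factor value is an assumption.

#include <stdio.h>

#define SORT_CRIT	(4096 / sizeof(unsigned long long))	/* 512 keys on 64-bit */

/* Return the set index to resort from, or -1 to leave the node as is. */
static int pick_sort_start(const unsigned *keys_per_set, int nsets,
			   unsigned sort_crit_factor)
{
	unsigned crit = SORT_CRIT;
	int i;

	for (i = nsets - 1; i >= 0; --i) {
		crit *= sort_crit_factor;
		if (keys_per_set[i] < crit)
			return i;	/* merge sets i..nsets together */
	}

	return -1;
}

int main(void)
{
	unsigned keys[] = { 40000, 9000, 700 };	/* example bset sizes */

	/* Prints "resort from set 2": only the newest set is small enough. */
	printf("resort from set %d\n",
	       pick_sort_start(keys, 3, 3 /* assumed sort_crit_factor */));
	return 0;
}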
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff41546..ae115a253d73 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H
 
+#include <linux/slab.h>
+
 /*
  * BKEYS:
  *
@@ -142,6 +144,8 @@
 
 /* Btree key comparison/iteration */
 
+#define MAX_BSETS		4U
+
 struct btree_iter {
 	size_t size, used;
 	struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..ee372884c405 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
+#include "writeback.h"
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 	return crc ^ 0xffffffffffffffffULL;
 }
 
-static void btree_bio_endio(struct bio *bio, int error)
+static void bch_btree_node_read_done(struct btree *b)
 {
-	struct closure *cl = bio->bi_private;
-	struct btree *b = container_of(cl, struct btree, io.cl);
-
-	if (error)
-		set_btree_node_io_error(b);
-
-	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
-				 ? "writing btree" : "reading btree");
-	closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
-	BUG_ON(b->bio);
-	b->bio = bch_bbio_alloc(b->c);
-
-	b->bio->bi_end_io	= btree_bio_endio;
-	b->bio->bi_private	= &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
-	struct btree *b = container_of(cl, struct btree, io.cl);
-	struct bset *i = b->sets[0].data;
-	struct btree_iter *iter = b->c->fill_iter;
 	const char *err = "bad btree header";
-	BUG_ON(b->nsets || b->written);
-
-	bch_bbio_free(b->bio, b->c);
-	b->bio = NULL;
+	struct bset *i = b->sets[0].data;
+	struct btree_iter *iter;
 
-	mutex_lock(&b->c->fill_lock);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
-	if (btree_node_io_error(b) ||
-	    !i->seq)
+	if (!i->seq)
 		goto err;
 
 	for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(b);
 out:
-
-	mutex_unlock(&b->c->fill_lock);
-
-	spin_lock(&b->c->btree_read_time_lock);
-	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
-	spin_unlock(&b->c->btree_read_time_lock);
-
-	smp_wmb(); /* read_done is our write lock */
-	set_btree_node_read_done(b);
-
-	closure_return(cl);
+	mempool_free(iter, b->c->fill_iter);
+	return;
 err:
 	set_btree_node_io_error(b);
 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
 		goto out;
 }
 
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}
+
+void bch_btree_node_read(struct btree *b)
 {
-	BUG_ON(b->nsets || b->written);
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct bio *bio;
+
+	trace_bcache_btree_read(b);
+
+	closure_init_stack(&cl);
+
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_rw	= REQ_META|READ_SYNC;
+	bio->bi_size	= KEY_SIZE(&b->key) << 9;
+	bio->bi_end_io	= btree_node_read_endio;
+	bio->bi_private	= &cl;
+
+	bch_bio_map(bio, b->sets[0].data);
+
+	bch_submit_bbio(bio, b->c, &b->key, 0);
+	closure_sync(&cl);
 
-	if (!closure_trylock(&b->io.cl, &b->c->cl))
-		BUG();
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_btree_node_io_error(b);
 
-	b->io_start_time = local_clock();
+	bch_bbio_free(bio, b->c);
 
-	btree_bio_init(b);
-	b->bio->bi_rw	= REQ_META|READ_SYNC;
-	b->bio->bi_size	= KEY_SIZE(&b->key) << 9;
+	if (btree_node_io_error(b))
+		goto err;
 
-	bch_bio_map(b->bio, b->sets[0].data);
+	bch_btree_node_read_done(b);
 
-	pr_debug("%s", pbtree(b));
-	trace_bcache_btree_read(b->bio);
-	bch_submit_bbio(b->bio, b->c, &b->key, 0);
+	spin_lock(&b->c->btree_read_time_lock);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+	spin_unlock(&b->c->btree_read_time_lock);
 
-	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+	return;
+err:
+	bch_cache_set_error(b->c, "io error reading bucket %lu",
+			    PTR_BUCKET_NR(b->c, &b->key, 0));
 }
 
 static void btree_complete_write(struct btree *b, struct btree_write *w)
 {
 	if (w->prio_blocked &&
 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
+		wake_up_allocators(b->c);
 
 	if (w->journal) {
 		atomic_dec_bug(w->journal);
 		__closure_wake_up(&b->c->journal.wait);
 	}
 
-	if (w->owner)
-		closure_put(w->owner);
-
 	w->prio_blocked	= 0;
 	w->journal	= NULL;
-	w->owner	= NULL;
 }
 
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
 	closure_return(cl);
 }
 
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
 	__bio_for_each_segment(bv, b->bio, n, 0)
 		__free_page(bv->bv_page);
 
-	__btree_write_done(cl);
+	__btree_node_write_done(cl);
 }
 
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct btree *b = container_of(cl, struct btree, io.cl);
+
+	if (error)
+		set_btree_node_io_error(b);
+
+	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io.cl;
 	struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
 	i->version	= BCACHE_BSET_VERSION;
 	i->csum		= btree_csum_set(b, i);
 
-	btree_bio_init(b);
-	b->bio->bi_rw	= REQ_META|WRITE_SYNC;
-	b->bio->bi_size	= set_blocks(i, b->c) * block_bytes(b->c);
+	BUG_ON(b->bio);
+	b->bio = bch_bbio_alloc(b->c);
+
+	b->bio->bi_end_io	= btree_node_write_endio;
+	b->bio->bi_private	= &b->io.cl;
+	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
+	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
 	bch_bio_map(b->bio, i);
 
+	/*
+	 * If we're appending to a leaf node, we don't technically need FUA -
+	 * this write just needs to be persisted before the next journal write,
+	 * which will be marked FLUSH|FUA.
+	 *
+	 * Similarly if we're writing a new btree root - the pointer is going to
+	 * be in the next journal entry.
+	 *
+	 * But if we're writing a new btree node (that isn't a root) or
+	 * appending to a non leaf btree node, we need either FUA or a flush
+	 * when we write the parent with the new pointer. FUA is cheaper than a
+	 * flush, and writes appending to leaf nodes aren't blocking anything so
+	 * just make all btree node writes FUA to keep things sane.
+	 */
+
 	bkey_copy(&k.key, &b->key);
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
 
-	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
 			memcpy(page_address(bv->bv_page),
 			       base + j * PAGE_SIZE, PAGE_SIZE);
 
-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
-		continue_at(cl, btree_write_done, NULL);
+		continue_at(cl, btree_node_write_done, NULL);
 	} else {
 		b->bio->bi_vcnt = 0;
 		bch_bio_map(b->bio, i);
 
-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
 		closure_sync(cl);
-		__btree_write_done(cl);
+		__btree_node_write_done(cl);
 	}
 }
 
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = b->sets[b->nsets].data;
 
+	trace_bcache_btree_write(b);
+
 	BUG_ON(current->bio_list);
+	BUG_ON(b->written >= btree_blocks(b));
+	BUG_ON(b->written && !i->keys);
+	BUG_ON(b->sets->data->seq != i->seq);
+	bch_check_key_order(b, i);
 
-	closure_lock(&b->io, &b->c->cl);
 	cancel_delayed_work(&b->work);
 
+	/* If caller isn't waiting for write, parent refcount is cache set */
+	closure_lock(&b->io, parent ?: &b->c->cl);
+
 	clear_bit(BTREE_NODE_dirty,	 &b->flags);
 	change_bit(BTREE_NODE_write_idx, &b->flags);
 
-	bch_check_key_order(b, i);
-	BUG_ON(b->written && !i->keys);
-
-	do_btree_write(b);
-
-	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+	do_btree_node_write(b);
 
 	b->written += set_blocks(i, b->c);
 	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
 		bch_bset_init_next(b);
 }
 
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-	down_write(&b->lock);
+	rw_lock(true, b, b->level);
 
 	if (btree_node_dirty(b))
-		__btree_write(b);
-	up_write(&b->lock);
+		bch_btree_node_write(b, NULL);
+	rw_unlock(true, b);
 }
 
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct btree_write *w = btree_current_write(b);
 
-	BUG_ON(b->written &&
-	       (b->written >= btree_blocks(b) ||
-		i->seq != b->sets[0].data->seq ||
-		!i->keys));
+	BUG_ON(!b->written);
+	BUG_ON(!i->keys);
 
-	if (!btree_node_dirty(b)) {
-		set_btree_node_dirty(b);
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
-	}
+	if (!btree_node_dirty(b))
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
-	w->prio_blocked += b->prio_blocked;
-	b->prio_blocked = 0;
+	set_btree_node_dirty(b);
 
-	if (op && op->journal && !b->level) {
+	if (op && op->journal) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w, op)) {
 			atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
 		}
 	}
 
-	if (current->bio_list)
-		return;
-
 	/* Force write if set is too big */
-	if (now ||
-	    b->level ||
-	    set_bytes(i) > PAGE_SIZE - 48) {
-		if (op && now) {
-			/* Must wait on multiple writes */
-			BUG_ON(w->owner);
-			w->owner = &op->cl;
-			closure_get(&op->cl);
-		}
-
-		__btree_write(b);
-	}
-	BUG_ON(!b->written);
+	if (set_bytes(i) > PAGE_SIZE - 48 &&
+	    !current->bio_list)
+		bch_btree_node_write(b, NULL);
 }
 
 /*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
 	INIT_LIST_HEAD(&b->list);
-	INIT_DELAYED_WORK(&b->work, btree_write_work);
+	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
 	closure_init_unlocked(&b->io);
 
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
 
 	if (cl && btree_node_dirty(b))
-		bch_btree_write(b, true, NULL);
+		bch_btree_node_write(b, NULL);
 
 	if (cl)
 		closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	else if (!mutex_trylock(&c->bucket_lock))
 		return -1;
 
+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
 	nr /= c->btree_pages;
 	nr = min_t(unsigned long, nr, mca_can_free(c));
 
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 	int ret = -ENOMEM;
 	struct btree *i;
 
+	trace_bcache_btree_cache_cannibalize(c);
+
 	if (!cl)
 		return ERR_PTR(-ENOMEM);
 
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 		return ERR_PTR(-EAGAIN);
 	}
 
-	/* XXX: tracepoint */
 	c->try_harder = cl;
 	c->try_harder_start = local_clock();
 retry:
@@ -905,6 +912,9 @@ retry:
 	b = mca_find(c, k);
 
 	if (!b) {
+		if (current->bio_list)
+			return ERR_PTR(-EAGAIN);
+
 		mutex_lock(&c->bucket_lock);
 		b = mca_alloc(c, k, level, &op->cl);
 		mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
 		if (IS_ERR(b))
 			return b;
 
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 
 		if (!write)
 			downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
 	for (; i <= b->nsets; i++)
 		prefetch(b->sets[i].data);
 
-	if (!closure_wait_event(&b->io.wait, &op->cl,
-				btree_node_read_done(b))) {
-		rw_unlock(write, b);
-		b = ERR_PTR(-EAGAIN);
-	} else if (btree_node_io_error(b)) {
+	if (btree_node_io_error(b)) {
 		rw_unlock(write, b);
-		b = ERR_PTR(-EIO);
-	} else
-		BUG_ON(!b->written);
+		return ERR_PTR(-EIO);
+	}
+
+	BUG_ON(!b->written);
 
 	return b;
 }
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
 }
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
 {
 	unsigned i;
 
+	trace_bcache_btree_node_free(b);
+
 	/*
 	 * The BUG_ON() in btree_node_get() implies that we must have a write
 	 * lock on parent to free or even invalidate a node
 	 */
 	BUG_ON(op->lock <= b->level);
 	BUG_ON(b == b->c->root);
-	pr_debug("bucket %s", pbtree(b));
 
 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 
-	if (b->prio_blocked &&
-	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
-
-	b->prio_blocked = 0;
-
 	cancel_delayed_work(&b->work);
 
 	mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
 		goto retry;
 	}
 
-	set_btree_node_read_done(b);
 	b->accessed = 1;
 	bch_bset_init_next(b);
 
 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc(b);
 	return b;
 err_free:
 	bch_bucket_free(c, &k.key);
 	__bkey_put(c, &k.key);
 err:
 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc_fail(b);
 	return b;
 }
 
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 			gc->nkeys++;
 
 			gc->data += KEY_SIZE(k);
-			if (KEY_DIRTY(k)) {
+			if (KEY_DIRTY(k))
 				gc->dirty += KEY_SIZE(k);
-				if (d)
-					d->sectors_dirty_gc += KEY_SIZE(k);
-			}
 		}
 
 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
 
 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
+		__bkey_put(b->c, &b->key);
 
 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));
 
-		__bkey_put(b->c, &b->key);
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-
 		btree_node_free(n, op);
 		up_write(&n->lock);
 	}
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	btree_node_free(r->b, op);
 	up_write(&r->b->lock);
 
-	pr_debug("coalesced %u nodes", nodes);
+	trace_bcache_btree_gc_coalesce(nodes);
 
 	gc->nodes--;
 	nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	void write(struct btree *r)
 	{
 		if (!r->written)
-			bch_btree_write(r, true, op);
-		else if (btree_node_dirty(r)) {
-			BUG_ON(btree_current_write(r)->owner);
-			btree_current_write(r)->owner = writes;
-			closure_get(writes);
-
-			bch_btree_write(r, true, NULL);
-		}
+			bch_btree_node_write(r, &op->cl);
+		else if (btree_node_dirty(r))
+			bch_btree_node_write(r, writes);
 
 		up_write(&r->lock);
 	}
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	ret = btree_gc_recurse(b, op, writes, gc);
 
 	if (!b->written || btree_node_dirty(b)) {
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-		bch_btree_write(b, true, n ? op : NULL);
+		bch_btree_node_write(b, n ? &op->cl : NULL);
 	}
 
 	if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	struct bcache_device **d;
 	unsigned i;
 
 	if (!c->gc_mark_valid)
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c)
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->gc_gen = b->gen;
-			if (!atomic_read(&b->pin))
+			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+				SET_GC_SECTORS_USED(b, 0);
+			}
 		}
 
-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d)
-			(*d)->sectors_dirty_gc = 0;
-
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 	size_t available = 0;
 	struct bucket *b;
 	struct cache *ca;
-	struct bcache_device **d;
 	unsigned i;
 
 	mutex_lock(&c->bucket_lock);
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 		}
 	}
 
-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d) {
-			unsigned long last =
-				atomic_long_read(&((*d)->sectors_dirty));
-			long difference = (*d)->sectors_dirty_gc - last;
-
-			pr_debug("sectors dirty off by %li", difference);
-
-			(*d)->sectors_dirty_last += difference;
-
-			atomic_long_set(&((*d)->sectors_dirty),
-					(*d)->sectors_dirty_gc);
-		}
-
 	mutex_unlock(&c->bucket_lock);
 	return available;
 }
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-
 	uint64_t start_time = local_clock();
-	trace_bcache_gc_start(c->sb.set_uuid);
-	blktrace_msg_all(c, "Starting gc");
+
+	trace_bcache_gc_start(c);
 
 	memset(&stats, 0, sizeof(struct gc_stat));
 	closure_init_stack(&writes);
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl)
 
 	btree_gc_start(c);
 
+	atomic_inc(&c->prio_blocked);
+
 	ret = btree_root(gc_root, c, &op, &writes, &stats);
 	closure_sync(&op.cl);
 	closure_sync(&writes);
 
 	if (ret) {
-		blktrace_msg_all(c, "Stopped gc");
 		pr_warn("gc failed!");
-
 		continue_at(cl, bch_btree_gc, bch_gc_wq);
 	}
 
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl)
 
 	available = bch_btree_gc_finish(c);
 
+	atomic_dec(&c->prio_blocked);
+	wake_up_allocators(c);
+
 	bch_time_stats_update(&c->btree_gc_time, start_time);
 
 	stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl)
 	stats.data	<<= 9;
 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
-	blktrace_msg_all(c, "Finished gc");
 
-	trace_bcache_gc_end(c->sb.set_uuid);
-	wake_up(&c->alloc_wait);
+	trace_bcache_gc_end(c);
 
 	continue_at(cl, bch_moving_gc, bch_gc_wq);
 }
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b,
 				    struct btree_iter *iter,
 				    struct btree_op *op)
 {
-	void subtract_dirty(struct bkey *k, int sectors)
+	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
 	{
-		struct bcache_device *d = b->c->devices[KEY_INODE(k)];
-
-		if (KEY_DIRTY(k) && d)
-			atomic_long_sub(sectors, &d->sectors_dirty);
+		if (KEY_DIRTY(k))
+			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+						     offset, -sectors);
	}
 
+	uint64_t old_offset;
 	unsigned old_size, sectors_found = 0;
 
 	while (1) {
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b,
 		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
 			continue;
 
+		old_offset = KEY_START(k);
 		old_size = KEY_SIZE(k);
 
 		/*
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b,
 
 			struct bkey *top;
 
-			subtract_dirty(k, KEY_SIZE(insert));
+			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
 
 			if (bkey_written(b, k)) {
 				/*
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b,
 			}
 		}
 
-		subtract_dirty(k, old_size - KEY_SIZE(k));
+		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
 	}
 
 check_failed:
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct bkey *m, *prev;
-	const char *status = "insert";
+	unsigned status = BTREE_INSERT_STATUS_INSERT;
 
 	BUG_ON(bkey_cmp(k, &b->key) > 0);
 	BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 			goto insert;
 
 		/* prev is in the tree, if we merge we're done */
-		status = "back merging";
+		status = BTREE_INSERT_STATUS_BACK_MERGE;
 		if (prev &&
 		    bch_bkey_try_merge(b, prev, k))
 			goto merged;
 
-		status = "overwrote front";
+		status = BTREE_INSERT_STATUS_OVERWROTE;
 		if (m != end(i) &&
 		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
 			goto copy;
 
-		status = "front merge";
+		status = BTREE_INSERT_STATUS_FRONT_MERGE;
 		if (m != end(i) &&
 		    bch_bkey_try_merge(b, k, m))
 			goto copy;
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	bch_check_keys(b, "%s for %s at %s: %s", status,
-		       op_type(op), pbtree(b), pkey(k));
-	bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
-				op_type(op), pbtree(b), pkey(k));
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+					     KEY_START(k), KEY_SIZE(k));
+
+	bch_check_keys(b, "%u for %s", status, op_type(op));
 
 	if (b->level && !KEY_OFFSET(k))
-		b->prio_blocked++;
+		btree_current_write(b)->prio_blocked++;
 
-	pr_debug("%s for %s at %s: %s", status,
-		 op_type(op), pbtree(b), pkey(k));
+	trace_bcache_btree_insert_key(b, k, op->type, status);
 
 	return true;
 }
 
-bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
 {
 	bool ret = false;
 	struct bkey *k;
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 	    should_split(b))
 		goto out;
 
-	op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
+	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
 
 	SET_KEY_PTRS(&op->replace, 1);
 	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 	BUG_ON(op->type != BTREE_INSERT);
 	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	bch_btree_write(b, false, NULL);
 	ret = true;
 out:
 	downgrade_write(&b->lock);
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
 
-	pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
-		 pbtree(b), n1->sets[0].data->keys);
-
 	if (split) {
 		unsigned keys = 0;
 
+		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
| 1906 | |||
| 1938 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 1907 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); |
| 1939 | if (IS_ERR(n2)) | 1908 | if (IS_ERR(n2)) |
| 1940 | goto err_free1; | 1909 | goto err_free1; |
| @@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
| 1967 | bkey_copy_key(&n2->key, &b->key); | 1936 | bkey_copy_key(&n2->key, &b->key); |
| 1968 | 1937 | ||
| 1969 | bch_keylist_add(&op->keys, &n2->key); | 1938 | bch_keylist_add(&op->keys, &n2->key); |
| 1970 | bch_btree_write(n2, true, op); | 1939 | bch_btree_node_write(n2, &op->cl); |
| 1971 | rw_unlock(true, n2); | 1940 | rw_unlock(true, n2); |
| 1972 | } else | 1941 | } else { |
| 1942 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | ||
| 1943 | |||
| 1973 | bch_btree_insert_keys(n1, op); | 1944 | bch_btree_insert_keys(n1, op); |
| 1945 | } | ||
| 1974 | 1946 | ||
| 1975 | bch_keylist_add(&op->keys, &n1->key); | 1947 | bch_keylist_add(&op->keys, &n1->key); |
| 1976 | bch_btree_write(n1, true, op); | 1948 | bch_btree_node_write(n1, &op->cl); |
| 1977 | 1949 | ||
| 1978 | if (n3) { | 1950 | if (n3) { |
| 1979 | bkey_copy_key(&n3->key, &MAX_KEY); | 1951 | bkey_copy_key(&n3->key, &MAX_KEY); |
| 1980 | bch_btree_insert_keys(n3, op); | 1952 | bch_btree_insert_keys(n3, op); |
| 1981 | bch_btree_write(n3, true, op); | 1953 | bch_btree_node_write(n3, &op->cl); |
| 1982 | 1954 | ||
| 1983 | closure_sync(&op->cl); | 1955 | closure_sync(&op->cl); |
| 1984 | bch_btree_set_root(n3); | 1956 | bch_btree_set_root(n3); |
| @@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | |||
| 2082 | 2054 | ||
| 2083 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2055 | BUG_ON(write_block(b) != b->sets[b->nsets].data); |
| 2084 | 2056 | ||
| 2085 | if (bch_btree_insert_keys(b, op)) | 2057 | if (bch_btree_insert_keys(b, op)) { |
| 2086 | bch_btree_write(b, false, op); | 2058 | if (!b->level) |
| 2059 | bch_btree_leaf_dirty(b, op); | ||
| 2060 | else | ||
| 2061 | bch_btree_node_write(b, &op->cl); | ||
| 2062 | } | ||
| 2087 | } | 2063 | } |
| 2088 | 2064 | ||
| 2089 | return 0; | 2065 | return 0; |
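The hunk above splits the post-insert write path: leaf nodes are only marked dirty via bch_btree_leaf_dirty(), so their writes can be deferred and batched, while interior nodes go straight to bch_btree_node_write(). A minimal user-space sketch of that decision, with made-up helpers standing in for the kernel ones:

```c
#include <stdbool.h>
#include <stdio.h>

/* Leaves are only marked dirty (write deferred), interior nodes are
 * written out right away.  Struct and helpers are stand-ins. */
struct node { int level; bool dirty; };

static void leaf_dirty(struct node *b)		/* ~ bch_btree_leaf_dirty() */
{
	b->dirty = true;
}

static void node_write(struct node *b)		/* ~ bch_btree_node_write() */
{
	b->dirty = false;
	printf("wrote btree node at level %d\n", b->level);
}

static void after_insert(struct node *b)
{
	if (!b->level)
		leaf_dirty(b);
	else
		node_write(b);
}

int main(void)
{
	struct node leaf = { 0, false }, interior = { 1, false };

	after_insert(&leaf);
	after_insert(&interior);
	printf("leaf dirty: %d\n", leaf.dirty);
	return 0;
}
```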
| @@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) | |||
| 2140 | void bch_btree_set_root(struct btree *b) | 2116 | void bch_btree_set_root(struct btree *b) |
| 2141 | { | 2117 | { |
| 2142 | unsigned i; | 2118 | unsigned i; |
| 2119 | struct closure cl; | ||
| 2120 | |||
| 2121 | closure_init_stack(&cl); | ||
| 2122 | |||
| 2123 | trace_bcache_btree_set_root(b); | ||
| 2143 | 2124 | ||
| 2144 | BUG_ON(!b->written); | 2125 | BUG_ON(!b->written); |
| 2145 | 2126 | ||
| @@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b) | |||
| 2153 | b->c->root = b; | 2134 | b->c->root = b; |
| 2154 | __bkey_put(b->c, &b->key); | 2135 | __bkey_put(b->c, &b->key); |
| 2155 | 2136 | ||
| 2156 | bch_journal_meta(b->c, NULL); | 2137 | bch_journal_meta(b->c, &cl); |
| 2157 | pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); | 2138 | closure_sync(&cl); |
| 2158 | } | 2139 | } |
| 2159 | 2140 | ||
| 2160 | /* Cache lookup */ | 2141 | /* Cache lookup */ |
| @@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
| 2215 | KEY_OFFSET(k) - bio->bi_sector); | 2196 | KEY_OFFSET(k) - bio->bi_sector); |
| 2216 | 2197 | ||
| 2217 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 2198 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
| 2218 | if (!n) | ||
| 2219 | return -EAGAIN; | ||
| 2220 | |||
| 2221 | if (n == bio) | 2199 | if (n == bio) |
| 2222 | op->lookup_done = true; | 2200 | op->lookup_done = true; |
| 2223 | 2201 | ||
| @@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
| 2240 | n->bi_end_io = bch_cache_read_endio; | 2218 | n->bi_end_io = bch_cache_read_endio; |
| 2241 | n->bi_private = &s->cl; | 2219 | n->bi_private = &s->cl; |
| 2242 | 2220 | ||
| 2243 | trace_bcache_cache_hit(n); | ||
| 2244 | __bch_submit_bbio(n, b->c); | 2221 | __bch_submit_bbio(n, b->c); |
| 2245 | } | 2222 | } |
| 2246 | 2223 | ||
| @@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | |||
| 2257 | struct btree_iter iter; | 2234 | struct btree_iter iter; |
| 2258 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | 2235 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); |
| 2259 | 2236 | ||
| 2260 | pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, | ||
| 2261 | (uint64_t) bio->bi_sector); | ||
| 2262 | |||
| 2263 | do { | 2237 | do { |
| 2264 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 2238 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
| 2265 | if (!k) { | 2239 | if (!k) { |
| @@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | |||
| 2303 | } | 2277 | } |
| 2304 | 2278 | ||
| 2305 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2279 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, |
| 2306 | struct keybuf *buf, struct bkey *end) | 2280 | struct keybuf *buf, struct bkey *end, |
| 2281 | keybuf_pred_fn *pred) | ||
| 2307 | { | 2282 | { |
| 2308 | struct btree_iter iter; | 2283 | struct btree_iter iter; |
| 2309 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2284 | bch_btree_iter_init(b, &iter, &buf->last_scanned); |
| @@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
| 2322 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2297 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
| 2323 | break; | 2298 | break; |
| 2324 | 2299 | ||
| 2325 | if (buf->key_predicate(buf, k)) { | 2300 | if (pred(buf, k)) { |
| 2326 | struct keybuf_key *w; | 2301 | struct keybuf_key *w; |
| 2327 | 2302 | ||
| 2328 | pr_debug("%s", pkey(k)); | ||
| 2329 | |||
| 2330 | spin_lock(&buf->lock); | 2303 | spin_lock(&buf->lock); |
| 2331 | 2304 | ||
| 2332 | w = array_alloc(&buf->freelist); | 2305 | w = array_alloc(&buf->freelist); |
| @@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
| 2343 | if (!k) | 2316 | if (!k) |
| 2344 | break; | 2317 | break; |
| 2345 | 2318 | ||
| 2346 | btree(refill_keybuf, k, b, op, buf, end); | 2319 | btree(refill_keybuf, k, b, op, buf, end, pred); |
| 2347 | /* | 2320 | /* |
| 2348 | * Might get an error here, but can't really do anything | 2321 | * Might get an error here, but can't really do anything |
| 2349 | * and it'll get logged elsewhere. Just read what we | 2322 | * and it'll get logged elsewhere. Just read what we |
| @@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
| 2361 | } | 2334 | } |
| 2362 | 2335 | ||
| 2363 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2336 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
| 2364 | struct bkey *end) | 2337 | struct bkey *end, keybuf_pred_fn *pred) |
| 2365 | { | 2338 | { |
| 2366 | struct bkey start = buf->last_scanned; | 2339 | struct bkey start = buf->last_scanned; |
| 2367 | struct btree_op op; | 2340 | struct btree_op op; |
| @@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | |||
| 2369 | 2342 | ||
| 2370 | cond_resched(); | 2343 | cond_resched(); |
| 2371 | 2344 | ||
| 2372 | btree_root(refill_keybuf, c, &op, buf, end); | 2345 | btree_root(refill_keybuf, c, &op, buf, end, pred); |
| 2373 | closure_sync(&op.cl); | 2346 | closure_sync(&op.cl); |
| 2374 | 2347 | ||
| 2375 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2348 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", |
| @@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | |||
| 2455 | 2428 | ||
| 2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2429 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
| 2457 | struct keybuf *buf, | 2430 | struct keybuf *buf, |
| 2458 | struct bkey *end) | 2431 | struct bkey *end, |
| 2432 | keybuf_pred_fn *pred) | ||
| 2459 | { | 2433 | { |
| 2460 | struct keybuf_key *ret; | 2434 | struct keybuf_key *ret; |
| 2461 | 2435 | ||
| @@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | |||
| 2469 | break; | 2443 | break; |
| 2470 | } | 2444 | } |
| 2471 | 2445 | ||
| 2472 | bch_refill_keybuf(c, buf, end); | 2446 | bch_refill_keybuf(c, buf, end, pred); |
| 2473 | } | 2447 | } |
| 2474 | 2448 | ||
| 2475 | return ret; | 2449 | return ret; |
| 2476 | } | 2450 | } |
| 2477 | 2451 | ||
| 2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | 2452 | void bch_keybuf_init(struct keybuf *buf) |
| 2479 | { | 2453 | { |
| 2480 | buf->key_predicate = fn; | ||
| 2481 | buf->last_scanned = MAX_KEY; | 2454 | buf->last_scanned = MAX_KEY; |
| 2482 | buf->keys = RB_ROOT; | 2455 | buf->keys = RB_ROOT; |
| 2483 | 2456 | ||
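Throughout the btree.c hunks above, the string-valued insert status is replaced by the BTREE_INSERT_STATUS_* enum (declared in btree.h below) so it can be handed to trace_bcache_btree_insert_key(). A small user-space sketch of the pattern; the name table and helper are invented for illustration, and the tracepoint itself only records the integer:

```c
#include <stdio.h>

/* Mirrors the insert-status enum added to btree.h in this series. */
enum btree_insert_status {
	BTREE_INSERT_STATUS_INSERT,
	BTREE_INSERT_STATUS_BACK_MERGE,
	BTREE_INSERT_STATUS_OVERWROTE,
	BTREE_INSERT_STATUS_FRONT_MERGE,
};

/* Hypothetical helper: map the enum back to the old debug strings. */
static const char *insert_status_name(enum btree_insert_status s)
{
	static const char * const names[] = {
		[BTREE_INSERT_STATUS_INSERT]      = "insert",
		[BTREE_INSERT_STATUS_BACK_MERGE]  = "back merging",
		[BTREE_INSERT_STATUS_OVERWROTE]   = "overwrote front",
		[BTREE_INSERT_STATUS_FRONT_MERGE] = "front merge",
	};
	return names[s];
}

int main(void)
{
	enum btree_insert_status status = BTREE_INSERT_STATUS_BACK_MERGE;

	/* The hot path (and the tracepoint) only passes the integer;
	 * pretty-printing happens in the trace consumer. */
	printf("status %d (%s)\n", status, insert_status_name(status));
	return 0;
}
```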
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af4a7092a28c..3333d3723633 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
| @@ -102,7 +102,6 @@ | |||
| 102 | #include "debug.h" | 102 | #include "debug.h" |
| 103 | 103 | ||
| 104 | struct btree_write { | 104 | struct btree_write { |
| 105 | struct closure *owner; | ||
| 106 | atomic_t *journal; | 105 | atomic_t *journal; |
| 107 | 106 | ||
| 108 | /* If btree_split() frees a btree node, it writes a new pointer to that | 107 | /* If btree_split() frees a btree node, it writes a new pointer to that |
| @@ -142,16 +141,12 @@ struct btree { | |||
| 142 | */ | 141 | */ |
| 143 | struct bset_tree sets[MAX_BSETS]; | 142 | struct bset_tree sets[MAX_BSETS]; |
| 144 | 143 | ||
| 145 | /* Used to refcount bio splits, also protects b->bio */ | 144 | /* For outstanding btree writes, used as a lock - protects write_idx */ |
| 146 | struct closure_with_waitlist io; | 145 | struct closure_with_waitlist io; |
| 147 | 146 | ||
| 148 | /* Gets transferred to w->prio_blocked - see the comment there */ | ||
| 149 | int prio_blocked; | ||
| 150 | |||
| 151 | struct list_head list; | 147 | struct list_head list; |
| 152 | struct delayed_work work; | 148 | struct delayed_work work; |
| 153 | 149 | ||
| 154 | uint64_t io_start_time; | ||
| 155 | struct btree_write writes[2]; | 150 | struct btree_write writes[2]; |
| 156 | struct bio *bio; | 151 | struct bio *bio; |
| 157 | }; | 152 | }; |
| @@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ | |||
| 164 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 159 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ |
| 165 | 160 | ||
| 166 | enum btree_flags { | 161 | enum btree_flags { |
| 167 | BTREE_NODE_read_done, | ||
| 168 | BTREE_NODE_io_error, | 162 | BTREE_NODE_io_error, |
| 169 | BTREE_NODE_dirty, | 163 | BTREE_NODE_dirty, |
| 170 | BTREE_NODE_write_idx, | 164 | BTREE_NODE_write_idx, |
| 171 | }; | 165 | }; |
| 172 | 166 | ||
| 173 | BTREE_FLAG(read_done); | ||
| 174 | BTREE_FLAG(io_error); | 167 | BTREE_FLAG(io_error); |
| 175 | BTREE_FLAG(dirty); | 168 | BTREE_FLAG(dirty); |
| 176 | BTREE_FLAG(write_idx); | 169 | BTREE_FLAG(write_idx); |
| @@ -278,6 +271,13 @@ struct btree_op { | |||
| 278 | BKEY_PADDED(replace); | 271 | BKEY_PADDED(replace); |
| 279 | }; | 272 | }; |
| 280 | 273 | ||
| 274 | enum { | ||
| 275 | BTREE_INSERT_STATUS_INSERT, | ||
| 276 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
| 277 | BTREE_INSERT_STATUS_OVERWROTE, | ||
| 278 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
| 279 | }; | ||
| 280 | |||
| 281 | void bch_btree_op_init_stack(struct btree_op *); | 281 | void bch_btree_op_init_stack(struct btree_op *); |
| 282 | 282 | ||
| 283 | static inline void rw_lock(bool w, struct btree *b, int level) | 283 | static inline void rw_lock(bool w, struct btree *b, int level) |
| @@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b) | |||
| 293 | #ifdef CONFIG_BCACHE_EDEBUG | 293 | #ifdef CONFIG_BCACHE_EDEBUG |
| 294 | unsigned i; | 294 | unsigned i; |
| 295 | 295 | ||
| 296 | if (w && | 296 | if (w && b->key.ptr[0]) |
| 297 | b->key.ptr[0] && | ||
| 298 | btree_node_read_done(b)) | ||
| 299 | for (i = 0; i <= b->nsets; i++) | 297 | for (i = 0; i <= b->nsets; i++) |
| 300 | bch_check_key_order(b, b->sets[i].data); | 298 | bch_check_key_order(b, b->sets[i].data); |
| 301 | #endif | 299 | #endif |
| @@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b) | |||
| 370 | > btree_blocks(b)); | 368 | > btree_blocks(b)); |
| 371 | } | 369 | } |
| 372 | 370 | ||
| 373 | void bch_btree_read_done(struct closure *); | 371 | void bch_btree_node_read(struct btree *); |
| 374 | void bch_btree_read(struct btree *); | 372 | void bch_btree_node_write(struct btree *, struct closure *); |
| 375 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op); | ||
| 376 | 373 | ||
| 377 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | 374 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); |
| 378 | void bch_btree_set_root(struct btree *); | 375 | void bch_btree_set_root(struct btree *); |
| @@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | |||
| 380 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | 377 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, |
| 381 | int, struct btree_op *); | 378 | int, struct btree_op *); |
| 382 | 379 | ||
| 383 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); | ||
| 384 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | 380 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, |
| 385 | struct bio *); | 381 | struct bio *); |
| 386 | int bch_btree_insert(struct btree_op *, struct cache_set *); | 382 | int bch_btree_insert(struct btree_op *, struct cache_set *); |
| @@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *); | |||
| 393 | int bch_btree_check(struct cache_set *, struct btree_op *); | 389 | int bch_btree_check(struct cache_set *, struct btree_op *); |
| 394 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | 390 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); |
| 395 | 391 | ||
| 396 | void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | 392 | void bch_keybuf_init(struct keybuf *); |
| 397 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | 393 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, |
| 394 | keybuf_pred_fn *); | ||
| 398 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | 395 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, |
| 399 | struct bkey *); | 396 | struct bkey *); |
| 400 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | 397 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); |
| 401 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | 398 | struct keybuf_key *bch_keybuf_next(struct keybuf *); |
| 402 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | 399 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, |
| 403 | struct keybuf *, struct bkey *); | 400 | struct bkey *, keybuf_pred_fn *); |
| 404 | 401 | ||
| 405 | #endif | 402 | #endif |
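The btree.h prototypes above show the keybuf change: the predicate is no longer stored at bch_keybuf_init() time but passed to each refill/rescan call, so one keybuf can be scanned with different filters. A compact user-space model of that calling convention (all structures and names here are stand-ins, not the kernel types):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* The predicate is supplied per refill call instead of being stored in
 * the buffer at init time. */
struct key { unsigned long long inode, offset; };
struct keybuf { size_t nr; struct key keys[16]; };

typedef bool (keybuf_pred_fn)(struct keybuf *, struct key *);

static void keybuf_init(struct keybuf *buf)
{
	buf->nr = 0;				/* no predicate stored anymore */
}

static void refill_keybuf(struct keybuf *buf, struct key *src, size_t n,
			  keybuf_pred_fn *pred)
{
	for (size_t i = 0; i < n && buf->nr < 16; i++)
		if (pred(buf, &src[i]))		/* filter chosen by the caller */
			buf->keys[buf->nr++] = src[i];
}

/* Example predicate, standing in for e.g. dirty_pred or moving_pred. */
static bool odd_offset_pred(struct keybuf *buf, struct key *k)
{
	(void) buf;
	return k->offset & 1;
}

int main(void)
{
	struct keybuf buf;
	struct key keys[] = { {1, 8}, {1, 9}, {2, 17} };

	keybuf_init(&buf);
	refill_keybuf(&buf, keys, 3, odd_offset_pred);
	printf("kept %zu keys\n", buf.nr);
	return 0;
}
```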
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a8c7cf..9aba2017f0d1 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
| @@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
| 66 | } else { | 66 | } else { |
| 67 | struct closure *parent = cl->parent; | 67 | struct closure *parent = cl->parent; |
| 68 | struct closure_waitlist *wait = closure_waitlist(cl); | 68 | struct closure_waitlist *wait = closure_waitlist(cl); |
| 69 | closure_fn *destructor = cl->fn; | ||
| 69 | 70 | ||
| 70 | closure_debug_destroy(cl); | 71 | closure_debug_destroy(cl); |
| 71 | 72 | ||
| 73 | smp_mb(); | ||
| 72 | atomic_set(&cl->remaining, -1); | 74 | atomic_set(&cl->remaining, -1); |
| 73 | 75 | ||
| 74 | if (wait) | 76 | if (wait) |
| 75 | closure_wake_up(wait); | 77 | closure_wake_up(wait); |
| 76 | 78 | ||
| 77 | if (cl->fn) | 79 | if (destructor) |
| 78 | cl->fn(cl); | 80 | destructor(cl); |
| 79 | 81 | ||
| 80 | if (parent) | 82 | if (parent) |
| 81 | closure_put(parent); | 83 | closure_put(parent); |
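The closure.c fix above copies cl->fn into a local before remaining is set to -1 (with a barrier), because once -1 is published the closure may be reused or freed by whoever was waiting on it. A user-space sketch of the same ordering using C11 atomics; the names and the release store are illustrative, not the kernel primitives:

```c
#include <stdatomic.h>
#include <stdio.h>

/* Once remaining is published as -1 a waiter may reuse or free the
 * closure, so the destructor pointer is saved in a local first.  The
 * release store stands in for the patch's smp_mb(). */
struct closure {
	void (*fn)(struct closure *);
	atomic_int remaining;
};

static void closure_put_final(struct closure *cl)
{
	void (*destructor)(struct closure *) = cl->fn;	/* copy before publish */

	atomic_store_explicit(&cl->remaining, -1, memory_order_release);

	/* cl->fn may now be overwritten by a new owner; call through the
	 * saved pointer only. */
	if (destructor)
		destructor(cl);
}

static void say_done(struct closure *cl)
{
	(void) cl;
	puts("closure destroyed");
}

int main(void)
{
	struct closure cl = { .fn = say_done };

	atomic_init(&cl.remaining, 1);
	closure_put_final(&cl);
	return 0;
}
```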
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..88e6411eab4f 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
| @@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | |||
| 47 | return ""; | 47 | return ""; |
| 48 | } | 48 | } |
| 49 | 49 | ||
| 50 | struct keyprint_hack bch_pkey(const struct bkey *k) | 50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) |
| 51 | { | 51 | { |
| 52 | unsigned i = 0; | 52 | unsigned i = 0; |
| 53 | struct keyprint_hack r; | 53 | char *out = buf, *end = buf + size; |
| 54 | char *out = r.s, *end = r.s + KEYHACK_SIZE; | ||
| 55 | 54 | ||
| 56 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | 55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) |
| 57 | 56 | ||
| @@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k) | |||
| 75 | if (KEY_CSUM(k)) | 74 | if (KEY_CSUM(k)) |
| 76 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | 75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); |
| 77 | #undef p | 76 | #undef p |
| 78 | return r; | 77 | return out - buf; |
| 79 | } | 78 | } |
| 80 | 79 | ||
| 81 | struct keyprint_hack bch_pbtree(const struct btree *b) | 80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) |
| 82 | { | 81 | { |
| 83 | struct keyprint_hack r; | 82 | return scnprintf(buf, size, "%zu level %i/%i", |
| 84 | 83 | PTR_BUCKET_NR(b->c, &b->key, 0), | |
| 85 | snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), | 84 | b->level, b->c->root ? b->c->root->level : -1); |
| 86 | b->level, b->c->root ? b->c->root->level : -1); | ||
| 87 | return r; | ||
| 88 | } | 85 | } |
| 89 | 86 | ||
| 90 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | 87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) |
| @@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i) | |||
| 100 | { | 97 | { |
| 101 | struct bkey *k; | 98 | struct bkey *k; |
| 102 | unsigned j; | 99 | unsigned j; |
| 100 | char buf[80]; | ||
| 103 | 101 | ||
| 104 | for (k = i->start; k < end(i); k = bkey_next(k)) { | 102 | for (k = i->start; k < end(i); k = bkey_next(k)) { |
| 103 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
| 105 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | 104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), |
| 106 | (uint64_t *) k - i->d, i->keys, pkey(k)); | 105 | (uint64_t *) k - i->d, i->keys, buf); |
| 107 | 106 | ||
| 108 | for (j = 0; j < KEY_PTRS(k); j++) { | 107 | for (j = 0; j < KEY_PTRS(k); j++) { |
| 109 | size_t n = PTR_BUCKET_NR(b->c, k, j); | 108 | size_t n = PTR_BUCKET_NR(b->c, k, j); |
| @@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) | |||
| 144 | v->written = 0; | 143 | v->written = 0; |
| 145 | v->level = b->level; | 144 | v->level = b->level; |
| 146 | 145 | ||
| 147 | bch_btree_read(v); | 146 | bch_btree_node_read(v); |
| 148 | closure_wait_event(&v->io.wait, &cl, | 147 | closure_wait_event(&v->io.wait, &cl, |
| 149 | atomic_read(&b->io.cl.remaining) == -1); | 148 | atomic_read(&b->io.cl.remaining) == -1); |
| 150 | 149 | ||
| @@ -200,7 +199,7 @@ void bch_data_verify(struct search *s) | |||
| 200 | if (!check) | 199 | if (!check) |
| 201 | return; | 200 | return; |
| 202 | 201 | ||
| 203 | if (bch_bio_alloc_pages(check, GFP_NOIO)) | 202 | if (bio_alloc_pages(check, GFP_NOIO)) |
| 204 | goto out_put; | 203 | goto out_put; |
| 205 | 204 | ||
| 206 | check->bi_rw = READ_SYNC; | 205 | check->bi_rw = READ_SYNC; |
| @@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | |||
| 252 | va_list args) | 251 | va_list args) |
| 253 | { | 252 | { |
| 254 | unsigned i; | 253 | unsigned i; |
| 254 | char buf[80]; | ||
| 255 | 255 | ||
| 256 | console_lock(); | 256 | console_lock(); |
| 257 | 257 | ||
| @@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | |||
| 262 | 262 | ||
| 263 | console_unlock(); | 263 | console_unlock(); |
| 264 | 264 | ||
| 265 | panic("at %s\n", pbtree(b)); | 265 | bch_btree_to_text(buf, sizeof(buf), b); |
| 266 | panic("at %s\n", buf); | ||
| 266 | } | 267 | } |
| 267 | 268 | ||
| 268 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | 269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, |
| @@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
| 337 | { | 338 | { |
| 338 | struct dump_iterator *i = file->private_data; | 339 | struct dump_iterator *i = file->private_data; |
| 339 | ssize_t ret = 0; | 340 | ssize_t ret = 0; |
| 341 | char kbuf[80]; | ||
| 340 | 342 | ||
| 341 | while (size) { | 343 | while (size) { |
| 342 | struct keybuf_key *w; | 344 | struct keybuf_key *w; |
| @@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
| 355 | if (i->bytes) | 357 | if (i->bytes) |
| 356 | break; | 358 | break; |
| 357 | 359 | ||
| 358 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | 360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); |
| 359 | if (!w) | 361 | if (!w) |
| 360 | break; | 362 | break; |
| 361 | 363 | ||
| 362 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); | 364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); |
| 365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | ||
| 363 | bch_keybuf_del(&i->keys, w); | 366 | bch_keybuf_del(&i->keys, w); |
| 364 | } | 367 | } |
| 365 | 368 | ||
| @@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) | |||
| 377 | 380 | ||
| 378 | file->private_data = i; | 381 | file->private_data = i; |
| 379 | i->c = c; | 382 | i->c = c; |
| 380 | bch_keybuf_init(&i->keys, dump_pred); | 383 | bch_keybuf_init(&i->keys); |
| 381 | i->keys.last_scanned = KEY(0, 0, 0); | 384 | i->keys.last_scanned = KEY(0, 0, 0); |
| 382 | 385 | ||
| 383 | return 0; | 386 | return 0; |
| @@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c) | |||
| 409 | 412 | ||
| 410 | #endif | 413 | #endif |
| 411 | 414 | ||
| 412 | /* Fuzz tester has rotted: */ | ||
| 413 | #if 0 | ||
| 414 | |||
| 415 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | ||
| 416 | const char *buffer, size_t size) | ||
| 417 | { | ||
| 418 | void dump(struct btree *b) | ||
| 419 | { | ||
| 420 | struct bset *i; | ||
| 421 | |||
| 422 | for (i = b->sets[0].data; | ||
| 423 | index(i, b) < btree_blocks(b) && | ||
| 424 | i->seq == b->sets[0].data->seq; | ||
| 425 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | ||
| 426 | dump_bset(b, i); | ||
| 427 | } | ||
| 428 | |||
| 429 | struct cache_sb *sb; | ||
| 430 | struct cache_set *c; | ||
| 431 | struct btree *all[3], *b, *fill, *orig; | ||
| 432 | int j; | ||
| 433 | |||
| 434 | struct btree_op op; | ||
| 435 | bch_btree_op_init_stack(&op); | ||
| 436 | |||
| 437 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||
| 438 | if (!sb) | ||
| 439 | return -ENOMEM; | ||
| 440 | |||
| 441 | sb->bucket_size = 128; | ||
| 442 | sb->block_size = 4; | ||
| 443 | |||
| 444 | c = bch_cache_set_alloc(sb); | ||
| 445 | if (!c) | ||
| 446 | return -ENOMEM; | ||
| 447 | |||
| 448 | for (j = 0; j < 3; j++) { | ||
| 449 | BUG_ON(list_empty(&c->btree_cache)); | ||
| 450 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); | ||
| 451 | list_del_init(&all[j]->list); | ||
| 452 | |||
| 453 | all[j]->key = KEY(0, 0, c->sb.bucket_size); | ||
| 454 | bkey_copy_key(&all[j]->key, &MAX_KEY); | ||
| 455 | } | ||
| 456 | |||
| 457 | b = all[0]; | ||
| 458 | fill = all[1]; | ||
| 459 | orig = all[2]; | ||
| 460 | |||
| 461 | while (1) { | ||
| 462 | for (j = 0; j < 3; j++) | ||
| 463 | all[j]->written = all[j]->nsets = 0; | ||
| 464 | |||
| 465 | bch_bset_init_next(b); | ||
| 466 | |||
| 467 | while (1) { | ||
| 468 | struct bset *i = write_block(b); | ||
| 469 | struct bkey *k = op.keys.top; | ||
| 470 | unsigned rand; | ||
| 471 | |||
| 472 | bkey_init(k); | ||
| 473 | rand = get_random_int(); | ||
| 474 | |||
| 475 | op.type = rand & 1 | ||
| 476 | ? BTREE_INSERT | ||
| 477 | : BTREE_REPLACE; | ||
| 478 | rand >>= 1; | ||
| 479 | |||
| 480 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); | ||
| 481 | rand >>= c->bucket_bits; | ||
| 482 | rand &= 1024 * 512 - 1; | ||
| 483 | rand += c->sb.bucket_size; | ||
| 484 | SET_KEY_OFFSET(k, rand); | ||
| 485 | #if 0 | ||
| 486 | SET_KEY_PTRS(k, 1); | ||
| 487 | #endif | ||
| 488 | bch_keylist_push(&op.keys); | ||
| 489 | bch_btree_insert_keys(b, &op); | ||
| 490 | |||
| 491 | if (should_split(b) || | ||
| 492 | set_blocks(i, b->c) != | ||
| 493 | __set_blocks(i, i->keys + 15, b->c)) { | ||
| 494 | i->csum = csum_set(i); | ||
| 495 | |||
| 496 | memcpy(write_block(fill), | ||
| 497 | i, set_bytes(i)); | ||
| 498 | |||
| 499 | b->written += set_blocks(i, b->c); | ||
| 500 | fill->written = b->written; | ||
| 501 | if (b->written == btree_blocks(b)) | ||
| 502 | break; | ||
| 503 | |||
| 504 | bch_btree_sort_lazy(b); | ||
| 505 | bch_bset_init_next(b); | ||
| 506 | } | ||
| 507 | } | ||
| 508 | |||
| 509 | memcpy(orig->sets[0].data, | ||
| 510 | fill->sets[0].data, | ||
| 511 | btree_bytes(c)); | ||
| 512 | |||
| 513 | bch_btree_sort(b); | ||
| 514 | fill->written = 0; | ||
| 515 | bch_btree_read_done(&fill->io.cl); | ||
| 516 | |||
| 517 | if (b->sets[0].data->keys != fill->sets[0].data->keys || | ||
| 518 | memcmp(b->sets[0].data->start, | ||
| 519 | fill->sets[0].data->start, | ||
| 520 | b->sets[0].data->keys * sizeof(uint64_t))) { | ||
| 521 | struct bset *i = b->sets[0].data; | ||
| 522 | struct bkey *k, *l; | ||
| 523 | |||
| 524 | for (k = i->start, | ||
| 525 | l = fill->sets[0].data->start; | ||
| 526 | k < end(i); | ||
| 527 | k = bkey_next(k), l = bkey_next(l)) | ||
| 528 | if (bkey_cmp(k, l) || | ||
| 529 | KEY_SIZE(k) != KEY_SIZE(l)) | ||
| 530 | pr_err("key %zi differs: %s != %s", | ||
| 531 | (uint64_t *) k - i->d, | ||
| 532 | pkey(k), pkey(l)); | ||
| 533 | |||
| 534 | for (j = 0; j < 3; j++) { | ||
| 535 | pr_err("**** Set %i ****", j); | ||
| 536 | dump(all[j]); | ||
| 537 | } | ||
| 538 | panic("\n"); | ||
| 539 | } | ||
| 540 | |||
| 541 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | ||
| 542 | } | ||
| 543 | } | ||
| 544 | |||
| 545 | kobj_attribute_write(fuzz, btree_fuzz); | ||
| 546 | #endif | ||
| 547 | |||
| 548 | void bch_debug_exit(void) | 415 | void bch_debug_exit(void) |
| 549 | { | 416 | { |
| 550 | if (!IS_ERR_OR_NULL(debug)) | 417 | if (!IS_ERR_OR_NULL(debug)) |
| @@ -554,11 +421,6 @@ void bch_debug_exit(void) | |||
| 554 | int __init bch_debug_init(struct kobject *kobj) | 421 | int __init bch_debug_init(struct kobject *kobj) |
| 555 | { | 422 | { |
| 556 | int ret = 0; | 423 | int ret = 0; |
| 557 | #if 0 | ||
| 558 | ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); | ||
| 559 | if (ret) | ||
| 560 | return ret; | ||
| 561 | #endif | ||
| 562 | 424 | ||
| 563 | debug = debugfs_create_dir("bcache", NULL); | 425 | debug = debugfs_create_dir("bcache", NULL); |
| 564 | return ret; | 426 | return ret; |
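The debug.c rework above drops the pkey()/pbtree() macros, which returned a struct wrapping a fixed char array, in favor of bch_bkey_to_text()/bch_btree_to_text(), which format into a caller-supplied buffer and return the length. A user-space sketch of that snprintf-style contract; the key layout below is made up for illustration:

```c
#include <stdio.h>

/* The caller owns the buffer and the helper returns how much it wrote,
 * scnprintf-style. */
struct bkey { unsigned long long inode, offset, size; };

static int bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
	int n = snprintf(buf, size, "%llu:%llu len %llu",
			 k->inode, k->offset, k->size);

	/* scnprintf never reports more than it actually stored */
	return n < (int) size ? n : (int) size - 1;
}

int main(void)
{
	char buf[80];
	struct bkey k = { 1, 4096, 8 };

	bkey_to_text(buf, sizeof(buf), &k);
	printf("key %s\n", buf);
	return 0;
}
```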
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index f9378a218148..1c39b5a2489b 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h | |||
| @@ -3,15 +3,8 @@ | |||
| 3 | 3 | ||
| 4 | /* Btree/bkey debug printing */ | 4 | /* Btree/bkey debug printing */ |
| 5 | 5 | ||
| 6 | #define KEYHACK_SIZE 80 | 6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); |
| 7 | struct keyprint_hack { | 7 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b); |
| 8 | char s[KEYHACK_SIZE]; | ||
| 9 | }; | ||
| 10 | |||
| 11 | struct keyprint_hack bch_pkey(const struct bkey *k); | ||
| 12 | struct keyprint_hack bch_pbtree(const struct btree *b); | ||
| 13 | #define pkey(k) (&bch_pkey(k).s[0]) | ||
| 14 | #define pbtree(b) (&bch_pbtree(b).s[0]) | ||
| 15 | 8 | ||
| 16 | #ifdef CONFIG_BCACHE_EDEBUG | 9 | #ifdef CONFIG_BCACHE_EDEBUG |
| 17 | 10 | ||
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..9056632995b1 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | #include "bset.h" | 9 | #include "bset.h" |
| 10 | #include "debug.h" | 10 | #include "debug.h" |
| 11 | 11 | ||
| 12 | #include <linux/blkdev.h> | ||
| 13 | |||
| 12 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | 14 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) |
| 13 | { | 15 | { |
| 14 | struct bio *p = bio->bi_private; | 16 | struct bio *p = bio->bi_private; |
| @@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio) | |||
| 66 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | 68 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a |
| 67 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not | 69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not |
| 68 | * freed before the split. | 70 | * freed before the split. |
| 69 | * | ||
| 70 | * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||
| 71 | * allocate more than one bio from the same bio set. Therefore, if it is running | ||
| 72 | * under generic_make_request() it masks out __GFP_WAIT when doing the | ||
| 73 | * allocation. The caller must check for failure if there's any possibility of | ||
| 74 | * it being called from under generic_make_request(); it is then the caller's | ||
| 75 | * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||
| 76 | */ | 71 | */ |
| 77 | struct bio *bch_bio_split(struct bio *bio, int sectors, | 72 | struct bio *bch_bio_split(struct bio *bio, int sectors, |
| 78 | gfp_t gfp, struct bio_set *bs) | 73 | gfp_t gfp, struct bio_set *bs) |
| @@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, | |||
| 83 | 78 | ||
| 84 | BUG_ON(sectors <= 0); | 79 | BUG_ON(sectors <= 0); |
| 85 | 80 | ||
| 86 | /* | ||
| 87 | * If we're being called from underneath generic_make_request() and we | ||
| 88 | * already allocated any bios from this bio set, we risk deadlock if we | ||
| 89 | * use the mempool. So instead, we possibly fail and let the caller punt | ||
| 90 | * to workqueue or somesuch and retry in a safe context. | ||
| 91 | */ | ||
| 92 | if (current->bio_list) | ||
| 93 | gfp &= ~__GFP_WAIT; | ||
| 94 | |||
| 95 | if (sectors >= bio_sectors(bio)) | 81 | if (sectors >= bio_sectors(bio)) |
| 96 | return bio; | 82 | return bio; |
| 97 | 83 | ||
| 98 | if (bio->bi_rw & REQ_DISCARD) { | 84 | if (bio->bi_rw & REQ_DISCARD) { |
| 99 | ret = bio_alloc_bioset(gfp, 1, bs); | 85 | ret = bio_alloc_bioset(gfp, 1, bs); |
| 86 | if (!ret) | ||
| 87 | return NULL; | ||
| 100 | idx = 0; | 88 | idx = 0; |
| 101 | goto out; | 89 | goto out; |
| 102 | } | 90 | } |
| @@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio) | |||
| 160 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 148 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
| 161 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, | 149 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, |
| 162 | queue_max_segments(q)); | 150 | queue_max_segments(q)); |
| 163 | struct bio_vec *bv, *end = bio_iovec(bio) + | ||
| 164 | min_t(int, bio_segments(bio), max_segments); | ||
| 165 | 151 | ||
| 166 | if (bio->bi_rw & REQ_DISCARD) | 152 | if (bio->bi_rw & REQ_DISCARD) |
| 167 | return min(ret, q->limits.max_discard_sectors); | 153 | return min(ret, q->limits.max_discard_sectors); |
| 168 | 154 | ||
| 169 | if (bio_segments(bio) > max_segments || | 155 | if (bio_segments(bio) > max_segments || |
| 170 | q->merge_bvec_fn) { | 156 | q->merge_bvec_fn) { |
| 157 | struct bio_vec *bv; | ||
| 158 | int i, seg = 0; | ||
| 159 | |||
| 171 | ret = 0; | 160 | ret = 0; |
| 172 | 161 | ||
| 173 | for (bv = bio_iovec(bio); bv < end; bv++) { | 162 | bio_for_each_segment(bv, bio, i) { |
| 174 | struct bvec_merge_data bvm = { | 163 | struct bvec_merge_data bvm = { |
| 175 | .bi_bdev = bio->bi_bdev, | 164 | .bi_bdev = bio->bi_bdev, |
| 176 | .bi_sector = bio->bi_sector, | 165 | .bi_sector = bio->bi_sector, |
| @@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio) | |||
| 178 | .bi_rw = bio->bi_rw, | 167 | .bi_rw = bio->bi_rw, |
| 179 | }; | 168 | }; |
| 180 | 169 | ||
| 170 | if (seg == max_segments) | ||
| 171 | break; | ||
| 172 | |||
| 181 | if (q->merge_bvec_fn && | 173 | if (q->merge_bvec_fn && |
| 182 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | 174 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) |
| 183 | break; | 175 | break; |
| 184 | 176 | ||
| 177 | seg++; | ||
| 185 | ret += bv->bv_len >> 9; | 178 | ret += bv->bv_len >> 9; |
| 186 | } | 179 | } |
| 187 | } | 180 | } |
| @@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error) | |||
| 218 | closure_put(cl); | 211 | closure_put(cl); |
| 219 | } | 212 | } |
| 220 | 213 | ||
| 221 | static void __bch_bio_submit_split(struct closure *cl) | ||
| 222 | { | ||
| 223 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
| 224 | struct bio *bio = s->bio, *n; | ||
| 225 | |||
| 226 | do { | ||
| 227 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), | ||
| 228 | GFP_NOIO, s->p->bio_split); | ||
| 229 | if (!n) | ||
| 230 | continue_at(cl, __bch_bio_submit_split, system_wq); | ||
| 231 | |||
| 232 | n->bi_end_io = bch_bio_submit_split_endio; | ||
| 233 | n->bi_private = cl; | ||
| 234 | |||
| 235 | closure_get(cl); | ||
| 236 | bch_generic_make_request_hack(n); | ||
| 237 | } while (n != bio); | ||
| 238 | |||
| 239 | continue_at(cl, bch_bio_submit_split_done, NULL); | ||
| 240 | } | ||
| 241 | |||
| 242 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | 214 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) |
| 243 | { | 215 | { |
| 244 | struct bio_split_hook *s; | 216 | struct bio_split_hook *s; |
| 217 | struct bio *n; | ||
| 245 | 218 | ||
| 246 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) | 219 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) |
| 247 | goto submit; | 220 | goto submit; |
| @@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | |||
| 250 | goto submit; | 223 | goto submit; |
| 251 | 224 | ||
| 252 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); | 225 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); |
| 226 | closure_init(&s->cl, NULL); | ||
| 253 | 227 | ||
| 254 | s->bio = bio; | 228 | s->bio = bio; |
| 255 | s->p = p; | 229 | s->p = p; |
| @@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | |||
| 257 | s->bi_private = bio->bi_private; | 231 | s->bi_private = bio->bi_private; |
| 258 | bio_get(bio); | 232 | bio_get(bio); |
| 259 | 233 | ||
| 260 | closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); | 234 | do { |
| 261 | return; | 235 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), |
| 236 | GFP_NOIO, s->p->bio_split); | ||
| 237 | |||
| 238 | n->bi_end_io = bch_bio_submit_split_endio; | ||
| 239 | n->bi_private = &s->cl; | ||
| 240 | |||
| 241 | closure_get(&s->cl); | ||
| 242 | bch_generic_make_request_hack(n); | ||
| 243 | } while (n != bio); | ||
| 244 | |||
| 245 | continue_at(&s->cl, bch_bio_submit_split_done, NULL); | ||
| 262 | submit: | 246 | submit: |
| 263 | bch_generic_make_request_hack(bio); | 247 | bch_generic_make_request_hack(bio); |
| 264 | } | 248 | } |
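With bch_bio_split() no longer masking out __GFP_WAIT, the callers shown above can drop their NULL/-EAGAIN retry paths, and the split-and-submit loop now runs inline in bch_generic_make_request(). A toy user-space model of that loop shape (the chunk size and names are illustrative; closure refcounting is omitted):

```c
#include <stdio.h>

/* Carve the request into bounded chunks and submit each one; no NULL
 * check or re-queue is needed because the allocation may now sleep. */
#define MAX_SECTORS 8U

static void submit_chunk(unsigned start, unsigned len)
{
	printf("submit sectors %u..%u\n", start, start + len - 1);
}

int main(void)
{
	unsigned sector = 0, remaining = 21;

	do {
		/* bch_bio_split() analogue: take at most MAX_SECTORS */
		unsigned n = remaining < MAX_SECTORS ? remaining : MAX_SECTORS;

		submit_chunk(sector, n);
		sector += n;
		remaining -= n;
	} while (remaining);			/* "n != bio" in the real loop */

	return 0;
}
```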
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..ba95ab84b2be 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | #include "debug.h" | 9 | #include "debug.h" |
| 10 | #include "request.h" | 10 | #include "request.h" |
| 11 | 11 | ||
| 12 | #include <trace/events/bcache.h> | ||
| 13 | |||
| 12 | /* | 14 | /* |
| 13 | * Journal replay/recovery: | 15 | * Journal replay/recovery: |
| 14 | * | 16 | * |
| @@ -182,9 +184,14 @@ bsearch: | |||
| 182 | pr_debug("starting binary search, l %u r %u", l, r); | 184 | pr_debug("starting binary search, l %u r %u", l, r); |
| 183 | 185 | ||
| 184 | while (l + 1 < r) { | 186 | while (l + 1 < r) { |
| 187 | seq = list_entry(list->prev, struct journal_replay, | ||
| 188 | list)->j.seq; | ||
| 189 | |||
| 185 | m = (l + r) >> 1; | 190 | m = (l + r) >> 1; |
| 191 | read_bucket(m); | ||
| 186 | 192 | ||
| 187 | if (read_bucket(m)) | 193 | if (seq != list_entry(list->prev, struct journal_replay, |
| 194 | list)->j.seq) | ||
| 188 | l = m; | 195 | l = m; |
| 189 | else | 196 | else |
| 190 | r = m; | 197 | r = m; |
| @@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, | |||
| 300 | for (k = i->j.start; | 307 | for (k = i->j.start; |
| 301 | k < end(&i->j); | 308 | k < end(&i->j); |
| 302 | k = bkey_next(k)) { | 309 | k = bkey_next(k)) { |
| 303 | pr_debug("%s", pkey(k)); | 310 | trace_bcache_journal_replay_key(k); |
| 311 | |||
| 304 | bkey_copy(op->keys.top, k); | 312 | bkey_copy(op->keys.top, k); |
| 305 | bch_keylist_push(&op->keys); | 313 | bch_keylist_push(&op->keys); |
| 306 | 314 | ||
| @@ -384,7 +392,7 @@ out: | |||
| 384 | return; | 392 | return; |
| 385 | found: | 393 | found: |
| 386 | if (btree_node_dirty(best)) | 394 | if (btree_node_dirty(best)) |
| 387 | bch_btree_write(best, true, NULL); | 395 | bch_btree_node_write(best, NULL); |
| 388 | rw_unlock(true, best); | 396 | rw_unlock(true, best); |
| 389 | } | 397 | } |
| 390 | 398 | ||
| @@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl) | |||
| 617 | bio_reset(bio); | 625 | bio_reset(bio); |
| 618 | bio->bi_sector = PTR_OFFSET(k, i); | 626 | bio->bi_sector = PTR_OFFSET(k, i); |
| 619 | bio->bi_bdev = ca->bdev; | 627 | bio->bi_bdev = ca->bdev; |
| 620 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; | 628 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; |
| 621 | bio->bi_size = sectors << 9; | 629 | bio->bi_size = sectors << 9; |
| 622 | 630 | ||
| 623 | bio->bi_end_io = journal_write_endio; | 631 | bio->bi_end_io = journal_write_endio; |
| @@ -712,7 +720,8 @@ void bch_journal(struct closure *cl) | |||
| 712 | spin_lock(&c->journal.lock); | 720 | spin_lock(&c->journal.lock); |
| 713 | 721 | ||
| 714 | if (journal_full(&c->journal)) { | 722 | if (journal_full(&c->journal)) { |
| 715 | /* XXX: tracepoint */ | 723 | trace_bcache_journal_full(c); |
| 724 | |||
| 716 | closure_wait(&c->journal.wait, cl); | 725 | closure_wait(&c->journal.wait, cl); |
| 717 | 726 | ||
| 718 | journal_reclaim(c); | 727 | journal_reclaim(c); |
| @@ -728,13 +737,15 @@ void bch_journal(struct closure *cl) | |||
| 728 | 737 | ||
| 729 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || | 738 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || |
| 730 | b > c->journal.blocks_free) { | 739 | b > c->journal.blocks_free) { |
| 731 | /* XXX: If we were inserting so many keys that they won't fit in | 740 | trace_bcache_journal_entry_full(c); |
| 741 | |||
| 742 | /* | ||
| 743 | * XXX: If we were inserting so many keys that they won't fit in | ||
| 732 | * an _empty_ journal write, we'll deadlock. For now, handle | 744 | * an _empty_ journal write, we'll deadlock. For now, handle |
| 733 | * this in bch_keylist_realloc() - but something to think about. | 745 | * this in bch_keylist_realloc() - but something to think about. |
| 734 | */ | 746 | */ |
| 735 | BUG_ON(!w->data->keys); | 747 | BUG_ON(!w->data->keys); |
| 736 | 748 | ||
| 737 | /* XXX: tracepoint */ | ||
| 738 | BUG_ON(!closure_wait(&w->wait, cl)); | 749 | BUG_ON(!closure_wait(&w->wait, cl)); |
| 739 | 750 | ||
| 740 | closure_flush(&c->journal.io); | 751 | closure_flush(&c->journal.io); |
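The journal.c bsearch change above stops relying on read_bucket()'s return value and instead checks whether the probe appended newer entries, by comparing the tail sequence number of the replay list before and after the call. A user-space sketch of that probe-with-side-effect search; the journal contents and helpers below are invented for illustration:

```c
#include <stdio.h>

/* read_bucket() adds any journal entries it finds to the replay list;
 * the loop decides direction by whether the newest seq changed. */
static unsigned long long newest_seq;		/* tail of the replay list */

static void read_bucket(unsigned b, const unsigned long long *journal)
{
	if (journal[b] > newest_seq)		/* bucket held newer entries */
		newest_seq = journal[b];
}

int main(void)
{
	/* per-bucket sequence numbers; 0 means no journal data */
	const unsigned long long journal[] = { 10, 11, 12, 13, 0, 0, 0, 0 };
	unsigned l = 0, r = 8;

	newest_seq = journal[0];

	while (l + 1 < r) {
		unsigned long long seq = newest_seq;
		unsigned m = (l + r) / 2;

		read_bucket(m, journal);

		if (newest_seq != seq)		/* probe found newer entries */
			l = m;
		else
			r = m;
	}

	printf("newest journalled bucket: %u\n", l);
	return 0;
}
```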
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..1a3b4f4786c3 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | #include "debug.h" | 9 | #include "debug.h" |
| 10 | #include "request.h" | 10 | #include "request.h" |
| 11 | 11 | ||
| 12 | #include <trace/events/bcache.h> | ||
| 13 | |||
| 12 | struct moving_io { | 14 | struct moving_io { |
| 13 | struct keybuf_key *w; | 15 | struct keybuf_key *w; |
| 14 | struct search s; | 16 | struct search s; |
| @@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl) | |||
| 44 | { | 46 | { |
| 45 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
| 46 | struct bio *bio = &io->bio.bio; | 48 | struct bio *bio = &io->bio.bio; |
| 47 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | 49 | struct bio_vec *bv; |
| 50 | int i; | ||
| 48 | 51 | ||
| 49 | while (bv-- != bio->bi_io_vec) | 52 | bio_for_each_segment_all(bv, bio, i) |
| 50 | __free_page(bv->bv_page); | 53 | __free_page(bv->bv_page); |
| 51 | 54 | ||
| 52 | pr_debug("%s %s", io->s.op.insert_collision | 55 | if (io->s.op.insert_collision) |
| 53 | ? "collision moving" : "moved", | 56 | trace_bcache_gc_copy_collision(&io->w->key); |
| 54 | pkey(&io->w->key)); | ||
| 55 | 57 | ||
| 56 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | 58 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); |
| 57 | 59 | ||
| @@ -94,8 +96,6 @@ static void write_moving(struct closure *cl) | |||
| 94 | struct moving_io *io = container_of(s, struct moving_io, s); | 96 | struct moving_io *io = container_of(s, struct moving_io, s); |
| 95 | 97 | ||
| 96 | if (!s->error) { | 98 | if (!s->error) { |
| 97 | trace_bcache_write_moving(&io->bio.bio); | ||
| 98 | |||
| 99 | moving_init(io); | 99 | moving_init(io); |
| 100 | 100 | ||
| 101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); |
| @@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl) | |||
| 122 | struct moving_io *io = container_of(s, struct moving_io, s); | 122 | struct moving_io *io = container_of(s, struct moving_io, s); |
| 123 | struct bio *bio = &io->bio.bio; | 123 | struct bio *bio = &io->bio.bio; |
| 124 | 124 | ||
| 125 | trace_bcache_read_moving(bio); | ||
| 126 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | 125 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); |
| 127 | 126 | ||
| 128 | continue_at(cl, write_moving, bch_gc_wq); | 127 | continue_at(cl, write_moving, bch_gc_wq); |
| @@ -138,7 +137,8 @@ static void read_moving(struct closure *cl) | |||
| 138 | /* XXX: if we error, background writeback could stall indefinitely */ | 137 | /* XXX: if we error, background writeback could stall indefinitely */ |
| 139 | 138 | ||
| 140 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { | 139 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { |
| 141 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); | 140 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, |
| 141 | &MAX_KEY, moving_pred); | ||
| 142 | if (!w) | 142 | if (!w) |
| 143 | break; | 143 | break; |
| 144 | 144 | ||
| @@ -159,10 +159,10 @@ static void read_moving(struct closure *cl) | |||
| 159 | bio->bi_rw = READ; | 159 | bio->bi_rw = READ; |
| 160 | bio->bi_end_io = read_moving_endio; | 160 | bio->bi_end_io = read_moving_endio; |
| 161 | 161 | ||
| 162 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) | 162 | if (bio_alloc_pages(bio, GFP_KERNEL)) |
| 163 | goto err; | 163 | goto err; |
| 164 | 164 | ||
| 165 | pr_debug("%s", pkey(&w->key)); | 165 | trace_bcache_gc_copy(&w->key); |
| 166 | 166 | ||
| 167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | 167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); |
| 168 | 168 | ||
| @@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl) | |||
| 250 | 250 | ||
| 251 | void bch_moving_init_cache_set(struct cache_set *c) | 251 | void bch_moving_init_cache_set(struct cache_set *c) |
| 252 | { | 252 | { |
| 253 | bch_keybuf_init(&c->moving_gc_keys, moving_pred); | 253 | bch_keybuf_init(&c->moving_gc_keys); |
| 254 | } | 254 | } |
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..786a1a4f74d8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #include "btree.h" | 10 | #include "btree.h" |
| 11 | #include "debug.h" | 11 | #include "debug.h" |
| 12 | #include "request.h" | 12 | #include "request.h" |
| 13 | #include "writeback.h" | ||
| 13 | 14 | ||
| 14 | #include <linux/cgroup.h> | 15 | #include <linux/cgroup.h> |
| 15 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| @@ -21,8 +22,6 @@ | |||
| 21 | 22 | ||
| 22 | #define CUTOFF_CACHE_ADD 95 | 23 | #define CUTOFF_CACHE_ADD 95 |
| 23 | #define CUTOFF_CACHE_READA 90 | 24 | #define CUTOFF_CACHE_READA 90 |
| 24 | #define CUTOFF_WRITEBACK 50 | ||
| 25 | #define CUTOFF_WRITEBACK_SYNC 75 | ||
| 26 | 25 | ||
| 27 | struct kmem_cache *bch_search_cache; | 26 | struct kmem_cache *bch_search_cache; |
| 28 | 27 | ||
| @@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl) | |||
| 489 | bch_queue_gc(op->c); | 488 | bch_queue_gc(op->c); |
| 490 | } | 489 | } |
| 491 | 490 | ||
| 491 | /* | ||
| 492 | * Journal writes are marked REQ_FLUSH; if the original write was a | ||
| 493 | * flush, it'll wait on the journal write. | ||
| 494 | */ | ||
| 495 | bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); | ||
| 496 | |||
| 492 | do { | 497 | do { |
| 493 | unsigned i; | 498 | unsigned i; |
| 494 | struct bkey *k; | 499 | struct bkey *k; |
| @@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl) | |||
| 510 | goto err; | 515 | goto err; |
| 511 | 516 | ||
| 512 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 517 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
| 513 | if (!n) { | ||
| 514 | __bkey_put(op->c, k); | ||
| 515 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
| 516 | } | ||
| 517 | 518 | ||
| 518 | n->bi_end_io = bch_insert_data_endio; | 519 | n->bi_end_io = bch_insert_data_endio; |
| 519 | n->bi_private = cl; | 520 | n->bi_private = cl; |
| @@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl) | |||
| 530 | if (KEY_CSUM(k)) | 531 | if (KEY_CSUM(k)) |
| 531 | bio_csum(n, k); | 532 | bio_csum(n, k); |
| 532 | 533 | ||
| 533 | pr_debug("%s", pkey(k)); | 534 | trace_bcache_cache_insert(k); |
| 534 | bch_keylist_push(&op->keys); | 535 | bch_keylist_push(&op->keys); |
| 535 | 536 | ||
| 536 | trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); | ||
| 537 | n->bi_rw |= REQ_WRITE; | 537 | n->bi_rw |= REQ_WRITE; |
| 538 | bch_submit_bbio(n, op->c, k, 0); | 538 | bch_submit_bbio(n, op->c, k, 0); |
| 539 | } while (n != bio); | 539 | } while (n != bio); |
| @@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | |||
| 716 | s->task = current; | 716 | s->task = current; |
| 717 | s->orig_bio = bio; | 717 | s->orig_bio = bio; |
| 718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; |
| 719 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | 719 | s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; |
| 720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | 720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; |
| 721 | s->recoverable = 1; | 721 | s->recoverable = 1; |
| 722 | s->start_time = jiffies; | 722 | s->start_time = jiffies; |
| @@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl) | |||
| 784 | int i; | 784 | int i; |
| 785 | 785 | ||
| 786 | if (s->recoverable) { | 786 | if (s->recoverable) { |
| 787 | /* The cache read failed, but we can retry from the backing | 787 | /* Retry from the backing device: */ |
| 788 | * device. | 788 | trace_bcache_read_retry(s->orig_bio); |
| 789 | */ | ||
| 790 | pr_debug("recovering at sector %llu", | ||
| 791 | (uint64_t) s->orig_bio->bi_sector); | ||
| 792 | 789 | ||
| 793 | s->error = 0; | 790 | s->error = 0; |
| 794 | bv = s->bio.bio.bi_io_vec; | 791 | bv = s->bio.bio.bi_io_vec; |
| @@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl) | |||
| 806 | 803 | ||
| 807 | /* XXX: invalidate cache */ | 804 | /* XXX: invalidate cache */ |
| 808 | 805 | ||
| 809 | trace_bcache_read_retry(&s->bio.bio); | ||
| 810 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | 806 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); |
| 811 | } | 807 | } |
| 812 | 808 | ||
| @@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl) | |||
| 827 | */ | 823 | */ |
| 828 | 824 | ||
| 829 | if (s->op.cache_bio) { | 825 | if (s->op.cache_bio) { |
| 830 | struct bio_vec *src, *dst; | ||
| 831 | unsigned src_offset, dst_offset, bytes; | ||
| 832 | void *dst_ptr; | ||
| 833 | |||
| 834 | bio_reset(s->op.cache_bio); | 826 | bio_reset(s->op.cache_bio); |
| 835 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | 827 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; |
| 836 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | 828 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; |
| 837 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 829 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
| 838 | bch_bio_map(s->op.cache_bio, NULL); | 830 | bch_bio_map(s->op.cache_bio, NULL); |
| 839 | 831 | ||
| 840 | src = bio_iovec(s->op.cache_bio); | 832 | bio_copy_data(s->cache_miss, s->op.cache_bio); |
| 841 | dst = bio_iovec(s->cache_miss); | ||
| 842 | src_offset = src->bv_offset; | ||
| 843 | dst_offset = dst->bv_offset; | ||
| 844 | dst_ptr = kmap(dst->bv_page); | ||
| 845 | |||
| 846 | while (1) { | ||
| 847 | if (dst_offset == dst->bv_offset + dst->bv_len) { | ||
| 848 | kunmap(dst->bv_page); | ||
| 849 | dst++; | ||
| 850 | if (dst == bio_iovec_idx(s->cache_miss, | ||
| 851 | s->cache_miss->bi_vcnt)) | ||
| 852 | break; | ||
| 853 | |||
| 854 | dst_offset = dst->bv_offset; | ||
| 855 | dst_ptr = kmap(dst->bv_page); | ||
| 856 | } | ||
| 857 | |||
| 858 | if (src_offset == src->bv_offset + src->bv_len) { | ||
| 859 | src++; | ||
| 860 | if (src == bio_iovec_idx(s->op.cache_bio, | ||
| 861 | s->op.cache_bio->bi_vcnt)) | ||
| 862 | BUG(); | ||
| 863 | |||
| 864 | src_offset = src->bv_offset; | ||
| 865 | } | ||
| 866 | |||
| 867 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, | ||
| 868 | src->bv_offset + src->bv_len - src_offset); | ||
| 869 | |||
| 870 | memcpy(dst_ptr + dst_offset, | ||
| 871 | page_address(src->bv_page) + src_offset, | ||
| 872 | bytes); | ||
| 873 | |||
| 874 | src_offset += bytes; | ||
| 875 | dst_offset += bytes; | ||
| 876 | } | ||
| 877 | 833 | ||
| 878 | bio_put(s->cache_miss); | 834 | bio_put(s->cache_miss); |
| 879 | s->cache_miss = NULL; | 835 | s->cache_miss = NULL; |
| @@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl) | |||
| 899 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 855 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
| 900 | 856 | ||
| 901 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | 857 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); |
| 858 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); | ||
| 902 | 859 | ||
| 903 | if (s->error) | 860 | if (s->error) |
| 904 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | 861 | continue_at_nobarrier(cl, request_read_error, bcache_wq); |
| @@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
| 917 | struct bio *miss; | 874 | struct bio *miss; |
| 918 | 875 | ||
| 919 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 876 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
| 920 | if (!miss) | ||
| 921 | return -EAGAIN; | ||
| 922 | |||
| 923 | if (miss == bio) | 877 | if (miss == bio) |
| 924 | s->op.lookup_done = true; | 878 | s->op.lookup_done = true; |
| 925 | 879 | ||
| @@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
| 938 | reada = min(dc->readahead >> 9, | 892 | reada = min(dc->readahead >> 9, |
| 939 | sectors - bio_sectors(miss)); | 893 | sectors - bio_sectors(miss)); |
| 940 | 894 | ||
| 941 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) | 895 | if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) |
| 942 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); | 896 | reada = bdev_sectors(miss->bi_bdev) - |
| 897 | bio_end_sector(miss); | ||
| 943 | } | 898 | } |
| 944 | 899 | ||
| 945 | s->cache_bio_sectors = bio_sectors(miss) + reada; | 900 | s->cache_bio_sectors = bio_sectors(miss) + reada; |
| @@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
| 963 | goto out_put; | 918 | goto out_put; |
| 964 | 919 | ||
| 965 | bch_bio_map(s->op.cache_bio, NULL); | 920 | bch_bio_map(s->op.cache_bio, NULL); |
| 966 | if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | 921 | if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) |
| 967 | goto out_put; | 922 | goto out_put; |
| 968 | 923 | ||
| 969 | s->cache_miss = miss; | 924 | s->cache_miss = miss; |
| 970 | bio_get(s->op.cache_bio); | 925 | bio_get(s->op.cache_bio); |
| 971 | 926 | ||
| 972 | trace_bcache_cache_miss(s->orig_bio); | ||
| 973 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | 927 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); |
| 974 | 928 | ||
| 975 | return ret; | 929 | return ret; |
| @@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl) | |||
| 1002 | cached_dev_bio_complete(cl); | 956 | cached_dev_bio_complete(cl); |
| 1003 | } | 957 | } |
| 1004 | 958 | ||
| 1005 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
| 1006 | { | ||
| 1007 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
| 1008 | ? CUTOFF_WRITEBACK_SYNC | ||
| 1009 | : CUTOFF_WRITEBACK; | ||
| 1010 | |||
| 1011 | return !atomic_read(&dc->disk.detaching) && | ||
| 1012 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
| 1013 | dc->disk.c->gc_stats.in_use < threshold; | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | static void request_write(struct cached_dev *dc, struct search *s) | 959 | static void request_write(struct cached_dev *dc, struct search *s) |
| 1017 | { | 960 | { |
| 1018 | struct closure *cl = &s->cl; | 961 | struct closure *cl = &s->cl; |
| 1019 | struct bio *bio = &s->bio.bio; | 962 | struct bio *bio = &s->bio.bio; |
| 1020 | struct bkey start, end; | 963 | struct bkey start, end; |
| 1021 | start = KEY(dc->disk.id, bio->bi_sector, 0); | 964 | start = KEY(dc->disk.id, bio->bi_sector, 0); |
| 1022 | end = KEY(dc->disk.id, bio_end(bio), 0); | 965 | end = KEY(dc->disk.id, bio_end_sector(bio), 0); |
| 1023 | 966 | ||
| 1024 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | 967 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); |
| 1025 | 968 | ||
| @@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s) | |||
| 1034 | if (bio->bi_rw & REQ_DISCARD) | 977 | if (bio->bi_rw & REQ_DISCARD) |
| 1035 | goto skip; | 978 | goto skip; |
| 1036 | 979 | ||
| 980 | if (should_writeback(dc, s->orig_bio, | ||
| 981 | cache_mode(dc, bio), | ||
| 982 | s->op.skip)) { | ||
| 983 | s->op.skip = false; | ||
| 984 | s->writeback = true; | ||
| 985 | } | ||
| 986 | |||
| 1037 | if (s->op.skip) | 987 | if (s->op.skip) |
| 1038 | goto skip; | 988 | goto skip; |
| 1039 | 989 | ||
| 1040 | if (should_writeback(dc, s->orig_bio)) | 990 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); |
| 1041 | s->writeback = true; | ||
| 1042 | 991 | ||
| 1043 | if (!s->writeback) { | 992 | if (!s->writeback) { |
| 1044 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 993 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, |
| 1045 | dc->disk.bio_split); | 994 | dc->disk.bio_split); |
| 1046 | 995 | ||
| 1047 | trace_bcache_writethrough(s->orig_bio); | ||
| 1048 | closure_bio_submit(bio, cl, s->d); | 996 | closure_bio_submit(bio, cl, s->d); |
| 1049 | } else { | 997 | } else { |
| 1050 | s->op.cache_bio = bio; | 998 | bch_writeback_add(dc); |
| 1051 | trace_bcache_writeback(s->orig_bio); | 999 | |
| 1052 | bch_writeback_add(dc, bio_sectors(bio)); | 1000 | if (s->op.flush_journal) { |
| 1001 | /* Also need to send a flush to the backing device */ | ||
| 1002 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | ||
| 1003 | dc->disk.bio_split); | ||
| 1004 | |||
| 1005 | bio->bi_size = 0; | ||
| 1006 | bio->bi_vcnt = 0; | ||
| 1007 | closure_bio_submit(bio, cl, s->d); | ||
| 1008 | } else { | ||
| 1009 | s->op.cache_bio = bio; | ||
| 1010 | } | ||
| 1053 | } | 1011 | } |
| 1054 | out: | 1012 | out: |
| 1055 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1013 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
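The inline should_writeback() removed above is replaced by a richer helper in the new writeback.h, now called with the cache mode and the current skip decision so that a sequential/congested bypass can still be upgraded to a writeback. When the request carries a journal flush, the original bio is additionally reused as an empty barrier to the backing device (bi_size and bi_vcnt zeroed) while a clone carries the data into the cache. The header itself is only partly visible at the end of this diff, so the following is a hedged, standalone reconstruction of the decision, built from the removed helper and the CUTOFF_WRITEBACK / CUTOFF_WRITEBACK_SYNC constants; the real body may differ (it plausibly also consults the partial-stripe state introduced elsewhere in this patch).

#include <stdbool.h>

#define CUTOFF_WRITEBACK        40      /* from writeback.h below */
#define CUTOFF_WRITEBACK_SYNC   70

/* Sketch only, not the header's code: should this write be cached dirty
 * (writeback) rather than written through? */
static bool should_writeback_sketch(bool mode_is_writeback, bool detaching,
                                    unsigned cache_in_use_percent,
                                    bool bio_is_sync, bool would_skip)
{
        if (!mode_is_writeback || detaching ||
            cache_in_use_percent > CUTOFF_WRITEBACK_SYNC)
                return false;

        if (would_skip)
                return false;

        return bio_is_sync || cache_in_use_percent <= CUTOFF_WRITEBACK;
}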
| @@ -1058,7 +1016,6 @@ skip: | |||
| 1058 | s->op.skip = true; | 1016 | s->op.skip = true; |
| 1059 | s->op.cache_bio = s->orig_bio; | 1017 | s->op.cache_bio = s->orig_bio; |
| 1060 | bio_get(s->op.cache_bio); | 1018 | bio_get(s->op.cache_bio); |
| 1061 | trace_bcache_write_skip(s->orig_bio); | ||
| 1062 | 1019 | ||
| 1063 | if ((bio->bi_rw & REQ_DISCARD) && | 1020 | if ((bio->bi_rw & REQ_DISCARD) && |
| 1064 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1021 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
| @@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) | |||
| 1088 | 1045 | ||
| 1089 | /* Cached devices - read & write stuff */ | 1046 | /* Cached devices - read & write stuff */ |
| 1090 | 1047 | ||
| 1091 | int bch_get_congested(struct cache_set *c) | 1048 | unsigned bch_get_congested(struct cache_set *c) |
| 1092 | { | 1049 | { |
| 1093 | int i; | 1050 | int i; |
| 1051 | long rand; | ||
| 1094 | 1052 | ||
| 1095 | if (!c->congested_read_threshold_us && | 1053 | if (!c->congested_read_threshold_us && |
| 1096 | !c->congested_write_threshold_us) | 1054 | !c->congested_write_threshold_us) |
| @@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c) | |||
| 1106 | 1064 | ||
| 1107 | i += CONGESTED_MAX; | 1065 | i += CONGESTED_MAX; |
| 1108 | 1066 | ||
| 1109 | return i <= 0 ? 1 : fract_exp_two(i, 6); | 1067 | if (i > 0) |
| 1068 | i = fract_exp_two(i, 6); | ||
| 1069 | |||
| 1070 | rand = get_random_int(); | ||
| 1071 | i -= bitmap_weight(&rand, BITS_PER_LONG); | ||
| 1072 | |||
| 1073 | return i > 0 ? i : 1; | ||
| 1110 | } | 1074 | } |
| 1111 | 1075 | ||
| 1112 | static void add_sequential(struct task_struct *t) | 1076 | static void add_sequential(struct task_struct *t) |
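For reference, a small user-space sketch of what the reworked bch_get_congested() in the hunk above now computes: the congestion figure (already offset by CONGESTED_MAX) is pushed through a fractional power-of-two curve and then dithered downward by the popcount of a random word, so requests near the threshold are bypassed probabilistically rather than all at once; check_should_skip() later compares the request's recent sequential IO against this value. fract_exp_two() below is an approximation of the helper declared in util.h, and congested_delta is a stand-in name for the offset figure, not a kernel identifier.

#include <stdint.h>
#include <stdlib.h>

/* Approximation of util.h's fract_exp_two(): treat the low fract_bits of x
 * as a fractional exponent and interpolate linearly between powers of two. */
static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
        unsigned fract = x & ~(~0U << fract_bits);

        x >>= fract_bits;
        x = 1U << x;
        x += (x * fract) >> fract_bits;

        return x;
}

/* congested_delta plays the role of (i + CONGESTED_MAX) in the hunk above. */
static unsigned congested_threshold(int congested_delta)
{
        int i = congested_delta;

        if (i > 0)
                i = fract_exp_two(i, 6);

        /* Random dither: subtracts about half the word size on average. */
        i -= __builtin_popcountl(random());

        return i > 0 ? (unsigned)i : 1;
}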
| @@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) | |||
| 1126 | { | 1090 | { |
| 1127 | struct cache_set *c = s->op.c; | 1091 | struct cache_set *c = s->op.c; |
| 1128 | struct bio *bio = &s->bio.bio; | 1092 | struct bio *bio = &s->bio.bio; |
| 1129 | |||
| 1130 | long rand; | ||
| 1131 | int cutoff = bch_get_congested(c); | ||
| 1132 | unsigned mode = cache_mode(dc, bio); | 1093 | unsigned mode = cache_mode(dc, bio); |
| 1094 | unsigned sectors, congested = bch_get_congested(c); | ||
| 1133 | 1095 | ||
| 1134 | if (atomic_read(&dc->disk.detaching) || | 1096 | if (atomic_read(&dc->disk.detaching) || |
| 1135 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | 1097 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || |
| @@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) | |||
| 1147 | goto skip; | 1109 | goto skip; |
| 1148 | } | 1110 | } |
| 1149 | 1111 | ||
| 1150 | if (!cutoff) { | 1112 | if (!congested && !dc->sequential_cutoff) |
| 1151 | cutoff = dc->sequential_cutoff >> 9; | 1113 | goto rescale; |
| 1152 | 1114 | ||
| 1153 | if (!cutoff) | 1115 | if (!congested && |
| 1154 | goto rescale; | 1116 | mode == CACHE_MODE_WRITEBACK && |
| 1155 | 1117 | (bio->bi_rw & REQ_WRITE) && | |
| 1156 | if (mode == CACHE_MODE_WRITEBACK && | 1118 | (bio->bi_rw & REQ_SYNC)) |
| 1157 | (bio->bi_rw & REQ_WRITE) && | 1119 | goto rescale; |
| 1158 | (bio->bi_rw & REQ_SYNC)) | ||
| 1159 | goto rescale; | ||
| 1160 | } | ||
| 1161 | 1120 | ||
| 1162 | if (dc->sequential_merge) { | 1121 | if (dc->sequential_merge) { |
| 1163 | struct io *i; | 1122 | struct io *i; |
| @@ -1177,7 +1136,7 @@ found: | |||
| 1177 | if (i->sequential + bio->bi_size > i->sequential) | 1136 | if (i->sequential + bio->bi_size > i->sequential) |
| 1178 | i->sequential += bio->bi_size; | 1137 | i->sequential += bio->bi_size; |
| 1179 | 1138 | ||
| 1180 | i->last = bio_end(bio); | 1139 | i->last = bio_end_sector(bio); |
| 1181 | i->jiffies = jiffies + msecs_to_jiffies(5000); | 1140 | i->jiffies = jiffies + msecs_to_jiffies(5000); |
| 1182 | s->task->sequential_io = i->sequential; | 1141 | s->task->sequential_io = i->sequential; |
| 1183 | 1142 | ||
| @@ -1192,12 +1151,19 @@ found: | |||
| 1192 | add_sequential(s->task); | 1151 | add_sequential(s->task); |
| 1193 | } | 1152 | } |
| 1194 | 1153 | ||
| 1195 | rand = get_random_int(); | 1154 | sectors = max(s->task->sequential_io, |
| 1196 | cutoff -= bitmap_weight(&rand, BITS_PER_LONG); | 1155 | s->task->sequential_io_avg) >> 9; |
| 1197 | 1156 | ||
| 1198 | if (cutoff <= (int) (max(s->task->sequential_io, | 1157 | if (dc->sequential_cutoff && |
| 1199 | s->task->sequential_io_avg) >> 9)) | 1158 | sectors >= dc->sequential_cutoff >> 9) { |
| 1159 | trace_bcache_bypass_sequential(s->orig_bio); | ||
| 1200 | goto skip; | 1160 | goto skip; |
| 1161 | } | ||
| 1162 | |||
| 1163 | if (congested && sectors >= congested) { | ||
| 1164 | trace_bcache_bypass_congested(s->orig_bio); | ||
| 1165 | goto skip; | ||
| 1166 | } | ||
| 1201 | 1167 | ||
| 1202 | rescale: | 1168 | rescale: |
| 1203 | bch_rescale_priorities(c, bio_sectors(bio)); | 1169 | bch_rescale_priorities(c, bio_sectors(bio)); |
| @@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc) | |||
| 1288 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1254 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
| 1289 | struct bio *bio, unsigned sectors) | 1255 | struct bio *bio, unsigned sectors) |
| 1290 | { | 1256 | { |
| 1257 | struct bio_vec *bv; | ||
| 1258 | int i; | ||
| 1259 | |||
| 1291 | /* Zero fill bio */ | 1260 | /* Zero fill bio */ |
| 1292 | 1261 | ||
| 1293 | while (bio->bi_idx != bio->bi_vcnt) { | 1262 | bio_for_each_segment(bv, bio, i) { |
| 1294 | struct bio_vec *bv = bio_iovec(bio); | ||
| 1295 | unsigned j = min(bv->bv_len >> 9, sectors); | 1263 | unsigned j = min(bv->bv_len >> 9, sectors); |
| 1296 | 1264 | ||
| 1297 | void *p = kmap(bv->bv_page); | 1265 | void *p = kmap(bv->bv_page); |
| 1298 | memset(p + bv->bv_offset, 0, j << 9); | 1266 | memset(p + bv->bv_offset, 0, j << 9); |
| 1299 | kunmap(bv->bv_page); | 1267 | kunmap(bv->bv_page); |
| 1300 | 1268 | ||
| 1301 | bv->bv_len -= j << 9; | 1269 | sectors -= j; |
| 1302 | bv->bv_offset += j << 9; | ||
| 1303 | |||
| 1304 | if (bv->bv_len) | ||
| 1305 | return 0; | ||
| 1306 | |||
| 1307 | bio->bi_sector += j; | ||
| 1308 | bio->bi_size -= j << 9; | ||
| 1309 | |||
| 1310 | bio->bi_idx++; | ||
| 1311 | sectors -= j; | ||
| 1312 | } | 1270 | } |
| 1313 | 1271 | ||
| 1314 | s->op.lookup_done = true; | 1272 | bio_advance(bio, min(sectors << 9, bio->bi_size)); |
| 1273 | |||
| 1274 | if (!bio->bi_size) | ||
| 1275 | s->op.lookup_done = true; | ||
| 1315 | 1276 | ||
| 1316 | return 0; | 1277 | return 0; |
| 1317 | } | 1278 | } |
| @@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | |||
| 1338 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1299 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
| 1339 | } else if (bio_has_data(bio) || s->op.skip) { | 1300 | } else if (bio_has_data(bio) || s->op.skip) { |
| 1340 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | 1301 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, |
| 1341 | &KEY(d->id, bio->bi_sector, 0), | 1302 | &KEY(d->id, bio->bi_sector, 0), |
| 1342 | &KEY(d->id, bio_end(bio), 0)); | 1303 | &KEY(d->id, bio_end_sector(bio), 0)); |
| 1343 | 1304 | ||
| 1344 | s->writeback = true; | 1305 | s->writeback = true; |
| 1345 | s->op.cache_bio = bio; | 1306 | s->op.cache_bio = bio; |
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..57dc4784f4f4 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h | |||
| @@ -30,7 +30,7 @@ struct search { | |||
| 30 | }; | 30 | }; |
| 31 | 31 | ||
| 32 | void bch_cache_read_endio(struct bio *, int); | 32 | void bch_cache_read_endio(struct bio *, int); |
| 33 | int bch_get_congested(struct cache_set *); | 33 | unsigned bch_get_congested(struct cache_set *); |
| 34 | void bch_insert_data(struct closure *cl); | 34 | void bch_insert_data(struct closure *cl); |
| 35 | void bch_btree_insert_async(struct closure *); | 35 | void bch_btree_insert_async(struct closure *); |
| 36 | void bch_cache_read_endio(struct bio *, int); | 36 | void bch_cache_read_endio(struct bio *, int); |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..547c4c57b052 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
| @@ -10,10 +10,13 @@ | |||
| 10 | #include "btree.h" | 10 | #include "btree.h" |
| 11 | #include "debug.h" | 11 | #include "debug.h" |
| 12 | #include "request.h" | 12 | #include "request.h" |
| 13 | #include "writeback.h" | ||
| 13 | 14 | ||
| 15 | #include <linux/blkdev.h> | ||
| 14 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
| 15 | #include <linux/debugfs.h> | 17 | #include <linux/debugfs.h> |
| 16 | #include <linux/genhd.h> | 18 | #include <linux/genhd.h> |
| 19 | #include <linux/kthread.h> | ||
| 17 | #include <linux/module.h> | 20 | #include <linux/module.h> |
| 18 | #include <linux/random.h> | 21 | #include <linux/random.h> |
| 19 | #include <linux/reboot.h> | 22 | #include <linux/reboot.h> |
| @@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
| 342 | struct closure *cl = &c->uuid_write.cl; | 345 | struct closure *cl = &c->uuid_write.cl; |
| 343 | struct uuid_entry *u; | 346 | struct uuid_entry *u; |
| 344 | unsigned i; | 347 | unsigned i; |
| 348 | char buf[80]; | ||
| 345 | 349 | ||
| 346 | BUG_ON(!parent); | 350 | BUG_ON(!parent); |
| 347 | closure_lock(&c->uuid_write, parent); | 351 | closure_lock(&c->uuid_write, parent); |
| @@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
| 362 | break; | 366 | break; |
| 363 | } | 367 | } |
| 364 | 368 | ||
| 365 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", | 369 | bch_bkey_to_text(buf, sizeof(buf), k); |
| 366 | pkey(&c->uuid_bucket)); | 370 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); |
| 367 | 371 | ||
| 368 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | 372 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) |
| 369 | if (!bch_is_zero(u->uuid, 16)) | 373 | if (!bch_is_zero(u->uuid, 16)) |
| @@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca) | |||
| 543 | 547 | ||
| 544 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | 548 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), |
| 545 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | 549 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); |
| 546 | blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); | ||
| 547 | 550 | ||
| 548 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { | 551 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { |
| 549 | long bucket; | 552 | long bucket; |
| @@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d) | |||
| 704 | atomic_set(&d->detaching, 0); | 707 | atomic_set(&d->detaching, 0); |
| 705 | } | 708 | } |
| 706 | 709 | ||
| 707 | bcache_device_unlink(d); | 710 | if (!d->flush_done) |
| 711 | bcache_device_unlink(d); | ||
| 708 | 712 | ||
| 709 | d->c->devices[d->id] = NULL; | 713 | d->c->devices[d->id] = NULL; |
| 710 | closure_put(&d->c->caching); | 714 | closure_put(&d->c->caching); |
| @@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d) | |||
| 743 | mempool_destroy(d->unaligned_bvec); | 747 | mempool_destroy(d->unaligned_bvec); |
| 744 | if (d->bio_split) | 748 | if (d->bio_split) |
| 745 | bioset_free(d->bio_split); | 749 | bioset_free(d->bio_split); |
| 750 | if (is_vmalloc_addr(d->stripe_sectors_dirty)) | ||
| 751 | vfree(d->stripe_sectors_dirty); | ||
| 752 | else | ||
| 753 | kfree(d->stripe_sectors_dirty); | ||
| 746 | 754 | ||
| 747 | closure_debug_destroy(&d->cl); | 755 | closure_debug_destroy(&d->cl); |
| 748 | } | 756 | } |
| 749 | 757 | ||
| 750 | static int bcache_device_init(struct bcache_device *d, unsigned block_size) | 758 | static int bcache_device_init(struct bcache_device *d, unsigned block_size, |
| 759 | sector_t sectors) | ||
| 751 | { | 760 | { |
| 752 | struct request_queue *q; | 761 | struct request_queue *q; |
| 762 | size_t n; | ||
| 763 | |||
| 764 | if (!d->stripe_size_bits) | ||
| 765 | d->stripe_size_bits = 31; | ||
| 766 | |||
| 767 | d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> | ||
| 768 | d->stripe_size_bits; | ||
| 769 | |||
| 770 | if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) | ||
| 771 | return -ENOMEM; | ||
| 772 | |||
| 773 | n = d->nr_stripes * sizeof(atomic_t); | ||
| 774 | d->stripe_sectors_dirty = n < PAGE_SIZE << 6 | ||
| 775 | ? kzalloc(n, GFP_KERNEL) | ||
| 776 | : vzalloc(n); | ||
| 777 | if (!d->stripe_sectors_dirty) | ||
| 778 | return -ENOMEM; | ||
| 753 | 779 | ||
| 754 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 780 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
| 755 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, | 781 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, |
| @@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) | |||
| 759 | !(q = blk_alloc_queue(GFP_KERNEL))) | 785 | !(q = blk_alloc_queue(GFP_KERNEL))) |
| 760 | return -ENOMEM; | 786 | return -ENOMEM; |
| 761 | 787 | ||
| 788 | set_capacity(d->disk, sectors); | ||
| 762 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); | 789 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); |
| 763 | 790 | ||
| 764 | d->disk->major = bcache_major; | 791 | d->disk->major = bcache_major; |
| @@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) | |||
| 781 | set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); | 808 | set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); |
| 782 | set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); | 809 | set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); |
| 783 | 810 | ||
| 811 | blk_queue_flush(q, REQ_FLUSH|REQ_FUA); | ||
| 812 | |||
| 784 | return 0; | 813 | return 0; |
| 785 | } | 814 | } |
| 786 | 815 | ||
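bcache_device_init() now also sizes the per-device dirty-stripe bookkeeping: the device is divided into stripes of 2^stripe_size_bits sectors (31 bits, i.e. effectively one huge stripe, when nothing sets it), one atomic_t of dirty-sector count is kept per stripe, and the array is kzalloc'd when small and vzalloc'd otherwise. A standalone sketch of the sizing arithmetic, with illustrative names rather than the kernel's:

#include <stdint.h>
#include <stddef.h>

/* How many stripes cover 'sectors' sectors, and how many bytes the
 * per-stripe counter array needs; mirrors the round_up()/SIZE_MAX
 * overflow check above (plain int standing in for atomic_t). */
static size_t stripe_array_bytes(uint64_t sectors, unsigned stripe_size_bits,
                                 size_t *nr_stripes)
{
        uint64_t stripe_size = 1ULL << stripe_size_bits;
        uint64_t n = (sectors + stripe_size - 1) >> stripe_size_bits;

        if (!n || n > SIZE_MAX / sizeof(int))
                return 0;       /* caller turns this into -ENOMEM */

        *nr_stripes = (size_t)n;
        return (size_t)n * sizeof(int);
}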
| @@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c) | |||
| 800 | void bch_cached_dev_run(struct cached_dev *dc) | 829 | void bch_cached_dev_run(struct cached_dev *dc) |
| 801 | { | 830 | { |
| 802 | struct bcache_device *d = &dc->disk; | 831 | struct bcache_device *d = &dc->disk; |
| 832 | char buf[SB_LABEL_SIZE + 1]; | ||
| 833 | char *env[] = { | ||
| 834 | "DRIVER=bcache", | ||
| 835 | kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), | ||
| 836 | NULL, | ||
| 837 | NULL, | ||
| 838 | }; | ||
| 839 | |||
| 840 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | ||
| 841 | buf[SB_LABEL_SIZE] = '\0'; | ||
| 842 | env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); | ||
| 803 | 843 | ||
| 804 | if (atomic_xchg(&dc->running, 1)) | 844 | if (atomic_xchg(&dc->running, 1)) |
| 805 | return; | 845 | return; |
| @@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc) | |||
| 816 | 856 | ||
| 817 | add_disk(d->disk); | 857 | add_disk(d->disk); |
| 818 | bd_link_disk_holder(dc->bdev, dc->disk.disk); | 858 | bd_link_disk_holder(dc->bdev, dc->disk.disk); |
| 819 | #if 0 | 859 | /* won't show up in the uevent file, use udevadm monitor -e instead |
| 820 | char *env[] = { "SYMLINK=label" , NULL }; | 860 | * only class / kset properties are persistent */ |
| 821 | kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); | 861 | kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); |
| 822 | #endif | 862 | kfree(env[1]); |
| 863 | kfree(env[2]); | ||
| 864 | |||
| 823 | if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || | 865 | if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || |
| 824 | sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) | 866 | sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) |
| 825 | pr_debug("error creating sysfs link"); | 867 | pr_debug("error creating sysfs link"); |
| @@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) | |||
| 960 | atomic_set(&dc->count, 1); | 1002 | atomic_set(&dc->count, 1); |
| 961 | 1003 | ||
| 962 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { | 1004 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { |
| 1005 | bch_sectors_dirty_init(dc); | ||
| 963 | atomic_set(&dc->has_dirty, 1); | 1006 | atomic_set(&dc->has_dirty, 1); |
| 964 | atomic_inc(&dc->count); | 1007 | atomic_inc(&dc->count); |
| 965 | bch_writeback_queue(dc); | 1008 | bch_writeback_queue(dc); |
| @@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl) | |||
| 1014 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | 1057 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); |
| 1015 | struct bcache_device *d = &dc->disk; | 1058 | struct bcache_device *d = &dc->disk; |
| 1016 | 1059 | ||
| 1060 | mutex_lock(&bch_register_lock); | ||
| 1061 | d->flush_done = 1; | ||
| 1062 | |||
| 1063 | if (d->c) | ||
| 1064 | bcache_device_unlink(d); | ||
| 1065 | |||
| 1066 | mutex_unlock(&bch_register_lock); | ||
| 1067 | |||
| 1017 | bch_cache_accounting_destroy(&dc->accounting); | 1068 | bch_cache_accounting_destroy(&dc->accounting); |
| 1018 | kobject_del(&d->kobj); | 1069 | kobject_del(&d->kobj); |
| 1019 | 1070 | ||
| @@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
| 1045 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | 1096 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); |
| 1046 | } | 1097 | } |
| 1047 | 1098 | ||
| 1048 | ret = bcache_device_init(&dc->disk, block_size); | 1099 | ret = bcache_device_init(&dc->disk, block_size, |
| 1100 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | ||
| 1049 | if (ret) | 1101 | if (ret) |
| 1050 | return ret; | 1102 | return ret; |
| 1051 | 1103 | ||
| @@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) | |||
| 1144 | 1196 | ||
| 1145 | kobject_init(&d->kobj, &bch_flash_dev_ktype); | 1197 | kobject_init(&d->kobj, &bch_flash_dev_ktype); |
| 1146 | 1198 | ||
| 1147 | if (bcache_device_init(d, block_bytes(c))) | 1199 | if (bcache_device_init(d, block_bytes(c), u->sectors)) |
| 1148 | goto err; | 1200 | goto err; |
| 1149 | 1201 | ||
| 1150 | bcache_device_attach(d, c, u - c->uuids); | 1202 | bcache_device_attach(d, c, u - c->uuids); |
| 1151 | set_capacity(d->disk, u->sectors); | ||
| 1152 | bch_flash_dev_request_init(d); | 1203 | bch_flash_dev_request_init(d); |
| 1153 | add_disk(d->disk); | 1204 | add_disk(d->disk); |
| 1154 | 1205 | ||
| @@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl) | |||
| 1255 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | 1306 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); |
| 1256 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | 1307 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); |
| 1257 | 1308 | ||
| 1258 | kfree(c->fill_iter); | ||
| 1259 | if (c->bio_split) | 1309 | if (c->bio_split) |
| 1260 | bioset_free(c->bio_split); | 1310 | bioset_free(c->bio_split); |
| 1311 | if (c->fill_iter) | ||
| 1312 | mempool_destroy(c->fill_iter); | ||
| 1261 | if (c->bio_meta) | 1313 | if (c->bio_meta) |
| 1262 | mempool_destroy(c->bio_meta); | 1314 | mempool_destroy(c->bio_meta); |
| 1263 | if (c->search) | 1315 | if (c->search) |
| @@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl) | |||
| 1278 | static void cache_set_flush(struct closure *cl) | 1330 | static void cache_set_flush(struct closure *cl) |
| 1279 | { | 1331 | { |
| 1280 | struct cache_set *c = container_of(cl, struct cache_set, caching); | 1332 | struct cache_set *c = container_of(cl, struct cache_set, caching); |
| 1333 | struct cache *ca; | ||
| 1281 | struct btree *b; | 1334 | struct btree *b; |
| 1282 | 1335 | unsigned i; | |
| 1283 | /* Shut down allocator threads */ | ||
| 1284 | set_bit(CACHE_SET_STOPPING_2, &c->flags); | ||
| 1285 | wake_up(&c->alloc_wait); | ||
| 1286 | 1336 | ||
| 1287 | bch_cache_accounting_destroy(&c->accounting); | 1337 | bch_cache_accounting_destroy(&c->accounting); |
| 1288 | 1338 | ||
| @@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl) | |||
| 1295 | /* Should skip this if we're unregistering because of an error */ | 1345 | /* Should skip this if we're unregistering because of an error */ |
| 1296 | list_for_each_entry(b, &c->btree_cache, list) | 1346 | list_for_each_entry(b, &c->btree_cache, list) |
| 1297 | if (btree_node_dirty(b)) | 1347 | if (btree_node_dirty(b)) |
| 1298 | bch_btree_write(b, true, NULL); | 1348 | bch_btree_node_write(b, NULL); |
| 1349 | |||
| 1350 | for_each_cache(ca, c, i) | ||
| 1351 | if (ca->alloc_thread) | ||
| 1352 | kthread_stop(ca->alloc_thread); | ||
| 1299 | 1353 | ||
| 1300 | closure_return(cl); | 1354 | closure_return(cl); |
| 1301 | } | 1355 | } |
| @@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl) | |||
| 1303 | static void __cache_set_unregister(struct closure *cl) | 1357 | static void __cache_set_unregister(struct closure *cl) |
| 1304 | { | 1358 | { |
| 1305 | struct cache_set *c = container_of(cl, struct cache_set, caching); | 1359 | struct cache_set *c = container_of(cl, struct cache_set, caching); |
| 1306 | struct cached_dev *dc, *t; | 1360 | struct cached_dev *dc; |
| 1307 | size_t i; | 1361 | size_t i; |
| 1308 | 1362 | ||
| 1309 | mutex_lock(&bch_register_lock); | 1363 | mutex_lock(&bch_register_lock); |
| 1310 | 1364 | ||
| 1311 | if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) | ||
| 1312 | list_for_each_entry_safe(dc, t, &c->cached_devs, list) | ||
| 1313 | bch_cached_dev_detach(dc); | ||
| 1314 | |||
| 1315 | for (i = 0; i < c->nr_uuids; i++) | 1365 | for (i = 0; i < c->nr_uuids; i++) |
| 1316 | if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) | 1366 | if (c->devices[i]) { |
| 1317 | bcache_device_stop(c->devices[i]); | 1367 | if (!UUID_FLASH_ONLY(&c->uuids[i]) && |
| 1368 | test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { | ||
| 1369 | dc = container_of(c->devices[i], | ||
| 1370 | struct cached_dev, disk); | ||
| 1371 | bch_cached_dev_detach(dc); | ||
| 1372 | } else { | ||
| 1373 | bcache_device_stop(c->devices[i]); | ||
| 1374 | } | ||
| 1375 | } | ||
| 1318 | 1376 | ||
| 1319 | mutex_unlock(&bch_register_lock); | 1377 | mutex_unlock(&bch_register_lock); |
| 1320 | 1378 | ||
| @@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
| 1373 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1431 | c->btree_pages = max_t(int, c->btree_pages / 4, |
| 1374 | BTREE_MAX_PAGES); | 1432 | BTREE_MAX_PAGES); |
| 1375 | 1433 | ||
| 1376 | init_waitqueue_head(&c->alloc_wait); | 1434 | c->sort_crit_factor = int_sqrt(c->btree_pages); |
| 1435 | |||
| 1377 | mutex_init(&c->bucket_lock); | 1436 | mutex_init(&c->bucket_lock); |
| 1378 | mutex_init(&c->fill_lock); | ||
| 1379 | mutex_init(&c->sort_lock); | 1437 | mutex_init(&c->sort_lock); |
| 1380 | spin_lock_init(&c->sort_time_lock); | 1438 | spin_lock_init(&c->sort_time_lock); |
| 1381 | closure_init_unlocked(&c->sb_write); | 1439 | closure_init_unlocked(&c->sb_write); |
| @@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
| 1401 | !(c->bio_meta = mempool_create_kmalloc_pool(2, | 1459 | !(c->bio_meta = mempool_create_kmalloc_pool(2, |
| 1402 | sizeof(struct bbio) + sizeof(struct bio_vec) * | 1460 | sizeof(struct bbio) + sizeof(struct bio_vec) * |
| 1403 | bucket_pages(c))) || | 1461 | bucket_pages(c))) || |
| 1462 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | ||
| 1404 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 1463 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
| 1405 | !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || | ||
| 1406 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | 1464 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || |
| 1407 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | 1465 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || |
| 1408 | bch_journal_alloc(c) || | 1466 | bch_journal_alloc(c) || |
| @@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
| 1410 | bch_open_buckets_alloc(c)) | 1468 | bch_open_buckets_alloc(c)) |
| 1411 | goto err; | 1469 | goto err; |
| 1412 | 1470 | ||
| 1413 | c->fill_iter->size = sb->bucket_size / sb->block_size; | ||
| 1414 | |||
| 1415 | c->congested_read_threshold_us = 2000; | 1471 | c->congested_read_threshold_us = 2000; |
| 1416 | c->congested_write_threshold_us = 20000; | 1472 | c->congested_write_threshold_us = 20000; |
| 1417 | c->error_limit = 8 << IO_ERROR_SHIFT; | 1473 | c->error_limit = 8 << IO_ERROR_SHIFT; |
| @@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c) | |||
| 1496 | */ | 1552 | */ |
| 1497 | bch_journal_next(&c->journal); | 1553 | bch_journal_next(&c->journal); |
| 1498 | 1554 | ||
| 1555 | err = "error starting allocator thread"; | ||
| 1499 | for_each_cache(ca, c, i) | 1556 | for_each_cache(ca, c, i) |
| 1500 | closure_call(&ca->alloc, bch_allocator_thread, | 1557 | if (bch_cache_allocator_start(ca)) |
| 1501 | system_wq, &c->cl); | 1558 | goto err; |
| 1502 | 1559 | ||
| 1503 | /* | 1560 | /* |
| 1504 | * First place it's safe to allocate: btree_check() and | 1561 | * First place it's safe to allocate: btree_check() and |
| @@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c) | |||
| 1531 | 1588 | ||
| 1532 | bch_btree_gc_finish(c); | 1589 | bch_btree_gc_finish(c); |
| 1533 | 1590 | ||
| 1591 | err = "error starting allocator thread"; | ||
| 1534 | for_each_cache(ca, c, i) | 1592 | for_each_cache(ca, c, i) |
| 1535 | closure_call(&ca->alloc, bch_allocator_thread, | 1593 | if (bch_cache_allocator_start(ca)) |
| 1536 | ca->alloc_workqueue, &c->cl); | 1594 | goto err; |
| 1537 | 1595 | ||
| 1538 | mutex_lock(&c->bucket_lock); | 1596 | mutex_lock(&c->bucket_lock); |
| 1539 | for_each_cache(ca, c, i) | 1597 | for_each_cache(ca, c, i) |
| 1540 | bch_prio_write(ca); | 1598 | bch_prio_write(ca); |
| 1541 | mutex_unlock(&c->bucket_lock); | 1599 | mutex_unlock(&c->bucket_lock); |
| 1542 | 1600 | ||
| 1543 | wake_up(&c->alloc_wait); | ||
| 1544 | |||
| 1545 | err = "cannot allocate new UUID bucket"; | 1601 | err = "cannot allocate new UUID bucket"; |
| 1546 | if (__uuid_write(c)) | 1602 | if (__uuid_write(c)) |
| 1547 | goto err_unlock_gc; | 1603 | goto err_unlock_gc; |
| @@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c) | |||
| 1552 | goto err_unlock_gc; | 1608 | goto err_unlock_gc; |
| 1553 | 1609 | ||
| 1554 | bkey_copy_key(&c->root->key, &MAX_KEY); | 1610 | bkey_copy_key(&c->root->key, &MAX_KEY); |
| 1555 | bch_btree_write(c->root, true, &op); | 1611 | bch_btree_node_write(c->root, &op.cl); |
| 1556 | 1612 | ||
| 1557 | bch_btree_set_root(c->root); | 1613 | bch_btree_set_root(c->root); |
| 1558 | rw_unlock(true, c->root); | 1614 | rw_unlock(true, c->root); |
| @@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj) | |||
| 1673 | 1729 | ||
| 1674 | bio_split_pool_free(&ca->bio_split_hook); | 1730 | bio_split_pool_free(&ca->bio_split_hook); |
| 1675 | 1731 | ||
| 1676 | if (ca->alloc_workqueue) | ||
| 1677 | destroy_workqueue(ca->alloc_workqueue); | ||
| 1678 | |||
| 1679 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); | 1732 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); |
| 1680 | kfree(ca->prio_buckets); | 1733 | kfree(ca->prio_buckets); |
| 1681 | vfree(ca->buckets); | 1734 | vfree(ca->buckets); |
| @@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
| 1723 | !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * | 1776 | !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * |
| 1724 | 2, GFP_KERNEL)) || | 1777 | 2, GFP_KERNEL)) || |
| 1725 | !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || | 1778 | !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || |
| 1726 | !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || | ||
| 1727 | bio_split_pool_init(&ca->bio_split_hook)) | 1779 | bio_split_pool_init(&ca->bio_split_hook)) |
| 1728 | return -ENOMEM; | 1780 | return -ENOMEM; |
| 1729 | 1781 | ||
| @@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, | |||
| 1786 | kobj_attribute_write(register, register_bcache); | 1838 | kobj_attribute_write(register, register_bcache); |
| 1787 | kobj_attribute_write(register_quiet, register_bcache); | 1839 | kobj_attribute_write(register_quiet, register_bcache); |
| 1788 | 1840 | ||
| 1841 | static bool bch_is_open_backing(struct block_device *bdev) { | ||
| 1842 | struct cache_set *c, *tc; | ||
| 1843 | struct cached_dev *dc, *t; | ||
| 1844 | |||
| 1845 | list_for_each_entry_safe(c, tc, &bch_cache_sets, list) | ||
| 1846 | list_for_each_entry_safe(dc, t, &c->cached_devs, list) | ||
| 1847 | if (dc->bdev == bdev) | ||
| 1848 | return true; | ||
| 1849 | list_for_each_entry_safe(dc, t, &uncached_devices, list) | ||
| 1850 | if (dc->bdev == bdev) | ||
| 1851 | return true; | ||
| 1852 | return false; | ||
| 1853 | } | ||
| 1854 | |||
| 1855 | static bool bch_is_open_cache(struct block_device *bdev) { | ||
| 1856 | struct cache_set *c, *tc; | ||
| 1857 | struct cache *ca; | ||
| 1858 | unsigned i; | ||
| 1859 | |||
| 1860 | list_for_each_entry_safe(c, tc, &bch_cache_sets, list) | ||
| 1861 | for_each_cache(ca, c, i) | ||
| 1862 | if (ca->bdev == bdev) | ||
| 1863 | return true; | ||
| 1864 | return false; | ||
| 1865 | } | ||
| 1866 | |||
| 1867 | static bool bch_is_open(struct block_device *bdev) { | ||
| 1868 | return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); | ||
| 1869 | } | ||
| 1870 | |||
| 1789 | static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | 1871 | static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, |
| 1790 | const char *buffer, size_t size) | 1872 | const char *buffer, size_t size) |
| 1791 | { | 1873 | { |
| @@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | |||
| 1810 | FMODE_READ|FMODE_WRITE|FMODE_EXCL, | 1892 | FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
| 1811 | sb); | 1893 | sb); |
| 1812 | if (IS_ERR(bdev)) { | 1894 | if (IS_ERR(bdev)) { |
| 1813 | if (bdev == ERR_PTR(-EBUSY)) | 1895 | if (bdev == ERR_PTR(-EBUSY)) { |
| 1814 | err = "device busy"; | 1896 | bdev = lookup_bdev(strim(path)); |
| 1897 | if (!IS_ERR(bdev) && bch_is_open(bdev)) | ||
| 1898 | err = "device already registered"; | ||
| 1899 | else | ||
| 1900 | err = "device busy"; | ||
| 1901 | } | ||
| 1815 | goto err; | 1902 | goto err; |
| 1816 | } | 1903 | } |
| 1817 | 1904 | ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..12a2c2846f99 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
| @@ -9,7 +9,9 @@ | |||
| 9 | #include "sysfs.h" | 9 | #include "sysfs.h" |
| 10 | #include "btree.h" | 10 | #include "btree.h" |
| 11 | #include "request.h" | 11 | #include "request.h" |
| 12 | #include "writeback.h" | ||
| 12 | 13 | ||
| 14 | #include <linux/blkdev.h> | ||
| 13 | #include <linux/sort.h> | 15 | #include <linux/sort.h> |
| 14 | 16 | ||
| 15 | static const char * const cache_replacement_policies[] = { | 17 | static const char * const cache_replacement_policies[] = { |
| @@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); | |||
| 79 | rw_attribute(writeback_rate_d_smooth); | 81 | rw_attribute(writeback_rate_d_smooth); |
| 80 | read_attribute(writeback_rate_debug); | 82 | read_attribute(writeback_rate_debug); |
| 81 | 83 | ||
| 84 | read_attribute(stripe_size); | ||
| 85 | read_attribute(partial_stripes_expensive); | ||
| 86 | |||
| 82 | rw_attribute(synchronous); | 87 | rw_attribute(synchronous); |
| 83 | rw_attribute(journal_delay_ms); | 88 | rw_attribute(journal_delay_ms); |
| 84 | rw_attribute(discard); | 89 | rw_attribute(discard); |
| @@ -127,7 +132,7 @@ SHOW(__bch_cached_dev) | |||
| 127 | char derivative[20]; | 132 | char derivative[20]; |
| 128 | char target[20]; | 133 | char target[20]; |
| 129 | bch_hprint(dirty, | 134 | bch_hprint(dirty, |
| 130 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | 135 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
| 131 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); | 136 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); |
| 132 | bch_hprint(target, dc->writeback_rate_target << 9); | 137 | bch_hprint(target, dc->writeback_rate_target << 9); |
| 133 | 138 | ||
| @@ -143,7 +148,10 @@ SHOW(__bch_cached_dev) | |||
| 143 | } | 148 | } |
| 144 | 149 | ||
| 145 | sysfs_hprint(dirty_data, | 150 | sysfs_hprint(dirty_data, |
| 146 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | 151 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
| 152 | |||
| 153 | sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); | ||
| 154 | var_printf(partial_stripes_expensive, "%u"); | ||
| 147 | 155 | ||
| 148 | var_printf(sequential_merge, "%i"); | 156 | var_printf(sequential_merge, "%i"); |
| 149 | var_hprint(sequential_cutoff); | 157 | var_hprint(sequential_cutoff); |
| @@ -170,6 +178,7 @@ STORE(__cached_dev) | |||
| 170 | disk.kobj); | 178 | disk.kobj); |
| 171 | unsigned v = size; | 179 | unsigned v = size; |
| 172 | struct cache_set *c; | 180 | struct cache_set *c; |
| 181 | struct kobj_uevent_env *env; | ||
| 173 | 182 | ||
| 174 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) | 183 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) |
| 175 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) | 184 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) |
| @@ -214,6 +223,7 @@ STORE(__cached_dev) | |||
| 214 | } | 223 | } |
| 215 | 224 | ||
| 216 | if (attr == &sysfs_label) { | 225 | if (attr == &sysfs_label) { |
| 226 | /* note: endlines are preserved */ | ||
| 217 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | 227 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); |
| 218 | bch_write_bdev_super(dc, NULL); | 228 | bch_write_bdev_super(dc, NULL); |
| 219 | if (dc->disk.c) { | 229 | if (dc->disk.c) { |
| @@ -221,6 +231,15 @@ STORE(__cached_dev) | |||
| 221 | buf, SB_LABEL_SIZE); | 231 | buf, SB_LABEL_SIZE); |
| 222 | bch_uuid_write(dc->disk.c); | 232 | bch_uuid_write(dc->disk.c); |
| 223 | } | 233 | } |
| 234 | env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); | ||
| 235 | if (!env) | ||
| 236 | return -ENOMEM; | ||
| 237 | add_uevent_var(env, "DRIVER=bcache"); | ||
| 238 | add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid); | ||
| 239 | add_uevent_var(env, "CACHED_LABEL=%s", buf); | ||
| 240 | kobject_uevent_env( | ||
| 241 | &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); | ||
| 242 | kfree(env); | ||
| 224 | } | 243 | } |
| 225 | 244 | ||
| 226 | if (attr == &sysfs_attach) { | 245 | if (attr == &sysfs_attach) { |
| @@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = { | |||
| 284 | &sysfs_writeback_rate_d_smooth, | 303 | &sysfs_writeback_rate_d_smooth, |
| 285 | &sysfs_writeback_rate_debug, | 304 | &sysfs_writeback_rate_debug, |
| 286 | &sysfs_dirty_data, | 305 | &sysfs_dirty_data, |
| 306 | &sysfs_stripe_size, | ||
| 307 | &sysfs_partial_stripes_expensive, | ||
| 287 | &sysfs_sequential_cutoff, | 308 | &sysfs_sequential_cutoff, |
| 288 | &sysfs_sequential_merge, | 309 | &sysfs_sequential_merge, |
| 289 | &sysfs_clear_stats, | 310 | &sysfs_clear_stats, |
| @@ -665,12 +686,10 @@ SHOW(__bch_cache) | |||
| 665 | int cmp(const void *l, const void *r) | 686 | int cmp(const void *l, const void *r) |
| 666 | { return *((uint16_t *) r) - *((uint16_t *) l); } | 687 | { return *((uint16_t *) r) - *((uint16_t *) l); } |
| 667 | 688 | ||
| 668 | /* Number of quantiles we compute */ | ||
| 669 | const unsigned nq = 31; | ||
| 670 | |||
| 671 | size_t n = ca->sb.nbuckets, i, unused, btree; | 689 | size_t n = ca->sb.nbuckets, i, unused, btree; |
| 672 | uint64_t sum = 0; | 690 | uint64_t sum = 0; |
| 673 | uint16_t q[nq], *p, *cached; | 691 | /* Compute 31 quantiles */ |
| 692 | uint16_t q[31], *p, *cached; | ||
| 674 | ssize_t ret; | 693 | ssize_t ret; |
| 675 | 694 | ||
| 676 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | 695 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); |
| @@ -703,26 +722,29 @@ SHOW(__bch_cache) | |||
| 703 | if (n) | 722 | if (n) |
| 704 | do_div(sum, n); | 723 | do_div(sum, n); |
| 705 | 724 | ||
| 706 | for (i = 0; i < nq; i++) | 725 | for (i = 0; i < ARRAY_SIZE(q); i++) |
| 707 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; | 726 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / |
| 727 | (ARRAY_SIZE(q) + 1)]; | ||
| 708 | 728 | ||
| 709 | vfree(p); | 729 | vfree(p); |
| 710 | 730 | ||
| 711 | ret = snprintf(buf, PAGE_SIZE, | 731 | ret = scnprintf(buf, PAGE_SIZE, |
| 712 | "Unused: %zu%%\n" | 732 | "Unused: %zu%%\n" |
| 713 | "Metadata: %zu%%\n" | 733 | "Metadata: %zu%%\n" |
| 714 | "Average: %llu\n" | 734 | "Average: %llu\n" |
| 715 | "Sectors per Q: %zu\n" | 735 | "Sectors per Q: %zu\n" |
| 716 | "Quantiles: [", | 736 | "Quantiles: [", |
| 717 | unused * 100 / (size_t) ca->sb.nbuckets, | 737 | unused * 100 / (size_t) ca->sb.nbuckets, |
| 718 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | 738 | btree * 100 / (size_t) ca->sb.nbuckets, sum, |
| 719 | n * ca->sb.bucket_size / (nq + 1)); | 739 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); |
| 720 | 740 | ||
| 721 | for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) | 741 | for (i = 0; i < ARRAY_SIZE(q); i++) |
| 722 | ret += snprintf(buf + ret, PAGE_SIZE - ret, | 742 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, |
| 723 | i < nq - 1 ? "%u " : "%u]\n", q[i]); | 743 | "%u ", q[i]); |
| 724 | 744 | ret--; | |
| 725 | buf[PAGE_SIZE - 1] = '\0'; | 745 | |
| 746 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); | ||
| 747 | |||
| 726 | return ret; | 748 | return ret; |
| 727 | } | 749 | } |
| 728 | 750 | ||
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 983f9bb411bc..f7b6c197f90f 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #include "btree.h" | 2 | #include "btree.h" |
| 3 | #include "request.h" | 3 | #include "request.h" |
| 4 | 4 | ||
| 5 | #include <linux/blktrace_api.h> | ||
| 5 | #include <linux/module.h> | 6 | #include <linux/module.h> |
| 6 | 7 | ||
| 7 | #define CREATE_TRACE_POINTS | 8 | #define CREATE_TRACE_POINTS |
| @@ -9,18 +10,44 @@ | |||
| 9 | 10 | ||
| 10 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); | 11 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); |
| 11 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); | 12 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); |
| 12 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); | 13 | |
| 13 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); | 14 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); |
| 14 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); | 15 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); |
| 16 | |||
| 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); | ||
| 18 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); | ||
| 15 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); | 19 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); |
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); | 20 | |
| 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); | 21 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); |
| 18 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); | 22 | |
| 23 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); | ||
| 24 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); | ||
| 25 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); | ||
| 26 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); | ||
| 27 | |||
| 28 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); | ||
| 29 | |||
| 19 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); | 30 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); |
| 20 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); | 31 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); |
| 21 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); | 32 | |
| 22 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); | 33 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); |
| 23 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); | 34 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); |
| 24 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); | 35 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); |
| 36 | |||
| 37 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); | ||
| 25 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); | 38 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); |
| 26 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); |
| 40 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); | ||
| 41 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); | ||
| 42 | |||
| 43 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); | ||
| 44 | |||
| 45 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); | ||
| 46 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); | ||
| 47 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); | ||
| 48 | |||
| 49 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); | ||
| 50 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); | ||
| 51 | |||
| 52 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); | ||
| 53 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); | ||
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e85b1e..98eb81159a22 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
| @@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, | |||
| 228 | } | 228 | } |
| 229 | } | 229 | } |
| 230 | 230 | ||
| 231 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) | ||
| 232 | { | ||
| 233 | int i; | ||
| 234 | struct bio_vec *bv; | ||
| 235 | |||
| 236 | bio_for_each_segment(bv, bio, i) { | ||
| 237 | bv->bv_page = alloc_page(gfp); | ||
| 238 | if (!bv->bv_page) { | ||
| 239 | while (bv-- != bio->bi_io_vec + bio->bi_idx) | ||
| 240 | __free_page(bv->bv_page); | ||
| 241 | return -ENOMEM; | ||
| 242 | } | ||
| 243 | } | ||
| 244 | |||
| 245 | return 0; | ||
| 246 | } | ||
| 247 | |||
| 248 | /* | 231 | /* |
| 249 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | 232 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any |
| 250 | * use permitted, subject to terms of PostgreSQL license; see.) | 233 | * use permitted, subject to terms of PostgreSQL license; see.) |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..1ae2a73ad85f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
| @@ -15,8 +15,6 @@ | |||
| 15 | 15 | ||
| 16 | struct closure; | 16 | struct closure; |
| 17 | 17 | ||
| 18 | #include <trace/events/bcache.h> | ||
| 19 | |||
| 20 | #ifdef CONFIG_BCACHE_EDEBUG | 18 | #ifdef CONFIG_BCACHE_EDEBUG |
| 21 | 19 | ||
| 22 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
| @@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) | |||
| 566 | return x; | 564 | return x; |
| 567 | } | 565 | } |
| 568 | 566 | ||
| 569 | #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) | ||
| 570 | |||
| 571 | void bch_bio_map(struct bio *bio, void *base); | 567 | void bch_bio_map(struct bio *bio, void *base); |
| 572 | 568 | ||
| 573 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
| 574 | |||
| 575 | static inline sector_t bdev_sectors(struct block_device *bdev) | 569 | static inline sector_t bdev_sectors(struct block_device *bdev) |
| 576 | { | 570 | { |
| 577 | return bdev->bd_inode->i_size >> 9; | 571 | return bdev->bd_inode->i_size >> 9; |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..22cbff551628 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
| @@ -9,6 +9,9 @@ | |||
| 9 | #include "bcache.h" | 9 | #include "bcache.h" |
| 10 | #include "btree.h" | 10 | #include "btree.h" |
| 11 | #include "debug.h" | 11 | #include "debug.h" |
| 12 | #include "writeback.h" | ||
| 13 | |||
| 14 | #include <trace/events/bcache.h> | ||
| 12 | 15 | ||
| 13 | static struct workqueue_struct *dirty_wq; | 16 | static struct workqueue_struct *dirty_wq; |
| 14 | 17 | ||
| @@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
| 36 | 39 | ||
| 37 | int change = 0; | 40 | int change = 0; |
| 38 | int64_t error; | 41 | int64_t error; |
| 39 | int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); | 42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
| 40 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
| 41 | 44 | ||
| 42 | dc->disk.sectors_dirty_last = dirty; | 45 | dc->disk.sectors_dirty_last = dirty; |
| @@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) | |||
| 105 | return KEY_DIRTY(k); | 108 | return KEY_DIRTY(k); |
| 106 | } | 109 | } |
| 107 | 110 | ||
| 111 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | ||
| 112 | { | ||
| 113 | uint64_t stripe; | ||
| 114 | unsigned nr_sectors = KEY_SIZE(k); | ||
| 115 | struct cached_dev *dc = container_of(buf, struct cached_dev, | ||
| 116 | writeback_keys); | ||
| 117 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | ||
| 118 | |||
| 119 | if (!KEY_DIRTY(k)) | ||
| 120 | return false; | ||
| 121 | |||
| 122 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | ||
| 123 | while (1) { | ||
| 124 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | ||
| 125 | stripe_size) | ||
| 126 | return false; | ||
| 127 | |||
| 128 | if (nr_sectors <= stripe_size) | ||
| 129 | return true; | ||
| 130 | |||
| 131 | nr_sectors -= stripe_size; | ||
| 132 | stripe++; | ||
| 133 | } | ||
| 134 | } | ||
| 135 | |||
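dirty_full_stripe_pred() is the keybuf filter used when partial stripes are expensive for the backing device (parity RAID being the usual reason): a dirty key is only eligible for writeback if every stripe it overlaps is completely dirty, so writeback tends to issue full-stripe writes. A standalone sketch of that walk over the per-stripe counters, with dirty_sectors[] standing in for stripe_sectors_dirty:

#include <stdbool.h>
#include <stdint.h>

/* True only if every stripe covered by [start, start + nr_sectors) is
 * entirely dirty; mirrors the while loop above. */
static bool range_is_full_stripes(const unsigned *dirty_sectors,
                                  unsigned stripe_size_bits,
                                  uint64_t start, unsigned nr_sectors)
{
        unsigned stripe_size = 1U << stripe_size_bits;
        uint64_t stripe = start >> stripe_size_bits;

        while (1) {
                if (dirty_sectors[stripe] != stripe_size)
                        return false;

                if (nr_sectors <= stripe_size)
                        return true;

                nr_sectors -= stripe_size;
                stripe++;
        }
}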
| 108 | static void dirty_init(struct keybuf_key *w) | 136 | static void dirty_init(struct keybuf_key *w) |
| 109 | { | 137 | { |
| 110 | struct dirty_io *io = w->private; | 138 | struct dirty_io *io = w->private; |
| @@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl) | |||
| 149 | searched_from_start = true; | 177 | searched_from_start = true; |
| 150 | } | 178 | } |
| 151 | 179 | ||
| 152 | bch_refill_keybuf(dc->disk.c, buf, &end); | 180 | if (dc->partial_stripes_expensive) { |
| 181 | uint64_t i; | ||
| 182 | |||
| 183 | for (i = 0; i < dc->disk.nr_stripes; i++) | ||
| 184 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | ||
| 185 | 1 << dc->disk.stripe_size_bits) | ||
| 186 | goto full_stripes; | ||
| 187 | |||
| 188 | goto normal_refill; | ||
| 189 | full_stripes: | ||
| 190 | bch_refill_keybuf(dc->disk.c, buf, &end, | ||
| 191 | dirty_full_stripe_pred); | ||
| 192 | } else { | ||
| 193 | normal_refill: | ||
| 194 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | ||
| 195 | } | ||
| 153 | 196 | ||
| 154 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | 197 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { |
| 155 | /* Searched the entire btree - delay awhile */ | 198 | /* Searched the entire btree - delay awhile */ |
| @@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc) | |||
| 181 | } | 224 | } |
| 182 | } | 225 | } |
| 183 | 226 | ||
| 184 | void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | 227 | void bch_writeback_add(struct cached_dev *dc) |
| 185 | { | 228 | { |
| 186 | atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||
| 187 | |||
| 188 | if (!atomic_read(&dc->has_dirty) && | 229 | if (!atomic_read(&dc->has_dirty) && |
| 189 | !atomic_xchg(&dc->has_dirty, 1)) { | 230 | !atomic_xchg(&dc->has_dirty, 1)) { |
| 190 | atomic_inc(&dc->count); | 231 | atomic_inc(&dc->count); |
| @@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | |||
| 203 | } | 244 | } |
| 204 | } | 245 | } |
| 205 | 246 | ||
| 247 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | ||
| 248 | uint64_t offset, int nr_sectors) | ||
| 249 | { | ||
| 250 | struct bcache_device *d = c->devices[inode]; | ||
| 251 | unsigned stripe_size, stripe_offset; | ||
| 252 | uint64_t stripe; | ||
| 253 | |||
| 254 | if (!d) | ||
| 255 | return; | ||
| 256 | |||
| 257 | stripe_size = 1 << d->stripe_size_bits; | ||
| 258 | stripe = offset >> d->stripe_size_bits; | ||
| 259 | stripe_offset = offset & (stripe_size - 1); | ||
| 260 | |||
| 261 | while (nr_sectors) { | ||
| 262 | int s = min_t(unsigned, abs(nr_sectors), | ||
| 263 | stripe_size - stripe_offset); | ||
| 264 | |||
| 265 | if (nr_sectors < 0) | ||
| 266 | s = -s; | ||
| 267 | |||
| 268 | atomic_add(s, d->stripe_sectors_dirty + stripe); | ||
| 269 | nr_sectors -= s; | ||
| 270 | stripe_offset = 0; | ||
| 271 | stripe++; | ||
| 272 | } | ||
| 273 | } | ||
| 274 | |||
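bcache_dev_sectors_dirty_add() replaces the single per-device sectors_dirty counter with the per-stripe array: each (possibly negative) delta is split across the stripes it spans, so clearing dirty data decrements exactly the stripes it came from. A standalone sketch of that splitting, with a plain int array in place of the atomics:

#include <stdint.h>

/* Apply a dirty-sector delta starting at 'offset'; negative deltas clear
 * previously dirty sectors.  Mirrors the while loop above. */
static void sectors_dirty_add(int *stripe_dirty, unsigned stripe_size_bits,
                              uint64_t offset, int nr_sectors)
{
        unsigned stripe_size = 1U << stripe_size_bits;
        uint64_t stripe = offset >> stripe_size_bits;
        unsigned stripe_offset = offset & (stripe_size - 1);

        while (nr_sectors) {
                unsigned left = stripe_size - stripe_offset;
                unsigned mag = nr_sectors < 0 ? -(unsigned)nr_sectors
                                              : (unsigned)nr_sectors;
                int s = (int)(mag < left ? mag : left);

                if (nr_sectors < 0)
                        s = -s;

                stripe_dirty[stripe] += s;
                nr_sectors -= s;
                stripe_offset = 0;
                stripe++;
        }
}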
| 206 | /* Background writeback - IO loop */ | 275 | /* Background writeback - IO loop */ |
| 207 | 276 | ||
| 208 | static void dirty_io_destructor(struct closure *cl) | 277 | static void dirty_io_destructor(struct closure *cl) |
| @@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl) | |||
| 216 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 285 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
| 217 | struct keybuf_key *w = io->bio.bi_private; | 286 | struct keybuf_key *w = io->bio.bi_private; |
| 218 | struct cached_dev *dc = io->dc; | 287 | struct cached_dev *dc = io->dc; |
| 219 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | 288 | struct bio_vec *bv; |
| 289 | int i; | ||
| 220 | 290 | ||
| 221 | while (bv-- != io->bio.bi_io_vec) | 291 | bio_for_each_segment_all(bv, &io->bio, i) |
| 222 | __free_page(bv->bv_page); | 292 | __free_page(bv->bv_page); |
| 223 | 293 | ||
| 224 | /* This is kind of a dumb way of signalling errors. */ | 294 | /* This is kind of a dumb way of signalling errors. */ |
| @@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl) | |||
| 236 | for (i = 0; i < KEY_PTRS(&w->key); i++) | 306 | for (i = 0; i < KEY_PTRS(&w->key); i++) |
| 237 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | 307 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); |
| 238 | 308 | ||
| 239 | pr_debug("clearing %s", pkey(&w->key)); | ||
| 240 | bch_btree_insert(&op, dc->disk.c); | 309 | bch_btree_insert(&op, dc->disk.c); |
| 241 | closure_sync(&op.cl); | 310 | closure_sync(&op.cl); |
| 242 | 311 | ||
| 312 | if (op.insert_collision) | ||
| 313 | trace_bcache_writeback_collision(&w->key); | ||
| 314 | |||
| 243 | atomic_long_inc(op.insert_collision | 315 | atomic_long_inc(op.insert_collision |
| 244 | ? &dc->disk.c->writeback_keys_failed | 316 | ? &dc->disk.c->writeback_keys_failed |
| 245 | : &dc->disk.c->writeback_keys_done); | 317 | : &dc->disk.c->writeback_keys_done); |
| @@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl) | |||
| 275 | io->bio.bi_bdev = io->dc->bdev; | 347 | io->bio.bi_bdev = io->dc->bdev; |
| 276 | io->bio.bi_end_io = dirty_endio; | 348 | io->bio.bi_end_io = dirty_endio; |
| 277 | 349 | ||
| 278 | trace_bcache_write_dirty(&io->bio); | ||
| 279 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 350 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
| 280 | 351 | ||
| 281 | continue_at(cl, write_dirty_finish, dirty_wq); | 352 | continue_at(cl, write_dirty_finish, dirty_wq); |
| @@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl) | |||
| 296 | { | 367 | { |
| 297 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 368 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
| 298 | 369 | ||
| 299 | trace_bcache_read_dirty(&io->bio); | ||
| 300 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 370 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
| 301 | 371 | ||
| 302 | continue_at(cl, write_dirty, dirty_wq); | 372 | continue_at(cl, write_dirty, dirty_wq); |
| @@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl) | |||
| 349 | io->bio.bi_rw = READ; | 419 | io->bio.bi_rw = READ; |
| 350 | io->bio.bi_end_io = read_dirty_endio; | 420 | io->bio.bi_end_io = read_dirty_endio; |
| 351 | 421 | ||
| 352 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | 422 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) |
| 353 | goto err_free; | 423 | goto err_free; |
| 354 | 424 | ||
| 355 | pr_debug("%s", pkey(&w->key)); | 425 | trace_bcache_writeback(&w->key); |
| 356 | 426 | ||
| 357 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | 427 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); |
| 358 | 428 | ||
| @@ -375,12 +445,49 @@ err: | |||
| 375 | refill_dirty(cl); | 445 | refill_dirty(cl); |
| 376 | } | 446 | } |
| 377 | 447 | ||
| 448 | /* Init */ | ||
| 449 | |||
| 450 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, | ||
| 451 | struct cached_dev *dc) | ||
| 452 | { | ||
| 453 | struct bkey *k; | ||
| 454 | struct btree_iter iter; | ||
| 455 | |||
| 456 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); | ||
| 457 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) | ||
| 458 | if (!b->level) { | ||
| 459 | if (KEY_INODE(k) > dc->disk.id) | ||
| 460 | break; | ||
| 461 | |||
| 462 | if (KEY_DIRTY(k)) | ||
| 463 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, | ||
| 464 | KEY_START(k), | ||
| 465 | KEY_SIZE(k)); | ||
| 466 | } else { | ||
| 467 | btree(sectors_dirty_init, k, b, op, dc); | ||
| 468 | if (KEY_INODE(k) > dc->disk.id) | ||
| 469 | break; | ||
| 470 | |||
| 471 | cond_resched(); | ||
| 472 | } | ||
| 473 | |||
| 474 | return 0; | ||
| 475 | } | ||
| 476 | |||
| 477 | void bch_sectors_dirty_init(struct cached_dev *dc) | ||
| 478 | { | ||
| 479 | struct btree_op op; | ||
| 480 | |||
| 481 | bch_btree_op_init_stack(&op); | ||
| 482 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); | ||
| 483 | } | ||
| 484 | |||
| 378 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 485 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
| 379 | { | 486 | { |
| 380 | closure_init_unlocked(&dc->writeback); | 487 | closure_init_unlocked(&dc->writeback); |
| 381 | init_rwsem(&dc->writeback_lock); | 488 | init_rwsem(&dc->writeback_lock); |
| 382 | 489 | ||
| 383 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | 490 | bch_keybuf_init(&dc->writeback_keys); |
| 384 | 491 | ||
| 385 | dc->writeback_metadata = true; | 492 | dc->writeback_metadata = true; |
| 386 | dc->writeback_running = true; | 493 | dc->writeback_running = true; |
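The bcache_dev_sectors_dirty_add() hunk above splits a signed dirty-sector delta across fixed-size stripes, clamping each step to whatever is left of the current stripe and negating the step when sectors are being cleared. A minimal user-space sketch of that arithmetic follows; the demo_* names, the stripe size and main() are illustrative stand-ins, not bcache structures.

/*
 * Sketch of per-stripe dirty accounting, assuming 8192-sector stripes
 * and a plain array in place of d->stripe_sectors_dirty (an array of
 * atomic_t in the real code).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define STRIPE_SIZE_BITS 13
#define STRIPE_SIZE      (1u << STRIPE_SIZE_BITS)
#define NR_STRIPES       16

static long demo_stripe_sectors_dirty[NR_STRIPES];

/* Add (nr_sectors > 0) or clear (nr_sectors < 0) dirty sectors at offset. */
static void demo_sectors_dirty_add(uint64_t offset, int nr_sectors)
{
        unsigned stripe        = offset >> STRIPE_SIZE_BITS;
        unsigned stripe_offset = offset & (STRIPE_SIZE - 1);

        while (nr_sectors) {
                unsigned room = STRIPE_SIZE - stripe_offset;
                int s = (unsigned)abs(nr_sectors) < room
                        ? abs(nr_sectors) : (int)room;

                if (nr_sectors < 0)
                        s = -s;

                demo_stripe_sectors_dirty[stripe] += s; /* atomic_add() upstream */
                nr_sectors    -= s;
                stripe_offset  = 0;
                stripe++;
        }
}

int main(void)
{
        demo_sectors_dirty_add(8000, 500);  /* spans the stripe 0/1 boundary */
        demo_sectors_dirty_add(8100, -100); /* clearing works the same way */
        printf("stripe 0: %ld, stripe 1: %ld\n",
               demo_stripe_sectors_dirty[0], demo_stripe_sectors_dirty[1]);
        return 0;
}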
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 000000000000..c91f61bb95b6 --- /dev/null +++ b/drivers/md/bcache/writeback.h | |||
| @@ -0,0 +1,64 @@ | |||
| 1 | #ifndef _BCACHE_WRITEBACK_H | ||
| 2 | #define _BCACHE_WRITEBACK_H | ||
| 3 | |||
| 4 | #define CUTOFF_WRITEBACK 40 | ||
| 5 | #define CUTOFF_WRITEBACK_SYNC 70 | ||
| 6 | |||
| 7 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | ||
| 8 | { | ||
| 9 | uint64_t i, ret = 0; | ||
| 10 | |||
| 11 | for (i = 0; i < d->nr_stripes; i++) | ||
| 12 | ret += atomic_read(d->stripe_sectors_dirty + i); | ||
| 13 | |||
| 14 | return ret; | ||
| 15 | } | ||
| 16 | |||
| 17 | static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, | ||
| 18 | uint64_t offset, | ||
| 19 | unsigned nr_sectors) | ||
| 20 | { | ||
| 21 | uint64_t stripe = offset >> d->stripe_size_bits; | ||
| 22 | |||
| 23 | while (1) { | ||
| 24 | if (atomic_read(d->stripe_sectors_dirty + stripe)) | ||
| 25 | return true; | ||
| 26 | |||
| 27 | if (nr_sectors <= 1 << d->stripe_size_bits) | ||
| 28 | return false; | ||
| 29 | |||
| 30 | nr_sectors -= 1 << d->stripe_size_bits; | ||
| 31 | stripe++; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | |||
| 35 | static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | ||
| 36 | unsigned cache_mode, bool would_skip) | ||
| 37 | { | ||
| 38 | unsigned in_use = dc->disk.c->gc_stats.in_use; | ||
| 39 | |||
| 40 | if (cache_mode != CACHE_MODE_WRITEBACK || | ||
| 41 | atomic_read(&dc->disk.detaching) || | ||
| 42 | in_use > CUTOFF_WRITEBACK_SYNC) | ||
| 43 | return false; | ||
| 44 | |||
| 45 | if (dc->partial_stripes_expensive && | ||
| 46 | bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, | ||
| 47 | bio_sectors(bio))) | ||
| 48 | return true; | ||
| 49 | |||
| 50 | if (would_skip) | ||
| 51 | return false; | ||
| 52 | |||
| 53 | return bio->bi_rw & REQ_SYNC || | ||
| 54 | in_use <= CUTOFF_WRITEBACK; | ||
| 55 | } | ||
| 56 | |||
| 57 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); | ||
| 58 | void bch_writeback_queue(struct cached_dev *); | ||
| 59 | void bch_writeback_add(struct cached_dev *); | ||
| 60 | |||
| 61 | void bch_sectors_dirty_init(struct cached_dev *dc); | ||
| 62 | void bch_cached_dev_writeback_init(struct cached_dev *); | ||
| 63 | |||
| 64 | #endif | ||
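The new writeback.h above gates writeback on cache occupancy (CUTOFF_WRITEBACK / CUTOFF_WRITEBACK_SYNC) and, when partial stripes are expensive on the backing device, forces writeback for stripes that already hold dirty data. A small user-space model of that decision is sketched below; the thresholds mirror the header, while demo_state and main() are illustrative assumptions rather than bcache types.

/* Model of the should_writeback() decision; not the kernel implementation. */
#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK       40
#define CUTOFF_WRITEBACK_SYNC  70

struct demo_state {
        bool writeback_mode;            /* cache_mode == CACHE_MODE_WRITEBACK */
        bool detaching;                 /* dc->disk.detaching */
        bool partial_stripes_expensive; /* e.g. backing RAID5/6 */
        bool stripe_already_dirty;      /* bcache_dev_stripe_dirty() result */
        bool sync_request;              /* bio->bi_rw & REQ_SYNC */
        bool would_skip;
        unsigned in_use;                /* % of cache in use (gc_stats.in_use) */
};

static bool demo_should_writeback(const struct demo_state *s)
{
        if (!s->writeback_mode || s->detaching ||
            s->in_use > CUTOFF_WRITEBACK_SYNC)
                return false;

        /* Keep an already-dirty stripe dirty to avoid read-modify-write later. */
        if (s->partial_stripes_expensive && s->stripe_already_dirty)
                return true;

        if (s->would_skip)
                return false;

        return s->sync_request || s->in_use <= CUTOFF_WRITEBACK;
}

int main(void)
{
        struct demo_state s = { .writeback_mode = true, .in_use = 55,
                                .sync_request = true };

        printf("sync write at 55%% in_use -> writeback=%d\n",
               demo_should_writeback(&s));
        return 0;
}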
diff --git a/drivers/md/md.c b/drivers/md/md.c index dddc87bcf64a..9f13e13506ef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -7716,20 +7716,6 @@ static int remove_and_add_spares(struct mddev *mddev, | |||
| 7716 | continue; | 7716 | continue; |
| 7717 | 7717 | ||
| 7718 | rdev->recovery_offset = 0; | 7718 | rdev->recovery_offset = 0; |
| 7719 | if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { | ||
| 7720 | spin_lock_irq(&mddev->write_lock); | ||
| 7721 | if (mddev->in_sync) | ||
| 7722 | /* OK, this device, which is in_sync, | ||
| 7723 | * will definitely be noticed before | ||
| 7724 | * the next write, so recovery isn't | ||
| 7725 | * needed. | ||
| 7726 | */ | ||
| 7727 | rdev->recovery_offset = mddev->recovery_cp; | ||
| 7728 | spin_unlock_irq(&mddev->write_lock); | ||
| 7729 | } | ||
| 7730 | if (mddev->ro && rdev->recovery_offset != MaxSector) | ||
| 7731 | /* not safe to add this disk now */ | ||
| 7732 | continue; | ||
| 7733 | if (mddev->pers-> | 7719 | if (mddev->pers-> |
| 7734 | hot_add_disk(mddev, rdev) == 0) { | 7720 | hot_add_disk(mddev, rdev) == 0) { |
| 7735 | if (sysfs_link_rdev(mddev, rdev)) | 7721 | if (sysfs_link_rdev(mddev, rdev)) |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ec734588a1c6..d60412c7f995 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -1849,6 +1849,36 @@ static int process_checks(struct r1bio *r1_bio) | |||
| 1849 | int i; | 1849 | int i; |
| 1850 | int vcnt; | 1850 | int vcnt; |
| 1851 | 1851 | ||
| 1852 | /* Fix variable parts of all bios */ | ||
| 1853 | vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); | ||
| 1854 | for (i = 0; i < conf->raid_disks * 2; i++) { | ||
| 1855 | int j; | ||
| 1856 | int size; | ||
| 1857 | struct bio *b = r1_bio->bios[i]; | ||
| 1858 | if (b->bi_end_io != end_sync_read) | ||
| 1859 | continue; | ||
| 1860 | /* fixup the bio for reuse */ | ||
| 1861 | bio_reset(b); | ||
| 1862 | b->bi_vcnt = vcnt; | ||
| 1863 | b->bi_size = r1_bio->sectors << 9; | ||
| 1864 | b->bi_sector = r1_bio->sector + | ||
| 1865 | conf->mirrors[i].rdev->data_offset; | ||
| 1866 | b->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
| 1867 | b->bi_end_io = end_sync_read; | ||
| 1868 | b->bi_private = r1_bio; | ||
| 1869 | |||
| 1870 | size = b->bi_size; | ||
| 1871 | for (j = 0; j < vcnt ; j++) { | ||
| 1872 | struct bio_vec *bi; | ||
| 1873 | bi = &b->bi_io_vec[j]; | ||
| 1874 | bi->bv_offset = 0; | ||
| 1875 | if (size > PAGE_SIZE) | ||
| 1876 | bi->bv_len = PAGE_SIZE; | ||
| 1877 | else | ||
| 1878 | bi->bv_len = size; | ||
| 1879 | size -= PAGE_SIZE; | ||
| 1880 | } | ||
| 1881 | } | ||
| 1852 | for (primary = 0; primary < conf->raid_disks * 2; primary++) | 1882 | for (primary = 0; primary < conf->raid_disks * 2; primary++) |
| 1853 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | 1883 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && |
| 1854 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | 1884 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { |
| @@ -1857,12 +1887,10 @@ static int process_checks(struct r1bio *r1_bio) | |||
| 1857 | break; | 1887 | break; |
| 1858 | } | 1888 | } |
| 1859 | r1_bio->read_disk = primary; | 1889 | r1_bio->read_disk = primary; |
| 1860 | vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); | ||
| 1861 | for (i = 0; i < conf->raid_disks * 2; i++) { | 1890 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 1862 | int j; | 1891 | int j; |
| 1863 | struct bio *pbio = r1_bio->bios[primary]; | 1892 | struct bio *pbio = r1_bio->bios[primary]; |
| 1864 | struct bio *sbio = r1_bio->bios[i]; | 1893 | struct bio *sbio = r1_bio->bios[i]; |
| 1865 | int size; | ||
| 1866 | 1894 | ||
| 1867 | if (sbio->bi_end_io != end_sync_read) | 1895 | if (sbio->bi_end_io != end_sync_read) |
| 1868 | continue; | 1896 | continue; |
| @@ -1888,27 +1916,6 @@ static int process_checks(struct r1bio *r1_bio) | |||
| 1888 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | 1916 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); |
| 1889 | continue; | 1917 | continue; |
| 1890 | } | 1918 | } |
| 1891 | /* fixup the bio for reuse */ | ||
| 1892 | bio_reset(sbio); | ||
| 1893 | sbio->bi_vcnt = vcnt; | ||
| 1894 | sbio->bi_size = r1_bio->sectors << 9; | ||
| 1895 | sbio->bi_sector = r1_bio->sector + | ||
| 1896 | conf->mirrors[i].rdev->data_offset; | ||
| 1897 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
| 1898 | sbio->bi_end_io = end_sync_read; | ||
| 1899 | sbio->bi_private = r1_bio; | ||
| 1900 | |||
| 1901 | size = sbio->bi_size; | ||
| 1902 | for (j = 0; j < vcnt ; j++) { | ||
| 1903 | struct bio_vec *bi; | ||
| 1904 | bi = &sbio->bi_io_vec[j]; | ||
| 1905 | bi->bv_offset = 0; | ||
| 1906 | if (size > PAGE_SIZE) | ||
| 1907 | bi->bv_len = PAGE_SIZE; | ||
| 1908 | else | ||
| 1909 | bi->bv_len = size; | ||
| 1910 | size -= PAGE_SIZE; | ||
| 1911 | } | ||
| 1912 | 1919 | ||
| 1913 | bio_copy_data(sbio, pbio); | 1920 | bio_copy_data(sbio, pbio); |
| 1914 | } | 1921 | } |
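The raid1.c change above moves the bio fixup in process_checks() ahead of the primary selection: every sync-read bio is reset, re-pointed at its rdev, and its bio_vecs are re-sized so only the final page carries the tail of the request. The re-sizing itself is just the loop sketched below; demo_vec, the page size and main() are illustrative stand-ins rather than block-layer types.

/* Sketch of the bv_len trimming applied after bio_reset() in the hunk above. */
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096

struct demo_vec { unsigned bv_len, bv_offset; };

static void demo_fixup_vecs(struct demo_vec *vec, int vcnt, int total_bytes)
{
        int size = total_bytes, j;

        for (j = 0; j < vcnt; j++) {
                vec[j].bv_offset = 0;
                vec[j].bv_len = size > DEMO_PAGE_SIZE ? DEMO_PAGE_SIZE : size;
                size -= DEMO_PAGE_SIZE;
        }
}

int main(void)
{
        /* 9 sectors = 4608 bytes -> two pages: 4096 + 512 */
        int sectors = 9;
        int vcnt = (sectors * 512 + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
        struct demo_vec vec[2];

        demo_fixup_vecs(vec, vcnt, sectors * 512);
        printf("vcnt=%d bv_len[0]=%u bv_len[1]=%u\n",
               vcnt, vec[0].bv_len, vec[1].bv_len);
        return 0;
}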
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cd066b63bdaf..df7b0a06b0ea 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -2097,11 +2097,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 2097 | * both 'first' and 'i', so we just compare them. | 2097 | * both 'first' and 'i', so we just compare them. |
| 2098 | * All vec entries are PAGE_SIZE; | 2098 | * All vec entries are PAGE_SIZE; |
| 2099 | */ | 2099 | */ |
| 2100 | for (j = 0; j < vcnt; j++) | 2100 | int sectors = r10_bio->sectors; |
| 2101 | for (j = 0; j < vcnt; j++) { | ||
| 2102 | int len = PAGE_SIZE; | ||
| 2103 | if (sectors < (len / 512)) | ||
| 2104 | len = sectors * 512; | ||
| 2101 | if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), | 2105 | if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), |
| 2102 | page_address(tbio->bi_io_vec[j].bv_page), | 2106 | page_address(tbio->bi_io_vec[j].bv_page), |
| 2103 | fbio->bi_io_vec[j].bv_len)) | 2107 | len)) |
| 2104 | break; | 2108 | break; |
| 2109 | sectors -= len/512; | ||
| 2110 | } | ||
| 2105 | if (j == vcnt) | 2111 | if (j == vcnt) |
| 2106 | continue; | 2112 | continue; |
| 2107 | atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); | 2113 | atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); |
| @@ -2284,12 +2290,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 2284 | d = r10_bio->devs[1].devnum; | 2290 | d = r10_bio->devs[1].devnum; |
| 2285 | wbio = r10_bio->devs[1].bio; | 2291 | wbio = r10_bio->devs[1].bio; |
| 2286 | wbio2 = r10_bio->devs[1].repl_bio; | 2292 | wbio2 = r10_bio->devs[1].repl_bio; |
| 2293 | /* Need to test wbio2->bi_end_io before we call | ||
| 2294 | * generic_make_request as if the former is NULL, | ||
| 2295 | * the latter is free to free wbio2. | ||
| 2296 | */ | ||
| 2297 | if (wbio2 && !wbio2->bi_end_io) | ||
| 2298 | wbio2 = NULL; | ||
| 2287 | if (wbio->bi_end_io) { | 2299 | if (wbio->bi_end_io) { |
| 2288 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2300 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
| 2289 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); | 2301 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); |
| 2290 | generic_make_request(wbio); | 2302 | generic_make_request(wbio); |
| 2291 | } | 2303 | } |
| 2292 | if (wbio2 && wbio2->bi_end_io) { | 2304 | if (wbio2) { |
| 2293 | atomic_inc(&conf->mirrors[d].replacement->nr_pending); | 2305 | atomic_inc(&conf->mirrors[d].replacement->nr_pending); |
| 2294 | md_sync_acct(conf->mirrors[d].replacement->bdev, | 2306 | md_sync_acct(conf->mirrors[d].replacement->bdev, |
| 2295 | bio_sectors(wbio2)); | 2307 | bio_sectors(wbio2)); |
| @@ -3407,6 +3419,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3407 | 3419 | ||
| 3408 | if (bio->bi_end_io == end_sync_read) { | 3420 | if (bio->bi_end_io == end_sync_read) { |
| 3409 | md_sync_acct(bio->bi_bdev, nr_sectors); | 3421 | md_sync_acct(bio->bi_bdev, nr_sectors); |
| 3422 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 3410 | generic_make_request(bio); | 3423 | generic_make_request(bio); |
| 3411 | } | 3424 | } |
| 3412 | } | 3425 | } |
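The raid10.c hunk above limits each page comparison in sync_request_write() to the sectors actually covered by the r10_bio, so a partially used final page cannot produce a spurious resync mismatch. A self-contained sketch of that length-limited compare follows; the buffers, page size and main() are illustrative assumptions.

/* Compare only the bytes covered by 'sectors', page by page. */
#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096

static int demo_pages_match(char (*a)[DEMO_PAGE_SIZE],
                            char (*b)[DEMO_PAGE_SIZE],
                            int vcnt, int sectors)
{
        int j;

        for (j = 0; j < vcnt; j++) {
                int len = DEMO_PAGE_SIZE;

                if (sectors < len / 512)
                        len = sectors * 512;
                if (memcmp(a[j], b[j], len))
                        return 0;
                sectors -= len / 512;
        }
        return 1;
}

int main(void)
{
        static char a[2][DEMO_PAGE_SIZE], b[2][DEMO_PAGE_SIZE];

        /* A difference beyond the 9-sector range is ignored: still a match. */
        b[1][600] = 1;
        printf("match=%d\n", demo_pages_match(a, b, 2, 9));
        return 0;
}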
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bf094a587cb..78ea44336e75 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -3462,6 +3462,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 3462 | test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 3462 | test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { |
| 3463 | set_bit(STRIPE_SYNCING, &sh->state); | 3463 | set_bit(STRIPE_SYNCING, &sh->state); |
| 3464 | clear_bit(STRIPE_INSYNC, &sh->state); | 3464 | clear_bit(STRIPE_INSYNC, &sh->state); |
| 3465 | clear_bit(STRIPE_REPLACED, &sh->state); | ||
| 3465 | } | 3466 | } |
| 3466 | spin_unlock(&sh->stripe_lock); | 3467 | spin_unlock(&sh->stripe_lock); |
| 3467 | } | 3468 | } |
| @@ -3607,19 +3608,23 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 3607 | handle_parity_checks5(conf, sh, &s, disks); | 3608 | handle_parity_checks5(conf, sh, &s, disks); |
| 3608 | } | 3609 | } |
| 3609 | 3610 | ||
| 3610 | if (s.replacing && s.locked == 0 | 3611 | if ((s.replacing || s.syncing) && s.locked == 0 |
| 3611 | && !test_bit(STRIPE_INSYNC, &sh->state)) { | 3612 | && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) |
| 3613 | && !test_bit(STRIPE_REPLACED, &sh->state)) { | ||
| 3612 | /* Write out to replacement devices where possible */ | 3614 | /* Write out to replacement devices where possible */ |
| 3613 | for (i = 0; i < conf->raid_disks; i++) | 3615 | for (i = 0; i < conf->raid_disks; i++) |
| 3614 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && | 3616 | if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { |
| 3615 | test_bit(R5_NeedReplace, &sh->dev[i].flags)) { | 3617 | WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); |
| 3616 | set_bit(R5_WantReplace, &sh->dev[i].flags); | 3618 | set_bit(R5_WantReplace, &sh->dev[i].flags); |
| 3617 | set_bit(R5_LOCKED, &sh->dev[i].flags); | 3619 | set_bit(R5_LOCKED, &sh->dev[i].flags); |
| 3618 | s.locked++; | 3620 | s.locked++; |
| 3619 | } | 3621 | } |
| 3620 | set_bit(STRIPE_INSYNC, &sh->state); | 3622 | if (s.replacing) |
| 3623 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 3624 | set_bit(STRIPE_REPLACED, &sh->state); | ||
| 3621 | } | 3625 | } |
| 3622 | if ((s.syncing || s.replacing) && s.locked == 0 && | 3626 | if ((s.syncing || s.replacing) && s.locked == 0 && |
| 3627 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
| 3623 | test_bit(STRIPE_INSYNC, &sh->state)) { | 3628 | test_bit(STRIPE_INSYNC, &sh->state)) { |
| 3624 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 3629 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
| 3625 | clear_bit(STRIPE_SYNCING, &sh->state); | 3630 | clear_bit(STRIPE_SYNCING, &sh->state); |
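The raid5.c and raid5.h changes introduce STRIPE_REPLACED so that the replacement-device writes run exactly once per sync pass and must complete (s.locked back to 0) before md_done_sync() is called; the flag is cleared again when a new STRIPE_SYNC_REQUESTED arrives. A simplified model of that ordering is sketched below; the demo struct and main() are illustrative, not the stripe_head state machine.

/* Ordering model: replacement writes first, sync completion only afterwards. */
#include <stdbool.h>
#include <stdio.h>

struct demo_stripe {
        bool syncing, replacing, insync, replaced, compute_run;
        int  locked;            /* outstanding per-device writes */
};

static void demo_handle_stripe(struct demo_stripe *sh)
{
        if ((sh->replacing || sh->syncing) && sh->locked == 0 &&
            !sh->compute_run && !sh->replaced) {
                sh->locked++;           /* stands for the R5_WantReplace writes */
                if (sh->replacing)
                        sh->insync = true;
                sh->replaced = true;    /* this block runs at most once per sync */
                printf("issue replacement writes\n");
        }

        if ((sh->syncing || sh->replacing) && sh->locked == 0 &&
            !sh->compute_run && sh->insync) {
                printf("md_done_sync for this stripe\n");
                sh->syncing = sh->replacing = false;
        }
}

int main(void)
{
        struct demo_stripe sh = { .replacing = true };

        demo_handle_stripe(&sh);        /* first pass: writes go out */
        sh.locked = 0;                  /* pretend the writes completed */
        demo_handle_stripe(&sh);        /* only now does the sync finish */
        return 0;
}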
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b0b663b119a8..70c49329ca9a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -306,6 +306,7 @@ enum { | |||
| 306 | STRIPE_SYNC_REQUESTED, | 306 | STRIPE_SYNC_REQUESTED, |
| 307 | STRIPE_SYNCING, | 307 | STRIPE_SYNCING, |
| 308 | STRIPE_INSYNC, | 308 | STRIPE_INSYNC, |
| 309 | STRIPE_REPLACED, | ||
| 309 | STRIPE_PREREAD_ACTIVE, | 310 | STRIPE_PREREAD_ACTIVE, |
| 310 | STRIPE_DELAYED, | 311 | STRIPE_DELAYED, |
| 311 | STRIPE_DEGRADED, | 312 | STRIPE_DEGRADED, |
